Compare commits

...

10 Commits

Author SHA1 Message Date
Colin c05cfdc623 Update cmake . 2025-12-02 15:02:41 +00:00
Colin 1ad3aabcfe Add unified-tina and viplite runtime library in arm linux. 2025-12-02 15:01:18 +00:00
Kee 7b24f4d437
[vx_platform] Fix create sub device crash issue (#715)
* Fix native platform build issue

Redefinition of variable deviceCount

Type: Bug fix

* [vx_platform] Fix create sub device crash issue

sub_device_ variable should be initialized

Type: Bug fix

Signed-off-by: Kee <xuke537@hotmail.com>

* Fix a typo

Signed-off-by: Kee <xuke537@hotmail.com>

* Fix another typo

Signed-off-by: Kee <xuke537@hotmail.com>

---------

Signed-off-by: Kee <xuke537@hotmail.com>
2025-11-25 17:19:56 +08:00
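To illustrate the class of bug this commit describes, a hypothetical sketch (all names assumed; the real fix lives in the vx_platform code): an uninitialized member pointer read during sub-device creation is undefined behavior, and in-class initialization removes the crash.

// Hypothetical sketch only; the actual class and member names are assumed.
struct SubDevice;                      // stand-in for the real sub-device type
class DeviceSketch {
 public:
  DeviceSketch() = default;
 private:
  // Without "= nullptr" this pointer holds garbage, and any null-check or
  // dereference during sub-device creation becomes undefined behavior.
  SubDevice* sub_device_ = nullptr;
};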
Peter Kjellerstedt 3c83eca946
Add include of cstdint to permute_vector.h (#711)
This avoids the following error with GCC 15:

  src/tim/transform/ops/../permute_vector.h:41:11: error: 'uint32_t'
  does not name a type
     41 |   virtual uint32_t Rank() const = 0;
        |           ^~~~~~~~
  src/tim/transform/ops/../permute_vector.h:32:1: note: 'uint32_t' is
  defined in header '<cstdint>'; this is probably fixable by adding
  '#include <cstdint>'
     31 | #include <string>
    +++ |+#include <cstdint>
     32 |

Co-authored-by: Peter Kjellerstedt <pkj@axis.com>
2025-10-13 13:15:57 +08:00
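The shape of the fix follows directly from the diagnostic; a minimal sketch of the header after the change (class details abridged from the error output):

// src/tim/transform/permute_vector.h after the fix (abridged sketch)
#include <cstdint>   // added: declares uint32_t explicitly under GCC 15
#include <string>

class IPermuteVector {
 public:
  virtual ~IPermuteVector() = default;
  virtual uint32_t Rank() const = 0;   // line 41, which previously failed to compile
};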
Kee c4e75674fa
Refine platform code and samples (#713)
* Refine platform code and samples

1. Support viplite v2 API
2. Unify the Lite and Native platform APIs so that the same code
   can run on different platforms through different compilation options.

Type: Code Improvement

Signed-off-by: Kee <xuke537@hotmail.com>

* Fix build error if VSI device API is not supported

Signed-off-by: Kee <xuke537@hotmail.com>

---------

Signed-off-by: Kee <xuke537@hotmail.com>
2025-10-13 13:15:31 +08:00
Kee 6810d310d3
update CI workflows to use v4 of the artifact actions (#714) 2025-10-09 18:39:22 +08:00
Chen Feiyue 8494275d76
Update internal ovxlib to release/1.2.22 (#706)
* Update internal ovxlib to release/1.2.22

Signed-off-by: Feiyue.Chen <Feiyue.Chen@verisilicon.com>

* Refine yaml file for blocking tfhub model tests

Signed-off-by: Feiyue.Chen <Feiyue.Chen@verisilicon.com>

---------

Signed-off-by: Feiyue.Chen <Feiyue.Chen@verisilicon.com>
2025-01-08 13:22:46 +08:00
Kainan Cha 149834832c
Update README.md
Add ONNX Runtime Link
2024-12-12 09:24:49 +08:00
Chen Feiyue fcdf223d06
Fixed layernorm and logsoftmax ut error (#702)
Type: Bug Fix

Signed-off-by: Feiyue Chen <Feiyue.Chen@verisilicon.com>
2024-07-29 10:44:28 +08:00
Chen Feiyue 81b6c07c5d
Update timvx_overview.svg (#701)
Type: Documentation

Signed-off-by: Feiyue Chen <Feiyue.Chen@verisilicon.com>
2024-07-29 10:44:04 +08:00
246 changed files with 54658 additions and 4550 deletions

View File

@ -35,7 +35,7 @@ jobs:
run: |
cmake --install ${{github.workspace}}/build --config ${{env.BUILD_TYPE}}
- name: upload tim-vx-install
uses: actions/upload-artifact@v3
uses: actions/upload-artifact@v4
with:
name: tim-vx-install
path: |
@ -75,7 +75,7 @@ jobs:
VIVANTE_SDK_DIR: ${{github.workspace}}/prebuilt-sdk/x86_64_linux/
steps:
- name: download tim-vx build output
uses: actions/download-artifact@v3
uses: actions/download-artifact@v4
with:
name: tim-vx-install
@ -102,7 +102,7 @@ jobs:
VIV_VX_DISABLE_TP_NN_EVIS: 1
steps:
- name: download tim-vx build output
uses: actions/download-artifact@v3
uses: actions/download-artifact@v4
with:
name: tim-vx-install
@ -117,21 +117,21 @@ jobs:
needs: tim-vx-build
steps:
- name: download tim-vx build output
uses: actions/download-artifact@v3
uses: actions/download-artifact@v4
with:
name: tim-vx-install
- name: apply-patch-build
run: |
git config --global user.email "xiang.zhang@verisilicon.com"
git config --global user.name "xiang.zhang"
git clone https://github.com/tensorflow/tensorflow.git ${{github.workspace}}/3rd-party/tensorflow && cd ${{github.workspace}}/3rd-party/tensorflow/ && git checkout v2.10.0
git clone https://github.com/tensorflow/tensorflow.git ${{github.workspace}}/3rd-party/tensorflow && cd ${{github.workspace}}/3rd-party/tensorflow/ && git checkout v2.16.1
git clone https://github.com/VeriSilicon/tflite-vx-delegate.git ${{github.workspace}}/vx-delegate
cmake -B ${{github.workspace}}/vx-delegate/build -S ${{github.workspace}}/vx-delegate -DFETCHCONTENT_SOURCE_DIR_TENSORFLOW=${{github.workspace}}/3rd-party/tensorflow -DTIM_VX_INSTALL=${{github.workspace}}/tim-vx.install.dir/ -DTFLITE_ENABLE_NNAPI=OFF -DTFLITE_ENABLE_XNNPACK=OFF
cmake --build ${{github.workspace}}/vx-delegate/build --config ${{env.BUILD_TYPE}}
cd ${{github.workspace}}/vx-delegate/build
make vx_delegate benchmark_model
- name: upload vx-delegate
uses: actions/upload-artifact@v3
uses: actions/upload-artifact@v4
with:
name: vx-delegate-bin
path: |
@ -144,7 +144,7 @@ jobs:
needs: [vx-delegate-build, tim-vx-unit-test]
steps:
- name: download binary
uses: actions/download-artifact@v3
uses: actions/download-artifact@v4
- name: download mobilenet_v2_quant.tflite
run: |
@ -159,7 +159,7 @@ jobs:
needs: [vx-delegate-build, tim-vx-unit-test]
steps:
- name: download binary
uses: actions/download-artifact@v3
uses: actions/download-artifact@v4
- name: download mobilenet_v2_b8_quant.tflite
run: |
curl -LJO https://github.com/sunshinemyson/TIM-VX/releases/download/v1.1.30.2/mobilenet_v2_b8_quant.tflite
@ -173,7 +173,7 @@ jobs:
needs: [vx-delegate-build, tim-vx-unit-test]
steps:
- name: download test binary
uses: actions/download-artifact@v3
uses: actions/download-artifact@v4
- name: download resnet_quant.tflite
run: |
curl -LJO https://github.com/sunshinemyson/TIM-VX/releases/download/v1.1.30.2/resnet_quant.tflite
@ -187,7 +187,7 @@ jobs:
needs: [vx-delegate-build, tim-vx-unit-test]
steps:
- name: download test binary
uses: actions/download-artifact@v3
uses: actions/download-artifact@v4
- name: download model
run: |
curl -LJO https://github.com/sunshinemyson/TIM-VX/releases/download/v1.1.30.2/inception_v3_quant.tflite
@ -201,7 +201,7 @@ jobs:
needs: [vx-delegate-build, tim-vx-unit-test]
steps:
- name: download test binary
uses: actions/download-artifact@v3
uses: actions/download-artifact@v4
- name: download model
run: |
curl -LJO https://github.com/sunshinemyson/TIM-VX/releases/download/v1.1.30.2/mobilenet_v3_b4_quant.tflite
@ -215,7 +215,7 @@ jobs:
needs: [vx-delegate-build, tim-vx-unit-test]
steps:
- name: download test binary
uses: actions/download-artifact@v3
uses: actions/download-artifact@v4
- name: download model
run: |
curl -LJO https://github.com/sunshinemyson/TIM-VX/releases/download/v1.1.30.2/mobilenet_v3_quant.tflite
@ -229,7 +229,7 @@ jobs:
needs: [vx-delegate-build, tim-vx-unit-test]
steps:
- name: download test binary
uses: actions/download-artifact@v3
uses: actions/download-artifact@v4
- name: download model
run: |
curl -LJO https://github.com/sunshinemyson/TIM-VX/releases/download/v1.1.30.2/mv3_depth_quant.tflite
@ -243,7 +243,7 @@ jobs:
needs: [vx-delegate-build, tim-vx-unit-test]
steps:
- name: download test binary
uses: actions/download-artifact@v3
uses: actions/download-artifact@v4
- name: download model
run: |
curl -LJO https://github.com/sunshinemyson/TIM-VX/releases/download/v1.1.30.2/yolo_v4_tiny_quant.tflite
@ -258,7 +258,7 @@ jobs:
# needs: [vx-delegate-build, tim-vx-unit-test]
# steps:
# - name: download test binary
# uses: actions/download-artifact@v3
# uses: actions/download-artifact@v4
# - name: download model
# run: |
# curl -LJO https://github.com/sunshinemyson/TIM-VX/releases/download/v1.1.30.2/deeplab_v3_plus_quant.tflite
@ -273,7 +273,7 @@ jobs:
# needs: vx-delegate-build
# steps:
# - name: download test binary
# uses: actions/download-artifact@v3
# uses: actions/download-artifact@v4
# - name: download model
# run: |
# wget https://storage.googleapis.com/tfhub-lite-models/google/lite-model/movenet/multipose/lightning/tflite/float16/1.tflite
@ -283,68 +283,68 @@ jobs:
# chmod u+x ${{github.workspace}}/vx-delegate-bin/_deps/tensorflow-build/tools/benchmark/benchmark_model
# ${{github.workspace}}/vx-delegate-bin/_deps/tensorflow-build/tools/benchmark/benchmark_model --num_runs=1 --external_delegate_path=${{github.workspace}}/vx-delegate-bin/libvx_delegate.so --graph=${{github.workspace}}/tfhub.movenet.multipose.tflite
tfhub-efficientdet-lite0:
runs-on: ubuntu-latest
needs: [vx-delegate-build, tim-vx-unit-test]
steps:
- name: download test binary
uses: actions/download-artifact@v3
- name: download model
run: |
wget https://storage.googleapis.com/tfhub-lite-models/tensorflow/lite-model/efficientdet/lite0/detection/metadata/1.tflite
- name: benchmark-model
run: |
chmod u+x ${{github.workspace}}/vx-delegate-bin/_deps/tensorflow-build/tools/benchmark/benchmark_model
${{github.workspace}}/vx-delegate-bin/_deps/tensorflow-build/tools/benchmark/benchmark_model --num_runs=1 --external_delegate_path=${{github.workspace}}/vx-delegate-bin/libvx_delegate.so --graph=${{github.workspace}}/1.tflite
# tfhub-efficientdet-lite0:
# runs-on: ubuntu-latest
# needs: [vx-delegate-build, tim-vx-unit-test]
# steps:
# - name: download test binary
# uses: actions/download-artifact@v4
# - name: download model
# run: |
# wget https://storage.googleapis.com/tfhub-lite-models/tensorflow/lite-model/efficientdet/lite0/detection/metadata/1.tflite
# - name: benchmark-model
# run: |
# chmod u+x ${{github.workspace}}/vx-delegate-bin/_deps/tensorflow-build/tools/benchmark/benchmark_model
# ${{github.workspace}}/vx-delegate-bin/_deps/tensorflow-build/tools/benchmark/benchmark_model --num_runs=1 --external_delegate_path=${{github.workspace}}/vx-delegate-bin/libvx_delegate.so --graph=${{github.workspace}}/1.tflite
tfhub-efficientdet-lite1:
runs-on: ubuntu-latest
needs: [vx-delegate-build, tim-vx-unit-test]
steps:
- name: download test binary
uses: actions/download-artifact@v3
- name: download model
run: |
wget https://storage.googleapis.com/tfhub-lite-models/tensorflow/lite-model/efficientdet/lite1/detection/metadata/1.tflite
- name: benchmark-model
run: |
chmod u+x ${{github.workspace}}/vx-delegate-bin/_deps/tensorflow-build/tools/benchmark/benchmark_model
${{github.workspace}}/vx-delegate-bin/_deps/tensorflow-build/tools/benchmark/benchmark_model --num_runs=1 --external_delegate_path=${{github.workspace}}/vx-delegate-bin/libvx_delegate.so --graph=${{github.workspace}}/1.tflite
# tfhub-efficientdet-lite1:
# runs-on: ubuntu-latest
# needs: [vx-delegate-build, tim-vx-unit-test]
# steps:
# - name: download test binary
# uses: actions/download-artifact@v4
# - name: download model
# run: |
# wget https://storage.googleapis.com/tfhub-lite-models/tensorflow/lite-model/efficientdet/lite1/detection/metadata/1.tflite
# - name: benchmark-model
# run: |
# chmod u+x ${{github.workspace}}/vx-delegate-bin/_deps/tensorflow-build/tools/benchmark/benchmark_model
# ${{github.workspace}}/vx-delegate-bin/_deps/tensorflow-build/tools/benchmark/benchmark_model --num_runs=1 --external_delegate_path=${{github.workspace}}/vx-delegate-bin/libvx_delegate.so --graph=${{github.workspace}}/1.tflite
tfhub-efficientdet-lite2:
runs-on: ubuntu-latest
needs: [vx-delegate-build, tim-vx-unit-test]
steps:
- name: download test binary
uses: actions/download-artifact@v3
- name: download model
run: |
wget https://storage.googleapis.com/tfhub-lite-models/tensorflow/lite-model/efficientdet/lite2/detection/metadata/1.tflite
- name: benchmark-model
run: |
chmod u+x ${{github.workspace}}/vx-delegate-bin/_deps/tensorflow-build/tools/benchmark/benchmark_model
${{github.workspace}}/vx-delegate-bin/_deps/tensorflow-build/tools/benchmark/benchmark_model --num_runs=1 --external_delegate_path=${{github.workspace}}/vx-delegate-bin/libvx_delegate.so --graph=${{github.workspace}}/1.tflite
# tfhub-efficientdet-lite2:
# runs-on: ubuntu-latest
# needs: [vx-delegate-build, tim-vx-unit-test]
# steps:
# - name: download test binary
# uses: actions/download-artifact@v4
# - name: download model
# run: |
# wget https://storage.googleapis.com/tfhub-lite-models/tensorflow/lite-model/efficientdet/lite2/detection/metadata/1.tflite
# - name: benchmark-model
# run: |
# chmod u+x ${{github.workspace}}/vx-delegate-bin/_deps/tensorflow-build/tools/benchmark/benchmark_model
# ${{github.workspace}}/vx-delegate-bin/_deps/tensorflow-build/tools/benchmark/benchmark_model --num_runs=1 --external_delegate_path=${{github.workspace}}/vx-delegate-bin/libvx_delegate.so --graph=${{github.workspace}}/1.tflite
tfhub-efficientdet-lite3:
runs-on: ubuntu-latest
needs: [vx-delegate-build, tim-vx-unit-test]
steps:
- name: download test binary
uses: actions/download-artifact@v3
- name: download model
run: |
wget https://storage.googleapis.com/tfhub-lite-models/tensorflow/lite-model/efficientdet/lite2/detection/metadata/1.tflite
- name: benchmark-model
run: |
chmod u+x ${{github.workspace}}/vx-delegate-bin/_deps/tensorflow-build/tools/benchmark/benchmark_model
${{github.workspace}}/vx-delegate-bin/_deps/tensorflow-build/tools/benchmark/benchmark_model --num_runs=1 --external_delegate_path=${{github.workspace}}/vx-delegate-bin/libvx_delegate.so --graph=${{github.workspace}}/1.tflite
# tfhub-efficientdet-lite3:
# runs-on: ubuntu-latest
# needs: [vx-delegate-build, tim-vx-unit-test]
# steps:
# - name: download test binary
# uses: actions/download-artifact@v4
# - name: download model
# run: |
# wget https://storage.googleapis.com/tfhub-lite-models/tensorflow/lite-model/efficientdet/lite2/detection/metadata/1.tflite
# - name: benchmark-model
# run: |
# chmod u+x ${{github.workspace}}/vx-delegate-bin/_deps/tensorflow-build/tools/benchmark/benchmark_model
# ${{github.workspace}}/vx-delegate-bin/_deps/tensorflow-build/tools/benchmark/benchmark_model --num_runs=1 --external_delegate_path=${{github.workspace}}/vx-delegate-bin/libvx_delegate.so --graph=${{github.workspace}}/1.tflite
# acuity-yolov3-608-quant:
# runs-on: ubuntu-latest
# needs: [vx-delegate-build, tim-vx-unit-test]
# steps:
# - name: download test binary
# uses: actions/download-artifact@v3
# uses: actions/download-artifact@v4
# - name: download model
# run: |
# curl -LJO https://github.com/sunshinemyson/TIM-VX/releases/download/v1.1.30.2/yolov3_608relu_quant.acuity.tflite
@ -359,7 +359,7 @@ jobs:
# needs: vx-delegate-build
# steps:
# - name: download test binary
# uses: actions/download-artifact@v3
# uses: actions/download-artifact@v4
# - name: download model
# run: |
# wget https://storage.googleapis.com/tfhub-lite-models/tensorflow/lite-model/efficientdet/lite4/detection/metadata/1.tflite

View File

@ -2,13 +2,13 @@ cmake_minimum_required (VERSION 3.14)
project(tim-vx LANGUAGES C CXX)
option(BUILD_SHARED_LIBS "Build using shared libraries" ON)
option(TIM_VX_ENABLE_CUSTOM_OP "Enable custom op support" OFF)
option(TIM_VX_ENABLE_CUSTOM_OP "Enable custom op support" ON)
option(TIM_VX_ENABLE_TEST "Build the unit test" OFF)
option(TIM_VX_ENABLE_LAYOUT_INFER "Enable layout inference support" ON)
option(TIM_VX_ENABLE_NBG_PARSER "Enable NBG parser" OFF)
option(TIM_VX_ENABLE_NBG_PARSER "Enable NBG parser" ON)
option(TIM_VX_CODE_COVERAGE "Run code coverage with gcov (gcc only)" OFF)
option(TIM_VX_USE_EXTERNAL_OVXLIB "Use external OVXLIB" OFF)
option(TIM_VX_BUILD_EXAMPLES "Build demos show general usage" OFF)
option(TIM_VX_BUILD_EXAMPLES "Build demos show general usage" ON)
option(TIM_VX_ENABLE_VIPLITE "Enable lite driver api support" OFF)
option(TIM_VX_ENABLE_40BIT "Enable large memory support" OFF)
option(TIM_VX_ENABLE_PLATFORM "Enable multi devices support" OFF)

View File

@ -35,7 +35,7 @@ Main Features
- [TVM](https://github.com/VeriSilicon/tvm) (Fork)
- [Paddle-Lite](https://github.com/PaddlePaddle/Paddle-Lite) (Official)
- [OpenCV](https://github.com/opencv/opencv/wiki/TIM-VX-Backend-For-Running-OpenCV-On-NPU) (Official)
- MLIR Dialect (In development)
- [ONNXRuntime](https://github.com/microsoft/onnxruntime/tree/main/onnxruntime/core/providers/vsinpu) (Official)
Feel free to raise a github issue if you wish to add TIM-VX for other frameworks.

View File

@ -1 +1 @@
1.2.14
1.2.22

View File

@ -9,7 +9,11 @@ list(APPEND OVXDRV_INCLUDE_DIRS
if("${CONFIG}" STREQUAL "BUILDROOT")
set(VIV_SDK_DRIVER_PREFIX "usr/lib")
else()
set(VIV_SDK_DRIVER_PREFIX "drivers")
if(EXISTS ${EXTERNAL_VIV_SDK}/drivers)
set(VIV_SDK_DRIVER_PREFIX "drivers")
else()
set(VIV_SDK_DRIVER_PREFIX "lib")
endif()
endif()
message("using driver libs from ${EXTERNAL_VIV_SDK}/${VIV_SDK_DRIVER_PREFIX}")

File diff suppressed because one or more lines are too long

Image diff (timvx_overview.svg): 56 KiB before → 53 KiB after

View File

@ -25,72 +25,58 @@
#define TIM_VX_LITE_NATIVE_H_
#include "tim/vx/platform/platform.h"
#include "vip_lite.h"
#include "nbg_linker.h"
namespace tim {
namespace vx {
namespace platform {
class LiteNativeExecutor
: public IExecutor,
public std::enable_shared_from_this<LiteNativeExecutor> {
class LiteNativeDevice : public IDevice {
public:
LiteNativeExecutor(const std::shared_ptr<IDevice>& device);
virtual ~LiteNativeExecutor();
bool Submit(const std::shared_ptr<IExecutable>& executable,
const std::shared_ptr<IExecutable>& ref,
bool after = true) override;
bool Trigger(bool async = false) override;
std::shared_ptr<IExecutable> Compile(
const std::shared_ptr<Graph>& graph) override;
private:
vip_task_descriptor_t* task_descriptor_;
vip_database database_;
virtual ~LiteNativeDevice() {};
virtual bool Submit(const std::shared_ptr<Graph>& graph) = 0;
virtual bool Trigger(bool async = false, async_callback cb = NULL) = 0;
virtual bool DeviceExit() = 0;
virtual void WaitDeviceIdle() = 0;
virtual std::shared_ptr<IExecutor> CreateExecutor(const int32_t core_index = 0,
const int32_t core_count = -1,
const std::shared_ptr<Context>& context = nullptr) = 0;
static std::vector<std::shared_ptr<IDevice>> Enumerate();
static bool vip_initialized;
};
class LiteNativeExecutor
: public IExecutor {
public:
virtual ~LiteNativeExecutor() {};
virtual bool Submit(const std::shared_ptr<IExecutable>& executable,
const std::shared_ptr<IExecutable>& ref,
bool after = true) = 0;
virtual bool Trigger(bool async = false) = 0;
virtual std::shared_ptr<IExecutable> Compile(
const std::shared_ptr<Graph>& graph) = 0;
};
class LiteNativeExecutable : public IExecutable {
public:
LiteNativeExecutable(const std::shared_ptr<IExecutor>& executor,
const std::vector<char>& nb_buf);
virtual ~LiteNativeExecutable();
void SetInput(const std::shared_ptr<ITensorHandle>& th) override;
void SetOutput(const std::shared_ptr<ITensorHandle>& th) override;
void GetOutput(
const std::vector<std::shared_ptr<ITensorHandle>>& th) override;
bool Submit(const std::shared_ptr<IExecutable>& ref, bool after) override;
bool Trigger(bool async) override;
bool Verify() override;
std::shared_ptr<ITensorHandle> AllocateTensor(
const TensorSpec& tensor_spec) override;
vip_network network_;
private:
void SetBuffer(vip_memory_t* dst, gcvip_videomemory_t* src);
int32_t input_count_;
int32_t output_count_;
gcvip_videomemory_t* coeff_;
gcvip_videomemory_t* command_;
gcvip_videomemory_t* memory_pool_;
gcvip_videomemory_t* others_;
gcvip_videomemory_t* pre_command_;
virtual ~LiteNativeExecutable() {};
virtual void SetInput(const std::shared_ptr<ITensorHandle>& th) = 0;
virtual void SetOutput(const std::shared_ptr<ITensorHandle>& th) = 0;
virtual void SetInputs(const std::vector<std::shared_ptr<ITensorHandle>>& ths) = 0;
virtual void SetOutputs(const std::vector<std::shared_ptr<ITensorHandle>>& ths) = 0;
virtual bool Submit(const std::shared_ptr<IExecutable>& ref, bool after) = 0;
virtual bool Trigger(bool async) = 0;
virtual bool Verify() = 0;
virtual std::shared_ptr<ITensorHandle> AllocateTensor(const TensorSpec& tensor_spec,
void* data = nullptr, uint32_t size = 0) = 0;
};
class LiteNativeTensorHandle : public ITensorHandle {
public:
LiteNativeTensorHandle(const std::shared_ptr<Tensor>& tensr);
virtual ~LiteNativeTensorHandle();
bool CopyDataToTensor(const void* data, uint32_t size_in_bytes) override;
bool CopyDataFromTensor(void* data) override;
gcvip_videomemory_t* tensor_buffer_;
virtual ~LiteNativeTensorHandle() {};
bool CopyDataToTensor(const void* data, uint32_t size_in_bytes) = 0;
bool CopyDataFromTensor(void* data) = 0;
};
} // namespace platform
} // namespace vx
} // namespace tim
#endif
#endif

View File

@ -37,51 +37,41 @@ class NativeDevice : public IDevice {
virtual bool Trigger(bool async = false, async_callback cb = NULL) = 0;
virtual bool DeviceExit() = 0;
virtual void WaitDeviceIdle() = 0;
virtual std::shared_ptr<IExecutor> CreateExecutor(const int32_t core_index = 0,
const int32_t core_count = -1,
const std::shared_ptr<Context>& context = nullptr) = 0;
static std::vector<std::shared_ptr<IDevice>> Enumerate();
};
class NativeExecutable : public IExecutable {
public:
NativeExecutable(const std::shared_ptr<IExecutor>& executor,
const std::vector<char>& nb_buf, size_t inputs,
size_t outputs);
~NativeExecutable(){};
void SetInput(const std::shared_ptr<ITensorHandle>& th) override;
void SetOutput(const std::shared_ptr<ITensorHandle>& th) override;
void GetOutput(
const std::vector<std::shared_ptr<ITensorHandle>>& th) override;
bool Submit(const std::shared_ptr<IExecutable>& ref,
bool after = true) override;
bool Trigger(bool async = false) override;
std::shared_ptr<ITensorHandle> AllocateTensor(
const TensorSpec& tensor_spec) override;
bool Verify() override;
protected:
std::shared_ptr<tim::vx::ops::NBG> nb_node_;
std::vector<char> nb_buf_;
virtual ~NativeExecutable() {};
virtual void SetInput(const std::shared_ptr<ITensorHandle>& th) = 0;
virtual void SetOutput(const std::shared_ptr<ITensorHandle>& th) = 0;
virtual void SetInputs(const std::vector<std::shared_ptr<ITensorHandle>>& ths) = 0;
virtual void SetOutputs(const std::vector<std::shared_ptr<ITensorHandle>>& ths) = 0;
virtual bool Submit(const std::shared_ptr<IExecutable>& ref,
bool after = true) = 0;
virtual bool Trigger(bool async = false) = 0;
virtual std::shared_ptr<ITensorHandle> AllocateTensor(const TensorSpec& tensor_spec,
void* data = nullptr, uint32_t size = 0) = 0;
virtual bool Verify() = 0;
};
class NativeExecutor : public IExecutor,
public std::enable_shared_from_this<NativeExecutor> {
class NativeExecutor : public IExecutor {
public:
NativeExecutor(const std::shared_ptr<IDevice>& device);
NativeExecutor(const std::shared_ptr<IDevice>& device,
const std::shared_ptr<Context>& context);
~NativeExecutor(){};
bool Submit(const std::shared_ptr<IExecutable>& executable,
const std::shared_ptr<IExecutable>& ref,
bool after = true) override;
bool Trigger(bool async = false) override;
std::shared_ptr<IExecutable> Compile(
const std::shared_ptr<Graph>& graph) override;
virtual ~NativeExecutor(){};
virtual bool Submit(const std::shared_ptr<IExecutable>& executable,
const std::shared_ptr<IExecutable>& ref,
bool after = true) = 0;
virtual bool Trigger(bool async = false) = 0;
virtual std::shared_ptr<IExecutable> Compile(const std::shared_ptr<Graph>& graph) = 0;
};
class NativeTensorHandle : public ITensorHandle {
public:
NativeTensorHandle(const std::shared_ptr<Tensor>& tensor);
bool CopyDataToTensor(const void* data, uint32_t size_in_bytes) override;
bool CopyDataFromTensor(void* data) override;
virtual bool CopyDataToTensor(const void* data, uint32_t size_in_bytes) = 0;
virtual bool CopyDataFromTensor(void* data) = 0;
};
} // namespace platform

View File

@ -46,15 +46,12 @@ namespace platform {
class IDevice;
class IExecutable;
class ExecutableSet;
class IExecutor;
class ITensorHandle;
std::shared_ptr<IExecutable> Compile(
const std::shared_ptr<Graph>& graph,
const std::shared_ptr<IExecutor>& executor);
std::shared_ptr<IExecutable> CreateExecutableSet(
const std::vector<std::shared_ptr<IExecutable>>& executables);
class IDevice {
public:
@ -68,17 +65,25 @@ class IDevice {
virtual ~IDevice(){};
virtual bool Submit(const std::shared_ptr<Graph>& graph) = 0;
virtual bool Trigger(bool async = false, async_callback cb = NULL) = 0;
device_id_t Id() const;
device_id_t Id() const { return device_id_;};
virtual void WaitDeviceIdle() = 0;
virtual bool DeviceExit() = 0;
virtual void RemoteReset();
uint32_t CoreCount() const {return core_count_;};
virtual std::shared_ptr<IExecutor> CreateExecutor(const int32_t core_index = 0,
const int32_t core_count = -1,
const std::shared_ptr<Context>& context = nullptr) = 0;
static std::vector<std::shared_ptr<IDevice>> Enumerate();
protected:
device_id_t device_id_;
uint32_t core_count_;
};
class IExecutor {
public:
//using task = std::shared_ptr<IExecutable>;
using task = std::weak_ptr<IExecutable>;
virtual ~IExecutor(){};
virtual bool Submit(const std::shared_ptr<IExecutable>& executable,
@ -87,13 +92,17 @@ class IExecutor {
virtual bool Trigger(bool async = false) = 0; // todo: async=true
virtual std::shared_ptr<IExecutable> Compile(
const std::shared_ptr<Graph>& graph) = 0;
virtual std::shared_ptr<IDevice> Device() const;
virtual std::shared_ptr<Context> Contex() const;
virtual std::shared_ptr<IDevice> Device() const {return device_;};
virtual std::shared_ptr<Context> Contex() const {return context_;};
virtual uint32_t CoreIndex() const {return core_index_; };
virtual uint32_t CoreCount() const {return core_count_; };
protected:
std::vector<task> tasks_;
std::shared_ptr<IDevice> device_;
std::shared_ptr<Context> context_;
uint32_t core_index_;
uint32_t core_count_;
};
class IExecutable : public std::enable_shared_from_this<IExecutable> {
@ -101,40 +110,24 @@ class IExecutable : public std::enable_shared_from_this<IExecutable> {
virtual ~IExecutable(){};
virtual void SetInput(const std::shared_ptr<ITensorHandle>& th) = 0;
virtual void SetOutput(const std::shared_ptr<ITensorHandle>& th) = 0;
virtual void GetOutput(
const std::vector<std::shared_ptr<ITensorHandle>>& th) = 0; // for remote
virtual void SetInputs(const std::vector<std::shared_ptr<ITensorHandle>>& ths) = 0;
virtual void SetOutputs(const std::vector<std::shared_ptr<ITensorHandle>>& ths) = 0;
virtual std::vector<std::shared_ptr<ITensorHandle>> GetOutputs() { return output_handles_;};
virtual std::vector<std::shared_ptr<ITensorHandle>> Getinputs() { return input_handles_;};
virtual bool Submit(const std::shared_ptr<IExecutable>& ref,
bool after = true) = 0;
virtual bool Trigger(bool async = false) = 0; // todo: async=true
virtual bool Verify() = 0;
virtual std::shared_ptr<Graph> NBGraph() const;
virtual std::shared_ptr<ITensorHandle> AllocateTensor(
const TensorSpec& tensor_spec) = 0;
virtual std::shared_ptr<IExecutor> Executor() const;
std::shared_ptr<Graph> NBGraph() const {return nb_graph_;};
virtual std::shared_ptr<ITensorHandle> AllocateTensor(const TensorSpec& tensor_spec ,
void* data = nullptr, uint32_t size = 0) = 0;
protected:
std::weak_ptr<IExecutor> executor_;
std::shared_ptr<Context> context_;
std::shared_ptr<Graph> nb_graph_;
};
class ExecutableSet : public IExecutable {
public:
ExecutableSet(const std::vector<std::shared_ptr<IExecutable>>& executables);
void SetInput(const std::shared_ptr<ITensorHandle>& th) override;
void SetOutput(const std::shared_ptr<ITensorHandle>& th) override;
void GetOutput(
const std::vector<std::shared_ptr<ITensorHandle>>& th) override;
bool Submit(const std::shared_ptr<IExecutable>& ref,
bool after = true) override;
bool Trigger(bool async = false) override;
bool Verify() override;
std::shared_ptr<ITensorHandle> AllocateTensor(
const TensorSpec& tensor_spec) override;
std::vector<std::shared_ptr<IExecutable>> Executables() const;
protected:
std::vector<std::shared_ptr<IExecutable>> executables_;
std::vector<std::shared_ptr<ITensorHandle>> input_handles_;
std::vector<std::shared_ptr<ITensorHandle>> output_handles_;
};
class ITensorHandle {
@ -142,13 +135,15 @@ class ITensorHandle {
virtual ~ITensorHandle(){};
virtual bool CopyDataToTensor(const void* data, uint32_t size_in_bytes) = 0;
virtual bool CopyDataFromTensor(void* data) = 0;
virtual std::shared_ptr<Tensor> GetTensor() const;
virtual std::shared_ptr<Tensor> GetTensor() const { return tensor_;};
virtual TensorSpec& GetSpec() { return spec_;};
protected:
std::shared_ptr<Tensor> tensor_;
TensorSpec spec_;
};
} // namespace platform
} // namespace vx
} // namespace tim
#endif
#endif
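Read together with the samples later in this diff, the refactored interfaces are used roughly as follows; a condensed sketch (graph construction elided, error handling omitted):

// Condensed usage sketch of the refactored platform API (graph setup elided).
auto context = tim::vx::Context::Create();
auto graph = context->CreateGraph();                       // ops/tensors built elsewhere
auto devices = tim::vx::platform::IDevice::Enumerate();    // now a static on IDevice
auto device = devices[0];
auto executor = device->CreateExecutor(0, -1, context);    // core_index 0, all cores
auto executable = executor->Compile(graph);                // compile the graph to an NBG
auto input = executable->AllocateTensor(graph->InputsTensor()[0]->GetSpec());
auto output = executable->AllocateTensor(graph->OutputsTensor()[0]->GetSpec());
executable->SetInput(input);
executable->SetOutput(output);
executable->Verify();
executable->Submit(executable);    // submitted relative to itself: no ordering constraint
executor->Trigger();               // run everything submitted on this executor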

View File

@ -20,9 +20,7 @@ endif()
if(TIM_VX_ENABLE_PLATFORM)
add_subdirectory("lenet_multi_device")
add_subdirectory("multi_device")
if(${TIM_VX_ENABLE_PLATFORM_LITE})
add_subdirectory("lite_multi_device")
endif()
add_subdirectory("platform_sample")
if(TIM_VX_ENABLE_GRPC)
add_subdirectory("grpc")
endif()

View File

@ -11,5 +11,10 @@ target_include_directories(${TARGET_NAME} PRIVATE
${PROJECT_SOURCE_DIR}/include
)
target_include_directories(${TARGET_NAME} PRIVATE
${CMAKE_CURRENT_SOURCE_DIR}
${PROJECT_SOURCE_DIR}/include
)
install(TARGETS ${TARGET_NAME} ${TARGET_NAME}
DESTINATION ${CMAKE_INSTALL_PREFIX}/${CMAKE_INSTALL_BINDIR})

View File

@ -33,7 +33,6 @@
#include "tim/vx/context.h"
#include "tim/vx/graph.h"
#include "tim/vx/platform/platform.h"
#include "tim/vx/platform/native.h"
std::vector<uint8_t> input_data = {
0, 0, 0, 0, 0, 0, 0, 0, 6, 0, 2, 0, 0, 8, 0,
@ -108,17 +107,17 @@ static void printTopN(const T* prob, int outputCount, int topNum) {
}
}
int main(int argc, char** argv) {
(void) argc, (void) argv;
auto context0 = tim::vx::Context::Create();
auto graph0 = lenet(context0);
auto graph1 = lenet(context0);
auto devices = tim::vx::platform::NativeDevice::Enumerate();
auto devices = tim::vx::platform::IDevice::Enumerate();
auto device = devices[0];
std::shared_ptr<tim::vx::platform::IExecutor> executor = std::make_shared<tim::vx::platform::NativeExecutor> (device);
auto executable0 = tim::vx::platform::Compile(graph0, executor); // compile to nbg
auto executor = device->CreateExecutor(0,-1,context0);
auto executable0 = tim::vx::platform::Compile(graph0, executor);
auto input_handle0 = executable0->AllocateTensor(graph0->InputsTensor()[0]->GetSpec());
auto output_handle0 = executable0->AllocateTensor(graph0->OutputsTensor()[0]->GetSpec());
executable0->SetInput(input_handle0);
@ -127,7 +126,18 @@ int main(int argc, char** argv) {
assert(executable0->Submit(executable0));
executable0->Trigger();
auto executable1 = tim::vx::platform::Compile(graph1, executor); // compile to nbg
std::vector<float> output_data;
output_data.resize(1 * 10);
if (!output_handle0->CopyDataFromTensor(output_data.data())) {
std::cout << "Copy output data fail." << std::endl;
return -1;
}
std::cout << "executable0 out." << std::endl;
printTopN(output_data.data(), output_data.size(), 5);
output_data.assign(output_data.size(),0);
output_handle0->CopyDataToTensor(output_data.data(), output_data.size() * sizeof(float));
auto executable1 = tim::vx::platform::Compile(graph1, executor);
auto input_handle1 = executable1->AllocateTensor(graph1->InputsTensor()[0]->GetSpec());
auto output_handle1 = executable1->AllocateTensor(graph1->OutputsTensor()[0]->GetSpec());
executable1->SetInput(input_handle1);
@ -136,34 +146,28 @@ int main(int argc, char** argv) {
assert(executable1->Submit(executable0));
executable1->Trigger();
std::vector<float> output_data1;
output_data1.resize(1 * 10);
if (!output_handle1->CopyDataFromTensor(output_data1.data())) {
std::cout << "Copy output data fail." << std::endl;
return -1;
}
std::cout << "executable1 out." << std::endl;
printTopN(output_data1.data(), output_data1.size(), 5);
output_data1.assign(output_data1.size(),0);
output_handle1->CopyDataToTensor(output_data1.data(), output_data1.size() * sizeof(float));
executor->Submit(executable0, executable0);
executor->Submit(executable1, executable0);
std::vector<std::shared_ptr<tim::vx::platform::IExecutable>> executables0;
executables0.push_back(executable0);
executables0.push_back(executable1);
auto executable_set0 = tim::vx::platform::CreateExecutableSet(executables0);
executor->Submit(executable_set0, executable_set0);
executor->Trigger();
std::vector<uint8_t> input_data0;
input_data0.resize(28 * 28);
if (!input_handle0->CopyDataFromTensor(input_data0.data())) {
std::cout << "Copy intput data fail." << std::endl;
return -1;
}
printTopN(input_data0.data(), input_data0.size(), 5);
std::vector<float> output_data;
output_data.resize(1 * 10);
std::cout << "executor out." << std::endl;
if (!output_handle0->CopyDataFromTensor(output_data.data())) {
std::cout << "Copy output data fail." << std::endl;
return -1;
}
printTopN(output_data.data(), output_data.size(), 5);
std::vector<float> output_data1;
output_data1.resize(1 * 10);
if (!output_handle1->CopyDataFromTensor(output_data1.data())) {
std::cout << "Copy output data fail." << std::endl;
return -1;

View File

@ -1,13 +0,0 @@
message("samples/lite_multi_device")
set(TARGET_NAME "lite_multi_device")
add_executable(${TARGET_NAME} ${CMAKE_CURRENT_SOURCE_DIR}/lite_multi_device.cc)
target_link_libraries(${TARGET_NAME} PRIVATE -Wl,--whole-archive tim-vx)
target_include_directories(${TARGET_NAME} PRIVATE
${PROJECT_SOURCE_DIR}/include
${PROJECT_SOURCE_DIR}/prebuilt-sdk/viplite/build/sdk/include)
install(TARGETS ${TARGET_NAME} ${TARGET_NAME}
DESTINATION ${CMAKE_INSTALL_PREFIX}/${CMAKE_INSTALL_BINDIR})

View File

@ -1,15 +1,25 @@
## brief
The multi_device demo uses some acuity exported tim-vx networks, and running on 4 devices of NPU using platform api.
The multi_device demo uses some acuity-exported tim-vx networks and runs them on multi-core NPU devices using the platform API.
## environment
export VSIMULATOR_CONFIG=VIP9400O_PID0XD9
export VIV_MGPU_AFFINITY="1:0"
export VIV_OVX_USE_MULTI_DEVICE="1:1"
export TIM_VX_ROOT="${workspaceFolder}/tim-vx"
## note
Please note that if you have enabled the lite platform, a dedicated VIVANTE_SDK (NO_KERNEL) is required as the compiler for NBG.
The driver for the NPU is the VIPLITE driver.
## requirements
Vivante SDK >= 6.4.22
ovxlib >= 1.2.26
viplite >= 2.0.0
## build
cd build
cmake .. -DCMAKE_BUILD_TYPE=Debug -DTIM_VX_BUILD_EXAMPLES=ON -DTIM_VX_ENABLE_PLATFORM=ON
cmake .. -DCMAKE_BUILD_TYPE=Release -DTIM_VX_USE_EXTERNAL_OVXLIB=ON -DEXTERNAL_VIV_SDK=${VIVANTE_NOKERNEL_SDK_DIR} -DOVXLIB_INC=${OVXLIB_DIR}/include \
-DOVXLIB_LIB=/path/to/libovxlib.so -DTIM_VX_BUILD_EXAMPLES=ON -DTIM_VX_ENABLE_PLATFORM=ON -DTIM_VX_ENABLE_PLATFORM_LITE=ON -DVIP_LITE_SDK=${VIP_LITE_SDK}
## environment
# Export VIV_GPU_FILE to specify the NPU hardware configuration file for the NBG compiler
# VIV_GPU_FILE specifies the NPU hardware configuration file for the NBG compiler
export VIV_GPU_FILE="/path/to/VIP9400NANOQ_PLUS_PID0X10000055.config"
export TIM_VX_ROOT="${workspaceFolder}/tim-vx"
## run
cd build

View File

@ -35,7 +35,6 @@
#include "tim/vx/context.h"
#include "tim/vx/graph.h"
#include "tim/vx/platform/platform.h"
#include "tim/vx/platform/native.h"
#include "vx_lenet.h"
#include "vx_mobilenet.h"
#include "vx_resnet50.h"
@ -59,7 +58,7 @@ static void printTopN(const T* prob, int outputCount, int topNum) {
}
template <typename T>
void print_topN(std::size_t size, std::shared_ptr<tim::vx::platform::ITensorHandle> handle) {
void print_topN(std::size_t size, std::shared_ptr<tim::vx::platform::ITensorHandle> & handle) {
std::vector<T> output_data;
output_data.resize(size);
if (!handle->CopyDataFromTensor(output_data.data())) {
@ -94,7 +93,8 @@ void executor_trigger(std::shared_ptr<tim::vx::platform::IExecutor> executor) {
}
auto context = tim::vx::Context::Create();
std::pair<std::shared_ptr<tim::vx::platform::IExecutable>, std::shared_ptr<tim::vx::platform::ITensorHandle>> generate_executable(
std::pair<std::shared_ptr<tim::vx::platform::IExecutable>, std::shared_ptr<tim::vx::platform::ITensorHandle>>
generate_executable(
std::shared_ptr<tim::vx::platform::IExecutor> executor,
std::function<void(std::shared_ptr<tim::vx::Graph>, const char*)> construct_func,
std::string weight_file,
@ -114,15 +114,17 @@ std::pair<std::shared_ptr<tim::vx::platform::IExecutable>, std::shared_ptr<tim::
int main(int argc, char** argv) {
(void) argc, (void) argv;
auto devices = tim::vx::platform::NativeDevice::Enumerate();
auto devices = tim::vx::platform::IDevice::Enumerate();
auto device0 = devices[0];
std::shared_ptr<tim::vx::platform::IExecutor> executor0 = std::make_shared<tim::vx::platform::NativeExecutor> (device0);
auto device1 = devices[1];
std::shared_ptr<tim::vx::platform::IExecutor> executor1 = std::make_shared<tim::vx::platform::NativeExecutor> (device1);
auto device2 = devices[2];
std::shared_ptr<tim::vx::platform::IExecutor> executor2 = std::make_shared<tim::vx::platform::NativeExecutor> (device2);
auto device3 = devices[3];
std::shared_ptr<tim::vx::platform::IExecutor> executor3 = std::make_shared<tim::vx::platform::NativeExecutor> (device3);
auto total_core_count = device0->CoreCount();
uint32_t core_index = 0;
auto use_core_count = 1;
std::vector<std::shared_ptr<tim::vx::platform::IExecutor>> executors;
for(core_index = 0; core_index < total_core_count; core_index += use_core_count) {
auto executor = device0->CreateExecutor(core_index,use_core_count, context);
executors.push_back(executor);
}
auto root = std::getenv("TIM_VX_ROOT");
assert(root != NULL);
@ -142,46 +144,57 @@ int main(int argc, char** argv) {
auto resnet50_weight_file = ROOT + "/samples/multi_device/resnet50/resnet50.export.data";
std::function<void(std::shared_ptr<tim::vx::Graph>, const char*)> resnet50_construct_func = acuitylite::resnet50::construct_graph;
std::shared_ptr<tim::vx::platform::IExecutable> lenet_0, lenet_2, lenet_3, mobilenet_1, mobilenet_2, mobilenet_3, resnet50_0, resnet50_1;
std::shared_ptr<tim::vx::platform::ITensorHandle> lenet_0_outhandle, lenet_2_outhandle, lenet_3_outhandle, mobilenet_1_outhandle, mobilenet_2_outhandle, mobilenet_3_outhandle,
resnet50_0_outhandle, resnet50_1_outhandle;
auto executor_cnt = executors.size();
std::tie(lenet_0, lenet_0_outhandle) = generate_executable(executor0, lenet_construct_func, lenet_weight_file, lenet_input_files, lenet_input_bytes);
std::tie(resnet50_0, resnet50_0_outhandle) = generate_executable(executor0, resnet50_construct_func, resnet50_weight_file, resnet50_input_files, resnet50_input_bytes);
executor0->Submit(lenet_0, lenet_0);
executor0->Submit(resnet50_0, lenet_0);
//each executor runs 2 models.
auto lenet = [&](std::shared_ptr<tim::vx::platform::IExecutor> executor) {
return generate_executable(executor, lenet_construct_func, lenet_weight_file,
lenet_input_files, lenet_input_bytes);
};
auto resnet = [&](std::shared_ptr<tim::vx::platform::IExecutor> executor) {
return generate_executable(executor, resnet50_construct_func, resnet50_weight_file,
resnet50_input_files, resnet50_input_bytes);
};
auto mobilenet = [&](std::shared_ptr<tim::vx::platform::IExecutor> executor) {
return generate_executable(executor, mobilenet_construct_func, mobilenet_weight_file,
mobilenet_input_files, mobilenet_input_bytes);
};
std::vector<std::pair<std::shared_ptr<tim::vx::platform::IExecutable>,
std::shared_ptr<tim::vx::platform::ITensorHandle>>> nets;
for (size_t i = 0; i < executor_cnt; i++) {
if(i % 3 == 0) {
//lenet + resnet
nets.push_back(lenet(executors[i]));
executors[i]->Submit(nets.back().first, nets.back().first);
nets.push_back(resnet(executors[i]));
executors[i]->Submit(nets.back().first, nets.back().first);
}
if(i % 3 == 1) {
//resnet + mobilenet
nets.push_back(resnet(executors[i]));
executors[i]->Submit(nets.back().first, nets.back().first);
nets.push_back(mobilenet(executors[i]));
executors[i]->Submit(nets.back().first, nets.back().first);
}
if(i % 3 == 2) {
//lenet + mobilenet
nets.push_back(mobilenet(executors[i]));
executors[i]->Submit(nets.back().first, nets.back().first);
nets.push_back(lenet(executors[i]));
executors[i]->Submit(nets.back().first, nets.back().first);
}
}
std::vector<std::thread> threads;
for(auto executor:executors) {
threads.push_back(std::thread(executor_trigger, executor));
}
for(std::thread &t : threads) {
t.join();
}
std::tie(mobilenet_1, mobilenet_1_outhandle) = generate_executable(executor1, mobilenet_construct_func, mobilenet_weight_file, mobilenet_input_files, mobilenet_input_bytes);
std::tie(resnet50_1, resnet50_1_outhandle) = generate_executable(executor1, resnet50_construct_func, resnet50_weight_file, resnet50_input_files, resnet50_input_bytes);
auto executable_set1 = tim::vx::platform::CreateExecutableSet({mobilenet_1, resnet50_1});
executor1->Submit(executable_set1, executable_set1);
std::tie(lenet_2, lenet_2_outhandle) = generate_executable(executor2, lenet_construct_func, lenet_weight_file, lenet_input_files, lenet_input_bytes);
std::tie(mobilenet_2, mobilenet_2_outhandle) = generate_executable(executor2, mobilenet_construct_func, mobilenet_weight_file, mobilenet_input_files, mobilenet_input_bytes);
auto executable_set2 = tim::vx::platform::CreateExecutableSet({lenet_2, mobilenet_2});
executor2->Submit(executable_set2, executable_set2);
std::tie(lenet_3, lenet_3_outhandle) = generate_executable(executor3, lenet_construct_func, lenet_weight_file, lenet_input_files, lenet_input_bytes);
std::tie(mobilenet_3, mobilenet_3_outhandle) = generate_executable(executor3, mobilenet_construct_func, mobilenet_weight_file, mobilenet_input_files, mobilenet_input_bytes);
auto executable_set3 = tim::vx::platform::CreateExecutableSet({lenet_3, mobilenet_3});
executor3->Submit(executable_set3, executable_set3);
std::thread t0(executor_trigger, executor0);
std::thread t1(executor_trigger, executor1);
std::thread t2(executor_trigger, executor2);
std::thread t3(executor_trigger, executor3);
t0.join();
t1.join();
t2.join();
t3.join();
print_topN<float>(1 * 10, lenet_0_outhandle);
print_topN<float>(1 * 10, lenet_2_outhandle);
print_topN<float>(1 * 10, lenet_3_outhandle);
print_topN<float>(1 * 1001, mobilenet_1_outhandle);
print_topN<float>(1 * 1001, mobilenet_2_outhandle);
print_topN<float>(1 * 1001, mobilenet_3_outhandle);
print_topN<uint16_t>(1 * 1000, resnet50_0_outhandle);
print_topN<uint16_t>(1 * 1000, resnet50_1_outhandle);
for (auto net : nets) {
auto size = net.second->GetSpec().GetElementNum();
print_topN<float>(size, net.second);
}
return 0;
}

View File

@ -29,7 +29,7 @@
#include "tim/vx/graph.h"
#include "tim/vx/operation.h"
#include "tim/vx/tensor.h"
#include "tim/vx/platform/native.h"
#include "tim/vx/platform/platform.h"
static void printTopN() {
}
@ -46,9 +46,9 @@ int demo(int argc, char** argv) {
tim::vx::TensorSpec g0_input0, g0_output0, g1_output0, g2_output0, g3_output0, g4_output0, g5_output0;
// query device and get executor of device
auto devices = tim::vx::platform::NativeDevice::Enumerate();
auto devices = tim::vx::platform::IDevice::Enumerate();
auto device = devices[0];
std::shared_ptr<tim::vx::platform::IExecutor> executor = std::make_shared<tim::vx::platform::NativeExecutor> (device);
auto executor = device->CreateExecutor(0,-1, context);
// executable0
auto executable0 = executor->Compile(g0); // compile to nbg
@ -89,33 +89,6 @@ int demo(int argc, char** argv) {
// trigger
executor->Trigger(); // run all submitted executables
/* 2. another way to run */
// executable_set0
std::vector<std::shared_ptr<tim::vx::platform::IExecutable>> executables0;
executables0.push_back(executable0);
auto executable_set0 = CreateExecutableSet(executables0);
// executable_set1
std::vector<std::shared_ptr<tim::vx::platform::IExecutable>> executables1;
executables1.push_back(executable1);
executables1.push_back(executable3);
auto executable_set1 = CreateExecutableSet(executables1);
// executable_set2
std::vector<std::shared_ptr<tim::vx::platform::IExecutable>> executables2;
executables2.push_back(executable2);
executables2.push_back(executable4);
auto executable_set2 = CreateExecutableSet(executables2);
// executable_set3
std::vector<std::shared_ptr<tim::vx::platform::IExecutable>> executables3;
executables3.push_back(executable5);
auto executable_set3 = CreateExecutableSet(executables3);
// submit executaleSets
executable_set0->Submit(executable_set0);
executable_set1->Submit(executable_set0);
executable_set2->Submit(executable_set1);
executable_set3->Submit(executable_set2);
// trigger
executor->Trigger(); // run all submitted executableSets
printTopN();
return 0;

View File

@ -1296,7 +1296,7 @@ void resnet50::construct_graph
auto input_0 = graph->CreateTensor(input_0_spec);
tim::vx::ShapeType output_229_shape({1000,1});
tim::vx::TensorSpec output_229_spec(tim::vx::DataType::FLOAT16, output_229_shape,
tim::vx::TensorSpec output_229_spec(tim::vx::DataType::FLOAT32, output_229_shape,
tim::vx::TensorAttribute::OUTPUT);
auto output_229 = graph->CreateTensor(output_229_spec);

View File

@ -0,0 +1,13 @@
message("samples/platform_sample")
set(TARGET_NAME "platform_sample")
add_executable(${TARGET_NAME} ${CMAKE_CURRENT_SOURCE_DIR}/platform_sample.cc)
target_link_libraries(${TARGET_NAME} PRIVATE -Wl,--whole-archive tim-vx)
target_include_directories(${TARGET_NAME} PRIVATE
${CMAKE_CURRENT_SOURCE_DIR}
${PROJECT_SOURCE_DIR}/include)
install(TARGETS ${TARGET_NAME} ${TARGET_NAME}
DESTINATION ${CMAKE_INSTALL_PREFIX}/${CMAKE_INSTALL_BINDIR})

View File

@ -0,0 +1,25 @@
## brief
This sample demonstrates basic usage of the platform API.
## note
Please note that if you have enabled the lite platform, a dedicated VIVANTE_SDK (NO_KERNEL) is required as the compiler for NBG.
The driver for the NPU is the VIPLITE driver.
## requirements
Vivante SDK >= 6.4.22
ovxlib >= 1.2.26
viplite >= 2.0.0
## build
cd build
cmake .. -DCMAKE_BUILD_TYPE=Release -DTIM_VX_USE_EXTERNAL_OVXLIB=ON -DEXTERNAL_VIV_SDK=${VIVANTE_NOKERNEL_SDK_DIR} -DOVXLIB_INC=${OVXLIB_DIR}/include \
-DOVXLIB_LIB=${VIVANTE_NOKERNEL_SDK_DIR}/drivers/libovxlib.so -DTIM_VX_BUILD_EXAMPLES=ON -DTIM_VX_ENABLE_PLATFORM=ON \
-DTIM_VX_ENABLE_PLATFORM_LITE=ON -DVIP_LITE_SDK=${VIP_LITE_SDK}
## environment
# Export VIV_GPU_FILE to specify the NPU hardware configuration file for the NBG compiler
export VIV_GPU_FILE="/path/to/VIP9000NANOQ_PLUS_PID0X100000XX.config"
## run
cd build
./samples/platform_sample/platform_sample

View File

@ -26,8 +26,8 @@
#include "tim/vx/graph.h"
#include "tim/vx/ops.h"
#include "tim/vx/types.h"
#include "tim/vx/platform/native.h"
#include "tim/vx/platform/lite/lite_native.h"
#include "tim/vx/platform/platform.h"
int main() {
//construct tim-vx graph
@ -49,9 +49,15 @@ int main() {
std::vector<int> data_vec_i0({1, 2, 3, 4});
std::vector<int> data_vec_i1({4, 3, 2, 1});
auto devices = tim::vx::platform::NativeDevice::Enumerate();
auto devices = tim::vx::platform::IDevice::Enumerate();
std::cout << "NPU device count: " << devices.size() <<std::endl;
auto device = devices[0];
auto executor = std::make_shared<tim::vx::platform::LiteNativeExecutor>(device);
//use all cores of device 0 (core_count = -1 selects every core)
std::cout << "NPU device[0] has " << device->CoreCount() << " cores" << std::endl;
auto use_core_count = -1;
auto executor = device->CreateExecutor(0, use_core_count);
auto executable = executor->Compile(graph);
auto input0_handle = executable->AllocateTensor(input_spec);
auto input1_handle = executable->AllocateTensor(input_spec);
@ -73,6 +79,10 @@ int main() {
//each output value should be "5" in this demo
for (int i = 0; i < 4; ++i) {
std::cout << "output value: " << data[i] << std::endl;
if(data[i] != 5) {
std::cout << "test failed" << std::endl;
break;
}
}
free(data);
return 0;

View File

@ -61,8 +61,10 @@ if(TIM_VX_ENABLE_PLATFORM)
endif()
list(APPEND LITE_EXTERNAL_LIBS
${VIP_LITE_SDK}/drivers/libNBGlinker.so
${VIP_LITE_SDK}/drivers/libVIPlite.so)
list(APPEND LITE_INC_DIRS ${VIP_LITE_SDK}/include)
${VIP_LITE_SDK}/drivers/libVIPhal.so)
list(APPEND LITE_INC_DIRS
${VIP_LITE_SDK}/include
${VIP_LITE_SDK}/include/nbg_linker)
endif()
if(TIM_VX_ENABLE_GRPC)

View File

@ -26,6 +26,7 @@
#include <array>
#include <cassert>
#include <cstdint>
#include <memory>
#include <vector>
#include <string>

View File

@ -9,3 +9,4 @@ DEF_NODE_TYPE(custom_sample)
DEF_NODE_TYPE(custom_tiny_yolov4_postprocess)
DEF_NODE_TYPE(custom_tiny_yolov4_postprocess_confidence)
DEF_NODE_TYPE(custom_tiny_yolov4_postprocess_box)
DEF_NODE_TYPE(custom_letterbox)

View File

@ -9,3 +9,4 @@ DEF_OP(CUSTOM_SAMPLE)
DEF_OP(CUSTOM_TINY_YOLOV4_POSTPROCESS)
DEF_OP(CUSTOM_TINY_YOLOV4_POSTPROCESS_CONFIDENCE)
DEF_OP(CUSTOM_TINY_YOLOV4_POSTPROCESS_BOX)
DEF_OP(CUSTOM_LETTERBOX)

View File

@ -0,0 +1,61 @@
/****************************************************************************
*
* Copyright (c) 2020 Vivante Corporation
*
* Permission is hereby granted, free of charge, to any person obtaining a
* copy of this software and associated documentation files (the "Software"),
* to deal in the Software without restriction, including without limitation
* the rights to use, copy, modify, merge, publish, distribute, sublicense,
* and/or sell copies of the Software, and to permit persons to whom the
* Software is furnished to do so, subject to the following conditions:
*
* The above copyright notice and this permission notice shall be included in
* all copies or substantial portions of the Software.
*
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
* AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
* DEALINGS IN THE SOFTWARE.
*
*****************************************************************************/
#ifndef _VSI_NN_OP_CUSTOM_LETTERBOX_H
#define _VSI_NN_OP_CUSTOM_LETTERBOX_H
#include "vsi_nn_types.h"
#ifdef __cplusplus
extern "C" {
#endif
typedef struct _vsi_nn_custom_letterbox_param
{
struct _custom_letterbox_local_data_t* local;
int32_t new_shape_w;
int32_t new_shape_h;
vx_bool auto_bool;
vx_bool scaleFill;
vx_bool scaleup;
int32_t stride;
vx_bool center;
float mean_r;
float mean_g;
float mean_b;
float scale_r;
float scale_g;
float scale_b;
int32_t pad_value_r;
int32_t pad_value_g;
int32_t pad_value_b;
vx_bool reverse_channel;
} vsi_nn_custom_letterbox_param;
_compiler_assert(offsetof(vsi_nn_custom_letterbox_param, local) == 0, \
vsi_nn_custom_letterbox_h );
#ifdef __cplusplus
}
#endif
#endif
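As a hedged sketch of how the new parameter block might be filled (values illustrative; node wiring and defaults are assumptions, not taken from this diff):

#include <cstring>   // memset

// Hypothetical setup; field meanings follow common letterbox preprocessing
// (resize into new_shape, pad the remainder). All values are illustrative.
static void fill_letterbox_params(vsi_nn_custom_letterbox_param* p) {
  std::memset(p, 0, sizeof(*p));
  p->new_shape_w = 640;             // target width after letterboxing (assumed)
  p->new_shape_h = 640;             // target height (assumed)
  p->scaleup = vx_true_e;           // allow small inputs to be scaled up
  p->stride = 32;                   // pad the result to a multiple of the model stride
  p->center = vx_true_e;            // distribute padding evenly on both sides
  p->pad_value_r = p->pad_value_g = p->pad_value_b = 114;  // common fill value
  p->reverse_channel = vx_false_e;  // keep RGB channel order
}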

View File

@ -34,5 +34,6 @@
#include "custom/ops/vsi_nn_op_custom_tiny_yolov4_postprocess.h"
#include "custom/ops/vsi_nn_op_custom_tiny_yolov4_postprocess_confidence.h"
#include "custom/ops/vsi_nn_op_custom_tiny_yolov4_postprocess_box.h"
#include "custom/ops/vsi_nn_op_custom_letterbox.h"
#endif

View File

@ -203,3 +203,4 @@ DEF_OP(BITCAST)
DEF_OP(GROUPED_CONV3D)
DEF_OP(COL2IM)
DEF_OP(L1_LAYER_NORM)
DEF_OP(ROPE)

View File

@ -80,7 +80,7 @@ typedef struct _vsi_nn_pre_process_rgb_param
float g_scale;
float b_scale;
/* pre process rgb layer local data structure */
vsi_nn_pre_process_rgb_lcl_data local;
vsi_nn_pre_process_rgb_lcl_data *local;
} vsi_nn_pre_process_rgb_param;
#ifdef __cplusplus

View File

@ -1,6 +1,6 @@
/****************************************************************************
*
* Copyright (c) 2020-2023 Vivante Corporation
* Copyright (c) 2020 Vivante Corporation
*
* Permission is hereby granted, free of charge, to any person obtaining a
* copy of this software and associated documentation files (the "Software"),
@ -21,38 +21,29 @@
* DEALINGS IN THE SOFTWARE.
*
*****************************************************************************/
#ifndef TIM_VX_NATIVE_DEVICE_PRIVATE_H_
#define TIM_VX_NATIVE_DEVICE_PRIVATE_H_
#include "tim/vx/platform/native.h"
#include "vip/virtual_device.h"
#include "graph_private.h"
namespace tim {
namespace vx {
class GraphImpl;
namespace platform {
class NativeDeviceImpl : public NativeDevice {
public:
NativeDeviceImpl(device_id_t id);
~NativeDeviceImpl(){};
bool Submit(const std::shared_ptr<tim::vx::Graph>& graph) override;
bool Trigger(bool async = false, async_callback cb = NULL) override;
bool DeviceExit() override;
void WaitDeviceIdle() override;
protected:
std::unique_ptr<vip::IDevice> vip_device_;
std::vector<vsi_nn_graph_t*> vsi_graph_v_;
};
} // namespace platform
} // namespace vx
} // namespace tim
#endif /* TIM_VX_NATIVE_DEVICE_PRIVATE_H_*/
#ifndef _VSI_NN_OP_ROPE_H
#define _VSI_NN_OP_ROPE_H
#include "vsi_nn_types.h"
#ifdef __cplusplus
extern "C" {
#endif
typedef struct _vsi_nn_rope_param
{
struct _rope_local_data_t* local;
// Add parameters here
int32_t axis;
vsi_bool interleaved;
} vsi_nn_rope_param;
_compiler_assert(offsetof(vsi_nn_rope_param, local) == 0, \
vsi_nn_rope_h );
#ifdef __cplusplus
}
#endif
#endif

View File

@ -34,6 +34,7 @@ typedef struct _vsi_nn_topk_param
{
uint32_t k;
int32_t axis;
struct _topk_local_data_t* local;
} vsi_nn_topk_param;
#ifdef __cplusplus

View File

@ -384,25 +384,17 @@ static VSI_INLINE_API float fp16_to_fp32
static VSI_INLINE_API float bfp16_to_fp32
(
int16_t in
uint16_t in
)
{
uint32_t t1, t2, t3;
float out;
fp32_bit_cast_t fp32_bit_cast;
t1 = in & 0x00FF; // Mantissa
t2 = in & 0xFF00; // Sign bit + Exponent
t3 = in & 0x7F00; // Exponent
fp32_bit_cast.data = (uint32_t)(in << 16);
t1 <<= 16;
t2 <<= 16; // Shift (sign + Exponent) bit into position
t1 |= t2; // Re-insert (sign + Exponent) bit
fp32_bit_cast.data = t1;
out = fp32_bit_cast.val;
return t3 == 0 ? 0.0f : out;
return out;
} /* bfp16_to_fp32() */
static VSI_INLINE_API uint16_t fp32_to_fp16
@ -720,7 +712,7 @@ static VSI_INLINE_API vsi_status dtype_to_float32
*dst = fp16_to_fp32( *(int16_t *)src );
break;
case VSI_NN_TYPE_BFLOAT16:
*dst = bfp16_to_fp32( *(int16_t *)src );
*dst = bfp16_to_fp32( *(uint16_t *)src );
break;
case VSI_NN_TYPE_FLOAT8_E4M3:
*dst = fp8_e4m3_to_fp32(*(int8_t*)src, src_dtype->scale);
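The rewrite leans on bfloat16 being exactly the top 16 bits of an IEEE-754 float: the new code widens the unsigned input, shifts it into the high half, and bit-casts, and it drops the old version's flush of zero-exponent values to 0.0f. A self-contained sketch of the same conversion:

#include <cstdint>
#include <cstring>

// bfloat16 holds the top 16 bits of a binary32 value, so converting back is a
// 16-bit left shift plus a bit-level copy (mirrors the fp32_bit_cast approach).
static float bf16_to_f32(uint16_t in) {
  uint32_t bits = static_cast<uint32_t>(in) << 16;
  float out;
  std::memcpy(&out, &bits, sizeof(out));
  return out;
}
// e.g. 0x3F80 -> 1.0f, 0xBF80 -> -1.0f, 0x0000 -> 0.0f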

File diff suppressed because it is too large

View File

@ -61,14 +61,13 @@ typedef struct _vsi_nn_hw_config_t
{
char target_name[VSI_NN_MAX_TARGET_NAME];
vsi_nn_hw_evis_t evis;
#if VX_HARDWARE_CAPS_PARAMS_EXT_SUPPORT
uint32_t subGroupSize;
#endif
uint32_t use_40bits_va;
uint32_t support_stream_processor;
uint32_t sp_exec_count;
uint32_t sp_vector_depth;
uint32_t sp_per_core_vector_depth;
uint32_t support_ffd;
} vsi_nn_hw_config_t;
typedef struct _vsi_nn_runtime_option_t
@ -89,6 +88,7 @@ typedef struct _vsi_nn_runtime_option_t
int32_t enable_save_file_type;
int32_t enable_use_image_process;
int32_t enable_use_from_handle;
vsi_nn_hw_config_t config;
} vsi_nn_runtime_option_t;
/**
@ -101,6 +101,15 @@ typedef struct _vsi_nn_context_t
vsi_nn_runtime_option_t options;
} VSI_PUBLIC_TYPE *vsi_nn_context_t;
/**
* Query and set options->config hw params.
*/
OVXLIB_API vsi_status query_hardware_caps_runtime
(
vsi_nn_context_t ctx,
vsi_nn_runtime_option_t *options
);
/**
* Create context
* Create ovxlib NN runtime context.
@ -113,6 +122,11 @@ OVXLIB_API vsi_status vsi_nn_initOptions
(
vsi_nn_runtime_option_t *options
);
OVXLIB_API vsi_status vsi_nn_initOptions_runtime
(
vsi_nn_runtime_option_t *options,
vsi_nn_context_t ctx
);
/**
* Release context
* Release ovxlib NN runtime resource and reset context handle to NULL.

View File

@ -57,5 +57,8 @@
#define VSI_PER_GROUP_QUANTIZATION_SUPPORT
#endif
#define VSI_GRAPH_RUNTIME_ENV_SUPPORT
#if defined(VX_TENSOR_SPARSITY_SUPPORT)
#define VSI_TENSOR_SPARSITY_SUPPORT
#endif
#endif

View File

@ -216,6 +216,7 @@
#include "ops/vsi_nn_op_grouped_conv3d.h"
#include "ops/vsi_nn_op_col2im.h"
#include "ops/vsi_nn_op_l1_layer_norm.h"
#include "ops/vsi_nn_op_rope.h"
/* custom node head define define */
#include "custom/vsi_nn_custom_node_type.h"
#include "ops/vsi_nn_op_inverse_sigmoid.h"
@ -420,6 +421,7 @@ typedef union _vsi_nn_nn_param
vsi_nn_grouped_conv3d_param grouped_conv3d;
vsi_nn_col2im_param col2im;
vsi_nn_l1_layer_norm_param l1_layer_norm;
vsi_nn_rope_param rope;
void* client_param;
/* custom node data struct define */

View File

@ -86,8 +86,10 @@ typedef enum
VSI_NN_QNT_TYPE_SYMMETRIC_FLOAT8 = 0x6,
/** perchannel float8 */
VSI_NN_QNT_TYPE_PERCHANNEL_SYMMETRIC_FLOAT8 = 0x7,
/** GPQT */
/** pergroup symmetric */
VSI_NN_QNT_TYPE_AFFINE_PERCHANNEL_GROUP_SYMMETRIC = 0x8,
/** pergroup asymmetric */
VSI_NN_QNT_TYPE_AFFINE_PERCHANNEL_GROUP_ASYMMETRIC = 0x9,
/** undefined type */
VSI_NN_QNT_TYPE_NA = 0xff,
} vsi_nn_qnt_type_e;
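For the two per-group variants, each group of consecutive values along the quantized axis carries its own scale (and, in the asymmetric case, its own zero point), so dequantization is roughly x ≈ scale_g * (q - zp_g) with g the group index; this appears to be the GPTQ-style scheme the replaced comment abbreviated.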

View File

@ -418,6 +418,34 @@ OVXLIB_API vsi_status vsi_nn_SetTensorIsScalar
int8_t is_scalar
);
/**
* Get Tensor is_sparsity
* Get the is_sparsity flag of the tensor
*
* @param[in] tensor Tensor.
*
* @return is_sparsity flag of the tensor.
*/
OVXLIB_API int32_t vsi_nn_GetTensorIsSparsity
(
vsi_nn_tensor_t* tensor
);
/**
* Set Weight Tensor whether is sparsity
* Set the is_sparsity for the tensor
*
* @param[in] tensor Tensor.
* @param[in] is_sparsity New value of the is_sparsity flag.
*
* @return VSI_SUCCESS on success, or error code otherwise.
**/
OVXLIB_API vsi_status vsi_nn_SetTensorIsSparsity(
vsi_nn_tensor_t* tensor,
int32_t is_sparsity
);
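// Usage sketch (an assumption, not part of this header): mark a weight tensor
// sparse, then read the flag back; treating the flag as 0/1 is inferred from
// the int32_t type.
//   if (vsi_nn_SetTensorIsSparsity(weights, 1) == VSI_SUCCESS &&
//       vsi_nn_GetTensorIsSparsity(weights)) {
//     /* select sparsity-aware weight handling here */
//   }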
OVXLIB_API vsi_status vsi_nn_CopyRawDataToTensor
(
vsi_nn_graph_t* graph,

View File

@ -33,7 +33,7 @@ extern "C"{
#define VSI_NN_VERSION_MAJOR 1
#define VSI_NN_VERSION_MINOR 2
#define VSI_NN_VERSION_PATCH 14
#define VSI_NN_VERSION_PATCH 22
#define VSI_NN_VERSION \
(VSI_NN_VERSION_MAJOR * 10000 + VSI_NN_VERSION_MINOR * 100 + VSI_NN_VERSION_PATCH)
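With this bump, VSI_NN_VERSION evaluates to 1 * 10000 + 2 * 100 + 22 = 10222, up from 10214 for release 1.2.14.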

View File

@ -0,0 +1,475 @@
/****************************************************************************
*
* Copyright (c) 2020 Vivante Corporation
*
* Permission is hereby granted, free of charge, to any person obtaining a
* copy of this software and associated documentation files (the "Software"),
* to deal in the Software without restriction, including without limitation
* the rights to use, copy, modify, merge, publish, distribute, sublicense,
* and/or sell copies of the Software, and to permit persons to whom the
* Software is furnished to do so, subject to the following conditions:
*
* The above copyright notice and this permission notice shall be included in
* all copies or substantial portions of the Software.
*
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
* AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
* DEALINGS IN THE SOFTWARE.
*
*****************************************************************************/
#include <stdint.h>
#include <stdlib.h>
#include <string.h>
#include "vsi_nn_types.h"
#include "vsi_nn_tensor.h"
#include "vsi_nn_graph.h"
#include "vsi_nn_log.h"
#include "vsi_nn_error.h"
#include "vsi_nn_prv.h"
#include "vsi_nn_tensor_util.h"
#include "utils/vsi_nn_util.h"
#include "kernel/vsi_nn_kernel.h"
#include "utils/vsi_nn_dtype_util.h"
#include "utils/vsi_nn_dtype_util_prv.h"
__BEGIN_DECLS
/*
* Define kernel meta.
*/
#define _CUSTOM_LETTERBOX_KERNEL_SOURCE "custom_letterbox"
// Add kernel hashtable here
#define CUSTOM_LETTERBOX_HASH_KEY( IN_DTYPE, OUT_DTYPE ) \
(( IN_DTYPE ) | ( OUT_DTYPE << 8 ))
#define PACK_KERNEL_MAP( IN_DTYPE, OUT_DTYPE ) \
{ CUSTOM_LETTERBOX_HASH_KEY( IN_DTYPE, OUT_DTYPE ), \
CVIVANTE_NAMESPACE("evis.custom_letterbox_"#IN_DTYPE"to"#OUT_DTYPE), \
_CUSTOM_LETTERBOX_KERNEL_SOURCE }
typedef struct
{
uint32_t key;
char * function_name;
const char * source_name;
} _kernel_map_type;
static const _kernel_map_type _custom_letterbox_kernel_map[] =
{
// Register kernel here
PACK_KERNEL_MAP( U8, U8 ),
PACK_KERNEL_MAP( U8, I8 ),
PACK_KERNEL_MAP( U8, F16 ),
};
/*
* Kernel params
*/
static vx_param_description_t _custom_letterbox_kernel_param_def[] =
{
{VX_INPUT, VX_TYPE_TENSOR, VX_PARAMETER_STATE_REQUIRED},
{VX_OUTPUT, VX_TYPE_TENSOR, VX_PARAMETER_STATE_REQUIRED},
{VX_INPUT, VX_TYPE_SCALAR, VX_PARAMETER_STATE_REQUIRED},
{VX_INPUT, VX_TYPE_SCALAR, VX_PARAMETER_STATE_REQUIRED},
{VX_INPUT, VX_TYPE_SCALAR, VX_PARAMETER_STATE_REQUIRED},
{VX_INPUT, VX_TYPE_SCALAR, VX_PARAMETER_STATE_REQUIRED},
{VX_INPUT, VX_TYPE_SCALAR, VX_PARAMETER_STATE_REQUIRED},
{VX_INPUT, VX_TYPE_SCALAR, VX_PARAMETER_STATE_REQUIRED},
{VX_INPUT, VX_TYPE_SCALAR, VX_PARAMETER_STATE_REQUIRED},
{VX_INPUT, VX_TYPE_SCALAR, VX_PARAMETER_STATE_REQUIRED},
{VX_INPUT, VX_TYPE_SCALAR, VX_PARAMETER_STATE_REQUIRED},
{VX_INPUT, VX_TYPE_SCALAR, VX_PARAMETER_STATE_REQUIRED},
{VX_INPUT, VX_TYPE_SCALAR, VX_PARAMETER_STATE_REQUIRED},
{VX_INPUT, VX_TYPE_SCALAR, VX_PARAMETER_STATE_REQUIRED},
{VX_INPUT, VX_TYPE_SCALAR, VX_PARAMETER_STATE_REQUIRED},
{VX_INPUT, VX_TYPE_SCALAR, VX_PARAMETER_STATE_REQUIRED},
};
#define _CUSTOM_LETTERBOX_PARAM_NUM _cnt_of_array( _custom_letterbox_kernel_param_def )
/*
* Kernel initializer
*/
DEF_KERNEL_INITIALIZER(_custom_letterbox_initializer)
(
vsi_nn_kernel_node_t node,
const vsi_nn_kernel_node_param_t * param,
size_t param_size
)
{
vsi_status status = VSI_FAILURE;
gpu_param_t gpu_param = {
2,
{0, 0, 0},
{0, 0, 0},
{0, 0, 0},
{0, 0, 0}
};
vsi_nn_kernel_tensor_attr_t* attr[2] = {NULL, NULL};
VSI_UNREFERENCED(param_size);
int32_t top = 0;
int32_t bottom = 0;
int32_t left = 0;
int32_t right = 0;
float scale_w = 0;
float scale_h = 0;
int32_t resize_w = 0;
int32_t resize_h = 0;
int32_t resize_max_w = 0;
int32_t resize_max_h = 0;
float output_scale = 1.0f;
float output_zp = 0;
float out_scale_r = 0;
float out_zp_r = 0;
float out_scale_g = 0;
float out_zp_g = 0;
float out_scale_b = 0;
float out_zp_b = 0;
float pad_v_r = 0;
float pad_v_g = 0;
float pad_v_b = 0;
int32_t in_width = 0;
int32_t in_height = 0;
int32_t out_width = 0;
int32_t out_height = 0;
float mean_r = 0;
float mean_g = 0;
float mean_b = 0;
float scale_r = 0;
float scale_g = 0;
float scale_b = 0;
vx_int32 pad_value_r = 0;
vx_int32 pad_value_g = 0;
vx_int32 pad_value_b = 0;
vx_int32 r_order = 0;
vx_int32 b_order = 0;
vx_int32 reverse_channel = 0;
attr[0] = vsi_nn_kernel_tensor_attr_create( (vsi_nn_kernel_tensor_t)param[0] );
CHECK_PTR_FAIL_GOTO( attr[0], "Create tensor attr buffer fail.", final );
attr[1] = vsi_nn_kernel_tensor_attr_create( (vsi_nn_kernel_tensor_t)param[1] );
CHECK_PTR_FAIL_GOTO( attr[1], "Create tensor attr buffer fail.", final );
status = vsi_nn_kernel_scalar_read_int32((vsi_nn_kernel_scalar_t)param[2], &top);
status |= vsi_nn_kernel_scalar_read_int32((vsi_nn_kernel_scalar_t)param[3], &bottom);
status |= vsi_nn_kernel_scalar_read_int32((vsi_nn_kernel_scalar_t)param[4], &left);
status |= vsi_nn_kernel_scalar_read_int32((vsi_nn_kernel_scalar_t)param[5], &right);
status |= vsi_nn_kernel_scalar_read_float32((vsi_nn_kernel_scalar_t)param[6], &mean_r);
status |= vsi_nn_kernel_scalar_read_float32((vsi_nn_kernel_scalar_t)param[7], &mean_g);
status |= vsi_nn_kernel_scalar_read_float32((vsi_nn_kernel_scalar_t)param[8], &mean_b);
status |= vsi_nn_kernel_scalar_read_float32((vsi_nn_kernel_scalar_t)param[9], &scale_r);
status |= vsi_nn_kernel_scalar_read_float32((vsi_nn_kernel_scalar_t)param[10], &scale_g);
status |= vsi_nn_kernel_scalar_read_float32((vsi_nn_kernel_scalar_t)param[11], &scale_b);
status |= vsi_nn_kernel_scalar_read_int32((vsi_nn_kernel_scalar_t)param[12], &pad_value_r);
status |= vsi_nn_kernel_scalar_read_int32((vsi_nn_kernel_scalar_t)param[13], &pad_value_g);
status |= vsi_nn_kernel_scalar_read_int32((vsi_nn_kernel_scalar_t)param[14], &pad_value_b);
status |= vsi_nn_kernel_scalar_read_int32((vsi_nn_kernel_scalar_t)param[15], &reverse_channel);
CHECK_STATUS_FAIL_GOTO(status, final );
in_width = (int32_t)attr[0]->shape->data[0] / 3;
in_height = (int32_t)attr[0]->shape->data[1];
out_width = (int32_t)attr[1]->shape->data[0];
out_height = (int32_t)attr[1]->shape->data[1] / 3;
output_scale = 1.0f / attr[1]->scale;
output_zp = (float)(attr[1]->zero_point);
resize_w = out_width - left - right;
resize_h = out_height - top - bottom;
resize_max_w = out_width - right;
resize_max_h = out_height - bottom;
scale_w = (float)in_width / resize_w;
scale_h = (float)in_height / resize_h;
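/* Fold the per-channel (mean, scale) normalization and the output quantization
 * into one multiply-add per channel, q = x * out_scale_c + out_zp_c, and
 * precompute the quantized pad values from the same coefficients. */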
out_scale_r = scale_r / output_scale;
out_zp_r = output_zp - out_scale_r * mean_r;
out_scale_g = scale_g / output_scale;
out_zp_g = output_zp - out_scale_g * mean_g;
out_scale_b = scale_b / output_scale;
out_zp_b = output_zp - out_scale_b * mean_b;
pad_v_r = pad_value_r * out_scale_r + out_zp_r;
pad_v_g = pad_value_g * out_scale_g + out_zp_g;
pad_v_b = pad_value_b * out_scale_b + out_zp_b;
if (reverse_channel)
{
r_order = out_height * 2;
b_order = 0;
}
else
{
r_order = 0;
b_order = out_height * 2;
}
{
gpu_dp_inst_t uniU8RightSubLeft_4x4 = {{
0x00090909, // TCfg
0x00000000, // ASelt
0x00140003, 0x00000025, // ABin
0x000a0a0a, // BSelt
0x00000000, 0x00000000, // BBin
0x00000400, // AccumType, ConstantType, and PostShift
0x00010001, 0x00000000, 0x00010001, 0x00000000,
0x00010001, 0x00000000, 0x00000000, 0x00000000 // Constant
}, GPU_DP_TYPE_16 };
gpu_dp_inst_t uniLeftToFloat32_4x4 = {{
0x00010101, // TCfg
0x00000000, // ASelt
0x00010000, 0x00000002, // ABin
0x00020202, // BSelt
0x00000000, 0x00000000, // BBin
0x00000400, // AccumType, ConstantType, and PostShift
0x00000001, 0x00000000, 0x00000001, 0x00000000,
0x00000001, 0x00000000, 0x00000000, 0x00000000 // Constant
}, GPU_DP_TYPE_16 };
gpu_dp_inst_t uniExtactHalf8_2x8 = {{
0x11111111, // TCfg
0x11110000, // ASelt
0x06040200, 0x06040200, // ABin
0x22222222, // BSelt
0x00000000, 0x00000000, // BBin
0x00000100, // AccumType, ConstantType, and PostShift
0x00003c00, 0x00003c00, 0x00003c00, 0x00003c00,
0x00003c00, 0x00003c00, 0x00003c00, 0x00003c00 // Constant
}, GPU_DP_TYPE_16 };
gpu_dp_inst_t uniExtract8Data_2x8 = {{
0x33333333, // TCfg
0x11110000, // ASelt
0x03020100, 0x03020100, // ABin
0x00000000, // BSelt
0x00000000, 0x00000000, // BBin
0x00002400, // AccumType, ConstantType, and PostShift
0x00000000, 0x00000000, 0x00000000, 0x00000000,
0x00000000, 0x00000000, 0x00000000, 0x00000000 // Constant
}, GPU_DP_TYPE_16 };
status |= vsi_nn_kernel_gpu_add_param( node, "uniU8RightSubLeft_4x4", &uniU8RightSubLeft_4x4 );
status |= vsi_nn_kernel_gpu_add_param( node, "uniLeftToFloat32_4x4", &uniLeftToFloat32_4x4 );
status |= vsi_nn_kernel_gpu_add_param( node, "uniExtactHalf8_2x8", &uniExtactHalf8_2x8 );
status |= vsi_nn_kernel_gpu_add_param( node, "uniExtract8Data_2x8", &uniExtract8Data_2x8 );
}
status |= vsi_nn_kernel_gpu_add_param( node, "top", &top );
status |= vsi_nn_kernel_gpu_add_param( node, "left", &left );
status |= vsi_nn_kernel_gpu_add_param( node, "out_scale_r", &out_scale_r );
status |= vsi_nn_kernel_gpu_add_param( node, "out_scale_g", &out_scale_g );
status |= vsi_nn_kernel_gpu_add_param( node, "out_scale_b", &out_scale_b );
status |= vsi_nn_kernel_gpu_add_param( node, "out_zp_r", &out_zp_r );
status |= vsi_nn_kernel_gpu_add_param( node, "out_zp_g", &out_zp_g );
status |= vsi_nn_kernel_gpu_add_param( node, "out_zp_b", &out_zp_b );
status |= vsi_nn_kernel_gpu_add_param( node, "pad_v_r", &pad_v_r );
status |= vsi_nn_kernel_gpu_add_param( node, "pad_v_g", &pad_v_g );
status |= vsi_nn_kernel_gpu_add_param( node, "pad_v_b", &pad_v_b );
status |= vsi_nn_kernel_gpu_add_param( node, "scale_w", &scale_w );
status |= vsi_nn_kernel_gpu_add_param( node, "scale_h", &scale_h );
status |= vsi_nn_kernel_gpu_add_param( node, "resize_max_w", &resize_max_w );
status |= vsi_nn_kernel_gpu_add_param( node, "resize_max_h", &resize_max_h );
status |= vsi_nn_kernel_gpu_add_param( node, "out_height", &out_height );
status |= vsi_nn_kernel_gpu_add_param( node, "r_order", &r_order );
status |= vsi_nn_kernel_gpu_add_param( node, "b_order", &b_order );
gpu_param.global_scale[0] = 1;
gpu_param.global_scale[1] = 1;
gpu_param.global_size[0] = out_width;
gpu_param.global_size[1] = out_height;
status |= vsi_nn_kernel_gpu_config( node, &gpu_param );
CHECK_STATUS_FAIL_GOTO(status, final );
final:
if (attr[0])
{
vsi_nn_kernel_tensor_attr_release( &attr[0] );
attr[0] = NULL;
}
if (attr[1])
{
vsi_nn_kernel_tensor_attr_release( &attr[1] );
attr[1] = NULL;
}
return status;
} /* _custom_letterbox_initializer() */
/*
* Query kernel
*/
static vsi_status _query_kernel
(
vsi_nn_kernel_t * kernel,
vsi_nn_tensor_t * const * const inputs,
vsi_nn_tensor_t * const * const outputs
)
{
vsi_status status = VSI_FAILURE;
vsi_nn_kernel_dtype_e in_dtype;
vsi_nn_kernel_dtype_e out_dtype;
const _kernel_map_type * kernel_map = _custom_letterbox_kernel_map;
size_t kernel_map_size = _cnt_of_array( _custom_letterbox_kernel_map );
vx_param_description_t * param_def = _custom_letterbox_kernel_param_def;
size_t param_def_size = _cnt_of_array( _custom_letterbox_kernel_param_def );
vx_kernel_initialize_f initializer = _custom_letterbox_initializer;
uint32_t key = 0;
uint32_t i = 0;
in_dtype = vsi_nn_kernel_map_dtype( inputs[0]->attr.dtype.vx_type );
out_dtype = vsi_nn_kernel_map_dtype( outputs[0]->attr.dtype.vx_type );
key = CUSTOM_LETTERBOX_HASH_KEY( in_dtype, out_dtype );
for ( i = 0; i < (uint32_t)kernel_map_size; i ++ )
{
if ( kernel_map[i].key == key )
{
break;
}
}
if ( i < (uint32_t)kernel_map_size )
{
snprintf( kernel->info.name, VX_MAX_KERNEL_NAME, "%s", kernel_map[i].function_name );
kernel->info.parameters = param_def;
kernel->info.numParams = (vx_uint32)param_def_size;
kernel->info.initialize = initializer;
// Register code source
vsi_nn_kernel_add_source( kernel, VSI_NN_GPU_SOURCE_FMT_CODE, 2,
"vsi_nn_kernel_header",
kernel_map[i].source_name );
// Register binary source
vsi_nn_kernel_add_source( kernel, VSI_NN_GPU_SOURCE_FMT_EXECUTABLE, 1,
kernel_map[i].source_name );
status = VSI_SUCCESS;
}
return status;
} /* _query_kernel() */
static vsi_nn_kernel_node_t _setup
(
vsi_nn_graph_t * graph,
vsi_nn_tensor_t ** inputs,
size_t input_num,
vsi_nn_tensor_t ** outputs,
size_t output_num,
const vsi_nn_kernel_param_t * params,
vsi_nn_kernel_t * kernel
)
{
vsi_status status = VSI_FAILURE;
vsi_nn_kernel_node_param_t node_params[_CUSTOM_LETTERBOX_PARAM_NUM];
vsi_nn_kernel_node_t node = NULL;
size_t i = 0;
int32_t top = vsi_nn_kernel_param_get_int32( params, "top");
int32_t bottom = vsi_nn_kernel_param_get_int32( params, "bottom");
int32_t left = vsi_nn_kernel_param_get_int32( params, "left");
int32_t right = vsi_nn_kernel_param_get_int32( params, "right");
float mean_r = vsi_nn_kernel_param_get_float32( params, "mean_r");
float mean_g = vsi_nn_kernel_param_get_float32( params, "mean_g");
float mean_b = vsi_nn_kernel_param_get_float32( params, "mean_b");
float scale_r = vsi_nn_kernel_param_get_float32( params, "scale_r");
float scale_g = vsi_nn_kernel_param_get_float32( params, "scale_g");
float scale_b = vsi_nn_kernel_param_get_float32( params, "scale_b");
int32_t pad_value_r = vsi_nn_kernel_param_get_int32( params, "pad_value_r");
int32_t pad_value_g = vsi_nn_kernel_param_get_int32( params, "pad_value_g");
int32_t pad_value_b = vsi_nn_kernel_param_get_int32( params, "pad_value_b");
int32_t reverse_channel = vsi_nn_kernel_param_get_int32( params, "reverse_channel");
vsi_nn_tensor_t* reshape_tensors[2] = { NULL };
vsi_size_t shapes[2][VSI_NN_MAX_DIM_NUM] = { { 0 } };
uint32_t param_num = _CUSTOM_LETTERBOX_PARAM_NUM;
VSI_UNREFERENCED(input_num);
VSI_UNREFERENCED(output_num);
shapes[0][0] = inputs[0]->attr.size[1] * 3;
shapes[0][1] = inputs[0]->attr.size[2];
shapes[1][0] = outputs[0]->attr.size[0];
shapes[1][1] = outputs[0]->attr.size[1] * 3;
reshape_tensors[0] = vsi_nn_reshape_tensor( graph,
inputs[0], shapes[0], 2 );
reshape_tensors[1] = vsi_nn_reshape_tensor( graph,
outputs[0], shapes[1], 2 );
if (reshape_tensors[0] == NULL ||
reshape_tensors[1] == NULL)
{
goto final;
}
if (reverse_channel)
{
float mean_temp = mean_r;
float scale_temp = scale_r;
int32_t pad_value_temp = pad_value_r;
mean_r = mean_b;
mean_b = mean_temp;
scale_r = scale_b;
scale_b = scale_temp;
pad_value_r = pad_value_b;
pad_value_b = pad_value_temp;
}
status = _query_kernel( kernel, inputs, outputs );
if ( VSI_SUCCESS == status)
{
node = vsi_nn_kernel_create_node( graph, kernel );
if ( node )
{
uint32_t index = 2;
vsi_nn_kernel_node_pack_io( node_params, param_num,
reshape_tensors, 1, &reshape_tensors[1], 1 );
node_params[index++] = vsi_nn_kernel_scalar_create( graph, I32, &top );
node_params[index++] = vsi_nn_kernel_scalar_create( graph, I32, &bottom );
node_params[index++] = vsi_nn_kernel_scalar_create( graph, I32, &left );
node_params[index++] = vsi_nn_kernel_scalar_create( graph, I32, &right );
node_params[index++] = vsi_nn_kernel_scalar_create( graph, F32, &mean_r );
node_params[index++] = vsi_nn_kernel_scalar_create( graph, F32, &mean_g );
node_params[index++] = vsi_nn_kernel_scalar_create( graph, F32, &mean_b );
node_params[index++] = vsi_nn_kernel_scalar_create( graph, F32, &scale_r );
node_params[index++] = vsi_nn_kernel_scalar_create( graph, F32, &scale_g );
node_params[index++] = vsi_nn_kernel_scalar_create( graph, F32, &scale_b );
node_params[index++] = vsi_nn_kernel_scalar_create( graph, I32, &pad_value_r );
node_params[index++] = vsi_nn_kernel_scalar_create( graph, I32, &pad_value_g );
node_params[index++] = vsi_nn_kernel_scalar_create( graph, I32, &pad_value_b );
node_params[index++] = vsi_nn_kernel_scalar_create( graph, I32, &reverse_channel );
/* Pass parameters to node. */
status = vsi_nn_kernel_node_pass_param( node, node_params, param_num );
vsi_nn_kernel_scalar_release( &node_params[2] );
vsi_nn_kernel_scalar_release( &node_params[3] );
vsi_nn_kernel_scalar_release( &node_params[4] );
vsi_nn_kernel_scalar_release( &node_params[5] );
vsi_nn_kernel_scalar_release( &node_params[6] );
vsi_nn_kernel_scalar_release( &node_params[7] );
vsi_nn_kernel_scalar_release( &node_params[8] );
vsi_nn_kernel_scalar_release( &node_params[9] );
vsi_nn_kernel_scalar_release( &node_params[10] );
vsi_nn_kernel_scalar_release( &node_params[11] );
vsi_nn_kernel_scalar_release( &node_params[12] );
vsi_nn_kernel_scalar_release( &node_params[13] );
vsi_nn_kernel_scalar_release( &node_params[14] );
vsi_nn_kernel_scalar_release( &node_params[15] );
CHECK_STATUS(status);
}
}
final:
for (i = 0; i < 2; i++)
{
vsi_safe_release_tensor(reshape_tensors[i]);
}
return node;
} /* _setup() */
__END_DECLS
REGISTER_BACKEND_EVIS( custom_letterbox, _setup )

View File

@ -35,6 +35,7 @@
#include "utils/vsi_nn_dtype_util.h"
#include "kernel/vsi_nn_kernel.h"
#include "libnnext/vsi_nn_vxkernel.h"
#include "kernel/vsi_nn_kernel_gpu_shape_optimize.h"
#define _CPU_ARG_NUM (1)
#define _CPU_INPUT_NUM (1)
@ -42,6 +43,7 @@
#define _CPU_IO_NUM (_CPU_INPUT_NUM + _CPU_OUTPUT_NUM)
#define _CPU_PARAM_NUM (_CPU_ARG_NUM + _CPU_IO_NUM)
#define _KERNEL_NAME ("com.vivantecorp.extension.Softmax2VXC")
#define _KERNEL_NAME_U8 ("com.vivantecorp.extension.Softmax2VXC_u8")
#define SCALAR_INPUT_AXIS (2)
@ -64,7 +66,11 @@ DEF_KERNEL_INITIALIZER(_softmax_initializer)
{
vsi_status status = VSI_FAILURE;
int sf_size = 0;
vsi_nn_kernel_tensor_attr_t* attr = NULL;
vsi_nn_kernel_tensor_attr_t* attr[2] = {NULL, NULL};
float srcZP = 0.0f;
float srcScale = 1.0f;
float dstZP = 0.0f;
float dstScale = 1.0f;
// Alignment with a power of two value.
gpu_param_t gpu_param = {
2, // workdim
@ -75,14 +81,19 @@ DEF_KERNEL_INITIALIZER(_softmax_initializer)
VSI_UNREFERENCED(param_size);
attr = vsi_nn_kernel_tensor_attr_create( (vsi_nn_kernel_tensor_t)param[0] );
if (!attr)
attr[0] = vsi_nn_kernel_tensor_attr_create((vsi_nn_kernel_tensor_t)param[0]);
attr[1] = vsi_nn_kernel_tensor_attr_create((vsi_nn_kernel_tensor_t)param[1]);
if ((!attr[0]) || (!attr[1]))
{
VSILOGE("Query failure! at line");
return status;
}
sf_size = (int)attr->shape->data[0];
sf_size = (int)attr[0]->shape->data[0];
srcScale = attr[0]->scale;
srcZP = (float)attr[0]->zero_point;
dstScale = 1.0f / attr[1]->scale;
dstZP = (float)attr[1]->zero_point;
gpu_param.global_offset[0] = 0;
gpu_param.global_offset[1] = 0;
@ -91,7 +102,7 @@ DEF_KERNEL_INITIALIZER(_softmax_initializer)
gpu_param.local_size[0] = 1;
gpu_param.local_size[1] = 1;
gpu_param.global_size[0] =
gpu_align_p2((1 + gpu_param.global_scale[0] - 1) / gpu_param.global_scale[0],
gpu_align_p2((attr[0]->shape->data[1] + gpu_param.global_scale[0] - 1) / gpu_param.global_scale[0],
gpu_param.local_size[0]);
gpu_param.global_size[1] =
gpu_align_p2((1 + gpu_param.global_scale[1] - 1) / gpu_param.global_scale[1],
@ -107,25 +118,50 @@ DEF_KERNEL_INITIALIZER(_softmax_initializer)
0x00000001, 0x00000000, 0x00000001, 0x00000000,
0x00000001, 0x00000000, 0x00000001, 0x00000000 // Constant
}, GPU_DP_TYPE_16};
gpu_dp_inst_t uniExtract8Bin_2x8 = {{
0x11111111, // TCfg
0x11110000, // ASelt
0x06040200, 0x06040200, // ABin
0x22222222, // BSelt
0x00000000, 0x00000000, // BBin
0x00002400, // AccumType, ConstantType, and PostShift
0x00000001, 0x00000001, 0x00000001, 0x00000001,
0x00000001, 0x00000001, 0x00000001, 0x00000001 // Constant
}, GPU_DP_TYPE_16};
status = vsi_nn_kernel_gpu_add_param( node,
"Uni4x4_Fp16ToFp32", &Uni4x4_Fp16ToFp32 );
vsi_nn_kernel_gpu_add_param(node,
status |= vsi_nn_kernel_gpu_add_param( node,
"uniExtract8Bin_2x8", &uniExtract8Bin_2x8 );
status |= vsi_nn_kernel_gpu_add_param(node,
"sf_size", &sf_size);
status |= vsi_nn_kernel_gpu_add_param(node, "srcScale", &srcScale);
status |= vsi_nn_kernel_gpu_add_param(node, "srcZP", &srcZP);
status |= vsi_nn_kernel_gpu_add_param(node, "dstScale", &dstScale);
status |= vsi_nn_kernel_gpu_add_param(node, "dstZP", &dstZP);
}
status = vsi_nn_kernel_gpu_config( node, &gpu_param );
status |= vsi_nn_kernel_gpu_config( node, &gpu_param );
if(status != VSI_SUCCESS)
{
VSILOGE("Initializer failure!");
}
if (attr) vsi_nn_kernel_tensor_attr_release( &attr );
if (attr[0])
{
vsi_nn_kernel_tensor_attr_release( &attr[0] );
attr[0] = NULL;
}
if (attr[1])
{
vsi_nn_kernel_tensor_attr_release( &attr[1] );
attr[1] = NULL;
}
return status;
}
static const vx_kernel_description_t _kernel_info =
static const vx_kernel_description_t _kernel_info1 =
{
KERNEL_ID_PLACEHOLDER,
_KERNEL_NAME,
@ -139,6 +175,20 @@ static const vx_kernel_description_t _kernel_info =
vsi_nn_KernelDeinitializer
};
static const vx_kernel_description_t _kernel_info2 =
{
KERNEL_ID_PLACEHOLDER,
_KERNEL_NAME_U8,
NULL,
kernel_param_def,
_cnt_of_array( kernel_param_def ),
vsi_nn_KernelValidator,
NULL,
NULL,
_softmax_initializer,
vsi_nn_KernelDeinitializer
};
static vsi_status _query_kernel
(
vsi_nn_tensor_t* const* const inputs,
@ -146,9 +196,20 @@ static vsi_status _query_kernel
vsi_nn_kernel_t* kernel
)
{
VSI_UNREFERENCED(inputs);
VSI_UNREFERENCED(outputs);
memmove( &kernel->info, &_kernel_info, sizeof(vx_kernel_description_t) );
vsi_nn_kernel_dtype_e in_dtype;
vsi_nn_kernel_dtype_e out_dtype;
in_dtype = vsi_nn_kernel_map_dtype(inputs[0]->attr.dtype.vx_type);
out_dtype = vsi_nn_kernel_map_dtype(outputs[0]->attr.dtype.vx_type);
if (in_dtype == U8 && out_dtype == U8)
{
memmove( &kernel->info, &_kernel_info2, sizeof(vx_kernel_description_t) );
}
else
{
memmove( &kernel->info, &_kernel_info1, sizeof(vx_kernel_description_t) );
}
vsi_nn_kernel_add_source( kernel, VSI_NN_GPU_SOURCE_FMT_CODE, 2,
"vsi_nn_kernel_header",
@ -173,12 +234,42 @@ static vsi_nn_kernel_node_t _setup
vsi_nn_kernel_node_param_t backend_params[_CPU_PARAM_NUM] = {NULL};
vsi_nn_kernel_node_t node = NULL;
int32_t axis = 0;
vsi_nn_tensor_t* reshape_tensors[2] = {NULL};
vsi_size_t shapes[2][VSI_NN_MAX_DIM_NUM] = {{0}};
uint32_t rank_in = 0;
int32_t new_axis = 0;
uint32_t i = 0;
vsi_bool ret = vx_false_e;
VSI_UNREFERENCED(input_num);
VSI_UNREFERENCED(output_num);
axis = vsi_nn_kernel_param_get_int32(params, "axis");
ret = vsi_nn_kernel_optimize_softmax_shape(inputs[0]->attr.size,
inputs[0]->attr.dim_num,
axis,
shapes[0],
&rank_in,
&new_axis);
if (ret)
{
reshape_tensors[0] = vsi_nn_reshape_tensor(graph, inputs[0], shapes[0], rank_in);
reshape_tensors[1] = vsi_nn_reshape_tensor(graph, outputs[0], shapes[0], rank_in);
}
else
{
return NULL;
}
if (!vsi_nn_kernel_gpu_check_shape(reshape_tensors[0]->attr.size,
reshape_tensors[0]->attr.dim_num) ||
new_axis > 2)
{
return NULL;
}
status = _query_kernel( inputs, outputs, kernel );
if( VSI_SUCCESS == status)
{
@ -187,9 +278,9 @@ static vsi_nn_kernel_node_t _setup
{
/* Set inputs and outputs */
vsi_nn_kernel_node_pack_io( backend_params, _CPU_PARAM_NUM,
inputs, _CPU_INPUT_NUM, outputs, _CPU_OUTPUT_NUM );
reshape_tensors, _CPU_INPUT_NUM, &reshape_tensors[1], _CPU_OUTPUT_NUM );
backend_params[SCALAR_INPUT_AXIS] = vsi_nn_kernel_scalar_create(
graph, I32, &axis );
graph, I32, &new_axis );
/* Pass parameters to node. */
status = vsi_nn_kernel_node_pass_param( node, backend_params, _CPU_PARAM_NUM );
@ -200,6 +291,11 @@ static vsi_nn_kernel_node_t _setup
status = VSI_FAILURE;
}
}
for (i = 0; i < 2; i++)
{
vsi_safe_release_tensor(reshape_tensors[i]);
}
return node;
} /* _setup() */
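Note the inverted destination scale set up in the initializer (dstScale = 1.0f / attr[1]->scale): the shader can then requantize the u8 output with a multiply instead of a divide, the same trick the letterbox initializer plays with output_scale.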

View File

@ -0,0 +1,227 @@
/****************************************************************************
*
* Copyright (c) 2020 Vivante Corporation
*
* Permission is hereby granted, free of charge, to any person obtaining a
* copy of this software and associated documentation files (the "Software"),
* to deal in the Software without restriction, including without limitation
* the rights to use, copy, modify, merge, publish, distribute, sublicense,
* and/or sell copies of the Software, and to permit persons to whom the
* Software is furnished to do so, subject to the following conditions:
*
* The above copyright notice and this permission notice shall be included in
* all copies or substantial portions of the Software.
*
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
* AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
* DEALINGS IN THE SOFTWARE.
*
*****************************************************************************/
#include <string.h>
#include <stdlib.h>
#include "vsi_nn_types.h"
#include "vsi_nn_log.h"
#include "vsi_nn_node.h"
#include "vsi_nn_prv.h"
#include "vsi_nn_ops.h"
#include "vsi_nn_tensor.h"
#include "utils/vsi_nn_util.h"
#include "kernel/vsi_nn_kernel.h"
#include "vsi_nn_internal_node.h"
#include "utils/vsi_nn_constraint_check.h"
typedef struct _custom_letterbox_local_data_t {
int32_t placeholder;
} custom_letterbox_local_data_t;
/*
Declare number of input and output.
*/
#define _INPUT_NUM (1)
#define _OUTPUT_NUM (1)
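/* Round half away from zero (0.5 -> 1, -0.5 -> -1); a plain (int) cast would truncate. */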
int32_t my_round(float in)
{
if (in >= 0)
{
return (int)(in + 0.5f);
}
else
{
return (int)(in - 0.5f);
}
}
static vsi_status op_compute
(
vsi_nn_node_t * self,
vsi_nn_tensor_t ** inputs,
vsi_nn_tensor_t ** outputs
)
{
vsi_nn_kernel_param_t * param = NULL;
vsi_nn_custom_letterbox_param * p;
p = &(self->nn_param.custom_letterbox);
int32_t shape_w = (int32_t)inputs[0]->attr.size[1];
int32_t shape_h = (int32_t)inputs[0]->attr.size[2];
int32_t new_shape_w = (int32_t)outputs[0]->attr.size[0];
int32_t new_shape_h = (int32_t)outputs[0]->attr.size[1];
vx_bool auto_bool = p->auto_bool;
vx_bool scaleFill = p->scaleFill;
vx_bool scaleup = p->scaleup;
int32_t stride = p->stride;
vx_bool center = p->center;
float r = 1.0f;
int32_t new_unpad_w = 0;
int32_t new_unpad_h = 0;
int32_t dw = 0;
int32_t dh = 0;
int32_t top = 0;
int32_t bottom = 0;
int32_t left = 0;
int32_t right = 0;
r = (float)fmin((float)new_shape_w / shape_w, (float)new_shape_h / shape_h);
if (!scaleup)
{
r = (float)fmin(r, 1.0f);
}
new_unpad_w = my_round(r * shape_w);
new_unpad_h = my_round(r * shape_h);
dw = new_shape_w - new_unpad_w;
dh = new_shape_h - new_unpad_h;
if (auto_bool)
{
dw = dw % stride;
dh = dh % stride;
}
else if (scaleFill)
{
dw = 0;
dh = 0;
new_unpad_w = new_shape_w;
new_unpad_h = new_shape_h;
}
if (center)
{
top = my_round(dh / 2.0f - 0.1f);
bottom = my_round(dh / 2.0f + 0.1f);
left = my_round(dw / 2.0f - 0.1f);
right = my_round(dw / 2.0f + 0.1f);
}
else
{
top = 0;
bottom = my_round(dh + 0.1f);
left = 0;
right = my_round(dw + 0.1f);
}
param = vsi_nn_kernel_param_create();
vsi_nn_kernel_param_add_int32( param, "top", top);
vsi_nn_kernel_param_add_int32( param, "bottom", bottom);
vsi_nn_kernel_param_add_int32( param, "left", left);
vsi_nn_kernel_param_add_int32( param, "right", right);
vsi_nn_kernel_param_add_float32( param, "mean_r", p->mean_r);
vsi_nn_kernel_param_add_float32( param, "mean_g", p->mean_g);
vsi_nn_kernel_param_add_float32( param, "mean_b", p->mean_b);
vsi_nn_kernel_param_add_float32( param, "scale_r", p->scale_r);
vsi_nn_kernel_param_add_float32( param, "scale_g", p->scale_g);
vsi_nn_kernel_param_add_float32( param, "scale_b", p->scale_b);
vsi_nn_kernel_param_add_int32( param, "pad_value_r", p->pad_value_r);
vsi_nn_kernel_param_add_int32( param, "pad_value_g", p->pad_value_g);
vsi_nn_kernel_param_add_int32( param, "pad_value_b", p->pad_value_b);
vsi_nn_kernel_param_add_int32( param, "reverse_channel", p->reverse_channel);
self->n = (vx_node)vsi_nn_kernel_selector( self->graph,
"custom_letterbox",
inputs, 1,
outputs, 1, param );
vsi_nn_kernel_param_release( &param );
return VSI_SUCCESS;
} /* op_compute() */
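For intuition, a worked example (illustrative, not from the diff): letterboxing a 1280x720 input into 640x640 with scaleup set, auto_bool and scaleFill clear, and center set gives r = min(640/1280.0, 640/720.0) = 0.5, new_unpad = 640x360, dw = 0, dh = 280, hence top = bottom = 140 and left = right = 0.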
static vsi_bool op_check
(
vsi_nn_node_t * self,
vsi_nn_tensor_t ** inputs,
vsi_nn_tensor_t ** outputs
)
{
BEGIN_IO_TYPE_DECL(LETTERBOX, 1, 1)
IO_TYPE(D_U8, D_F16)
IO_TYPE(D_U8, D_U8|Q_ASYM)
IO_TYPE(D_U8, D_I8|Q_DFP)
IO_TYPE(D_U8, D_I8|Q_ASYM)
IO_TYPE(D_U8, D_I8|Q_SYM)
END_IO_TYPE_DECL(LETTERBOX)
if (!VALIDATE_OP_IO_TYPES(LETTERBOX, self, inputs, self->input.num, outputs, self->output.num)) {
char* desc = generate_op_io_types_desc(inputs,
self->input.num, outputs, self->output.num);
VSILOGE("Inputs/Outputs data type not support: %s", desc);
destroy_op_io_types_desc(desc);
return FALSE;
}
return TRUE;
} /* op_check() */
static vsi_bool op_setup
(
vsi_nn_node_t * self,
vsi_nn_tensor_t ** inputs,
vsi_nn_tensor_t ** outputs
)
{
if ( VSI_NN_DIM_AUTO == outputs[0]->attr.dim_num )
{
outputs[0]->attr.dim_num = inputs[0]->attr.dim_num;
outputs[0]->attr.size[0] = self->nn_param.custom_letterbox.new_shape_w;
outputs[0]->attr.size[1] = self->nn_param.custom_letterbox.new_shape_h;
outputs[0]->attr.size[2] = 3;
outputs[0]->attr.size[3] = inputs[0]->attr.size[3];
}
return TRUE;
} /* op_setup() */
static vsi_status op_deinit
(
vsi_nn_node_t* self
)
{
vsi_status status = VSI_SUCCESS;
status = vsi_nn_op_common_deinit(self);
return status;
} /* op_deinit() */
__BEGIN_DECLS
/* Registrar */
DEF_OP_REG
(
/* op_name */ CUSTOM_LETTERBOX,
/* init */ NULL,
/* compute */ op_compute,
/* deinit */ op_deinit,
/* check */ op_check,
/* setup */ op_setup,
/* optimize */ NULL,
/* input_num */ _INPUT_NUM,
/* output_num */ _OUTPUT_NUM
);
__END_DECLS

View File

@ -85,18 +85,24 @@ static const struct {
HASH_CUMSUM_KERNELS(0, U8, U8)
HASH_CUMSUM_KERNELS(0, F32, F32)
HASH_CUMSUM_KERNELS(0, F32, U8)
HASH_CUMSUM_KERNELS(0, I32, I32)
HASH_CUMSUM_KERNELS(1, U8, U8)
HASH_CUMSUM_KERNELS(1, F32, F32)
HASH_CUMSUM_KERNELS(1, F32, U8)
HASH_CUMSUM_KERNELS(1, I32, I32)
HASH_CUMSUM_KERNELS(2, U8, U8)
HASH_CUMSUM_KERNELS(2, F32, F32)
HASH_CUMSUM_KERNELS(2, F32, U8)
HASH_CUMSUM_KERNELS(2, I32, I32)
HASH_CUMSUM_KERNELS_2D(0, U8, U8)
HASH_CUMSUM_KERNELS_2D(0, F32, F32)
HASH_CUMSUM_KERNELS_2D(0, F32, U8)
HASH_CUMSUM_KERNELS_2D(0, I32, I32)
HASH_CUMSUM_KERNELS_2D(1, U8, U8)
HASH_CUMSUM_KERNELS_2D(1, F32, F32)
HASH_CUMSUM_KERNELS_2D(1, F32, U8)
HASH_CUMSUM_KERNELS_2D(1, I32, I32)
HASH_CUMSUM_ARRAY_KERNELS(0, U8, U8, KERNEL_SOURCE_3)
HASH_CUMSUM_ARRAY_KERNELS(0, F32, F32, KERNEL_SOURCE_3)

View File

@ -26,6 +26,7 @@
#include <stdlib.h>
#include <string.h>
#include "vsi_nn_types.h"
#include "vsi_nn_types_prv.h"
#include "vsi_nn_tensor.h"
#include "vsi_nn_graph.h"
#include "vsi_nn_log.h"
@ -644,7 +645,8 @@ static vsi_nn_kernel_node_t _setup
#if VX_HARDWARE_CAPS_PARAMS_EXT_SUPPORT
shader_cnt_support =
(graph->ctx->config.subGroupSize >= 64 && graph->ctx->config.use_40bits_va) ? TRUE : FALSE;
(((vsi_nn_graph_prv_t*)graph)->options->config.subGroupSize >= 64 &&
((vsi_nn_graph_prv_t*)graph)->options->config.use_40bits_va) ? TRUE : FALSE;
#endif
if ((in1_h % 64 == 0) && (transFlg == 1) && (out_h % 8 == 0) && shader_cnt_support)
{

View File

@ -75,6 +75,7 @@ static const _kernel_map_type _one_hot_kernel_map[] =
PACK_ONE_HOT_KERNEL_MAP( F32, F32 ),
PACK_ONE_HOT_KERNEL_MAP( I32, I32 ),
PACK_ONE_HOT_KERNEL_MAP( I32, F32 ),
PACK_ONE_HOT_KERNEL_MAP( I32, BF16 ),
PACK_ONE_HOT_KERNEL_MAP( I32, U8 ),
PACK_ONE_HOT_KERNEL_MAP( U8, U8 ),
};

View File

@ -79,7 +79,7 @@ static const struct {
const char* source_name;
} kernel_map[] =
{
PRELU_KERNELS_FLOAT(F32, F32, F32, KERNEL_SOURCE_1)
PRELU_KERNELS_FLOAT(F32, F32, F32, KERNEL_SOURCE_1)
PRELU_KERNELS_FLOAT(F16, F16, F16, KERNEL_SOURCE_1)
PRELU_KERNELS(U8, U8, U8, KERNEL_SOURCE_1)
PRELU_KERNELS(I32, I32, I32, KERNEL_SOURCE_1)

View File

@ -0,0 +1,329 @@
/****************************************************************************
*
* Copyright (c) 2020 Vivante Corporation
*
* Permission is hereby granted, free of charge, to any person obtaining a
* copy of this software and associated documentation files (the "Software"),
* to deal in the Software without restriction, including without limitation
* the rights to use, copy, modify, merge, publish, distribute, sublicense,
* and/or sell copies of the Software, and to permit persons to whom the
* Software is furnished to do so, subject to the following conditions:
*
* The above copyright notice and this permission notice shall be included in
* all copies or substantial portions of the Software.
*
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
* AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
* DEALINGS IN THE SOFTWARE.
*
*****************************************************************************/
#include <stdint.h>
#include <stdlib.h>
#include <string.h>
#include "vsi_nn_types.h"
#include "vsi_nn_tensor.h"
#include "vsi_nn_graph.h"
#include "vsi_nn_log.h"
#include "vsi_nn_error.h"
#include "vsi_nn_prv.h"
#include "vsi_nn_tensor_util.h"
#include "utils/vsi_nn_util.h"
#include "kernel/vsi_nn_kernel.h"
__BEGIN_DECLS
/*
* Define kernel meta.
*/
typedef enum
{
INTERNAL_KERNEL_ROPE,
} _internal_kernel_e;
#define _ROPE_KERNEL_SOURCE "rope"
#define _ROPE_KERNEL_NAME CVIVANTE_NAMESPACE("cl.rope")
// Add kernel hashtable here
#define STR(a) #a
#define ROPE_HASH_KEY( IN0_DTYPE, IN1_DTYPE, OUT_DTYPE, AXIS ) \
((IN0_DTYPE) | (IN1_DTYPE << 8) | (OUT_DTYPE << 16) | (AXIS << 25))
#define PACK_KERNEL_MAP( IN0_DTYPE, IN1_DTYPE, OUT_DTYPE, AXIS ) \
{ ROPE_HASH_KEY( IN0_DTYPE, IN1_DTYPE, OUT_DTYPE, AXIS ), \
CVIVANTE_NAMESPACE("cl.rope_"STR(IN0_DTYPE)"_"STR(IN1_DTYPE)"to"STR(OUT_DTYPE)"_axis"STR(AXIS)), \
"rope_0" }
typedef struct
{
uint32_t key;
char * function_name;
const char * source_name;
} _kernel_map_type;
static const _kernel_map_type _rope_kernel_map[] =
{
// Register kernel here
PACK_KERNEL_MAP( F32, F32, F32, 0 ),
PACK_KERNEL_MAP( F32, F32, F32, 1 ),
PACK_KERNEL_MAP( F32, F32, F32, 2 ),
PACK_KERNEL_MAP( I32, I32, I32, 0 ),
PACK_KERNEL_MAP( I32, I32, I32, 1 ),
PACK_KERNEL_MAP( I32, I32, I32, 2 ),
PACK_KERNEL_MAP( U32, U32, U32, 0 ),
PACK_KERNEL_MAP( U32, U32, U32, 1 ),
PACK_KERNEL_MAP( U32, U32, U32, 2 ),
};
/*
* Kernel params
*/
static vx_param_description_t _rope_kernel_param_def[] =
{
{VX_INPUT, VX_TYPE_TENSOR, VX_PARAMETER_STATE_REQUIRED},
{VX_INPUT, VX_TYPE_TENSOR, VX_PARAMETER_STATE_REQUIRED},
{VX_INPUT, VX_TYPE_TENSOR, VX_PARAMETER_STATE_REQUIRED},
{VX_OUTPUT, VX_TYPE_TENSOR, VX_PARAMETER_STATE_REQUIRED},
{VX_INPUT, VX_TYPE_SCALAR, VX_PARAMETER_STATE_REQUIRED},
{VX_INPUT, VX_TYPE_SCALAR, VX_PARAMETER_STATE_REQUIRED},
{VX_INPUT, VX_TYPE_SCALAR, VX_PARAMETER_STATE_REQUIRED},
{VX_INPUT, VX_TYPE_SCALAR, VX_PARAMETER_STATE_REQUIRED},
{VX_INPUT, VX_TYPE_SCALAR, VX_PARAMETER_STATE_REQUIRED},
{VX_INPUT, VX_TYPE_SCALAR, VX_PARAMETER_STATE_REQUIRED},
{VX_INPUT, VX_TYPE_SCALAR, VX_PARAMETER_STATE_REQUIRED},
{VX_INPUT, VX_TYPE_SCALAR, VX_PARAMETER_STATE_REQUIRED},
{VX_INPUT, VX_TYPE_SCALAR, VX_PARAMETER_STATE_REQUIRED},
};
#define _ROPE_PARAM_NUM _cnt_of_array( _rope_kernel_param_def )
#define SCALAR_AXIS (4)
#define SCALAR_IN_ZP (5)
#define SCALAR_COS_ZP (6)
#define SCALAR_SIN_ZP (7)
#define SCALAR_SCALE0 (8)
#define SCALAR_SCALE1 (9)
#define SCALAR_OUT_ZP (10)
#define SCALAR_HALF_HEAD_SIZE (11)
#define SCALAR_STEP (12)
/*
* Kernel initializer
*/
DEF_KERNEL_INITIALIZER(_rope_initializer)
(
vsi_nn_kernel_node_t node,
const vsi_nn_kernel_node_param_t * param,
size_t param_size
)
{
gpu_param_t gpu_param = {
3, // workdim
{0, 0, 0}, // globalWorkOffset: control the start location be processed in the image
{0, 0, 0}, // globalWorkScale: how many pixels could be processed by a single thread
{0, 0, 0}, // localWorkSize: local group size in thread
{0, 0, 0} // globalWorkSize: image size in thread
};
vsi_status status = VSI_FAILURE;
vsi_nn_kernel_tensor_attr_t* attr[2] = { NULL };
int32_t axis = 0;
vsi_size_array_t* out_shape = NULL;
vsi_size_t shape[3] = { 1 };
VSI_UNREFERENCED(node);
VSI_UNREFERENCED(param_size);
attr[0] = vsi_nn_kernel_tensor_attr_create( (vsi_nn_kernel_tensor_t)param[0] );
CHECK_PTR_FAIL_GOTO( attr[0], "Create tensor attr buffer fail.", final );
attr[1] = vsi_nn_kernel_tensor_attr_create( (vsi_nn_kernel_tensor_t)param[3] );
CHECK_PTR_FAIL_GOTO( attr[1], "Create tensor attr buffer fail.", final );
status = vsi_nn_kernel_scalar_read_int32((vsi_nn_kernel_scalar_t)param[4], &axis);
CHECK_STATUS_FAIL_GOTO(status, final);
out_shape = attr[1]->shape;
shape[0] = out_shape->data[0];
shape[1] = out_shape->data[1];
shape[2] = out_shape->data[2];
shape[axis] = shape[axis] / 2;
gpu_param.global_scale[0] = 1;
gpu_param.global_scale[1] = 1;
gpu_param.global_scale[2] = 1;
gpu_param.global_size[0] = shape[0];
gpu_param.global_size[1] = shape[1];
gpu_param.global_size[2] = out_shape->size > 2 ? shape[2] : 1;
status = vsi_nn_kernel_gpu_config(node, &gpu_param);
final:
#define SAFE_FREE_TENSOR_ATTR(_PTR) if ( _PTR ) { vsi_nn_kernel_tensor_attr_release( &_PTR ); _PTR = NULL; }
SAFE_FREE_TENSOR_ATTR(attr[0]);
SAFE_FREE_TENSOR_ATTR(attr[1]);
#undef SAFE_FREE_TENSOR_ATTR
return status;
} /* _rope_initializer() */
/*
* Query kernel
*/
static vsi_status _query_kernel
(
vsi_nn_kernel_t * kernel,
vsi_nn_tensor_t * const * const inputs,
vsi_nn_tensor_t * const * const outputs,
int32_t axis
)
{
vsi_status status = VSI_FAILURE;
vsi_nn_kernel_dtype_e in0_dtype;
vsi_nn_kernel_dtype_e in1_dtype;
vsi_nn_kernel_dtype_e in2_dtype;
vsi_nn_kernel_dtype_e out_dtype;
const _kernel_map_type * kernel_map = _rope_kernel_map;
size_t kernel_map_size = _cnt_of_array( _rope_kernel_map );
vx_param_description_t * param_def = _rope_kernel_param_def;
vx_kernel_initialize_f initializer = _rope_initializer;
uint32_t key = 0;
uint32_t i;
in0_dtype = vsi_nn_kernel_map_dtype( inputs[0]->attr.dtype.vx_type );
in1_dtype = vsi_nn_kernel_map_dtype(inputs[1]->attr.dtype.vx_type);
in2_dtype = vsi_nn_kernel_map_dtype(inputs[2]->attr.dtype.vx_type);
out_dtype = vsi_nn_kernel_map_dtype( outputs[0]->attr.dtype.vx_type );
#define _PACK_SELECT_KEY( in0_type, in1_type, in2_type, out_type ) \
((in0_type) | (in1_type << 8) | (in2_type << 16) | (out_type << 24))
switch (_PACK_SELECT_KEY(in0_dtype, in1_dtype, in2_dtype, out_dtype))
{
case _PACK_SELECT_KEY(F32, F32, F32, F32):
case _PACK_SELECT_KEY(F16, F16, F16, F16):
key = ROPE_HASH_KEY(F32, F32, F32, axis);
break;
case _PACK_SELECT_KEY(U8, U8, U8, U8):
case _PACK_SELECT_KEY(U16, U16, U16, U16):
key = ROPE_HASH_KEY(U32, U32, U32, axis);
break;
case _PACK_SELECT_KEY(I8, I8, I8, I8):
case _PACK_SELECT_KEY(I16, I16, I16, I16):
case _PACK_SELECT_KEY(I32, I32, I32, I32):
key = ROPE_HASH_KEY(I32, I32, I32, axis);
break;
default:
break;
}
#undef _PACK_SELECT_KEY
for ( i = 0; i < (uint32_t)kernel_map_size; i ++ )
{
if ( kernel_map[i].key == key )
{
break;
}
}
if ( i < (uint32_t)kernel_map_size )
{
snprintf( kernel->info.name, VX_MAX_KERNEL_NAME, "%s", kernel_map[i].function_name );
kernel->info.parameters = param_def;
kernel->info.numParams = _cnt_of_array( _rope_kernel_param_def );
kernel->info.initialize = initializer;
// Register code source
vsi_nn_kernel_add_source( kernel, VSI_NN_GPU_SOURCE_FMT_CODE, 2,
"eltwise_ops_helper",
kernel_map[i].source_name );
// Register binary source
vsi_nn_kernel_add_source( kernel, VSI_NN_GPU_SOURCE_FMT_EXECUTABLE, 1,
kernel_map[i].source_name );
status = VSI_SUCCESS;
}
return status;
} /* _query_kernel() */
static vsi_nn_kernel_node_t _setup
(
vsi_nn_graph_t * graph,
vsi_nn_tensor_t ** inputs,
size_t input_num,
vsi_nn_tensor_t ** outputs,
size_t output_num,
const vsi_nn_kernel_param_t * params,
vsi_nn_kernel_t * kernel
)
{
vsi_status status = VSI_FAILURE;
vsi_nn_kernel_node_param_t node_params[_ROPE_PARAM_NUM] = {NULL};
vsi_nn_kernel_node_t node = NULL;
int32_t axis = vsi_nn_kernel_param_get_int32(params, "axis");
int32_t interleaved = vsi_nn_kernel_param_get_int32(params, "interleaved");
float in_scale = vsi_nn_get_tensor_scale(inputs[0]);
float cos_scale = vsi_nn_get_tensor_scale(inputs[1]);
float sin_scale = vsi_nn_get_tensor_scale(inputs[2]);
float out_scale = vsi_nn_get_tensor_scale(outputs[0]);
float in_zp = (float)vsi_nn_get_tensor_zero_point(inputs[0]);
float cos_zp = (float)vsi_nn_get_tensor_zero_point(inputs[1]);
float sin_zp = (float)vsi_nn_get_tensor_zero_point(inputs[2]);
float output_zp = (float)vsi_nn_get_tensor_zero_point(outputs[0]);
int32_t half_head_size = interleaved ? 1 : (int32_t)(inputs[0]->attr.size[axis] / 2);
float scale0 = in_scale * cos_scale / out_scale;
float scale1 = in_scale * sin_scale / out_scale;
int32_t step = interleaved ? 2 : 1;
int32_t i = 0;
// Check if gpu can support the size
if ( !vsi_nn_kernel_gpu_check_shape(
inputs[0]->attr.size, inputs[0]->attr.dim_num ) )
{
return NULL;
}
status = _query_kernel( kernel, inputs, outputs, axis );
if (VSI_SUCCESS == status)
{
node = vsi_nn_kernel_create_node( graph, kernel );
if ( node )
{
/* Set inputs and outputs */
vsi_nn_kernel_node_pack_io( node_params, _ROPE_PARAM_NUM,
inputs, input_num, outputs, output_num );
/* Pass parameters to node. */
node_params[SCALAR_AXIS] = vsi_nn_kernel_scalar_create(
graph, I32, &axis);
node_params[SCALAR_IN_ZP] = vsi_nn_kernel_scalar_create(
graph, F32, &in_zp);
node_params[SCALAR_COS_ZP] = vsi_nn_kernel_scalar_create(
graph, F32, &cos_zp);
node_params[SCALAR_SIN_ZP] = vsi_nn_kernel_scalar_create(
graph, F32, &sin_zp);
node_params[SCALAR_SCALE0] = vsi_nn_kernel_scalar_create(
graph, F32, &scale0);
node_params[SCALAR_SCALE1] = vsi_nn_kernel_scalar_create(
graph, F32, &scale1);
node_params[SCALAR_OUT_ZP] = vsi_nn_kernel_scalar_create(
graph, F32, &output_zp);
node_params[SCALAR_HALF_HEAD_SIZE] = vsi_nn_kernel_scalar_create(
graph, I32, &half_head_size);
node_params[SCALAR_STEP] = vsi_nn_kernel_scalar_create(
graph, I32, &step);
status = vsi_nn_kernel_node_pass_param( node, node_params, _ROPE_PARAM_NUM );
}
}
for (i = SCALAR_AXIS; i < (int32_t)_ROPE_PARAM_NUM; i++)
{
if (node_params[i])
{
vsi_nn_kernel_scalar_release(&node_params[i]);
}
}
return node;
} /* _setup() */
__END_DECLS
REGISTER_BACKEND_CL( rope, _setup )
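As a sanity check on the folded quantization above (standard affine-quantization algebra, not text from this diff): with x = in_scale * (q_x - in_zp) and cos = cos_scale * (q_cos - cos_zp), the product term x * cos requantizes through out_scale as (q_x - in_zp) * (q_cos - cos_zp) * scale0 + output_zp, where scale0 = in_scale * cos_scale / out_scale; scale1 plays the same role for the sin term. That is exactly what the scalars packed into the node encode.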

View File

@ -27,6 +27,7 @@
#include <stdlib.h>
#include <string.h>
#include "vsi_nn_types.h"
#include "vsi_nn_types_prv.h"
#include "vsi_nn_tensor.h"
#include "vsi_nn_graph.h"
#include "vsi_nn_log.h"
@ -299,7 +300,7 @@ static vsi_nn_kernel_node_t _setup
VSI_UNREFERENCED(output_num);
#if (VX_ACTIVATION_EXT_SUPPORT)
if (VSI_NN_HW_EVIS_2 == graph->ctx->config.evis.ver)
if (VSI_NN_HW_EVIS_2 == ((vsi_nn_graph_prv_t*)graph)->options->config.evis.ver)
{
return NULL;
}

View File

@ -26,6 +26,7 @@
#include <stdlib.h>
#include <string.h>
#include "vsi_nn_types.h"
#include "vsi_nn_types_prv.h"
#include "vsi_nn_tensor.h"
#include "vsi_nn_graph.h"
#include "vsi_nn_log.h"
@ -457,7 +458,7 @@ static vsi_nn_kernel_node_t _setup
vsi_bool is_odd_even_sort = FALSE;
vsi_bool is_bitnoic_segment = FALSE;
size_t param_num = _TOPK_PARAM_NUM;
int32_t max_stages = 7 + (int32_t)log2(graph->ctx->config.subGroupSize >> 2);
int32_t max_stages = 7 + (int32_t)log2(((vsi_nn_graph_prv_t*)graph)->options->config.subGroupSize >> 2);
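/* e.g. subGroupSize = 64 -> max_stages = 7 + log2(64 >> 2) = 11 (illustrative) */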
vsi_nn_kernel_dtype_e type0 = vsi_nn_kernel_map_dtype( inputs[0]->attr.dtype.vx_type );
vsi_nn_kernel_dtype_e type1 = vsi_nn_kernel_map_dtype( outputs[0]->attr.dtype.vx_type );
@ -483,6 +484,11 @@ static vsi_nn_kernel_node_t _setup
return NULL;
}
if (block_size >= GPU_TENSOR_MAX_WIDTH)
{
return NULL;
}
shape[0][0] = block_size;
shape[0][1] = block_num;
shape[1][0] = top_k;

View File

@ -27,6 +27,7 @@
#include <stdlib.h>
#include <string.h>
#include "vsi_nn_types.h"
#include "vsi_nn_types_prv.h"
#include "vsi_nn_tensor.h"
#include "vsi_nn_graph.h"
#include "vsi_nn_log.h"
@ -192,7 +193,7 @@ static vsi_bool _bucketize_support_types
return FALSE;
}
if (in_dtype == F16 && graph->ctx->config.evis.ver != VSI_NN_HW_EVIS_2)
if (in_dtype == F16 && ((vsi_nn_graph_prv_t*)graph)->options->config.evis.ver != VSI_NN_HW_EVIS_2)
{
return FALSE;
}

View File

@ -27,6 +27,7 @@
#include <stdlib.h>
#include <string.h>
#include "vsi_nn_types.h"
#include "vsi_nn_types_prv.h"
#include "vsi_nn_tensor.h"
#include "vsi_nn_graph.h"
#include "vsi_nn_log.h"
@ -771,7 +772,8 @@ static vsi_nn_kernel_node_t _setup
temp_tensor[1] = weights;
temp_tensor[2] = biases;
ks = get_kernel_size(weights->attr.size[0], dilation, stride, graph->ctx->config.evis.ver);
ks = get_kernel_size(weights->attr.size[0], dilation, stride,
((vsi_nn_graph_prv_t*)graph)->options->config.evis.ver);
status = _query_kernel( kernel, temp_tensor, outputs, dilation, ks);

View File

@ -121,7 +121,9 @@ static const _kernel_map_type _groupnorm_sums_kernel_map[] =
TENSOR_GROUPNORM_SUMS_KERNELS( U8, F32, KERNEL_SOURCE_0 )
TENSOR_GROUPNORM_SUMS_KERNELS_2D( U8, F32, KERNEL_SOURCE_0 )
TENSOR_GROUPNORM_SUMS_KERNELS( I16, F32, KERNEL_SOURCE_2 )
TENSOR_GROUPNORM_SUMS_KERNELS( U16, F32, KERNEL_SOURCE_2 )
TENSOR_GROUPNORM_SUMS_KERNELS_2D( I16, F32, KERNEL_SOURCE_2 )
TENSOR_GROUPNORM_SUMS_KERNELS_2D( U16, F32, KERNEL_SOURCE_2 )
TENSOR_GROUPNORM_SUMS_KERNELS( F16, F32, KERNEL_SOURCE_2 )
TENSOR_GROUPNORM_SUMS_KERNELS_2D( F16, F32, KERNEL_SOURCE_2 )
};
@ -174,6 +176,9 @@ static const _kernel_map_type _groupnorm_kernel_map[] =
TENSOR_GROUPNORM_SCALE_KERNELS_2D( F16, F32, U8, KERNEL_SOURCE_2 )
TENSOR_GROUPNORM_SCALE_KERNELS( F16, F32, F16, KERNEL_SOURCE_2 )
TENSOR_GROUPNORM_SCALE_KERNELS_2D( F16, F32, F16, KERNEL_SOURCE_2 )
TENSOR_GROUPNORM_SCALE_KERNELS( U16, F32, U16, KERNEL_SOURCE_2 )
TENSOR_GROUPNORM_SCALE_KERNELS_2D( U16, F32, U16, KERNEL_SOURCE_2 )
};
/*
@ -245,6 +250,7 @@ DEF_KERNEL_INITIALIZER(_groupnorm_sums_initializer)
float sum_x2_tail0 = 1;
float sum_x2_tail1 = 1;
float work_item_pixels = 1;
vsi_bool is_input_8bits = FALSE;
VSI_UNREFERENCED(param_size);
@ -263,12 +269,13 @@ DEF_KERNEL_INITIALIZER(_groupnorm_sums_initializer)
width = (int32_t)(input_shape->data[0]);
height = (int32_t)(input_shape->data[1]);
chn = (int32_t)(attr[1]->shape->data[1]);
is_input_8bits = attr[0]->dtype == I8 || attr[0]->dtype == U8;
if (is2D)
{
height = 1;
}
work_item_pixels = (float)height * 16;
work_item_pixels = is_input_8bits ? 16 * (float)height : 8 * (float)height;
sum_x_tail = -work_item_pixels * input_zp * input_scale;
sum_x2_tail0 = work_item_pixels * input_zp * input_zp * input_scale2;
@ -281,11 +288,11 @@ DEF_KERNEL_INITIALIZER(_groupnorm_sums_initializer)
shaderParam.local_size[1] = 1;
shaderParam.local_size[2] = 1;
if (attr[0]->dtype == I8 || attr[0]->dtype == U8)
if (is_input_8bits)
{
shaderParam.global_size[0] = (width + 255) / 256 * 16;
}
else if (attr[0]->dtype == I16 || attr[0]->dtype == F16)
else if (attr[0]->dtype == I16 || attr[0]->dtype == F16 || attr[0]->dtype == U16)
{
shaderParam.global_size[0] = (width + 127) / 128 * 16;
}
@ -324,7 +331,7 @@ DEF_KERNEL_INITIALIZER(_groupnorm_sums_initializer)
status |= vsi_nn_kernel_gpu_add_param(node, "sum_x2_tail1", &sum_x2_tail1);
CHECK_STATUS_FAIL_GOTO(status, OnError );
}
else if (attr[0]->dtype == I16 || attr[0]->dtype == F16)
else if (attr[0]->dtype == I16 || attr[0]->dtype == F16 || attr[0]->dtype == U16)
{
gpu_dp_inst_t uniSum_X_X2_8x2 = {{
0x55555555, // TCfg
@ -483,7 +490,7 @@ DEF_KERNEL_INITIALIZER(_groupnorm_initializer)
}
shaderParam.global_scale[0] = 16;
if (attr[0]->dtype == I16 || attr[0]->dtype == F16)
if (attr[0]->dtype == I16 || attr[0]->dtype == F16 || attr[0]->dtype == U16)
{
shaderParam.global_scale[0] = 8;
}
@ -610,6 +617,7 @@ DEF_KERNEL_INITIALIZER(_groupnorm_initializer)
CHECK_STATUS_FAIL_GOTO(status, OnError );
}
break;
case _PACK_SELECT_KEY( U16, U16 ):
case _PACK_SELECT_KEY( I16, I16 ):
case _PACK_SELECT_KEY( I16, F16 ):
case _PACK_SELECT_KEY( F16, F16 ):
@ -838,8 +846,7 @@ static vsi_nn_kernel_node_t _setup
attr.is_const = FALSE;
attr.vtl = TRUE;
attr.size[0] = ((new_shape[0] + 255) / 256) * 4;
if ( inputs[0]->attr.dtype.vx_type == VSI_NN_TYPE_INT16
|| inputs[0]->attr.dtype.vx_type == VSI_NN_TYPE_FLOAT16)
if (in0_dtype == I16 || in0_dtype == F16 || in0_dtype == U16)
{
attr.size[0] = ((new_shape[0] + 127) / 128) * 4;
}

View File

@ -124,22 +124,23 @@ DEF_KERNEL_INITIALIZER(_prelu_initializer)
{0, 0, 0}
};
int8_t in0_fl = 0;
int32_t inputZP0 = 0;
float input_scale0 = 1.0f;
int32_t inputZP1 = 0;
float input_scale1 = 1.0f;
int32_t input0_zp = 0;
float input0_scale = 1.0f;
int32_t input1_zp = 0;
float input1_scale = 1.0f;
float output_zp = 0;
int8_t out_fl = 0;
float outputZP = 0;
int32_t shift0 = 0;
vsi_bool is_ge_fl = FALSE;
int32_t shift0 = 0;
vsi_bool is_ge_fl = FALSE;
vsi_bool is_2d_img = FALSE;
uint32_t evis_version = 0;
vsi_nn_kernel_tensor_attr_t * attr[3] = { NULL };
vsi_size_array_t * out_shape = NULL;
uint32_t pack_key;
vx_context ctx = vxGetContext((vx_reference)node);
vx_context ctx = vxGetContext((vx_reference)node);
vx_hardware_caps_params_t hw_param;
VSI_UNREFERENCED(param_size);
@ -165,34 +166,30 @@ DEF_KERNEL_INITIALIZER(_prelu_initializer)
CHECK_PTR_FAIL_GOTO( attr[2], "Create tensor attr buffer fail.", final );
out_shape = attr[2]->shape;
inputZP0 = attr[0]->zero_point;
input_scale0 = attr[0]->scale;
inputZP1 = attr[1]->zero_point;
input_scale1 = attr[1]->scale;
outputZP = (float)attr[2]->zero_point;
input_scale0 = input_scale0 / attr[2]->scale;
input0_zp = attr[0]->zero_point;
input0_scale = attr[0]->scale;
input1_zp = attr[1]->zero_point;
input1_scale = attr[1]->scale;
output_zp = (float)attr[2]->zero_point;
input0_scale = input0_scale / attr[2]->scale;
if ( attr[0]->quant == VSI_NN_KERNEL_QUANT_DFP )
if (attr[0]->quant == VSI_NN_KERNEL_QUANT_DFP &&
attr[2]->quant == VSI_NN_KERNEL_QUANT_DFP)
{
in0_fl = (int8_t)attr[0]->dfp.fl;
}
if ( attr[2]->quant == VSI_NN_KERNEL_QUANT_DFP )
{
out_fl = (int8_t)attr[2]->dfp.fl;
shift0 = in0_fl - out_fl;
is_ge_fl = shift0 >= 0;
}
shift0 = in0_fl - out_fl;
is_2d_img = (out_shape->size < 3) || (out_shape->data[2] == 1);
is_ge_fl = shift0 >= 0;
#define _PACK_SELECT_KEY( IN0_TYPE, OUT_TYPE, GE_FL, IMG_2D, EVIS2 ) \
(IN0_TYPE | ( OUT_TYPE << 16) | (GE_FL << 24) | (IMG_2D << 25) | (EVIS2 << 26))
pack_key = _PACK_SELECT_KEY( attr[0]->dtype, attr[2]->dtype, is_ge_fl, is_2d_img, evis_version );
pack_key = _PACK_SELECT_KEY(attr[0]->dtype, attr[2]->dtype, is_ge_fl, is_2d_img, evis_version);
if ( attr[0]->dtype == I8 && attr[2]->dtype == I8 && is_ge_fl)
if (attr[0]->dtype == I8 && attr[2]->dtype == I8 && is_ge_fl)
{
gpu_param.global_scale[0] = 16;
gpu_param.global_scale[1] = 1;
@ -204,7 +201,6 @@ DEF_KERNEL_INITIALIZER(_prelu_initializer)
gpu_param.global_scale[1] = 1;
gpu_param.global_scale[2] = 1;
}
gpu_param.global_size[0] = gpu_align_p2(
(out_shape->data[0] + gpu_param.global_scale[0] - 1)
/ gpu_param.global_scale[0], 4);
@ -215,97 +211,97 @@ DEF_KERNEL_INITIALIZER(_prelu_initializer)
switch( pack_key )
{
case _PACK_SELECT_KEY( I8, I8, 1, 1, 2 ):
case _PACK_SELECT_KEY( I16, I16, 1, 1, 2 ):
case _PACK_SELECT_KEY(I8, I8, 1, 1, 2):
case _PACK_SELECT_KEY(I16, I16, 1, 1, 2):
{
gpu_dp_inst_t uniPreluDFPLo_2x8b = { {
0x77777777, // TCfg
0x44444444, // ASelt
0x33221100, 0x77665544, // ABin
0x00000000, // BSelt
0x30201000, 0x70605040, // BBin
0x00004000, // AccumType, ConstantType, and PostShift
0x00000000, 0x00000000, 0x00000000, 0x00000000,
0x00000000, 0x00000000, 0x00000000, 0x00000000 // Constant
}, GPU_DP_TYPE_16 };
gpu_dp_inst_t uniPreluDFPHi_2x8b = { {
0x77777777, // TCfg
0x44444444, // ASelt
0xbbaa9988, 0xffeeddcc, // ABin
0x00000000, // BSelt
0x30201000, 0x70605040, // BBin
0x00004000, // AccumType, ConstantType, and PostShift
0x00000000, 0x00000000, 0x00000000, 0x00000000,
0x00000000, 0x00000000, 0x00000000, 0x00000000 // Constant
}, GPU_DP_TYPE_16 };
if (attr[0]->dtype == I16)
{
gpu_dp_inst_t uniPreluDFPLo_2x8b = {{
0x77777777, // TCfg
0x44444444, // ASelt
0x33221100, 0x77665544, // ABin
0x00000000, // BSelt
0x30201000, 0x70605040, // BBin
0x00004000, // AccumType, ConstantType, and PostShift
0x00000000, 0x00000000, 0x00000000, 0x00000000,
0x00000000, 0x00000000, 0x00000000, 0x00000000 // Constant
}, GPU_DP_TYPE_16 };
gpu_dp_inst_t uniPreluDFPHi_2x8b = {{
0x77777777, // TCfg
0x44444444, // ASelt
0xbbaa9988, 0xffeeddcc, // ABin
0x00000000, // BSelt
0x30201000, 0x70605040, // BBin
0x00004000, // AccumType, ConstantType, and PostShift
0x00000000, 0x00000000, 0x00000000, 0x00000000,
0x00000000, 0x00000000, 0x00000000, 0x00000000 // Constant
}, GPU_DP_TYPE_16 };
if ( attr[0]->dtype == I16 )
{
uniPreluDFPLo_2x8b.data[7] = 0x00003000;
uniPreluDFPHi_2x8b.data[7] = 0x00003000;
}
gpu_dp_inst_update_postshfit( &uniPreluDFPLo_2x8b, shift0 );
gpu_dp_inst_update_postshfit( &uniPreluDFPHi_2x8b, shift0 );
status = vsi_nn_kernel_gpu_add_param( node,
"uniPreluDFPLo_2x8b", &uniPreluDFPLo_2x8b );
status |= vsi_nn_kernel_gpu_add_param( node,
"uniPreluDFPHi_2x8b", &uniPreluDFPHi_2x8b );
CHECK_STATUS_FAIL_GOTO(status, final );
uniPreluDFPLo_2x8b.data[7] = 0x00003000;
uniPreluDFPHi_2x8b.data[7] = 0x00003000;
}
break;
case _PACK_SELECT_KEY( I8, I8, 1, 1, 1 ):
case _PACK_SELECT_KEY( I16, I16, 1, 1, 1 ):
{
gpu_dp_inst_t uniPreluInt8_2x8 = {{
0x55555555, // TCfg
0x00000000, // ASelt
0xb3a29180, 0xf7e6d5c4, // ABin
0x66666666, // BSelt
0x30201000, 0x70605040, // BBin
0x00000600, // AccumType, ConstantType, and PostShift
0x00000001, 0x00000001, 0x00000001, 0x00000001,
0x00000001, 0x00000001, 0x00000001, 0x00000001 // Constant
}, GPU_DP_TYPE_16 };
gpu_dp_inst_t uniPreluInt16_part0_4x4 = {{
0x05050505, // TCfg
0x00000000, // ASelt
0x00510040, 0x00730062, // ABin
0x06060606, // BSelt
0x00100000, 0x00300020, // BBin
0x00000400, // AccumType, ConstantType, and PostShift
0x00000001, 0x00000000, 0x00000001, 0x00000000,
0x00000001, 0x00000000, 0x00000001, 0x00000000 // Constant
}, GPU_DP_TYPE_16 };
gpu_dp_inst_t uniPreluInt16_part1_4x4 = {{
0x05050505, // TCfg
0x00000000, // ASelt
0x00510040, 0x00730062, // ABin
0x06060606, // BSelt
0x00500040, 0x00700060, // BBin
0x00000400, // AccumType, ConstantType, and PostShift
0x00000001, 0x00000000, 0x00000001, 0x00000000,
0x00000001, 0x00000000, 0x00000001, 0x00000000 // Constant
}, GPU_DP_TYPE_16 };
gpu_dp_inst_update_postshfit( &uniPreluInt8_2x8, shift0 );
gpu_dp_inst_update_postshfit( &uniPreluInt16_part0_4x4, shift0 );
gpu_dp_inst_update_postshfit( &uniPreluInt16_part1_4x4, shift0 );
gpu_dp_inst_update_postshfit(&uniPreluDFPLo_2x8b, shift0);
gpu_dp_inst_update_postshfit(&uniPreluDFPHi_2x8b, shift0);
status = vsi_nn_kernel_gpu_add_param( node,
"uniPreluInt8_2x8", &uniPreluInt8_2x8 );
status |= vsi_nn_kernel_gpu_add_param( node,
"uniPreluInt16_part0_4x4", &uniPreluInt16_part0_4x4 );
status |= vsi_nn_kernel_gpu_add_param( node,
"uniPreluInt16_part1_4x4", &uniPreluInt16_part1_4x4 );
CHECK_STATUS_FAIL_GOTO(status, final );
}
break;
case _PACK_SELECT_KEY( BF16, BF16, 1, 1, 1 ):
case _PACK_SELECT_KEY( BF16, BF16, 1, 1, 2 ):
case _PACK_SELECT_KEY( BF16, BF16, 1, 0, 1 ):
case _PACK_SELECT_KEY( BF16, BF16, 1, 0, 2 ):
status = vsi_nn_kernel_gpu_add_param(node,
"uniPreluDFPLo_2x8b", &uniPreluDFPLo_2x8b);
status |= vsi_nn_kernel_gpu_add_param(node,
"uniPreluDFPHi_2x8b", &uniPreluDFPHi_2x8b);
CHECK_STATUS_FAIL_GOTO(status, final);
}
break;
case _PACK_SELECT_KEY(I8, I8, 1, 1, 1):
case _PACK_SELECT_KEY(I16, I16, 1, 1, 1):
{
gpu_dp_inst_t uniPreluInt8_2x8 = { {
0x55555555, // TCfg
0x00000000, // ASelt
0xb3a29180, 0xf7e6d5c4, // ABin
0x66666666, // BSelt
0x30201000, 0x70605040, // BBin
0x00000600, // AccumType, ConstantType, and PostShift
0x00000001, 0x00000001, 0x00000001, 0x00000001,
0x00000001, 0x00000001, 0x00000001, 0x00000001 // Constant
}, GPU_DP_TYPE_16 };
gpu_dp_inst_t uniPreluInt16_part0_4x4 = { {
0x05050505, // TCfg
0x00000000, // ASelt
0x00510040, 0x00730062, // ABin
0x06060606, // BSelt
0x00100000, 0x00300020, // BBin
0x00000400, // AccumType, ConstantType, and PostShift
0x00000001, 0x00000000, 0x00000001, 0x00000000,
0x00000001, 0x00000000, 0x00000001, 0x00000000 // Constant
}, GPU_DP_TYPE_16 };
gpu_dp_inst_t uniPreluInt16_part1_4x4 = { {
0x05050505, // TCfg
0x00000000, // ASelt
0x00510040, 0x00730062, // ABin
0x06060606, // BSelt
0x00500040, 0x00700060, // BBin
0x00000400, // AccumType, ConstantType, and PostShift
0x00000001, 0x00000000, 0x00000001, 0x00000000,
0x00000001, 0x00000000, 0x00000001, 0x00000000 // Constant
}, GPU_DP_TYPE_16 };
gpu_dp_inst_update_postshfit(&uniPreluInt8_2x8, shift0);
gpu_dp_inst_update_postshfit(&uniPreluInt16_part0_4x4, shift0);
gpu_dp_inst_update_postshfit(&uniPreluInt16_part1_4x4, shift0);
status = vsi_nn_kernel_gpu_add_param(node,
"uniPreluInt8_2x8", &uniPreluInt8_2x8);
status |= vsi_nn_kernel_gpu_add_param(node,
"uniPreluInt16_part0_4x4", &uniPreluInt16_part0_4x4);
status |= vsi_nn_kernel_gpu_add_param(node,
"uniPreluInt16_part1_4x4", &uniPreluInt16_part1_4x4);
CHECK_STATUS_FAIL_GOTO(status, final);
}
break;
case _PACK_SELECT_KEY(BF16, BF16, 0, 1, 1):
case _PACK_SELECT_KEY(BF16, BF16, 0, 1, 2):
case _PACK_SELECT_KEY(BF16, BF16, 0, 0, 1):
case _PACK_SELECT_KEY(BF16, BF16, 0, 0, 2):
{
gpu_dp_inst_t uniConvBF16toF32_Part0_2x8 = {{
0x11111111, // TCfg
@ -446,15 +442,15 @@ DEF_KERNEL_INITIALIZER(_prelu_initializer)
status |= vsi_nn_kernel_gpu_add_param( node,
"uniConvF16toF32_part1_4x4", &uniConvF16toF32_part1_4x4 );
status |= vsi_nn_kernel_gpu_add_param( node,
"inputZP0", &inputZP0 );
"input0_zp", &input0_zp);
status |= vsi_nn_kernel_gpu_add_param( node,
"input_scale0", &input_scale0 );
"input0_scale", &input0_scale );
status |= vsi_nn_kernel_gpu_add_param( node,
"inputZP1", &inputZP1 );
"input1_zp", &input1_zp);
status |= vsi_nn_kernel_gpu_add_param( node,
"input_scale1", &input_scale1 );
"input1_scale", &input1_scale );
status |= vsi_nn_kernel_gpu_add_param( node,
"outputZP", &outputZP );
"output_zp", &output_zp );
if (attr[2]->dtype == F16)
{
status |= vsi_nn_kernel_gpu_add_param( node,


@ -27,6 +27,7 @@
#include <stdlib.h>
#include <string.h>
#include "vsi_nn_types.h"
#include "vsi_nn_types_prv.h"
#include "vsi_nn_tensor.h"
#include "vsi_nn_graph.h"
#include "vsi_nn_log.h"
@ -58,53 +59,92 @@ typedef enum
#define _RESIZE_BILINEAR_KERNEL_SOURCE_OPT(_input_type) "resize_bilinear_"#_input_type"_opt"
#define _RESIZE_BILINEAR_KERNEL_SOURCE_UP_HPC1(_input_type) "resize_bilinear_"#_input_type"_half_pixel_centers_1"
#define _RESIZE_BILINEAR_KERNEL_SOURCE_UP_HPC2(_input_type) "resize_bilinear_"#_input_type"_half_pixel_centers_2"
#define _RESIZE_BILINEAR_KERNEL_SOURCE_UP_HPC3(_input_type) "resize_bilinear_"#_input_type"_half_pixel_centers_3"
#define _RESIZE_BILINEAR_KERNEL_SOURCE_UP_HPC4(_input_type) "resize_bilinear_"#_input_type"_half_pixel_centers_4"
#define _RESIZE_BILINEAR_KERNEL_SOURCE_UP_HPC5(_input_type) "resize_bilinear_"#_input_type"_half_pixel_centers_5"
#define STR(a) #a
// Add kernel hashtable here
#define RESIZE_BILINEAR_HASH_KEY( IN_DTYPE, OUT_DTYPE, scale_flag, same_type ) \
(( IN_DTYPE ) | ( OUT_DTYPE << 8) | (scale_flag << 16) | (same_type << 22))
#define _PACK_KERNEL_MAP_DOWN( IN_DTYPE, OUT_DTYPE, SAME_TYPE ) \
{ RESIZE_BILINEAR_HASH_KEY( IN_DTYPE, OUT_DTYPE, DOWN, SAME_TYPE ), \
CVIVANTE_NAMESPACE("evis.resize_bilinear_"STR(IN_DTYPE)"to"STR(OUT_DTYPE)"_DOWN"), \
_RESIZE_BILINEAR_KERNEL_SOURCE(IN_DTYPE) }
#define PACK_KERNEL_MAP_DOWN( IN_DTYPE, OUT_DTYPE ) \
_PACK_KERNEL_MAP_DOWN( IN_DTYPE, OUT_DTYPE, TRUE ), \
_PACK_KERNEL_MAP_DOWN( IN_DTYPE, OUT_DTYPE, FALSE )
#define _PACK_KERNEL_MAP_UP( IN_DTYPE, OUT_DTYPE, SAME_TYPE ) \
{ RESIZE_BILINEAR_HASH_KEY( IN_DTYPE, OUT_DTYPE, UP, SAME_TYPE ), \
CVIVANTE_NAMESPACE("evis.resize_bilinear_"STR(IN_DTYPE)"to"STR(OUT_DTYPE)"_UP"), \
_RESIZE_BILINEAR_KERNEL_SOURCE(IN_DTYPE) }
#define PACK_KERNEL_MAP_UP( IN_DTYPE, OUT_DTYPE ) \
_PACK_KERNEL_MAP_UP( IN_DTYPE, OUT_DTYPE, TRUE ), \
_PACK_KERNEL_MAP_UP( IN_DTYPE, OUT_DTYPE, FALSE )
#define _PACK_KERNEL_MAP_UP_OPT( IN_DTYPE, OUT_DTYPE, SAME_TYPE ) \
{ RESIZE_BILINEAR_HASH_KEY( IN_DTYPE, OUT_DTYPE, UP_OPT, SAME_TYPE ), \
CVIVANTE_NAMESPACE("evis.resize_bilinear_"STR(IN_DTYPE)"to"STR(OUT_DTYPE)"_UP_opt"), \
_RESIZE_BILINEAR_KERNEL_SOURCE_OPT(IN_DTYPE) }
#define PACK_KERNEL_MAP_UP_OPT( IN_DTYPE, OUT_DTYPE ) \
_PACK_KERNEL_MAP_UP_OPT( IN_DTYPE, OUT_DTYPE, TRUE ), \
_PACK_KERNEL_MAP_UP_OPT( IN_DTYPE, OUT_DTYPE, FALSE )
#define PACK_KERNEL_MAP_UP_2X_HALF_SAME_TYPE( IN_DTYPE, OUT_DTYPE ) \
{ RESIZE_BILINEAR_HASH_KEY( IN_DTYPE, OUT_DTYPE, UP_2X_HALF, TRUE ), \
CVIVANTE_NAMESPACE("evis.resize_bilinear_"STR(IN_DTYPE)"to"STR(OUT_DTYPE) \
"_SAME_2x_upsample_half_pixel_centers"), \
_RESIZE_BILINEAR_KERNEL_SOURCE_UP_HPC1(IN_DTYPE) }
#define PACK_KERNEL_MAP_UP_4X_HALF_SAME_TYPE( IN_DTYPE, OUT_DTYPE ) \
{ RESIZE_BILINEAR_HASH_KEY( IN_DTYPE, OUT_DTYPE, UP_4X_HALF, TRUE ), \
CVIVANTE_NAMESPACE("evis.resize_bilinear_"STR(IN_DTYPE)"to"STR(OUT_DTYPE) \
"_SAME_4x_upsample_half_pixel_centers"), \
_RESIZE_BILINEAR_KERNEL_SOURCE_UP_HPC1(IN_DTYPE) }
#define PACK_KERNEL_MAP_UP_8X_HALF_SAME_TYPE( IN_DTYPE, OUT_DTYPE ) \
{ RESIZE_BILINEAR_HASH_KEY( IN_DTYPE, OUT_DTYPE, UP_8X_HALF, TRUE ), \
CVIVANTE_NAMESPACE("evis.resize_bilinear_"STR(IN_DTYPE)"to"STR(OUT_DTYPE) \
"_SAME_8x_upsample_half_pixel_centers"), \
_RESIZE_BILINEAR_KERNEL_SOURCE_UP_HPC2(IN_DTYPE) }
#define PACK_KERNEL_MAP_UP_3X_HALF_SAME_TYPE( IN_DTYPE, OUT_DTYPE ) \
{ RESIZE_BILINEAR_HASH_KEY( IN_DTYPE, OUT_DTYPE, UP_3X_HALF, TRUE ), \
CVIVANTE_NAMESPACE("evis.resize_bilinear_"STR(IN_DTYPE)"to"STR(OUT_DTYPE) \
"_SAME_3x_upsample_half_pixel_centers"), \
_RESIZE_BILINEAR_KERNEL_SOURCE_UP_HPC1(IN_DTYPE) }
#define PACK_KERNEL_MAP_UP_2X_HALF( IN_DTYPE, OUT_DTYPE ) \
{ RESIZE_BILINEAR_HASH_KEY( IN_DTYPE, OUT_DTYPE, UP_2X_HALF, FALSE ), \
CVIVANTE_NAMESPACE("evis.resize_bilinear_"STR(IN_DTYPE)"to"STR(OUT_DTYPE) \
"_2x_upsample_half_pixel_centers"), \
_RESIZE_BILINEAR_KERNEL_SOURCE_UP_HPC3(IN_DTYPE) }
#define PACK_KERNEL_MAP_UP_4X_HALF( IN_DTYPE, OUT_DTYPE ) \
{ RESIZE_BILINEAR_HASH_KEY( IN_DTYPE, OUT_DTYPE, UP_4X_HALF, FALSE ), \
CVIVANTE_NAMESPACE("evis.resize_bilinear_"STR(IN_DTYPE)"to"STR(OUT_DTYPE) \
"_4x_upsample_half_pixel_centers"), \
_RESIZE_BILINEAR_KERNEL_SOURCE_UP_HPC3(IN_DTYPE) }
#define PACK_KERNEL_MAP_UP_8X_HALF( IN_DTYPE, OUT_DTYPE ) \
{ RESIZE_BILINEAR_HASH_KEY( IN_DTYPE, OUT_DTYPE, UP_8X_HALF, FALSE ), \
CVIVANTE_NAMESPACE("evis.resize_bilinear_"STR(IN_DTYPE)"to"STR(OUT_DTYPE) \
"_8x_upsample_half_pixel_centers"), \
_RESIZE_BILINEAR_KERNEL_SOURCE_UP_HPC5(IN_DTYPE) }
#define PACK_KERNEL_MAP_UP_3X_HALF( IN_DTYPE, OUT_DTYPE ) \
{ RESIZE_BILINEAR_HASH_KEY( IN_DTYPE, OUT_DTYPE, UP_3X_HALF, FALSE ), \
CVIVANTE_NAMESPACE("evis.resize_bilinear_"STR(IN_DTYPE)"to"STR(OUT_DTYPE) \
"_3x_upsample_half_pixel_centers"), \
_RESIZE_BILINEAR_KERNEL_SOURCE_UP_HPC4(IN_DTYPE) }
#define PACK_KERNEL_MAP_UP_8X_ALIGN( IN_DTYPE, OUT_DTYPE ) \
{ RESIZE_BILINEAR_HASH_KEY( IN_DTYPE, OUT_DTYPE, UP_8X_ALIGN, TRUE ), \
CVIVANTE_NAMESPACE("evis.resize_bilinear_"STR(IN_DTYPE)"to"STR(OUT_DTYPE) \
"_SAME_8x_upsample_align_corners"), \
"resize_bilinear_align_corners" }
@ -135,6 +175,10 @@ static const _kernel_map_type _resize_bilinear_kernel_map[] =
PACK_KERNEL_MAP_UP(F16, F16),
PACK_KERNEL_MAP_UP(BF16, BF16),
PACK_KERNEL_MAP_UP_OPT(U8, U8),
PACK_KERNEL_MAP_UP_2X_HALF_SAME_TYPE(U8, U8),
PACK_KERNEL_MAP_UP_3X_HALF_SAME_TYPE(U8, U8),
PACK_KERNEL_MAP_UP_4X_HALF_SAME_TYPE(U8, U8),
PACK_KERNEL_MAP_UP_8X_HALF_SAME_TYPE(U8, U8),
PACK_KERNEL_MAP_UP_2X_HALF(U8, U8),
PACK_KERNEL_MAP_UP_3X_HALF(U8, U8),
PACK_KERNEL_MAP_UP_4X_HALF(U8, U8),
@ -672,18 +716,23 @@ DEF_KERNEL_INITIALIZER(_bilinear_half_pixel_centers_opt_initializer)
};
vsi_nn_kernel_tensor_attr_t * output_attr = NULL;
vsi_nn_kernel_tensor_attr_t * input_attr = NULL;
vsi_size_array_t * out_shape = NULL;
vsi_size_array_t * in_shape = NULL;
vsi_nn_kernel_dtype_e input_dtype = F16;
vsi_nn_kernel_dtype_e output_dtype = F16;
uint32_t depth = 0;
uint32_t in_width = 0;
uint32_t in_height = 0;
uint32_t out_width = 0;
uint32_t out_height = 0;
vsi_bool is_same_type = FALSE;
vsi_bool is_2x_up_kernel = FALSE;
vsi_bool is_3x_up_kernel = FALSE;
vsi_bool is_4x_up_kernel = FALSE;
vsi_bool is_8x_up_kernel = FALSE;
float scale = 1.f;
int32_t input_zp = 0;
int32_t output_zp = 0;
VSI_UNREFERENCED(param_size);
@ -692,17 +741,23 @@ DEF_KERNEL_INITIALIZER(_bilinear_half_pixel_centers_opt_initializer)
output_attr = vsi_nn_kernel_tensor_attr_create( (vsi_nn_kernel_tensor_t)param[1] );
CHECK_PTR_FAIL_GOTO( output_attr, "Create tensor attr buffer fail.", final );
out_shape = output_attr->shape;
in_shape = input_attr->shape;
input_dtype = input_attr->dtype;
output_dtype = output_attr->dtype;
in_width = (uint32_t)(in_shape->data[0]);
in_height = (uint32_t)(in_shape->data[1]);
depth = (uint32_t)(in_shape->data[2]);
out_width = (uint32_t)(out_shape->data[0]);
out_height = (uint32_t)(out_shape->data[1]);
scale = input_attr->scale;
input_zp = input_attr->zero_point;
scale /= output_attr->scale;
output_zp = output_attr->zero_point;
is_same_type = _is_same_quant(input_attr, output_attr);
if ((U8 == input_dtype) && (output_dtype == U8))
{
is_2x_up_kernel = (2 * in_width == out_width) && (2 * in_height == out_height);
is_3x_up_kernel = (3 * in_width == out_width) && (3 * in_height == out_height);
@ -728,206 +783,303 @@ DEF_KERNEL_INITIALIZER(_bilinear_half_pixel_centers_opt_initializer)
gpu_param.global_scale[2] = 1;
}
if (is_2x_up_kernel || is_3x_up_kernel || is_4x_up_kernel || is_8x_up_kernel)
{
uint16_t M0 = 0;
int32_t postShift = 0;
uint32_t multAndoutZP[2] = { 0 };
gpu_dp_inst_t uniU8PostProcess_2x8 = { {
0xdddddddd, // TCfg
0x44444444, // ASelt
0x13121110, 0x17161514, // ABin
0x11111111, // BSelt
0x00000000, 0x00000000, // BBin
0x00002600, // AccumType, ConstantType, and PostShift
0x00000000, 0x00000000, 0x00000000, 0x00000000,
0x00000000, 0x00000000, 0x00000000, 0x00000000 // Constant
}, GPU_DP_TYPE_16 };
status = vsi_nn_kernel_gpu_add_param( node, "uniResize2xUp_0_4x8", &uniResize2xUp_0_4x8);
status |= vsi_nn_kernel_gpu_add_param( node, "uniResize2xUp_1_4x8", &uniResize2xUp_1_4x8);
status |= vsi_nn_kernel_gpu_add_param( node, "out_height", &out_height);
CHECK_STATUS_FAIL_GOTO(status, final );
}
else if (is_3x_up_kernel)
{
if (is_2x_up_kernel)
{
gpu_dp_inst_t uniResize2xUp_0_4x8 = { {
0x55555555, 0x55555555, // TCfg
0x4418c020, 0x29444194, 0xc8629c86, 0x83a4c839, 0xad0a4a4c, // BinSelect
0x00000704, // AccumType, ConstantType, and PostShift
0x09030301, 0x03090103, 0x09030301, 0x03090103,
0x09030301, 0x03090103, 0x09030301, 0x03090103 // Constant
}, GPU_DP_TYPE_16 };
gpu_dp_inst_t uniResize2xUp_1_4x8 = { {
0x55555555, 0x55555555, // TCfg
0x4c5ad0a4, 0x6b54c5b5, 0xd8e6bd8e, 0x07c5d07b, 0xce128c5d, // BinSelect
0x00000704, // AccumType, ConstantType, and PostShift
0x09030301, 0x03090103, 0x09030301, 0x03090103,
0x09030301, 0x03090103, 0x09030301, 0x03090103 // Constant
}, GPU_DP_TYPE_16 };
status = vsi_nn_kernel_gpu_add_param( node, "uniResize3xUp_l00_2x8", &uniResize3xUp_l00_2x8);
status |= vsi_nn_kernel_gpu_add_param( node, "uniResize3xUp_l01_2x8", &uniResize3xUp_l01_2x8);
status |= vsi_nn_kernel_gpu_add_param( node, "uniResize3xUp_l10_4x4", &uniResize3xUp_l10_4x4);
status |= vsi_nn_kernel_gpu_add_param( node, "uniResize3xUp_l11_4x4", &uniResize3xUp_l11_4x4);
status |= vsi_nn_kernel_gpu_add_param( node, "uniResize3xUp_l12_4x4", &uniResize3xUp_l12_4x4);
status |= vsi_nn_kernel_gpu_add_param( node, "uniResize3xUp_l13_4x4", &uniResize3xUp_l13_4x4);
CHECK_STATUS_FAIL_GOTO(status, final );
}
else if (is_4x_up_kernel)
{
if (!is_same_type)
{
float f2i_radio = 16.0f;
gpu_quantize_multiplier_16bit((double)scale / f2i_radio, &M0, &postShift);
multAndoutZP[0] = (uint32_t)(M0);
multAndoutZP[1] = (uint32_t)((output_zp << postShift) - input_zp * M0 * f2i_radio);
status = vsi_nn_kernel_gpu_add_param( node, "uniResize4xUp_l00_4x8", &uniResize4xUp_l00_4x8);
status |= vsi_nn_kernel_gpu_add_param( node, "uniResize4xUp_l01_4x8", &uniResize4xUp_l01_4x8);
status |= vsi_nn_kernel_gpu_add_param( node, "uniResize4xUp_l10_4x8", &uniResize4xUp_l10_4x8);
status |= vsi_nn_kernel_gpu_add_param( node, "uniResize4xUp_l11_4x8", &uniResize4xUp_l11_4x8);
status |= vsi_nn_kernel_gpu_add_param( node, "out_height", &out_height);
CHECK_STATUS_FAIL_GOTO(status, final );
}
else if (is_8x_up_kernel)
{
gpu_dp_inst_update_postshfit(&uniU8PostProcess_2x8, postShift);
uniResize2xUp_0_4x8.data[7] = 0x00000700;
uniResize2xUp_1_4x8.data[7] = 0x00000700;
status = vsi_nn_kernel_gpu_add_param( node, "uniResize8xUp_l00_4x8", &uniResize8xUp_l00_4x8);
status |= vsi_nn_kernel_gpu_add_param( node, "uniResize8xUp_l01_4x8", &uniResize8xUp_l01_4x8);
status |= vsi_nn_kernel_gpu_add_param( node, "uniResize8xUp_l10_4x8", &uniResize8xUp_l10_4x8);
status |= vsi_nn_kernel_gpu_add_param( node, "uniResize8xUp_l11_4x8", &uniResize8xUp_l11_4x8);
status |= vsi_nn_kernel_gpu_add_param( node, "uniResize8xUp_l20_4x8", &uniResize8xUp_l20_4x8);
status |= vsi_nn_kernel_gpu_add_param( node, "uniResize8xUp_l21_4x8", &uniResize8xUp_l21_4x8);
status |= vsi_nn_kernel_gpu_add_param( node, "uniResize8xUp_l30_4x8", &uniResize8xUp_l30_4x8);
status |= vsi_nn_kernel_gpu_add_param( node, "uniResize8xUp_l31_4x8", &uniResize8xUp_l31_4x8);
status |= vsi_nn_kernel_gpu_add_param( node, "out_height", &out_height);
CHECK_STATUS_FAIL_GOTO(status, final );
status = vsi_nn_kernel_gpu_add_param(node, "uniU8PostProcess_2x8",
&uniU8PostProcess_2x8);
status |= vsi_nn_kernel_gpu_add_param(node, "multAndoutZP", &multAndoutZP);
CHECK_STATUS_FAIL_GOTO(status, final);
}
status = vsi_nn_kernel_gpu_add_param(node, "uniResize2xUp_0_4x8", &uniResize2xUp_0_4x8);
status |= vsi_nn_kernel_gpu_add_param(node, "uniResize2xUp_1_4x8", &uniResize2xUp_1_4x8);
status |= vsi_nn_kernel_gpu_add_param(node, "out_height", &out_height);
CHECK_STATUS_FAIL_GOTO(status, final);
}
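/* A reading of the !is_same_type path above (inferred from the constants):
 * the 2x DP weights (1,3,3,9) sum to 16, so the raw accumulator is
 * f2i_radio = 16 times the bilinear value, input zero point included.
 * gpu_quantize_multiplier_16bit() picks M0, postShift with
 * scale / f2i_radio ~= M0 / 2^postShift, hence
 *   out ~= (acc * M0 + (output_zp << postShift) - input_zp * M0 * f2i_radio)
 *          >> postShift
 * which is (acc / 16 - input_zp) * scale + output_zp, the usual affine
 * requantization; resetting data[7] removes the interpolation's own
 * post-shift so the division by 16 happens here instead. The 3x/4x/8x
 * branches below repeat this with f2i_radio = 256, 64 and 256. */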
else if (is_3x_up_kernel)
{
gpu_dp_inst_t uniResize3xUp_l00_2x8 = { {
0x15515515, // TCfg
0x00000000, // ASelt
0x21210110, 0x03323202, // ABin
0x2aa2aa2a, // BSelt
0x00000000, 0x00000000, // BBin
0x00000610, // AccumType, ConstantType, and PostShift
0xaaaa5555, 0x0000ffff, 0x5555aaaa, 0xaaaa5555,
0x0000ffff, 0x5555aaaa, 0xaaaa5555, 0x0000ffff // Constant
}, GPU_DP_TYPE_16 };
gpu_dp_inst_t uniResize3xUp_l01_2x8 = { {
0x05155155, // TCfg
0x00000000, // ASelt
0x54044343, 0x00650554, // ABin
0x0a2aa2aa, // BSelt
0x00000000, 0x00000000, // BBin
0x00000610, // AccumType, ConstantType, and PostShift
0x5555aaaa, 0xaaaa5555, 0x0000ffff, 0x5555aaaa,
0xaaaa5555, 0x0000ffff, 0x5555aaaa, 0x00000000 // Constant
}, GPU_DP_TYPE_16 };
gpu_dp_inst_t uniResize3xUp_l10_4x4 = { {
0x55551155, // TCfg
0x50501050, // ASelt
0x01011010, 0x21212121, // ABin
0xaaaa22aa, // BSelt
0x00000000, 0x00000000, // BBin
0x0000060f, // AccumType, ConstantType, and PostShift
0x38e41c72, 0x1c720e39, 0x00005556, 0x00002aab,
0x1c7238e4, 0x0e391c72, 0x38e41c72, 0x1c720e39 // Constant
}, GPU_DP_TYPE_16 };
gpu_dp_inst_t uniResize3xUp_l11_4x4 = { {
0x11555511, // TCfg
0x10505010, // ASelt
0x32320202, 0x03033232, // ABin
0x22aaaa22, // BSelt
0x00000000, 0x00000000, // BBin
0x0000060f, // AccumType, ConstantType, and PostShift
0x00005556, 0x00002aab, 0x1c7238e4, 0x0e391c72,
0x38e41c72, 0x1c720e39, 0x00005556, 0x00002aab // Constant
}, GPU_DP_TYPE_16 };
gpu_dp_inst_t uniResize3xUp_l12_4x4 = { {
0x55115555, // TCfg
0x50105050, // ASelt
0x43434343, 0x54540404, // ABin
0xaa22aaaa, // BSelt
0x00000000, 0x00000000, // BBin
0x0000060f, // AccumType, ConstantType, and PostShift
0x1c7238e4, 0x0e391c72, 0x38e41c72, 0x1c720e39,
0x00005556, 0x00002aab, 0x1c7238e4, 0x0e391c72 // Constant
}, GPU_DP_TYPE_16 };
gpu_dp_inst_t uniResize3xUp_l13_4x4 = { {
0x00551155, // TCfg
0x00501050, // ASelt
0x05055454, 0x00006565, // ABin
0x00aa22aa, // BSelt
0x00000000, 0x00000000, // BBin
0x0000060f, // AccumType, ConstantType, and PostShift
0x38e41c72, 0x1c720e39, 0x00005556, 0x00002aab,
0x1c7238e4, 0x0e391c72, 0x00000000, 0x00000000 // Constant
}, GPU_DP_TYPE_16 };
if (!is_same_type)
{
float f2i_radio = 256.0f;
gpu_quantize_multiplier_16bit((double)scale / f2i_radio, &M0, &postShift);
multAndoutZP[0] = (uint32_t)(M0);
multAndoutZP[1] = (uint32_t)((output_zp << postShift) - input_zp * M0 * f2i_radio);
gpu_dp_inst_update_postshfit(&uniU8PostProcess_2x8, postShift);
uniResize3xUp_l00_2x8.data[7] = 0x00000608;
uniResize3xUp_l01_2x8.data[7] = 0x00000608;
uniResize3xUp_l10_4x4.data[7] = 0x00000607;
uniResize3xUp_l11_4x4.data[7] = 0x00000607;
uniResize3xUp_l12_4x4.data[7] = 0x00000607;
uniResize3xUp_l13_4x4.data[7] = 0x00000607;
status = vsi_nn_kernel_gpu_add_param(node, "uniU8PostProcess_2x8",
&uniU8PostProcess_2x8);
status |= vsi_nn_kernel_gpu_add_param(node, "multAndoutZP", &multAndoutZP);
CHECK_STATUS_FAIL_GOTO(status, final);
}
status = vsi_nn_kernel_gpu_add_param(node, "uniResize3xUp_l00_2x8", &uniResize3xUp_l00_2x8);
status |= vsi_nn_kernel_gpu_add_param(node, "uniResize3xUp_l01_2x8", &uniResize3xUp_l01_2x8);
status |= vsi_nn_kernel_gpu_add_param(node, "uniResize3xUp_l10_4x4", &uniResize3xUp_l10_4x4);
status |= vsi_nn_kernel_gpu_add_param(node, "uniResize3xUp_l11_4x4", &uniResize3xUp_l11_4x4);
status |= vsi_nn_kernel_gpu_add_param(node, "uniResize3xUp_l12_4x4", &uniResize3xUp_l12_4x4);
status |= vsi_nn_kernel_gpu_add_param(node, "uniResize3xUp_l13_4x4", &uniResize3xUp_l13_4x4);
CHECK_STATUS_FAIL_GOTO(status, final);
}
else if (is_4x_up_kernel)
{
gpu_dp_inst_t uniResize4xUp_l00_4x8 = { {
0x55555555, 0x55555555, // TCfg
0x0208c020, 0x1944418c, 0x44419444, 0x62944419, 0x9c8629c8, // BinSelect
0x00000406, // AccumType, ConstantType, and PostShift
0x190f0f09, 0x23051503, 0x05230315, 0x0f19090f,
0x190f0f09, 0x23051503, 0x05230315, 0x0f19090f // Constant
}, GPU_DP_TYPE_16 };
gpu_dp_inst_t uniResize4xUp_l01_4x8 = { {
0x55555555, 0x55555555, // TCfg
0x8629c862, 0x3a4c839c, 0x4c83a4c8, 0xa4a4c83a, 0xad0a4ad0, // BinSelect
0x00000406, // AccumType, ConstantType, and PostShift
0x190f0f09, 0x23051503, 0x05230315, 0x0f19090f,
0x190f0f09, 0x23051503, 0x05230315, 0x0f19090f // Constant
}, GPU_DP_TYPE_16 };
gpu_dp_inst_t uniResize4xUp_l10_4x8 = { {
0x55555555, 0x55555555, // TCfg
0x0208c020, 0x1944418c, 0x44419444, 0x62944419, 0x9c8629c8, // BinSelect
0x00000406, // AccumType, ConstantType, and PostShift
0x23150503, 0x31070701, 0x07310107, 0x15230305,
0x23150503, 0x31070701, 0x07310107, 0x15230305 // Constant
}, GPU_DP_TYPE_16 };
gpu_dp_inst_t uniResize4xUp_l11_4x8 = { {
0x55555555, 0x55555555, // TCfg
0x8629c862, 0x3a4c839c, 0x4c83a4c8, 0xa4a4c83a, 0xad0a4ad0, // BinSelect
0x00000406, // AccumType, ConstantType, and PostShift
0x23150503, 0x31070701, 0x07310107, 0x15230305,
0x23150503, 0x31070701, 0x07310107, 0x15230305 // Constant
}, GPU_DP_TYPE_16 };
if (!is_same_type)
{
float f2i_radio = 64.0f;
gpu_quantize_multiplier_16bit((double)scale / f2i_radio, &M0, &postShift);
multAndoutZP[0] = (uint32_t)(M0);
multAndoutZP[1] = (uint32_t)((output_zp << postShift) - input_zp * M0 * f2i_radio);
gpu_dp_inst_update_postshfit(&uniU8PostProcess_2x8, postShift);
uniResize4xUp_l00_4x8.data[7] = 0x00000400;
uniResize4xUp_l01_4x8.data[7] = 0x00000400;
uniResize4xUp_l10_4x8.data[7] = 0x00000400;
uniResize4xUp_l11_4x8.data[7] = 0x00000400;
status = vsi_nn_kernel_gpu_add_param(node, "uniU8PostProcess_2x8",
&uniU8PostProcess_2x8);
status |= vsi_nn_kernel_gpu_add_param(node, "multAndoutZP", &multAndoutZP);
CHECK_STATUS_FAIL_GOTO(status, final);
}
status = vsi_nn_kernel_gpu_add_param(node, "uniResize4xUp_l00_4x8", &uniResize4xUp_l00_4x8);
status |= vsi_nn_kernel_gpu_add_param(node, "uniResize4xUp_l01_4x8", &uniResize4xUp_l01_4x8);
status |= vsi_nn_kernel_gpu_add_param(node, "uniResize4xUp_l10_4x8", &uniResize4xUp_l10_4x8);
status |= vsi_nn_kernel_gpu_add_param(node, "uniResize4xUp_l11_4x8", &uniResize4xUp_l11_4x8);
status |= vsi_nn_kernel_gpu_add_param(node, "out_height", &out_height);
CHECK_STATUS_FAIL_GOTO(status, final);
}
else if (is_8x_up_kernel)
{
gpu_dp_inst_t uniResize8xUp_l00_4x8 = { {
0x55555555, 0x55555555, // TCfg
0x0208c020, 0x08c0208c, 0x44418c02, 0x41944419, 0x94441944, // BinSelect
0x00000708, // AccumType, ConstantType, and PostShift
0x513f3f31, 0x632d4d23, 0x751b5b15, 0x87096907,
0x09870769, 0x1b75155b, 0x2d63234d, 0x3f51313f // Constant
}, GPU_DP_TYPE_16 };
gpu_dp_inst_t uniResize8xUp_l01_4x8 = { {
0x55555555, 0x55555555, // TCfg
0x44194441, 0x19444194, 0xc8629444, 0x629c8629, 0x9c8629c8, // BinSelect
0x00000708, // AccumType, ConstantType, and PostShift
0x513f3f31, 0x632d4d23, 0x751b5b15, 0x87096907,
0x09870769, 0x1b75155b, 0x2d63234d, 0x3f51313f // Constant
}, GPU_DP_TYPE_16 };
gpu_dp_inst_t uniResize8xUp_l10_4x8 = { {
0x55555555, 0x55555555, // TCfg
0x0208c020, 0x08c0208c, 0x44418c02, 0x41944419, 0x94441944, // BinSelect
0x00000708, // AccumType, ConstantType, and PostShift
0x634d2d23, 0x79373719, 0x8f21410f, 0xa50b4b05,
0x0ba5054b, 0x218f0f41, 0x37791937, 0x4d63232d // Constant
}, GPU_DP_TYPE_16 };
gpu_dp_inst_t uniResize8xUp_l11_4x8 = { {
0x55555555, 0x55555555, // TCfg
0x44194441, 0x19444194, 0xc8629444, 0x629c8629, 0x9c8629c8, // BinSelect
0x00000708, // AccumType, ConstantType, and PostShift
0x634d2d23, 0x79373719, 0x8f21410f, 0xa50b4b05,
0x0ba5054b, 0x218f0f41, 0x37791937, 0x4d63232d // Constant
}, GPU_DP_TYPE_16 };
gpu_dp_inst_t uniResize8xUp_l20_4x8 = { {
0x55555555, 0x55555555, // TCfg
0x0208c020, 0x08c0208c, 0x44418c02, 0x41944419, 0x94441944, // BinSelect
0x00000708, // AccumType, ConstantType, and PostShift
0x755b1b15, 0x8f41210f, 0xa9272709, 0xc30d2d03,
0x0dc3032d, 0x27a90927, 0x418f0f21, 0x5b75151b // Constant
}, GPU_DP_TYPE_16 };
gpu_dp_inst_t uniResize8xUp_l21_4x8 = { {
0x55555555, 0x55555555, // TCfg
0x44194441, 0x19444194, 0xc8629444, 0x629c8629, 0x9c8629c8, // BinSelect
0x00000708, // AccumType, ConstantType, and PostShift
0x755b1b15, 0x8f41210f, 0xa9272709, 0xc30d2d03,
0x0dc3032d, 0x27a90927, 0x418f0f21, 0x5b75151b // Constant
}, GPU_DP_TYPE_16 };
gpu_dp_inst_t uniResize8xUp_l30_4x8 = { {
0x55555555, 0x55555555, // TCfg
0x0208c020, 0x08c0208c, 0x44418c02, 0x41944419, 0x94441944, // BinSelect
0x00000708, // AccumType, ConstantType, and PostShift
0x87690907, 0xa54b0b05, 0xc32d0d03, 0xe10f0f01,
0x0fe1010f, 0x2dc3030d, 0x4ba5050b, 0x69870709 // Constant
}, GPU_DP_TYPE_16 };
gpu_dp_inst_t uniResize8xUp_l31_4x8 = { {
0x55555555, 0x55555555, // TCfg
0x44194441, 0x19444194, 0xc8629444, 0x629c8629, 0x9c8629c8, // BinSelect
0x00000708, // AccumType, ConstantType, and PostShift
0x87690907, 0xa54b0b05, 0xc32d0d03, 0xe10f0f01,
0x0fe1010f, 0x2dc3030d, 0x4ba5050b, 0x69870709 // Constant
}, GPU_DP_TYPE_16 };
if (!is_same_type)
{
float f2i_radio = 256.0f;
gpu_quantize_multiplier_16bit((double)scale / f2i_radio, &M0, &postShift);
multAndoutZP[0] = (uint32_t)(M0);
multAndoutZP[1] = (uint32_t)((output_zp << postShift) - input_zp * M0 * f2i_radio);
gpu_dp_inst_update_postshfit(&uniU8PostProcess_2x8, postShift);
uniResize8xUp_l00_4x8.data[7] = 0x00000700;
uniResize8xUp_l01_4x8.data[7] = 0x00000700;
uniResize8xUp_l10_4x8.data[7] = 0x00000700;
uniResize8xUp_l11_4x8.data[7] = 0x00000700;
uniResize8xUp_l20_4x8.data[7] = 0x00000700;
uniResize8xUp_l21_4x8.data[7] = 0x00000700;
uniResize8xUp_l30_4x8.data[7] = 0x00000700;
uniResize8xUp_l31_4x8.data[7] = 0x00000700;
status = vsi_nn_kernel_gpu_add_param(node, "uniU8PostProcess_2x8",
&uniU8PostProcess_2x8);
status |= vsi_nn_kernel_gpu_add_param(node, "multAndoutZP", &multAndoutZP);
CHECK_STATUS_FAIL_GOTO(status, final);
}
status = vsi_nn_kernel_gpu_add_param(node, "uniResize8xUp_l00_4x8", &uniResize8xUp_l00_4x8);
status |= vsi_nn_kernel_gpu_add_param(node, "uniResize8xUp_l01_4x8", &uniResize8xUp_l01_4x8);
status |= vsi_nn_kernel_gpu_add_param(node, "uniResize8xUp_l10_4x8", &uniResize8xUp_l10_4x8);
status |= vsi_nn_kernel_gpu_add_param(node, "uniResize8xUp_l11_4x8", &uniResize8xUp_l11_4x8);
status |= vsi_nn_kernel_gpu_add_param(node, "uniResize8xUp_l20_4x8", &uniResize8xUp_l20_4x8);
status |= vsi_nn_kernel_gpu_add_param(node, "uniResize8xUp_l21_4x8", &uniResize8xUp_l21_4x8);
status |= vsi_nn_kernel_gpu_add_param(node, "uniResize8xUp_l30_4x8", &uniResize8xUp_l30_4x8);
status |= vsi_nn_kernel_gpu_add_param(node, "uniResize8xUp_l31_4x8", &uniResize8xUp_l31_4x8);
status |= vsi_nn_kernel_gpu_add_param(node, "out_height", &out_height);
CHECK_STATUS_FAIL_GOTO(status, final);
}
}
else
{
@ -1193,22 +1345,22 @@ static vsi_status _query_kernel
if (outputs[0]->attr.size[0] > inputs[0]->attr.size[0])
{
if ((!align_corners) && (half_pixel_centers) && is_2x_upsample)
{
scale_flag = UP_2X_HALF;
initializer = _bilinear_half_pixel_centers_opt_initializer;
}
else if ((!align_corners) && (half_pixel_centers) && is_3x_upsample)
{
scale_flag = UP_3X_HALF;
initializer = _bilinear_half_pixel_centers_opt_initializer;
}
else if ((!align_corners) && (half_pixel_centers) && is_4x_upsample)
{
scale_flag = UP_4X_HALF;
initializer = _bilinear_half_pixel_centers_opt_initializer;
}
else if ((!align_corners) && (half_pixel_centers) && is_8x_upsample)
{
scale_flag = UP_8X_HALF;
initializer = _bilinear_half_pixel_centers_opt_initializer;
@ -1232,7 +1384,7 @@ static vsi_status _query_kernel
scale_flag = DOWN;
}
key = RESIZE_BILINEAR_HASH_KEY( in_dtype, out_dtype, scale_flag, is_same_type);
for( i = 0; i < (uint32_t)kernel_map_size; i ++ )
{
if( kernel_map[i].key == key )
@ -1244,7 +1396,7 @@ static vsi_status _query_kernel
if ((UP_OPT == scale_flag) && (i >= kernel_map_size))
{
scale_flag = UP;
key = RESIZE_BILINEAR_HASH_KEY( in_dtype, out_dtype, scale_flag, is_same_type);
for( i = 0; i < (uint32_t)kernel_map_size; i ++ )
{
if( kernel_map[i].key == key )
@ -1257,7 +1409,7 @@ static vsi_status _query_kernel
if ((UP == scale_flag) && (i >= kernel_map_size))
{
scale_flag = DOWN;
key = RESIZE_BILINEAR_HASH_KEY( in_dtype, out_dtype, scale_flag, is_same_type);
for( i = 0; i < (uint32_t)kernel_map_size; i ++ )
{
if( kernel_map[i].key == key )
@ -1433,7 +1585,7 @@ static vsi_bool _is_image_width_lt16
size_t bytes = vsi_nn_kernel_dtype_get_bytes(in_dtype);
vsi_size_t max_cross_read_img_width = bytes == 1 ? 16 : 8;
if (VSI_NN_HW_EVIS_2 == ((vsi_nn_graph_prv_t*)graph)->options->config.evis.ver)
{
return FALSE;
}
@ -1468,7 +1620,8 @@ static vsi_nn_kernel_node_t _setup
int32_t align_corners = vsi_nn_kernel_param_get_int32( params, "align_corners" );
int32_t half_pixel_centers = vsi_nn_kernel_param_get_int32( params, "half_pixel_centers" );
vsi_bool is_same_type = vsi_nn_is_same_type(inputs[0], outputs[0]);
vsi_bool is_evis2 = \
(vsi_bool)(((vsi_nn_graph_prv_t*)graph)->options->config.evis.ver == VSI_NN_HW_EVIS_2);
vsi_bool is_run_opt_kernel = FALSE;
vsi_nn_tensor_t* scale = NULL;
int32_t pad_left = half_pixel_centers ? 1 : 0;


@ -0,0 +1,744 @@
/****************************************************************************
*
* Copyright (c) 2020 Vivante Corporation
*
* Permission is hereby granted, free of charge, to any person obtaining a
* copy of this software and associated documentation files (the "Software"),
* to deal in the Software without restriction, including without limitation
* the rights to use, copy, modify, merge, publish, distribute, sublicense,
* and/or sell copies of the Software, and to permit persons to whom the
* Software is furnished to do so, subject to the following conditions:
*
* The above copyright notice and this permission notice shall be included in
* all copies or substantial portions of the Software.
*
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
* AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
* DEALINGS IN THE SOFTWARE.
*
*****************************************************************************/
#include <stdint.h>
#include <stdlib.h>
#include <string.h>
#include "vsi_nn_types.h"
#include "vsi_nn_tensor.h"
#include "vsi_nn_graph.h"
#include "vsi_nn_log.h"
#include "vsi_nn_error.h"
#include "vsi_nn_prv.h"
#include "vsi_nn_tensor_util.h"
#include "utils/vsi_nn_util.h"
#include "kernel/vsi_nn_kernel.h"
__BEGIN_DECLS
/*
 * Define kernel meta.
 * B --- batch
 * N --- num_heads
 * S --- sequence length
 * H --- head size
 */
typedef enum
{
LAYOUT_NONE,
LAYOUT_BNHS,
LAYOUT_BNH1,
LAYOUT_BSNH,
LAYOUT_BNSH,
} _internal_rope_layout_e;
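/* Background (not stated in this file): these kernels apply rotary position
 * embedding. Elements along the rotated axis are handled in pairs (a, b)
 * with a position-dependent angle t, using the precomputed cos/sin inputs:
 *   a' = a * cos(t) - b * sin(t)
 *   b' = a * sin(t) + b * cos(t)
 * The interleaved flag appears to select the pairing: adjacent elements
 * (x[2i], x[2i+1]) when set, versus elements half_head_size apart
 * (x[i], x[i + H/2]) when clear - matching the even/odd selectors and the
 * half_head_size parameter in the initializer below. */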
// Add kernel hashtable here
#define STR(a) #a
#define ROPE_HASH_KEY( IN0_DTYPE, IN1_DTYPE, OUT_DTYPE, LAYOUT, INTERLEAVED ) \
((IN0_DTYPE) | (IN1_DTYPE << 8) | (OUT_DTYPE << 16) | (LAYOUT << 24) | (INTERLEAVED << 28))
#define PACK_KERNEL_BNHS_MAP( IN0_DTYPE, IN1_DTYPE, OUT_DTYPE ) \
{ ROPE_HASH_KEY( IN0_DTYPE, IN1_DTYPE, OUT_DTYPE, LAYOUT_BNHS, 0 ), \
CVIVANTE_NAMESPACE("evis.rope_"STR(IN0_DTYPE)"_"STR(IN1_DTYPE)"to"STR(OUT_DTYPE)"_bnhs"), \
"rope_0" }
#define PACK_KERNEL_BNH1_MAP( IN0_DTYPE, IN1_DTYPE, OUT_DTYPE ) \
{ ROPE_HASH_KEY( IN0_DTYPE, IN1_DTYPE, OUT_DTYPE, LAYOUT_BNH1, 0 ), \
CVIVANTE_NAMESPACE("evis.rope_"STR(IN0_DTYPE)"_"STR(IN1_DTYPE)"to"STR(OUT_DTYPE)"_bnh1"), \
"rope_1" }
#define PACK_KERNEL_BSNH_INTERLEVEAD_MAP( IN0_DTYPE, IN1_DTYPE, OUT_DTYPE ) \
{ ROPE_HASH_KEY( IN0_DTYPE, IN1_DTYPE, OUT_DTYPE, LAYOUT_BSNH, 1 ), \
CVIVANTE_NAMESPACE("evis.rope_"STR(IN0_DTYPE)"_"STR(IN1_DTYPE)"to"STR(OUT_DTYPE)"_bsnh"), \
"rope_2" }
#define PACK_KERNEL_BNSH_INTERLEVEAD_MAP( IN0_DTYPE, IN1_DTYPE, OUT_DTYPE ) \
{ ROPE_HASH_KEY( IN0_DTYPE, IN1_DTYPE, OUT_DTYPE, LAYOUT_BNSH, 1 ), \
CVIVANTE_NAMESPACE("evis.rope_"STR(IN0_DTYPE)"_"STR(IN1_DTYPE)"to"STR(OUT_DTYPE)"_bnsh"), \
"rope_3" }
typedef struct
{
uint32_t key;
char * function_name;
const char * source_name;
} _kernel_map_type;
#define PACK_KERNEL_MAP(IN0_TYPE, IN1_TYPE, OUT_TYPE) \
PACK_KERNEL_BNHS_MAP(IN0_TYPE, IN1_TYPE, OUT_TYPE), \
PACK_KERNEL_BNH1_MAP(IN0_TYPE, IN1_TYPE, OUT_TYPE), \
PACK_KERNEL_BSNH_INTERLEVEAD_MAP(IN0_TYPE, IN1_TYPE, OUT_TYPE), \
PACK_KERNEL_BNSH_INTERLEVEAD_MAP(IN0_TYPE, IN1_TYPE, OUT_TYPE),
static const _kernel_map_type _rope_kernel_map[] =
{
// Register kernel here
PACK_KERNEL_MAP( BF16, BF16, BF16)
PACK_KERNEL_MAP( F16, F16, F16 )
PACK_KERNEL_MAP( I16, I16, I16 )
PACK_KERNEL_MAP( I16, F16, I16 )
PACK_KERNEL_MAP( I16, I16, I8 )
PACK_KERNEL_MAP( I16, F16, I8 )
PACK_KERNEL_MAP( I16, I16, U8 )
PACK_KERNEL_MAP( I16, F16, U8 )
PACK_KERNEL_MAP( U16, U16, U16 )
PACK_KERNEL_MAP( U16, F16, U16 )
PACK_KERNEL_MAP( I8, I8, I8 )
PACK_KERNEL_MAP( I8, F16, I8 )
PACK_KERNEL_MAP( U8, U8, U8 )
PACK_KERNEL_MAP( U8, F16, U8 )
};
/*
* Kernel params
*/
static vx_param_description_t _rope_kernel_param_def[] =
{
{VX_INPUT, VX_TYPE_TENSOR, VX_PARAMETER_STATE_REQUIRED},
{VX_INPUT, VX_TYPE_TENSOR, VX_PARAMETER_STATE_REQUIRED},
{VX_INPUT, VX_TYPE_TENSOR, VX_PARAMETER_STATE_REQUIRED},
{VX_OUTPUT, VX_TYPE_TENSOR, VX_PARAMETER_STATE_REQUIRED},
{VX_INPUT, VX_TYPE_SCALAR, VX_PARAMETER_STATE_REQUIRED},
};
#define _ROPE_PARAM_NUM _cnt_of_array( _rope_kernel_param_def )
#define SCALAR_AXIS (4)
/*
* Kernel initializer
*/
DEF_KERNEL_INITIALIZER(_rope_initializer)
(
vsi_nn_kernel_node_t node,
const vsi_nn_kernel_node_param_t * param,
size_t param_size
)
{
vsi_status status = VSI_FAILURE;
gpu_param_t gpu_param = {
3,
{0, 0, 0},
{0, 0, 0},
{0, 0, 0},
{0, 0, 0}
};
vsi_nn_kernel_tensor_attr_t* out_attr = NULL;
vsi_nn_kernel_tensor_attr_t* in0_attr = NULL;
vsi_nn_kernel_tensor_attr_t* in1_attr = NULL;
vsi_nn_kernel_tensor_attr_t* in2_attr = NULL;
vsi_size_array_t* in_shape = NULL;
vsi_nn_kernel_dtype_e in0_dtype = F16;
vsi_nn_kernel_dtype_e in1_dtype = F16;
vsi_nn_kernel_dtype_e in2_dtype = F16;
vsi_nn_kernel_dtype_e out_dtype = F16;
float in0_scale = 1.0f;
float in1_scale = 1.0f;
float in2_scale = 1.0f;
float output_scale = 1.0f;
float output_zp = 0;
int32_t in0_zp = 0;
int32_t cos_zp = 0;
int32_t sin_zp = 0;
int32_t p = 0;
int32_t axis = 0;
int32_t interleaved = 0;
int32_t half_head_size = 1;
vsi_size_t shape[3] = {1};
uint32_t pack_key = 0;
VSI_UNREFERENCED(node);
VSI_UNREFERENCED(param);
VSI_UNREFERENCED(param_size);
// Add initializer
in0_attr = vsi_nn_kernel_tensor_attr_create((vsi_nn_kernel_tensor_t)param[0]);
CHECK_PTR_FAIL_GOTO(in0_attr, "Create tensor attr buffer fail.", final);
in1_attr = vsi_nn_kernel_tensor_attr_create((vsi_nn_kernel_tensor_t)param[1]);
CHECK_PTR_FAIL_GOTO(in1_attr, "Create tensor attr buffer fail.", final);
in2_attr = vsi_nn_kernel_tensor_attr_create((vsi_nn_kernel_tensor_t)param[2]);
CHECK_PTR_FAIL_GOTO(in2_attr, "Create tensor attr buffer fail.", final);
out_attr = vsi_nn_kernel_tensor_attr_create((vsi_nn_kernel_tensor_t)param[3]);
CHECK_PTR_FAIL_GOTO(out_attr, "Create tensor attr buffer fail.", final);
status = vsi_nn_kernel_scalar_read_int32((vsi_nn_kernel_scalar_t)param[4], &p);
CHECK_STATUS_FAIL_GOTO(status, final);
axis = p & 0xFFFF;
interleaved = (p >> 16) & 0xFFFF;
in_shape = in0_attr->shape;
in0_dtype = in0_attr->dtype;
in1_dtype = in1_attr->dtype;
in2_dtype = in2_attr->dtype;
out_dtype = out_attr->dtype;
in0_scale = in0_attr->scale;
in1_scale = in1_attr->scale;
in2_scale = in2_attr->scale;
in0_zp = -in0_attr->zero_point;
cos_zp = -in1_attr->zero_point;
sin_zp = -in2_attr->zero_point;
output_scale = out_attr->scale;
output_zp = (float)out_attr->zero_point;
half_head_size = (int32_t)(in_shape->data[axis] / 2);
shape[0] = in_shape->data[0];
shape[1] = in_shape->data[1];
shape[2] = in_shape->data[2];
shape[axis] = half_head_size;
gpu_param.global_scale[0] = 8;
gpu_param.global_scale[1] = 1;
gpu_param.global_scale[2] = 1;
gpu_param.global_size[0] = gpu_align_p2((shape[0] + \
gpu_param.global_scale[0] - 1) / gpu_param.global_scale[0], 4);
gpu_param.global_size[1] = shape[1];
gpu_param.global_size[2] = shape[2];
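/* Worked example of the sizing above (illustrative numbers): each work-item
 * covers global_scale[0] = 8 lanes of the halved axis, so for
 * shape[0] = half_head_size = 33:
 *   (33 + 8 - 1) / 8 = 5 items, and gpu_align_p2(5, 4) rounds up to 8,
 * giving global_size[0] = 8; dimensions 1 and 2 keep one element per item. */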
#define _PACK_SELECT_KEY( IN0_TYPE, IN1_TYPE, IN2_TYPE, OUT_TYPE ) \
((IN0_TYPE) | (IN1_TYPE << 8) | (IN2_TYPE << 16) | (OUT_TYPE << 24))
pack_key = _PACK_SELECT_KEY(in0_dtype, in1_dtype, in2_dtype, out_dtype);
switch (pack_key)
{
case _PACK_SELECT_KEY(BF16, BF16, BF16, BF16):
{
gpu_dp_inst_t uniConvBF16toF32_Part0_2x8 = { {
0x11111111, // TCfg
0x01010101, // ASelt
0x01050004, 0x03070206, // ABin
0x22222222, // BSelt
0x00000000, 0x00000000, // BBin
0x00000600, // AccumType, ConstantType, and PostShift
0x00000001, 0x00000001, 0x00000001, 0x00000001,
0x00000001, 0x00000001, 0x00000001, 0x00000001 // Constant
}, GPU_DP_TYPE_16 };
gpu_dp_inst_t uniConvBF16toF32_Part1_2x8 = { {
0x11111111, // TCfg
0x01010101, // ASelt
0x05050404, 0x07070606, // ABin
0x22222222, // BSelt
0x00000000, 0x00000000, // BBin
0x00000600, // AccumType, ConstantType, and PostShift
0x00000001, 0x00000001, 0x00000001, 0x00000001,
0x00000001, 0x00000001, 0x00000001, 0x00000001 // Constant
}, GPU_DP_TYPE_16 };
gpu_dp_inst_t uniExtractOddData_2x8 = { {
0x11111111, // TCfg
0x11110000, // ASelt
0x07050301, 0x07050301, // ABin
0x22222222, // BSelt
0x00000000, 0x00000000, // BBin
0x00000600, // AccumType, ConstantType, and PostShift
0x00000001, 0x00000001, 0x00000001, 0x00000001,
0x00000001, 0x00000001, 0x00000001, 0x00000001 // Constant
}, GPU_DP_TYPE_16 };
if (interleaved && axis == 0)
{
uniExtractOddData_2x8.data[1] = 0x10101010;
uniExtractOddData_2x8.data[2] = 0x03030101;
uniExtractOddData_2x8.data[3] = 0x07070505;
}
else
{
status = vsi_nn_kernel_gpu_add_param(node,
"half_head_size", &half_head_size);
CHECK_STATUS_FAIL_GOTO(status, final);
}
status = vsi_nn_kernel_gpu_add_param(node,
"uniConvBF16toF32_Part0_2x8", &uniConvBF16toF32_Part0_2x8);
status |= vsi_nn_kernel_gpu_add_param(node,
"uniConvBF16toF32_Part1_2x8", &uniConvBF16toF32_Part1_2x8);
status |= vsi_nn_kernel_gpu_add_param(node,
"uniExtractOddData_2x8", &uniExtractOddData_2x8);
CHECK_STATUS_FAIL_GOTO(status, final);
}
break;
case _PACK_SELECT_KEY(I16, I16, I16, I16):
case _PACK_SELECT_KEY(I16, F16, F16, I16):
case _PACK_SELECT_KEY(I16, I16, I16, I8):
case _PACK_SELECT_KEY(I16, F16, F16, I8):
case _PACK_SELECT_KEY(I16, I16, I16, U8):
case _PACK_SELECT_KEY(I16, F16, F16, U8):
case _PACK_SELECT_KEY(F16, F16, F16, F16):
{
float scale0 = in0_scale * in1_scale / output_scale;
float scale1 = in0_scale* in2_scale / output_scale;
gpu_dp_inst_t uniExtractHalf8_2x8 = { {
0x11111111, // TCfg
0x11110000, // ASelt
0x06040200, 0x06040200, // ABin
0x22222222, // BSelt
0x00000000, 0x00000000, // BBin
0x00000100, // AccumType, ConstantType, and PostShift
0x00003c00, 0x00003c00, 0x00003c00, 0x00003c00,
0x00003c00, 0x00003c00, 0x00003c00, 0x00003c00 // Constant
}, GPU_DP_TYPE_16 };
gpu_dp_inst_t uniExtractInteger_2x8 = { {
0x33333333, // TCfg
0x11110000, // ASelt
0x03020100, 0x03020100, // ABin
0x00000000, // BSelt
0x00000000, 0x00000000, // BBin
0x00002400, // AccumType, ConstantType, and PostShift
0x00000000, 0x00000000, 0x00000000, 0x00000000,
0x00000000, 0x00000000, 0x00000000, 0x00000000 // Constant
}, GPU_DP_TYPE_16 };
gpu_dp_inst_t uniATimesB_0_4x4 = { {
0x01010101, // TCfg
0x00000000, // ASelt
0x00010000, 0x00030002, // ABin
0x01010101, // BSelt
0x00010000, 0x00030002, // BBin
0x00000400, // AccumType, ConstantType, and PostShift
0x00000000, 0x00000000, 0x00000000, 0x00000000,
0x00000000, 0x00000000, 0x00000000, 0x00000000 // Constant
}, GPU_DP_TYPE_16 };
gpu_dp_inst_t uniATimesB_1_4x4 = { {
0x01010101, // TCfg
0x00000000, // ASelt
0x00050004, 0x00070006, // ABin
0x01010101, // BSelt
0x00050004, 0x00070006, // BBin
0x00000400, // AccumType, ConstantType, and PostShift
0x00000000, 0x00000000, 0x00000000, 0x00000000,
0x00000000, 0x00000000, 0x00000000, 0x00000000 // Constant
}, GPU_DP_TYPE_16 };
gpu_dp_inst_t uniAEvenTimesB_0_4x4 = { {
0x01010101, // TCfg
0x00000000, // ASelt
0x00020000, 0x00060004, // ABin
0x01010101, // BSelt
0x00010000, 0x00030002, // BBin
0x00000400, // AccumType, ConstantType, and PostShift
0x00000000, 0x00000000, 0x00000000, 0x00000000,
0x00000000, 0x00000000, 0x00000000, 0x00000000 // Constant
}, GPU_DP_TYPE_16 };
gpu_dp_inst_t uniAEvenTimesB_1_4x4 = { {
0x01010101, // TCfg
0x00000000, // ASelt
0x00020000, 0x00060004, // ABin
0x01010101, // BSelt
0x00050004, 0x00070006, // BBin
0x00000400, // AccumType, ConstantType, and PostShift
0x00000000, 0x00000000, 0x00000000, 0x00000000,
0x00000000, 0x00000000, 0x00000000, 0x00000000 // Constant
}, GPU_DP_TYPE_16 };
gpu_dp_inst_t uniAOddTimesB_0_4x4 = { {
0x01010101, // TCfg
0x00000000, // ASelt
0x00030001, 0x00070005, // ABin
0x01010101, // BSelt
0x00010000, 0x00030002, // BBin
0x00000400, // AccumType, ConstantType, and PostShift
0x00000000, 0x00000000, 0x00000000, 0x00000000,
0x00000000, 0x00000000, 0x00000000, 0x00000000 // Constant
}, GPU_DP_TYPE_16 };
gpu_dp_inst_t uniAOddTimesB_1_4x4 = { {
0x01010101, // TCfg
0x00000000, // ASelt
0x00030001, 0x00070005, // ABin
0x01010101, // BSelt
0x00050004, 0x00070006, // BBin
0x00000400, // AccumType, ConstantType, and PostShift
0x00000000, 0x00000000, 0x00000000, 0x00000000,
0x00000000, 0x00000000, 0x00000000, 0x00000000 // Constant
}, GPU_DP_TYPE_16 };
if (interleaved && axis == 0)
{
uniExtractHalf8_2x8.data[1] = 0x10101010;
uniExtractHalf8_2x8.data[2] = 0x02020000;
uniExtractHalf8_2x8.data[3] = 0x06060404;
uniExtractInteger_2x8.data[1] = 0x10101010;
uniExtractInteger_2x8.data[2] = 0x01010000;
uniExtractInteger_2x8.data[3] = 0x03030202;
status = vsi_nn_kernel_gpu_add_param(node,
"uniAEvenTimesB_0_4x4", &uniAEvenTimesB_0_4x4);
status |= vsi_nn_kernel_gpu_add_param(node,
"uniAEvenTimesB_1_4x4", &uniAEvenTimesB_1_4x4);
status |= vsi_nn_kernel_gpu_add_param(node,
"uniAOddTimesB_0_4x4", &uniAOddTimesB_0_4x4);
status |= vsi_nn_kernel_gpu_add_param(node,
"uniAOddTimesB_1_4x4", &uniAOddTimesB_1_4x4);
}
else
{
status = vsi_nn_kernel_gpu_add_param(node,
"uniATimesB_0_4x4", &uniATimesB_0_4x4);
status |= vsi_nn_kernel_gpu_add_param(node,
"uniATimesB_1_4x4", &uniATimesB_1_4x4);
status |= vsi_nn_kernel_gpu_add_param(node,
"half_head_size", &half_head_size);
}
status |= vsi_nn_kernel_gpu_add_param(node,
"scale0", &scale0);
status |= vsi_nn_kernel_gpu_add_param(node,
"scale1", &scale1);
status |= vsi_nn_kernel_gpu_add_param(node,
"output_zp", &output_zp);
if (out_dtype == F16)
{
status |= vsi_nn_kernel_gpu_add_param(node,
"uniExtract8Data_2x8", &uniExtractHalf8_2x8);
}
else
{
status |= vsi_nn_kernel_gpu_add_param(node,
"uniExtract8Data_2x8", &uniExtractInteger_2x8);
}
CHECK_STATUS_FAIL_GOTO(status, final);
}
break;
case _PACK_SELECT_KEY(I8, I8, I8, I8):
case _PACK_SELECT_KEY(U8, U8, U8, U8):
case _PACK_SELECT_KEY(U16, U16, U16, U16):
case _PACK_SELECT_KEY(I8, F16, F16, I8):
case _PACK_SELECT_KEY(U8, F16, F16, U8):
case _PACK_SELECT_KEY(U16, F16, F16, U16):
{
float scale0 = in0_scale * in1_scale / output_scale;
float scale1 = in0_scale* in2_scale / output_scale;
gpu_dp_inst_t uniExtractInteger_2x8 = { {
0x33333333, // TCfg
0x11110000, // ASelt
0x03020100, 0x03020100, // ABin
0x00000000, // BSelt
0x00000000, 0x00000000, // BBin
0x00002400, // AccumType, ConstantType, and PostShift
0x00000000, 0x00000000, 0x00000000, 0x00000000,
0x00000000, 0x00000000, 0x00000000, 0x00000000 // Constant
}, GPU_DP_TYPE_16 };
gpu_dp_inst_t uniAMinusZp_0_4x4 = { {
0x0d0d0d0d, // TCfg
0x04040404, // ASelt
0x00010000, 0x00030002, // ABin
0x02020202, // BSelt
0x00000000, 0x00000000, // BBin
0x00002400, // AccumType, ConstantType, and PostShift
0x00000001, 0x00000000, 0x00000001, 0x00000000,
0x00000001, 0x00000000, 0x00000001, 0x00000000 // Constant
}, GPU_DP_TYPE_16 };
gpu_dp_inst_t uniAMinusZp_1_4x4 = { {
0x0d0d0d0d, // TCfg
0x04040404, // ASelt
0x00050004, 0x00070006, // ABin
0x02020202, // BSelt
0x00000000, 0x00000000, // BBin
0x00002400, // AccumType, ConstantType, and PostShift
0x00000001, 0x00000000, 0x00000001, 0x00000000,
0x00000001, 0x00000000, 0x00000001, 0x00000000 // Constant
}, GPU_DP_TYPE_16 };
gpu_dp_inst_t uniAEvenMinusZp_4x4 = { {
0x0d0d0d0d, // TCfg
0x04040404, // ASelt
0x00020000, 0x00060004, // ABin
0x02020202, // BSelt
0x00000000, 0x00000000, // BBin
0x00002400, // AccumType, ConstantType, and PostShift
0x00000001, 0x00000000, 0x00000001, 0x00000000,
0x00000001, 0x00000000, 0x00000001, 0x00000000 // Constant
}, GPU_DP_TYPE_16 };
gpu_dp_inst_t uniAOddMinusZp_4x4 = { {
0x0d0d0d0d, // TCfg
0x04040404, // ASelt
0x00030001, 0x00070005, // ABin
0x02020202, // BSelt
0x00000000, 0x00000000, // BBin
0x00002400, // AccumType, ConstantType, and PostShift
0x00000001, 0x00000000, 0x00000001, 0x00000000,
0x00000001, 0x00000000, 0x00000001, 0x00000000 // Constant
}, GPU_DP_TYPE_16 };
if (interleaved && axis == 0)
{
uniExtractInteger_2x8.data[1] = 0x10101010;
uniExtractInteger_2x8.data[2] = 0x01010000;
uniExtractInteger_2x8.data[3] = 0x03030202;
status = vsi_nn_kernel_gpu_add_param(node,
"uniAEvenMinusZp_4x4", &uniAEvenMinusZp_4x4);
status |= vsi_nn_kernel_gpu_add_param(node,
"uniAOddMinusZp_4x4", &uniAOddMinusZp_4x4);
}
else
{
status = vsi_nn_kernel_gpu_add_param(node,
"half_head_size", &half_head_size);
}
status |= vsi_nn_kernel_gpu_add_param(node,
"uniAMinusZp_0_4x4", &uniAMinusZp_0_4x4);
status |= vsi_nn_kernel_gpu_add_param(node,
"uniAMinusZp_1_4x4", &uniAMinusZp_1_4x4);
status |= vsi_nn_kernel_gpu_add_param(node,
"scale0", &scale0);
status |= vsi_nn_kernel_gpu_add_param(node,
"scale1", &scale1);
status |= vsi_nn_kernel_gpu_add_param(node,
"output_zp", &output_zp);
status |= vsi_nn_kernel_gpu_add_param(node,
"in0_zp", &in0_zp);
status |= vsi_nn_kernel_gpu_add_param(node,
"cos_zp", &cos_zp);
status |= vsi_nn_kernel_gpu_add_param(node,
"sin_zp", &sin_zp);
status |= vsi_nn_kernel_gpu_add_param(node,
"uniExtract8Data_2x8", &uniExtractInteger_2x8);
CHECK_STATUS_FAIL_GOTO(status, final);
}
break;
default:
break;
}
status = vsi_nn_kernel_gpu_config(node, &gpu_param);
final:
if (in0_attr) vsi_nn_kernel_tensor_attr_release(&in0_attr);
if (in1_attr) vsi_nn_kernel_tensor_attr_release(&in1_attr);
if (in2_attr) vsi_nn_kernel_tensor_attr_release(&in2_attr);
if (out_attr) vsi_nn_kernel_tensor_attr_release(&out_attr);
return status;
} /* _rope_initializer() */
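/* The single I32 scalar shared with _setup() packs both launch options:
 * axis in the low 16 bits, the interleaved flag in the high 16 bits.
 * Round trip (see the param assembly in _setup() below):
 *   p = (interleaved << 16) | axis;        e.g. interleaved=1, axis=0 -> 0x00010000
 *   axis        = p & 0xFFFF;
 *   interleaved = (p >> 16) & 0xFFFF;
 * Packing both into one scalar keeps _rope_kernel_param_def at five entries. */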
/*
* Query kernel
*/
static vsi_status _query_kernel
(
vsi_nn_kernel_t * kernel,
vsi_nn_tensor_t * const * const inputs,
vsi_nn_tensor_t * const * const outputs,
int32_t axis,
int32_t interleaved,
_internal_rope_layout_e *layout
)
{
vsi_status status = VSI_FAILURE;
vsi_nn_kernel_dtype_e in0_dtype;
vsi_nn_kernel_dtype_e in1_dtype;
vsi_nn_kernel_dtype_e in2_dtype;
vsi_nn_kernel_dtype_e out_dtype;
int32_t in0_zp = vsi_nn_get_tensor_zero_point(inputs[0]);
int32_t in1_zp = vsi_nn_get_tensor_zero_point(inputs[1]);
int32_t in2_zp = vsi_nn_get_tensor_zero_point(inputs[2]);
const _kernel_map_type * kernel_map = _rope_kernel_map;
size_t kernel_map_size = _cnt_of_array( _rope_kernel_map );
vx_param_description_t * param_def = _rope_kernel_param_def;
vx_kernel_initialize_f initializer = _rope_initializer;
uint32_t key;
uint32_t i;
in0_dtype = vsi_nn_kernel_map_dtype( inputs[0]->attr.dtype.vx_type );
in1_dtype = vsi_nn_kernel_map_dtype( inputs[1]->attr.dtype.vx_type );
in2_dtype = vsi_nn_kernel_map_dtype( inputs[2]->attr.dtype.vx_type );
out_dtype = vsi_nn_kernel_map_dtype( outputs[0]->attr.dtype.vx_type );
/*only support symmetric int16*/
if ( ( (in0_dtype == I16 && in1_dtype == I16 && out_dtype == I16) ||
(in0_dtype == I16 && in1_dtype == F16 && out_dtype == I16) ||
(in0_dtype == I16 && in1_dtype == F16 && out_dtype == I8) ||
(in0_dtype == I16 && in1_dtype == I16 && out_dtype == I8) ||
(in0_dtype == I16 && in1_dtype == F16 && out_dtype == U8) ||
(in0_dtype == I16 && in1_dtype == I16 && out_dtype == U8) ) &&
(in0_zp != 0 || in1_zp != 0 || in2_zp != 0))
{
return VSI_FAILURE;
}
if (axis == 1 && inputs[0]->attr.size[0] == inputs[1]->attr.size[0] &&
in1_dtype == in2_dtype)
{
if (inputs[0]->attr.size[0] == 1)
{
*layout = LAYOUT_BNH1;
}
else
{
*layout = LAYOUT_BNHS;
}
}
else if (axis == 0 && in1_dtype == in2_dtype)
{
if (inputs[0]->attr.size[2] == inputs[1]->attr.size[2] &&
inputs[1]->attr.size[1] == 1)
{
*layout = LAYOUT_BSNH;
}
else
{
*layout = LAYOUT_BNSH;
}
}
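/* Summary of the shape-driven inference above, in the B/N/S/H naming from the
 * comment at the top of this file (a reading, not extra logic; cos and sin
 * must share a dtype in every case):
 *   axis == 1, in0 and cos agree on size[0], size[0] == 1  -> LAYOUT_BNH1
 *   axis == 1, in0 and cos agree on size[0], size[0] >  1  -> LAYOUT_BNHS
 *   axis == 0, sizes agree on dim 2 and cos size[1] == 1   -> LAYOUT_BSNH
 *   axis == 0, otherwise                                   -> LAYOUT_BNSH
 * so callers never pass a layout; it falls out of the tensor shapes. */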
key = ROPE_HASH_KEY(in0_dtype, in1_dtype, out_dtype, *layout, interleaved);
for ( i = 0; i < (uint32_t)kernel_map_size; i ++ )
{
if ( kernel_map[i].key == key )
{
break;
}
}
if ( i < (uint32_t)kernel_map_size )
{
snprintf( kernel->info.name, VX_MAX_KERNEL_NAME, "%s", kernel_map[i].function_name );
kernel->info.parameters = param_def;
kernel->info.numParams = _cnt_of_array( _rope_kernel_param_def );
kernel->info.initialize = initializer;
// Register code source
vsi_nn_kernel_add_source( kernel, VSI_NN_GPU_SOURCE_FMT_CODE, 2,
"vsi_nn_kernel_header",
kernel_map[i].source_name );
// Register binary source
vsi_nn_kernel_add_source( kernel, VSI_NN_GPU_SOURCE_FMT_EXECUTABLE, 1,
kernel_map[i].source_name );
status = VSI_SUCCESS;
}
return status;
} /* _query_kernel() */
static vsi_nn_kernel_node_t _setup
(
vsi_nn_graph_t * graph,
vsi_nn_tensor_t ** inputs,
size_t input_num,
vsi_nn_tensor_t ** outputs,
size_t output_num,
const vsi_nn_kernel_param_t * params,
vsi_nn_kernel_t * kernel
)
{
vsi_status status = VSI_FAILURE;
vsi_nn_kernel_node_param_t node_params[_ROPE_PARAM_NUM] = { NULL };
vsi_nn_kernel_node_t node = NULL;
int32_t axis = 0;
int32_t i = 0;
int32_t interleaved = 0;
int32_t param = 0;
vsi_size_t shape[3][VSI_NN_MAX_DIM_NUM] = { 0 };
vsi_nn_tensor_t* rs_tensors[4] = { NULL };
vsi_nn_tensor_t* reshape_tensors[4] = { NULL };
_internal_rope_layout_e layout = LAYOUT_NONE;
VSI_UNREFERENCED(params);
axis = vsi_nn_kernel_param_get_int32(params, "axis");
interleaved = vsi_nn_kernel_param_get_int32(params, "interleaved");
// Check if gpu can support the size
if ( !vsi_nn_kernel_gpu_check_shape(
inputs[0]->attr.size, inputs[0]->attr.dim_num ) )
{
return NULL;
}
status = _query_kernel( kernel, inputs, outputs, axis, interleaved, &layout );
if (outputs[0]->attr.size[0] == 1 || layout == LAYOUT_BSNH)
{
memcpy(shape[0], inputs[0]->attr.size, VSI_NN_MAX_DIM_NUM * sizeof(vsi_size_t));
memcpy(shape[1], inputs[1]->attr.size, VSI_NN_MAX_DIM_NUM * sizeof(vsi_size_t));
memcpy(shape[2], outputs[0]->attr.size, VSI_NN_MAX_DIM_NUM * sizeof(vsi_size_t));
if (outputs[0]->attr.size[0] == 1)
{
for (i = 1; i < 3; i++)
{
shape[0][i - 1] = shape[0][i];
shape[1][i - 1] = shape[1][i];
shape[2][i - 1] = shape[2][i];
}
shape[0][2] = 1;
shape[1][2] = 1;
shape[2][2] = 1;
}
else
{
int32_t j = 0;
for (i = 0; i < 3; i++)
{
if (shape[1][i] != 1)
{
shape[1][j] = shape[1][i];
j ++;
}
}
for (; j < 3; j++)
{
shape[1][j] = 1;
}
}
rs_tensors[0] = vsi_nn_reshape_tensor(graph,
inputs[0], shape[0], inputs[0]->attr.dim_num);
rs_tensors[1] = vsi_nn_reshape_tensor(graph,
inputs[1], shape[1], inputs[1]->attr.dim_num);
rs_tensors[2] = vsi_nn_reshape_tensor(graph,
inputs[2], shape[1], inputs[2]->attr.dim_num);
rs_tensors[3] = vsi_nn_reshape_tensor(graph,
outputs[0], shape[2], outputs[0]->attr.dim_num);
if (outputs[0]->attr.size[0] == 1 && axis > 0)
{
axis--;
}
reshape_tensors[0] = rs_tensors[0];
reshape_tensors[1] = rs_tensors[1];
reshape_tensors[2] = rs_tensors[2];
reshape_tensors[3] = rs_tensors[3];
}
else
{
reshape_tensors[0] = inputs[0];
reshape_tensors[1] = inputs[1];
reshape_tensors[2] = inputs[2];
reshape_tensors[3] = outputs[0];
}
param = (interleaved << 16) | axis;
if ( VSI_SUCCESS == status)
{
node = vsi_nn_kernel_create_node( graph, kernel );
if ( node )
{
/* Set inputs and outputs */
vsi_nn_kernel_node_pack_io( node_params, _ROPE_PARAM_NUM,
reshape_tensors, input_num, &reshape_tensors[3], output_num );
/* Pass parameters to node. */
node_params[SCALAR_AXIS] = vsi_nn_kernel_scalar_create(graph, I32, &param);
status = vsi_nn_kernel_node_pass_param( node, node_params, _ROPE_PARAM_NUM );
vsi_nn_kernel_scalar_release(&node_params[SCALAR_AXIS]);
}
}
for (i = 0; i < 4; i++)
{
vsi_safe_release_tensor(rs_tensors[i]);
}
return node;
} /* _setup() */
__END_DECLS
REGISTER_BACKEND_EVIS( rope, _setup )


@ -186,18 +186,26 @@ static const _kernel_map_type scatter_nd_update_special_ref_map[] =
{
TENSOR_SCATTER_ND_UPDATE_SPECIAL_REF_KERNELS(U8, I32, U8, U8, KERNEL_SOURCE_4)
TENSOR_SCATTER_ND_UPDATE_SPECIAL_REF_KERNELS(I8, I32, I8, I8, KERNEL_SOURCE_4)
TENSOR_SCATTER_ND_UPDATE_SPECIAL_REF_KERNELS(I16, I32, I16, I16, KERNEL_SOURCE_4)
TENSOR_SCATTER_ND_UPDATE_SPECIAL_REF_KERNELS(U16, I32, U16, U16, KERNEL_SOURCE_4)
TENSOR_SCATTER_ND_UPDATE_SPECIAL_REF_KERNELS(F16, I32, F16, F16, KERNEL_SOURCE_4)
};
static const _kernel_map_type scatter_nd_update_special_update_map[] =
{
TENSOR_SCATTER_ND_UPDATE_SPECIAL_UPDATE_KERNELS(U8, I32, U8, U8, KERNEL_SOURCE_4)
TENSOR_SCATTER_ND_UPDATE_SPECIAL_UPDATE_KERNELS(I8, I32, I8, I8, KERNEL_SOURCE_4)
TENSOR_SCATTER_ND_UPDATE_SPECIAL_UPDATE_KERNELS(U16, I32, U16, U16, KERNEL_SOURCE_4)
TENSOR_SCATTER_ND_UPDATE_SPECIAL_UPDATE_KERNELS(I16, I32, I16, I16, KERNEL_SOURCE_4)
TENSOR_SCATTER_ND_UPDATE_SPECIAL_UPDATE_KERNELS(F16, I32, F16, F16, KERNEL_SOURCE_4)
};
static const _kernel_map_type scatter_nd_update_special_copy_map[] =
{
TENSOR_SCATTER_ND_UPDATE_SPECIAL_COPY_KERNELS(U8, I32, U8, U8, KERNEL_SOURCE_4)
TENSOR_SCATTER_ND_UPDATE_SPECIAL_COPY_KERNELS(I8, I32, I8, I8, KERNEL_SOURCE_4)
TENSOR_SCATTER_ND_UPDATE_SPECIAL_COPY_KERNELS(U16, I32, U16, U16, KERNEL_SOURCE_4)
TENSOR_SCATTER_ND_UPDATE_SPECIAL_COPY_KERNELS(I16, I32, I16, I16, KERNEL_SOURCE_4)
};
/*
@ -563,6 +571,8 @@ DEF_KERNEL_INITIALIZER(_scatter_nd_update_special_ref_initializer)
{
case _PACK_SELECT_KEY( I8, I8 ):
case _PACK_SELECT_KEY( U8, U8 ):
case _PACK_SELECT_KEY( I16, I16 ):
case _PACK_SELECT_KEY( U16, U16 ):
{
uint16_t M0 = 0;
int32_t postShift0 = 0;
@ -605,6 +615,8 @@ DEF_KERNEL_INITIALIZER(_scatter_nd_update_special_ref_initializer)
CHECK_STATUS_FAIL_GOTO(status, OnError );
}
break;
case _PACK_SELECT_KEY( F16, F16 ):
break;
default:
break;
}
@ -759,6 +771,8 @@ DEF_KERNEL_INITIALIZER(_scatter_nd_update_special_update_initializer)
{
case _PACK_SELECT_KEY( I8, I8 ):
case _PACK_SELECT_KEY( U8, U8 ):
case _PACK_SELECT_KEY( I16, I16 ):
case _PACK_SELECT_KEY( U16, U16 ):
{
uint16_t M1 = 0;
int32_t postShift1 = 0;
@ -801,6 +815,8 @@ DEF_KERNEL_INITIALIZER(_scatter_nd_update_special_update_initializer)
CHECK_STATUS_FAIL_GOTO(status, OnError );
}
break;
case _PACK_SELECT_KEY( F16, F16 ):
break;
default:
break;
}
@ -1597,6 +1613,19 @@ static vsi_status _query_kernel_special
status |= VSI_FAILURE;
}
if (input0_dtype == F16)
{
input0_dtype = U16;
}
if (input2_dtype == F16)
{
input2_dtype = U16;
}
if (output_dtype == F16)
{
output_dtype = U16;
}
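/* The special-path kernels only move 16-bit payloads for F16 (its initializer cases are
   no-ops above), so F16 reuses the U16 kernel binaries: remap the dtypes before hashing
   the kernel key. */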
key = HASH_SCATTER_ND_UPDATE_KEY( input0_dtype, input2_dtype, output_dtype, 6, 1, 0);
for ( i = 0; i < _cnt_of_array(scatter_nd_update_special_copy_map); i ++ )

View File

@ -27,6 +27,7 @@
#include <stdlib.h>
#include <string.h>
#include "vsi_nn_types.h"
#include "vsi_nn_types_prv.h"
#include "vsi_nn_tensor.h"
#include "vsi_nn_graph.h"
#include "vsi_nn_log.h"
@ -591,7 +592,7 @@ static vsi_nn_kernel_node_t _setup
VSI_UNREFERENCED(input_num);
VSI_UNREFERENCED(output_num);
#if (VX_ACTIVATION_EXT_SUPPORT)
if (VSI_NN_HW_EVIS_2 == graph->ctx->config.evis.ver)
if (VSI_NN_HW_EVIS_2 == ((vsi_nn_graph_prv_t*)graph)->options->config.evis.ver)
{
return NULL;
}

View File

@ -548,16 +548,16 @@ static vsi_status _gpu_register
vsi_status status;
vx_kernel_description_t* info;
vx_kernel obj;
vsi_nn_context_t context;
vx_program program = NULL;
const vsi_nn_gpu_source_fmt_e active_fmt = kernel->gpu.active_source_fmt;
vsi_nn_runtime_option_t* options;
options = ((vsi_nn_graph_prv_t*)graph)->options;
#define MAX_BUILDPROGRAM_LEN 1024
char cmd[MAX_BUILDPROGRAM_LEN] = { 0 };
size_t cost_bytes = 0;
memset( cmd, 0, sizeof(char) * MAX_BUILDPROGRAM_LEN );
context = graph->ctx;
status = VSI_FAILURE;
info = &(kernel->info);
@ -579,21 +579,21 @@ static vsi_status _gpu_register
return status;
}
if( context->config.evis.ver == VSI_NN_HW_EVIS_NONE )
if (options->config.evis.ver == VSI_NN_HW_EVIS_NONE)
{
// set the default evis version to 2
if( VSI_NN_KERNEL_TYPE_EVIS == kernel->type )
{
cost_bytes = snprintf( cmd, MAX_BUILDPROGRAM_LEN,
"-cl-viv-vx-extension -D VX_VERSION=2 -D USE_40BITS_VA=%d",
context->config.use_40bits_va );
options->config.use_40bits_va );
}
}
else
{
cost_bytes = snprintf( cmd, MAX_BUILDPROGRAM_LEN,
"-cl-viv-vx-extension -D VX_VERSION=%d -D USE_40BITS_VA=%d",
context->config.evis.ver, context->config.use_40bits_va );
options->config.evis.ver, options->config.use_40bits_va );
}
// Pack build option
if( kernel->gpu.sources[active_fmt].build_option.data )
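For reference, on an EVIS 2.0 target with 40-bit VA enabled the assembled option string reads:

    -cl-viv-vx-extension -D VX_VERSION=2 -D USE_40BITS_VA=1

Any per-kernel build options are appended to this prefix by the packing step that follows.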
@ -655,16 +655,16 @@ static vsi_status _gpu_register_ext
vsi_status status;
vx_kernel_description_t* info;
vx_kernel obj;
vsi_nn_context_t context;
vx_program program = NULL;
const vsi_nn_gpu_source_fmt_e active_fmt = kernel->gpu.active_source_fmt;
vsi_nn_runtime_option_t* options;
options = ((vsi_nn_graph_prv_t*)graph)->options;
#define MAX_BUILDPROGRAM_LEN 1024
char cmd[MAX_BUILDPROGRAM_LEN] = { 0 };
size_t cost_bytes = 0;
memset( cmd, 0, sizeof(char) * MAX_BUILDPROGRAM_LEN );
context = graph->ctx;
status = VSI_FAILURE;
info = &(kernel->info);
@ -686,21 +686,21 @@ static vsi_status _gpu_register_ext
return status;
}
if( context->config.evis.ver == VSI_NN_HW_EVIS_NONE )
if (options->config.evis.ver == VSI_NN_HW_EVIS_NONE)
{
// set the default evis version to 2
if( VSI_NN_KERNEL_TYPE_EVIS == kernel->type )
{
cost_bytes = snprintf( cmd, MAX_BUILDPROGRAM_LEN,
"-cl-viv-vx-extension -D VX_VERSION=2 -D USE_40BITS_VA=%d",
context->config.use_40bits_va );
options->config.use_40bits_va );
}
}
else
{
cost_bytes = snprintf( cmd, MAX_BUILDPROGRAM_LEN,
"-cl-viv-vx-extension -D VX_VERSION=%d -D USE_40BITS_VA=%d",
context->config.evis.ver, context->config.use_40bits_va );
options->config.evis.ver, options->config.use_40bits_va );
}
// Pack build option
if( kernel->gpu.sources[active_fmt].build_option.data )
@ -1258,7 +1258,7 @@ vsi_nn_kernel_node_t vsi_nn_kernel_selector
}
/* Skip evis if not support */
if( type == VSI_NN_KERNEL_TYPE_EVIS
&& graph->ctx->config.evis.ver == VSI_NN_HW_EVIS_NONE )
&& ((vsi_nn_graph_prv_t*)graph)->options->config.evis.ver == VSI_NN_HW_EVIS_NONE )
{
continue;
}
@ -1677,7 +1677,7 @@ static vsi_bool _check_shader_support(vsi_nn_graph_t* graph)
int32_t enableShader = ((vsi_nn_graph_prv_t*)graph)->options->enable_shader;
#if VX_HARDWARE_CAPS_PARAMS_EXT_SUPPORT
if ( graph->ctx->config.subGroupSize == 0 )
if ( ((vsi_nn_graph_prv_t*)graph)->options->config.subGroupSize == 0 )
{
return FALSE;
}

View File

@ -162,15 +162,11 @@ REGISTER_VX_FIRST_KERNEL_SELECTOR(pow)
#if (VX_TENSOR_GATHER_API_SUPPORT)
REGISTER_VX_FIRST_KERNEL_SELECTOR(gather)
#endif
#if (VX_RELATIONAL_OPS_VX_SUPPORT_EXT)
REGISTER_VX_FIRST_KERNEL_SELECTOR(relational_ops)
#endif
#if (VX_TENSOR_TILE_API_SUPPORT)
REGISTER_VX_FIRST_KERNEL_SELECTOR(tile)
#endif
#if (VX_LAYER_NORMALIZATION_VX_SUPPORT_EXT)
REGISTER_VX_FIRST_KERNEL_SELECTOR(layer_norm)
#endif
#if (VX_ACTIVATION_EXP_VX_SUPPORT_EXT)
REGISTER_VX_FIRST_KERNEL_SELECTOR(exp)
#endif
@ -184,6 +180,7 @@ REGISTER_VX_FIRST_KERNEL_SELECTOR(log_softmax)
#if (VX_BITCAST_VX_SUPPORT)
REGISTER_VX_FIRST_KERNEL_SELECTOR(bitcast)
#endif
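/* group_norm and instance_norm selectors are registered unconditionally; their OpenVX
   backends guard themselves with VX_GROUP_NORMALIZATION_VX_SUPPORT /
   VX_INSTANCE_NORMALIZATION_VX_SUPPORT (see the new kernel files below). */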
REGISTER_VX_FIRST_KERNEL_SELECTOR(group_norm)
REGISTER_VX_FIRST_KERNEL_SELECTOR(instance_norm)
__END_DECLS

View File

@ -0,0 +1,89 @@
/****************************************************************************
*
* Copyright (c) 2021 Vivante Corporation
*
* Permission is hereby granted, free of charge, to any person obtaining a
* copy of this software and associated documentation files (the "Software"),
* to deal in the Software without restriction, including without limitation
* the rights to use, copy, modify, merge, publish, distribute, sublicense,
* and/or sell copies of the Software, and to permit persons to whom the
* Software is furnished to do so, subject to the following conditions:
*
* The above copyright notice and this permission notice shall be included in
* all copies or substantial portions of the Software.
*
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
* AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
* DEALINGS IN THE SOFTWARE.
*
*****************************************************************************/
#include "vsi_nn_types.h"
#include "vsi_nn_tensor.h"
#include "vsi_nn_node.h"
#include "vsi_nn_log.h"
#include "vsi_nn_prv.h"
#include "vsi_nn_tensor_util.h"
#include "kernel/vsi_nn_kernel.h"
#if VX_GROUP_NORMALIZATION_VX_SUPPORT
#define REGISTER_GROUP_NORM_OPENVX_KERNEL( kernel_name ) \
static vsi_nn_kernel_node_t _##kernel_name##setup \
( \
vsi_nn_graph_t * graph, \
vsi_nn_tensor_t ** inputs, \
size_t input_num, \
vsi_nn_tensor_t ** outputs, \
size_t output_num,\
const vsi_nn_kernel_param_t * params, \
vsi_nn_kernel_t * kernel \
); \
REGISTER_BACKEND_OPENVX( kernel_name, _##kernel_name##setup ) \
static vsi_nn_kernel_node_t _##kernel_name##setup \
( \
vsi_nn_graph_t * graph, \
vsi_nn_tensor_t ** inputs, \
size_t input_num, \
vsi_nn_tensor_t ** outputs, \
size_t output_num,\
const vsi_nn_kernel_param_t * params, \
vsi_nn_kernel_t * kernel \
)
REGISTER_GROUP_NORM_OPENVX_KERNEL(group_norm)
{
vx_node node = NULL;
float eps = vsi_nn_kernel_param_get_float32(params, "eps");
int32_t group_num = vsi_nn_kernel_param_get_int32(params, "group_num");
vx_tensor inputs_tensor[3] = { NULL };
vx_tensor output_tensor = NULL;
inputs_tensor[0] = inputs[0]->t;
inputs_tensor[1] = inputs[1]->t;
inputs_tensor[2] = inputs[2]->t;
output_tensor = outputs[0]->t;
VSI_UNREFERENCED(output_num);
VSI_UNREFERENCED(kernel);
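/* Dispatch to the OpenVX built-in only on targets with the fixed-function or
   stream-processor path; returning NULL otherwise lets the kernel selector fall back
   to shader implementations (an inference from the guard below). */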
if (graph->ctx->config.support_ffd ||
graph->ctx->config.support_stream_processor)
{
node = vxGroupNormalizationLayer(
graph->g,
eps,
group_num,
inputs_tensor,
(vx_uint32)input_num,
output_tensor
);
}
return (vsi_nn_kernel_node_t)node;
} /* group_norm() */
#endif

View File

@ -0,0 +1,87 @@
/****************************************************************************
*
* Copyright (c) 2021 Vivante Corporation
*
* Permission is hereby granted, free of charge, to any person obtaining a
* copy of this software and associated documentation files (the "Software"),
* to deal in the Software without restriction, including without limitation
* the rights to use, copy, modify, merge, publish, distribute, sublicense,
* and/or sell copies of the Software, and to permit persons to whom the
* Software is furnished to do so, subject to the following conditions:
*
* The above copyright notice and this permission notice shall be included in
* all copies or substantial portions of the Software.
*
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
* AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
* DEALINGS IN THE SOFTWARE.
*
*****************************************************************************/
#include "vsi_nn_types.h"
#include "vsi_nn_tensor.h"
#include "vsi_nn_node.h"
#include "vsi_nn_log.h"
#include "vsi_nn_prv.h"
#include "vsi_nn_tensor_util.h"
#include "kernel/vsi_nn_kernel.h"
#if VX_INSTANCE_NORMALIZATION_VX_SUPPORT
#define REGISTER_INSTANCE_NORM_OPENVX_KERNEL( kernel_name ) \
static vsi_nn_kernel_node_t _##kernel_name##setup \
( \
vsi_nn_graph_t * graph, \
vsi_nn_tensor_t ** inputs, \
size_t input_num, \
vsi_nn_tensor_t ** outputs, \
size_t output_num,\
const vsi_nn_kernel_param_t * params, \
vsi_nn_kernel_t * kernel \
); \
REGISTER_BACKEND_OPENVX( kernel_name, _##kernel_name##setup ) \
static vsi_nn_kernel_node_t _##kernel_name##setup \
( \
vsi_nn_graph_t * graph, \
vsi_nn_tensor_t ** inputs, \
size_t input_num, \
vsi_nn_tensor_t ** outputs, \
size_t output_num,\
const vsi_nn_kernel_param_t * params, \
vsi_nn_kernel_t * kernel \
)
REGISTER_INSTANCE_NORM_OPENVX_KERNEL(instance_norm)
{
vsi_nn_kernel_node_t node = NULL;
float eps = vsi_nn_kernel_param_get_float32(params, "eps");
vx_tensor inputs_tensor[3] = { NULL };
vx_tensor output_tensor = NULL;
inputs_tensor[0] = inputs[0]->t;
inputs_tensor[1] = inputs[1]->t;
inputs_tensor[2] = inputs[2]->t;
output_tensor = outputs[0]->t;
VSI_UNREFERENCED(output_num);
VSI_UNREFERENCED(kernel);
if (graph->ctx->config.support_ffd ||
graph->ctx->config.support_stream_processor)
{
node = vxInstanceNormalizationLayer(
graph->g,
eps,
inputs_tensor,
(vx_uint32)input_num,
output_tensor
);
}
return (vsi_nn_kernel_node_t)node;
} /* instance_norm() */
#endif

View File

@ -30,7 +30,7 @@
#include "vsi_nn_tensor_util.h"
#include "kernel/vsi_nn_kernel.h"
#if (VX_LAYER_NORMALIZATION_VX_SUPPORT_EXT)
#if (VX_LAYER_NORMALIZATION_VX_SUPPORT)
#define REGISTER_LAYER_NORM_OPENVX_KERNEL( kernel_name ) \
static vsi_nn_kernel_node_t _##kernel_name##setup \
( \
@ -71,14 +71,20 @@ REGISTER_LAYER_NORM_OPENVX_KERNEL( layer_norm )
inputs_tensor[2] = inputs[2]->t;
output_tensor = outputs[0]->t;
node = vxLayerNormalizationLayer(
graph->g,
eps,
axis,
inputs_tensor,
(uint32_t)input_num,
output_tensor
#if !defined(VX_LAYER_NORMALIZATION_VX_SUPPORT_EXT) || !(VX_LAYER_NORMALIZATION_VX_SUPPORT_EXT)
if (graph->ctx->config.support_ffd ||
graph->ctx->config.support_stream_processor)
#endif
{
node = vxLayerNormalizationLayer(
graph->g,
eps,
axis,
inputs_tensor,
(uint32_t)input_num,
output_tensor
);
}
return (vsi_nn_kernel_node_t)node;
} /* layer_norm() */

View File

@ -89,9 +89,10 @@ REGISTER_PAD2_OPENVX_KERNEL( pad2 )
if ( vsi_nn_DtypeCompare(&inputs[0]->attr.dtype, &outputs[0]->attr.dtype) == FALSE)
{
vsi_nn_tensor_attr_t attr;
memcpy( &attr, &outputs[0]->attr, sizeof( attr ) );
memcpy( &attr.size, &inputs[0]->attr.size, sizeof( attr.size ) );
attr.vtl = FALSE;
attr.vtl = TRUE;
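/* vtl = TRUE keeps the dtype-conversion tensor virtual, so the runtime may fold it into
   the graph instead of materializing a buffer (an inference from the virtual-tensor flag). */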
attr.is_const = FALSE;
convert_tensor = vsi_nn_CreateTensor(graph, &attr);

View File

@ -30,7 +30,7 @@
#include "vsi_nn_tensor_util.h"
#include "kernel/vsi_nn_kernel.h"
#if (VX_RELATIONAL_OPS_VX_SUPPORT_EXT)
#if (VX_RELATIONAL_OPS_VX_SUPPORT)
#define REGISTER_RELATIONAL_OPS_OPENVX_KERNEL( kernel_name ) \
static vsi_nn_kernel_node_t _##kernel_name##setup \
@ -68,12 +68,25 @@ REGISTER_RELATIONAL_OPS_OPENVX_KERNEL( relational_ops )
VSI_UNREFERENCED(kernel);
VSI_UNREFERENCED(output_num);
node = vxRelationalLayer(graph->g,
operation,
inputs_tensor,
(uint32_t)input_num,
outputs[0]->t
);
#if !defined(VX_RELATIONAL_OPS_VX_SUPPORT_EXT) || !(VX_RELATIONAL_OPS_VX_SUPPORT_EXT)
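/* Without the EXT API the built-in relational layer cannot broadcast; bail out so the
   selector picks a shader kernel for broadcast shapes (an inference from this guard). */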
if (vsi_nn_is_broadcast_operaton(inputs, input_num, outputs[0]))
{
return NULL;
}
#endif
#if !defined(VX_RELATIONAL_OPS_VX_SUPPORT_EXT) || !(VX_RELATIONAL_OPS_VX_SUPPORT_EXT)
if (graph->ctx->config.support_stream_processor)
#endif
{
node = vxRelationalLayer(
graph->g,
operation,
inputs_tensor,
(uint32_t)input_num,
outputs[0]->t
);
}
return (vsi_nn_kernel_node_t)node;
} /* relational_ops() */

View File

@ -23,6 +23,7 @@
*****************************************************************************/
#include "vsi_nn_types.h"
#include "vsi_nn_types_prv.h"
#include "vsi_nn_tensor.h"
#include "vsi_nn_node.h"
#include "vsi_nn_log.h"
@ -66,7 +67,7 @@ REGISTER_SWISH_OPENVX_KERNEL( swish )
VSI_UNREFERENCED(output_num);
VSI_UNREFERENCED(input_num);
if (VSI_NN_HW_EVIS_2 == graph->ctx->config.evis.ver)
if (VSI_NN_HW_EVIS_2 == ((vsi_nn_graph_prv_t*)graph)->options->config.evis.ver)
{
swish_type = (vsi_nn_swish_type)vsi_nn_kernel_param_get_int32(params, "type");

View File

@ -67,8 +67,8 @@ __kernel void cumsum_F32toF32_axis2(
}
}
#define CUMSUM_toU8_AXIS2_SH(name, src_type, read_image_type) \
__kernel void cumsum_##name##toU8_axis2( \
#define CUMSUM_toINT_AXIS2_SH(name, src_type, image_read, dst_type, image_write, convert_dtype) \
__kernel void cumsum_##name##_axis2( \
__read_only image2d_array_t input, \
__write_only image2d_array_t output, \
int axis, \
@ -87,19 +87,19 @@ __kernel void cumsum_##name##toU8_axis2( \
int4 coord_out = coord; \
\
src_type sum = (src_type)(0); \
uint4 dst = (uint4)(0); \
dst_type dst = (dst_type)(0); \
int tmp_zp = convert_int_rte(output_zp); \
dst.x = convert_uint_sat(tmp_zp); \
dst.x = convert_dtype(tmp_zp); \
\
float cnt = 0.0f; \
\
if(exclusive && rev) \
{ \
coord_out.z = channel - 1; \
write_imageui(output, coord_out, dst); \
image_write(output, coord_out, dst); \
for(coord.z = channel - 1; coord.z > 0; coord.z--) \
{ \
src_type data = read_image_type(input, coord); \
src_type data = image_read(input, coord); \
coord_out.z--; \
cnt += 1.0f; \
sum += data; \
@ -107,17 +107,17 @@ __kernel void cumsum_##name##toU8_axis2( \
float tmpAlpha = cnt * in_out_zp_scale + output_zp; \
float tmpSum = sum.x * in_out_scale + tmpAlpha; \
\
dst.x = (uint)convert_int_rte(tmpSum); \
write_imageui(output, coord_out, dst); \
dst.x = convert_dtype(tmpSum); \
image_write(output, coord_out, dst); \
} \
} \
else if(exclusive) \
{ \
coord_out.z = 0; \
write_imageui(output, coord_out, dst); \
image_write(output, coord_out, dst); \
for(coord.z = 0; coord.z < channel - 1; coord.z++) \
{ \
src_type data = read_image_type(input, coord); \
src_type data = image_read(input, coord); \
coord_out.z++; \
cnt += 1.0f; \
sum += data; \
@ -125,45 +125,44 @@ __kernel void cumsum_##name##toU8_axis2( \
float tmpAlpha = cnt * in_out_zp_scale + output_zp; \
float tmpSum = sum.x * in_out_scale + tmpAlpha; \
\
dst.x = (uint)convert_int_rte(tmpSum); \
write_imageui(output, coord_out, dst); \
dst.x = convert_dtype(tmpSum); \
image_write(output, coord_out, dst); \
} \
} \
else if(rev) \
{ \
for(coord.z = channel - 1; coord.z >= 0; coord.z--) \
{ \
src_type data = read_image_type(input, coord); \
src_type data = image_read(input, coord); \
cnt += 1.0f; \
sum += data; \
\
float tmpAlpha = cnt * in_out_zp_scale + output_zp; \
float tmpSum = sum.x * in_out_scale + tmpAlpha; \
\
dst.x = (uint)convert_int_rte(tmpSum); \
write_imageui(output, coord, dst); \
dst.x = convert_dtype(tmpSum); \
image_write(output, coord, dst); \
} \
} \
else \
{ \
for(coord.z = 0; coord.z < channel; coord.z++) \
{ \
src_type data = read_image_type(input, coord); \
src_type data = image_read(input, coord); \
cnt += 1.0f; \
sum += data; \
\
float tmpAlpha = cnt * in_out_zp_scale + output_zp; \
float tmpSum = sum.x * in_out_scale + tmpAlpha; \
\
dst.x = (uint)convert_int_rte(tmpSum); \
write_imageui(output, coord, dst); \
dst.x = convert_dtype(tmpSum); \
image_write(output, coord, dst); \
} \
} \
}
CUMSUM_toU8_AXIS2_SH(U8,uint4,read_imageui)
CUMSUM_toU8_AXIS2_SH(F32,float4,read_imagef)
CUMSUM_toINT_AXIS2_SH(U8toU8, uint4, read_imageui, uint4, write_imageui, convert_uint_sat_rte)
CUMSUM_toINT_AXIS2_SH(F32toU8, float4, read_imagef, uint4, write_imageui, convert_uint_sat_rte)
CUMSUM_toINT_AXIS2_SH(I32toI32, int4, read_imagei, int4, write_imagei, convert_int_sat_rte)
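// For reference, the I32 instantiation above pastes to a kernel named cumsum_I32toI32_axis2
// that reads via read_imagei, accumulates in an int4 sum, converts with convert_int_sat_rte,
// and stores via write_imagei; the two U8 variants differ only in these substituted tokens.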
__kernel void cumsum_F32toF32_axis1(
__read_only image2d_array_t input,
@ -233,10 +232,10 @@ __kernel void cumsum_F32toF32_axis1(
}
}
#define CUMSUM_toU8_AXIS1_SH(name, src_type, read_image_type) \
__kernel void cumsum_##name##toU8_axis1( \
__read_only image2d_array_t input, \
__write_only image2d_array_t output, \
#define CUMSUM_toINT_AXIS1_SH(name, src_type, image_read, dst_type, image_write, convert_dtype) \
__kernel void cumsum_##name##_axis1( \
__read_only image2d_array_t input, \
__write_only image2d_array_t output, \
int axis, \
int exclusive, \
int rev, \
@ -253,20 +252,20 @@ __kernel void cumsum_##name##toU8_axis1( \
int4 coord_out = coord; \
\
src_type sum = (src_type)(0); \
uint4 dst = (uint4)(0); \
dst_type dst = (dst_type)(0); \
int tmp_zp = convert_int_rte(output_zp); \
dst.x = convert_uint_sat(tmp_zp); \
dst.x = convert_dtype(tmp_zp); \
\
float cnt = 0; \
\
if(exclusive && rev) \
{ \
coord_out.y = height - 1; \
write_imageui(output, coord_out, dst); \
image_write(output, coord_out, dst); \
\
for(coord.y = height - 1; coord.y > 0; coord.y--) \
{ \
src_type data = read_image_type(input, coord); \
src_type data = image_read(input, coord); \
cnt += 1.0f; \
coord_out.y--; \
sum += data; \
@ -274,17 +273,17 @@ __kernel void cumsum_##name##toU8_axis1( \
float tmpAlpha = cnt * in_out_zp_scale + output_zp; \
float tmpSum = sum.x * in_out_scale + tmpAlpha; \
\
dst.x = (uint)convert_int_rte(tmpSum); \
write_imageui(output, coord_out, dst); \
dst.x = convert_dtype(tmpSum); \
image_write(output, coord_out, dst); \
} \
} \
else if(exclusive) \
{ \
coord_out.y = 0; \
write_imageui(output, coord_out, dst); \
image_write(output, coord_out, dst); \
for(coord.y = 0; coord.y < height - 1; coord.y++) \
{ \
src_type data = read_image_type(input, coord); \
src_type data = image_read(input, coord); \
cnt += 1.0f; \
coord_out.y++; \
sum += data; \
@ -292,44 +291,44 @@ __kernel void cumsum_##name##toU8_axis1( \
float tmpAlpha = cnt * in_out_zp_scale + output_zp; \
float tmpSum = sum.x * in_out_scale + tmpAlpha; \
\
dst.x = (uint)convert_int_rte(tmpSum); \
write_imageui(output, coord_out, dst); \
dst.x = convert_dtype(tmpSum); \
image_write(output, coord_out, dst); \
} \
} \
else if(rev) \
{ \
for(coord.y = height - 1; coord.y >= 0; coord.y--) \
{ \
src_type data = read_image_type(input, coord); \
src_type data = image_read(input, coord); \
cnt += 1.0f; \
sum += data; \
\
float tmpAlpha = cnt * in_out_zp_scale + output_zp; \
float tmpSum = sum.x * in_out_scale + tmpAlpha; \
\
dst.x = (uint)convert_int_rte(tmpSum); \
write_imageui(output, coord, dst); \
dst.x = convert_dtype(tmpSum); \
image_write(output, coord, dst); \
} \
} \
else \
{ \
for(coord.y = 0; coord.y < height; coord.y++) \
{ \
src_type data = read_image_type(input, coord); \
src_type data = image_read(input, coord); \
cnt += 1.0f; \
sum += data; \
\
float tmpAlpha = cnt * in_out_zp_scale + output_zp; \
float tmpSum = sum.x * in_out_scale + tmpAlpha; \
\
dst.x = (uint)convert_int_rte(tmpSum); \
write_imageui(output, coord, dst); \
dst.x = convert_dtype(tmpSum); \
image_write(output, coord, dst); \
} \
} \
}
CUMSUM_toU8_AXIS1_SH(U8,uint4,read_imageui)
CUMSUM_toU8_AXIS1_SH(F32,float4,read_imagef)
CUMSUM_toINT_AXIS1_SH(U8toU8, uint4, read_imageui, uint4, write_imageui, convert_uint_sat_rte)
CUMSUM_toINT_AXIS1_SH(F32toU8, float4, read_imagef, uint4, write_imageui, convert_uint_sat_rte)
CUMSUM_toINT_AXIS1_SH(I32toI32, int4, read_imagei, int4, write_imagei, convert_int_sat_rte)
__kernel void cumsum_F32toF32_axis0(
__read_only image2d_array_t input,
@ -399,8 +398,8 @@ __kernel void cumsum_F32toF32_axis0(
}
}
#define CUMSUM_toU8_AXIS0_SH(name, src_type, read_image_type) \
__kernel void cumsum_##name##toU8_axis0( \
#define CUMSUM_toINT_AXIS0_SH(name, src_type, image_read, dst_type, image_write, convert_dtype) \
__kernel void cumsum_##name##_axis0( \
__read_only image2d_array_t input, \
__write_only image2d_array_t output, \
int axis, \
@ -419,19 +418,19 @@ __kernel void cumsum_##name##toU8_axis0( \
int4 coord_out = coord; \
\
src_type sum = (src_type)(0); \
uint4 dst = (uint4)(0); \
dst_type dst = (dst_type)(0); \
int tmp_zp = convert_int_rte(output_zp); \
dst.x = convert_uint_sat(tmp_zp); \
dst.x = convert_dtype(tmp_zp); \
\
float cnt = 0; \
\
if(exclusive && rev) \
{ \
coord_out.x = width - 1; \
write_imageui(output, coord_out, dst); \
image_write(output, coord_out, dst); \
for(coord.x = width - 1; coord.x > 0; coord.x--) \
{ \
src_type data = read_image_type(input, coord); \
src_type data = image_read(input, coord); \
coord_out.x--; \
cnt += 1.0f; \
sum += data; \
@ -439,8 +438,8 @@ __kernel void cumsum_##name##toU8_axis0( \
float tmpAlpha = cnt * in_out_zp_scale + output_zp; \
float tmpSum = sum.x * in_out_scale + tmpAlpha; \
\
dst.x = (uint)convert_int_rte(tmpSum); \
write_imageui(output, coord_out, dst); \
dst.x = convert_dtype(tmpSum); \
image_write(output, coord_out, dst); \
} \
} \
else if(exclusive) \
@ -449,7 +448,7 @@ __kernel void cumsum_##name##toU8_axis0( \
image_write(output, coord_out, dst); \
for(coord.x = 0; coord.x < width - 1; coord.x++) \
{ \
src_type data = read_image_type(input, coord); \
src_type data = image_read(input, coord); \
coord_out.x++; \
cnt += 1.0f; \
sum += data; \
@ -457,40 +456,42 @@ __kernel void cumsum_##name##toU8_axis0( \
float tmpAlpha = cnt * in_out_zp_scale + output_zp; \
float tmpSum = sum.x * in_out_scale + tmpAlpha; \
\
dst.x = (uint)convert_int_rte(tmpSum); \
write_imageui(output, coord_out, dst); \
dst.x = convert_dtype(tmpSum); \
image_write(output, coord_out, dst); \
} \
} \
else if(rev) \
{ \
for(coord.x = width - 1; coord.x >= 0; coord.x--) \
{ \
src_type data = read_image_type(input, coord); \
src_type data = image_read(input, coord); \
cnt += 1.0f; \
sum += data; \
\
float tmpAlpha = cnt * in_out_zp_scale + output_zp; \
float tmpSum = sum.x * in_out_scale + tmpAlpha; \
\
dst.x = (uint)convert_int_rte(tmpSum); \
write_imageui(output, coord, dst); \
dst.x = convert_dtype(tmpSum); \
image_write(output, coord, dst); \
} \
} \
else \
{ \
for(coord.x = 0; coord.x < width; coord.x++) \
{ \
src_type data = read_image_type(input, coord); \
src_type data = image_read(input, coord); \
cnt += 1.0f; \
sum += data; \
\
float tmpAlpha = cnt * in_out_zp_scale + output_zp; \
float tmpSum = sum.x * in_out_scale + tmpAlpha; \
\
dst.x = (uint)convert_int_rte(tmpSum); \
write_imageui(output, coord, dst); \
dst.x = convert_dtype(tmpSum); \
image_write(output, coord, dst); \
} \
} \
}
CUMSUM_toU8_AXIS0_SH(U8,uint4,read_imageui)
CUMSUM_toU8_AXIS0_SH(F32,float4,read_imagef)
CUMSUM_toINT_AXIS0_SH(U8toU8, uint4, read_imageui, uint4, write_imageui, convert_uint_sat_rte)
CUMSUM_toINT_AXIS0_SH(F32toU8, float4, read_imagef, uint4, write_imageui, convert_uint_sat_rte)
CUMSUM_toINT_AXIS0_SH(I32toI32, int4, read_imagei, int4, write_imagei, convert_int_sat_rte)

View File

@ -65,188 +65,100 @@ __kernel void cumsum_F32toF32_axis1_2D(
}
}
__kernel void cumsum_U8toU8_axis1_2D(
__read_only image2d_t input,
__write_only image2d_t output,
int axis,
int exclusive,
int rev,
int width,
int height,
int chn,
int input_zp,
float in_out_scale,
float in_out_zp_scale,
float output_zp
)
{
int4 coord = (int4)(get_global_id(0), get_global_id(1), get_global_id(0), get_global_id(1));
uint4 sum = (uint4)(0);
uint4 dst = (uint4)(0);
int tmp_zp = convert_int_rte(output_zp);
dst.x = convert_uint_sat(tmp_zp);
float cnt = 0;
if(exclusive && rev)
{
coord.w = height - 1;
write_imageui(output, coord.zw, dst);
for(coord.y = height - 1; coord.y > 0; coord.y--)
{
uint4 data = read_imageui(input, coord.xy);
cnt += 1.0f;
coord.w--;
sum += data;
float tmpAlpha = cnt * in_out_zp_scale + output_zp;
float tmpSum = sum.x * in_out_scale + tmpAlpha;
dst.x = (uint)convert_int_rte(tmpSum);
write_imageui(output, coord.zw, dst);
}
}
else if(exclusive)
{
write_imageui(output, coord.zw, dst);
for(coord.y = 0; coord.y < height - 1; coord.y++)
{
uint4 data = read_imageui(input, coord.xy);
cnt += 1.0f;
coord.w++;
sum += data;
float tmpAlpha = cnt * in_out_zp_scale + output_zp;
float tmpSum = sum.x * in_out_scale + tmpAlpha;
dst.x = (uint)convert_int_rte(tmpSum);
write_imageui(output, coord.zw, dst);
}
}
else if(rev)
{
for(coord.y = height - 1; coord.y >= 0; coord.y--)
{
uint4 data = read_imageui(input, coord.xy);
cnt += 1.0f;
sum += data;
float tmpAlpha = cnt * in_out_zp_scale + output_zp;
float tmpSum = sum.x * in_out_scale + tmpAlpha;
dst.x = (uint)convert_int_rte(tmpSum);
write_imageui(output, coord.xy, dst);
}
}
else
{
for(coord.y = 0; coord.y < height; coord.y++)
{
uint4 data = read_imageui(input, coord.xy);
cnt += 1.0f;
sum += data;
float tmpAlpha = cnt * in_out_zp_scale + output_zp;
float tmpSum = sum.x * in_out_scale + tmpAlpha;
dst.x = (uint)convert_int_rte(tmpSum);
write_imageui(output, coord.xy, dst);
}
}
}
__kernel void cumsum_F32toU8_axis1_2D(
__read_only image2d_t input,
__write_only image2d_t output,
int axis,
int exclusive,
int rev,
int width,
int height,
int chn,
int input_zp,
float in_out_scale,
float in_out_zp_scale,
float output_zp
)
{
int4 coord = (int4)(get_global_id(0), get_global_id(1), get_global_id(0), get_global_id(1));
float4 sum = (float4)(0);
uint4 dst = (uint4)(0);
int tmp_zp = convert_int_rte(output_zp);
dst.x = convert_uint_sat(tmp_zp);
float cnt = 0;
if(exclusive && rev)
{
coord.w = height - 1;
write_imageui(output, coord.zw, dst);
for(coord.y = height - 1; coord.y > 0; coord.y--)
{
float4 data = read_imagef(input, coord.xy);
cnt += 1.0f;
coord.w--;
sum += data;
float tmpAlpha = cnt * in_out_zp_scale + output_zp;
float tmpSum = sum.x * in_out_scale + tmpAlpha;
dst.x = (uint)convert_int_rte(tmpSum);
write_imageui(output, coord.zw, dst);
}
}
else if(exclusive)
{
write_imageui(output, coord.zw, dst);
for(coord.y = 0; coord.y < height - 1; coord.y++)
{
float4 data = read_imagef(input, coord.xy);
cnt += 1.0f;
coord.w++;
sum += data;
float tmpAlpha = cnt * in_out_zp_scale + output_zp;
float tmpSum = sum.x * in_out_scale + tmpAlpha;
dst.x = (uint)convert_int_rte(tmpSum);
write_imageui(output, coord.zw, dst);
}
}
else if(rev)
{
for(coord.y = height - 1; coord.y >= 0; coord.y--)
{
float4 data = read_imagef(input, coord.xy);
cnt += 1.0f;
sum += data;
float tmpAlpha = cnt * in_out_zp_scale + output_zp;
float tmpSum = sum.x * in_out_scale + tmpAlpha;
dst.x = (uint)convert_int_rte(tmpSum);
write_imageui(output, coord.xy, dst);
}
}
else
{
for(coord.y = 0; coord.y < height; coord.y++)
{
float4 data = read_imagef(input, coord.xy);
cnt += 1.0f;
sum += data;
float tmpAlpha = cnt * in_out_zp_scale + output_zp;
float tmpSum = sum.x * in_out_scale + tmpAlpha;
dst.x = (uint)convert_int_rte(tmpSum);
write_imageui(output, coord.xy, dst);
}
}
#define CUMSUM_INT_AXIS1_2D_SH(name, src_type, image_read, dst_type, image_write, convert_dtype) \
__kernel void cumsum_##name##_axis1_2D( \
__read_only image2d_t input, \
__write_only image2d_t output, \
int axis, \
int exclusive, \
int rev, \
int width, \
int height, \
int chn, \
int input_zp, \
float in_out_scale, \
float in_out_zp_scale, \
float output_zp \
) \
{ \
int4 coord = (int4)(get_global_id(0), get_global_id(1), get_global_id(0), get_global_id(1)); \
\
src_type sum = (src_type)(0); \
dst_type dst = (dst_type)(0); \
int tmp_zp = convert_int_rte(output_zp); \
dst.x = convert_dtype(tmp_zp); \
\
float cnt = 0; \
\
if(exclusive && rev) \
{ \
coord.w = height - 1; \
image_write(output, coord.zw, dst); \
for(coord.y = height - 1; coord.y > 0; coord.y--) \
{ \
src_type data = image_read(input, coord.xy); \
cnt += 1.0f; \
coord.w--; \
sum += data; \
\
float tmpAlpha = cnt * in_out_zp_scale + output_zp; \
float tmpSum = sum.x * in_out_scale + tmpAlpha; \
\
dst.x = convert_dtype(tmpSum); \
image_write(output, coord.zw, dst); \
} \
} \
else if(exclusive) \
{ \
image_write(output, coord.zw, dst); \
for(coord.y = 0; coord.y < height - 1; coord.y++) \
{ \
src_type data = image_read(input, coord.xy); \
cnt += 1.0f; \
coord.w++; \
sum += data; \
\
float tmpAlpha = cnt * in_out_zp_scale + output_zp; \
float tmpSum = sum.x * in_out_scale + tmpAlpha; \
\
dst.x = convert_dtype(tmpSum); \
image_write(output, coord.zw, dst); \
} \
} \
else if(rev) \
{ \
for(coord.y = height - 1; coord.y >= 0; coord.y--) \
{ \
src_type data = image_read(input, coord.xy); \
cnt += 1.0f; \
sum += data; \
\
float tmpAlpha = cnt * in_out_zp_scale + output_zp; \
float tmpSum = sum.x * in_out_scale + tmpAlpha; \
\
dst.x = convert_dtype(tmpSum); \
image_write(output, coord.xy, dst); \
} \
} \
else \
{ \
for(coord.y = 0; coord.y < height; coord.y++) \
{ \
src_type data = image_read(input, coord.xy); \
cnt += 1.0f; \
sum += data; \
\
float tmpAlpha = cnt * in_out_zp_scale + output_zp; \
float tmpSum = sum.x * in_out_scale + tmpAlpha; \
\
dst.x = convert_dtype(tmpSum); \
image_write(output, coord.xy, dst); \
} \
} \
}
CUMSUM_INT_AXIS1_2D_SH(U8toU8, uint4, read_imageui, uint4, write_imageui, convert_uint_sat_rte)
CUMSUM_INT_AXIS1_2D_SH(F32toU8, float4, read_imagef, uint4, write_imageui, convert_uint_sat_rte)
CUMSUM_INT_AXIS1_2D_SH(I32toI32, int4, read_imagei, int4, write_imagei, convert_int_sat_rte)
__kernel void cumsum_F32toF32_axis0_2D(
__read_only image2d_t input,
@ -316,188 +228,100 @@ __kernel void cumsum_F32toF32_axis0_2D(
}
}
__kernel void cumsum_U8toU8_axis0_2D(
__read_only image2d_t input,
__write_only image2d_t output,
int axis,
int exclusive,
int rev,
int width,
int height,
int chn,
int input_zp,
float in_out_scale,
float in_out_zp_scale,
float output_zp
)
{
int4 coord = (int4)(get_global_id(0), get_global_id(1), get_global_id(0), get_global_id(1));
uint4 sum = (uint4)(0);
uint4 dst = (uint4)(0);
int tmp_zp = convert_int_rte(output_zp);
dst.x = convert_uint_sat(tmp_zp);
float cnt = 0.0f;
if(exclusive && rev)
{
coord.x = width - 1;
coord.z = coord.x;
write_imageui(output, coord.zw, dst);
for(; coord.x > 0; coord.x--)
{
uint4 data = read_imageui(input, coord.xy);
coord.z--;
cnt += 1.0;
sum += data;
float tmpAlpha = cnt * in_out_zp_scale + output_zp;
float tmpSum = sum.x * in_out_scale + tmpAlpha;
dst.x = (uint)convert_int_rte(tmpSum);
write_imageui(output, coord.zw, dst);
}
}
else if(exclusive)
{
coord.z = 0;
write_imageui(output, coord.zw, dst);
for(coord.x = 0; coord.x < width - 1; coord.x++)
{
uint4 data = read_imageui(input, coord.xy);
cnt += 1.0f;
coord.z++;
sum += data;
float tmpAlpha = cnt * in_out_zp_scale + output_zp;
float tmpSum = sum.x * in_out_scale + tmpAlpha;
dst.x = (uint)convert_int_rte(tmpSum);
write_imageui(output, coord.zw, dst);
}
}
else if(rev)
{
for(coord.x = width - 1; coord.x >= 0; coord.x--)
{
uint4 data = read_imageui(input, coord.xy);
cnt += 1.0f;
sum += data;
float tmpAlpha = cnt * in_out_zp_scale + output_zp;
float tmpSum = sum.x * in_out_scale + tmpAlpha;
dst.x = (uint)convert_int_rte(tmpSum);
write_imageui(output, coord.xy, dst);
}
}
else
{
for(coord.x = 0; coord.x < width; coord.x++)
{
uint4 data = read_imageui(input, coord.xy);
cnt += 1.0f;
sum += data;
float tmpAlpha = cnt * in_out_zp_scale + output_zp;
float tmpSum = sum.x * in_out_scale + tmpAlpha;
dst.x = (uint)convert_int_rte(tmpSum);
write_imageui(output, coord.xy, dst);
}
}
}
__kernel void cumsum_F32toU8_axis0_2D(
__read_only image2d_t input,
__write_only image2d_t output,
int axis,
int exclusive,
int rev,
int width,
int height,
int chn,
int input_zp,
float in_out_scale,
float in_out_zp_scale,
float output_zp
)
{
int4 coord = (int4)(get_global_id(0), get_global_id(1), get_global_id(0), get_global_id(1));
float4 sum = (float4)(0);
uint4 dst = (uint4)(0);
int tmp_zp = convert_int_rte(output_zp);
dst.x = convert_uint_sat(tmp_zp);
float cnt = 0.0f;
if(exclusive && rev)
{
coord.x = width - 1;
coord.z = coord.x;
write_imageui(output, coord.zw, dst);
for(; coord.x > 0; coord.x--)
{
float4 data = read_imagef(input, coord.xy);
coord.z--;
cnt += 1.0;
sum += data;
float tmpAlpha = cnt * in_out_zp_scale + output_zp;
float tmpSum = sum.x * in_out_scale + tmpAlpha;
dst.x = (uint)convert_int_rte(tmpSum);
write_imageui(output, coord.zw, dst);
}
}
else if(exclusive)
{
coord.z = 0;
write_imageui(output, coord.zw, dst);
for(coord.x = 0; coord.x < width - 1; coord.x++)
{
float4 data = read_imagef(input, coord.xy);
cnt += 1.0f;
coord.z++;
sum += data;
float tmpAlpha = cnt * in_out_zp_scale + output_zp;
float tmpSum = sum.x * in_out_scale + tmpAlpha;
dst.x = (uint)convert_int_rte(tmpSum);
write_imageui(output, coord.zw, dst);
}
}
else if(rev)
{
for(coord.x = width - 1; coord.x >= 0; coord.x--)
{
float4 data = read_imagef(input, coord.xy);
cnt += 1.0f;
sum += data;
float tmpAlpha = cnt * in_out_zp_scale + output_zp;
float tmpSum = sum.x * in_out_scale + tmpAlpha;
dst.x = (uint)convert_int_rte(tmpSum);
write_imageui(output, coord.xy, dst);
}
}
else
{
for(coord.x = 0; coord.x < width; coord.x++)
{
float4 data = read_imagef(input, coord.xy);
cnt += 1.0f;
sum += data;
float tmpAlpha = cnt * in_out_zp_scale + output_zp;
float tmpSum = sum.x * in_out_scale + tmpAlpha;
dst.x = (uint)convert_int_rte(tmpSum);
write_imageui(output, coord.xy, dst);
}
}
#define CUMSUM_INT_AXIS0_2D_SH(name, src_type, image_read, dst_type, image_write, convert_dtype) \
__kernel void cumsum_##name##_axis0_2D( \
__read_only image2d_t input, \
__write_only image2d_t output, \
int axis, \
int exclusive, \
int rev, \
int width, \
int height, \
int chn, \
int input_zp, \
float in_out_scale, \
float in_out_zp_scale, \
float output_zp \
) \
{ \
int4 coord = (int4)(get_global_id(0), get_global_id(1), get_global_id(0), get_global_id(1)); \
\
src_type sum = (src_type)(0); \
dst_type dst = (dst_type)(0); \
\
int tmp_zp = convert_int_rte(output_zp); \
dst.x = convert_dtype(tmp_zp); \
\
float cnt = 0.0f; \
\
if(exclusive && rev) \
{ \
coord.x = width - 1; \
coord.z = coord.x; \
image_write(output, coord.zw, dst); \
for(; coord.x > 0; coord.x--) \
{ \
src_type data = image_read(input, coord.xy); \
coord.z--; \
cnt += 1.0f; \
sum += data; \
\
float tmpAlpha = cnt * in_out_zp_scale + output_zp; \
float tmpSum = sum.x * in_out_scale + tmpAlpha; \
\
dst.x = convert_dtype(tmpSum); \
image_write(output, coord.zw, dst); \
} \
} \
else if(exclusive) \
{ \
coord.z = 0; \
image_write(output, coord.zw, dst); \
for(coord.x = 0; coord.x < width - 1; coord.x++) \
{ \
src_type data = image_read(input, coord.xy); \
cnt += 1.0f; \
coord.z++; \
sum += data; \
\
float tmpAlpha = cnt * in_out_zp_scale + output_zp; \
float tmpSum = sum.x * in_out_scale + tmpAlpha; \
\
dst.x = convert_dtype(tmpSum); \
image_write(output, coord.zw, dst); \
} \
} \
else if(rev) \
{ \
for(coord.x = width - 1; coord.x >= 0; coord.x--) \
{ \
src_type data = image_read(input, coord.xy); \
cnt += 1.0f; \
sum += data; \
\
float tmpAlpha = cnt * in_out_zp_scale + output_zp; \
float tmpSum = sum.x * in_out_scale + tmpAlpha; \
\
dst.x = convert_dtype(tmpSum); \
image_write(output, coord.xy, dst); \
} \
} \
else \
{ \
for(coord.x = 0; coord.x < width; coord.x++) \
{ \
src_type data = image_read(input, coord.xy); \
cnt += 1.0f; \
sum += data; \
\
float tmpAlpha = cnt * in_out_zp_scale + output_zp; \
float tmpSum = sum.x * in_out_scale + tmpAlpha; \
\
dst.x = convert_dtype(tmpSum); \
image_write(output, coord.xy, dst); \
} \
} \
}
CUMSUM_INT_AXIS0_2D_SH(U8toU8, uint4, read_imageui, uint4, write_imageui, convert_uint_sat_rte)
CUMSUM_INT_AXIS0_2D_SH(F32toU8, float4, read_imagef, uint4, write_imageui, convert_uint_sat_rte)
CUMSUM_INT_AXIS0_2D_SH(I32toI32, int4, read_imagei, int4, write_imagei, convert_int_sat_rte)

View File

@ -132,3 +132,30 @@ __kernel void one_hot_U8toU8
coord.z ++;
} while (coord.z < depth);
}
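/* BF16 variant (an assumption from the write path below): on_value / off_value arrive as
   raw bfloat16 bit patterns widened to uint, so they can be written unchanged through the
   integer image view. */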
__kernel void one_hot_I32toBF16
(
__read_only image2d_t input,
__write_only image2d_array_t output,
int depth,
uint on_value,
uint off_value,
float inputScale,
float inputTail
)
{
int4 coord = (int4)(get_global_id(0), get_global_id(1), 0, 0);
int4 src = read_imagei(input, coord.xy);
int val = convert_int(convert_float(src.x) * inputScale - inputTail);
do
{
uint4 dst;
dst.x = val == coord.z ? on_value : off_value;
write_imageui(output, coord.xzyw, dst.xxxx);
coord.z ++;
} while (coord.z < depth);
}

View File

@ -0,0 +1,373 @@
__kernel void rope_F32_F32toF32_axis0
(
__read_only image2d_array_t input,
__read_only image2d_array_t cos_cache,
__read_only image2d_array_t sin_cache,
__write_only image2d_array_t output,
int axis,
float input_zp,
float cos_zp,
float sin_zp,
float scale0,
float scale1,
float output_zp,
int half_head_size,
int step
)
{
int4 coord = (int4)(get_global_id(0), get_global_id(1), get_global_id(2), get_global_id(2));
float4 cos, sin;
READ_IMAGEF_2DARRAY(cos, cos_cache, coord);
READ_IMAGEF_2DARRAY(sin, sin_cache, coord);
coord.x = coord.x * step;
float4 src0 = read_imagef(input, coord);
int4 coord_out = coord;
coord.x += half_head_size;
float4 src1 = read_imagef(input, coord);
float4 dst0 = src0 * cos - src1 * sin;
float4 dst1 = src0 * sin + src1 * cos;
write_imagef(output, coord_out, dst0);
coord_out.x += half_head_size;
write_imagef(output, coord_out, dst1);
}
__kernel void rope_F32_F32toF32_axis1
(
__read_only image2d_array_t input,
__read_only image2d_array_t cos_cache,
__read_only image2d_array_t sin_cache,
__write_only image2d_array_t output,
int axis,
float input_zp,
float cos_zp,
float sin_zp,
float scale0,
float scale1,
float output_zp,
int half_head_size,
int step
)
{
int4 coord = (int4)(get_global_id(0), get_global_id(1), get_global_id(2), get_global_id(2));
float4 cos, sin;
READ_IMAGEF_2DARRAY(cos, cos_cache, coord);
READ_IMAGEF_2DARRAY(sin, sin_cache, coord);
coord.y = coord.y * step;
float4 src0 = read_imagef(input, coord);
int4 coord_out = coord;
coord.y += half_head_size;
float4 src1 = read_imagef(input, coord);
float4 dst0 = src0 * cos - src1 * sin;
float4 dst1 = src0 * sin + src1 * cos;
write_imagef(output, coord_out, dst0);
coord_out.y += half_head_size;
write_imagef(output, coord_out, dst1);
}
__kernel void rope_F32_F32toF32_axis2
(
__read_only image2d_array_t input,
__read_only image2d_array_t cos_cache,
__read_only image2d_array_t sin_cache,
__write_only image2d_array_t output,
int axis,
float input_zp,
float cos_zp,
float sin_zp,
float scale0,
float scale1,
float output_zp,
int half_head_size,
int step
)
{
int4 coord = (int4)(get_global_id(0), get_global_id(1), get_global_id(2), get_global_id(2));
float4 cos = read_imagef(cos_cache, coord);
float4 sin = read_imagef(sin_cache, coord);
coord.z = coord.z * step;
float4 src0 = read_imagef(input, coord);
int4 coord_out = coord;
coord.z += half_head_size;
float4 src1 = read_imagef(input, coord);
float4 dst0 = src0 * cos - src1 * sin;
float4 dst1 = src0 * sin + src1 * cos;
write_imagef(output, coord_out, dst0);
coord_out.z += half_head_size;
write_imagef(output, coord_out, dst1);
}
__kernel void rope_I32_I32toI32_axis0
(
__read_only image2d_array_t input,
__read_only image2d_array_t cos_cache,
__read_only image2d_array_t sin_cache,
__write_only image2d_array_t output,
int axis,
float input_zp,
float cos_zp,
float sin_zp,
float scale0,
float scale1,
float output_zp,
int half_head_size,
int step
)
{
int4 coord = (int4)(get_global_id(0), get_global_id(1), get_global_id(2), get_global_id(2));
int4 _cos, _sin;
float4 cos, sin;
READ_IMAGEI_2DARRAY(_cos, cos_cache, coord);
READ_IMAGEI_2DARRAY(_sin, sin_cache, coord);
coord.x = coord.x * step;
float4 src0 = convert_float4(read_imagei(input, coord));
int4 coord_out = coord;
coord.x += half_head_size;
float4 src1 = convert_float4(read_imagei(input, coord));
src0 = src0 - input_zp;
src1 = src1 - input_zp;
cos = convert_float4(_cos) - cos_zp;
sin = convert_float4(_sin) - sin_zp;
float4 _dst0 = src0 * cos * scale0 - src1 * sin * scale1 + output_zp;
float4 _dst1 = src0 * sin * scale1 + src1 * cos * scale0 + output_zp;
int4 dst0 = convert_int4_rte(_dst0);
int4 dst1 = convert_int4_rte(_dst1);
write_imagei(output, coord_out, dst0);
coord_out.x += half_head_size;
write_imagei(output, coord_out, dst1);
}
__kernel void rope_I32_I32toI32_axis1
(
__read_only image2d_array_t input,
__read_only image2d_array_t cos_cache,
__read_only image2d_array_t sin_cache,
__write_only image2d_array_t output,
int axis,
float input_zp,
float cos_zp,
float sin_zp,
float scale0,
float scale1,
float output_zp,
int half_head_size,
int step
)
{
int4 coord = (int4)(get_global_id(0), get_global_id(1), get_global_id(2), get_global_id(2));
int4 _cos, _sin;
float4 cos, sin;
READ_IMAGEI_2DARRAY(_cos, cos_cache, coord);
READ_IMAGEI_2DARRAY(_sin, sin_cache, coord);
coord.y = coord.y * step;
float4 src0 = convert_float4(read_imagei(input, coord));
int4 coord_out = coord;
coord.y += half_head_size;
float4 src1 = convert_float4(read_imagei(input, coord));
src0 = src0 - input_zp;
src1 = src1 - input_zp;
cos = convert_float4(_cos) - cos_zp;
sin = convert_float4(_sin) - sin_zp;
float4 _dst0 = src0 * cos * scale0 - src1 * sin * scale1 + output_zp;
float4 _dst1 = src0 * sin * scale1 + src1 * cos * scale0 + output_zp;
int4 dst0 = convert_int4_rte(_dst0);
int4 dst1 = convert_int4_rte(_dst1);
write_imagei(output, coord_out, dst0);
coord_out.y += half_head_size;
write_imagei(output, coord_out, dst1);
}
__kernel void rope_I32_I32toI32_axis2
(
__read_only image2d_array_t input,
__read_only image2d_array_t cos_cache,
__read_only image2d_array_t sin_cache,
__write_only image2d_array_t output,
int axis,
float input_zp,
float cos_zp,
float sin_zp,
float scale0,
float scale1,
float output_zp,
int half_head_size,
int step
)
{
int4 coord = (int4)(get_global_id(0), get_global_id(1), get_global_id(2), get_global_id(2));
float4 cos = convert_float4(read_imagei(cos_cache, coord));
float4 sin = convert_float4(read_imagei(sin_cache, coord));
coord.z = coord.z * step;
float4 src0 = convert_float4(read_imagei(input, coord));
int4 coord_out = coord;
coord.z += half_head_size;
float4 src1 = convert_float4(read_imagei(input, coord));
src0 = src0 - input_zp;
src1 = src1 - input_zp;
cos = cos - cos_zp;
sin = sin - sin_zp;
float4 _dst0 = src0 * cos * scale0 - src1 * sin * scale1 + output_zp;
float4 _dst1 = src0 * sin * scale1 + src1 * cos * scale0 + output_zp;
int4 dst0 = convert_int4_rte(_dst0);
int4 dst1 = convert_int4_rte(_dst1);
write_imagei(output, coord_out, dst0);
coord_out.z += half_head_size;
write_imagei(output, coord_out, dst1);
}
__kernel void rope_U32_U32toU32_axis0
(
__read_only image2d_array_t input,
__read_only image2d_array_t cos_cache,
__read_only image2d_array_t sin_cache,
__write_only image2d_array_t output,
int axis,
float input_zp,
float cos_zp,
float sin_zp,
float scale0,
float scale1,
float output_zp,
int half_head_size,
int step
)
{
int4 coord = (int4)(get_global_id(0), get_global_id(1), get_global_id(2), get_global_id(2));
uint4 _cos, _sin;
float4 cos, sin;
READ_IMAGEUI_2DARRAY(_cos, cos_cache, coord);
READ_IMAGEUI_2DARRAY(_sin, sin_cache, coord);
coord.x = coord.x * step;
float4 src0 = convert_float4(read_imageui(input, coord));
int4 coord_out = coord;
coord.x += half_head_size;
float4 src1 = convert_float4(read_imageui(input, coord));
src0 = src0 - input_zp;
src1 = src1 - input_zp;
cos = convert_float4(_cos) - cos_zp;
sin = convert_float4(_sin) - sin_zp;
float4 _dst0 = src0 * cos * scale0 - src1 * sin * scale1 + output_zp;
float4 _dst1 = src0 * sin * scale1 + src1 * cos * scale0 + output_zp;
uint4 dst0 = convert_uint4_rte(_dst0);
uint4 dst1 = convert_uint4_rte(_dst1);
write_imageui(output, coord_out, dst0);
coord_out.x += half_head_size;
write_imageui(output, coord_out, dst1);
}
__kernel void rope_U32_U32toU32_axis1
(
__read_only image2d_array_t input,
__read_only image2d_array_t cos_cache,
__read_only image2d_array_t sin_cache,
__write_only image2d_array_t output,
int axis,
float input_zp,
float cos_zp,
float sin_zp,
float scale0,
float scale1,
float output_zp,
int half_head_size,
int step
)
{
int4 coord = (int4)(get_global_id(0), get_global_id(1), get_global_id(2), get_global_id(2));
uint4 _cos, _sin;
float4 cos, sin;
READ_IMAGEUI_2DARRAY(_cos, cos_cache, coord);
READ_IMAGEUI_2DARRAY(_sin, sin_cache, coord);
coord.y = coord.y * step;
float4 src0 = convert_float4(read_imageui(input, coord));
int4 coord_out = coord;
coord.y += half_head_size;
float4 src1 = convert_float4(read_imageui(input, coord));
src0 = src0 - input_zp;
src1 = src1 - input_zp;
cos = convert_float4(_cos) - cos_zp;
sin = convert_float4(_sin) - sin_zp;
float4 _dst0 = src0 * cos * scale0 - src1 * sin * scale1 + output_zp;
float4 _dst1 = src0 * sin * scale1 + src1 * cos * scale0 + output_zp;
uint4 dst0 = convert_uint4_rte(_dst0);
uint4 dst1 = convert_uint4_rte(_dst1);
write_imageui(output, coord_out, dst0);
coord_out.y += half_head_size;
write_imageui(output, coord_out, dst1);
}
__kernel void rope_U32_U32toU32_axis2
(
__read_only image2d_array_t input,
__read_only image2d_array_t cos_cache,
__read_only image2d_array_t sin_cache,
__write_only image2d_array_t output,
int axis,
float input_zp,
float cos_zp,
float sin_zp,
float scale0,
float scale1,
float output_zp,
int half_head_size,
int step
)
{
int4 coord = (int4)(get_global_id(0), get_global_id(1), get_global_id(2), get_global_id(2));
float4 cos = convert_float4(read_imageui(cos_cache, coord));
float4 sin = convert_float4(read_imageui(sin_cache, coord));
coord.z = coord.z * step;
float4 src0 = convert_float4(read_imageui(input, coord));
int4 coord_out = coord;
coord.z += half_head_size;
float4 src1 = convert_float4(read_imageui(input, coord));
src0 = src0 - input_zp;
src1 = src1 - input_zp;
cos = cos - cos_zp;
sin = sin - sin_zp;
float4 _dst0 = src0 * cos * scale0 - src1 * sin * scale1 + output_zp;
float4 _dst1 = src0 * sin * scale1 + src1 * cos * scale0 + output_zp;
uint4 dst0 = convert_uint4_rte(_dst0);
uint4 dst1 = convert_uint4_rte(_dst1);
write_imageui(output, coord_out, dst0);
coord_out.z += half_head_size;
write_imageui(output, coord_out, dst1);
}
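// For reference, a minimal sketch (hypothetical, not part of the source) of the rotation
// all rope_* kernels above implement, with the rotated pair split at half_head_size:
void rope_rotate_ref(const float* x, const float* c, const float* s,
                     float* out, int half)
{
    for (int i = 0; i < half; ++i)
    {
        out[i]        = x[i] * c[i] - x[i + half] * s[i]; /* dst0 = src0*cos - src1*sin */
        out[i + half] = x[i] * s[i] + x[i + half] * c[i]; /* dst1 = src0*sin + src1*cos */
    }
}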

View File

@ -0,0 +1,307 @@
#include "cl_viv_vx_ext.h"
_viv_uniform int top;
_viv_uniform int left;
_viv_uniform float out_scale_r;
_viv_uniform float out_scale_g;
_viv_uniform float out_scale_b;
_viv_uniform float out_zp_r;
_viv_uniform float out_zp_g;
_viv_uniform float out_zp_b;
_viv_uniform float pad_v_r;
_viv_uniform float pad_v_g;
_viv_uniform float pad_v_b;
_viv_uniform float scale_w;
_viv_uniform float scale_h;
_viv_uniform int resize_max_w;
_viv_uniform int resize_max_h;
_viv_uniform int out_height;
_viv_uniform int r_order;
_viv_uniform int b_order;
_viv_uniform VXC_512Bits uniU8RightSubLeft_4x4;
_viv_uniform VXC_512Bits uniLeftToFloat32_4x4;
_viv_uniform VXC_512Bits uniExtactHalf8_2x8;
_viv_uniform VXC_512Bits uniExtract8Data_2x8;
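/* r_order / b_order hold the output row offsets of the R and B planes (G always sits at
   out_height); swapping them implements reverse_channel for the planar RGB/BGR layout
   written below (an inference from the coord.y offsets). */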
__kernel void custom_letterbox_U8toU8
(
__read_only image2d_t input,
__write_only image2d_t output,
int top_,
int bottom_,
int left_,
int right_,
float mean_r_,
float mean_g_,
float mean_b_,
float scale_r_,
float scale_g_,
float scale_b_,
int pad_r_,
int pad_g_,
int pad_b_,
int reverse_channel
)
{
int2 coord_out = (int2)(get_global_id(0), get_global_id(1));
int2 coord = coord_out;
uint4 dst = (uint4)(0,0,0,0);
vxc_uchar8 result;
if (coord_out.x < left || coord_out.x >= resize_max_w ||
coord_out.y < top || coord_out.y >= resize_max_h)
{
dst.x = convert_uint(pad_v_r);
coord.y = coord_out.y + r_order;
VXC_DP2x8(result, dst, dst, VXC_MODIFIER(0, 0, 0, VXC_RM_TowardZero, 0), uniExtract8Data_2x8);
VXC_WriteImage(output, coord, result, VXC_MODIFIER(0, 0, 0, VXC_RM_TowardZero, 0));
dst.x = convert_uint(pad_v_g);
coord.y = coord_out.y + out_height;
VXC_DP2x8(result, dst, dst, VXC_MODIFIER(0, 0, 0, VXC_RM_TowardZero, 0), uniExtract8Data_2x8);
VXC_WriteImage(output, coord, result, VXC_MODIFIER(0, 0, 0, VXC_RM_TowardZero, 0));
dst.x = convert_uint(pad_v_b);
coord.y = coord_out.y + b_order;
VXC_DP2x8(result, dst, dst, VXC_MODIFIER(0, 0, 0, VXC_RM_TowardZero, 0), uniExtract8Data_2x8);
VXC_WriteImage(output, coord, result, VXC_MODIFIER(0, 0, 0, VXC_RM_TowardZero, 0));
return;
}
float in_x = convert_float(coord_out.x - left) * scale_w;
float in_y = convert_float(coord_out.y - top) * scale_h;
float left_x_f = floor(in_x);
float top_y_f = floor(in_y);
float x_lerp = in_x - left_x_f;
float y_lerp = in_y - top_y_f;
int left_x_idx = convert_int(left_x_f);
int top_y_idx = convert_int(top_y_f);
int2 coord_in = (int2)(3 * left_x_idx, top_y_idx);
vxc_uchar8 top_data, bottom_data;
VXC_ReadImage(top_data, input, coord_in.xy, VXC_5BITOFFSET_XY(0, 0), \
VXC_MODIFIER(0, 5, 0, VXC_RM_TowardZero, 0));
VXC_ReadImage(bottom_data, input, coord_in.xy, VXC_5BITOFFSET_XY(0, 1), \
VXC_MODIFIER(0, 5, 0, VXC_RM_TowardZero, 0));
float4 left4 = (float4)(0,0,0,0);
float4 right4 = (float4)(0,0,0,0);
float4 top4 = (float4)(0,0,0,0);
float4 bottom4 = (float4)(0,0,0,0);
VXC_DP4x4(right4, top_data, top_data, VXC_MODIFIER(0, 2, 0, VXC_RM_TowardZero, 0), uniU8RightSubLeft_4x4);
VXC_DP4x4(left4, top_data, top_data, VXC_MODIFIER(0, 2, 0, VXC_RM_TowardZero, 0), uniLeftToFloat32_4x4);
top4 = right4 * x_lerp + left4;
VXC_DP4x4(right4, bottom_data, bottom_data, VXC_MODIFIER(0, 2, 0, VXC_RM_TowardZero, 0), uniU8RightSubLeft_4x4);
VXC_DP4x4(left4, bottom_data, bottom_data, VXC_MODIFIER(0, 2, 0, VXC_RM_TowardZero, 0), uniLeftToFloat32_4x4);
bottom4 = right4 * x_lerp + left4;
float4 out = (bottom4 - top4) * y_lerp + top4;
dst.x = convert_uint(out.x * out_scale_r + out_zp_r);
coord.y = coord_out.y + r_order;
VXC_DP2x8(result, dst, dst, VXC_MODIFIER(0, 0, 0, VXC_RM_TowardZero, 0), uniExtract8Data_2x8);
VXC_WriteImage(output, coord, result, VXC_MODIFIER(0, 0, 0, VXC_RM_TowardZero, 0));
dst.x = convert_uint(out.y * out_scale_g + out_zp_g);
coord.y = coord_out.y + out_height;
VXC_DP2x8(result, dst, dst, VXC_MODIFIER(0, 0, 0, VXC_RM_TowardZero, 0), uniExtract8Data_2x8);
VXC_WriteImage(output, coord, result, VXC_MODIFIER(0, 0, 0, VXC_RM_TowardZero, 0));
dst.x = convert_uint(out.z * out_scale_b + out_zp_b);
coord.y = coord_out.y + b_order;
VXC_DP2x8(result, dst, dst, VXC_MODIFIER(0, 0, 0, VXC_RM_TowardZero, 0), uniExtract8Data_2x8);
VXC_WriteImage(output, coord, result, VXC_MODIFIER(0, 0, 0, VXC_RM_TowardZero, 0));
}
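/* The sampling above is plain bilinear interpolation: each row is first lerped in x
   (right * x_lerp + left), then the two rows are lerped in y. The I8 and F16 variants
   below differ only in the final conversion and write path. */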
__kernel void custom_letterbox_U8toI8
(
__read_only image2d_t input,
__write_only image2d_t output,
int top_,
int bottom_,
int left_,
int right_,
float mean_r_,
float mean_g_,
float mean_b_,
float scale_r_,
float scale_g_,
float scale_b_,
int pad_r_,
int pad_g_,
int pad_b_,
int reverse_channel
)
{
int2 coord_out = (int2)(get_global_id(0), get_global_id(1));
int2 coord = coord_out;
int4 dst = (int4)(0,0,0,0);
vxc_char8 result;
if (coord_out.x < left || coord_out.x >= resize_max_w ||
coord_out.y < top || coord_out.y >= resize_max_h)
{
dst.x = convert_int(pad_v_r);
coord.y = coord_out.y + r_order;
VXC_DP2x8(result, dst, dst, VXC_MODIFIER(0, 0, 0, VXC_RM_TowardZero, 0), uniExtract8Data_2x8);
VXC_WriteImage(output, coord, result, VXC_MODIFIER(0, 0, 0, VXC_RM_TowardZero, 0));
dst.x = convert_int(pad_v_g);
coord.y = coord_out.y + out_height;
VXC_DP2x8(result, dst, dst, VXC_MODIFIER(0, 0, 0, VXC_RM_TowardZero, 0), uniExtract8Data_2x8);
VXC_WriteImage(output, coord, result, VXC_MODIFIER(0, 0, 0, VXC_RM_TowardZero, 0));
dst.x = convert_int(pad_v_b);
coord.y = coord_out.y + b_order;
VXC_DP2x8(result, dst, dst, VXC_MODIFIER(0, 0, 0, VXC_RM_TowardZero, 0), uniExtract8Data_2x8);
VXC_WriteImage(output, coord, result, VXC_MODIFIER(0, 0, 0, VXC_RM_TowardZero, 0));
return;
}
float in_x = convert_float(coord_out.x - left) * scale_w;
float in_y = convert_float(coord_out.y - top) * scale_h;
float left_x_f = floor(in_x);
float top_y_f = floor(in_y);
float x_lerp = in_x - left_x_f;
float y_lerp = in_y - top_y_f;
int left_x_idx = convert_int(left_x_f);
int top_y_idx = convert_int(top_y_f);
int2 coord_in = (int2)(3 * left_x_idx, top_y_idx);
vxc_char8 top_data, bottom_data;
VXC_ReadImage(top_data, input, coord_in.xy, VXC_5BITOFFSET_XY(0, 0), \
VXC_MODIFIER(0, 5, 0, VXC_RM_TowardZero, 0));
VXC_ReadImage(bottom_data, input, coord_in.xy, VXC_5BITOFFSET_XY(0, 1), \
VXC_MODIFIER(0, 5, 0, VXC_RM_TowardZero, 0));
float4 left4 = (float4)(0,0,0,0);
float4 right4 = (float4)(0,0,0,0);
float4 top4 = (float4)(0,0,0,0);
float4 bottom4 = (float4)(0,0,0,0);
VXC_DP4x4(right4, top_data, top_data, VXC_MODIFIER(0, 2, 0, VXC_RM_TowardZero, 0), uniU8RightSubLeft_4x4);
VXC_DP4x4(left4, top_data, top_data, VXC_MODIFIER(0, 2, 0, VXC_RM_TowardZero, 0), uniLeftToFloat32_4x4);
top4 = right4 * x_lerp + left4;
VXC_DP4x4(right4, bottom_data, bottom_data, VXC_MODIFIER(0, 2, 0, VXC_RM_TowardZero, 0), uniU8RightSubLeft_4x4);
VXC_DP4x4(left4, bottom_data, bottom_data, VXC_MODIFIER(0, 2, 0, VXC_RM_TowardZero, 0), uniLeftToFloat32_4x4);
bottom4 = right4 * x_lerp + left4;
float4 out = (bottom4 - top4) * y_lerp + top4;
dst.x = convert_int(out.x * out_scale_r + out_zp_r);
coord.y = coord_out.y + r_order;
VXC_DP2x8(result, dst, dst, VXC_MODIFIER(0, 0, 0, VXC_RM_TowardZero, 0), uniExtract8Data_2x8);
VXC_WriteImage(output, coord, result, VXC_MODIFIER(0, 0, 0, VXC_RM_TowardZero, 0));
dst.x = convert_int(out.y * out_scale_g + out_zp_g);
coord.y = coord_out.y + out_height;
VXC_DP2x8(result, dst, dst, VXC_MODIFIER(0, 0, 0, VXC_RM_TowardZero, 0), uniExtract8Data_2x8);
VXC_WriteImage(output, coord, result, VXC_MODIFIER(0, 0, 0, VXC_RM_TowardZero, 0));
dst.x = convert_int(out.z * out_scale_b + out_zp_b);
coord.y = coord_out.y + b_order;
VXC_DP2x8(result, dst, dst, VXC_MODIFIER(0, 0, 0, VXC_RM_TowardZero, 0), uniExtract8Data_2x8);
VXC_WriteImage(output, coord, result, VXC_MODIFIER(0, 0, 0, VXC_RM_TowardZero, 0));
}
__kernel void custom_letterbox_U8toF16
(
__read_only image2d_t input,
__write_only image2d_t output,
int top_,
int bottom_,
int left_,
int right_,
float mean_r_,
float mean_g_,
float mean_b_,
float scale_r_,
float scale_g_,
float scale_b_,
int pad_r_,
int pad_g_,
int pad_b_,
int reverse_channel
)
{
int2 coord_out = (int2)(get_global_id(0), get_global_id(1));
int2 coord = coord_out;
half4 tmp;
vxc_half8 dst_temp;
vxc_ushort8 dst;
if (coord_out.x < left || coord_out.x >= resize_max_w ||
coord_out.y < top || coord_out.y >= resize_max_h)
{
float4 pad = (float4)(pad_v_r, pad_v_g, pad_v_b, 0);
_viv_asm(CONV, tmp, pad);
VXC_DP2x8(dst_temp, tmp, tmp, VXC_MODIFIER(0, 0, 0, VXC_RM_TowardZero, 0), uniExtactHalf8_2x8);
_viv_asm(COPY, dst, dst_temp, 16);
coord.y = coord_out.y + r_order;
VXC_WriteImage(output, coord, dst, VXC_MODIFIER(0, 0, 0, VXC_RM_TowardZero, 0));
tmp.x = tmp.y;
VXC_DP2x8(dst_temp, tmp, tmp, VXC_MODIFIER(0, 0, 0, VXC_RM_TowardZero, 0), uniExtactHalf8_2x8);
_viv_asm(COPY, dst, dst_temp, 16);
coord.y = coord_out.y + out_height;
VXC_WriteImage(output, coord, dst, VXC_MODIFIER(0, 0, 0, VXC_RM_TowardZero, 0));
tmp.x = tmp.z;
VXC_DP2x8(dst_temp, tmp, tmp, VXC_MODIFIER(0, 0, 0, VXC_RM_TowardZero, 0), uniExtactHalf8_2x8);
_viv_asm(COPY, dst, dst_temp, 16);
coord.y = coord_out.y + b_order;
VXC_WriteImage(output, coord, dst, VXC_MODIFIER(0, 0, 0, VXC_RM_TowardZero, 0));
return;
}
float in_x = convert_float(coord_out.x - left) * scale_w;
float in_y = convert_float(coord_out.y - top) * scale_h;
float left_x_f = floor(in_x);
float top_y_f = floor(in_y);
float x_lerp = in_x - left_x_f;
float y_lerp = in_y - top_y_f;
int left_x_idx = convert_int(left_x_f);
int top_y_idx = convert_int(top_y_f);
int2 coord_in = (int2)(3 * left_x_idx, top_y_idx);
vxc_uchar8 top_data, bottom_data;
VXC_ReadImage(top_data, input, coord_in.xy, VXC_5BITOFFSET_XY(0, 0), \
VXC_MODIFIER(0, 5, 0, VXC_RM_TowardZero, 0));
VXC_ReadImage(bottom_data, input, coord_in.xy, VXC_5BITOFFSET_XY(0, 1), \
VXC_MODIFIER(0, 5, 0, VXC_RM_TowardZero, 0));
float4 left4 = (float4)(0,0,0,0);
float4 right4 = (float4)(0,0,0,0);
float4 top4 = (float4)(0,0,0,0);
float4 bottom4 = (float4)(0,0,0,0);
VXC_DP4x4(right4, top_data, top_data, VXC_MODIFIER(0, 2, 0, VXC_RM_TowardZero, 0), uniU8RightSubLeft_4x4);
VXC_DP4x4(left4, top_data, top_data, VXC_MODIFIER(0, 2, 0, VXC_RM_TowardZero, 0), uniLeftToFloat32_4x4);
top4 = right4 * x_lerp + left4;
VXC_DP4x4(right4, bottom_data, bottom_data, VXC_MODIFIER(0, 2, 0, VXC_RM_TowardZero, 0), uniU8RightSubLeft_4x4);
VXC_DP4x4(left4, bottom_data, bottom_data, VXC_MODIFIER(0, 2, 0, VXC_RM_TowardZero, 0), uniLeftToFloat32_4x4);
bottom4 = right4 * x_lerp + left4;
float4 out = (bottom4 - top4) * y_lerp + top4;
float4 out_temp = (float4)(0,0,0,0);
out_temp.x = out.x * out_scale_r + out_zp_r;
_viv_asm(CONV, tmp, out_temp);
VXC_DP2x8(dst_temp, tmp, tmp, VXC_MODIFIER(0, 0, 0, VXC_RM_TowardZero, 0), uniExtactHalf8_2x8);
_viv_asm(COPY, dst, dst_temp, 16);
coord.y = coord_out.y + r_order;
VXC_WriteImage(output, coord, dst, VXC_MODIFIER(0, 0, 0, VXC_RM_TowardZero, 0));
out_temp.x = out.y * out_scale_g + out_zp_g;
_viv_asm(CONV, tmp, out_temp);
VXC_DP2x8(dst_temp, tmp, tmp, VXC_MODIFIER(0, 0, 0, VXC_RM_TowardZero, 0), uniExtactHalf8_2x8);
_viv_asm(COPY, dst, dst_temp, 16);
coord.y = coord_out.y + out_height;
VXC_WriteImage(output, coord, dst, VXC_MODIFIER(0, 0, 0, VXC_RM_TowardZero, 0));
out_temp.x = out.z * out_scale_b + out_zp_b;
_viv_asm(CONV, tmp, out_temp);
VXC_DP2x8(dst_temp, tmp, tmp, VXC_MODIFIER(0, 0, 0, VXC_RM_TowardZero, 0), uniExtactHalf8_2x8);
_viv_asm(COPY, dst, dst_temp, 16);
coord.y = coord_out.y + b_order;
VXC_WriteImage(output, coord, dst, VXC_MODIFIER(0, 0, 0, VXC_RM_TowardZero, 0));
}

View File

@ -10,7 +10,12 @@
#include "cl_viv_vx_ext.h"
_viv_uniform VXC_512Bits Uni4x4_Fp16ToFp32;
_viv_uniform VXC_512Bits uniExtract8Bin_2x8;
_viv_uniform int sf_size;
_viv_uniform float srcScale;
_viv_uniform float srcZP;
_viv_uniform float dstScale;
_viv_uniform float dstZP;
#define F_MAX(a,b) ((a)>(b)?(a):(b))
__kernel void Softmax2VXC
(
@ -19,35 +24,37 @@ __kernel void Softmax2VXC
int axis
)
{
int4 coord_in = (int4)(0,0,0,0);
float fMax = 0.0;
int4 coord_in = (int4)(0, get_global_id(0), 0, 0);
float fMax = 0;
for (int i = 0; i < sf_size; i++)
{
vxc_char8 val;
vxc_short8 val;
vxc_half8 val_h;
coord_in.x = i;
VXC_ReadImage2DArray(val, input, coord_in, 0, VXC_MODIFIER(0,0, 0, VXC_RM_TowardZero, 0));
VXC_ReadImage2DArray(val, input, coord_in, 0, VXC_MODIFIER(0, 0, 0, VXC_RM_TowardZero, 0));
_viv_asm(COPY, val_h, val, 16);
float fval;
VXC_DP4x4(fval, val, val, VXC_MODIFIER(0, 0, 0, VXC_RM_TowardZero, 1), Uni4x4_Fp16ToFp32);
VXC_DP4x4(fval, val_h, val_h, VXC_MODIFIER(0, 0, 0, VXC_RM_TowardZero, 1), Uni4x4_Fp16ToFp32);
fMax = F_MAX(fMax, fval);
}
float fProbSum = 0.0f;
vxc_short8 dst;
for (int i = 0; i < sf_size; i++)
{
vxc_char8 val;
vxc_short8 val;
vxc_half8 val_h;
coord_in.x = i;
VXC_ReadImage2DArray(val, input, coord_in, 0, VXC_MODIFIER(0,0, 0, VXC_RM_TowardZero, 0));
VXC_ReadImage2DArray(val, input, coord_in, 0, VXC_MODIFIER(0, 0, 0, VXC_RM_TowardZero, 0));
_viv_asm(COPY, val_h, val, 16);
float fval;
VXC_DP4x4(fval, val, val, VXC_MODIFIER(0, 0, 0, VXC_RM_TowardZero, 1), Uni4x4_Fp16ToFp32);
VXC_DP4x4(fval, val_h, val_h, VXC_MODIFIER(0, 0, 0, VXC_RM_TowardZero, 1), Uni4x4_Fp16ToFp32);
float fOut = (float)exp(fval - fMax);
fProbSum += fOut;
half hVal;
_viv_asm(CONV,hVal,fOut);
_viv_asm(COPY,dst,hVal, 4);
_viv_asm(CONV, hVal, fOut);
_viv_asm(COPY, dst, hVal, 4);
VXC_WriteImage2DArray(output, coord_in, dst, VXC_MODIFIER(0, 0, 0, VXC_RM_TowardZero, 0));
}
@ -56,15 +63,68 @@ __kernel void Softmax2VXC
vxc_short8 val;
vxc_half8 val_h;
coord_in.x = i;
VXC_ReadImage2DArray(val, output, coord_in, 0, VXC_MODIFIER(0,0, 0, VXC_RM_TowardZero, 0));
VXC_ReadImage2DArray(val, output, coord_in, 0, VXC_MODIFIER(0, 0, 0, VXC_RM_TowardZero, 0));
float fval;
_viv_asm(COPY, val_h, val, 16);
VXC_DP4x4(fval, val_h, val_h, VXC_MODIFIER(0, 0, 0, VXC_RM_TowardZero, 1), Uni4x4_Fp16ToFp32);
float fOut =fval/fProbSum;
float fOut = fval / fProbSum;
half hVal;
_viv_asm(CONV,hVal,fOut);
_viv_asm(CONV, hVal, fOut);
_viv_asm(COPY, dst, hVal, 4);
VXC_WriteImage2DArray(output, coord_in, dst, VXC_MODIFIER(0, 0, 0, VXC_RM_TowardZero, 0));
}
}
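// Softmax2VXC evaluates softmax(x_i) = exp(x_i - max(x)) / sum_j exp(x_j - max(x))
// in three passes along the sf_size axis: pass 1 finds the maximum, pass 2
// stores the unnormalized exp(x_i - max) to the output while accumulating
// fProbSum, and pass 3 reads those values back and divides by the sum in place.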
__kernel void Softmax2VXC_u8
(
image2d_array_t input,
image2d_array_t output,
int axis
)
{
int4 coord_in = (int4)(0, get_global_id(0), 0, 0);
float fMax = -3.4e38f;
for (int i = 0; i < sf_size; i++)
{
vxc_uchar8 val;
coord_in.x = i;
VXC_ReadImage2DArray(val, input, coord_in, 0, VXC_MODIFIER(0, 0, 0, VXC_RM_TowardZero, 0));
float fval;
VXC_DP4x4(fval, val, val, VXC_MODIFIER(0, 0, 0, VXC_RM_TowardZero, 1), Uni4x4_Fp16ToFp32);
fval = (fval - srcZP) * srcScale;
fMax = F_MAX(fMax, fval);
}
float fProbSum = 0.0f;
vxc_uchar8 dst;
for (int i = 0; i < sf_size; i++)
{
vxc_uchar8 val;
coord_in.x = i;
VXC_ReadImage2DArray(val, input, coord_in, 0, VXC_MODIFIER(0, 0, 0, VXC_RM_TowardZero, 0));
float fval;
VXC_DP4x4(fval, val, val, VXC_MODIFIER(0, 0, 0, VXC_RM_TowardZero, 1), Uni4x4_Fp16ToFp32);
fval = (fval - srcZP) * srcScale;
float fOut = (float)exp(fval - fMax);
fProbSum += fOut;
}
for (int i = 0; i < sf_size; i++)
{
vxc_uchar8 val;
coord_in.x = i;
VXC_ReadImage2DArray(val, input, coord_in, 0, VXC_MODIFIER(0, 0, 0, VXC_RM_TowardZero, 0));
float fval;
VXC_DP4x4(fval, val, val, VXC_MODIFIER(0, 0, 0, VXC_RM_TowardZero, 1), Uni4x4_Fp16ToFp32);
fval = (fval - srcZP) * srcScale;
float fOut = exp(fval - fMax) / fProbSum;
fOut = fOut * dstScale + dstZP;
short dst0;
_viv_asm(CONV, dst0, fOut);
VXC_DP2x8(dst, dst0, dst0, VXC_MODIFIER(0, 0, 0, VXC_RM_TowardZero, 1), uniExtract8Bin_2x8);
VXC_WriteImage2DArray(output, coord_in, dst, VXC_MODIFIER(0, 0, 0, VXC_RM_TowardZero, 0));
}
}
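// The U8 variant dequantizes each element as x = (q - srcZP) * srcScale.
// Pass 1 finds max(x), pass 2 accumulates fProbSum = sum(exp(x - max)), and
// pass 3 recomputes exp(x - max) / fProbSum and requantizes the probability
// as q_out = p * dstScale + dstZP, saturated into u8 by the final VXC_DP2x8.
// E.g. with hypothetical srcScale = 0.1 and srcZP = 128, an input byte of
// 148 dequantizes to 2.0 before the exp/normalize steps.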

View File

@ -16,7 +16,7 @@ _viv_uniform float sum_x2_tail1;
_viv_uniform float output_scale;
_viv_uniform float output_zp;
#define GROUP_NORM_SUMS_16BITS_IMPL(name, src_type) \
#define GROUP_NORM_SUMS_16BITS_IMPL(name, load_type, src_type) \
__kernel __attribute__((reqd_work_group_size(16, 1, 1))) void group_norm_sums_##name( \
__read_only image2d_array_t input, \
__write_only image2d_array_t output, \
@ -26,7 +26,7 @@ __kernel __attribute__((reqd_work_group_size(16, 1, 1))) void group_norm_sums_##
int lidx = get_local_id(0); \
int gidz = get_global_id(1); \
int4 coord = (int4)(gidx, 0, gidz, 0); \
vxc_short8 src0; \
load_type src; \
src_type in_h; \
float4 sumsqr; \
float4 tmpSumSqr = (float4)(0); \
@ -43,9 +43,9 @@ __kernel __attribute__((reqd_work_group_size(16, 1, 1))) void group_norm_sums_##
{ \
for(coord.y = 0; coord.y < height;) \
{ \
VXC_OP4(img_load_3d, src0, input, coord, 0, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); \
VXC_OP4(img_load_3d, src, input, coord, 0, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); \
coord.y++; \
_viv_asm(COPY, in_h, src0, 16); \
_viv_asm(COPY, in_h, src, 16); \
VXC_DP8x2(sumsqr, in_h, in_h, VXC_MODIFIER(0, 1, 0, VXC_RM_TowardZero, 0), uniSum_X_X2_8x2); \
tmpSumSqr += sumsqr; \
} \
@ -76,10 +76,11 @@ __kernel __attribute__((reqd_work_group_size(16, 1, 1))) void group_norm_sums_##
write_imagef(output, coord_out, data); \
} \
}
GROUP_NORM_SUMS_16BITS_IMPL(F16, vxc_half8)
GROUP_NORM_SUMS_16BITS_IMPL(I16, vxc_short8)
GROUP_NORM_SUMS_16BITS_IMPL(F16, vxc_short8, vxc_half8)
GROUP_NORM_SUMS_16BITS_IMPL(I16, vxc_short8, vxc_short8)
GROUP_NORM_SUMS_16BITS_IMPL(U16, vxc_ushort8, vxc_ushort8)
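// group_norm_sums accumulates the per-group statistics sum(x) and sum(x^2)
// (via the uniSum_X_X2_8x2 dot-product table) that the group_norm kernels
// below fold into the per-element scale (alpha) and bias. load_type is the
// raw 8x16-bit container read from the image; src_type is how those bits are
// interpreted: vxc_half8 for F16, vxc_short8/vxc_ushort8 for I16/U16.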
#define GROUP_NORM_SUMS_16BITS_IMPL_2D(name, src_type) \
#define GROUP_NORM_SUMS_16BITS_IMPL_2D(name, load_type, src_type) \
__kernel __attribute__((reqd_work_group_size(16, 1, 1))) void group_norm_sums_##name##_2D( \
__read_only image2d_array_t input, \
__write_only image2d_array_t output, \
@ -89,7 +90,7 @@ __kernel __attribute__((reqd_work_group_size(16, 1, 1))) void group_norm_sums_##
int lidx = get_local_id(0); \
\
int2 coord = (int2)(gidx, get_global_id(1)); \
vxc_short8 src0; \
load_type src; \
src_type in_h; \
float4 sumsqr = (float4)(0); \
\
@ -98,8 +99,8 @@ __kernel __attribute__((reqd_work_group_size(16, 1, 1))) void group_norm_sums_##
\
if(gidx < width) \
{ \
VXC_ReadImage(src0, input, coord, 0, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); \
_viv_asm(COPY, in_h, src0, 16); \
VXC_ReadImage(src, input, coord, 0, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); \
_viv_asm(COPY, in_h, src, 16); \
VXC_DP8x2(sumsqr, in_h, in_h, VXC_MODIFIER(0, 1, 0, VXC_RM_TowardZero, 0), uniSum_X_X2_8x2); \
sumsqr.y = sumsqr.y * input_scale2 + sum_x2_tail0 + sum_x2_tail1 * sumsqr.x; \
sumsqr.x = sumsqr.x * input_scale + sum_x_tail; \
@ -128,8 +129,9 @@ __kernel __attribute__((reqd_work_group_size(16, 1, 1))) void group_norm_sums_##
write_imagef(output, coord_out, data); \
} \
}
GROUP_NORM_SUMS_16BITS_IMPL_2D(F16, vxc_half8)
GROUP_NORM_SUMS_16BITS_IMPL_2D(I16, vxc_short8)
GROUP_NORM_SUMS_16BITS_IMPL_2D(F16, vxc_short8, vxc_half8)
GROUP_NORM_SUMS_16BITS_IMPL_2D(I16, vxc_short8, vxc_short8)
GROUP_NORM_SUMS_16BITS_IMPL_2D(U16, vxc_ushort8, vxc_ushort8)
#define GROUP_NORM_16BITS_IMPL(name, src_type, dst_type, copy_type, conv_type) \
__kernel __attribute__((reqd_work_group_size(16, 1, 1))) void group_norm_##name( \
@ -178,7 +180,7 @@ __kernel __attribute__((reqd_work_group_size(16, 1, 1))) void group_norm_##name(
_viv_asm(CONV_RTE, tmpVal0, norm); \
norm = alpha * tmpData1 + bias_val; \
_viv_asm(CONV_RTE, tmpVal1, norm); \
VXC_DP2x8(dst, tmpVal0, tmpVal1, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0), uniExtract8Data_2x8); \
VXC_DP2x8(dst, tmpVal0, tmpVal1, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 1), uniExtract8Data_2x8); \
_viv_asm(COPY, outval, dst, 16); \
VXC_WriteImage2DArray(output, coord, outval, VXC_MODIFIER(0, 7, 0,VXC_RM_TowardZero, 0)); \
}
@ -230,10 +232,10 @@ __kernel __attribute__((reqd_work_group_size(16, 1, 1))) void group_norm_##name#
VXC_DP4x4(tmpData1, in_h, in_h, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniDataToFP32_1_4x4); \
float4 norm; \
norm = alpha * tmpData0 + bias_val; \
_viv_asm(CONV, tmpVal0, norm); \
_viv_asm(CONV_RTE, tmpVal0, norm); \
norm = alpha * tmpData1 + bias_val; \
_viv_asm(CONV, tmpVal1, norm); \
VXC_DP2x8(dst, tmpVal0, tmpVal1, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0), uniExtract8Data_2x8); \
_viv_asm(CONV_RTE, tmpVal1, norm); \
VXC_DP2x8(dst, tmpVal0, tmpVal1, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 1), uniExtract8Data_2x8); \
_viv_asm(COPY, outval, dst, 16); \
VXC_WriteImage(output, coord.xy, outval, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); \
}
@ -283,7 +285,7 @@ __kernel __attribute__((reqd_work_group_size(16, 1, 1))) void group_norm_##name(
\
float4 norm; \
norm = alpha * tmpData0 + bias_val; \
_viv_asm(CONV, tmpVal0, norm); \
_viv_asm(CONV_RTE, tmpVal0, norm); \
norm = alpha * tmpData1 + bias_val; \
_viv_asm(CONV_RTE, tmpVal1, norm); \
VXC_DP2x8(dst, tmpVal0, tmpVal1, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 1), uniExtract8Data_2x8); \
@ -296,6 +298,7 @@ GROUP_NORM_16BITS_F32_IMPL(F16_F32toI8, vxc_half8, vxc_char8, vxc_char8, int
GROUP_NORM_16BITS_F32_IMPL(F16_F32toU8, vxc_half8, vxc_uchar8, vxc_uchar8, int4)
GROUP_NORM_16BITS_F32_IMPL(I16_F32toI16, vxc_short8, vxc_short8, vxc_short8, int4)
GROUP_NORM_16BITS_F32_IMPL(I16_F32toF16, vxc_short8, vxc_half8, vxc_short8, half4)
GROUP_NORM_16BITS_F32_IMPL(U16_F32toU16, vxc_ushort8, vxc_ushort8, vxc_ushort8, int4)
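// In these normalization kernels, CONV_RTE converts the normalized float4 to
// the destination integer type with round-to-nearest-even, and the VXC_DP2x8
// extract runs with the clamp flag set (last VXC_MODIFIER argument = 1) so
// the packed result saturates to the destination range.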
#define GROUP_NORM_16BITS_F32_IMPL_2D(name, src_type, dst_type, copy_type, conv_type) \
__kernel __attribute__((reqd_work_group_size(16, 1, 1))) void group_norm_##name##_2D( \
@ -333,10 +336,10 @@ __kernel __attribute__((reqd_work_group_size(16, 1, 1))) void group_norm_##name#
VXC_DP4x4(tmpData1, in_h, in_h, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniDataToFP32_1_4x4); \
float4 norm; \
norm = alpha * tmpData0 + bias_val; \
_viv_asm(CONV, tmpVal0, norm); \
_viv_asm(CONV_RTE, tmpVal0, norm); \
norm = alpha * tmpData1 + bias_val; \
_viv_asm(CONV, tmpVal1, norm); \
VXC_DP2x8(dst, tmpVal0, tmpVal1, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0), uniExtract8Data_2x8); \
_viv_asm(CONV_RTE, tmpVal1, norm); \
VXC_DP2x8(dst, tmpVal0, tmpVal1, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 1), uniExtract8Data_2x8); \
_viv_asm(COPY, outval, dst, 16); \
VXC_WriteImage(output, coord.xy, outval, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); \
}
@ -346,4 +349,5 @@ GROUP_NORM_16BITS_F32_IMPL_2D(F16_F32toI8, vxc_half8, vxc_char8, vxc_char8,
GROUP_NORM_16BITS_F32_IMPL_2D(F16_F32toU8, vxc_half8, vxc_uchar8, vxc_uchar8, int4)
GROUP_NORM_16BITS_F32_IMPL_2D(I16_F32toI16, vxc_short8, vxc_short8, vxc_short8, int4)
GROUP_NORM_16BITS_F32_IMPL_2D(I16_F32toF16, vxc_short8, vxc_half8, vxc_short8, half4)
GROUP_NORM_16BITS_F32_IMPL_2D(U16_F32toU16, vxc_ushort8, vxc_ushort8, vxc_ushort8, int4)

View File

@ -115,45 +115,45 @@ _viv_uniform VXC_512Bits uniDataSubZPtoFp32Part1_4x4;
_viv_uniform VXC_512Bits uniConvF16toF32_part0_4x4;
_viv_uniform VXC_512Bits uniConvF16toF32_part1_4x4;
_viv_uniform VXC_512Bits uniExtact8Bin_2x8;
_viv_uniform int inputZP0;
_viv_uniform int inputZP1;
_viv_uniform float input_scale0;
_viv_uniform float input_scale1;
_viv_uniform float outputZP;
#define PRELU_F16_3D(name0, name1, input_type0, copy_type0, output_type, convert_type, copy_type) \
__kernel void prelu_##name0##to##name1( \
_viv_uniform int input0_zp;
_viv_uniform int input1_zp;
_viv_uniform float input0_scale;
_viv_uniform float input1_scale;
_viv_uniform float output_zp;
#define PRELU_F16_3D(name, input_type0, copy_type0, output_type, convert_type, copy_type) \
__kernel void prelu_##name( \
__read_only image2d_array_t input0, \
__read_only image2d_array_t input1, \
__write_only image2d_array_t output) \
{\
int4 coord = (int4)(get_global_id(0), get_global_id(1), get_global_id(2), 0);\
vxc_float4 vecA, vecB, vecC, vecD;\
float4 vecA, vecB, vecC, vecD;\
input_type0 srcA;\
copy_type0 src0;\
vxc_short8 srcB;\
vxc_half8 src1;\
input_type0 input_ZP;\
input_type0 zp;\
VXC_ReadImage2DArray(srcA, input0, coord, 0, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0));\
_viv_asm(COPY, src0, srcA, 16); \
VXC_ReadImage2DArray(srcB, input1, coord, 0, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0));\
_viv_asm(COPY, src1, srcB, 16); \
\
_viv_asm(COPY, input_ZP, inputZP0, 4);\
VXC_DP4x4(vecA, src0, input_ZP, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardInf, 0), \
_viv_asm(COPY, zp, input0_zp, 4);\
VXC_DP4x4(vecA, src0, zp, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardInf, 0), \
uniDataSubZPtoFp32Part0_4x4); \
VXC_DP4x4(vecB, src0, input_ZP, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardInf, 0), \
VXC_DP4x4(vecB, src0, zp, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardInf, 0), \
uniDataSubZPtoFp32Part1_4x4);\
VXC_DP4x4(vecC, src1, src1, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardInf, 0), uniConvF16toF32_part0_4x4);\
VXC_DP4x4(vecD, src1, src1, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardInf, 0), uniConvF16toF32_part1_4x4);\
\
vecA = vecA * input_scale0;\
vecB = vecB * input_scale0;\
vxc_float4 maxData0 = vecA > 0 ? vecA : 0.0; \
vxc_float4 maxData1 = vecB > 0 ? vecB : 0.0; \
vxc_float4 minData0 = vecA < 0 ? vecA : 0.0; \
vxc_float4 minData1 = vecB < 0 ? vecB : 0.0; \
vecA = maxData0 + vecC * minData0 + outputZP;\
vecB = maxData1 + vecD * minData1 + outputZP;\
vecA = vecA * input0_scale;\
vecB = vecB * input0_scale;\
float4 maxData0 = vecA > 0 ? vecA : 0.0; \
float4 maxData1 = vecB > 0 ? vecB : 0.0; \
float4 minData0 = vecA < 0 ? vecA : 0.0; \
float4 minData1 = vecB < 0 ? vecB : 0.0; \
vecA = maxData0 + vecC * minData0 + output_zp;\
vecB = maxData1 + vecD * minData1 + output_zp;\
convert_type dst0, dst1;\
_viv_asm(CONV_RTE, dst0, vecA);\
_viv_asm(CONV_RTE, dst1, vecB);\
@ -164,49 +164,49 @@ _viv_uniform float outputZP;
VXC_WriteImage2DArray(output, coord, dst, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0));\
}
// name, input_type0, copy_type0, output_type, convert_type, copy_type
PRELU_F16_3D(I8F16, I8, vxc_char16, vxc_char16, vxc_char16, int4, vxc_char16)
PRELU_F16_3D(I8F16, F16, vxc_char16, vxc_char16, vxc_half8, half4, vxc_short8)
PRELU_F16_3D(I16F16, I16, vxc_short8, vxc_short8, vxc_short8, int4, vxc_short8)
PRELU_F16_3D(I16F16, F16, vxc_short8, vxc_short8, vxc_half8, half4, vxc_short8)
PRELU_F16_3D(U8F16, U8, vxc_uchar16, vxc_uchar16, vxc_uchar16, int4, vxc_uchar16)
PRELU_F16_3D(U8F16, F16, vxc_uchar16, vxc_uchar16, vxc_half8, half4, vxc_short8)
PRELU_F16_3D(F16F16, F16, vxc_short8, vxc_half8, vxc_half8, half4, vxc_short8)
PRELU_F16_3D(F16F16, I8, vxc_short8, vxc_half8, vxc_char16, int4, vxc_char16)
PRELU_F16_3D(F16F16, I16, vxc_short8, vxc_half8, vxc_short8, int4, vxc_short8)
PRELU_F16_3D(F16F16, U8, vxc_short8, vxc_half8, vxc_uchar16, int4, vxc_uchar16)
PRELU_F16_3D(I8F16toI8, vxc_char16, vxc_char16, vxc_char16, int4, vxc_char16)
PRELU_F16_3D(I8F16toF16, vxc_char16, vxc_char16, vxc_half8, half4, vxc_short8)
PRELU_F16_3D(I16F16toI16, vxc_short8, vxc_short8, vxc_short8, int4, vxc_short8)
PRELU_F16_3D(I16F16toF16, vxc_short8, vxc_short8, vxc_half8, half4, vxc_short8)
PRELU_F16_3D(U8F16toU8, vxc_uchar16, vxc_uchar16, vxc_uchar16, int4, vxc_uchar16)
PRELU_F16_3D(U8F16toF16, vxc_uchar16, vxc_uchar16, vxc_half8, half4, vxc_short8)
PRELU_F16_3D(F16F16toF16, vxc_short8, vxc_half8, vxc_half8, half4, vxc_short8)
PRELU_F16_3D(F16F16toI8, vxc_short8, vxc_half8, vxc_char16, int4, vxc_char16)
PRELU_F16_3D(F16F16toI16, vxc_short8, vxc_half8, vxc_short8, int4, vxc_short8)
PRELU_F16_3D(F16F16toU8, vxc_short8, vxc_half8, vxc_uchar16, int4, vxc_uchar16)
#define PRELU_F16_2D(name0, name1, input_type0, copy_type0, output_type, convert_type, copy_type) \
__kernel void prelu_##name0##to##name1##_2D( \
#define PRELU_F16_2D(name, input_type0, copy_type0, output_type, convert_type, copy_type) \
__kernel void prelu_##name##_2D( \
__read_only image2d_array_t input0, \
__read_only image2d_array_t input1, \
__write_only image2d_array_t output) \
{\
int2 coord = (int2)(get_global_id(0), get_global_id(1));\
vxc_float4 vecA, vecB, vecC, vecD;\
float4 vecA, vecB, vecC, vecD;\
input_type0 srcA;\
copy_type0 src0;\
vxc_short8 srcB;\
vxc_half8 src1;\
input_type0 input_ZP;\
input_type0 zp;\
VXC_ReadImage(srcA, input0, coord, 0, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0));\
_viv_asm(COPY, src0, srcA, 16); \
VXC_ReadImage(srcB, input1, coord, 0, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0));\
_viv_asm(COPY, src1, srcB, 16); \
\
_viv_asm(COPY, input_ZP, inputZP0, 4);\
VXC_DP4x4(vecA, src0, input_ZP, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardInf, 0), uniDataSubZPtoFp32Part0_4x4);\
VXC_DP4x4(vecB, src0, input_ZP, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardInf, 0), uniDataSubZPtoFp32Part1_4x4);\
_viv_asm(COPY, zp, input0_zp, 4);\
VXC_DP4x4(vecA, src0, zp, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardInf, 0), uniDataSubZPtoFp32Part0_4x4);\
VXC_DP4x4(vecB, src0, zp, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardInf, 0), uniDataSubZPtoFp32Part1_4x4);\
VXC_DP4x4(vecC, src1, src1, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardInf, 0), uniConvF16toF32_part0_4x4);\
VXC_DP4x4(vecD, src1, src1, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardInf, 0), uniConvF16toF32_part1_4x4);\
\
vecA = vecA * input_scale0;\
vecB = vecB * input_scale0;\
vxc_float4 maxData0 = vecA > 0 ? vecA : 0.0; \
vxc_float4 maxData1 = vecB > 0 ? vecB : 0.0; \
vxc_float4 minData0 = vecA < 0 ? vecA : 0.0; \
vxc_float4 minData1 = vecB < 0 ? vecB : 0.0; \
vecA = maxData0 + vecC * minData0 + outputZP;\
vecB = maxData1 + vecD * minData1 + outputZP;\
vecA = vecA * input0_scale;\
vecB = vecB * input0_scale;\
float4 maxData0 = vecA > 0 ? vecA : 0.0; \
float4 maxData1 = vecB > 0 ? vecB : 0.0; \
float4 minData0 = vecA < 0 ? vecA : 0.0; \
float4 minData1 = vecB < 0 ? vecB : 0.0; \
vecA = maxData0 + vecC * minData0 + output_zp;\
vecB = maxData1 + vecD * minData1 + output_zp;\
convert_type dst0, dst1;\
_viv_asm(CONV_RTE, dst0, vecA);\
_viv_asm(CONV_RTE, dst1, vecB);\
@ -216,49 +216,49 @@ PRELU_F16_3D(F16F16, U8, vxc_short8, vxc_half8, vxc_uchar16, int4, vxc_ucha
_viv_asm(COPY, dst, dst2, 16); \
VXC_WriteImage(output, coord, dst, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0));\
}
PRELU_F16_2D(I8F16, F16, vxc_char16, vxc_char16, vxc_half8, half4, vxc_short8)
PRELU_F16_2D(I8F16, I8, vxc_char16, vxc_char16, vxc_char16, int4, vxc_char16)
PRELU_F16_2D(I16F16, F16, vxc_short8, vxc_short8, vxc_half8, half4, vxc_short8)
PRELU_F16_2D(U8F16, U8, vxc_uchar16, vxc_uchar16, vxc_uchar16, int4, vxc_uchar16)
PRELU_F16_2D(U8F16, F16, vxc_uchar16, vxc_uchar16, vxc_half8, half4, vxc_short8)
PRELU_F16_2D(F16F16, F16, vxc_short8, vxc_half8, vxc_half8, half4, vxc_short8)
PRELU_F16_2D(F16F16, I8, vxc_short8, vxc_half8, vxc_char16, int4, vxc_char16)
PRELU_F16_2D(F16F16, I16, vxc_short8, vxc_half8, vxc_short8, int4, vxc_short8)
PRELU_F16_2D(I16F16, I16, vxc_short8, vxc_short8, vxc_short8, int4, vxc_short8)
PRELU_F16_2D(F16F16, U8, vxc_short8, vxc_half8, vxc_uchar16, int4, vxc_uchar16)
PRELU_F16_2D(I8F16toF16, vxc_char16, vxc_char16, vxc_half8, half4, vxc_short8)
PRELU_F16_2D(I8F16toI8, vxc_char16, vxc_char16, vxc_char16, int4, vxc_char16)
PRELU_F16_2D(I16F16toF16, vxc_short8, vxc_short8, vxc_half8, half4, vxc_short8)
PRELU_F16_2D(U8F16toU8, vxc_uchar16, vxc_uchar16, vxc_uchar16, int4, vxc_uchar16)
PRELU_F16_2D(U8F16toF16, vxc_uchar16, vxc_uchar16, vxc_half8, half4, vxc_short8)
PRELU_F16_2D(F16F16toF16, vxc_short8, vxc_half8, vxc_half8, half4, vxc_short8)
PRELU_F16_2D(F16F16toI8, vxc_short8, vxc_half8, vxc_char16, int4, vxc_char16)
PRELU_F16_2D(F16F16toI16, vxc_short8, vxc_half8, vxc_short8, int4, vxc_short8)
PRELU_F16_2D(I16F16toI16, vxc_short8, vxc_short8, vxc_short8, int4, vxc_short8)
PRELU_F16_2D(F16F16toU8, vxc_short8, vxc_half8, vxc_uchar16, int4, vxc_uchar16)
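// Every PReLU variant implements out = max(x, 0) + slope * min(x, 0): the
// positive part passes through and the negative part is scaled by the
// per-element slope read from input1. Operands are dequantized with their
// zero point and scale first, and output_zp re-centers the result before the
// saturating pack (the output scale is presumably folded into
// input0_scale/input1_scale on the host).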
#define PRELU_U8_2D(name, output_type, convert_type, copy_type) \
__kernel void prelu_U8U8to##name##_2D( \
#define PRELU_INTEGER_2D(name, src0_type, src1_type, output_type, convert_type, copy_type) \
__kernel void prelu_##name##_2D( \
__read_only image2d_array_t input0, \
__read_only image2d_array_t input1, \
__write_only image2d_array_t output) \
{\
int2 coord = (int2)(get_global_id(0), get_global_id(1));\
vxc_float4 vecA, vecB, vecC, vecD;\
vxc_uchar16 src0;\
vxc_uchar16 src1;\
vxc_uchar16 input_ZP0;\
vxc_uchar16 input_ZP1;\
float4 vecA, vecB, vecC, vecD;\
src0_type src0;\
src1_type src1;\
short zp0;\
short zp1;\
VXC_ReadImage(src0, input0, coord, 0, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0));\
VXC_ReadImage(src1, input1, coord, 0, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0));\
\
_viv_asm(COPY, input_ZP0, inputZP0, 4);\
VXC_DP4x4(vecA, src0, input_ZP0, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardInf, 0), uniDataSubZPtoFp32Part0_4x4);\
VXC_DP4x4(vecB, src0, input_ZP0, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardInf, 0), uniDataSubZPtoFp32Part1_4x4);\
_viv_asm(COPY, input_ZP1, inputZP1, 4);\
VXC_DP4x4(vecC, src1, input_ZP1, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardInf, 0), uniDataSubZPtoFp32Part0_4x4);\
VXC_DP4x4(vecD, src1, input_ZP1, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardInf, 0), uniDataSubZPtoFp32Part1_4x4);\
_viv_asm(COPY, zp0, input0_zp, 2);\
VXC_DP4x4(vecA, src0, zp0, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardInf, 0), uniDataSubZPtoFp32Part0_4x4);\
VXC_DP4x4(vecB, src0, zp0, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardInf, 0), uniDataSubZPtoFp32Part1_4x4);\
_viv_asm(COPY, zp1, input1_zp, 2);\
VXC_DP4x4(vecC, src1, zp1, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardInf, 0), uniDataSubZPtoFp32Part0_4x4);\
VXC_DP4x4(vecD, src1, zp1, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardInf, 0), uniDataSubZPtoFp32Part1_4x4);\
\
vecA = vecA * input_scale0;\
vecB = vecB * input_scale0;\
vecC = vecC * input_scale1;\
vecD = vecD * input_scale1;\
vxc_float4 maxData0 = vecA >= 0 ? vecA : 0.0; \
vxc_float4 maxData1 = vecB >= 0 ? vecB : 0.0; \
vxc_float4 minData0 = vecA < 0 ? vecA : 0.0; \
vxc_float4 minData1 = vecB < 0 ? vecB : 0.0; \
vecA = maxData0 + vecC * minData0 + outputZP;\
vecB = maxData1 + vecD * minData1 + outputZP;\
vecA = vecA * input0_scale;\
vecB = vecB * input0_scale;\
vecC = vecC * input1_scale;\
vecD = vecD * input1_scale;\
float4 maxData0 = vecA >= 0 ? vecA : 0.0; \
float4 maxData1 = vecB >= 0 ? vecB : 0.0; \
float4 minData0 = vecA < 0 ? vecA : 0.0; \
float4 minData1 = vecB < 0 ? vecB : 0.0; \
vecA = maxData0 + vecC * minData0 + output_zp;\
vecB = maxData1 + vecD * minData1 + output_zp;\
convert_type dst0, dst1;\
_viv_asm(CONV_RTE, dst0, vecA);\
_viv_asm(CONV_RTE, dst1, vecB);\
@ -268,7 +268,8 @@ PRELU_F16_2D(F16F16, U8, vxc_short8, vxc_half8, vxc_uchar16, int4, vxc_ucha
_viv_asm(COPY, dst, dst2, 16); \
VXC_WriteImage(output, coord, dst, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0));\
}
PRELU_U8_2D(U8, vxc_uchar16, int4, vxc_uchar16)
PRELU_U8_2D(F16, vxc_half8, half4, vxc_short8)
PRELU_INTEGER_2D(U8U8toU8, vxc_uchar16, vxc_uchar16, vxc_uchar16, int4, vxc_uchar16)
PRELU_INTEGER_2D(U8U8toF16, vxc_uchar16, vxc_uchar16, vxc_half8, half4, vxc_short8)
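// A minimal per-element host reference for the U8U8toU8 path, assuming
// hypothetical scales and zero points:
//   float x = (q0 - input0_zp) * input0_scale;
//   float s = (q1 - input1_zp) * input1_scale;
//   float y = (x >= 0.0f ? x : s * x) + output_zp;
//   int yi = (int)roundf(y);
//   unsigned char q_out = yi < 0 ? 0 : (yi > 255 ? 255 : yi);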

View File

@ -0,0 +1,181 @@
#include "cl_viv_vx_ext.h"
_viv_uniform VXC_512Bits uniU8PostProcess_2x8;
_viv_uniform int2 multAndoutZP; // bits [0:15]: multiplier, bits [32:63]: output zp
_viv_uniform VXC_512Bits uniResize2xUp_0_4x8;
_viv_uniform VXC_512Bits uniResize2xUp_1_4x8;
_viv_uniform int out_height;
__kernel void resize_bilinear_U8toU8_2x_upsample_half_pixel_centers
(
__read_only image2d_array_t input,
__write_only image2d_array_t output,
int align_corners,
int half_pixel_centers
)
{
int4 coord_out = (int4)(get_global_id(0), 0, get_global_id(1), 0);
int4 coord_in = (int4)(get_global_id(0), -1, get_global_id(1), 0);
coord_in.x = (coord_out.x * 2 - 1) >> 2;
coord_in.x = coord_out.x == 0 ? -1 : coord_in.x;
vxc_uchar16 in0, in1, tmp, result;
int8 input_desc;
_viv_asm(COPY, input_desc, input, sizeof(input_desc));
int baseAddr = (int)coord_in.z * input_desc.s4 + input_desc.s0;
_viv_asm(MOV, coord_in.w, baseAddr);
VXC_OP4(img_load_3d, in0, input, coord_in.xywz, VXC_5BITOFFSET_XY(0, 0),
VXC_MODIFIER(0, 15, 0, VXC_RM_TowardZero, 0));
VXC_OP4(img_load_3d, in1, input, coord_in.xywz, VXC_5BITOFFSET_XY(0, 1),
VXC_MODIFIER(0, 15, 0, VXC_RM_TowardZero, 0));
int8 output_desc;
_viv_asm(COPY, output_desc, output, sizeof(output_desc));
baseAddr = (int)coord_out.z * output_desc.s4 + output_desc.s0;
_viv_asm(MOV, coord_out.w, baseAddr);
vxc_ushort8 multiplier;
_viv_asm(COPY, multiplier, multAndoutZP, 16);
vxc_ushort8 dst0;
while (coord_out.y < out_height)
{
VXC_DP4x8(dst0, in0, in1, VXC_MODIFIER(0, 7, 0, VXC_RM_ToNearestEven, 1), uniResize2xUp_0_4x8);
VXC_DP2x8(result, dst0, multiplier, VXC_MODIFIER(0, 7, 0, VXC_RM_ToNearestEven, 1), uniU8PostProcess_2x8);
VXC_DP4x8(dst0, in0, in1, VXC_MODIFIER(0, 7, 0, VXC_RM_ToNearestEven, 1), uniResize2xUp_1_4x8);
VXC_DP2x8(result, dst0, multiplier, VXC_MODIFIER(8, 15, 0, VXC_RM_ToNearestEven, 1), uniU8PostProcess_2x8);
VXC_OP4_NoDest(img_store_3d, output, coord_out.xywz, result,
VXC_MODIFIER(0, 15, 0, VXC_RM_TowardZero, 0));
coord_out.y++;
VXC_OP4(img_load_3d, in0, input, coord_in.xywz, VXC_5BITOFFSET_XY(0, 2),
VXC_MODIFIER(0, 15, 0, VXC_RM_TowardZero, 0));
VXC_DP4x8(dst0, in0, in1, VXC_MODIFIER(0, 7, 0, VXC_RM_ToNearestEven, 1), uniResize2xUp_0_4x8);
VXC_DP2x8(result, dst0, multiplier, VXC_MODIFIER(0, 7, 0, VXC_RM_ToNearestEven, 1), uniU8PostProcess_2x8);
VXC_DP4x8(dst0, in0, in1, VXC_MODIFIER(0, 7, 0, VXC_RM_ToNearestEven, 1), uniResize2xUp_1_4x8);
VXC_DP2x8(result, dst0, multiplier, VXC_MODIFIER(8, 15, 0, VXC_RM_ToNearestEven, 1), uniU8PostProcess_2x8);
VXC_OP4_NoDest(img_store_3d, output, coord_out.xywz, result,
VXC_MODIFIER(0, 15, 0, VXC_RM_TowardZero, 0));
coord_out.y++;
VXC_DP4x8(dst0, in1, in0, VXC_MODIFIER(0, 7, 0, VXC_RM_ToNearestEven, 1), uniResize2xUp_0_4x8);
VXC_DP2x8(result, dst0, multiplier, VXC_MODIFIER(0, 7, 0, VXC_RM_ToNearestEven, 1), uniU8PostProcess_2x8);
VXC_DP4x8(dst0, in1, in0, VXC_MODIFIER(0, 7, 0, VXC_RM_ToNearestEven, 1), uniResize2xUp_1_4x8);
VXC_DP2x8(result, dst0, multiplier, VXC_MODIFIER(8, 15, 0, VXC_RM_ToNearestEven, 1), uniU8PostProcess_2x8);
VXC_OP4_NoDest(img_store_3d, output, coord_out.xywz, result,
VXC_MODIFIER(0, 15, 0, VXC_RM_TowardZero, 0));
VXC_OP4(img_load_3d, in1, input, coord_in.xywz, VXC_5BITOFFSET_XY(0, 3),
VXC_MODIFIER(0, 15, 0, VXC_RM_TowardZero, 0));
coord_out.y++;
VXC_DP4x8(dst0, in1, in0, VXC_MODIFIER(0, 7, 0, VXC_RM_ToNearestEven, 1), uniResize2xUp_0_4x8);
VXC_DP2x8(result, dst0, multiplier, VXC_MODIFIER(0, 7, 0, VXC_RM_ToNearestEven, 1), uniU8PostProcess_2x8);
VXC_DP4x8(dst0, in1, in0, VXC_MODIFIER(0, 7, 0, VXC_RM_ToNearestEven, 1), uniResize2xUp_1_4x8);
VXC_DP2x8(result, dst0, multiplier, VXC_MODIFIER(8, 15, 0, VXC_RM_ToNearestEven, 1), uniU8PostProcess_2x8);
VXC_OP4_NoDest(img_store_3d, output, coord_out.xywz, result,
VXC_MODIFIER(0, 15, 0, VXC_RM_TowardZero, 0));
coord_in.y += 2;
coord_out.y++;
}
}
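// 2x upsample with half-pixel centers: each output row maps back to
// in_y = (2 * out_y - 1) / 4 (the >> 2 above), so the fractional weights
// alternate between 1/4 and 3/4; the host presumably bakes those weights
// into the uniResize2xUp_*_4x8 tables. Each loop iteration consumes two new
// input rows and emits four output rows, swapping the in0/in1 operand order
// to mirror the blend.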
_viv_uniform VXC_512Bits uniResize4xUp_l00_4x8;
_viv_uniform VXC_512Bits uniResize4xUp_l01_4x8;
_viv_uniform VXC_512Bits uniResize4xUp_l10_4x8;
_viv_uniform VXC_512Bits uniResize4xUp_l11_4x8;
__kernel void resize_bilinear_U8toU8_4x_upsample_half_pixel_centers
(
__read_only image2d_array_t input,
__write_only image2d_array_t output,
int align_corners,
int half_pixel_centers
)
{
int4 coord_out = (int4)(get_global_id(0), 0, get_global_id(1), 0);
int4 coord_in = (int4)(get_global_id(0), -1, get_global_id(1), 0);
coord_in.x = (coord_out.x * 2 - 3) >> 3;
coord_in.x = coord_out.x == 0 ? -1 : coord_in.x;
vxc_uchar16 in0, in1, dst0, dst1;
int8 input_desc;
_viv_asm(COPY, input_desc, input, sizeof(input_desc));
int baseAddr = (int)coord_in.z * input_desc.s4 + input_desc.s0;
_viv_asm(MOV, coord_in.w, baseAddr);
VXC_OP4(img_load_3d, in0, input, coord_in.xywz, VXC_5BITOFFSET_XY(0, 0),
VXC_MODIFIER(0, 15, 0, VXC_RM_TowardZero, 0));
VXC_OP4(img_load_3d, in1, input, coord_in.xywz, VXC_5BITOFFSET_XY(0, 1),
VXC_MODIFIER(0, 15, 0, VXC_RM_TowardZero, 0));
int8 output_desc;
_viv_asm(COPY, output_desc, output, sizeof(output_desc));
baseAddr = (int)coord_out.z * output_desc.s4 + output_desc.s0;
_viv_asm(MOV, coord_out.w, baseAddr);
vxc_ushort8 multiplier;
_viv_asm(COPY, multiplier, multAndoutZP, 16);
vxc_ushort8 tmp;
while (coord_out.y < out_height)
{
VXC_DP4x8(tmp, in0, in1, VXC_MODIFIER(0, 7, 0, VXC_RM_ToNearestEven, 1), uniResize4xUp_l00_4x8);
VXC_DP2x8(dst0, tmp, multiplier, VXC_MODIFIER(0, 7, 0, VXC_RM_ToNearestEven, 1), uniU8PostProcess_2x8);
VXC_DP4x8(tmp, in0, in1, VXC_MODIFIER(0, 7, 0, VXC_RM_ToNearestEven, 1), uniResize4xUp_l01_4x8);
VXC_DP2x8(dst0, tmp, multiplier, VXC_MODIFIER(8, 15, 0, VXC_RM_ToNearestEven, 1), uniU8PostProcess_2x8);
VXC_DP4x8(tmp, in0, in1, VXC_MODIFIER(0, 7, 0, VXC_RM_ToNearestEven, 1), uniResize4xUp_l10_4x8);
VXC_DP2x8(dst1, tmp, multiplier, VXC_MODIFIER(0, 7, 0, VXC_RM_ToNearestEven, 1), uniU8PostProcess_2x8);
VXC_DP4x8(tmp, in0, in1, VXC_MODIFIER(0, 7, 0, VXC_RM_ToNearestEven, 1), uniResize4xUp_l11_4x8);
VXC_DP2x8(dst1, tmp, multiplier, VXC_MODIFIER(8, 15, 0, VXC_RM_ToNearestEven, 1), uniU8PostProcess_2x8);
VXC_OP4(img_load_3d, in0, input, coord_in.xywz, VXC_5BITOFFSET_XY(0, 2),
VXC_MODIFIER(0, 15, 0, VXC_RM_TowardZero, 0));
VXC_OP4_NoDest(img_store_3d, output, coord_out.xywz, dst0,
VXC_MODIFIER(0, 15, 0, VXC_RM_TowardZero, 0));
coord_out.y++;
VXC_OP4_NoDest(img_store_3d, output, coord_out.xywz, dst1,
VXC_MODIFIER(0, 15, 0, VXC_RM_TowardZero, 0));
coord_out.y++;
VXC_DP4x8(tmp, in0, in1, VXC_MODIFIER(0, 7, 0, VXC_RM_ToNearestEven, 1), uniResize4xUp_l10_4x8);
VXC_DP2x8(dst0, tmp, multiplier, VXC_MODIFIER(0, 7, 0, VXC_RM_ToNearestEven, 1), uniU8PostProcess_2x8);
VXC_DP4x8(tmp, in0, in1, VXC_MODIFIER(0, 7, 0, VXC_RM_ToNearestEven, 1), uniResize4xUp_l11_4x8);
VXC_DP2x8(dst0, tmp, multiplier, VXC_MODIFIER(8, 15, 0, VXC_RM_ToNearestEven, 1), uniU8PostProcess_2x8);
VXC_DP4x8(tmp, in0, in1, VXC_MODIFIER(0, 7, 0, VXC_RM_ToNearestEven, 1), uniResize4xUp_l00_4x8);
VXC_DP2x8(dst1, tmp, multiplier, VXC_MODIFIER(0, 7, 0, VXC_RM_ToNearestEven, 1), uniU8PostProcess_2x8);
VXC_DP4x8(tmp, in0, in1, VXC_MODIFIER(0, 7, 0, VXC_RM_ToNearestEven, 1), uniResize4xUp_l01_4x8);
VXC_DP2x8(dst1, tmp, multiplier, VXC_MODIFIER(8, 15, 0, VXC_RM_ToNearestEven, 1), uniU8PostProcess_2x8);
VXC_OP4_NoDest(img_store_3d, output, coord_out.xywz, dst0,
VXC_MODIFIER(0, 15, 0, VXC_RM_TowardZero, 0));
coord_out.y++;
VXC_OP4_NoDest(img_store_3d, output, coord_out.xywz, dst1,
VXC_MODIFIER(0, 15, 0, VXC_RM_TowardZero, 0));
coord_out.y++;
VXC_DP4x8(tmp, in1, in0, VXC_MODIFIER(0, 7, 0, VXC_RM_ToNearestEven, 1), uniResize4xUp_l00_4x8);
VXC_DP2x8(dst0, tmp, multiplier, VXC_MODIFIER(0, 7, 0, VXC_RM_ToNearestEven, 1), uniU8PostProcess_2x8);
VXC_DP4x8(tmp, in1, in0, VXC_MODIFIER(0, 7, 0, VXC_RM_ToNearestEven, 1), uniResize4xUp_l01_4x8);
VXC_DP2x8(dst0, tmp, multiplier, VXC_MODIFIER(8, 15, 0, VXC_RM_ToNearestEven, 1), uniU8PostProcess_2x8);
VXC_DP4x8(tmp, in1, in0, VXC_MODIFIER(0, 7, 0, VXC_RM_ToNearestEven, 1), uniResize4xUp_l10_4x8);
VXC_DP2x8(dst1, tmp, multiplier, VXC_MODIFIER(0, 7, 0, VXC_RM_ToNearestEven, 1), uniU8PostProcess_2x8);
VXC_DP4x8(tmp, in1, in0, VXC_MODIFIER(0, 7, 0, VXC_RM_ToNearestEven, 1), uniResize4xUp_l11_4x8);
VXC_DP2x8(dst1, tmp, multiplier, VXC_MODIFIER(8, 15, 0, VXC_RM_ToNearestEven, 1), uniU8PostProcess_2x8);
VXC_OP4(img_load_3d, in1, input, coord_in.xywz, VXC_5BITOFFSET_XY(0, 3),
VXC_MODIFIER(0, 15, 0, VXC_RM_TowardZero, 0));
VXC_OP4_NoDest(img_store_3d, output, coord_out.xywz, dst0,
VXC_MODIFIER(0, 15, 0, VXC_RM_TowardZero, 0));
coord_out.y++;
VXC_OP4_NoDest(img_store_3d, output, coord_out.xywz, dst1,
VXC_MODIFIER(0, 15, 0, VXC_RM_TowardZero, 0));
coord_out.y++;
VXC_DP4x8(tmp, in1, in0, VXC_MODIFIER(0, 7, 0, VXC_RM_ToNearestEven, 1), uniResize4xUp_l10_4x8);
VXC_DP2x8(dst0, tmp, multiplier, VXC_MODIFIER(0, 7, 0, VXC_RM_ToNearestEven, 1), uniU8PostProcess_2x8);
VXC_DP4x8(tmp, in1, in0, VXC_MODIFIER(0, 7, 0, VXC_RM_ToNearestEven, 1), uniResize4xUp_l11_4x8);
VXC_DP2x8(dst0, tmp, multiplier, VXC_MODIFIER(8, 15, 0, VXC_RM_ToNearestEven, 1), uniU8PostProcess_2x8);
VXC_DP4x8(tmp, in1, in0, VXC_MODIFIER(0, 7, 0, VXC_RM_ToNearestEven, 1), uniResize4xUp_l00_4x8);
VXC_DP2x8(dst1, tmp, multiplier, VXC_MODIFIER(0, 7, 0, VXC_RM_ToNearestEven, 1), uniU8PostProcess_2x8);
VXC_DP4x8(tmp, in1, in0, VXC_MODIFIER(0, 7, 0, VXC_RM_ToNearestEven, 1), uniResize4xUp_l01_4x8);
VXC_DP2x8(dst1, tmp, multiplier, VXC_MODIFIER(8, 15, 0, VXC_RM_ToNearestEven, 1), uniU8PostProcess_2x8);
VXC_OP4_NoDest(img_store_3d, output, coord_out.xywz, dst0,
VXC_MODIFIER(0, 15, 0, VXC_RM_TowardZero, 0));
coord_out.y++;
VXC_OP4_NoDest(img_store_3d, output, coord_out.xywz, dst1,
VXC_MODIFIER(0, 15, 0, VXC_RM_TowardZero, 0));
coord_in.y += 2;
coord_out.y++;
}
}
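// 4x upsample: in_y = (2 * out_y - 3) / 8, so the fractional weights cycle
// through 1/8, 3/8, 5/8 and 7/8; each loop iteration advances two input rows
// and produces eight output rows, reusing the l0x/l1x tables with in0/in1
// swapped for the mirrored weights.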

View File

@ -0,0 +1,102 @@
#include "cl_viv_vx_ext.h"
_viv_uniform VXC_512Bits uniU8PostProcess_2x8;
_viv_uniform int2 multAndoutZP; // bits [0:15]: multiplier, bits [32:63]: output zp
_viv_uniform VXC_512Bits uniResize3xUp_l00_2x8;
_viv_uniform VXC_512Bits uniResize3xUp_l01_2x8;
_viv_uniform VXC_512Bits uniResize3xUp_l10_4x4;
_viv_uniform VXC_512Bits uniResize3xUp_l11_4x4;
_viv_uniform VXC_512Bits uniResize3xUp_l12_4x4;
_viv_uniform VXC_512Bits uniResize3xUp_l13_4x4;
__kernel void resize_bilinear_U8toU8_3x_upsample_half_pixel_centers
(
__read_only image2d_array_t input,
__write_only image2d_array_t output,
int align_corners,
int half_pixel_centers
)
{
int4 coord_out = (int4)(get_global_id(0), get_global_id(1), get_global_id(2), 0);
int4 coord_in = (int4)(get_global_id(0), get_global_id(1), get_global_id(2), 0);
coord_in.x = (short)(coord_out.x * 2 - 1) / (short)6;
coord_in.x = coord_out.x == 0 ? -1 : coord_in.x;
coord_in.y = (short)(coord_out.y * 2 - 1) / (short)6;
coord_in.y = coord_out.y == 0 ? -1 : coord_in.y;
vxc_uchar16 in0, in1, in2, in3, tmp, dst0, dst1, dst2;
int8 input_desc;
_viv_asm(COPY, input_desc, input, sizeof(input_desc));
int baseAddr = (int)coord_in.z * input_desc.s4 + input_desc.s0;
_viv_asm(MOV, coord_in.w, baseAddr);
VXC_OP4(img_load_3d, in0, input, coord_in.xywz, VXC_5BITOFFSET_XY(0, 0),
VXC_MODIFIER(0, 15, 0, VXC_RM_TowardZero, 0));
VXC_OP4(img_load_3d, in1, input, coord_in.xywz, VXC_5BITOFFSET_XY(0, 1),
VXC_MODIFIER(0, 15, 0, VXC_RM_TowardZero, 0));
VXC_OP4(img_load_3d, in2, input, coord_in.xywz, VXC_5BITOFFSET_XY(0, 2),
VXC_MODIFIER(0, 15, 0, VXC_RM_TowardZero, 0));
VXC_OP4(img_load_3d, in3, input, coord_in.xywz, VXC_5BITOFFSET_XY(0, 3),
VXC_MODIFIER(0, 15, 0, VXC_RM_TowardZero, 0));
int8 output_desc;
_viv_asm(COPY, output_desc, output, sizeof(output_desc));
baseAddr = (int)coord_out.z * output_desc.s4 + output_desc.s0;
_viv_asm(MOV, coord_out.w, baseAddr);
vxc_ushort8 multiplier;
_viv_asm(COPY, multiplier, multAndoutZP, 16);
vxc_ushort8 data;
VXC_DP4x4(data, in1, in0, VXC_MODIFIER(0, 3, 0, VXC_RM_ToNearestEven, 1), uniResize3xUp_l10_4x4);
VXC_DP4x4(data, in1, in0, VXC_MODIFIER(4, 7, 0, VXC_RM_ToNearestEven, 1), uniResize3xUp_l11_4x4);
VXC_DP2x8(dst2, data, multiplier, VXC_MODIFIER(0, 7, 0, VXC_RM_ToNearestEven, 1), uniU8PostProcess_2x8);
VXC_DP4x4(data, in1, in0, VXC_MODIFIER(0, 3, 0, VXC_RM_ToNearestEven, 1), uniResize3xUp_l12_4x4);
VXC_DP4x4(data, in1, in0, VXC_MODIFIER(4, 6, 0, VXC_RM_ToNearestEven, 1), uniResize3xUp_l13_4x4);
VXC_DP2x8(dst2, data, multiplier, VXC_MODIFIER(8, 15, 0, VXC_RM_ToNearestEven, 1), uniU8PostProcess_2x8);
VXC_OP4_NoDest(img_store_3d, output, coord_out.xywz, dst2,
VXC_MODIFIER(0, 14, 0, VXC_RM_TowardZero, 0));
coord_out.y++;
VXC_DP2x8(data, in1, in1, VXC_MODIFIER(0, 7, 0, VXC_RM_ToNearestEven, 1), uniResize3xUp_l00_2x8);
VXC_DP2x8(dst0, data, multiplier, VXC_MODIFIER(0, 7, 0, VXC_RM_ToNearestEven, 1), uniU8PostProcess_2x8);
VXC_DP2x8(data, in1, in1, VXC_MODIFIER(0, 6, 0, VXC_RM_ToNearestEven, 1), uniResize3xUp_l01_2x8);
VXC_DP2x8(dst0, data, multiplier, VXC_MODIFIER(8, 15, 0, VXC_RM_ToNearestEven, 1), uniU8PostProcess_2x8);
VXC_DP4x4(data, in1, in2, VXC_MODIFIER(0, 3, 0, VXC_RM_ToNearestEven, 1), uniResize3xUp_l10_4x4);
VXC_DP4x4(data, in1, in2, VXC_MODIFIER(4, 7, 0, VXC_RM_ToNearestEven, 1), uniResize3xUp_l11_4x4);
VXC_DP2x8(dst1, data, multiplier, VXC_MODIFIER(0, 7, 0, VXC_RM_ToNearestEven, 1), uniU8PostProcess_2x8);
VXC_DP4x4(data, in1, in2, VXC_MODIFIER(0, 3, 0, VXC_RM_ToNearestEven, 1), uniResize3xUp_l12_4x4);
VXC_DP4x4(data, in1, in2, VXC_MODIFIER(4, 6, 0, VXC_RM_ToNearestEven, 1), uniResize3xUp_l13_4x4);
VXC_DP2x8(dst1, data, multiplier, VXC_MODIFIER(8, 15, 0, VXC_RM_ToNearestEven, 1), uniU8PostProcess_2x8);
VXC_DP4x4(data, in2, in1, VXC_MODIFIER(0, 3, 0, VXC_RM_ToNearestEven, 1), uniResize3xUp_l10_4x4);
VXC_DP4x4(data, in2, in1, VXC_MODIFIER(4, 7, 0, VXC_RM_ToNearestEven, 1), uniResize3xUp_l11_4x4);
VXC_DP2x8(dst2, data, multiplier, VXC_MODIFIER(0, 7, 0, VXC_RM_ToNearestEven, 1), uniU8PostProcess_2x8);
VXC_DP4x4(data, in2, in1, VXC_MODIFIER(0, 3, 0, VXC_RM_ToNearestEven, 1), uniResize3xUp_l12_4x4);
VXC_DP4x4(data, in2, in1, VXC_MODIFIER(4, 6, 0, VXC_RM_ToNearestEven, 1), uniResize3xUp_l13_4x4);
VXC_DP2x8(dst2, data, multiplier, VXC_MODIFIER(8, 15, 0, VXC_RM_ToNearestEven, 1), uniU8PostProcess_2x8);
VXC_OP4_NoDest(img_store_3d, output, coord_out.xywz, dst0,
VXC_MODIFIER(0, 14, 0, VXC_RM_TowardZero, 0));
coord_out.y++;
VXC_OP4_NoDest(img_store_3d, output, coord_out.xywz, dst1,
VXC_MODIFIER(0, 14, 0, VXC_RM_TowardZero, 0));
coord_out.y++;
VXC_OP4_NoDest(img_store_3d, output, coord_out.xywz, dst2,
VXC_MODIFIER(0, 14, 0, VXC_RM_TowardZero, 0));
coord_out.y++;
VXC_DP2x8(data, in2, in2, VXC_MODIFIER(0, 7, 0, VXC_RM_ToNearestEven, 1), uniResize3xUp_l00_2x8);
VXC_DP2x8(dst0, data, multiplier, VXC_MODIFIER(0, 7, 0, VXC_RM_ToNearestEven, 1), uniU8PostProcess_2x8);
VXC_DP2x8(data, in2, in2, VXC_MODIFIER(0, 6, 0, VXC_RM_ToNearestEven, 1), uniResize3xUp_l01_2x8);
VXC_DP2x8(dst0, data, multiplier, VXC_MODIFIER(8, 15, 0, VXC_RM_ToNearestEven, 1), uniU8PostProcess_2x8);
VXC_DP4x4(data, in2, in3, VXC_MODIFIER(0, 3, 0, VXC_RM_ToNearestEven, 1), uniResize3xUp_l10_4x4);
VXC_DP4x4(data, in2, in3, VXC_MODIFIER(4, 7, 0, VXC_RM_ToNearestEven, 1), uniResize3xUp_l11_4x4);
VXC_DP2x8(dst1, data, multiplier, VXC_MODIFIER(0, 7, 0, VXC_RM_ToNearestEven, 1), uniU8PostProcess_2x8);
VXC_DP4x4(data, in2, in3, VXC_MODIFIER(0, 3, 0, VXC_RM_ToNearestEven, 1), uniResize3xUp_l12_4x4);
VXC_DP4x4(data, in2, in3, VXC_MODIFIER(4, 6, 0, VXC_RM_ToNearestEven, 1), uniResize3xUp_l13_4x4);
VXC_DP2x8(dst1, data, multiplier, VXC_MODIFIER(8, 15, 0, VXC_RM_ToNearestEven, 1), uniU8PostProcess_2x8);
VXC_OP4_NoDest(img_store_3d, output, coord_out.xywz, dst0,
VXC_MODIFIER(0, 14, 0, VXC_RM_TowardZero, 0));
coord_out.y++;
VXC_OP4_NoDest(img_store_3d, output, coord_out.xywz, dst1,
VXC_MODIFIER(0, 14, 0, VXC_RM_TowardZero, 0));
}
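// 3x upsample: in = (2 * out - 1) / 6, evaluated in short arithmetic so the
// truncating division matches on both axes; the row weights cycle through
// 0, 1/3 and 2/3, so every third output row (the l00/l01 tables) only
// interpolates horizontally while the l10..l13 tables blend two input rows.
// Stores use modifier range 0..14, i.e. 15 output pixels per write (five
// input pixels upscaled 3x).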

View File

@ -0,0 +1,167 @@
#include "cl_viv_vx_ext.h"
_viv_uniform VXC_512Bits uniU8PostProcess_2x8;
_viv_uniform int2 multAndoutZP; // bits [0:15]: multiplier, bits [32:63]: output zp
_viv_uniform int out_height;
_viv_uniform VXC_512Bits uniResize8xUp_l00_4x8;
_viv_uniform VXC_512Bits uniResize8xUp_l01_4x8;
_viv_uniform VXC_512Bits uniResize8xUp_l10_4x8;
_viv_uniform VXC_512Bits uniResize8xUp_l11_4x8;
_viv_uniform VXC_512Bits uniResize8xUp_l20_4x8;
_viv_uniform VXC_512Bits uniResize8xUp_l21_4x8;
_viv_uniform VXC_512Bits uniResize8xUp_l30_4x8;
_viv_uniform VXC_512Bits uniResize8xUp_l31_4x8;
__kernel void resize_bilinear_U8toU8_8x_upsample_half_pixel_centers
(
__read_only image2d_array_t input,
__write_only image2d_array_t output,
int align_corners,
int half_pixel_centers
)
{
int4 coord_out = (int4)(get_global_id(0), 0, get_global_id(1), 0);
int4 coord_in = (int4)(get_global_id(0), -1, get_global_id(1), 0);
coord_in.x = (coord_out.x * 2 - 7) >> 4;
coord_in.x = coord_out.x == 0 ? -1 : coord_in.x;
vxc_uchar16 in0, in1, in2, dst0, dst1, dst2, dst3;
int8 input_desc;
_viv_asm(COPY, input_desc, input, sizeof(input_desc));
int baseAddr = (int)coord_in.z * input_desc.s4 + input_desc.s0;
_viv_asm(MOV, coord_in.w, baseAddr);
VXC_OP4(img_load_3d, in0, input, coord_in.xywz, VXC_5BITOFFSET_XY(0, 0),
VXC_MODIFIER(0, 15, 0, VXC_RM_TowardZero, 0));
VXC_OP4(img_load_3d, in1, input, coord_in.xywz, VXC_5BITOFFSET_XY(0, 1),
VXC_MODIFIER(0, 15, 0, VXC_RM_TowardZero, 0));
int8 output_desc;
_viv_asm(COPY, output_desc, output, sizeof(output_desc));
baseAddr = (int)coord_out.z * output_desc.s4 + output_desc.s0;
_viv_asm(MOV, coord_out.w, baseAddr);
vxc_ushort8 multiplier;
_viv_asm(COPY, multiplier, multAndoutZP, 16);
vxc_ushort8 tmp;
while (coord_out.y < out_height)
{
VXC_DP4x8(tmp, in0, in1, VXC_MODIFIER(0, 7, 0, VXC_RM_ToNearestEven, 1), uniResize8xUp_l00_4x8);
VXC_DP2x8(dst0, tmp, multiplier, VXC_MODIFIER(0, 7, 0, VXC_RM_ToNearestEven, 1), uniU8PostProcess_2x8);
VXC_DP4x8(tmp, in0, in1, VXC_MODIFIER(0, 7, 0, VXC_RM_ToNearestEven, 1), uniResize8xUp_l01_4x8);
VXC_DP2x8(dst0, tmp, multiplier, VXC_MODIFIER(8, 15, 0, VXC_RM_ToNearestEven, 1), uniU8PostProcess_2x8);
VXC_DP4x8(tmp, in0, in1, VXC_MODIFIER(0, 7, 0, VXC_RM_ToNearestEven, 1), uniResize8xUp_l10_4x8);
VXC_DP2x8(dst1, tmp, multiplier, VXC_MODIFIER(0, 7, 0, VXC_RM_ToNearestEven, 1), uniU8PostProcess_2x8);
VXC_DP4x8(tmp, in0, in1, VXC_MODIFIER(0, 7, 0, VXC_RM_ToNearestEven, 1), uniResize8xUp_l11_4x8);
VXC_DP2x8(dst1, tmp, multiplier, VXC_MODIFIER(8, 15, 0, VXC_RM_ToNearestEven, 1), uniU8PostProcess_2x8);
VXC_DP4x8(tmp, in0, in1, VXC_MODIFIER(0, 7, 0, VXC_RM_ToNearestEven, 1), uniResize8xUp_l20_4x8);
VXC_DP2x8(dst2, tmp, multiplier, VXC_MODIFIER(0, 7, 0, VXC_RM_ToNearestEven, 1), uniU8PostProcess_2x8);
VXC_DP4x8(tmp, in0, in1, VXC_MODIFIER(0, 7, 0, VXC_RM_ToNearestEven, 1), uniResize8xUp_l21_4x8);
VXC_DP2x8(dst2, tmp, multiplier, VXC_MODIFIER(8, 15, 0, VXC_RM_ToNearestEven, 1), uniU8PostProcess_2x8);
VXC_DP4x8(tmp, in0, in1, VXC_MODIFIER(0, 7, 0, VXC_RM_ToNearestEven, 1), uniResize8xUp_l30_4x8);
VXC_DP2x8(dst3, tmp, multiplier, VXC_MODIFIER(0, 7, 0, VXC_RM_ToNearestEven, 1), uniU8PostProcess_2x8);
VXC_DP4x8(tmp, in0, in1, VXC_MODIFIER(0, 7, 0, VXC_RM_ToNearestEven, 1), uniResize8xUp_l31_4x8);
VXC_DP2x8(dst3, tmp, multiplier, VXC_MODIFIER(8, 15, 0, VXC_RM_ToNearestEven, 1), uniU8PostProcess_2x8);
VXC_OP4(img_load_3d, in0, input, coord_in.xywz, VXC_5BITOFFSET_XY(0, 2),
VXC_MODIFIER(0, 15, 0, VXC_RM_TowardZero, 0));
VXC_OP4_NoDest(img_store_3d, output, coord_out.xywz, dst0,
VXC_MODIFIER(0, 15, 0, VXC_RM_TowardZero, 0));
coord_out.y++;
VXC_OP4_NoDest(img_store_3d, output, coord_out.xywz, dst1,
VXC_MODIFIER(0, 15, 0, VXC_RM_TowardZero, 0));
coord_out.y++;
VXC_OP4_NoDest(img_store_3d, output, coord_out.xywz, dst2,
VXC_MODIFIER(0, 15, 0, VXC_RM_TowardZero, 0));
coord_out.y++;
VXC_OP4_NoDest(img_store_3d, output, coord_out.xywz, dst3,
VXC_MODIFIER(0, 15, 0, VXC_RM_TowardZero, 0));
coord_out.y++;
VXC_DP4x8(tmp, in0, in1, VXC_MODIFIER(0, 7, 0, VXC_RM_ToNearestEven, 1), uniResize8xUp_l30_4x8);
VXC_DP2x8(dst0, tmp, multiplier, VXC_MODIFIER(0, 7, 0, VXC_RM_ToNearestEven, 1), uniU8PostProcess_2x8);
VXC_DP4x8(tmp, in0, in1, VXC_MODIFIER(0, 7, 0, VXC_RM_ToNearestEven, 1), uniResize8xUp_l31_4x8);
VXC_DP2x8(dst0, tmp, multiplier, VXC_MODIFIER(8, 15, 0, VXC_RM_ToNearestEven, 1), uniU8PostProcess_2x8);
VXC_DP4x8(tmp, in0, in1, VXC_MODIFIER(0, 7, 0, VXC_RM_ToNearestEven, 1), uniResize8xUp_l20_4x8);
VXC_DP2x8(dst1, tmp, multiplier, VXC_MODIFIER(0, 7, 0, VXC_RM_ToNearestEven, 1), uniU8PostProcess_2x8);
VXC_DP4x8(tmp, in0, in1, VXC_MODIFIER(0, 7, 0, VXC_RM_ToNearestEven, 1), uniResize8xUp_l21_4x8);
VXC_DP2x8(dst1, tmp, multiplier, VXC_MODIFIER(8, 15, 0, VXC_RM_ToNearestEven, 1), uniU8PostProcess_2x8);
VXC_DP4x8(tmp, in0, in1, VXC_MODIFIER(0, 7, 0, VXC_RM_ToNearestEven, 1), uniResize8xUp_l10_4x8);
VXC_DP2x8(dst2, tmp, multiplier, VXC_MODIFIER(0, 7, 0, VXC_RM_ToNearestEven, 1), uniU8PostProcess_2x8);
VXC_DP4x8(tmp, in0, in1, VXC_MODIFIER(0, 7, 0, VXC_RM_ToNearestEven, 1), uniResize8xUp_l11_4x8);
VXC_DP2x8(dst2, tmp, multiplier, VXC_MODIFIER(8, 15, 0, VXC_RM_ToNearestEven, 1), uniU8PostProcess_2x8);
VXC_DP4x8(tmp, in0, in1, VXC_MODIFIER(0, 7, 0, VXC_RM_ToNearestEven, 1), uniResize8xUp_l00_4x8);
VXC_DP2x8(dst3, tmp, multiplier, VXC_MODIFIER(0, 7, 0, VXC_RM_ToNearestEven, 1), uniU8PostProcess_2x8);
VXC_DP4x8(tmp, in0, in1, VXC_MODIFIER(0, 7, 0, VXC_RM_ToNearestEven, 1), uniResize8xUp_l01_4x8);
VXC_DP2x8(dst3, tmp, multiplier, VXC_MODIFIER(8, 15, 0, VXC_RM_ToNearestEven, 1), uniU8PostProcess_2x8);
VXC_OP4_NoDest(img_store_3d, output, coord_out.xywz, dst0,
VXC_MODIFIER(0, 15, 0, VXC_RM_TowardZero, 0));
coord_out.y++;
VXC_OP4_NoDest(img_store_3d, output, coord_out.xywz, dst1,
VXC_MODIFIER(0, 15, 0, VXC_RM_TowardZero, 0));
coord_out.y++;
VXC_OP4_NoDest(img_store_3d, output, coord_out.xywz, dst2,
VXC_MODIFIER(0, 15, 0, VXC_RM_TowardZero, 0));
coord_out.y++;
VXC_OP4_NoDest(img_store_3d, output, coord_out.xywz, dst3,
VXC_MODIFIER(0, 15, 0, VXC_RM_TowardZero, 0));
coord_out.y++;
VXC_DP4x8(tmp, in1, in0, VXC_MODIFIER(0, 7, 0, VXC_RM_ToNearestEven, 1), uniResize8xUp_l00_4x8);
VXC_DP2x8(dst0, tmp, multiplier, VXC_MODIFIER(0, 7, 0, VXC_RM_ToNearestEven, 1), uniU8PostProcess_2x8);
VXC_DP4x8(tmp, in1, in0, VXC_MODIFIER(0, 7, 0, VXC_RM_ToNearestEven, 1), uniResize8xUp_l01_4x8);
VXC_DP2x8(dst0, tmp, multiplier, VXC_MODIFIER(8, 15, 0, VXC_RM_ToNearestEven, 1), uniU8PostProcess_2x8);
VXC_DP4x8(tmp, in1, in0, VXC_MODIFIER(0, 7, 0, VXC_RM_ToNearestEven, 1), uniResize8xUp_l10_4x8);
VXC_DP2x8(dst1, tmp, multiplier, VXC_MODIFIER(0, 7, 0, VXC_RM_ToNearestEven, 1), uniU8PostProcess_2x8);
VXC_DP4x8(tmp, in1, in0, VXC_MODIFIER(0, 7, 0, VXC_RM_ToNearestEven, 1), uniResize8xUp_l11_4x8);
VXC_DP2x8(dst1, tmp, multiplier, VXC_MODIFIER(8, 15, 0, VXC_RM_ToNearestEven, 1), uniU8PostProcess_2x8);
VXC_DP4x8(tmp, in1, in0, VXC_MODIFIER(0, 7, 0, VXC_RM_ToNearestEven, 1), uniResize8xUp_l20_4x8);
VXC_DP2x8(dst2, tmp, multiplier, VXC_MODIFIER(0, 7, 0, VXC_RM_ToNearestEven, 1), uniU8PostProcess_2x8);
VXC_DP4x8(tmp, in1, in0, VXC_MODIFIER(0, 7, 0, VXC_RM_ToNearestEven, 1), uniResize8xUp_l21_4x8);
VXC_DP2x8(dst2, tmp, multiplier, VXC_MODIFIER(8, 15, 0, VXC_RM_ToNearestEven, 1), uniU8PostProcess_2x8);
VXC_DP4x8(tmp, in1, in0, VXC_MODIFIER(0, 7, 0, VXC_RM_ToNearestEven, 1), uniResize8xUp_l30_4x8);
VXC_DP2x8(dst3, tmp, multiplier, VXC_MODIFIER(0, 7, 0, VXC_RM_ToNearestEven, 1), uniU8PostProcess_2x8);
VXC_DP4x8(tmp, in1, in0, VXC_MODIFIER(0, 7, 0, VXC_RM_ToNearestEven, 1), uniResize8xUp_l31_4x8);
VXC_DP2x8(dst3, tmp, multiplier, VXC_MODIFIER(8, 15, 0, VXC_RM_ToNearestEven, 1), uniU8PostProcess_2x8);
VXC_OP4(img_load_3d, in1, input, coord_in.xywz, VXC_5BITOFFSET_XY(0, 3),
VXC_MODIFIER(0, 15, 0, VXC_RM_TowardZero, 0));
coord_in.y += 2;
VXC_OP4_NoDest(img_store_3d, output, coord_out.xywz, dst0,
VXC_MODIFIER(0, 15, 0, VXC_RM_TowardZero, 0));
coord_out.y++;
VXC_OP4_NoDest(img_store_3d, output, coord_out.xywz, dst1,
VXC_MODIFIER(0, 15, 0, VXC_RM_TowardZero, 0));
coord_out.y++;
VXC_OP4_NoDest(img_store_3d, output, coord_out.xywz, dst2,
VXC_MODIFIER(0, 15, 0, VXC_RM_TowardZero, 0));
coord_out.y++;
VXC_OP4_NoDest(img_store_3d, output, coord_out.xywz, dst3,
VXC_MODIFIER(0, 15, 0, VXC_RM_TowardZero, 0));
coord_out.y++;
VXC_DP4x8(tmp, in1, in0, VXC_MODIFIER(0, 7, 0, VXC_RM_ToNearestEven, 1), uniResize8xUp_l30_4x8);
VXC_DP2x8(dst0, tmp, multiplier, VXC_MODIFIER(0, 7, 0, VXC_RM_ToNearestEven, 1), uniU8PostProcess_2x8);
VXC_DP4x8(tmp, in1, in0, VXC_MODIFIER(0, 7, 0, VXC_RM_ToNearestEven, 1), uniResize8xUp_l31_4x8);
VXC_DP2x8(dst0, tmp, multiplier, VXC_MODIFIER(8, 15, 0, VXC_RM_ToNearestEven, 1), uniU8PostProcess_2x8);
VXC_DP4x8(tmp, in1, in0, VXC_MODIFIER(0, 7, 0, VXC_RM_ToNearestEven, 1), uniResize8xUp_l20_4x8);
VXC_DP2x8(dst1, tmp, multiplier, VXC_MODIFIER(0, 7, 0, VXC_RM_ToNearestEven, 1), uniU8PostProcess_2x8);
VXC_DP4x8(tmp, in1, in0, VXC_MODIFIER(0, 7, 0, VXC_RM_ToNearestEven, 1), uniResize8xUp_l21_4x8);
VXC_DP2x8(dst1, tmp, multiplier, VXC_MODIFIER(8, 15, 0, VXC_RM_ToNearestEven, 1), uniU8PostProcess_2x8);
VXC_DP4x8(tmp, in1, in0, VXC_MODIFIER(0, 7, 0, VXC_RM_ToNearestEven, 1), uniResize8xUp_l10_4x8);
VXC_DP2x8(dst2, tmp, multiplier, VXC_MODIFIER(0, 7, 0, VXC_RM_ToNearestEven, 1), uniU8PostProcess_2x8);
VXC_DP4x8(tmp, in1, in0, VXC_MODIFIER(0, 7, 0, VXC_RM_ToNearestEven, 1), uniResize8xUp_l11_4x8);
VXC_DP2x8(dst2, tmp, multiplier, VXC_MODIFIER(8, 15, 0, VXC_RM_ToNearestEven, 1), uniU8PostProcess_2x8);
VXC_DP4x8(tmp, in1, in0, VXC_MODIFIER(0, 7, 0, VXC_RM_ToNearestEven, 1), uniResize8xUp_l00_4x8);
VXC_DP2x8(dst3, tmp, multiplier, VXC_MODIFIER(0, 7, 0, VXC_RM_ToNearestEven, 1), uniU8PostProcess_2x8);
VXC_DP4x8(tmp, in1, in0, VXC_MODIFIER(0, 7, 0, VXC_RM_ToNearestEven, 1), uniResize8xUp_l01_4x8);
VXC_DP2x8(dst3, tmp, multiplier, VXC_MODIFIER(8, 15, 0, VXC_RM_ToNearestEven, 1), uniU8PostProcess_2x8);
VXC_OP4_NoDest(img_store_3d, output, coord_out.xywz, dst0,
VXC_MODIFIER(0, 15, 0, VXC_RM_TowardZero, 0));
coord_out.y++;
VXC_OP4_NoDest(img_store_3d, output, coord_out.xywz, dst1,
VXC_MODIFIER(0, 15, 0, VXC_RM_TowardZero, 0));
coord_out.y++;
VXC_OP4_NoDest(img_store_3d, output, coord_out.xywz, dst2,
VXC_MODIFIER(0, 15, 0, VXC_RM_TowardZero, 0));
coord_out.y++;
VXC_OP4_NoDest(img_store_3d, output, coord_out.xywz, dst3,
VXC_MODIFIER(0, 15, 0, VXC_RM_TowardZero, 0));
coord_out.y++;
}
}
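// 8x upsample: in_y = (2 * out_y - 7) / 16, giving fractional weights at the
// odd sixteenths 1/16 .. 15/16; one loop iteration advances two input rows
// and stores sixteen output rows (four dst0..dst3 batches), again mirroring
// the weight tables by swapping in0/in1.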

View File

@ -0,0 +1,303 @@
#include "cl_viv_vx_ext.h"
_viv_uniform float scale0;
_viv_uniform float scale1;
_viv_uniform float output_zp;
_viv_uniform int half_head_size;
_viv_uniform VXC_512Bits uniATimesB_0_4x4;
_viv_uniform VXC_512Bits uniATimesB_1_4x4;
_viv_uniform VXC_512Bits uniExtract8Data_2x8;
#define ROPE_BNHS_SYMM(name, src_type, src1_type, copy_type, dst_type) \
__kernel void rope_##name##_bnhs \
( \
__read_only image2d_array_t input, \
__read_only image2d_array_t cos_cache, \
__read_only image2d_array_t sin_cache, \
__write_only image2d_array_t output, \
int axis \
) \
{ \
int4 coord_in = (int4)(get_global_id(0), get_global_id(1), get_global_id(2), get_global_id(2)); \
int4 coord_out = coord_in; \
\
int8 input_desc; \
_viv_asm(COPY, input_desc, input, sizeof(input_desc)); \
int baseAddr = (int)coord_in.z * input_desc.s4 + input_desc.s0; \
_viv_asm(MOV, coord_in.w, baseAddr); \
\
src_type data0, data1; \
src1_type cos, sin; \
copy_type v0, v1; \
dst_type dst; \
VXC_OP4(img_load_3d, data0, input, coord_in.xywz, 0, \
VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); \
VXC_ReadImage(v0, cos_cache, coord_in.xy, 0, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); \
_viv_asm(COPY, cos, v0, 16); \
VXC_ReadImage(v1, sin_cache, coord_in.xy, 0, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); \
_viv_asm(COPY, sin, v1, 16); \
coord_in.y += half_head_size; \
VXC_OP4(img_load_3d, data1, input, coord_in.xywz, 0, \
VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); \
\
int8 output_desc; \
_viv_asm(COPY, output_desc, output, sizeof(output_desc)); \
baseAddr = (int)coord_out.z * output_desc.s4 + output_desc.s0; \
_viv_asm(MOV, coord_out.w, baseAddr); \
\
float4 data2, data3, data4, data5; \
VXC_DP4x4(data2, data0, cos, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniATimesB_0_4x4); \
VXC_DP4x4(data3, data0, cos, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniATimesB_1_4x4); \
VXC_DP4x4(data4, data1, sin, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniATimesB_0_4x4); \
VXC_DP4x4(data5, data1, sin, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniATimesB_1_4x4); \
data2 = data2 * scale0 - data4 * scale1 + output_zp; \
data3 = data3 * scale0 - data5 * scale1 + output_zp; \
\
int4 dst0 = convert_int4_rte(data2); \
int4 dst1 = convert_int4_rte(data3); \
\
VXC_DP2x8(dst, dst0, dst1, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 1), uniExtract8Data_2x8); \
VXC_OP4_NoDest(img_store_3d, output, coord_out.xywz, \
dst, VXC_MODIFIER(0, 7, 0,VXC_RM_TowardZero, 0)); \
\
VXC_DP4x4(data2, data0, sin, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniATimesB_0_4x4); \
VXC_DP4x4(data3, data0, sin, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniATimesB_1_4x4); \
VXC_DP4x4(data4, data1, cos, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniATimesB_0_4x4); \
VXC_DP4x4(data5, data1, cos, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniATimesB_1_4x4); \
data2 = data2 * scale1 + data4 * scale0 + output_zp; \
data3 = data3 * scale1 + data5 * scale0 + output_zp; \
\
dst0 = convert_int4_rte(data2); \
dst1 = convert_int4_rte(data3); \
\
VXC_DP2x8(dst, dst0, dst1, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 1), uniExtract8Data_2x8); \
coord_out.y += half_head_size; \
VXC_OP4_NoDest(img_store_3d, output, coord_out.xywz, \
dst, VXC_MODIFIER(0, 7, 0,VXC_RM_TowardZero, 0)); \
}
ROPE_BNHS_SYMM(I16_I16toI16, vxc_short8, vxc_short8, vxc_short8, vxc_short8)
ROPE_BNHS_SYMM(I16_I16toI8, vxc_short8, vxc_short8, vxc_short8, vxc_char8)
ROPE_BNHS_SYMM(I16_I16toU8, vxc_short8, vxc_short8, vxc_short8, vxc_uchar8)
ROPE_BNHS_SYMM(I16_F16toI16, vxc_short8, vxc_half8, vxc_short8, vxc_short8)
ROPE_BNHS_SYMM(I16_F16toI8, vxc_short8, vxc_half8, vxc_short8, vxc_char8)
ROPE_BNHS_SYMM(I16_F16toU8, vxc_short8, vxc_half8, vxc_short8, vxc_uchar8)
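// Rotary position embedding on the bnhs layout: the head dimension is split
// at half_head_size into (x1, x2) and the kernels write the rotation
//   out1 = x1 * cos - x2 * sin
//   out2 = x1 * sin + x2 * cos
// where scale0/scale1 presumably fold the quantization scales of the input
// with the cos and sin caches, and output_zp re-centers the requantized
// result. The F16 variant below applies the same rotation in half precision.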
__kernel void rope_F16_F16toF16_bnhs
(
__read_only image2d_array_t input,
__read_only image2d_array_t cos_cache,
__read_only image2d_array_t sin_cache,
__write_only image2d_array_t output,
int axis
)
{
int4 coord_in = (int4)(get_global_id(0), get_global_id(1), get_global_id(2), get_global_id(2));
int4 coord_out = coord_in;
int8 input_desc;
_viv_asm(COPY, input_desc, input, sizeof(input_desc));
int baseAddr = (int)coord_in.z * input_desc.s4 + input_desc.s0;
_viv_asm(MOV, coord_in.w, baseAddr);
vxc_short8 v0, v1, v2, v3, dst;
vxc_half8 data0, data1, cos, sin, dst2;
VXC_OP4(img_load_3d, v0, input, coord_in.xywz, 0,
VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0));
_viv_asm(COPY, data0, v0, 16);
VXC_ReadImage(v1, cos_cache, coord_in.xy, 0, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0));
_viv_asm(COPY, cos, v1, 16);
VXC_ReadImage(v2, sin_cache, coord_in.xy, 0, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0));
_viv_asm(COPY, sin, v2, 16);
coord_in.y += half_head_size;
VXC_OP4(img_load_3d, v3, input, coord_in.xywz, 0,
VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0));
_viv_asm(COPY, data1, v3, 16);
int8 output_desc;
_viv_asm(COPY, output_desc, output, sizeof(output_desc));
baseAddr = (int)coord_out.z * output_desc.s4 + output_desc.s0;
_viv_asm(MOV, coord_out.w, baseAddr);
float4 data2, data3, data4, data5;
VXC_DP4x4(data2, data0, cos, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniATimesB_0_4x4);
VXC_DP4x4(data3, data0, cos, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniATimesB_1_4x4);
VXC_DP4x4(data4, data1, sin, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniATimesB_0_4x4);
VXC_DP4x4(data5, data1, sin, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniATimesB_1_4x4);
data2 = data2 - data4;
data3 = data3 - data5;
half4 dst0;
half4 dst1;
_viv_asm(CONV_RTE, dst0, data2);
_viv_asm(CONV_RTE, dst1, data3);
VXC_DP2x8(dst2, dst0, dst1, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 1), uniExtract8Data_2x8);
_viv_asm(COPY, dst, dst2, 16);
VXC_OP4_NoDest(img_store_3d, output, coord_out.xywz, dst, VXC_MODIFIER(0, 7, 0,VXC_RM_TowardZero, 0));
VXC_DP4x4(data2, data0, sin, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniATimesB_0_4x4);
VXC_DP4x4(data3, data0, sin, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniATimesB_1_4x4);
VXC_DP4x4(data4, data1, cos, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniATimesB_0_4x4);
VXC_DP4x4(data5, data1, cos, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniATimesB_1_4x4);
data2 = data2 + data4;
data3 = data3 + data5;
_viv_asm(CONV_RTE, dst0, data2);
_viv_asm(CONV_RTE, dst1, data3);
VXC_DP2x8(dst2, dst0, dst1, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 1), uniExtract8Data_2x8);
_viv_asm(COPY, dst, dst2, 16);
coord_out.y += half_head_size;
VXC_OP4_NoDest(img_store_3d, output, coord_out.xywz, dst, VXC_MODIFIER(0, 7, 0,VXC_RM_TowardZero, 0));
}
_viv_uniform int in0_zp;
_viv_uniform int cos_zp;
_viv_uniform int sin_zp;
_viv_uniform VXC_512Bits uniAMinusZp_0_4x4;
_viv_uniform VXC_512Bits uniAMinusZp_1_4x4;
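// Asymmetric-quantized bnhs variant: uniAMinusZp_0/1_4x4 subtract the per-tensor
// zero points (in0_zp, cos_zp, sin_zp) to dequantize before rotating; the scales
// are assumed to be folded into scale0/scale1 as in the symmetric kernels above.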
#define ROPE_ASYM_BNHS(name, src1_type, copy_type, dtype) \
__kernel void rope_##name##_bnhs \
( \
__read_only image2d_array_t input, \
__read_only image2d_array_t cos_cache, \
__read_only image2d_array_t sin_cache, \
__write_only image2d_array_t output, \
int axis \
) \
{ \
int4 coord_in = (int4)(get_global_id(0), get_global_id(1), get_global_id(2), get_global_id(2)); \
int4 coord_out = coord_in; \
\
int8 input_desc; \
_viv_asm(COPY, input_desc, input, sizeof(input_desc)); \
int baseAddr = (int)coord_in.z * input_desc.s4 + input_desc.s0; \
_viv_asm(MOV, coord_in.w, baseAddr); \
\
dtype data0, data1, dst; \
src1_type cos, sin; \
copy_type v0, v1; \
VXC_OP4(img_load_3d, data0, input, coord_in.xywz, 0, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); \
VXC_ReadImage(v0, cos_cache, coord_in.xy, 0, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); \
_viv_asm(COPY, cos, v0, 16); \
VXC_ReadImage(v1, sin_cache, coord_in.xy, 0, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); \
_viv_asm(COPY, sin, v1, 16); \
coord_in.y += half_head_size; \
VXC_OP4(img_load_3d, data1, input, coord_in.xywz, 0, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); \
\
int8 output_desc; \
_viv_asm(COPY, output_desc, output, sizeof(output_desc)); \
baseAddr = (int)coord_out.z * output_desc.s4 + output_desc.s0; \
_viv_asm(MOV, coord_out.w, baseAddr); \
\
float4 l00, l01, cos0, cos1; \
float4 l10, l11, sin0, sin1; \
VXC_DP4x4(l00, data0, in0_zp, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniAMinusZp_0_4x4); \
VXC_DP4x4(l01, data0, in0_zp, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniAMinusZp_1_4x4); \
VXC_DP4x4(cos0, cos, cos_zp, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniAMinusZp_0_4x4); \
VXC_DP4x4(cos1, cos, cos_zp, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniAMinusZp_1_4x4); \
VXC_DP4x4(sin0, sin, sin_zp, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniAMinusZp_0_4x4); \
VXC_DP4x4(sin1, sin, sin_zp, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniAMinusZp_1_4x4); \
VXC_DP4x4(l10, data1, in0_zp, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniAMinusZp_0_4x4); \
VXC_DP4x4(l11, data1, in0_zp, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniAMinusZp_1_4x4); \
float4 data2 = l00 * cos0 * scale0 - l10 * sin0 * scale1 + output_zp; \
float4 data3 = l01 * cos1 * scale0 - l11 * sin1 * scale1 + output_zp; \
\
int4 dst0 = convert_int4_rte(data2); \
int4 dst1 = convert_int4_rte(data3); \
\
VXC_DP2x8(dst, dst0, dst1, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 1), uniExtract8Data_2x8); \
VXC_OP4_NoDest(img_store_3d, output, \
coord_out.xywz, dst, VXC_MODIFIER(0, 7, 0,VXC_RM_TowardZero, 0)); \
\
data2 = l00 * sin0 * scale1 + l10 * cos0 * scale0 + output_zp; \
data3 = l01 * sin1 * scale1 + l11 * cos1 * scale0 + output_zp; \
\
dst0 = convert_int4_rte(data2); \
dst1 = convert_int4_rte(data3); \
\
VXC_DP2x8(dst, dst0, dst1, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 1), uniExtract8Data_2x8); \
coord_out.y += half_head_size; \
VXC_OP4_NoDest(img_store_3d, output, \
coord_out.xywz, dst, VXC_MODIFIER(0, 7, 0,VXC_RM_TowardZero, 0)); \
}
ROPE_ASYM_BNHS(I8_I8toI8, vxc_char8, vxc_char8, vxc_char8)
ROPE_ASYM_BNHS(U8_U8toU8, vxc_uchar8, vxc_uchar8, vxc_uchar8)
ROPE_ASYM_BNHS(U16_U16toU16, vxc_ushort8, vxc_ushort8, vxc_ushort8)
ROPE_ASYM_BNHS(I8_F16toI8, vxc_half8, vxc_short8, vxc_char8)
ROPE_ASYM_BNHS(U8_F16toU8, vxc_half8, vxc_short8, vxc_uchar8)
ROPE_ASYM_BNHS(U16_F16toU16, vxc_half8, vxc_short8, vxc_ushort8)
_viv_uniform VXC_512Bits uniConvBF16toF32_Part0_2x8;
_viv_uniform VXC_512Bits uniConvBF16toF32_Part1_2x8;
_viv_uniform VXC_512Bits uniExtractOddData_2x8;
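// BF16 bnhs variant: each bf16 value is widened to f32 by placing it in the high
// 16 bits of a 32-bit word (uniConvBF16toF32_Part0/1_2x8), the rotation runs in
// float, and results are truncated back to bf16 by keeping the high halves
// (uniExtractOddData_2x8).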
__kernel void rope_BF16_BF16toBF16_bnhs
(
__read_only image2d_array_t input,
__read_only image2d_array_t cos_cache,
__read_only image2d_array_t sin_cache,
__write_only image2d_array_t output,
int axis
)
{
int4 coord_in = (int4)(get_global_id(0), get_global_id(1), get_global_id(2), get_global_id(2));
int4 coord_out = coord_in;
int8 input_desc;
_viv_asm(COPY, input_desc, input, sizeof(input_desc));
int baseAddr = (int)coord_in.z * input_desc.s4 + input_desc.s0;
_viv_asm(MOV, coord_in.w, baseAddr);
vxc_ushort8 v0, v1, v2, v3, dst;
VXC_OP4(img_load_3d, v0, input, coord_in.xywz, 0,
VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0));
VXC_ReadImage(v1, cos_cache, coord_in.xy, 0, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0));
VXC_ReadImage(v2, sin_cache, coord_in.xy, 0, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0));
coord_in.y += half_head_size;
VXC_OP4(img_load_3d, v3, input, coord_in.xywz, 0,
VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0));
int8 output_desc;
_viv_asm(COPY, output_desc, output, sizeof(output_desc));
baseAddr = (int)coord_out.z * output_desc.s4 + output_desc.s0;
_viv_asm(MOV, coord_out.w, baseAddr);
float4 src0, src1, src2, src3, cos0, cos1, sin0, sin1;
vxc_short8 zero = (vxc_short8)(0, 0, 0, 0, 0, 0, 0, 0);
vxc_short8 data;
VXC_DP2x8(data, v0, zero, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0), uniConvBF16toF32_Part0_2x8);
_viv_asm(COPY, src0, data, 16);
VXC_DP2x8(data, v0, zero, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0), uniConvBF16toF32_Part1_2x8);
_viv_asm(COPY, src1, data, 16);
VXC_DP2x8(data, v1, zero, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0), uniConvBF16toF32_Part0_2x8);
_viv_asm(COPY, cos0, data, 16);
VXC_DP2x8(data, v1, zero, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0), uniConvBF16toF32_Part1_2x8);
_viv_asm(COPY, cos1, data, 16);
VXC_DP2x8(data, v2, zero, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0), uniConvBF16toF32_Part0_2x8);
_viv_asm(COPY, sin0, data, 16);
VXC_DP2x8(data, v2, zero, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0), uniConvBF16toF32_Part1_2x8);
_viv_asm(COPY, sin1, data, 16);
VXC_DP2x8(data, v3, zero, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0), uniConvBF16toF32_Part0_2x8);
_viv_asm(COPY, src2, data, 16);
VXC_DP2x8(data, v3, zero, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0), uniConvBF16toF32_Part1_2x8);
_viv_asm(COPY, src3, data, 16);
float4 data0 = src0 * cos0 - src2 * sin0;
float4 data1 = src1 * cos1 - src3 * sin1;
_viv_asm(COPY, v0, data0, 16);
_viv_asm(COPY, v1, data1, 16);
VXC_DP2x8(dst, v0, v1, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0), uniExtractOddData_2x8);
VXC_OP4_NoDest(img_store_3d, output, coord_out.xywz, dst, VXC_MODIFIER(0, 7, 0,VXC_RM_TowardZero, 0));
data0 = src0 * sin0 + src2 * cos0;
data1 = src1 * sin1 + src3 * cos1;
_viv_asm(COPY, v0, data0, 16);
_viv_asm(COPY, v1, data1, 16);
VXC_DP2x8(dst, v0, v1, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0), uniExtractOddData_2x8);
coord_out.y += half_head_size;
VXC_OP4_NoDest(img_store_3d, output, coord_out.xywz, dst, VXC_MODIFIER(0, 7, 0,VXC_RM_TowardZero, 0));
}
@ -0,0 +1,245 @@
#include "cl_viv_vx_ext.h"
_viv_uniform float scale0;
_viv_uniform float scale1;
_viv_uniform float output_zp;
_viv_uniform int half_head_size;
_viv_uniform VXC_512Bits uniATimesB_0_4x4;
_viv_uniform VXC_512Bits uniATimesB_1_4x4;
_viv_uniform VXC_512Bits uniExtract8Data_2x8;
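// bnh1 layout: each dispatch works on a single 2D slice, so plain
// VXC_ReadImage/VXC_WriteImage replace the 3D image loads. The rotation pair is
// again split at half_head_size, this time along x, and the second half is
// written back at coord.x + half_head_size.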
#define ROPE_BNH1_SYMM(name, src_type, src1_type, copy_type, dst_type) \
__kernel void rope_##name##_bnh1 \
( \
__read_only image2d_array_t input, \
__read_only image2d_array_t cos_cache, \
__read_only image2d_array_t sin_cache, \
__write_only image2d_array_t output, \
int axis \
) \
{ \
int4 coord = (int4)(get_global_id(0), get_global_id(1), get_global_id(0), get_global_id(1)); \
\
src_type data0, data1; \
src1_type cos, sin; \
copy_type v0, v1; \
VXC_ReadImage(data0, input, coord.xy, 0, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); \
VXC_ReadImage(v0, cos_cache, coord.xy, 0, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); \
_viv_asm(COPY, cos, v0, 16); \
VXC_ReadImage(v1, sin_cache, coord.xy, 0, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); \
_viv_asm(COPY, sin, v1, 16); \
coord.x += half_head_size; \
VXC_ReadImage(data1, input, coord.xy, 0, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); \
\
float4 data2, data3, data4, data5; \
VXC_DP4x4(data2, data0, cos, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniATimesB_0_4x4); \
VXC_DP4x4(data3, data0, cos, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniATimesB_1_4x4); \
VXC_DP4x4(data4, data1, sin, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniATimesB_0_4x4); \
VXC_DP4x4(data5, data1, sin, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniATimesB_1_4x4); \
data2 = data2 * scale0 - data4 * scale1 + output_zp; \
data3 = data3 * scale0 - data5 * scale1 + output_zp; \
\
int4 dst0 = convert_int4_rte(data2); \
int4 dst1 = convert_int4_rte(data3); \
\
dst_type dst; \
VXC_DP2x8(dst, dst0, dst1, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 1), uniExtract8Data_2x8); \
VXC_WriteImage(output, coord.zw, dst, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); \
\
VXC_DP4x4(data2, data0, sin, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniATimesB_0_4x4); \
VXC_DP4x4(data3, data0, sin, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniATimesB_1_4x4); \
VXC_DP4x4(data4, data1, cos, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniATimesB_0_4x4); \
VXC_DP4x4(data5, data1, cos, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniATimesB_1_4x4); \
data2 = data2 * scale1 + data4 * scale0 + output_zp; \
data3 = data3 * scale1 + data5 * scale0 + output_zp; \
\
dst0 = convert_int4_rte(data2); \
dst1 = convert_int4_rte(data3); \
\
VXC_DP2x8(dst, dst0, dst1, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 1), uniExtract8Data_2x8); \
VXC_WriteImage(output, coord.xw, dst, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); \
}
ROPE_BNH1_SYMM(I16_I16toI16, vxc_short8, vxc_short8, vxc_short8, vxc_short8)
ROPE_BNH1_SYMM(I16_I16toI8, vxc_short8, vxc_short8, vxc_short8, vxc_char8)
ROPE_BNH1_SYMM(I16_I16toU8, vxc_short8, vxc_short8, vxc_short8, vxc_uchar8)
ROPE_BNH1_SYMM(I16_F16toI16, vxc_short8, vxc_half8, vxc_short8, vxc_short8)
ROPE_BNH1_SYMM(I16_F16toI8, vxc_short8, vxc_half8, vxc_short8, vxc_char8)
ROPE_BNH1_SYMM(I16_F16toU8, vxc_short8, vxc_half8, vxc_short8, vxc_uchar8)
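// F16 bnh1 variant: the same rotation without quantization scales or zero point.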
__kernel void rope_F16_F16toF16_bnh1
(
__read_only image2d_array_t input,
__read_only image2d_array_t cos_cache,
__read_only image2d_array_t sin_cache,
__write_only image2d_array_t output,
int axis
)
{
int4 coord = (int4)(get_global_id(0), get_global_id(1), get_global_id(0), get_global_id(1));
vxc_short8 v0, v1, v2, v3, dst;
vxc_half8 data0, data1, cos, sin, dst2;
VXC_ReadImage(v0, input, coord.xy, 0, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0));
_viv_asm(COPY, data0, v0, 16);
VXC_ReadImage(v1, cos_cache, coord.xy, 0, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0));
_viv_asm(COPY, cos, v1, 16);
VXC_ReadImage(v2, sin_cache, coord.xy, 0, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0));
_viv_asm(COPY, sin, v2, 16);
coord.x += half_head_size;
VXC_ReadImage(v3, input, coord.xy, 0, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0));
_viv_asm(COPY, data1, v3, 16);
float4 data2, data3, data4, data5;
VXC_DP4x4(data2, data0, cos, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniATimesB_0_4x4);
VXC_DP4x4(data3, data0, cos, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniATimesB_1_4x4);
VXC_DP4x4(data4, data1, sin, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniATimesB_0_4x4);
VXC_DP4x4(data5, data1, sin, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniATimesB_1_4x4);
data2 = data2 - data4;
data3 = data3 - data5;
half4 dst0;
half4 dst1;
_viv_asm(CONV_RTE, dst0, data2);
_viv_asm(CONV_RTE, dst1, data3);
VXC_DP2x8(dst2, dst0, dst1, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 1), uniExtract8Data_2x8);
_viv_asm(COPY, dst, dst2, 16);
VXC_WriteImage(output, coord.zw, dst, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0));
VXC_DP4x4(data2, data0, sin, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniATimesB_0_4x4);
VXC_DP4x4(data3, data0, sin, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniATimesB_1_4x4);
VXC_DP4x4(data4, data1, cos, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniATimesB_0_4x4);
VXC_DP4x4(data5, data1, cos, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniATimesB_1_4x4);
data2 = data2 + data4;
data3 = data3 + data5;
_viv_asm(CONV_RTE, dst0, data2);
_viv_asm(CONV_RTE, dst1, data3);
VXC_DP2x8(dst2, dst0, dst1, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 1), uniExtract8Data_2x8);
_viv_asm(COPY, dst, dst2, 16);
VXC_WriteImage(output, coord.xw, dst, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0));
}
_viv_uniform int in0_zp;
_viv_uniform int cos_zp;
_viv_uniform int sin_zp;
_viv_uniform VXC_512Bits uniAMinusZp_0_4x4;
_viv_uniform VXC_512Bits uniAMinusZp_1_4x4;
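// Asymmetric-quantized bnh1 variant: zero points are subtracted via the
// uniAMinusZp_* uniforms before rotating, mirroring the asymmetric bnhs kernels.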
#define ROPE_ASYM_BNH1(name, src1_type, copy_type, dtype) \
__kernel void rope_##name##_bnh1 \
( \
__read_only image2d_array_t input, \
__read_only image2d_array_t cos_cache, \
__read_only image2d_array_t sin_cache, \
__write_only image2d_array_t output, \
int axis \
) \
{ \
int4 coord = (int4)(get_global_id(0), get_global_id(1), get_global_id(0), get_global_id(1)); \
\
dtype data0, data1, dst; \
src1_type cos, sin; \
copy_type v0, v1; \
VXC_ReadImage(data0, input, coord.xy, 0, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); \
VXC_ReadImage(v0, cos_cache, coord.xy, 0, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); \
_viv_asm(COPY, cos, v0, 16); \
VXC_ReadImage(v1, sin_cache, coord.xy, 0, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); \
_viv_asm(COPY, sin, v1, 16); \
coord.x += half_head_size; \
VXC_ReadImage(data1, input, coord.xy, 0, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); \
\
float4 l00, l01, cos0, cos1; \
float4 l10, l11, sin0, sin1; \
VXC_DP4x4(l00, data0, in0_zp, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniAMinusZp_0_4x4); \
VXC_DP4x4(l01, data0, in0_zp, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniAMinusZp_1_4x4); \
VXC_DP4x4(cos0, cos, cos_zp, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniAMinusZp_0_4x4); \
VXC_DP4x4(cos1, cos, cos_zp, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniAMinusZp_1_4x4); \
VXC_DP4x4(sin0, sin, sin_zp, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniAMinusZp_0_4x4); \
VXC_DP4x4(sin1, sin, sin_zp, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniAMinusZp_1_4x4); \
VXC_DP4x4(l10, data1, in0_zp, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniAMinusZp_0_4x4); \
VXC_DP4x4(l11, data1, in0_zp, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniAMinusZp_1_4x4); \
float4 data2 = l00 * cos0 * scale0 - l10 * sin0 * scale1 + output_zp; \
float4 data3 = l01 * cos1 * scale0 - l11 * sin1 * scale1 + output_zp; \
\
int4 dst0 = convert_int4_rte(data2); \
int4 dst1 = convert_int4_rte(data3); \
\
VXC_DP2x8(dst, dst0, dst1, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 1), uniExtract8Data_2x8); \
VXC_WriteImage(output, coord.zw, dst, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); \
\
data2 = l00 * sin0 * scale1 + l10 * cos0 * scale0 + output_zp; \
data3 = l01 * sin1 * scale1 + l11 * cos1 * scale0 + output_zp; \
\
dst0 = convert_int4_rte(data2); \
dst1 = convert_int4_rte(data3); \
\
VXC_DP2x8(dst, dst0, dst1, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 1), uniExtract8Data_2x8); \
VXC_WriteImage(output, coord.xw, dst, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); \
}
ROPE_ASYM_BNH1(I8_I8toI8, vxc_char8, vxc_char8, vxc_char8)
ROPE_ASYM_BNH1(U8_U8toU8, vxc_uchar8, vxc_uchar8, vxc_uchar8)
ROPE_ASYM_BNH1(U16_U16toU16, vxc_ushort8, vxc_ushort8, vxc_ushort8)
ROPE_ASYM_BNH1(I8_F16toI8, vxc_half8, vxc_short8, vxc_char8)
ROPE_ASYM_BNH1(U8_F16toU8, vxc_half8, vxc_short8, vxc_uchar8)
ROPE_ASYM_BNH1(U16_F16toU16, vxc_half8, vxc_short8, vxc_ushort8)
_viv_uniform VXC_512Bits uniConvBF16toF32_Part0_2x8;
_viv_uniform VXC_512Bits uniConvBF16toF32_Part1_2x8;
_viv_uniform VXC_512Bits uniExtractOddData_2x8;
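// BF16 bnh1 variant: widen to f32, rotate, then truncate back to bf16 exactly as
// in the bnhs kernel.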
__kernel void rope_BF16_BF16toBF16_bnh1
(
__read_only image2d_array_t input,
__read_only image2d_array_t cos_cache,
__read_only image2d_array_t sin_cache,
__write_only image2d_array_t output,
int axis
)
{
int4 coord = (int4)(get_global_id(0), get_global_id(1), get_global_id(0), get_global_id(1));
vxc_ushort8 v0, v1, v2, v3, dst;
VXC_ReadImage(v0, input, coord.xy, 0, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0));
VXC_ReadImage(v1, cos_cache, coord.xy, 0, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0));
VXC_ReadImage(v2, sin_cache, coord.xy, 0, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0));
coord.x += half_head_size;
VXC_ReadImage(v3, input, coord.xy, 0, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0));
float4 src0, src1, src2, src3, cos0, cos1, sin0, sin1;
vxc_short8 zero = (vxc_short8)(0, 0, 0, 0, 0, 0, 0, 0);
vxc_short8 data;
VXC_DP2x8(data, v0, zero, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0), uniConvBF16toF32_Part0_2x8);
_viv_asm(COPY, src0, data, 16);
VXC_DP2x8(data, v0, zero, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0), uniConvBF16toF32_Part1_2x8);
_viv_asm(COPY, src1, data, 16);
VXC_DP2x8(data, v1, zero, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0), uniConvBF16toF32_Part0_2x8);
_viv_asm(COPY, cos0, data, 16);
VXC_DP2x8(data, v1, zero, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0), uniConvBF16toF32_Part1_2x8);
_viv_asm(COPY, cos1, data, 16);
VXC_DP2x8(data, v2, zero, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0), uniConvBF16toF32_Part0_2x8);
_viv_asm(COPY, sin0, data, 16);
VXC_DP2x8(data, v2, zero, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0), uniConvBF16toF32_Part1_2x8);
_viv_asm(COPY, sin1, data, 16);
VXC_DP2x8(data, v3, zero, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0), uniConvBF16toF32_Part0_2x8);
_viv_asm(COPY, src2, data, 16);
VXC_DP2x8(data, v3, zero, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0), uniConvBF16toF32_Part1_2x8);
_viv_asm(COPY, src3, data, 16);
float4 data0 = src0 * cos0 - src2 * sin0;
float4 data1 = src1 * cos1 - src3 * sin1;
_viv_asm(COPY, v0, data0, 16);
_viv_asm(COPY, v1, data1, 16);
VXC_DP2x8(dst, v0, v1, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0), uniExtractOddData_2x8);
VXC_WriteImage(output, coord.zw, dst, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0));
data0 = src0 * sin0 + src2 * cos0;
data1 = src1 * sin1 + src3 * cos1;
_viv_asm(COPY, v0, data0, 16);
_viv_asm(COPY, v1, data1, 16);
VXC_DP2x8(dst, v0, v1, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0), uniExtractOddData_2x8);
VXC_WriteImage(output, coord.xw, dst, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0));
}
@ -0,0 +1,312 @@
#include "cl_viv_vx_ext.h"
_viv_uniform float scale0;
_viv_uniform float scale1;
_viv_uniform float output_zp;
_viv_uniform VXC_512Bits uniAEvenTimesB_0_4x4;
_viv_uniform VXC_512Bits uniAEvenTimesB_1_4x4;
_viv_uniform VXC_512Bits uniAOddTimesB_0_4x4;
_viv_uniform VXC_512Bits uniAOddTimesB_1_4x4;
_viv_uniform VXC_512Bits uniExtract8Data_2x8;
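// bsnh layout keeps each rotation pair interleaved (even/odd neighbours), so
// uniAEvenTimesB_* and uniAOddTimesB_* select the even and odd lanes:
// out_even = x_even*cos - x_odd*sin and out_odd = x_even*sin + x_odd*cos.
// coord_in.x is doubled because every work-item covers 16 interleaved elements,
// loaded as two 8-lane vectors at x and x+8.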
#define ROPE_BSNH_SYMM(name, src_type, src1_type, copy_type, dst_type) \
__kernel void rope_##name##_bsnh \
( \
__read_only image2d_array_t input, \
__read_only image2d_array_t cos_cache, \
__read_only image2d_array_t sin_cache, \
__write_only image2d_array_t output, \
int axis \
) \
{ \
int4 coord_in = (int4)(get_global_id(0), get_global_id(1), get_global_id(2), get_global_id(2)); \
\
src_type data0, data1; \
src1_type cos, sin; \
copy_type v0, v1; \
dst_type dst; \
VXC_ReadImage(v0, cos_cache, coord_in.xw, 0, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); \
_viv_asm(COPY, cos, v0, 16); \
VXC_ReadImage(v1, sin_cache, coord_in.xw, 0, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); \
_viv_asm(COPY, sin, v1, 16); \
\
coord_in.x *= 2; \
int8 input_desc; \
_viv_asm(COPY, input_desc, input, sizeof(input_desc)); \
int baseAddr = (int)coord_in.z * input_desc.s4 + input_desc.s0; \
_viv_asm(MOV, coord_in.w, baseAddr); \
VXC_OP4(img_load_3d, data0, input, coord_in.xywz, 0, \
VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); \
VXC_OP4(img_load_3d, data1, input, coord_in.xywz, VXC_5BITOFFSET_XY(8, 0), \
VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); \
\
int4 coord_out = coord_in; \
int8 output_desc; \
_viv_asm(COPY, output_desc, output, sizeof(output_desc)); \
baseAddr = (int)coord_out.z * output_desc.s4 + output_desc.s0; \
_viv_asm(MOV, coord_out.w, baseAddr); \
\
float4 data2, data3, data4, data5; \
VXC_DP4x4(data2, data0, cos, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniAEvenTimesB_0_4x4); \
VXC_DP4x4(data3, data0, sin, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniAEvenTimesB_0_4x4); \
VXC_DP4x4(data4, data0, sin, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniAOddTimesB_0_4x4); \
VXC_DP4x4(data5, data0, cos, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniAOddTimesB_0_4x4); \
data2 = data2 * scale0 - data4 * scale1 + output_zp; \
data3 = data3 * scale1 + data5 * scale0 + output_zp; \
\
int4 dst0 = convert_int4_rte(data2); \
int4 dst1 = convert_int4_rte(data3); \
\
VXC_DP2x8(dst, dst0, dst1, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 1), uniExtract8Data_2x8); \
VXC_OP4_NoDest(img_store_3d, output, coord_out.xywz, \
dst, VXC_MODIFIER(0, 7, 0,VXC_RM_TowardZero, 0)); \
\
VXC_DP4x4(data2, data1, cos, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniAEvenTimesB_1_4x4); \
VXC_DP4x4(data3, data1, sin, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniAEvenTimesB_1_4x4); \
VXC_DP4x4(data4, data1, sin, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniAOddTimesB_1_4x4); \
VXC_DP4x4(data5, data1, cos, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniAOddTimesB_1_4x4); \
data2 = data2 * scale0 - data4 * scale1 + output_zp; \
data3 = data3 * scale1 + data5 * scale0 + output_zp; \
\
dst0 = convert_int4_rte(data2); \
dst1 = convert_int4_rte(data3); \
\
VXC_DP2x8(dst, dst0, dst1, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 1), uniExtract8Data_2x8); \
coord_out.x += 8; \
VXC_OP4_NoDest(img_store_3d, output, coord_out.xywz, \
dst, VXC_MODIFIER(0, 7, 0,VXC_RM_TowardZero, 0)); \
}
ROPE_BSNH_SYMM(I16_I16toI16, vxc_short8, vxc_short8, vxc_short8, vxc_short8)
ROPE_BSNH_SYMM(I16_I16toI8, vxc_short8, vxc_short8, vxc_short8, vxc_char8)
ROPE_BSNH_SYMM(I16_I16toU8, vxc_short8, vxc_short8, vxc_short8, vxc_uchar8)
ROPE_BSNH_SYMM(I16_F16toI16, vxc_short8, vxc_half8, vxc_short8, vxc_short8)
ROPE_BSNH_SYMM(I16_F16toI8, vxc_short8, vxc_half8, vxc_short8, vxc_char8)
ROPE_BSNH_SYMM(I16_F16toU8, vxc_short8, vxc_half8, vxc_short8, vxc_uchar8)
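// F16 bsnh variant: the same interleaved rotation without quantization.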
__kernel void rope_F16_F16toF16_bsnh
(
__read_only image2d_array_t input,
__read_only image2d_array_t cos_cache,
__read_only image2d_array_t sin_cache,
__write_only image2d_array_t output,
int axis
)
{
int4 coord_in = (int4)(get_global_id(0), get_global_id(1), get_global_id(2), get_global_id(2));
vxc_short8 v0, v1, v2, v3, dst;
vxc_half8 data0, data1, cos, sin, dst2;
VXC_ReadImage(v1, cos_cache, coord_in.xw, 0, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0));
_viv_asm(COPY, cos, v1, 16);
VXC_ReadImage(v2, sin_cache, coord_in.xw, 0, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0));
_viv_asm(COPY, sin, v2, 16);
coord_in.x *= 2;
int8 input_desc;
_viv_asm(COPY, input_desc, input, sizeof(input_desc));
int baseAddr = (int)coord_in.z * input_desc.s4 + input_desc.s0;
_viv_asm(MOV, coord_in.w, baseAddr);
VXC_OP4(img_load_3d, v0, input, coord_in.xywz, 0,
VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0));
_viv_asm(COPY, data0, v0, 16);
VXC_OP4(img_load_3d, v3, input, coord_in.xywz, VXC_5BITOFFSET_XY(8, 0),
VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0));
_viv_asm(COPY, data1, v3, 16);
int4 coord_out = coord_in;
int8 output_desc;
_viv_asm(COPY, output_desc, output, sizeof(output_desc));
baseAddr = (int)coord_out.z * output_desc.s4 + output_desc.s0;
_viv_asm(MOV, coord_out.w, baseAddr);
float4 data2, data3, data4, data5;
VXC_DP4x4(data2, data0, cos, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniAEvenTimesB_0_4x4);
VXC_DP4x4(data3, data0, sin, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniAEvenTimesB_0_4x4);
VXC_DP4x4(data4, data0, sin, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniAOddTimesB_0_4x4);
VXC_DP4x4(data5, data0, cos, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniAOddTimesB_0_4x4);
data2 = data2 - data4;
data3 = data3 + data5;
half4 dst0;
half4 dst1;
_viv_asm(CONV_RTE, dst0, data2);
_viv_asm(CONV_RTE, dst1, data3);
VXC_DP2x8(dst2, dst0, dst1, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 1), uniExtract8Data_2x8);
_viv_asm(COPY, dst, dst2, 16);
VXC_OP4_NoDest(img_store_3d, output, coord_out.xywz, dst, VXC_MODIFIER(0, 7, 0,VXC_RM_TowardZero, 0));
VXC_DP4x4(data2, data1, cos, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniAEvenTimesB_1_4x4);
VXC_DP4x4(data3, data1, sin, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniAEvenTimesB_1_4x4);
VXC_DP4x4(data4, data1, sin, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniAOddTimesB_1_4x4);
VXC_DP4x4(data5, data1, cos, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniAOddTimesB_1_4x4);
data2 = data2 - data4;
data3 = data3 + data5;
_viv_asm(CONV_RTE, dst0, data2);
_viv_asm(CONV_RTE, dst1, data3);
VXC_DP2x8(dst2, dst0, dst1, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 1), uniExtract8Data_2x8);
_viv_asm(COPY, dst, dst2, 16);
coord_out.x += 8;
VXC_OP4_NoDest(img_store_3d, output, coord_out.xywz, dst, VXC_MODIFIER(0, 7, 0,VXC_RM_TowardZero, 0));
}
_viv_uniform int in0_zp;
_viv_uniform int cos_zp;
_viv_uniform int sin_zp;
_viv_uniform VXC_512Bits uniAMinusZp_0_4x4;
_viv_uniform VXC_512Bits uniAMinusZp_1_4x4;
_viv_uniform VXC_512Bits uniAEvenMinusZp_4x4;
_viv_uniform VXC_512Bits uniAOddMinusZp_4x4;
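// Asymmetric-quantized bsnh variant: uniAEvenMinusZp_4x4/uniAOddMinusZp_4x4
// de-interleave the input while subtracting in0_zp; cos and sin are dequantized
// with the shared uniAMinusZp_* tables.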
#define ROPE_ASYM_BSNH(name, src1_type, copy_type, dtype) \
__kernel void rope_##name##_bsnh \
( \
__read_only image2d_array_t input, \
__read_only image2d_array_t cos_cache, \
__read_only image2d_array_t sin_cache, \
__write_only image2d_array_t output, \
int axis \
) \
{ \
int4 coord_in = (int4)(get_global_id(0), get_global_id(1), get_global_id(2), get_global_id(2)); \
\
dtype data0, data1, dst; \
src1_type cos, sin; \
copy_type v0, v1; \
\
VXC_ReadImage(v0, cos_cache, coord_in.xw, 0, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); \
_viv_asm(COPY, cos, v0, 16); \
VXC_ReadImage(v1, sin_cache, coord_in.xw, 0, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); \
_viv_asm(COPY, sin, v1, 16); \
coord_in.x *= 2; \
int8 input_desc; \
_viv_asm(COPY, input_desc, input, sizeof(input_desc)); \
int baseAddr = (int)coord_in.z * input_desc.s4 + input_desc.s0; \
_viv_asm(MOV, coord_in.w, baseAddr); \
VXC_OP4(img_load_3d, data0, input, coord_in.xywz, 0, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); \
VXC_OP4(img_load_3d, data1, input, coord_in.xywz, VXC_5BITOFFSET_XY(8, 0), \
VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); \
\
int4 coord_out = coord_in; \
int8 output_desc; \
_viv_asm(COPY, output_desc, output, sizeof(output_desc)); \
baseAddr = (int)coord_out.z * output_desc.s4 + output_desc.s0; \
_viv_asm(MOV, coord_out.w, baseAddr); \
\
float4 l00, l01, cos0, cos1; \
float4 l10, l11, sin0, sin1; \
VXC_DP4x4(l00, data0, in0_zp, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniAEvenMinusZp_4x4); \
VXC_DP4x4(l01, data0, in0_zp, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniAOddMinusZp_4x4); \
VXC_DP4x4(cos0, cos, cos_zp, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniAMinusZp_0_4x4); \
VXC_DP4x4(cos1, cos, cos_zp, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniAMinusZp_1_4x4); \
VXC_DP4x4(sin0, sin, sin_zp, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniAMinusZp_0_4x4); \
VXC_DP4x4(sin1, sin, sin_zp, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniAMinusZp_1_4x4); \
float4 data2 = l00 * cos0 * scale0 - l01 * sin0 * scale1 + output_zp; \
float4 data3 = l00 * sin0 * scale1 + l01 * cos0 * scale0 + output_zp; \
\
int4 dst0 = convert_int4_rte(data2); \
int4 dst1 = convert_int4_rte(data3); \
\
VXC_DP2x8(dst, dst0, dst1, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 1), uniExtract8Data_2x8); \
VXC_OP4_NoDest(img_store_3d, output, \
coord_out.xywz, dst, VXC_MODIFIER(0, 7, 0,VXC_RM_TowardZero, 0)); \
\
VXC_DP4x4(l10, data1, in0_zp, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniAEvenMinusZp_4x4); \
VXC_DP4x4(l11, data1, in0_zp, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniAOddMinusZp_4x4); \
data2 = l10 * cos1 * scale0 - l11 * sin1 * scale1 + output_zp; \
data3 = l10 * sin1 * scale1 + l11 * cos1 * scale0 + output_zp; \
\
dst0 = convert_int4_rte(data2); \
dst1 = convert_int4_rte(data3); \
\
VXC_DP2x8(dst, dst0, dst1, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 1), uniExtract8Data_2x8); \
coord_out.x += 8; \
VXC_OP4_NoDest(img_store_3d, output, \
coord_out.xywz, dst, VXC_MODIFIER(0, 7, 0,VXC_RM_TowardZero, 0)); \
}
ROPE_ASYM_BSNH(I8_I8toI8, vxc_char8, vxc_char8, vxc_char8)
ROPE_ASYM_BSNH(U8_U8toU8, vxc_uchar8, vxc_uchar8, vxc_uchar8)
ROPE_ASYM_BSNH(U16_U16toU16, vxc_ushort8, vxc_ushort8, vxc_ushort8)
ROPE_ASYM_BSNH(I8_F16toI8, vxc_half8, vxc_short8, vxc_char8)
ROPE_ASYM_BSNH(U8_F16toU8, vxc_half8, vxc_short8, vxc_uchar8)
ROPE_ASYM_BSNH(U16_F16toU16, vxc_half8, vxc_short8, vxc_ushort8)
_viv_uniform VXC_512Bits uniConvBF16toF32_Part0_2x8;
_viv_uniform VXC_512Bits uniConvBF16toF32_Part1_2x8;
_viv_uniform VXC_512Bits uniExtractOddData_2x8;
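// BF16 bsnh variant: widen to f32, split the even/odd lanes with swizzles,
// rotate, and truncate back to bf16.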
__kernel void rope_BF16_BF16toBF16_bsnh
(
__read_only image2d_array_t input,
__read_only image2d_array_t cos_cache,
__read_only image2d_array_t sin_cache,
__write_only image2d_array_t output,
int axis
)
{
int4 coord_in = (int4)(get_global_id(0), get_global_id(1), get_global_id(2), get_global_id(2));
vxc_ushort8 v0, v1, v2, v3, dst;
VXC_ReadImage(v1, cos_cache, coord_in.xw, 0, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0));
VXC_ReadImage(v2, sin_cache, coord_in.xw, 0, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0));
coord_in.x *= 2;
int8 input_desc;
_viv_asm(COPY, input_desc, input, sizeof(input_desc));
int baseAddr = (int)coord_in.z * input_desc.s4 + input_desc.s0;
_viv_asm(MOV, coord_in.w, baseAddr);
VXC_OP4(img_load_3d, v0, input, coord_in.xywz, 0,
VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0));
VXC_OP4(img_load_3d, v3, input, coord_in.xywz, VXC_5BITOFFSET_XY(8, 0),
VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0));
int4 coord_out = coord_in;
int8 output_desc;
_viv_asm(COPY, output_desc, output, sizeof(output_desc));
baseAddr = (int)coord_out.z * output_desc.s4 + output_desc.s0;
_viv_asm(MOV, coord_out.w, baseAddr);
float4 src0, src1, src2, src3, cos0, cos1, sin0, sin1;
vxc_short8 zero = (vxc_short8)(0, 0, 0, 0, 0, 0, 0, 0);
vxc_short8 data;
VXC_DP2x8(data, v0, zero, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0), uniConvBF16toF32_Part0_2x8);
_viv_asm(COPY, src0, data, 16);
VXC_DP2x8(data, v0, zero, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0), uniConvBF16toF32_Part1_2x8);
_viv_asm(COPY, src1, data, 16);
VXC_DP2x8(data, v1, zero, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0), uniConvBF16toF32_Part0_2x8);
_viv_asm(COPY, cos0, data, 16);
VXC_DP2x8(data, v1, zero, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0), uniConvBF16toF32_Part1_2x8);
_viv_asm(COPY, cos1, data, 16);
VXC_DP2x8(data, v2, zero, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0), uniConvBF16toF32_Part0_2x8);
_viv_asm(COPY, sin0, data, 16);
VXC_DP2x8(data, v2, zero, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0), uniConvBF16toF32_Part1_2x8);
_viv_asm(COPY, sin1, data, 16);
VXC_DP2x8(data, v3, zero, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0), uniConvBF16toF32_Part0_2x8);
_viv_asm(COPY, src2, data, 16);
VXC_DP2x8(data, v3, zero, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0), uniConvBF16toF32_Part1_2x8);
_viv_asm(COPY, src3, data, 16);
float4 even = (float4)(src0.xz, src1.xz);
float4 odd = (float4)(src0.yw, src1.yw);
float4 data0 = even * cos0 - odd * sin0;
float4 data1 = even * sin0 + odd * cos0;
_viv_asm(COPY, v0, data0, 16);
_viv_asm(COPY, v1, data1, 16);
VXC_DP2x8(dst, v0, v1, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0), uniExtractOddData_2x8);
VXC_OP4_NoDest(img_store_3d, output, coord_out.xywz, dst, VXC_MODIFIER(0, 7, 0,VXC_RM_TowardZero, 0));
even = (float4)(src2.xz, src3.xz);
odd = (float4)(src2.yw, src3.yw);
data0 = even * cos1 - odd * sin1;
data1 = even * sin1 + odd * cos1;
_viv_asm(COPY, v0, data0, 16);
_viv_asm(COPY, v1, data1, 16);
VXC_DP2x8(dst, v0, v1, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0), uniExtractOddData_2x8);
coord_out.x += 8;
VXC_OP4_NoDest(img_store_3d, output, coord_out.xywz, dst, VXC_MODIFIER(0, 7, 0,VXC_RM_TowardZero, 0));
}
@ -0,0 +1,312 @@
#include "cl_viv_vx_ext.h"
_viv_uniform float scale0;
_viv_uniform float scale1;
_viv_uniform float output_zp;
_viv_uniform VXC_512Bits uniAEvenTimesB_0_4x4;
_viv_uniform VXC_512Bits uniAEvenTimesB_1_4x4;
_viv_uniform VXC_512Bits uniAOddTimesB_0_4x4;
_viv_uniform VXC_512Bits uniAOddTimesB_1_4x4;
_viv_uniform VXC_512Bits uniExtract8Data_2x8;
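// The bnsh kernels repeat the interleaved rotation of the bsnh family; the main
// difference is that cos/sin are indexed by coord_in.xy instead of coord_in.xw.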
#define ROPE_BNSH_SYMM(name, src_type, src1_type, copy_type, dst_type) \
__kernel void rope_##name##_bnsh \
( \
__read_only image2d_array_t input, \
__read_only image2d_array_t cos_cache, \
__read_only image2d_array_t sin_cache, \
__write_only image2d_array_t output, \
int axis \
) \
{ \
int4 coord_in = (int4)(get_global_id(0), get_global_id(1), get_global_id(2), get_global_id(2)); \
\
src_type data0, data1; \
src1_type cos, sin; \
copy_type v0, v1; \
dst_type dst; \
VXC_ReadImage(v0, cos_cache, coord_in.xy, 0, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); \
_viv_asm(COPY, cos, v0, 16); \
VXC_ReadImage(v1, sin_cache, coord_in.xy, 0, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); \
_viv_asm(COPY, sin, v1, 16); \
\
coord_in.x *= 2; \
int8 input_desc; \
_viv_asm(COPY, input_desc, input, sizeof(input_desc)); \
int baseAddr = (int)coord_in.z * input_desc.s4 + input_desc.s0; \
_viv_asm(MOV, coord_in.w, baseAddr); \
VXC_OP4(img_load_3d, data0, input, coord_in.xywz, 0, \
VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); \
VXC_OP4(img_load_3d, data1, input, coord_in.xywz, VXC_5BITOFFSET_XY(8, 0), \
VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); \
\
int4 coord_out = coord_in; \
int8 output_desc; \
_viv_asm(COPY, output_desc, output, sizeof(output_desc)); \
baseAddr = (int)coord_out.z * output_desc.s4 + output_desc.s0; \
_viv_asm(MOV, coord_out.w, baseAddr); \
\
float4 data2, data3, data4, data5; \
VXC_DP4x4(data2, data0, cos, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniAEvenTimesB_0_4x4); \
VXC_DP4x4(data3, data0, sin, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniAEvenTimesB_0_4x4); \
VXC_DP4x4(data4, data0, sin, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniAOddTimesB_0_4x4); \
VXC_DP4x4(data5, data0, cos, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniAOddTimesB_0_4x4); \
data2 = data2 * scale0 - data4 * scale1 + output_zp; \
data3 = data3 * scale1 + data5 * scale0 + output_zp; \
\
int4 dst0 = convert_int4_rte(data2); \
int4 dst1 = convert_int4_rte(data3); \
\
VXC_DP2x8(dst, dst0, dst1, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 1), uniExtract8Data_2x8); \
VXC_OP4_NoDest(img_store_3d, output, coord_out.xywz, \
dst, VXC_MODIFIER(0, 7, 0,VXC_RM_TowardZero, 0)); \
\
VXC_DP4x4(data2, data1, cos, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniAEvenTimesB_1_4x4); \
VXC_DP4x4(data3, data1, sin, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniAEvenTimesB_1_4x4); \
VXC_DP4x4(data4, data1, sin, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniAOddTimesB_1_4x4); \
VXC_DP4x4(data5, data1, cos, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniAOddTimesB_1_4x4); \
data2 = data2 * scale0 - data4 * scale1 + output_zp; \
data3 = data3 * scale1 + data5 * scale0 + output_zp; \
\
dst0 = convert_int4_rte(data2); \
dst1 = convert_int4_rte(data3); \
\
VXC_DP2x8(dst, dst0, dst1, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 1), uniExtract8Data_2x8); \
coord_out.x += 8; \
VXC_OP4_NoDest(img_store_3d, output, coord_out.xywz, \
dst, VXC_MODIFIER(0, 7, 0,VXC_RM_TowardZero, 0)); \
}
ROPE_BNSH_SYMM(I16_I16toI16, vxc_short8, vxc_short8, vxc_short8, vxc_short8)
ROPE_BNSH_SYMM(I16_I16toI8, vxc_short8, vxc_short8, vxc_short8, vxc_char8)
ROPE_BNSH_SYMM(I16_I16toU8, vxc_short8, vxc_short8, vxc_short8, vxc_uchar8)
ROPE_BNSH_SYMM(I16_F16toI16, vxc_short8, vxc_half8, vxc_short8, vxc_short8)
ROPE_BNSH_SYMM(I16_F16toI8, vxc_short8, vxc_half8, vxc_short8, vxc_char8)
ROPE_BNSH_SYMM(I16_F16toU8, vxc_short8, vxc_half8, vxc_short8, vxc_uchar8)
__kernel void rope_F16_F16toF16_bnsh
(
__read_only image2d_array_t input,
__read_only image2d_array_t cos_cache,
__read_only image2d_array_t sin_cache,
__write_only image2d_array_t output,
int axis
)
{
int4 coord_in = (int4)(get_global_id(0), get_global_id(1), get_global_id(2), get_global_id(2));
vxc_short8 v0, v1, v2, v3, dst;
vxc_half8 data0, data1, cos, sin, dst2;
VXC_ReadImage(v1, cos_cache, coord_in.xy, 0, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0));
_viv_asm(COPY, cos, v1, 16);
VXC_ReadImage(v2, sin_cache, coord_in.xy, 0, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0));
_viv_asm(COPY, sin, v2, 16);
coord_in.x *= 2;
int8 input_desc;
_viv_asm(COPY, input_desc, input, sizeof(input_desc));
int baseAddr = (int)coord_in.z * input_desc.s4 + input_desc.s0;
_viv_asm(MOV, coord_in.w, baseAddr);
VXC_OP4(img_load_3d, v0, input, coord_in.xywz, 0,
VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0));
_viv_asm(COPY, data0, v0, 16);
VXC_OP4(img_load_3d, v3, input, coord_in.xywz, VXC_5BITOFFSET_XY(8, 0),
VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0));
_viv_asm(COPY, data1, v3, 16);
int4 coord_out = coord_in;
int8 output_desc;
_viv_asm(COPY, output_desc, output, sizeof(output_desc));
baseAddr = (int)coord_out.z * output_desc.s4 + output_desc.s0;
_viv_asm(MOV, coord_out.w, baseAddr);
float4 data2, data3, data4, data5;
VXC_DP4x4(data2, data0, cos, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniAEvenTimesB_0_4x4);
VXC_DP4x4(data3, data0, sin, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniAEvenTimesB_0_4x4);
VXC_DP4x4(data4, data0, sin, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniAOddTimesB_0_4x4);
VXC_DP4x4(data5, data0, cos, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniAOddTimesB_0_4x4);
data2 = data2 - data4;
data3 = data3 + data5;
half4 dst0;
half4 dst1;
_viv_asm(CONV_RTE, dst0, data2);
_viv_asm(CONV_RTE, dst1, data3);
VXC_DP2x8(dst2, dst0, dst1, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 1), uniExtract8Data_2x8);
_viv_asm(COPY, dst, dst2, 16);
VXC_OP4_NoDest(img_store_3d, output, coord_out.xywz, dst, VXC_MODIFIER(0, 7, 0,VXC_RM_TowardZero, 0));
VXC_DP4x4(data2, data1, cos, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniAEvenTimesB_1_4x4);
VXC_DP4x4(data3, data1, sin, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniAEvenTimesB_1_4x4);
VXC_DP4x4(data4, data1, sin, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniAOddTimesB_1_4x4);
VXC_DP4x4(data5, data1, cos, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniAOddTimesB_1_4x4);
data2 = data2 - data4;
data3 = data3 + data5;
_viv_asm(CONV_RTE, dst0, data2);
_viv_asm(CONV_RTE, dst1, data3);
VXC_DP2x8(dst2, dst0, dst1, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 1), uniExtract8Data_2x8);
_viv_asm(COPY, dst, dst2, 16);
coord_out.x += 8;
VXC_OP4_NoDest(img_store_3d, output, coord_out.xywz, dst, VXC_MODIFIER(0, 7, 0,VXC_RM_TowardZero, 0));
}
_viv_uniform int in0_zp;
_viv_uniform int cos_zp;
_viv_uniform int sin_zp;
_viv_uniform VXC_512Bits uniAMinusZp_0_4x4;
_viv_uniform VXC_512Bits uniAMinusZp_1_4x4;
_viv_uniform VXC_512Bits uniAEvenMinusZp_4x4;
_viv_uniform VXC_512Bits uniAOddMinusZp_4x4;
#define ROPE_ASYM_BNSH(name, src1_type, copy_type, dtype) \
__kernel void rope_##name##_bnsh \
( \
__read_only image2d_array_t input, \
__read_only image2d_array_t cos_cache, \
__read_only image2d_array_t sin_cache, \
__write_only image2d_array_t output, \
int axis \
) \
{ \
int4 coord_in = (int4)(get_global_id(0), get_global_id(1), get_global_id(2), get_global_id(2)); \
\
dtype data0, data1, dst; \
src1_type cos, sin; \
copy_type v0, v1; \
\
VXC_ReadImage(v0, cos_cache, coord_in.xy, 0, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); \
_viv_asm(COPY, cos, v0, 16); \
VXC_ReadImage(v1, sin_cache, coord_in.xy, 0, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); \
_viv_asm(COPY, sin, v1, 16); \
coord_in.x *= 2; \
int8 input_desc; \
_viv_asm(COPY, input_desc, input, sizeof(input_desc)); \
int baseAddr = (int)coord_in.z * input_desc.s4 + input_desc.s0; \
_viv_asm(MOV, coord_in.w, baseAddr); \
VXC_OP4(img_load_3d, data0, input, coord_in.xywz, 0, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); \
VXC_OP4(img_load_3d, data1, input, coord_in.xywz, VXC_5BITOFFSET_XY(8, 0), \
VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); \
\
int4 coord_out = coord_in; \
int8 output_desc; \
_viv_asm(COPY, output_desc, output, sizeof(output_desc)); \
baseAddr = (int)coord_out.z * output_desc.s4 + output_desc.s0; \
_viv_asm(MOV, coord_out.w, baseAddr); \
\
float4 l00, l01, cos0, cos1; \
float4 l10, l11, sin0, sin1; \
VXC_DP4x4(l00, data0, in0_zp, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniAEvenMinusZp_4x4); \
VXC_DP4x4(l01, data0, in0_zp, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniAOddMinusZp_4x4); \
VXC_DP4x4(cos0, cos, cos_zp, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniAMinusZp_0_4x4); \
VXC_DP4x4(cos1, cos, cos_zp, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniAMinusZp_1_4x4); \
VXC_DP4x4(sin0, sin, sin_zp, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniAMinusZp_0_4x4); \
VXC_DP4x4(sin1, sin, sin_zp, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniAMinusZp_1_4x4); \
float4 data2 = l00 * cos0 * scale0 - l01 * sin0 * scale1 + output_zp; \
float4 data3 = l00 * sin0 * scale1 + l01 * cos0 * scale0 + output_zp; \
\
int4 dst0 = convert_int4_rte(data2); \
int4 dst1 = convert_int4_rte(data3); \
\
VXC_DP2x8(dst, dst0, dst1, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 1), uniExtract8Data_2x8); \
VXC_OP4_NoDest(img_store_3d, output, \
coord_out.xywz, dst, VXC_MODIFIER(0, 7, 0,VXC_RM_TowardZero, 0)); \
\
VXC_DP4x4(l10, data1, in0_zp, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniAEvenMinusZp_4x4); \
VXC_DP4x4(l11, data1, in0_zp, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniAOddMinusZp_4x4); \
data2 = l10 * cos1 * scale0 - l11 * sin1 * scale1 + output_zp; \
data3 = l10 * sin1 * scale1 + l11 * cos1 * scale0 + output_zp; \
\
dst0 = convert_int4_rte(data2); \
dst1 = convert_int4_rte(data3); \
\
VXC_DP2x8(dst, dst0, dst1, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 1), uniExtract8Data_2x8); \
coord_out.x += 8; \
VXC_OP4_NoDest(img_store_3d, output, \
coord_out.xywz, dst, VXC_MODIFIER(0, 7, 0,VXC_RM_TowardZero, 0)); \
}
ROPE_ASYM_BNSH(I8_I8toI8, vxc_char8, vxc_char8, vxc_char8)
ROPE_ASYM_BNSH(U8_U8toU8, vxc_uchar8, vxc_uchar8, vxc_uchar8)
ROPE_ASYM_BNSH(U16_U16toU16, vxc_ushort8, vxc_ushort8, vxc_ushort8)
ROPE_ASYM_BNSH(I8_F16toI8, vxc_half8, vxc_short8, vxc_char8)
ROPE_ASYM_BNSH(U8_F16toU8, vxc_half8, vxc_short8, vxc_uchar8)
ROPE_ASYM_BNSH(U16_F16toU16, vxc_half8, vxc_short8, vxc_ushort8)
_viv_uniform VXC_512Bits uniConvBF16toF32_Part0_2x8;
_viv_uniform VXC_512Bits uniConvBF16toF32_Part1_2x8;
_viv_uniform VXC_512Bits uniExtractOddData_2x8;
__kernel void rope_BF16_BF16toBF16_bnsh
(
__read_only image2d_array_t input,
__read_only image2d_array_t cos_cache,
__read_only image2d_array_t sin_cache,
__write_only image2d_array_t output,
int axis
)
{
int4 coord_in = (int4)(get_global_id(0), get_global_id(1), get_global_id(2), get_global_id(2));
vxc_ushort8 v0, v1, v2, v3, dst;
VXC_ReadImage(v1, cos_cache, coord_in.xy, 0, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0));
VXC_ReadImage(v2, sin_cache, coord_in.xy, 0, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0));
coord_in.x *= 2;
int8 input_desc;
_viv_asm(COPY, input_desc, input, sizeof(input_desc));
int baseAddr = (int)coord_in.z * input_desc.s4 + input_desc.s0;
_viv_asm(MOV, coord_in.w, baseAddr);
VXC_OP4(img_load_3d, v0, input, coord_in.xywz, 0,
VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0));
VXC_OP4(img_load_3d, v3, input, coord_in.xywz, VXC_5BITOFFSET_XY(8, 0),
VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0));
int4 coord_out = coord_in;
int8 output_desc;
_viv_asm(COPY, output_desc, output, sizeof(output_desc));
baseAddr = (int)coord_out.z * output_desc.s4 + output_desc.s0;
_viv_asm(MOV, coord_out.w, baseAddr);
float4 src0, src1, src2, src3, cos0, cos1, sin0, sin1;
vxc_short8 zero = (vxc_short8)(0, 0, 0, 0, 0, 0, 0, 0);
vxc_short8 data;
VXC_DP2x8(data, v0, zero, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0), uniConvBF16toF32_Part0_2x8);
_viv_asm(COPY, src0, data, 16);
VXC_DP2x8(data, v0, zero, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0), uniConvBF16toF32_Part1_2x8);
_viv_asm(COPY, src1, data, 16);
VXC_DP2x8(data, v1, zero, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0), uniConvBF16toF32_Part0_2x8);
_viv_asm(COPY, cos0, data, 16);
VXC_DP2x8(data, v1, zero, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0), uniConvBF16toF32_Part1_2x8);
_viv_asm(COPY, cos1, data, 16);
VXC_DP2x8(data, v2, zero, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0), uniConvBF16toF32_Part0_2x8);
_viv_asm(COPY, sin0, data, 16);
VXC_DP2x8(data, v2, zero, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0), uniConvBF16toF32_Part1_2x8);
_viv_asm(COPY, sin1, data, 16);
VXC_DP2x8(data, v3, zero, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0), uniConvBF16toF32_Part0_2x8);
_viv_asm(COPY, src2, data, 16);
VXC_DP2x8(data, v3, zero, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0), uniConvBF16toF32_Part1_2x8);
_viv_asm(COPY, src3, data, 16);
float4 even = (float4)(src0.xz, src1.xz);
float4 odd = (float4)(src0.yw, src1.yw);
float4 data0 = even * cos0 - odd * sin0;
float4 data1 = even * sin0 + odd * cos0;
_viv_asm(COPY, v0, data0, 16);
_viv_asm(COPY, v1, data1, 16);
VXC_DP2x8(dst, v0, v1, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0), uniExtractOddData_2x8);
VXC_OP4_NoDest(img_store_3d, output, coord_out.xywz, dst, VXC_MODIFIER(0, 7, 0,VXC_RM_TowardZero, 0));
even = (float4)(src2.xz, src3.xz);
odd = (float4)(src2.yw, src3.yw);
data0 = even * cos1 - odd * sin1;
data1 = even * sin1 + odd * cos1;
_viv_asm(COPY, v0, data0, 16);
_viv_asm(COPY, v1, data1, 16);
VXC_DP2x8(dst, v0, v1, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0), uniExtractOddData_2x8);
coord_out.x += 8;
VXC_OP4_NoDest(img_store_3d, output, coord_out.xywz, dst, VXC_MODIFIER(0, 7, 0,VXC_RM_TowardZero, 0));
}
@ -93,3 +93,101 @@ __kernel void scatter_nd_update_cpy2out_##src0_type##to##src0_type( \
}
SCATTER_ND_UPDATE_COPY2OUT(U8, vxc_uchar16, 1)
SCATTER_ND_UPDATE_COPY2OUT(I8, vxc_char16, 1)
SCATTER_ND_UPDATE_COPY2OUT(U16, vxc_ushort8, 2)
SCATTER_ND_UPDATE_COPY2OUT(I16, vxc_short8, 2)
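// ref2out: requantize one 8-element vector of the reference tensor into the temp
// buffer. multAndoutZP0 and uniU8MulAndPostShift0_Lo_2x8 are assumed to be
// _viv_uniforms declared in the part of this file above the hunk.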
#define SCATTER_ND_UPDATE_REF2OUT_16BITS(src0_type, data_type) \
__kernel void scatter_nd_update_ref2out_##src0_type##to##src0_type( \
__read_only image2d_t input_ref, \
image2d_t temp_ref, \
image2d_t output0 \
) \
{ \
int gidx = get_global_id(0); \
Image img0 = create_image_from_image2d(input_ref, 2); \
Image img1 = create_image_from_image2d(temp_ref, 2); \
__global data_type* in_ptr = (__global data_type*)img0.ptr; \
__global data_type* out_ptr = (__global data_type*)img1.ptr; \
data_type src, dst; \
src = in_ptr[gidx]; \
vxc_ushort8 mp0; \
_viv_asm(COPY, mp0, multAndoutZP0, 16); \
VXC_DP2x8(dst, src, mp0, VXC_MODIFIER(0, 7, 0, VXC_RM_ToNearestEven, 1), \
uniU8MulAndPostShift0_Lo_2x8); \
out_ptr[gidx] = dst; \
}
SCATTER_ND_UPDATE_REF2OUT_16BITS(U16, vxc_ushort8)
SCATTER_ND_UPDATE_REF2OUT_16BITS(I16, vxc_short8)
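// update2ref: requantize one row of the update tensor and scatter it into
// temp_ref. offset_idx, offsetX..offsetW, update_width and output_width are
// assumed to be _viv_uniforms declared earlier in this file; they decode an
// up-to-4D index into a flat element offset.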
#define SCATTER_ND_UPDATE_UPDATE2REF_16BITS(src0_type, data_type) \
__kernel void scatter_nd_update_update2ref_##src0_type##to##src0_type##_16x( \
__read_only image2d_t input_index, \
__read_only image2d_t input_update, \
image2d_t temp_ref, \
image2d_t input0, \
image2d_t output1, \
int width, int area, int vol, int coord_dim \
) \
{ \
int gidx = get_global_id(0); \
int gidy = get_global_id(1); \
\
Image img1 = create_image_from_image2d(input_index, 4); \
Image img2 = create_image_from_image2d(input_update, 2); \
Image img3 = create_image_from_image2d(temp_ref, 2); \
__global int* index_ptr = (__global int*)img1.ptr; \
__global data_type* update_ptr = (__global data_type*)img2.ptr; \
__global data_type* output_ptr = (__global data_type*)img3.ptr; \
data_type dst; \
\
int4 indice = vload4(0, index_ptr + gidy * coord_dim + offset_idx); \
data_type src = update_ptr[gidy * update_width + gidx]; \
int idx = indice.x * offsetX + indice.y * offsetY + indice.z * offsetZ + indice.w * offsetW; \
int loc = idx * output_width + gidx; \
vxc_ushort8 mp1; \
_viv_asm(COPY, mp1, multAndoutZP1, 16); \
VXC_DP2x8(dst, src, mp1, VXC_MODIFIER(0, 7, 0, VXC_RM_ToNearestEven, 1), \
uniU8MulAndPostShift1_Lo_2x8); \
output_ptr[loc] = dst; \
}
SCATTER_ND_UPDATE_UPDATE2REF_16BITS(U16, vxc_ushort8)
SCATTER_ND_UPDATE_UPDATE2REF_16BITS(I16, vxc_short8)
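// F16 needs no requantization, so ref2out and update2ref reduce to plain
// 8-element copies.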
__kernel void scatter_nd_update_ref2out_F16toF16(
__read_only image2d_t input_ref,
image2d_t temp_ref,
image2d_t output0
)
{
int gidx = get_global_id(0);
Image img0 = create_image_from_image2d(input_ref, 2);
Image img1 = create_image_from_image2d(temp_ref, 2);
__global vxc_ushort8* in_ptr = (__global vxc_ushort8*)img0.ptr;
__global vxc_ushort8* out_ptr = (__global vxc_ushort8*)img1.ptr;
out_ptr[gidx] = in_ptr[gidx];
}
__kernel void scatter_nd_update_update2ref_F16toF16_16x(
__read_only image2d_t input_index,
__read_only image2d_t input_update,
image2d_t temp_ref,
image2d_t input0,
image2d_t output1,
int width, int area, int vol, int coord_dim
)
{
int gidx = get_global_id(0);
int gidy = get_global_id(1);
Image img1 = create_image_from_image2d(input_index, 4);
Image img2 = create_image_from_image2d(input_update, 2);
Image img3 = create_image_from_image2d(temp_ref, 2);
__global int* index_ptr = (__global int*)img1.ptr;
__global vxc_ushort8* update_ptr = (__global vxc_ushort8*)img2.ptr;
__global vxc_ushort8* output_ptr = (__global vxc_ushort8*)img3.ptr;
int4 indice = vload4(0, index_ptr + gidy * coord_dim + offset_idx);
int idx = indice.x * offsetX + indice.y * offsetY + indice.z * offsetZ + indice.w * offsetW;
int loc = idx * output_width + gidx;
output_ptr[loc] = update_ptr[gidy * update_width + gidx];
}
File diff suppressed because it is too large
@ -29,6 +29,7 @@
#include "VX/vx_ext_program.h"
#include "vsi_nn_platform.h"
#include "vsi_nn_prv.h"
#include "vsi_nn_types_prv.h"
#include "vsi_nn_log.h"
#include "libnnext/vsi_nn_vxkernel.h"
#include "kernel/vsi_nn_kernel.h"
@ -198,10 +199,11 @@ static vsi_status vsi_nn_RegisterVXKernel
vx_size * program_len = NULL;
const char **program_src = NULL;
vx_context ctx = NULL;
vsi_nn_context_t context = NULL;
vx_kernel_description_t * kernel = kernel_info->kernel[kernel_info->kernel_index];
uint8_t i = 0;
vsi_bool load_from_file = FALSE;
vsi_nn_runtime_option_t* options;
options = ((vsi_nn_graph_prv_t*)graph)->options;
#define MAX_BUILDPROGRAM_LEN 128
char cmd[MAX_BUILDPROGRAM_LEN] = {0};
@ -210,8 +212,7 @@ static vsi_status vsi_nn_RegisterVXKernel
memset(cmd, 0, sizeof(char) * MAX_BUILDPROGRAM_LEN);
status = VSI_FAILURE;
ctx = vxGetContext( (vx_reference)graph->g );
context = graph->ctx;
evis = context->config.evis.ver;
evis = options->config.evis.ver;
program_src = (const char**)malloc(kernel_info->resource_num * sizeof(char *));
CHECK_PTR_FAIL_GOTO( program_src, "Create buffer fail.", final );
@ -244,12 +245,12 @@ static vsi_status vsi_nn_RegisterVXKernel
{
// set default evis version is 2
snprintf(cmd, MAX_BUILDPROGRAM_LEN,
"-cl-viv-vx-extension -D VX_VERSION=2 -D USE_40BITS_VA=%d", context->config.use_40bits_va);
"-cl-viv-vx-extension -D VX_VERSION=2 -D USE_40BITS_VA=%d", options->config.use_40bits_va);
}
else
{
snprintf(cmd, MAX_BUILDPROGRAM_LEN,
"-cl-viv-vx-extension -D VX_VERSION=%d -D USE_40BITS_VA=%d", evis, context->config.use_40bits_va);
"-cl-viv-vx-extension -D VX_VERSION=%d -D USE_40BITS_VA=%d", evis, options->config.use_40bits_va);
}
status = vxBuildProgram(program, cmd);
@ -302,7 +303,7 @@ static vsi_status vsi_nn_RegisterBinKernel
vx_size program_len = 0;
const uint8_t *program_ptr = NULL;
vx_context ctx;
vsi_nn_context_t context;
vsi_nn_runtime_option_t* options;
vx_kernel_description_t * kernel = kernel_info->kernel[kernel_info->kernel_index];
#define MAX_BUILDPROGRAM_LEN 128
@ -313,8 +314,8 @@ static vsi_status vsi_nn_RegisterBinKernel
status = VSI_FAILURE;
ctx = vxGetContext( (vx_reference)graph->g );
context = graph->ctx;
evis = context->config.evis.ver;
options = ((vsi_nn_graph_prv_t*)graph)->options;
evis = options->config.evis.ver;
program_ptr = vsi_nn_VxBinResourceGetResource(
kernel_info->resource_name[kernel_info->resource_num - 1], &program_len);
@ -337,12 +338,12 @@ static vsi_status vsi_nn_RegisterBinKernel
{
// set default evis version is 2
snprintf(cmd, MAX_BUILDPROGRAM_LEN,
"-cl-viv-vx-extension -D VX_VERSION=2 -D USE_40BITS_VA=%d", context->config.use_40bits_va);
"-cl-viv-vx-extension -D VX_VERSION=2 -D USE_40BITS_VA=%d", options->config.use_40bits_va);
}
else
{
snprintf(cmd, MAX_BUILDPROGRAM_LEN,
"-cl-viv-vx-extension -D VX_VERSION=%d -D USE_40BITS_VA=%d", evis, context->config.use_40bits_va);
"-cl-viv-vx-extension -D VX_VERSION=%d -D USE_40BITS_VA=%d", evis, options->config.use_40bits_va);
}
#else
snprintf(cmd, MAX_BUILDPROGRAM_LEN, "-cl-viv-vx-extension");
@ -35,6 +35,8 @@
#include "utils/vsi_nn_constraint_check.h"
#include "kernel/vsi_nn_kernel.h"
#include "kernel/vsi_nn_kernel_eltwise.h"
#include "vsi_nn_tensor_util_prv.h"
#include "vsi_nn_error.h"
static vsi_status _try_set_high_presision_tensor
(
@ -120,9 +122,22 @@ static vsi_status _static_batchnorm
vsi_nn_tensor_t ** outputs
)
{
#define _TENSOR_LEN 64
vsi_status status;
vsi_nn_kernel_param_t * param = NULL;
vsi_nn_tensor_t* reshape_tensors[6] = { NULL };
vsi_size_t shape[VSI_NN_MAX_DIM_NUM];
uint32_t new_rank = 4;
vsi_nn_tensor_t* input0 = NULL;
vsi_nn_tensor_t* output = NULL;
char reshape0_tensor_name[_TENSOR_LEN];
char reshape1_tensor_name[_TENSOR_LEN];
char batch_norm_tensor_name[_TENSOR_LEN];
memset(reshape0_tensor_name, 0, sizeof(reshape0_tensor_name));
memset(reshape1_tensor_name, 0, sizeof(reshape1_tensor_name));
memset(batch_norm_tensor_name, 0, sizeof(batch_norm_tensor_name));
status = VSI_FAILURE;
status = _try_set_high_presision_tensor(inputs);
@ -131,10 +146,43 @@ static vsi_status _static_batchnorm
VSILOGE("Set tensor attr of high presision fail");
return status;
}
if(_require_reshape(self, inputs))
if (_require_reshape(self, inputs))
{
reshape_tensors[0] = self->nn_param.batch_norm.local->reshaped_input;
reshape_tensors[5] = self->nn_param.batch_norm.local->reshaped_output;
if (3 == inputs[0]->attr.dim_num)
{
shape[0] = inputs[0]->attr.size[0];
shape[1] = 1;
shape[2] = inputs[0]->attr.size[1];
shape[3] = inputs[0]->attr.size[2];
}
else if (5 == inputs[0]->attr.dim_num)
{
shape[0] = inputs[0]->attr.size[0] * inputs[0]->attr.size[1];
shape[1] = inputs[0]->attr.size[2];
shape[2] = inputs[0]->attr.size[3];
shape[3] = inputs[0]->attr.size[4];
}
input0 = vsi_nn_kernel_insert_reshape_node(self->graph,
inputs[0], shape, (uint32_t)new_rank, VSI_NN_OPTIMIZE_BACKWARD);
CHECK_PTR_FAIL_GOTO(input0, "Create tensor fail.", final);
reshape_tensors[0] = input0;
snprintf(reshape0_tensor_name, sizeof(reshape0_tensor_name), "uid_%u_sub_uid_%u_out_0", self->uid, 0);
if (vxSetReferenceName((vx_reference)reshape_tensors[0]->t, reshape0_tensor_name) == VSI_FAILURE)
{
VSILOGW("Set uid %u reshape 0 node output name fail", self->uid);
goto final;
}
output = vsi_nn_kernel_insert_reshape_node(self->graph,
outputs[0], shape, (uint32_t)new_rank, VSI_NN_OPTIMIZE_FORWARD);
CHECK_PTR_FAIL_GOTO(output, "Create tensor fail.", final);
reshape_tensors[5] = output;
snprintf(reshape1_tensor_name, sizeof(reshape1_tensor_name), "uid_%u_sub_uid_%u_out_0", self->uid, 1);
if (vxSetReferenceName((vx_reference)outputs[0]->t, reshape1_tensor_name) == VSI_FAILURE)
{
VSILOGW("Set uid %u reshap 1 node output name fail", self->uid);
goto final;
}
}
else
{
@ -155,12 +203,26 @@ static vsi_status _static_batchnorm
reshape_tensors, 5,
&reshape_tensors[5], 1, param );
if( self->n )
if ( self->n )
{
status = VSI_SUCCESS;
}
vsi_nn_kernel_param_release( &param );
vsi_nn_kernel_param_release(&param);
if (output)
{
snprintf(batch_norm_tensor_name, sizeof(batch_norm_tensor_name), "uid_%u_sub_uid_%u_out_0", self->uid, 2);
if (vxSetReferenceName((vx_reference)output->t, batch_norm_tensor_name) == VSI_FAILURE)
{
VSILOGW("Set uid %u instance_norm node output name fail", self->uid);
goto final;
}
}
final:
vsi_safe_release_tensor(input0);
vsi_safe_release_tensor(output);
return status;
}
@ -313,68 +375,6 @@ static vsi_status op_compute
return status;
} /* op_compute() */
static vsi_status op_optimize
(
vsi_nn_node_t * self,
vsi_nn_tensor_t ** inputs,
vsi_nn_tensor_t ** outputs,
vsi_nn_opt_direction_e direction
)
{
uint32_t dim = 0;
vsi_nn_batcnnorm_lcl_data *local = NULL;
vsi_size_t shape[VSI_NN_MAX_DIM_NUM];
char tensor_name[128];
dim = inputs[0]->attr.dim_num;
if(_require_reshape(self, inputs) == FALSE)
{
return VSI_SUCCESS;
}
VSILOGD("Optimize 3D %s, uid %u", vsi_nn_OpGetName(self->op), self->uid);
/*
reshape 3d input (xcn) --> 4d input (whcn)
reshape 3d output(xcn) --> 4d output(whcn)
*/
dim = 4;
if (3 == inputs[0]->attr.dim_num)
{
shape[0] = inputs[0]->attr.size[0];
shape[1] = 1;
shape[2] = inputs[0]->attr.size[1];
shape[3] = inputs[0]->attr.size[2];
}
else if (5 == inputs[0]->attr.dim_num)
{
shape[0] = inputs[0]->attr.size[0] * inputs[0]->attr.size[1];
shape[1] = inputs[0]->attr.size[2];
shape[2] = inputs[0]->attr.size[3];
shape[3] = inputs[0]->attr.size[4];
}
local = self->nn_param.batch_norm.local;
if (VSI_NN_OPTIMIZE_BACKWARD == direction)
{
local->reshaped_input = vsi_nn_reshape_tensor(self->graph, inputs[0], shape, dim);
}
else
{
local->reshaped_output = vsi_nn_reshape_tensor(self->graph, outputs[0], shape, dim);
if(local->reshaped_output && local->reshaped_output->t)
{
memset(tensor_name, 0, sizeof(tensor_name));
snprintf(tensor_name, sizeof(tensor_name), "uid_%u_reshape_out_0", self->uid);
if(vxSetReferenceName((vx_reference)local->reshaped_output->t, tensor_name) == VSI_FAILURE)
{
VSILOGW("Set uid %u batchnorm reshaped output name fail", self->uid);
return VSI_FAILURE;
}
}
}
return VSI_SUCCESS;
} /* op_optimize() */
static vsi_bool _dynamic_check
(
vsi_nn_node_t * self,
@ -494,58 +494,6 @@ static vsi_bool op_check
}
} /* op_check() */
static vsi_bool op_setup
(
vsi_nn_node_t * self,
vsi_nn_tensor_t ** inputs,
vsi_nn_tensor_t ** outputs
)
{
vsi_nn_batcnnorm_lcl_data *local = NULL;
if( VSI_NN_DIM_AUTO == outputs[0]->attr.dim_num )
{
outputs[0]->attr.dim_num = inputs[0]->attr.dim_num;
memcpy( outputs[0]->attr.size, inputs[0]->attr.size,
VSI_NN_MAX_DIM_NUM * sizeof(vsi_size_t) );
}
if(_require_reshape(self, inputs))
{
local = (vsi_nn_batcnnorm_lcl_data *)malloc(sizeof(vsi_nn_batcnnorm_lcl_data));
if(NULL == local)
{
return VSI_FAILURE;
}
memset(local, 0, sizeof(vsi_nn_batcnnorm_lcl_data));
self->nn_param.batch_norm.local = local;
}
return TRUE;
} /* op_setup() */
static vsi_status op_deinit
(
vsi_nn_node_t * self
)
{
vsi_nn_batch_norm_param *p = &(self->nn_param.batch_norm);
if(p->local)
{
if (p->local->reshaped_input)
{
vsi_nn_ReleaseTensor(&(p->local->reshaped_input));
p->local->reshaped_input = NULL;
}
if (p->local->reshaped_output)
{
vsi_nn_ReleaseTensor(&(p->local->reshaped_output));
p->local->reshaped_output = NULL;
}
vsi_nn_safe_free(p->local);
}
vsi_nn_op_common_deinit(self);
return VSI_SUCCESS;
}
#ifdef __cplusplus
extern "C" {
#endif
@ -555,10 +503,10 @@ DEF_OP_REG
/* op_name */ BATCH_NORM,
/* init */ NULL,
/* compute */ op_compute,
/* deinit */ op_deinit,
/* deinit */ vsi_nn_op_common_deinit,
/* check */ op_check,
/* setup */ op_setup,
/* optimize */ op_optimize,
/* setup */ vsi_nn_op_common_setup,
/* optimize */ NULL,
/* input_num */ 5,
/* output_num */ 1
);
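
With op_optimize, op_setup, and op_deinit gone, the 3D/5D handling now lives entirely in _static_batchnorm, and it reduces to one shape mapping. A standalone sketch, with size_t standing in for vsi_size_t:

#include <stddef.h>
#include <stdint.h>

/* Normalize a 3D (x, c, n) or 5D shape, innermost dimension first, to
 * the 4D (w, h, c, n) layout the batch-norm kernel expects. */
static int sk_bn_shape_to_4d(const size_t *in, uint32_t rank, size_t out[4])
{
    if (rank == 3) {                    /* (x, c, n) -> (x, 1, c, n) */
        out[0] = in[0]; out[1] = 1;
        out[2] = in[1]; out[3] = in[2];
        return 1;
    }
    if (rank == 5) {                    /* fold the two innermost dims */
        out[0] = in[0] * in[1];
        out[1] = in[2]; out[2] = in[3]; out[3] = in[4];
        return 1;
    }
    return 0;                           /* other ranks need no reshape */
}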

View File

@ -118,6 +118,7 @@ static vsi_bool op_setup
if (outputs[0]->attr.dim_num == 0)
{
outputs[0]->attr.size[0] = 1;
outputs[0]->attr.dim_num = 1;
vsi_nn_SetTensorIsScalar(outputs[0], TRUE);
}
else

View File

@ -82,6 +82,7 @@ static vsi_bool op_check
{
BEGIN_IO_TYPE_DECL(CUMSUM, 1, 1)
IO_TYPE(D_U32, D_U32)
IO_TYPE(D_I32, D_I32)
IO_TYPE(D_F32, D_F32)
IO_TYPE(D_F16, D_F16)
IO_TYPE(D_BF16, D_BF16)
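
The IO_TYPE rows declare which input/output dtype pairs CUMSUM accepts; this hunk adds the I32 -> I32 pair. Conceptually the macros build a table that VALIDATE_OP_IO_TYPES later consults; an illustrative stand-in (not the real macro expansion):

#include <stddef.h>

enum sk_dtype { SK_U32, SK_I32, SK_F32, SK_F16, SK_BF16 };

struct sk_io_pair { enum sk_dtype in, out; };

static const struct sk_io_pair sk_cumsum_io[] = {
    { SK_U32,  SK_U32  },
    { SK_I32,  SK_I32  },    /* the pair this change adds */
    { SK_F32,  SK_F32  },
    { SK_F16,  SK_F16  },
    { SK_BF16, SK_BF16 },
};

static int sk_cumsum_io_supported(enum sk_dtype in, enum sk_dtype out)
{
    size_t i;
    for (i = 0; i < sizeof(sk_cumsum_io) / sizeof(sk_cumsum_io[0]); i++)
        if (sk_cumsum_io[i].in == in && sk_cumsum_io[i].out == out)
            return 1;
    return 0;
}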

View File

@ -253,6 +253,7 @@ static vsi_bool op_check
IO_TYPE(D_BOOL8, D_I32)
IO_TYPE(D_BOOL8, D_U16)
IO_TYPE(D_BOOL8, D_U32)
IO_TYPE(D_BOOL8, D_BF16)
IO_TYPE(D_U8|Q_ASYM, D_BOOL8)
IO_TYPE(D_I8|Q_ASYM, D_BOOL8)
IO_TYPE(D_I8|Q_DFP, D_BOOL8)

View File

@ -155,10 +155,10 @@ vsi_bool vsi_nn_op_eltwise_setup
vsi_size_t shape[VSI_NN_MAX_DIM_NUM] = { 0 };
vsi_bool ret = TRUE;
out_rank = inputs[0]->attr.dim_num;
out_rank = vsi_nn_get_tensor_dims(inputs[0]);
for ( i = 1; i < self->input.num; i++)
{
in2_rank = inputs[i]->attr.dim_num;
in2_rank = vsi_nn_get_tensor_dims(inputs[i]);
out_rank = vsi_nn_max( out_rank, in2_rank );
}
@ -166,10 +166,10 @@ vsi_bool vsi_nn_op_eltwise_setup
{
vsi_size_t sz0, sz1;
sz0 = i < inputs[0]->attr.dim_num ? inputs[0]->attr.size[i] : 1;
sz0 = i < vsi_nn_get_tensor_dims(inputs[0]) ? inputs[0]->attr.size[i] : 1;
for ( j = 1; j < self->input.num; j++)
{
sz1 = i < inputs[j]->attr.dim_num ? inputs[j]->attr.size[i] : 1;
sz1 = i < vsi_nn_get_tensor_dims(inputs[j]) ? inputs[j]->attr.size[i] : 1;
sz0 = vsi_nn_max( sz0, sz1 );
if (sz0 != sz1 && sz0 != 1 && sz1 != 1)
{
@ -187,11 +187,12 @@ vsi_bool vsi_nn_op_eltwise_setup
{
outputs[0]->attr.dim_num = out_rank;
memcpy( outputs[0]->attr.size, shape, out_rank * sizeof(vsi_size_t) );
if (out_rank == 1 &&
vsi_nn_GetTensorIsScalar(inputs[0]) &&
if (vsi_nn_GetTensorIsScalar(inputs[0]) &&
vsi_nn_GetTensorIsScalar(inputs[1]))
{
vsi_nn_SetTensorIsScalar(outputs[0], TRUE);
outputs[0]->attr.size[0] = 1;
outputs[0]->attr.dim_num = 1;
}
}
else
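
Swapping attr.dim_num for vsi_nn_get_tensor_dims lets scalar-flagged tensors take part in broadcasting, presumably by reporting a usable rank of 1. The broadcast rule itself, as a standalone sketch (names ours; dimensions stored innermost first, as in the loop above):

#include <stddef.h>
#include <stdint.h>

static int sk_broadcast_shapes(const size_t *a, uint32_t a_rank,
                               const size_t *b, uint32_t b_rank,
                               size_t *out, uint32_t *out_rank)
{
    uint32_t rank = a_rank > b_rank ? a_rank : b_rank;
    uint32_t i;
    for (i = 0; i < rank; i++) {
        size_t sa = i < a_rank ? a[i] : 1;   /* missing dims act as 1 */
        size_t sb = i < b_rank ? b[i] : 1;
        if (sa != sb && sa != 1 && sb != 1)
            return 0;                        /* incompatible shapes */
        out[i] = sa > sb ? sa : sb;
    }
    *out_rank = rank;
    return 1;
}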

View File

@ -199,6 +199,7 @@ static vsi_bool op_setup
if (o_rank == 0)
{
outputs[0]->attr.size[0] = 1;
outputs[0]->attr.dim_num = 1;
vsi_nn_SetTensorIsScalar(outputs[0], TRUE);
}
else

View File

@ -306,6 +306,8 @@ static vsi_bool _op_check
IO_TYPE(D_I16|Q_DFP, D_F32, D_F32, D_I16|Q_DFP)
IO_TYPE(D_I16|Q_ASYM, D_F32, D_F32, D_I16|Q_ASYM)
IO_TYPE(D_I16|Q_SYM, D_F32, D_F32, D_I16|Q_SYM)
IO_TYPE(D_U16|Q_ASYM, D_F32, D_F32, D_U16|Q_ASYM)
IO_TYPE(D_U16|Q_SYM, D_F32, D_F32, D_U16|Q_SYM)
END_IO_TYPE_DECL(GROUP_NORM)
if (!VALIDATE_OP_IO_TYPES(GROUP_NORM, self, inputs, self->input.num, outputs, self->output.num))
{

View File

@ -25,6 +25,7 @@
#include <stdlib.h>
#include "vsi_nn_types.h"
#include "vsi_nn_types_prv.h"
#include "vsi_nn_platform.h"
#include "vsi_nn_log.h"
#include "vsi_nn_graph.h"
@ -197,6 +198,7 @@ static vsi_bool op_setup_default
vsi_nn_internal_tensor_t * hstate_fc_outputs[GRUCELL_GATE_CNT] = { NULL };
vsi_nn_internal_tensor_t * h_times_r = NULL;
vsi_nn_tensor_attr_t attr;
vsi_nn_activation_e recurrent_activation = p->recurrent_activation;
vsi_nn_internal_init_node_wksp( self );
@ -230,7 +232,8 @@ static vsi_bool op_setup_default
memset(&attr, 0, sizeof(vsi_nn_tensor_attr_t));
attr.dtype.qnt_type = VSI_NN_QNT_TYPE_NONE;
if (inputs[GRUCELL_IN_H_STATE]->attr.dtype.vx_type == VSI_NN_TYPE_FLOAT32 ||
self->graph->ctx->config.support_stream_processor)
(((vsi_nn_graph_prv_t*)(self->graph))->options->config.support_stream_processor &&
recurrent_activation == VSI_NN_ACT_SIGMOID))
{
attr.dtype.vx_type = VSI_NN_TYPE_FLOAT32;
}
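
This condition, and several hunks below, reach the per-graph options through the same down-cast from the public graph handle to the private graph type. One way to read it (the macro is ours, purely illustrative):

#define SK_GRAPH_OPTIONS(g) (((vsi_nn_graph_prv_t *)(g))->options)

/* so the test above reads:
 *   inputs[GRUCELL_IN_H_STATE]->attr.dtype.vx_type == VSI_NN_TYPE_FLOAT32 ||
 *   (SK_GRAPH_OPTIONS(self->graph)->config.support_stream_processor &&
 *    recurrent_activation == VSI_NN_ACT_SIGMOID)
 */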

View File

@ -93,37 +93,15 @@ static vsi_bool op_check
{
BEGIN_IO_TYPE_DECL(L1_LAYER_NORM, 4, 1)
IO_TYPE(D_F32, D_F32, D_F32, D_F32, D_F32)
IO_TYPE(D_F16, D_F32, D_F16, D_F32, D_F16)
IO_TYPE(D_F16, D_F32, D_F32, D_F32, D_F16)
IO_TYPE(D_F16, D_F32, D_F16, D_F32, D_U8|Q_ASYM)
IO_TYPE(D_F16, D_F32, D_F32, D_F32, D_U8|Q_ASYM)
IO_TYPE(D_F16, D_F32, D_F16, D_F32, D_I8|Q_DFP)
IO_TYPE(D_F16, D_F32, D_F32, D_F32, D_I8|Q_DFP)
IO_TYPE(D_F16, D_F32, D_F16, D_F32, D_I8|Q_ASYM)
IO_TYPE(D_F16, D_F32, D_F32, D_F32, D_I8|Q_ASYM)
IO_TYPE(D_F16, D_F32, D_F16, D_F32, D_I8|Q_SYM)
IO_TYPE(D_F16, D_F32, D_F32, D_F32, D_I8|Q_SYM)
IO_TYPE(D_F16, D_F32, D_F16, D_F32, D_I16|Q_DFP)
IO_TYPE(D_F16, D_F32, D_F32, D_F32, D_I16|Q_DFP)
IO_TYPE(D_F16, D_F32, D_F16, D_F32, D_I16|Q_ASYM)
IO_TYPE(D_F16, D_F32, D_F32, D_F32, D_I16|Q_ASYM)
IO_TYPE(D_F16, D_F32, D_F16, D_F32, D_I16|Q_SYM)
IO_TYPE(D_F16, D_F32, D_F32, D_F32, D_I16|Q_SYM)
IO_TYPE(D_BF16, D_F32, D_F32, D_F32, D_BF16)
IO_TYPE(D_U8|Q_ASYM, D_F32, D_F16, D_F32, D_F16)
IO_TYPE(D_U8|Q_ASYM, D_F32, D_F16, D_F32, D_U8|Q_ASYM)
IO_TYPE(D_I16|Q_DFP, D_F32, D_F16, D_F32, D_I16|Q_DFP)
IO_TYPE(D_I16|Q_ASYM, D_F32, D_F16, D_F32, D_I16|Q_ASYM)
IO_TYPE(D_I16|Q_SYM, D_F32, D_F16, D_F32, D_I16|Q_SYM)
IO_TYPE(D_I16|Q_DFP, D_F32, D_F16, D_F32, D_F16)
IO_TYPE(D_I16|Q_ASYM, D_F32, D_F16, D_F32, D_F16)
IO_TYPE(D_I16|Q_SYM, D_F32, D_F16, D_F32, D_F16)
IO_TYPE(D_I8|Q_DFP, D_F32, D_F16, D_F32, D_I8|Q_DFP)
IO_TYPE(D_I8|Q_ASYM, D_F32, D_F16, D_F32, D_I8|Q_ASYM)
IO_TYPE(D_I8|Q_SYM, D_F32, D_F16, D_F32, D_I8|Q_SYM)
IO_TYPE(D_I8|Q_DFP, D_F32, D_F16, D_F32, D_F16)
IO_TYPE(D_I8|Q_ASYM, D_F32, D_F16, D_F32, D_F16)
IO_TYPE(D_I8|Q_SYM, D_F32, D_F16, D_F32, D_F16)
IO_TYPE(D_U8|Q_ASYM, D_F32, D_F32, D_F32, D_U8|Q_ASYM)
IO_TYPE(D_U8|Q_ASYM, D_F32, D_F32, D_F32, D_F16)
IO_TYPE(D_I16|Q_DFP, D_F32, D_F32, D_F32, D_I16|Q_DFP)

View File

@ -25,6 +25,7 @@
#include <stdlib.h>
#include "vsi_nn_types.h"
#include "vsi_nn_types_prv.h"
#include "vsi_nn_platform.h"
#include "vsi_nn_graph.h"
#include "vsi_nn_node.h"
@ -351,7 +352,7 @@ static vsi_bool op_setup
}
else if ( ( inputs[0]->attr.dtype.vx_type == VSI_NN_TYPE_BFLOAT16 &&
outputs[0]->attr.dtype.vx_type == VSI_NN_TYPE_BFLOAT16 ) ||
self->graph->ctx->config.support_stream_processor )
((vsi_nn_graph_prv_t*)(self->graph))->options->config.support_stream_processor )
{
vsi_nn_internal_tensor_t* output_tensor = NULL;
vsi_nn_internal_tensor_t* reshape_tensor = NULL;

View File

@ -106,7 +106,7 @@ static vsi_bool op_setup
vsi_nn_internal_init_node_wksp( self );
if ( axis != 0 && !self->graph->ctx->config.support_stream_processor)
if ( axis != 0 && !((vsi_nn_graph_prv_t*)(self->graph))->options->config.support_stream_processor)
{
vsi_nn_internal_tensor_t* mean_tensor = NULL;
vsi_nn_internal_tensor_t* vari_tensor = NULL;

View File

@ -25,6 +25,7 @@
#include <stdlib.h>
#include "vsi_nn_types.h"
#include "vsi_nn_types_prv.h"
#include "vsi_nn_platform.h"
#include "vsi_nn_log.h"
#include "vsi_nn_graph.h"
@ -139,7 +140,7 @@ static vsi_bool op_setup
p->is_cifg = inputs[LSTMUNIT_ACT_INPUT_FC_I] == NULL;
p->is_projection = outputs[LSTMUNIT_ACT_HSTATE_OUT] == NULL;
if (self->graph->ctx->config.support_stream_processor)
if (((vsi_nn_graph_prv_t*)(self->graph))->options->config.support_stream_processor)
{
p->is_layer_norm = inputs[LSTMUNIT_ACT_HSTATE_FC_F] == NULL;
}

View File

@ -100,6 +100,7 @@ static vsi_bool op_check
IO_TYPE(D_I32, D_I16|Q_ASYM)
IO_TYPE(D_I32, D_I16|Q_SYM)
IO_TYPE(D_I32, D_I32)
IO_TYPE(D_I32, D_BF16)
IO_TYPE(D_U8|Q_ASYM, D_I8|Q_SYM)
IO_TYPE(D_U8|Q_ASYM, D_I16|Q_SYM)
IO_TYPE(D_U8|Q_ASYM, D_I8|Q_DFP)
@ -111,8 +112,10 @@ static vsi_bool op_check
IO_TYPE(D_U8|Q_ASYM, D_BF16)
IO_TYPE(D_I8|Q_ASYM, D_I8|Q_ASYM)
IO_TYPE(D_I8|Q_ASYM, D_F16)
IO_TYPE(D_I8|Q_ASYM, D_BF16)
IO_TYPE(D_I8|Q_DFP, D_I8|Q_DFP)
IO_TYPE(D_I8|Q_DFP, D_F16)
IO_TYPE(D_I8|Q_DFP, D_BF16)
IO_TYPE(D_I16|Q_DFP, D_I8|Q_DFP)
IO_TYPE(D_I16|Q_DFP, D_U8|Q_ASYM)
IO_TYPE(D_I16|Q_DFP, D_I16|Q_DFP)
@ -124,11 +127,14 @@ static vsi_bool op_check
IO_TYPE(D_I16|Q_ASYM, D_I16|Q_ASYM)
IO_TYPE(D_I16|Q_SYM, D_I8|Q_SYM)
IO_TYPE(D_I16|Q_ASYM, D_F16)
IO_TYPE(D_I16|Q_ASYM, D_BF16)
IO_TYPE(D_I16|Q_ASYM, D_F32)
IO_TYPE(D_I16|Q_SYM, D_U8|Q_ASYM)
IO_TYPE(D_I16|Q_SYM, D_I16|Q_SYM)
IO_TYPE(D_I16|Q_SYM, D_BF16)
IO_TYPE(D_I8|Q_SYM, D_I8|Q_SYM)
IO_TYPE(D_I8|Q_SYM, D_F16)
IO_TYPE(D_I8|Q_SYM, D_BF16)
IO_TYPE(D_BF16, D_BF16)
END_IO_TYPE_DECL(ONE_HOT)
if (!VALIDATE_OP_IO_TYPES(ONE_HOT, self, inputs, self->input.num, outputs, self->output.num))

View File

@ -36,6 +36,7 @@
#include "vsi_nn_tensor_util.h"
#include "kernel/vsi_nn_kernel.h"
#include "utils/vsi_nn_constraint_check.h"
#include "vsi_nn_error.h"
#define _INPUT_NUM (1)
#define _OUTPUT_NUM (1)
@ -50,33 +51,52 @@ static vsi_status op_compute
vsi_status status = VSI_FAILURE;
vsi_nn_kernel_param_t * param = NULL;
vsi_nn_kernel_node_t n = NULL;
param =vsi_nn_kernel_param_create();
vsi_nn_tensor_t* reshape_tensor = NULL;
vsi_size_t shape[VSI_NN_MAX_DIM_NUM] = { 0 };
vsi_nn_pre_process_rgb_param* p = NULL;
vsi_nn_kernel_param_add_int32( param, "scale_x", self->nn_param.pre_process_rgb.local.scale_x );
vsi_nn_kernel_param_add_int32( param, "scale_y", self->nn_param.pre_process_rgb.local.scale_y );
vsi_nn_kernel_param_add_int32( param, "left", self->nn_param.pre_process_rgb.rect.left );
vsi_nn_kernel_param_add_int32( param, "top", self->nn_param.pre_process_rgb.rect.top );
vsi_nn_kernel_param_add_float32( param, "r_mean", self->nn_param.pre_process_rgb.r_mean );
vsi_nn_kernel_param_add_float32( param, "g_mean", self->nn_param.pre_process_rgb.g_mean );
vsi_nn_kernel_param_add_float32( param, "b_mean", self->nn_param.pre_process_rgb.b_mean );
vsi_nn_kernel_param_add_float32( param, "r_scale", self->nn_param.pre_process_rgb.r_scale );
vsi_nn_kernel_param_add_float32( param, "g_scale", self->nn_param.pre_process_rgb.g_scale );
vsi_nn_kernel_param_add_float32( param, "b_scale", self->nn_param.pre_process_rgb.b_scale );
vsi_nn_kernel_param_add_int32( param, "reverse", self->nn_param.pre_process_rgb.reverse_channel );
vsi_nn_kernel_param_add_int32( param, "enable_perm", self->nn_param.pre_process_rgb.local.enable_perm );
vsi_nn_kernel_param_add_int32( param, "enable_copy", self->nn_param.pre_process_rgb.local.enable_copy );
n = vsi_nn_kernel_selector( self->graph, "pre_process_rgb", inputs, 1, outputs, 1, param );
if( n != NULL )
memcpy(shape, inputs[0]->attr.size, inputs[0]->attr.dim_num * sizeof(vsi_size_t));
shape[0] = shape[1] * shape[0];
shape[1] = shape[2];
shape[2] = 1;
reshape_tensor = vsi_nn_reshape_tensor(self->graph,
inputs[0], shape, inputs[0]->attr.dim_num);
CHECK_PTR_FAIL_GOTO(reshape_tensor, "Create tensor failed", final);
p = (vsi_nn_pre_process_rgb_param*)&(self->nn_param.pre_process_rgb);
param = vsi_nn_kernel_param_create();
vsi_nn_kernel_param_add_int32( param, "scale_x", p->local->scale_x );
vsi_nn_kernel_param_add_int32( param, "scale_y", p->local->scale_y );
vsi_nn_kernel_param_add_int32( param, "left", p->rect.left );
vsi_nn_kernel_param_add_int32( param, "top", p->rect.top );
vsi_nn_kernel_param_add_float32( param, "r_mean", p->r_mean );
vsi_nn_kernel_param_add_float32( param, "g_mean", p->g_mean );
vsi_nn_kernel_param_add_float32( param, "b_mean", p->b_mean );
vsi_nn_kernel_param_add_float32( param, "r_scale", p->r_scale );
vsi_nn_kernel_param_add_float32( param, "g_scale", p->g_scale );
vsi_nn_kernel_param_add_float32( param, "b_scale", p->b_scale );
vsi_nn_kernel_param_add_int32( param, "reverse", p->reverse_channel );
vsi_nn_kernel_param_add_int32( param, "enable_perm", p->local->enable_perm );
vsi_nn_kernel_param_add_int32( param, "enable_copy", p->local->enable_copy );
n = vsi_nn_kernel_selector( self->graph, "pre_process_rgb", &reshape_tensor, 1, outputs, 1, param );
if ( n != NULL )
{
self->n = (vx_node)n;
status = VSI_SUCCESS;
}
if(param != NULL)
if (param != NULL)
{
vsi_nn_kernel_param_release( &param );
}
final:
vsi_safe_release_tensor(reshape_tensor);
return status;
} /* op_compute() */
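
The reshape inserted ahead of the kernel-selector call flattens the pixel dimensions so the kernel sees the image as one flat row per channel. The mapping in isolation (innermost dimension first, input laid out (w, h, c), rank of at least 3 assumed; the helper name is ours):

#include <stddef.h>
#include <string.h>

/* (w, h, c, ...) -> (w*h, c, 1, ...); trailing dims are kept as-is. */
static void sk_flatten_hw(const size_t *in, unsigned rank, size_t *out)
{
    memcpy(out, in, rank * sizeof(size_t));
    out[0] = in[0] * in[1];
    out[1] = in[2];
    out[2] = 1;
}
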
@ -166,35 +186,57 @@ static vsi_bool op_setup
}
self->nn_param.pre_process_rgb.local.enable_perm = FALSE;
p->local->enable_perm = FALSE;
if (self->nn_param.pre_process_rgb.local.enable_perm == FALSE)
if (p->local->enable_perm == FALSE)
{
p->local.scale_x = (int32_t)((p->rect.width << 15) / outputs[PRE_PROCESS_RGB_OUTPUT]->attr.size[0]);
p->local.scale_y = (int32_t)((p->rect.height << 15) / outputs[PRE_PROCESS_RGB_OUTPUT]->attr.size[1]);
p->local->scale_x = (int32_t)((p->rect.width << 15) / outputs[PRE_PROCESS_RGB_OUTPUT]->attr.size[0]);
p->local->scale_y = (int32_t)((p->rect.height << 15) / outputs[PRE_PROCESS_RGB_OUTPUT]->attr.size[1]);
}
else
{
p->local.scale_x = (int32_t)((p->rect.width << 15) / outputs[PRE_PROCESS_RGB_OUTPUT]->attr.size[1]);
p->local.scale_y = (int32_t)((p->rect.height << 15) / outputs[PRE_PROCESS_RGB_OUTPUT]->attr.size[2]);
p->local->scale_x = (int32_t)((p->rect.width << 15) / outputs[PRE_PROCESS_RGB_OUTPUT]->attr.size[1]);
p->local->scale_y = (int32_t)((p->rect.height << 15) / outputs[PRE_PROCESS_RGB_OUTPUT]->attr.size[2]);
}
p->local.enable_copy = ((p->local.scale_x == p->local.scale_y) && (p->local.scale_x == (1 << 15)));
p->local->enable_copy = ((p->local->scale_x == p->local->scale_y) && (p->local->scale_x == (1 << 15)));
return TRUE;
} /* op_setup() */
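
scale_x and scale_y are fixed-point resize factors with 15 fractional bits, src_extent/dst_extent scaled by 1 << 15, and enable_copy fires when both equal exactly 1.0 in that encoding. A sketch (names ours; widened to 64 bits to sidestep overflow on large extents):

#include <stdint.h>

static int32_t sk_resize_scale_q15(uint32_t src, uint32_t dst)
{
    return (int32_t)(((uint64_t)src << 15) / dst);
}

/* Straight copy when neither axis actually scales. */
static int sk_is_copy(int32_t scale_x, int32_t scale_y)
{
    return scale_x == scale_y && scale_x == (1 << 15);
}
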
static vsi_status op_init
(
vsi_nn_node_t* self
)
{
vsi_status status = VSI_SUCCESS;
self->nn_param.pre_process_rgb.local =
(vsi_nn_pre_process_rgb_lcl_data*)malloc(sizeof(vsi_nn_pre_process_rgb_lcl_data));
if (NULL == self->nn_param.pre_process_rgb.local)
{
return VX_ERROR_NO_MEMORY;
}
memset(self->nn_param.pre_process_rgb.local, 0, sizeof(vsi_nn_pre_process_rgb_lcl_data));
return status;
} /* op_init() */
static vsi_status op_deinit
(
vsi_nn_node_t * self
)
{
if (self->nn_param.pre_process_rgb.local.local_tensor != NULL)
if (self->nn_param.pre_process_rgb.local->local_tensor != NULL)
{
vxReleaseTensor(&self->nn_param.pre_process_rgb.local.local_tensor);
self->nn_param.pre_process_rgb.local.local_tensor = NULL;
vxReleaseTensor(&self->nn_param.pre_process_rgb.local->local_tensor);
self->nn_param.pre_process_rgb.local->local_tensor = NULL;
}
vsi_nn_safe_free(self->nn_param.pre_process_rgb.local);
vsi_nn_op_common_deinit(self);
return VSI_SUCCESS;
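
Because the local data moved from an inline member to the heap, an op_init/op_deinit pair now owns its lifetime. The pairing reduced to a sketch (struct contents abbreviated, names ours):

#include <stdint.h>
#include <stdlib.h>
#include <string.h>

struct sk_lcl_data { int32_t scale_x, scale_y; int enable_perm, enable_copy; };

static struct sk_lcl_data *sk_lcl_init(void)
{
    struct sk_lcl_data *d = malloc(sizeof(*d));
    if (d != NULL)
        memset(d, 0, sizeof(*d));       /* zeroed, as op_init does above */
    return d;
}

static void sk_lcl_deinit(struct sk_lcl_data **d)
{
    if (d != NULL && *d != NULL) {      /* free once, then NULL the slot,
                                         * mirroring vsi_nn_safe_free */
        free(*d);
        *d = NULL;
    }
}
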
@ -208,7 +250,7 @@ extern "C" {
DEF_OP_REG
(
/* op_name */ PRE_PROCESS_RGB,
/* init */ NULL,
/* init */ op_init,
/* compute */ op_compute,
/* deinit */ op_deinit,
/* check */ op_check,

View File

@ -79,7 +79,10 @@ static vsi_status _prelu_op_compute
vsi_status status = VSI_FAILURE;
vsi_nn_prelu_param *prelu = &self->nn_param.prelu;
vsi_ssize_t shapes[VSI_NN_MAX_DIM_NUM] = { 1 };
vsi_nn_tensor_t* reshape_tensors[2] = { NULL };
vsi_nn_tensor_t* reshape_tensors[3] = { NULL };
vsi_nn_tensor_t* input0 = NULL;
vsi_nn_tensor_t* input1 = NULL;
vsi_nn_tensor_t* output = NULL;
vsi_bool one_rank = FALSE;
vsi_bool is_per_channel_alpha = 0;
vsi_size_t alpha_shape = 1;
@ -88,6 +91,7 @@ static vsi_status _prelu_op_compute
uint32_t dims = outputs[0]->attr.dim_num;
reshape_tensors[0] = inputs[0];
reshape_tensors[2] = outputs[0];
one_rank = _is_one_rank_tensor(inputs[1], &alpha_shape);
for (i = 0; i < VSI_NN_MAX_DIM_NUM; i++)
@ -114,18 +118,23 @@ static vsi_status _prelu_op_compute
dims = inputs[1]->attr.dim_num;
}
reshape_tensors[1] = vsi_nn_reshape_tensor( self->graph,
input1 = vsi_nn_reshape_tensor( self->graph,
inputs[1], (vsi_size_t*)shapes, dims );
CHECK_PTR_FAIL_GOTO(input1, "Create tensor fail.", final);
reshape_tensors[1] = input1;
}
else
{
memcpy(shapes, inputs[1]->attr.size, inputs[1]->attr.dim_num * sizeof(vsi_size_t));
reshape_tensors[1] = vsi_nn_reshape_tensor( self->graph,
input1 = vsi_nn_reshape_tensor( self->graph,
inputs[1], (vsi_size_t*)shapes, inputs[1]->attr.dim_num );
CHECK_PTR_FAIL_GOTO(input1, "Create tensor fail.", final);
reshape_tensors[1] = input1;
}
}
else
{
uint32_t rank = inputs[0]->attr.dim_num;
dims = inputs[1]->attr.dim_num;
memcpy(shapes, inputs[1]->attr.size, inputs[1]->attr.dim_num * sizeof(vsi_size_t));
@ -141,9 +150,32 @@ static vsi_status _prelu_op_compute
shapes[1] = 1;
dims = 2;
}
else if (one_rank && inputs[1]->attr.is_const == TRUE &&
alpha_shape == inputs[0]->attr.size[0] &&
alpha_shape == inputs[1]->attr.size[0] &&
rank < 3)
{
is_per_channel_alpha = TRUE;
shapes[0] = 1;
shapes[1] = 1;
shapes[2] = alpha_shape;
shapes[3] = rank > 1 ? inputs[0]->attr.size[1] : 1;
dims = 4;
input0 = vsi_nn_reshape_tensor(self->graph, inputs[0], (vsi_size_t*)shapes, dims);
CHECK_PTR_FAIL_GOTO(input0, "Create tensor fail.", final);
reshape_tensors[0] = input0;
output = vsi_nn_reshape_tensor(self->graph, outputs[0], (vsi_size_t*)shapes, dims);
CHECK_PTR_FAIL_GOTO(output, "Create tensor fail.", final);
reshape_tensors[2] = output;
shapes[0] = alpha_shape;
shapes[1] = 1;
dims = 2;
}
reshape_tensors[1] = vsi_nn_reshape_tensor( self->graph,
input1 = vsi_nn_reshape_tensor( self->graph,
inputs[1], (vsi_size_t*)shapes, dims );
CHECK_PTR_FAIL_GOTO(input1, "Create tensor fail.", final);
reshape_tensors[1] = input1;
}
// Add params
@ -153,15 +185,19 @@ static vsi_status _prelu_op_compute
self->n = (vx_node)vsi_nn_kernel_selector( self->graph,
kernel_name,
&reshape_tensors[0], 2,
outputs, 1, param );
&reshape_tensors[2], 1, param );
vsi_nn_kernel_param_release( &param );
vsi_nn_ReleaseTensor( &reshape_tensors[1] );
if( self->n )
if ( self->n )
{
status = VSI_SUCCESS;
}
final:
vsi_safe_release_tensor(input0);
vsi_safe_release_tensor(input1);
vsi_safe_release_tensor(output);
return status;
} /* _prelu_op_compute() */
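
The new branch lifts a constant per-channel alpha on low-rank inputs into a layout the per-channel kernel can use. The shape arithmetic in isolation (innermost dimension first; for a rank-1 input n is 1; names ours):

#include <stddef.h>

/* Input (c, n) with a length-c alpha: data and output are lifted to
 * (1, 1, c, n) so c lands on the channel axis; alpha becomes (c, 1). */
static void sk_prelu_per_channel_shapes(size_t c, size_t n,
                                        size_t data_shape[4],
                                        size_t alpha_shape[2])
{
    data_shape[0] = 1;  data_shape[1] = 1;
    data_shape[2] = c;  data_shape[3] = n;
    alpha_shape[0] = c; alpha_shape[1] = 1;
}
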
@ -211,28 +247,36 @@ static vsi_bool op_check
)
{
BEGIN_IO_TYPE_DECL(PRELU, 2, 1)
IO_TYPE(D_F16, D_F16, D_U8|Q_ASYM)
IO_TYPE(D_F16, D_F16, D_I16|Q_DFP)
IO_TYPE(D_F16, D_F16, D_I8|Q_DFP)
IO_TYPE(D_F16, D_F16, D_F16)
IO_TYPE(D_U8|Q_ASYM, D_F16, D_U8|Q_ASYM)
IO_TYPE(D_U8|Q_ASYM, D_F16, D_F16)
IO_TYPE(D_I8|Q_DFP, D_F16, D_I8|Q_DFP)
IO_TYPE(D_I8|Q_DFP, D_F16, D_F16)
IO_TYPE(D_I16|Q_DFP, D_F16, D_I16|Q_DFP)
IO_TYPE(D_I16|Q_DFP, D_F16, D_F16)
IO_TYPE(D_BF16, D_F16, D_BF16)
IO_TYPE(D_BF16, D_BF16, D_BF16)
IO_TYPE(D_U8|Q_ASYM, D_U8|Q_ASYM, D_U8|Q_ASYM)
IO_TYPE(D_U8|Q_ASYM, D_U8|Q_ASYM, D_F16)
IO_TYPE(D_F32, D_F32, D_F32)
IO_TYPE(D_I32, D_I32, D_I32)
IO_TYPE(D_F16, D_F16, D_U8|Q_ASYM)
IO_TYPE(D_F16, D_F16, D_I16|Q_DFP)
IO_TYPE(D_F16, D_F16, D_I8|Q_DFP)
IO_TYPE(D_F16, D_F16, D_F16)
IO_TYPE(D_U8|Q_ASYM, D_F16, D_U8|Q_ASYM)
IO_TYPE(D_U8|Q_ASYM, D_F16, D_F16)
IO_TYPE(D_I8|Q_DFP, D_F16, D_I8|Q_DFP)
IO_TYPE(D_I8|Q_DFP, D_F16, D_F16)
IO_TYPE(D_I8|Q_SYM, D_F16, D_I8|Q_SYM)
IO_TYPE(D_I8|Q_ASYM, D_F16, D_I8|Q_ASYM)
IO_TYPE(D_I8|Q_SYM, D_F16, D_F16)
IO_TYPE(D_I8|Q_ASYM, D_F16, D_F16)
IO_TYPE(D_I16|Q_DFP, D_F16, D_I16|Q_DFP)
IO_TYPE(D_I16|Q_DFP, D_F16, D_F16)
IO_TYPE(D_I16|Q_SYM, D_F16, D_I16|Q_SYM)
IO_TYPE(D_I16|Q_ASYM, D_F16, D_I16|Q_ASYM)
IO_TYPE(D_I16|Q_SYM, D_F16, D_F16)
IO_TYPE(D_I16|Q_ASYM, D_F16, D_F16)
IO_TYPE(D_BF16, D_F16, D_BF16)
IO_TYPE(D_BF16, D_BF16, D_BF16)
IO_TYPE(D_U8|Q_ASYM, D_U8|Q_ASYM, D_U8|Q_ASYM)
IO_TYPE(D_U8|Q_ASYM, D_U8|Q_ASYM, D_F16)
IO_TYPE(D_F32, D_F32, D_F32)
IO_TYPE(D_I32, D_I32, D_I32)
/* HW 9.0 */
IO_TYPE(D_F32, D_BF16, D_BF16)
IO_TYPE(D_BF16, D_BF16, D_F32)
IO_TYPE(D_F32, D_BF16, D_BF16)
IO_TYPE(D_BF16, D_BF16, D_F32)
END_IO_TYPE_DECL(PRELU)
if(!VALIDATE_OP_IO_TYPES(PRELU, self, inputs, self->input.num, outputs, self->output.num)) {
if (!VALIDATE_OP_IO_TYPES(PRELU, self, inputs, self->input.num, outputs, self->output.num)) {
char* desc = generate_op_io_types_desc(inputs,
self->input.num, outputs, self->output.num);
VSILOGE("Inputs/Outputs data type not support: %s", desc);

View File

@ -162,7 +162,7 @@ static vsi_bool _check_is_sp_supported_type
int32_t * axes = self->nn_param.reduce.local2->axes;
int32_t axes_num = self->nn_param.reduce.local2->axes_num;
if ( !self->graph->ctx->config.support_stream_processor ||
if ( !((vsi_nn_graph_prv_t*)(self->graph))->options->config.support_stream_processor ||
(type != VSI_NN_REDUCE_SUM && type != VSI_NN_REDUCE_MEAN && type != VSI_NN_REDUCE_MAX) )
{
return FALSE;
@ -788,7 +788,7 @@ static vsi_bool op_set_reduce_axis(
}
*out_rank_x = inputs[0]->attr.dim_num;
}
else if (!self->graph->ctx->config.support_stream_processor ||
else if (!((vsi_nn_graph_prv_t*)(self->graph))->options->config.support_stream_processor ||
resolved_dim_count > 2)
{
optimzation_input_size(

View File

@ -61,7 +61,7 @@ static vsi_status op_compute
vx_nn_reshape_params_t reshape_param;
memset(&attr, 0, sizeof(attr));
attr.size[0] = self->nn_param.reshape.dim_num;
attr.size[0] = vsi_nn_max(self->nn_param.reshape.dim_num, 1);
attr.dim_num = 1;
attr.is_const = TRUE;
attr.dtype.vx_type = VSI_NN_TYPE_INT32;
@ -124,17 +124,28 @@ static vsi_bool op_setup
vsi_bool ret = TRUE;
if ( VSI_NN_DIM_AUTO == outputs[0]->attr.dim_num )
{
vsi_size_t shape[VSI_NN_MAX_DIM_NUM] = {0};
uint32_t i = 0;
for (i = 0; i < self->nn_param.reshape.dim_num; i++)
if (self->nn_param.reshape.dim_num == 0 ||
self->nn_param.reshape.size == NULL
)
{
shape[i] = (uint32_t)-1 == self->nn_param.reshape.size[i] ? \
(vsi_size_t)-1 : (vsi_size_t)self->nn_param.reshape.size[i];
outputs[0]->attr.size[0] = 1;
outputs[0]->attr.dim_num = 1;
vsi_nn_SetTensorIsScalar(outputs[0], TRUE);
}
else
{
vsi_size_t shape[VSI_NN_MAX_DIM_NUM] = { 0 };
uint32_t i = 0;
for (i = 0; i < self->nn_param.reshape.dim_num; i++)
{
shape[i] = (uint32_t)-1 == self->nn_param.reshape.size[i] ? \
(vsi_size_t)-1 : (vsi_size_t)self->nn_param.reshape.size[i];
}
ret = vsi_nn_CalcReshapeTensor(inputs[0],
outputs[0],
shape,
self->nn_param.reshape.dim_num);
}
ret = vsi_nn_CalcReshapeTensor(inputs[0],
outputs[0],
shape,
self->nn_param.reshape.dim_num);
}
return ret;
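
op_setup now maps the user's (uint32_t)-1 wildcard into vsi_size_t before handing the shape to vsi_nn_CalcReshapeTensor. The usual semantics behind such a helper, sketched under assumptions (this is not the library's code): at most one -1 entry, which absorbs whatever factor of the element count the explicit dims leave over.

#include <stddef.h>

static int sk_resolve_reshape(size_t total_elems, const long *req,
                              unsigned rank, size_t *out)
{
    size_t known = 1;
    int wild = -1;
    unsigned i;

    for (i = 0; i < rank; i++) {
        if (req[i] == -1) {
            if (wild >= 0)
                return 0;               /* at most one wildcard */
            wild = (int)i;
        } else {
            known *= (size_t)req[i];
            out[i] = (size_t)req[i];
        }
    }
    if (wild >= 0) {
        if (known == 0 || total_elems % known != 0)
            return 0;                   /* leftover must divide evenly */
        out[wild] = total_elems / known;
        return 1;
    }
    return known == total_elems;        /* no wildcard: must match */
}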

View File

@ -66,7 +66,7 @@ static vsi_status op_compute
}
memset(&attr, 0, sizeof(attr));
attr.size[0] = self->nn_param.reshape2.dim_num;
attr.size[0] = vsi_nn_max(self->nn_param.reshape2.dim_num, 1);
attr.dim_num = 1;
attr.is_const = TRUE;
attr.dtype.vx_type = VSI_NN_TYPE_INT32;
@ -161,13 +161,24 @@ static vsi_bool op_setup
vsi_bool ret = TRUE;
if ( VSI_NN_DIM_AUTO == outputs[0]->attr.dim_num )
{
vsi_size_t shape[VSI_NN_MAX_DIM_NUM] = {0};
memcpy(shape, self->nn_param.reshape2.size,
sizeof(vsi_size_t) * self->nn_param.reshape2.dim_num);
ret = vsi_nn_CalcReshapeTensor(inputs[0],
outputs[0],
shape,
self->nn_param.reshape2.dim_num);
if (self->nn_param.reshape2.dim_num == 0 ||
self->nn_param.reshape2.size == NULL
)
{
outputs[0]->attr.size[0] = 1;
outputs[0]->attr.dim_num = 1;
vsi_nn_SetTensorIsScalar(outputs[0], TRUE);
}
else
{
vsi_size_t shape[VSI_NN_MAX_DIM_NUM] = { 0 };
memcpy(shape, self->nn_param.reshape2.size,
sizeof(vsi_size_t) * self->nn_param.reshape2.dim_num);
ret = vsi_nn_CalcReshapeTensor(inputs[0],
outputs[0],
shape,
self->nn_param.reshape2.dim_num);
}
}
return ret;

Some files were not shown because too many files have changed in this diff