From c4e75674fa2ce8393938ffaae4d893c34a0b5261 Mon Sep 17 00:00:00 2001 From: Kee Date: Mon, 13 Oct 2025 13:15:31 +0800 Subject: [PATCH] Refine platform code and samples (#713) * Refine platform code and samples 1. Support viplite v2 API 2. Unify the Lite and Native platform APIs so that the same code can run on different platforms through different compilation options. Type: Code Improvement Signed-off-by: Kee * Fix build error if VSI device API is not supported Signed-off-by: Kee --------- Signed-off-by: Kee --- cmake/local_sdk.cmake | 6 +- include/tim/vx/platform/lite/lite_native.h | 86 ++- include/tim/vx/platform/native.h | 56 +- include/tim/vx/platform/platform.h | 63 +- samples/CMakeLists.txt | 4 +- samples/lenet_multi_device/CMakeLists.txt | 5 + .../lenet_multi_device/lenet_multi_device.cc | 52 +- samples/lite_multi_device/CMakeLists.txt | 13 - samples/multi_device/README | 24 +- samples/multi_device/multi_device.cc | 113 ++-- samples/multi_device/multi_device_demo.cc | 33 +- samples/multi_device/vx_resnet50.cc | 2 +- samples/platform_sample/CMakeLists.txt | 13 + samples/platform_sample/README | 25 + .../platform_sample.cc} | 18 +- src/tim/CMakeLists.txt | 6 +- src/tim/vx/platform/lite/lite_native.cc | 584 ++++++++++++------ .../vx/platform/lite/lite_native_private.h | 147 +++++ src/tim/vx/platform/native.cc | 345 ++++++----- src/tim/vx/platform/native_device_private.h | 58 -- src/tim/vx/platform/native_private.h | 106 ++++ 21 files changed, 1098 insertions(+), 661 deletions(-) delete mode 100644 samples/lite_multi_device/CMakeLists.txt create mode 100644 samples/platform_sample/CMakeLists.txt create mode 100644 samples/platform_sample/README rename samples/{lite_multi_device/lite_multi_device.cc => platform_sample/platform_sample.cc} (87%) create mode 100644 src/tim/vx/platform/lite/lite_native_private.h delete mode 100644 src/tim/vx/platform/native_device_private.h create mode 100644 src/tim/vx/platform/native_private.h diff --git a/cmake/local_sdk.cmake 
b/cmake/local_sdk.cmake index a74de46..bed2a21 100644 --- a/cmake/local_sdk.cmake +++ b/cmake/local_sdk.cmake @@ -9,7 +9,11 @@ list(APPEND OVXDRV_INCLUDE_DIRS if("${CONFIG}" STREQUAL "BUILDROOT") set(VIV_SDK_DRIVER_PREFIX "usr/lib") else() - set(VIV_SDK_DRIVER_PREFIX "drivers") + if(EXISTS ${EXTERNAL_VIV_SDK}/drivers) + set(VIV_SDK_DRIVER_PREFIX "drivers") + else() + set(VIV_SDK_DRIVER_PREFIX "lib") + endif() endif() message("using driver libs from ${EXTERNAL_VIV_SDK}/${VIV_SDK_DRIVER_PREFIX}") diff --git a/include/tim/vx/platform/lite/lite_native.h b/include/tim/vx/platform/lite/lite_native.h index b83d5fe..a9ed553 100644 --- a/include/tim/vx/platform/lite/lite_native.h +++ b/include/tim/vx/platform/lite/lite_native.h @@ -25,72 +25,58 @@ #define TIM_VX_LITE_NATIVE_H_ #include "tim/vx/platform/platform.h" -#include "vip_lite.h" -#include "nbg_linker.h" namespace tim { namespace vx { namespace platform { -class LiteNativeExecutor - : public IExecutor, - public std::enable_shared_from_this { +class LiteNativeDevice : public IDevice { public: - LiteNativeExecutor(const std::shared_ptr& device); - virtual ~LiteNativeExecutor(); - bool Submit(const std::shared_ptr& executable, - const std::shared_ptr& ref, - bool after = true) override; - bool Trigger(bool async = false) override; - std::shared_ptr Compile( - const std::shared_ptr& graph) override; - - private: - vip_task_descriptor_t* task_descriptor_; - vip_database database_; + virtual ~LiteNativeDevice() {}; + virtual bool Submit(const std::shared_ptr& graph) = 0; + virtual bool Trigger(bool async = false, async_callback cb = NULL) = 0; + virtual bool DeviceExit() = 0; + virtual void WaitDeviceIdle() = 0; + virtual std::shared_ptr CreateExecutor(const int32_t core_index = 0, + const int32_t core_count = -1, + const std::shared_ptr& context = nullptr) = 0; + static std::vector> Enumerate(); + static bool vip_initialized; +}; +class LiteNativeExecutor + : public IExecutor { + public: + virtual ~LiteNativeExecutor() 
{}; + virtual bool Submit(const std::shared_ptr& executable, + const std::shared_ptr& ref, + bool after = true) = 0; + virtual bool Trigger(bool async = false) = 0; + virtual std::shared_ptr Compile( + const std::shared_ptr& graph) = 0; }; class LiteNativeExecutable : public IExecutable { public: - LiteNativeExecutable(const std::shared_ptr& executor, - const std::vector& nb_buf); - virtual ~LiteNativeExecutable(); - void SetInput(const std::shared_ptr& th) override; - void SetOutput(const std::shared_ptr& th) override; - void GetOutput( - const std::vector>& th) override; - bool Submit(const std::shared_ptr& ref, bool after) override; - bool Trigger(bool async) override; - bool Verify() override; - std::shared_ptr AllocateTensor( - const TensorSpec& tensor_spec) override; - - vip_network network_; - - private: - void SetBuffer(vip_memory_t* dst, gcvip_videomemory_t* src); - - int32_t input_count_; - int32_t output_count_; - - gcvip_videomemory_t* coeff_; - gcvip_videomemory_t* command_; - gcvip_videomemory_t* memory_pool_; - gcvip_videomemory_t* others_; - gcvip_videomemory_t* pre_command_; + virtual ~LiteNativeExecutable() {}; + virtual void SetInput(const std::shared_ptr& th) = 0; + virtual void SetOutput(const std::shared_ptr& th) = 0; + virtual void SetInputs(const std::vector>& ths) = 0; + virtual void SetOutputs(const std::vector>& ths) = 0; + virtual bool Submit(const std::shared_ptr& ref, bool after) = 0; + virtual bool Trigger(bool async) = 0; + virtual bool Verify() = 0; + virtual std::shared_ptr AllocateTensor(const TensorSpec& tensor_spec, + void* data = nullptr, uint32_t size = 0) = 0; }; class LiteNativeTensorHandle : public ITensorHandle { public: - LiteNativeTensorHandle(const std::shared_ptr& tensr); - virtual ~LiteNativeTensorHandle(); - bool CopyDataToTensor(const void* data, uint32_t size_in_bytes) override; - bool CopyDataFromTensor(void* data) override; - - gcvip_videomemory_t* tensor_buffer_; + virtual ~LiteNativeTensorHandle() {}; + bool 
CopyDataToTensor(const void* data, uint32_t size_in_bytes) = 0; + bool CopyDataFromTensor(void* data) = 0; }; } // namespace platform } // namespace vx } // namespace tim -#endif \ No newline at end of file +#endif diff --git a/include/tim/vx/platform/native.h b/include/tim/vx/platform/native.h index cecf34a..8521731 100644 --- a/include/tim/vx/platform/native.h +++ b/include/tim/vx/platform/native.h @@ -37,51 +37,41 @@ class NativeDevice : public IDevice { virtual bool Trigger(bool async = false, async_callback cb = NULL) = 0; virtual bool DeviceExit() = 0; virtual void WaitDeviceIdle() = 0; + virtual std::shared_ptr CreateExecutor(const int32_t core_index = 0, + const int32_t core_count = -1, + const std::shared_ptr& context = nullptr) = 0; static std::vector> Enumerate(); }; class NativeExecutable : public IExecutable { public: - NativeExecutable(const std::shared_ptr& executor, - const std::vector& nb_buf, size_t inputs, - size_t outputs); - ~NativeExecutable(){}; - void SetInput(const std::shared_ptr& th) override; - void SetOutput(const std::shared_ptr& th) override; - void GetOutput( - const std::vector>& th) override; - bool Submit(const std::shared_ptr& ref, - bool after = true) override; - bool Trigger(bool async = false) override; - std::shared_ptr AllocateTensor( - const TensorSpec& tensor_spec) override; - bool Verify() override; - - protected: - std::shared_ptr nb_node_; - std::vector nb_buf_; + virtual ~NativeExecutable() {}; + virtual void SetInput(const std::shared_ptr& th) = 0; + virtual void SetOutput(const std::shared_ptr& th) = 0; + virtual void SetInputs(const std::vector>& ths) = 0; + virtual void SetOutputs(const std::vector>& ths) = 0; + virtual bool Submit(const std::shared_ptr& ref, + bool after = true) = 0; + virtual bool Trigger(bool async = false) = 0; + virtual std::shared_ptr AllocateTensor(const TensorSpec& tensor_spec, + void* data = nullptr, uint32_t size = 0) = 0; + virtual bool Verify() = 0; }; -class NativeExecutor : public 
IExecutor, - public std::enable_shared_from_this { +class NativeExecutor : public IExecutor { public: - NativeExecutor(const std::shared_ptr& device); - NativeExecutor(const std::shared_ptr& device, - const std::shared_ptr& context); - ~NativeExecutor(){}; - bool Submit(const std::shared_ptr& executable, - const std::shared_ptr& ref, - bool after = true) override; - bool Trigger(bool async = false) override; - std::shared_ptr Compile( - const std::shared_ptr& graph) override; + virtual ~NativeExecutor(){}; + virtual bool Submit(const std::shared_ptr& executable, + const std::shared_ptr& ref, + bool after = true) = 0; + virtual bool Trigger(bool async = false) = 0; + virtual std::shared_ptr Compile(const std::shared_ptr& graph) = 0; }; class NativeTensorHandle : public ITensorHandle { public: - NativeTensorHandle(const std::shared_ptr& tensor); - bool CopyDataToTensor(const void* data, uint32_t size_in_bytes) override; - bool CopyDataFromTensor(void* data) override; + virtual bool CopyDataToTensor(const void* data, uint32_t size_in_bytes) = 0; + virtual bool CopyDataFromTensor(void* data) = 0; }; } // namespace platform diff --git a/include/tim/vx/platform/platform.h b/include/tim/vx/platform/platform.h index 263042b..94ba61c 100644 --- a/include/tim/vx/platform/platform.h +++ b/include/tim/vx/platform/platform.h @@ -46,15 +46,12 @@ namespace platform { class IDevice; class IExecutable; -class ExecutableSet; class IExecutor; class ITensorHandle; std::shared_ptr Compile( const std::shared_ptr& graph, const std::shared_ptr& executor); -std::shared_ptr CreateExecutableSet( - const std::vector>& executables); class IDevice { public: @@ -68,17 +65,25 @@ class IDevice { virtual ~IDevice(){}; virtual bool Submit(const std::shared_ptr& graph) = 0; virtual bool Trigger(bool async = false, async_callback cb = NULL) = 0; - device_id_t Id() const; + device_id_t Id() const { return device_id_;}; virtual void WaitDeviceIdle() = 0; virtual bool DeviceExit() = 0; virtual void 
RemoteReset(); + uint32_t CoreCount() const {return core_count_;}; + virtual std::shared_ptr CreateExecutor(const int32_t core_index = 0, + const int32_t core_count = -1, + const std::shared_ptr& context = nullptr) = 0; + static std::vector> Enumerate(); protected: device_id_t device_id_; + uint32_t core_count_; + }; class IExecutor { public: + //using task = std::shared_ptr; using task = std::weak_ptr; virtual ~IExecutor(){}; virtual bool Submit(const std::shared_ptr& executable, @@ -87,13 +92,17 @@ class IExecutor { virtual bool Trigger(bool async = false) = 0; // todo: async=true virtual std::shared_ptr Compile( const std::shared_ptr& graph) = 0; - virtual std::shared_ptr Device() const; - virtual std::shared_ptr Contex() const; - + virtual std::shared_ptr Device() const {return device_;}; + virtual std::shared_ptr Contex() const {return context_;}; + virtual uint32_t CoreIndex() const {return core_index_; }; + virtual uint32_t CoreCount() const {return core_count_; }; protected: std::vector tasks_; std::shared_ptr device_; std::shared_ptr context_; + uint32_t core_index_; + uint32_t core_count_; + }; class IExecutable : public std::enable_shared_from_this { @@ -101,40 +110,24 @@ class IExecutable : public std::enable_shared_from_this { virtual ~IExecutable(){}; virtual void SetInput(const std::shared_ptr& th) = 0; virtual void SetOutput(const std::shared_ptr& th) = 0; - virtual void GetOutput( - const std::vector>& th) = 0; // for remote + virtual void SetInputs(const std::vector>& ths) = 0; + virtual void SetOutputs(const std::vector>& ths) = 0; + virtual std::vector> GetOutputs() { return output_handles_;}; + virtual std::vector> Getinputs() { return input_handles_;}; virtual bool Submit(const std::shared_ptr& ref, bool after = true) = 0; virtual bool Trigger(bool async = false) = 0; // todo: async=true virtual bool Verify() = 0; - virtual std::shared_ptr NBGraph() const; - virtual std::shared_ptr AllocateTensor( - const TensorSpec& tensor_spec) = 0; - virtual 
std::shared_ptr Executor() const; + std::shared_ptr NBGraph() const {return nb_graph_;}; + virtual std::shared_ptr AllocateTensor(const TensorSpec& tensor_spec , + void* data = nullptr, uint32_t size = 0) = 0; protected: std::weak_ptr executor_; std::shared_ptr context_; std::shared_ptr nb_graph_; -}; - -class ExecutableSet : public IExecutable { - public: - ExecutableSet(const std::vector>& executables); - void SetInput(const std::shared_ptr& th) override; - void SetOutput(const std::shared_ptr& th) override; - void GetOutput( - const std::vector>& th) override; - bool Submit(const std::shared_ptr& ref, - bool after = true) override; - bool Trigger(bool async = false) override; - bool Verify() override; - std::shared_ptr AllocateTensor( - const TensorSpec& tensor_spec) override; - std::vector> Executables() const; - - protected: - std::vector> executables_; + std::vector> input_handles_; + std::vector> output_handles_; }; class ITensorHandle { @@ -142,13 +135,15 @@ class ITensorHandle { virtual ~ITensorHandle(){}; virtual bool CopyDataToTensor(const void* data, uint32_t size_in_bytes) = 0; virtual bool CopyDataFromTensor(void* data) = 0; - virtual std::shared_ptr GetTensor() const; + virtual std::shared_ptr GetTensor() const { return tensor_;}; + virtual TensorSpec& GetSpec() { return spec_;}; protected: std::shared_ptr tensor_; + TensorSpec spec_; }; } // namespace platform } // namespace vx } // namespace tim -#endif \ No newline at end of file +#endif diff --git a/samples/CMakeLists.txt b/samples/CMakeLists.txt index 919c0a5..301afdb 100644 --- a/samples/CMakeLists.txt +++ b/samples/CMakeLists.txt @@ -20,9 +20,7 @@ endif() if(TIM_VX_ENABLE_PLATFORM) add_subdirectory("lenet_multi_device") add_subdirectory("multi_device") - if(${TIM_VX_ENABLE_PLATFORM_LITE}) - add_subdirectory("lite_multi_device") - endif() + add_subdirectory("platform_sample") if(TIM_VX_ENABLE_GRPC) add_subdirectory("grpc") endif() diff --git a/samples/lenet_multi_device/CMakeLists.txt 
b/samples/lenet_multi_device/CMakeLists.txt index 6658898..e62787b 100644 --- a/samples/lenet_multi_device/CMakeLists.txt +++ b/samples/lenet_multi_device/CMakeLists.txt @@ -11,5 +11,10 @@ target_include_directories(${TARGET_NAME} PRIVATE ${PROJECT_SOURCE_DIR}/include ) +target_include_directories(${TARGET_NAME} PRIVATE + ${CMAKE_CURRENT_SOURCE_DIR} + ${PROJECT_SOURCE_DIR}/include +) + install(TARGETS ${TARGET_NAME} ${TARGET_NAME} DESTINATION ${CMAKE_INSTALL_PREFIX}/${CMAKE_INSTALL_BINDIR}) diff --git a/samples/lenet_multi_device/lenet_multi_device.cc b/samples/lenet_multi_device/lenet_multi_device.cc index 9ce79c9..1761e97 100644 --- a/samples/lenet_multi_device/lenet_multi_device.cc +++ b/samples/lenet_multi_device/lenet_multi_device.cc @@ -33,7 +33,6 @@ #include "tim/vx/context.h" #include "tim/vx/graph.h" #include "tim/vx/platform/platform.h" -#include "tim/vx/platform/native.h" std::vector input_data = { 0, 0, 0, 0, 0, 0, 0, 0, 6, 0, 2, 0, 0, 8, 0, @@ -108,17 +107,17 @@ static void printTopN(const T* prob, int outputCount, int topNum) { } } + int main(int argc, char** argv) { (void) argc, (void) argv; auto context0 = tim::vx::Context::Create(); auto graph0 = lenet(context0); auto graph1 = lenet(context0); - auto devices = tim::vx::platform::NativeDevice::Enumerate(); + auto devices = tim::vx::platform::IDevice::Enumerate(); auto device = devices[0]; - std::shared_ptr executor = std::make_shared (device); - - auto executable0 = tim::vx::platform::Compile(graph0, executor); // compile to nbg + auto executor = device->CreateExecutor(0,-1,context0); + auto executable0 = tim::vx::platform::Compile(graph0, executor); auto input_handle0 = executable0->AllocateTensor(graph0->InputsTensor()[0]->GetSpec()); auto output_handle0 = executable0->AllocateTensor(graph0->OutputsTensor()[0]->GetSpec()); executable0->SetInput(input_handle0); @@ -127,7 +126,18 @@ int main(int argc, char** argv) { assert(executable0->Submit(executable0)); executable0->Trigger(); - auto executable1 
= tim::vx::platform::Compile(graph1, executor); // compile to nbg + std::vector output_data; + output_data.resize(1 * 10); + if (!output_handle0->CopyDataFromTensor(output_data.data())) { + std::cout << "Copy output data fail." << std::endl; + return -1; + } + std::cout << "executable0 out." << std::endl; + printTopN(output_data.data(), output_data.size(), 5); + output_data.assign(output_data.size(),0); + output_handle0->CopyDataToTensor(output_data.data(), output_data.size() * sizeof(output_data[0])); + + auto executable1 = tim::vx::platform::Compile(graph1, executor); auto input_handle1 = executable1->AllocateTensor(graph1->InputsTensor()[0]->GetSpec()); auto output_handle1 = executable1->AllocateTensor(graph1->OutputsTensor()[0]->GetSpec()); executable1->SetInput(input_handle1); @@ -136,34 +146,28 @@ int main(int argc, char** argv) { assert(executable1->Submit(executable0)); executable1->Trigger(); + std::vector output_data1; + output_data1.resize(1 * 10); + if (!output_handle1->CopyDataFromTensor(output_data1.data())) { + std::cout << "Copy output data fail." << std::endl; + return -1; + } + std::cout << "executable1 out." << std::endl; + printTopN(output_data1.data(), output_data1.size(), 5); + output_data1.assign(output_data1.size(),0); + output_handle1->CopyDataToTensor(output_data1.data(), output_data1.size() * sizeof(output_data1[0])); + executor->Submit(executable0, executable0); executor->Submit(executable1, executable0); - std::vector> executables0; - executables0.push_back(executable0); - executables0.push_back(executable1); - auto executable_set0 = tim::vx::platform::CreateExecutableSet(executables0); - executor->Submit(executable_set0, executable_set0); executor->Trigger(); - - std::vector input_data0; - input_data0.resize(28 * 28); - if (!input_handle0->CopyDataFromTensor(input_data0.data())) { - std::cout << "Copy intput data fail." 
<< std::endl; - return -1; - } - printTopN(input_data0.data(), input_data0.size(), 5); - - std::vector output_data; - output_data.resize(1 * 10); + std::cout << "executor out." << std::endl; if (!output_handle0->CopyDataFromTensor(output_data.data())) { std::cout << "Copy output data fail." << std::endl; return -1; } printTopN(output_data.data(), output_data.size(), 5); - std::vector output_data1; - output_data1.resize(1 * 10); if (!output_handle1->CopyDataFromTensor(output_data1.data())) { std::cout << "Copy output data fail." << std::endl; return -1; diff --git a/samples/lite_multi_device/CMakeLists.txt b/samples/lite_multi_device/CMakeLists.txt deleted file mode 100644 index 0356eef..0000000 --- a/samples/lite_multi_device/CMakeLists.txt +++ /dev/null @@ -1,13 +0,0 @@ -message("samples/lite_multi_device") - -set(TARGET_NAME "lite_multi_device") - -add_executable(${TARGET_NAME} ${CMAKE_CURRENT_SOURCE_DIR}/lite_multi_device.cc) - -target_link_libraries(${TARGET_NAME} PRIVATE -Wl,--whole-archive tim-vx) -target_include_directories(${TARGET_NAME} PRIVATE - ${PROJECT_SOURCE_DIR}/include - ${PROJECT_SOURCE_DIR}/prebuilt-sdk/viplite/build/sdk/include) - -install(TARGETS ${TARGET_NAME} ${TARGET_NAME} - DESTINATION ${CMAKE_INSTALL_PREFIX}/${CMAKE_INSTALL_BINDIR}) \ No newline at end of file diff --git a/samples/multi_device/README b/samples/multi_device/README index 557e20d..890e417 100644 --- a/samples/multi_device/README +++ b/samples/multi_device/README @@ -1,15 +1,25 @@ ## brief -The multi_device demo uses some acuity exported tim-vx networks, and running on 4 devices of NPU using platform api. +The multi_device demo uses some acuity exported tim-vx networks, and running on multi-core devices of NPU using platform api. 
-## environment - export VSIMULATOR_CONFIG=VIP9400O_PID0XD9 - export VIV_MGPU_AFFINITY="1:0" - export VIV_OVX_USE_MULTI_DEVICE="1:1" - export TIM_VX_ROOT="${workspaceFolder}/tim-vx" +## note +Please note that if you have enabled lite platform, a dedicated VIVANTE_SDK(NO_KERNEL) is required as the compiler for NBG. +The driver for the NPU is the VIPLITE driver + +##requirements +Vivante SDK >= 6.4.22 +ovxlib >= 1.2.26 +viplite >=2.0.0 ## build cd build -cmake .. -DCMAKE_BUILD_TYPE=Debug -DTIM_VX_BUILD_EXAMPLES=ON -DTIM_VX_ENABLE_PLATFORM=ON +cmake .. -DCMAKE_BUILD_TYPE=Release -DTIM_VX_USE_EXTERNAL_OVXLIB=ON -DEXTERNAL_VIV_SDK=${VIVANTE_NOKERNEL_SDK_DIR} -DOVXLIB_INC=${OVXLIB_DIR}/include \ + -DOVXLIB_LIB=/path/to/libovxlib.so -DTIM_VX_BUILD_EXAMPLES=ON -DTIM_VX_ENABLE_PLATFORM=ON -DTIM_VX_ENABLE_PLATFORM_LITE=ON -DVIP_LITE_SDK=${VIP_LITE_SDK} + +## environment +# Export VIV_GPU_FILE to specify the NPU hardware configuration file for the NBG compiler +# VIV_GPU_FILE Specify the NPU hardware configuration file for the NBG compiler +export VIV_GPU_FILE="/path/to/VIP9400NANOQ_PLUS_PID0X10000055.config" +export TIM_VX_ROOT="${workspaceFolder}/tim-vx" ## run cd build diff --git a/samples/multi_device/multi_device.cc b/samples/multi_device/multi_device.cc index 6e1e772..c3f040a 100644 --- a/samples/multi_device/multi_device.cc +++ b/samples/multi_device/multi_device.cc @@ -35,7 +35,6 @@ #include "tim/vx/context.h" #include "tim/vx/graph.h" #include "tim/vx/platform/platform.h" -#include "tim/vx/platform/native.h" #include "vx_lenet.h" #include "vx_mobilenet.h" #include "vx_resnet50.h" @@ -59,7 +58,7 @@ static void printTopN(const T* prob, int outputCount, int topNum) { } template -void print_topN(std::size_t size, std::shared_ptr handle) { +void print_topN(std::size_t size, std::shared_ptr & handle) { std::vector output_data; output_data.resize(size); if (!handle->CopyDataFromTensor(output_data.data())) { @@ -94,7 +93,8 @@ void executor_trigger(std::shared_ptr executor) { 
} auto context = tim::vx::Context::Create(); -std::pair, std::shared_ptr> generate_executable( +std::pair, std::shared_ptr> + generate_executable( std::shared_ptr executor, std::function, const char*)> construct_func, std::string weight_file, @@ -114,15 +114,17 @@ std::pair, std::shared_ptr executor0 = std::make_shared (device0); - auto device1 = devices[1]; - std::shared_ptr executor1 = std::make_shared (device1); - auto device2 = devices[2]; - std::shared_ptr executor2 = std::make_shared (device2); - auto device3 = devices[3]; - std::shared_ptr executor3 = std::make_shared (device3); + auto total_core_count = device0->CoreCount(); + uint32_t core_index = 0; + auto use_core_count = 1; + std::vector> executors; + + for(core_index = 0; core_index < total_core_count; core_index += use_core_count) { + auto executor = device0->CreateExecutor(core_index,use_core_count, context); + executors.push_back(executor); + } auto root = std::getenv("TIM_VX_ROOT"); assert(root != NULL); @@ -142,46 +144,57 @@ int main(int argc, char** argv) { auto resnet50_weight_file = ROOT + "/samples/multi_device/resnet50/resnet50.export.data"; std::function, const char*)> resnet50_construct_func = acuitylite::resnet50::construct_graph; - std::shared_ptr lenet_0, lenet_2, lenet_3, mobilenet_1, mobilenet_2, mobilenet_3, resnet50_0, resnet50_1; - std::shared_ptr lenet_0_outhandle, lenet_2_outhandle, lenet_3_outhandle, mobilenet_1_outhandle, mobilenet_2_outhandle, mobilenet_3_outhandle, - resnet50_0_outhandle, resnet50_1_outhandle; + auto excutor_cnt = executors.size(); - std::tie(lenet_0, lenet_0_outhandle) = generate_executable(executor0, lenet_construct_func, lenet_weight_file, lenet_input_files, lenet_input_bytes); - std::tie(resnet50_0, resnet50_0_outhandle) = generate_executable(executor0, resnet50_construct_func, resnet50_weight_file, resnet50_input_files, resnet50_input_bytes); - executor0->Submit(lenet_0, lenet_0); - executor0->Submit(resnet50_0, lenet_0); + //each excutor run 2 models. 
+ auto lenet = [&](std::shared_ptr executor) { + return generate_executable(executor, lenet_construct_func, lenet_weight_file, + lenet_input_files, lenet_input_bytes); + }; + auto resnet = [&](std::shared_ptr executor) { + return generate_executable(executor, resnet50_construct_func, resnet50_weight_file, + resnet50_input_files, resnet50_input_bytes); + }; + auto mobilenet = [&](std::shared_ptr executor) { + return generate_executable(executor, mobilenet_construct_func, mobilenet_weight_file, + mobilenet_input_files, mobilenet_input_bytes); + }; + std::vector, + std::shared_ptr>> nets; + for (size_t i = 0; i < excutor_cnt; i++) { + if(i % 3 == 0) { + //lenet + resnet + nets.push_back(lenet(executors[i])); + executors[i]->Submit(nets.back().first, nets.back().first); + nets.push_back(resnet(executors[i])); + executors[i]->Submit(nets.back().first, nets.back().first); + } + if(i % 3 == 1) { + //resnet + mobilenet + nets.push_back(resnet(executors[i])); + executors[i]->Submit(nets.back().first, nets.back().first); + nets.push_back(mobilenet(executors[i])); + executors[i]->Submit(nets.back().first, nets.back().first); + } + if(i % 3 == 2) { + //lenet + mobilenet + nets.push_back(mobilenet(executors[i])); + executors[i]->Submit(nets.back().first, nets.back().first); + nets.push_back(lenet(executors[i])); + executors[i]->Submit(nets.back().first, nets.back().first); + } + } + std::vector threads; + for(auto executor:executors) { + threads.push_back(std::thread(executor_trigger, executor)); + } + for(std::thread &t : threads) { + t.join(); + } - std::tie(mobilenet_1, mobilenet_1_outhandle) = generate_executable(executor1, mobilenet_construct_func, mobilenet_weight_file, mobilenet_input_files, mobilenet_input_bytes); - std::tie(resnet50_1, resnet50_1_outhandle) = generate_executable(executor1, resnet50_construct_func, resnet50_weight_file, resnet50_input_files, resnet50_input_bytes); - auto executable_set1 = tim::vx::platform::CreateExecutableSet({mobilenet_1, 
resnet50_1}); - executor1->Submit(executable_set1, executable_set1); - - std::tie(lenet_2, lenet_2_outhandle) = generate_executable(executor2, lenet_construct_func, lenet_weight_file, lenet_input_files, lenet_input_bytes); - std::tie(mobilenet_2, mobilenet_2_outhandle) = generate_executable(executor2, mobilenet_construct_func, mobilenet_weight_file, mobilenet_input_files, mobilenet_input_bytes); - auto executable_set2 = tim::vx::platform::CreateExecutableSet({lenet_2, mobilenet_2}); - executor2->Submit(executable_set2, executable_set2); - - std::tie(lenet_3, lenet_3_outhandle) = generate_executable(executor3, lenet_construct_func, lenet_weight_file, lenet_input_files, lenet_input_bytes); - std::tie(mobilenet_3, mobilenet_3_outhandle) = generate_executable(executor3, mobilenet_construct_func, mobilenet_weight_file, mobilenet_input_files, mobilenet_input_bytes); - auto executable_set3 = tim::vx::platform::CreateExecutableSet({lenet_3, mobilenet_3}); - executor3->Submit(executable_set3, executable_set3); - - std::thread t0(executor_trigger, executor0); - std::thread t1(executor_trigger, executor1); - std::thread t2(executor_trigger, executor2); - std::thread t3(executor_trigger, executor3); - t0.join(); - t1.join(); - t2.join(); - t3.join(); - - print_topN(1 * 10, lenet_0_outhandle); - print_topN(1 * 10, lenet_2_outhandle); - print_topN(1 * 10, lenet_3_outhandle); - print_topN(1 * 1001, mobilenet_1_outhandle); - print_topN(1 * 1001, mobilenet_2_outhandle); - print_topN(1 * 1001, mobilenet_3_outhandle); - print_topN(1 * 1000, resnet50_0_outhandle); - print_topN(1 * 1000, resnet50_1_outhandle); +for (auto net : nets) { + auto size = net.second->GetSpec().GetElementNum(); + print_topN(size, net.second); +} return 0; } diff --git a/samples/multi_device/multi_device_demo.cc b/samples/multi_device/multi_device_demo.cc index dd20c3c..369569b 100644 --- a/samples/multi_device/multi_device_demo.cc +++ b/samples/multi_device/multi_device_demo.cc @@ -29,7 +29,7 @@ #include 
"tim/vx/graph.h" #include "tim/vx/operation.h" #include "tim/vx/tensor.h" -#include "tim/vx/platform/native.h" +#include "tim/vx/platform/platform.h" static void printTopN() { } @@ -46,9 +46,9 @@ int demo(int argc, char** argv) { tim::vx::TensorSpec g0_input0, g0_output0, g1_output0, g2_output0, g3_output0, g4_output0, g5_output0; // query device and get executor of devcie - auto devices = tim::vx::platform::NativeDevice::Enumerate(); + auto devices = tim::vx::platform::IDevice::Enumerate(); auto device = devices[0]; - std::shared_ptr executor = std::make_shared (device); + auto executor = device->CreateExecutor(0,-1, context); // executable0 auto executable0 = executor->Compile(g0); // compile to nbg @@ -89,33 +89,6 @@ int demo(int argc, char** argv) { // trigger executor->Trigger(); // run all submitted executables - /* 2. another way to run */ - // executable_set0 - std::vector> executables0; - executables0.push_back(executable0); - auto executable_set0 = CreateExecutableSet(executables0); - // executable_set1 - std::vector> executables1; - executables1.push_back(executable1); - executables1.push_back(executable3); - auto executable_set1 = CreateExecutableSet(executables1); - // executable_set2 - std::vector> executables2; - executables2.push_back(executable2); - executables2.push_back(executable4); - auto executable_set2 = CreateExecutableSet(executables2); - // executable_set3 - std::vector> executables3; - executables3.push_back(executable5); - auto executable_set3 = CreateExecutableSet(executables3); - // submit executaleSets - executable_set0->Submit(executable_set0); - executable_set1->Submit(executable_set0); - executable_set2->Submit(executable_set1); - executable_set3->Submit(executable_set2); - // trigger - executor->Trigger(); // run all submitted executableSets - printTopN(); return 0; diff --git a/samples/multi_device/vx_resnet50.cc b/samples/multi_device/vx_resnet50.cc index 7011e3e..a15480e 100644 --- a/samples/multi_device/vx_resnet50.cc +++ 
b/samples/multi_device/vx_resnet50.cc @@ -1296,7 +1296,7 @@ void resnet50::construct_graph auto input_0 = graph->CreateTensor(input_0_spec); tim::vx::ShapeType output_229_shape({1000,1}); - tim::vx::TensorSpec output_229_spec(tim::vx::DataType::FLOAT16, output_229_shape, + tim::vx::TensorSpec output_229_spec(tim::vx::DataType::FLOAT32, output_229_shape, tim::vx::TensorAttribute::OUTPUT); auto output_229 = graph->CreateTensor(output_229_spec); diff --git a/samples/platform_sample/CMakeLists.txt b/samples/platform_sample/CMakeLists.txt new file mode 100644 index 0000000..1b91e85 --- /dev/null +++ b/samples/platform_sample/CMakeLists.txt @@ -0,0 +1,13 @@ +message("samples/platform_sample") + +set(TARGET_NAME "platform_sample") + +add_executable(${TARGET_NAME} ${CMAKE_CURRENT_SOURCE_DIR}/platform_sample.cc) + +target_link_libraries(${TARGET_NAME} PRIVATE -Wl,--whole-archive tim-vx) +target_include_directories(${TARGET_NAME} PRIVATE + ${CMAKE_CURRENT_SOURCE_DIR} + ${PROJECT_SOURCE_DIR}/include) + +install(TARGETS ${TARGET_NAME} ${TARGET_NAME} + DESTINATION ${CMAKE_INSTALL_PREFIX}/${CMAKE_INSTALL_BINDIR}) diff --git a/samples/platform_sample/README b/samples/platform_sample/README new file mode 100644 index 0000000..f1df921 --- /dev/null +++ b/samples/platform_sample/README @@ -0,0 +1,25 @@ +## brief +A sample that demonstrates basic usage of the platform API. + +## note +Please note that if you have enabled the lite platform, a dedicated VIVANTE_SDK(NO_KERNEL) is required as the compiler for NBG. +The driver for the NPU is the VIPLITE driver. + +## requirements +Vivante SDK >= 6.4.22 +ovxlib >= 1.2.26 +viplite >= 2.0.0 + +## build +cd build +cmake .. 
-DCMAKE_BUILD_TYPE=Release -DTIM_VX_USE_EXTERNAL_OVXLIB=ON -DEXTERNAL_VIV_SDK=${VIVANTE_NOKERNEL_SDK_DIR} -DOVXLIB_INC=${OVXLIB_DIR}/include \ + -DOVXLIB_LIB=${VIVANTE_NOKERNEL_SDK_DIR}/drivers/libovxlib.so -DTIM_VX_BUILD_EXAMPLES=ON -DTIM_VX_ENABLE_PLATFORM=ON \ + -DTIM_VX_ENABLE_PLATFORM_LITE=ON -DVIP_LITE_SDK=${VIP_LITE_SDK} + +## environment +# Export VIV_GPU_FILE to specify the NPU hardware configuration file for the NBG compiler +export VIV_GPU_FILE="/path/to/VIP9000NANOQ_PLUS_PID0X100000XX.config" + +## run +cd build +./samples/platform_sample/platform_sample diff --git a/samples/lite_multi_device/lite_multi_device.cc b/samples/platform_sample/platform_sample.cc similarity index 87% rename from samples/lite_multi_device/lite_multi_device.cc rename to samples/platform_sample/platform_sample.cc index 51aec07..9506a9e 100644 --- a/samples/lite_multi_device/lite_multi_device.cc +++ b/samples/platform_sample/platform_sample.cc @@ -26,8 +26,8 @@ #include "tim/vx/graph.h" #include "tim/vx/ops.h" #include "tim/vx/types.h" -#include "tim/vx/platform/native.h" -#include "tim/vx/platform/lite/lite_native.h" +#include "tim/vx/platform/platform.h" + int main() { //construct tim-vx graph @@ -49,9 +49,15 @@ int main() { std::vector data_vec_i0({1, 2, 3, 4}); std::vector data_vec_i1({4, 3, 2, 1}); - auto devices = tim::vx::platform::NativeDevice::Enumerate(); + auto devices = tim::vx::platform::IDevice::Enumerate(); + + std::cout << "NPU device count: " << devices.size() <(device); + //run 1 core in device 0 + std::cout << "NPU device[0] has " << device->CoreCount() << "cores" <CreateExecutor(use_core_count); + auto executable = executor->Compile(graph); auto input0_handle = executable->AllocateTensor(input_spec); auto input1_handle = executable->AllocateTensor(input_spec); @@ -73,6 +79,10 @@ int main() { //each output value should be "5" in this demo for (int i = 0; i < 4; ++i) { std::cout << "output value: " << data[i] << std::endl; + if(data[i] != 5) { + std::cout << 
"test failed" << std::endl; + break; + } } free(data); return 0; diff --git a/src/tim/CMakeLists.txt b/src/tim/CMakeLists.txt index 8b38b33..f8f36c5 100644 --- a/src/tim/CMakeLists.txt +++ b/src/tim/CMakeLists.txt @@ -61,8 +61,10 @@ if(TIM_VX_ENABLE_PLATFORM) endif() list(APPEND LITE_EXTERNAL_LIBS ${VIP_LITE_SDK}/drivers/libNBGlinker.so - ${VIP_LITE_SDK}/drivers/libVIPlite.so) - list(APPEND LITE_INC_DIRS ${VIP_LITE_SDK}/include) + ${VIP_LITE_SDK}/drivers/libVIPhal.so) + list(APPEND LITE_INC_DIRS + ${VIP_LITE_SDK}/include + ${VIP_LITE_SDK}/include/nbg_linker) endif() if(TIM_VX_ENABLE_GRPC) diff --git a/src/tim/vx/platform/lite/lite_native.cc b/src/tim/vx/platform/lite/lite_native.cc index 6ab557c..21a45af 100644 --- a/src/tim/vx/platform/lite/lite_native.cc +++ b/src/tim/vx/platform/lite/lite_native.cc @@ -22,36 +22,202 @@ * *****************************************************************************/ #include "tim/vx/platform/lite/lite_native.h" +#include "lite_native_private.h" #include - #include "tim/vx/graph.h" #include "graph_private.h" -#include "vsi_nn_pub.h" +#include "context_private.h" namespace tim { namespace vx { namespace platform { -LiteNativeExecutor::LiteNativeExecutor(const std::shared_ptr& device) { + + LiteNetwork::LiteNetwork(vip_create_network_param_t& param) { + vip_create_network(&param, sizeof(param), &network_); + } + vip_status_e LiteNetwork::Query(vip_enum property, void* value) { + return vip_query_network(network_, property, value); + } + vip_status_e LiteNetwork::Set(vip_enum property, void* value) { + return vip_set_network(network_, property, value); + } + vip_status_e LiteNetwork::Prepare() { + return vip_prepare_network(network_); + } + vip_status_e LiteNetwork::Run() {return vip_run_network(network_);} + + vip_status_e LiteNetwork::Trigger() {return vip_trigger_network(network_);} + + vip_status_e LiteNetwork::Wait() {return vip_wait_network(network_);} + + vip_status_e LiteNetwork::Cancel() {return vip_cancel_network(network_);} + 
+ vip_status_e LiteNetwork::QueryInput(vip_uint32_t index, vip_enum property, void* value) { + return vip_query_input(network_, index, property,value); + } + + vip_status_e LiteNetwork::QueryOutput(vip_uint32_t index, vip_enum property, void* value) { + return vip_query_output(network_, index, property, value); + } + + vip_status_e LiteNetwork::SetInput(vip_uint32_t index, std::shared_ptr input) { + vip_buffer buffer = + std::dynamic_pointer_cast(input)->GetBuffer(); + return vip_set_input(network_, index, buffer); + } + + vip_status_e LiteNetwork::SetOutput(vip_uint32_t index, std::shared_ptr output) { + vip_buffer buffer = + std::dynamic_pointer_cast(output)->GetBuffer(); + return vip_set_output(network_, index, buffer); + } + + LiteNetwork::~LiteNetwork(){ + vip_finish_network(network_); + vip_destroy_network(network_); + } + +bool LiteNativeDevice::vip_initialized = false; + +LiteNativeDeviceImpl::LiteNativeDeviceImpl(device_id_t id,uint32_t core_count) { + device_id_ = id; + core_count_ = core_count; + } + +bool LiteNativeDeviceImpl::Submit(const std::shared_ptr& graph) { + (void)graph; + return true; +} + +bool LiteNativeDeviceImpl::Trigger(bool async, async_callback cb) { + (void)async; + (void)cb; + return true; +} +void LiteNativeDeviceImpl::WaitDeviceIdle() {} + +bool LiteNativeDeviceImpl::DeviceExit() {return false;} + +std::shared_ptr LiteNativeDeviceImpl::CreateExecutor(const int32_t core_index, + const int32_t core_count, + const std::shared_ptr& context) { + std::shared_ptr this_sp = shared_from_this(); + auto executor = std::make_shared(this_sp, core_count, core_index, context); + return executor; +} + +std::vector> LiteNativeDevice::Enumerate() { + std::vector> device_v; + device_id_t deviceCount = 0; + std::vector core_count; + uint32_t version = 0; + if( !LiteNativeDevice::vip_initialized ) { + vip_status_e status = vip_init(); + if(status != VIP_SUCCESS) { + VSILOGE("Initialize viplite driver fail"); + return device_v; + } + 
LiteNativeDevice::vip_initialized = true; + } + version = vip_get_version(); + if (version >= 0x00010601 ) { + vip_query_hardware(VIP_QUERY_HW_PROP_DEVICE_COUNT, sizeof(uint32_t), &deviceCount); + core_count.resize(deviceCount); + vip_query_hardware(VIP_QUERY_HW_PROP_CORE_COUNT_EACH_DEVICE, + sizeof(uint32_t) * core_count.size(), core_count.data()); + } + + for (device_id_t i = 0; i < deviceCount; i++) { + auto local_device = std::make_shared(i, core_count.at(i)); + device_v.push_back(local_device); + } + return device_v; +} + +int LiteNativeExecutorImpl::executor_count = 0; + +LiteNativeExecutorImpl::LiteNativeExecutorImpl(const std::shared_ptr& device, + const int32_t core_count, const int32_t core_index, const std::shared_ptr& context) + { device_ = device; - context_ = Context::Create(); - database_ = VIP_NULL; + context_ = context; + if(context_ == nullptr) { + context_ = tim::vx::Context::Create(); + } + auto fixed_core_count = core_count; + int32_t fixed_core_index = core_index; + vip_status_e status = VIP_SUCCESS; + if( !LiteNativeDevice::vip_initialized ) { + status = vip_init(); + if(status != VIP_SUCCESS){ + throw "Initialize viplite driver fail"; + } + } + int32_t total_core_count = (int32_t)device->CoreCount(); + if (fixed_core_index < 0) + { + fixed_core_index = 0; + } + if (fixed_core_index > total_core_count - 1){ + throw "Core index is larger than total core count."; + } + if (fixed_core_count <= 0 ) { + fixed_core_count = total_core_count - fixed_core_index; + } - vip_init(); - vip_query_database(&database_); - nbg_linker_init(database_); + if (fixed_core_index + fixed_core_count > total_core_count) { + fixed_core_count = total_core_count - fixed_core_index; + VSILOGW( + "Core_index + core_count is larger than total core count. 
Fix core " + "count to %d", + fixed_core_count); + } + core_index_ = (uint32_t)fixed_core_index; + core_count_ = (uint32_t)fixed_core_count; + +#ifdef VSI_DEVICE_SUPPORT + vsi_nn_device_t vsi_devices[VSI_MAX_DEVICES] = {0}; + vsi_size_t num_devices = 0; + vsi_size_t available_core_count = 0; + auto ctx = dynamic_cast(context_.get()); + vsi_nn_GetDevices(ctx->context(), vsi_devices, &num_devices); + + //Always use device 0 to compile NBG. + vsi_nn_GetDeviceCoreCount(vsi_devices[0], &available_core_count); + + if(core_index_ + core_count_ > (uint32_t)available_core_count) { + VSILOGE("the used core count is larger than compiler available core count"); + assert(false); + } + vsi_nn_CreateSubDevice(vsi_devices[0], core_index_, core_count_, &sub_device_); +#else + VSILOGE("device is not supported!"); + assert(false); +#endif + + executor_count++; } -LiteNativeExecutor::~LiteNativeExecutor() { - nbg_destroy_task(task_descriptor_); - nbg_linker_destroy(); - vip_destroy(); +LiteNativeExecutorImpl::~LiteNativeExecutorImpl() { +#ifdef VSI_DEVICE_SUPPORT + if(sub_device_) + vsi_nn_ReleaseDevice(&sub_device_); +#endif + executor_count--; + if(executor_count <1) + vip_destroy(); } -bool LiteNativeExecutor::Submit(const std::shared_ptr& executable, +bool LiteNativeExecutorImpl::Submit(const std::shared_ptr& executable, const std::shared_ptr& ref, bool after) { bool success = false; + success = executable->Verify(); + if (success == false) { + VSILOGE("Executable NBG compile failed"); + return false; + } if (executable == ref) { tasks_.push_back(executable); return true; @@ -72,239 +238,285 @@ bool LiteNativeExecutor::Submit(const std::shared_ptr& executable, return success; } -bool LiteNativeExecutor::Trigger(bool async) { +bool LiteNativeExecutorImpl::Trigger(bool async) { (void)async; - vip_status_e status = VIP_SUCCESS; - std::vector networks; - for (auto exe : tasks_) { - auto task = exe.lock(); - task->Verify(); - vip_network& network = - 
std::dynamic_pointer_cast(task)->network_; - networks.push_back(std::move(network)); - } - status = nbg_create_task(networks.size(), networks.data(), &task_descriptor_); - if (status != VIP_SUCCESS) { - VSILOGE("create task descriptor fail"); - return false; - } - status = vip_trigger_task(task_descriptor_); - if (status != VIP_SUCCESS) { - VSILOGE("trigger task descriptor fail"); - return false; - } - status = vip_wait_task(task_descriptor_); - if (status != VIP_SUCCESS) { - VSILOGE("wait task descriptor fail"); - // nbg_gen_capture(networks.size(), networks.data()); - return false; + while (!tasks_.empty()) { + auto task = tasks_.front(); + tasks_.erase(tasks_.begin()); + auto task_tmp = task.lock(); + if (!task_tmp) { + VSILOGE("Task is empty"); + return false; + } + task_tmp->Trigger(); } return true; } -std::shared_ptr LiteNativeExecutor::Compile( +std::shared_ptr LiteNativeExecutorImpl::Compile( const std::shared_ptr& graph) { - GraphImpl* graphimp = dynamic_cast(graph.get()); - IDevice::device_id_t id = device_->Id(); - vxSetGraphAttribute(graphimp->graph()->g, VX_GRAPH_DEVICE_INDEX_VIV, - (void*)(&id), sizeof(id)); size_t bin_size = -1; - graph->CompileToBinary(nullptr, &bin_size); std::vector nb_buf; +#ifdef VSI_DEVICE_SUPPORT + GraphImpl* graphimp = dynamic_cast(graph.get()); + vsi_nn_BindDevices(graphimp->graph(), 1, &sub_device_); +#endif + auto ret = graph->CompileToBinary(nullptr, &bin_size); nb_buf.resize(bin_size); - graph->CompileToBinary(nb_buf.data(), &bin_size); - return std::make_shared(shared_from_this(), nb_buf); + ret |= graph->CompileToBinary(nb_buf.data(), &bin_size); + if(!ret) { + VSILOGE("Compile fail"); + return nullptr; + } + + std::shared_ptr this_sp = shared_from_this(); + auto executable = std::make_shared(this_sp, nb_buf); + return executable; } -LiteNativeExecutable::LiteNativeExecutable( +LiteNativeExecutableImpl::LiteNativeExecutableImpl( const std::shared_ptr& executor, const std::vector& nb_buf) { executor_ = executor; - 
context_ = executor->Contex(); - nb_graph_ = context_->CreateGraph(); - nbg_create_network(nb_buf.data(), nb_buf.size(), - VIP_CREATE_NETWORK_FROM_MEMORY, &network_); - input_count_ = 0; - output_count_ = 0; - coeff_ = nullptr; - command_ = nullptr; - memory_pool_ = nullptr; - others_ = nullptr; - pre_command_ = nullptr; + context_ = nullptr; + nb_graph_ = nullptr; + vip_status_e status = VIP_SUCCESS; + vip_create_network_param_t net_param; + device_id_ = executor_.lock()->Device()->Id(); + auto core_index = executor_.lock()->CoreIndex(); + net_param.device_index = device_id_; + net_param.prop = VIP_NET_CREATE_PROP_FROM_NBG; + net_param.nbg.type = VIP_NET_CREATE_NBG_FROM_MEMORY; + net_param.nbg.memory.nbg_memory = (void*)nb_buf.data(); + net_param.nbg.memory.nbg_size = nb_buf.size(); - /* prepare vip network */ - vip_status_e status = VIP_SUCCESS; - nbg_network_memory_size_t buffer_size; - nbg_network_memory_buffer_t buffer; - vip_memory_t coeff_buffer; - vip_memory_t cmd_buffer; - vip_memory_t pre_cmd_buffer; - vip_memory_t pool_buffer; - vip_memory_t others_buffer; - nbg_query_network(network_, VIP_NETWORK_PROP_MEMORY_SIZE, &buffer_size); + auto network(std::make_unique(net_param)); - vip_allocate_videomemory(buffer_size.coeff, &coeff_); - vip_allocate_videomemory(buffer_size.command, &command_); - vip_allocate_videomemory(buffer_size.memory_pool, &memory_pool_); - vip_allocate_videomemory(buffer_size.others, &others_); - vip_allocate_videomemory(buffer_size.pre_command, &pre_command_); - - SetBuffer(&coeff_buffer, coeff_); - SetBuffer(&cmd_buffer, command_); - SetBuffer(&pre_cmd_buffer, pre_command_); - SetBuffer(&pool_buffer, memory_pool_); - SetBuffer(&others_buffer, others_); - - buffer.coeff = &coeff_buffer; - buffer.command = &cmd_buffer; - buffer.memory_pool = &pool_buffer; - buffer.others = &others_buffer; - buffer.pre_command = &pre_cmd_buffer; - buffer.dma_command = nullptr; - status = nbg_prepare_network(network_, &buffer); - - 
vip_flush_videomemory(coeff_, VIP_BUFFER_OPER_TYPE_FLUSH); - vip_flush_videomemory(command_, VIP_BUFFER_OPER_TYPE_FLUSH); - vip_flush_videomemory(pre_command_, VIP_BUFFER_OPER_TYPE_FLUSH); - vip_flush_videomemory(memory_pool_, VIP_BUFFER_OPER_TYPE_FLUSH); - vip_flush_videomemory(others_, VIP_BUFFER_OPER_TYPE_FLUSH); + lite_network_ = std::move(network); + status = lite_network_->Query(VIP_NETWORK_PROP_INPUT_COUNT,&input_count_); + if (status != VIP_SUCCESS) { + VSILOGE("failed to query network inputs"); + assert(false); + } + status = lite_network_->Query(VIP_NETWORK_PROP_OUTPUT_COUNT,&output_count_); + if (status != VIP_SUCCESS) { + VSILOGE("failed to query network outputs"); + assert(false); + } + status = lite_network_->Set(VIP_NETWORK_PROP_SET_CORE_INDEX,&core_index); + if (status != VIP_SUCCESS) { + VSILOGE("failed to set core index"); + assert(false); + } + status = lite_network_->Prepare(); if (status != VIP_SUCCESS) { VSILOGE("failed to prepare network"); assert(false); } } -LiteNativeExecutable::~LiteNativeExecutable() { - nbg_finish_network(network_); - nbg_destroy_network(network_); - if (coeff_) { - vip_free_videomemory(coeff_); - coeff_ = nullptr; - } - if (command_) { - vip_free_videomemory(command_); - command_ = nullptr; - } - if (memory_pool_) { - vip_free_videomemory(memory_pool_); - memory_pool_ = nullptr; - } - if (others_) { - vip_free_videomemory(others_); - others_ = nullptr; - } - if (pre_command_) { - vip_free_videomemory(pre_command_); - pre_command_ = nullptr; - } -} - -void LiteNativeExecutable::SetInput(const std::shared_ptr& th) { +void LiteNativeExecutableImpl::SetInput(const std::shared_ptr& th) { vip_status_e status = VIP_SUCCESS; - gcvip_videomemory_t* mem = - std::dynamic_pointer_cast(th)->tensor_buffer_; - vip_memory_t buffer; - SetBuffer(&buffer, mem); - - status = nbg_set_input(network_, input_count_, &buffer); + int32_t input_index = input_handles_.size(); + status = lite_network_->SetInput(input_index, th); if (status != 
VIP_SUCCESS) { - VSILOGE("failed to set input: %d", input_count_); + VSILOGE("failed to set input: %d", input_index); assert(false); } - ++input_count_; + input_handles_.push_back(th); +} +void LiteNativeExecutableImpl::SetInputs(const std::vector>& ths) { + for (auto th : ths) { + SetInput(th); + } } -void LiteNativeExecutable::SetOutput(const std::shared_ptr& th) { +void LiteNativeExecutableImpl::SetOutput(const std::shared_ptr& th) { vip_status_e status = VIP_SUCCESS; - gcvip_videomemory_t* mem = - std::dynamic_pointer_cast(th)->tensor_buffer_; - vip_memory_t buffer; - SetBuffer(&buffer, mem); - - status = nbg_set_output(network_, output_count_, &buffer); + int32_t output_index = output_handles_.size(); + status = lite_network_->SetOutput(output_index,th); if (status != VIP_SUCCESS) { - VSILOGE("failed to set output: %d", output_count_); + VSILOGE("failed to set output: %d", output_index); assert(false); } - ++output_count_; + output_handles_.push_back(th); } -void LiteNativeExecutable::GetOutput( - const std::vector>& th) { - (void)th; +void LiteNativeExecutableImpl::SetOutputs(const std::vector>& ths) { + for (auto th : ths) { + SetOutput(th); + } } -bool LiteNativeExecutable::Submit(const std::shared_ptr& ref, +bool LiteNativeExecutableImpl::Submit(const std::shared_ptr& ref, bool after) { bool status = false; + std::shared_ptr executor = + std::dynamic_pointer_cast(executor_.lock()); std::shared_ptr executable = shared_from_this(); - status = Executor()->Submit(executable, ref, after); + status = executor->Submit(executable, ref, after); return status; } -bool LiteNativeExecutable::Trigger(bool async) { - (void)async; - return false; -} - -bool LiteNativeExecutable::Verify() { - int32_t input_count = 0; - nbg_query_network(network_, VIP_NETWORK_PROP_INPUT_COUNT, &input_count); - if (input_count != input_count_) { - VSILOGE("input count mismatch, required: %d, provided: %d", input_count, - input_count_); - return false; +bool 
LiteNativeExecutableImpl::Trigger(bool async) { + vip_status_e status = VIP_SUCCESS; + if (async) { + status = lite_network_->Trigger(); + status = lite_network_->Wait(); + if (status != VIP_SUCCESS) { + VSILOGE("trigger network fail"); + return false; + } + } else { + status = lite_network_->Run(); + if (status != VIP_SUCCESS) { + VSILOGE("run network fail"); + return false; + } } - int32_t output_count = 0; - nbg_query_network(network_, VIP_NETWORK_PROP_OUTPUT_COUNT, &output_count); - if (output_count != output_count_) { - VSILOGE("output count mismatch, required: %d, provided: %d", output_count, - output_count_); - return false; - } - return true; } -std::shared_ptr LiteNativeExecutable::AllocateTensor( - const TensorSpec& tensor_spec) { - auto tensor = nb_graph_->CreateTensor(tensor_spec); - return std::make_shared(tensor); +bool LiteNativeExecutableImpl::Verify() { + bool ret = true; + auto output_index = output_handles_.size(); + auto input_index = input_handles_.size(); + if(input_index != input_count_) { + VSILOGE("Network need %d inputs but gaving %d.\n", input_count_, input_index); + ret = false; + } + if(output_index != output_count_) { + VSILOGE("Network need %d outputs but gaving %d.\n", output_count_, output_index); + ret = false; + } + + return ret; } -void LiteNativeExecutable::SetBuffer(vip_memory_t* dst, - gcvip_videomemory_t* src) { - if (dst && src) { - dst->cpu_logical = src->cpu_logical; - dst->npu_physical = src->npu_physical; - dst->size = src->size; +std::shared_ptr LiteNativeExecutableImpl::AllocateTensor(const TensorSpec& tensor_spec, + void* data, uint32_t size) { + return std::make_shared(tensor_spec, data, size, device_id_); +} + +LiteNativeTensorHandleImpl::LiteNativeTensorHandleImpl(const TensorSpec& tensor_spec, void* data, uint32_t size, + uint32_t device_id) { + vip_status_e status = VIP_ERROR_FAILURE; + spec_ = tensor_spec; + uint32_t tensor_size = tensor_spec.GetByteSize(); + vip_buffer_create_params_t tensor_param; + + uint32_t 
block_aligned_size = 64; + memory_type_ = ALLOC_MEM_NONE; + handle_ = nullptr; + handle_size_ = 0; + if(size > 0 && !data && tensor_size > size ) { + VSILOGE("Buffer size is less than the memory size required by the tensor"); + assert(false); + } +#if 0 + uint32_t addr_aligned_size = 256; + if (!data) { + data = vsi_nn_MallocAlignedBuffer(tensor_size,addr_aligned_size,block_aligned_size); + size = ((tensor_size + block_aligned_size - 1) / block_aligned_size) * block_aligned_size; + memory_type_ = ALLOC_MEM_INTERNAL; + } else { + memory_type_ = ALLOC_MEM_EXTERNAL; + } + handle_ = data; + if(!vsi_nn_IsBufferAligned((uint8_t *)handle_, addr_aligned_size)) { + VSILOGE("The starting address of the buffer needs to be 64-byte aligned"); + assert(false); + } + if(size % 64 != 0) { + VSILOGE("The size of the buffer needs to be 64-byte aligned"); + assert(false); + } + handle_size_ = size; + tensor_param.type = VIP_BUFFER_CREATE_FROM_USER_MEM; + tensor_param.device_index = device_id ; + tensor_param.src.from_handle.memory_type = VIP_BUFFER_FROM_USER_MEM_TYPE_HOST; + tensor_param.src.from_handle.logical_addr = handle_; + tensor_param.src.from_handle.size = handle_size_; + status = vip_create_buffer(&tensor_param,sizeof(tensor_param),&tensor_buffer_); +#else + (void)data; + tensor_param.type = VIP_BUFFER_CREATE_ALLOC_MEM; + tensor_param.device_index = device_id ; + tensor_param.src.alloc_mem.size = tensor_size; + tensor_param.src.alloc_mem.align = block_aligned_size; + status = vip_create_buffer(&tensor_param,sizeof(tensor_param),&tensor_buffer_); + memory_type_ = ALLOC_MEM_VIDEOMEM; +#endif + if(status != VIP_SUCCESS) { + if(memory_type_ == ALLOC_MEM_INTERNAL) { + vsi_nn_FreeAlignedBuffer((uint8_t*)handle_); + } + VSILOGE("Fail to create vip buffer."); + assert(false); } } -LiteNativeTensorHandle::LiteNativeTensorHandle( - const std::shared_ptr& tensor) { - tensor_ = tensor; - uint32_t size = tensor->GetSpec().GetByteSize(); - vip_allocate_videomemory(size, &tensor_buffer_); 
-} - -LiteNativeTensorHandle::~LiteNativeTensorHandle() { +LiteNativeTensorHandleImpl::~LiteNativeTensorHandleImpl() { if (tensor_buffer_) { - vip_free_videomemory(tensor_buffer_); + vip_destroy_buffer(tensor_buffer_); tensor_buffer_ = nullptr; } + if(memory_type_ == ALLOC_MEM_INTERNAL && handle_) { + vsi_nn_FreeAlignedBuffer((uint8_t*)handle_); + + } } -bool LiteNativeTensorHandle::CopyDataToTensor(const void* data, - uint32_t size_in_bytes) { - memcpy(tensor_buffer_->cpu_logical, data, size_in_bytes); +bool LiteNativeTensorHandleImpl::CopyDataToTensor(const void* data, + uint32_t size_in_bytes) { + void* handle = handle_; + if(memory_type_ == ALLOC_MEM_VIDEOMEM) { + handle = vip_map_buffer(tensor_buffer_); + } + auto buff_size = vip_get_buffer_size(tensor_buffer_); + memcpy(handle, data, buff_size > size_in_bytes ? size_in_bytes : buff_size); + if(memory_type_ == ALLOC_MEM_VIDEOMEM) { + vip_unmap_buffer(tensor_buffer_); + } + Flush(); return true; } -bool LiteNativeTensorHandle::CopyDataFromTensor(void* data) { - memcpy(data, tensor_buffer_->cpu_logical, tensor_buffer_->size); - return true; +bool LiteNativeTensorHandleImpl::CopyDataFromTensor(void* data) { + bool ret = Invalidate(); + if(ret) { + void* handle = handle_; + auto buff_size = vip_get_buffer_size(tensor_buffer_); + if(memory_type_ == ALLOC_MEM_VIDEOMEM) { + handle = vip_map_buffer(tensor_buffer_); + } + memcpy(data, handle, buff_size); + if(memory_type_ == ALLOC_MEM_VIDEOMEM) { + vip_unmap_buffer(tensor_buffer_); + } + } + + return ret; +} + +bool LiteNativeTensorHandleImpl::Flush() { + vip_status_e status = vip_flush_buffer(tensor_buffer_,VIP_BUFFER_OPER_TYPE_FLUSH); + if (status != VIP_SUCCESS) { + return false; + } + else{ + return true; + } +} +bool LiteNativeTensorHandleImpl::Invalidate() { + vip_status_e status = vip_flush_buffer(tensor_buffer_,VIP_BUFFER_OPER_TYPE_INVALIDATE); + if (status != VIP_SUCCESS) { + return false; + } + else{ + return true; + } } } // namespace platform diff --git 
a/src/tim/vx/platform/lite/lite_native_private.h b/src/tim/vx/platform/lite/lite_native_private.h new file mode 100644 index 0000000..b371945 --- /dev/null +++ b/src/tim/vx/platform/lite/lite_native_private.h @@ -0,0 +1,147 @@ +/**************************************************************************** +* +* Copyright (c) 2020-2023 Vivante Corporation +* +* Permission is hereby granted, free of charge, to any person obtaining a +* copy of this software and associated documentation files (the "Software"), +* to deal in the Software without restriction, including without limitation +* the rights to use, copy, modify, merge, publish, distribute, sublicense, +* and/or sell copies of the Software, and to permit persons to whom the +* Software is furnished to do so, subject to the following conditions: +* +* The above copyright notice and this permission notice shall be included in +* all copies or substantial portions of the Software. +* +* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +* AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING +* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER +* DEALINGS IN THE SOFTWARE. 
+* +*****************************************************************************/ +#ifndef TIM_VX_LITE_NATIVE_DEVICE_PRIVATE_H_ +#define TIM_VX_LITE_NATIVE_DEVICE_PRIVATE_H_ + +#include "tim/vx/platform/lite/lite_native.h" +#include "vip_lite.h" +#include "vsi_nn_pub.h" + + +namespace tim { +namespace vx { + +namespace platform { + +class LiteNetwork +{ +public: + LiteNetwork(vip_create_network_param_t& param); + ~LiteNetwork(); + vip_status_e Query(vip_enum property, void* value); + vip_status_e Set(vip_enum property, void* value); + vip_status_e Prepare(); + vip_status_e Run(); + vip_status_e Trigger(); + vip_status_e Wait(); + vip_status_e Cancel(); + vip_status_e QueryInput(vip_uint32_t index, vip_enum property, void* value); + vip_status_e QueryOutput(vip_uint32_t index, vip_enum property, void* value); + vip_status_e SetInput(vip_uint32_t index, std::shared_ptr input); + vip_status_e SetOutput(vip_uint32_t index, std::shared_ptr output); + +private: + vip_network network_; +}; + +class LiteNativeDeviceImpl : public LiteNativeDevice, + public std::enable_shared_from_this { + public: + LiteNativeDeviceImpl(device_id_t id,uint32_t core_count); + ~LiteNativeDeviceImpl() {}; + + bool Submit(const std::shared_ptr& graph) override; + bool Trigger(bool async = false, async_callback cb = NULL) override; + bool DeviceExit() override; + void WaitDeviceIdle() override; + std::shared_ptr CreateExecutor(const int32_t core_index = 0, + const int32_t core_count = -1, + const std::shared_ptr& context = nullptr) override; +}; + +class LiteNativeExecutorImpl + : public LiteNativeExecutor, + public std::enable_shared_from_this { + public: + LiteNativeExecutorImpl(const std::shared_ptr& device, + const int32_t core_index = 0, + const int32_t core_count = -1, + const std::shared_ptr& context = nullptr); + virtual ~LiteNativeExecutorImpl(); + bool Submit(const std::shared_ptr& executable, + const std::shared_ptr& ref, + bool after = true) override; + bool Trigger(bool async = 
false) override; + std::shared_ptr Compile(const std::shared_ptr& graph) override; + static int executor_count; + +private: +#ifdef VSI_DEVICE_SUPPORT + vsi_nn_device_t sub_device_; +#endif +}; + +class LiteNativeExecutableImpl : public LiteNativeExecutable { + public: + LiteNativeExecutableImpl(const std::shared_ptr& executor, + const std::vector& nb_buf); + virtual ~LiteNativeExecutableImpl() {}; + void SetInput(const std::shared_ptr& th) override; + void SetOutput(const std::shared_ptr& th) override; + void SetInputs(const std::vector>& ths) override; + void SetOutputs(const std::vector>& ths) override; + bool Submit(const std::shared_ptr& ref, bool after) override; + bool Trigger(bool async) override; + bool Verify() override; + std::shared_ptr AllocateTensor(const TensorSpec& tensor_spec, + void* data = nullptr, uint32_t size = 0) override; + + private: + uint32_t device_id_; + uint32_t input_count_; + uint32_t output_count_; + std::unique_ptr lite_network_; +}; + +class LiteNativeTensorHandleImpl : public LiteNativeTensorHandle { + public: + typedef enum { + ALLOC_MEM_NONE, + ALLOC_MEM_EXTERNAL, + ALLOC_MEM_INTERNAL, + ALLOC_MEM_VIDEOMEM, + ALLOC_MEM_PHYSICAL, + ALLOC_MEM_FD, + } alloc_mem_type; + + LiteNativeTensorHandleImpl(const TensorSpec& tensor_spec,void* data, uint32_t size,uint32_t device_id); + virtual ~LiteNativeTensorHandleImpl(); + bool CopyDataToTensor(const void* data, uint32_t size_in_bytes) override; + bool CopyDataFromTensor(void* data) override; + bool Flush(); + bool Invalidate(); + vip_buffer GetBuffer() {return tensor_buffer_;}; + +private: + vip_buffer tensor_buffer_; + void* handle_; + uint32_t handle_size_; + alloc_mem_type memory_type_; +}; + +} // namespace platform +} // namespace vx +} // namespace tim + +#endif /* TIM_VX_NATIVE_DEVICE_PRIVATE_H_*/ diff --git a/src/tim/vx/platform/native.cc b/src/tim/vx/platform/native.cc index 81c9c3a..038f38d 100644 --- a/src/tim/vx/platform/native.cc +++ b/src/tim/vx/platform/native.cc @@ -22,9 
+22,14 @@ * *****************************************************************************/ #include "tim/vx/platform/native.h" -#include "native_device_private.h" +#include "native_private.h" +#include "context_private.h" #include "tim/vx/ops/nbg.h" +#ifdef ENABLE_PLATFORM_LITE +#include "tim/vx/platform/lite/lite_native.h" +#endif +#include namespace tim { namespace vx { namespace platform { @@ -35,215 +40,203 @@ std::shared_ptr Compile( return executor->Compile(graph); } -std::shared_ptr CreateExecutableSet( - const std::vector>& executables) { - ExecutableSet* executable_set = new ExecutableSet(executables); - std::shared_ptr executable(executable_set); - return executable; +NativeDeviceImpl::NativeDeviceImpl(device_id_t id, uint32_t core_count) { + device_id_ = id; + core_count_ = core_count; +} +std::vector> IDevice::Enumerate() { +#ifdef ENABLE_PLATFORM_LITE + auto devices = tim::vx::platform::LiteNativeDevice::Enumerate(); +#else + auto devices = tim::vx::platform::NativeDevice::Enumerate(); +#endif + return devices; } - -IDevice::device_id_t IDevice::Id() const { return device_id_; } void IDevice::RemoteReset() {} -NativeDeviceImpl::NativeDeviceImpl(device_id_t id) { - vip_device_ = std::make_unique(id); - device_id_ = id; -} - bool NativeDeviceImpl::Submit(const std::shared_ptr& graph) { - GraphImpl* graphimp = - dynamic_cast(graph.get()); // hack to downcast - vsi_graph_v_.push_back(graphimp->graph()); + (void)graph; return true; } bool NativeDeviceImpl::Trigger(bool async, async_callback cb) { - // extract graph from tasks (void)async; - bool status = false; - while (!vsi_graph_v_.empty()) { - auto task = vsi_graph_v_.front(); - vsi_graph_v_.erase(vsi_graph_v_.begin()); - status = vip_device_->GraphSubmit(task, cb, NULL); - } - return status; + (void)cb; + return true; } -void NativeDeviceImpl::WaitDeviceIdle() { vip_device_->WaitThreadIdle(); } +void NativeDeviceImpl::WaitDeviceIdle() {} -bool NativeDeviceImpl::DeviceExit() { return 
vip_device_->ThreadExit(); } +bool NativeDeviceImpl::DeviceExit() { return true; } + +std::shared_ptr NativeDeviceImpl::CreateExecutor(const int32_t core_index, + const int32_t core_count, + const std::shared_ptr& context) { + std::shared_ptr this_sp = shared_from_this(); + auto executor = std::make_shared(this_sp, core_count,core_index,context); + return executor; +} std::vector> NativeDevice::Enumerate() { std::vector> device_v; - device_id_t deviceCount = 0; - vsi_nn_context_t context; - context = vsi_nn_CreateContext(); + vsi_nn_context_t context = vsi_nn_CreateContext(); + vsi_size_t deviceCount = 0; +#ifdef VSI_DEVICE_SUPPORT + vsi_nn_device_t vsi_devices[VSI_MAX_DEVICES] = {0}; + vsi_status status = VSI_FAILURE; + vsi_size_t deviceCount = 0; + + status = vsi_nn_GetDevices(context,vsi_devices,&deviceCount); + if(status != VSI_SUCCESS){ + VSILOGE("Get device count fail"); + return device_v; + } + + for (vsi_size_t i = 0; i < deviceCount; i++) { + vsi_size_t available_core_count = 0; + vsi_nn_GetDeviceCoreCount(vsi_devices[i],&available_core_count); + auto local_device = std::make_shared(i,available_core_count); + device_v.push_back(local_device); + } +#else vxQueryContext(context->c, VX_CONTEXT_DEVICE_COUNT_VIV, &deviceCount, sizeof(deviceCount)); - std::cout << "Device count = " << deviceCount << std::endl; for (device_id_t i = 0; i < deviceCount; i++) { - IDevice* local_device = new NativeDeviceImpl(i); - std::shared_ptr local_device_sp(local_device); - device_v.push_back(local_device_sp); + auto local_device = std::make_shared(i,0); + device_v.push_back(local_device); } + VSILOGE("VSI device API is not supportted, please upgrade Vivant SDK version >= 6.4.22 && ovxlib >= 1.2.26 !"); +#endif vsi_nn_ReleaseContext(&context); return device_v; } -std::shared_ptr IExecutable::NBGraph() const { return nb_graph_; } - -std::shared_ptr IExecutable::Executor() const { - auto executor = executor_.lock(); - if (!executor) { - std::cout << "Executor unable to lock 
weak_ptr"; - } - return executor; -} - -NativeExecutable::NativeExecutable(const std::shared_ptr& executor, +NativeExecutableImpl::NativeExecutableImpl(const std::shared_ptr& executor, const std::vector& nb_buf, size_t inputs, size_t outputs) { - CompileOption opt; - opt.setDeviceId(executor->Device()->Id()); executor_ = executor; context_ = executor->Contex(); - nb_graph_ = context_->CreateGraph(opt); + nb_graph_ = context_->CreateGraph(); nb_buf_ = nb_buf; nb_node_ = nb_graph_->CreateOperation(nb_buf_.data(), inputs, outputs); } -void NativeExecutable::SetInput(const std::shared_ptr& th) { +void NativeExecutableImpl::SetInput(const std::shared_ptr& th) { nb_node_->BindInput(th->GetTensor()); + input_handles_.push_back(th); } -void NativeExecutable::SetOutput(const std::shared_ptr& th) { +void NativeExecutableImpl::SetInputs(const std::vector>& ths) { + for (auto& t : ths) { + SetInput(t); + } +} + +void NativeExecutableImpl::SetOutput(const std::shared_ptr& th) { nb_node_->BindOutput(th->GetTensor()); + output_handles_.push_back(th); } -void NativeExecutable::GetOutput( - const std::vector>& th) { - (void)th; +void NativeExecutableImpl::SetOutputs(const std::vector>& ths) { + for (auto& t : ths) { + SetOutput(t); + } + } -bool NativeExecutable::Submit(const std::shared_ptr& ref, +bool NativeExecutableImpl::Submit(const std::shared_ptr& ref, bool after) { bool status = false; std::shared_ptr executable = shared_from_this(); - status = Executor()->Submit(executable, ref, after); + std::shared_ptr executor = std::dynamic_pointer_cast(executor_.lock()); + status = executor->Submit(executable, ref, after); return status; } -bool NativeExecutable::Trigger(bool async) { +bool NativeExecutableImpl::Trigger(bool async) { (void)async; - bool status = false; - auto device = Executor()->Device(); - device->Submit(nb_graph_); - status = device->Trigger(); - device->WaitDeviceIdle(); + bool status = nb_graph_->Run(); return status; } -std::shared_ptr 
NativeExecutable::AllocateTensor( - const TensorSpec& tensor_spec) { - auto tensor = nb_graph_->CreateTensor(tensor_spec); - ITensorHandle* tensor_handle = new NativeTensorHandle(tensor); - std::shared_ptr tensor_handle_sp(tensor_handle); - return tensor_handle_sp; +std::shared_ptr NativeExecutableImpl::AllocateTensor(const TensorSpec& tensor_spec, + void* data, uint32_t size) { + (void)size; + auto tensor = nb_graph_->CreateTensor(tensor_spec,data); + return std::make_shared(tensor); } -bool NativeExecutable::Verify() { return nb_graph_->Compile(); } - -ExecutableSet::ExecutableSet( - const std::vector>& executables) { - executables_ = executables; - executor_ = executables[0]->Executor(); -} - -void ExecutableSet::SetInput(const std::shared_ptr& th) { - (void)th; -} - -void ExecutableSet::SetOutput(const std::shared_ptr& th) { - (void)th; -} - -void ExecutableSet::GetOutput( - const std::vector>& th) { - (void)th; -} - -bool ExecutableSet::Submit(const std::shared_ptr& ref, - bool after) { - bool status = false; - std::shared_ptr executable = shared_from_this(); - status = Executor()->Submit(executable, ref, after); - return status; -} - -bool ExecutableSet::Trigger(bool async) { - (void)async; - bool status = false; - auto device = Executor()->Device(); - for (auto executable : executables_) { - device->Submit(executable->NBGraph()); +bool NativeExecutableImpl::Verify() { + std::shared_ptr executor = std::dynamic_pointer_cast(executor_.lock()); + bool success = executor->BindDevices(NBGraph()); + if (success == false) { + VSILOGE("Executable bind device failed"); + return false; } - status = device->Trigger(); - device->WaitDeviceIdle(); - return status; -} - -std::shared_ptr ExecutableSet::AllocateTensor( - const TensorSpec& tensor_spec) { - std::shared_ptr tensor_handle_sp; - (void)tensor_spec; - return tensor_handle_sp; -} - -std::vector> ExecutableSet::Executables() const { - return executables_; -} - -bool ExecutableSet::Verify() { - bool status = false; - 
for (auto executable : executables_) { - status = executable->Verify(); + success = nb_graph_->Compile(); + return success; } - return status; -} -std::shared_ptr IExecutor::Contex() const { return context_; } - -NativeExecutor::NativeExecutor(const std::shared_ptr& device) { - device_ = device; - context_ = Context::Create(); -} - -NativeExecutor::NativeExecutor(const std::shared_ptr& device, +NativeExecutorImpl::NativeExecutorImpl(const std::shared_ptr& device, + const int32_t core_count, + const int32_t core_index, const std::shared_ptr& context) { device_ = device; - context_ = context; + if(!context) { + context_ = Context::Create(); + } else { + context_ = context; + } + auto fixed_core_count = core_count; + int32_t fixed_core_index = core_index; + int32_t total_core_count =(int32_t)device_->CoreCount(); + if (fixed_core_index < 0) { + fixed_core_index = 0; + } + if (fixed_core_index > total_core_count - 1) { + VSILOGE("Core index is larger than total core count"); + assert(false); + } + if (fixed_core_count <= 0 ) { + fixed_core_count = total_core_count - fixed_core_index; + } + + if (fixed_core_index + fixed_core_count > total_core_count) { + fixed_core_count = total_core_count - fixed_core_index; + VSILOGW( + "Core_index + core_count is larger than total core count. 
Fix core count to %d", fixed_core_count); + } + core_index_ = (uint32_t)fixed_core_index; + core_count_ = (uint32_t)fixed_core_count; +#ifdef VSI_DEVICE_SUPPORT + vsi_nn_device_t vsi_devices[VSI_MAX_DEVICES] = {0}; + vsi_size_t num_devices = 0; + auto ctx = dynamic_cast(context_.get()); + vsi_nn_GetDevices(ctx->context(),vsi_devices,&num_devices); + vsi_nn_CreateSubDevice(vsi_devices[device_->Id()],core_index_,core_count_,&sub_devices_); +#endif } -bool NativeExecutor::Submit(const std::shared_ptr& executable, +bool NativeExecutorImpl::Submit(const std::shared_ptr& executable, const std::shared_ptr& ref, bool after) { bool success = false; success = executable->Verify(); - if (success == false) { - std::cout << "Executable NBG compile failed"; + if(success == false) { + VSILOGE("Executable NBG compile failed"); return false; } - if (executable == ref) { + if(executable == ref) { tasks_.push_back(executable); return true; } - for (size_t i = 0; i < tasks_.size(); i++) { - if (tasks_[i].lock() == ref) { - if (after == true) { + for(size_t i = 0; i < tasks_.size(); i++) { + if(tasks_[i].lock() == ref) { + if(after == true) { tasks_.insert(tasks_.begin() + i + 1, executable); success = true; break; @@ -257,59 +250,81 @@ bool NativeExecutor::Submit(const std::shared_ptr& executable, return success; } -bool NativeExecutor::Trigger(bool async) { +bool NativeExecutorImpl::Trigger(bool async) { (void)async; - while (!tasks_.empty()) { + bool ret = false; + while(!tasks_.empty()) { auto task = tasks_.front(); tasks_.erase(tasks_.begin()); - auto task_ = task.lock(); - if (!task_) { - std::cout << "Task unable to lock weak_ptr"; + auto task_tmp = task.lock(); + if(!task_tmp) { + VSILOGE("Task unable to lock weak_ptr"); + return false; } - task_->Trigger(); + ret = task_tmp->Trigger(); } device_->WaitDeviceIdle(); - return true; + return ret; } -std::shared_ptr NativeExecutor::Compile( +std::shared_ptr NativeExecutorImpl::Compile( const std::shared_ptr& graph) { - - 
CompileOption option; - option.setDeviceId(device_->Id()); - graph->SetCompileOption(option); - + bool ret = BindDevices(graph); + if(!ret) { + return nullptr; + } size_t bin_size = -1; - graph->CompileToBinary(nullptr, &bin_size); + ret = graph->CompileToBinary(nullptr, &bin_size); + if(!ret) { + return nullptr; + } std::vector nb_buf; nb_buf.resize(bin_size); size_t inputs = graph->InputsTensor().size(); size_t outputs = graph->OutputsTensor().size(); - graph->CompileToBinary(nb_buf.data(), &bin_size); - std::shared_ptr this_sp = shared_from_this(); - IExecutable* executable = - new NativeExecutable(this_sp, nb_buf, inputs, outputs); - std::shared_ptr executable_sp(executable); - return executable_sp; + ret = graph->CompileToBinary(nb_buf.data(), &bin_size); + if(!ret) { + return nullptr; + } + std::shared_ptr this_sp = shared_from_this(); + auto executable = std::make_shared(this_sp, nb_buf,inputs,outputs); + return executable; } -std::shared_ptr IExecutor::Device() const { return device_; } -std::shared_ptr ITensorHandle::GetTensor() const { return tensor_; } +bool NativeExecutorImpl::BindDevices(const std::shared_ptr& graph){ + vsi_status status = VSI_SUCCESS; +#ifdef VSI_DEVICE_SUPPORT + GraphImpl* graphimp = dynamic_cast(graph.get()); + status = vsi_nn_BindDevices(graphimp->graph(), 1, &sub_devices_); +#else + CompileOption option; + option.setDeviceId(device_->Id()); + graph->SetCompileOption(option); +#endif + if(status == VSI_SUCCESS) { + return true; + } + else{ + return false; + } +} -NativeTensorHandle::NativeTensorHandle(const std::shared_ptr& tensor) { + +NativeTensorHandleImpl::NativeTensorHandleImpl(const std::shared_ptr& tensor) { tensor_ = tensor; + spec_ = tensor->GetSpec(); } -bool NativeTensorHandle::CopyDataToTensor(const void* data, +bool NativeTensorHandleImpl::CopyDataToTensor(const void* data, uint32_t size_in_bytes) { return tensor_->CopyDataToTensor(data, size_in_bytes); } -bool NativeTensorHandle::CopyDataFromTensor(void* data) { +bool 
NativeTensorHandleImpl::CopyDataFromTensor(void* data) { return tensor_->CopyDataFromTensor(data); } } // namespace platform } // namespace vx -} // namespace tim \ No newline at end of file +} // namespace tim diff --git a/src/tim/vx/platform/native_device_private.h b/src/tim/vx/platform/native_device_private.h deleted file mode 100644 index ad005f9..0000000 --- a/src/tim/vx/platform/native_device_private.h +++ /dev/null @@ -1,58 +0,0 @@ -/**************************************************************************** -* -* Copyright (c) 2020-2023 Vivante Corporation -* -* Permission is hereby granted, free of charge, to any person obtaining a -* copy of this software and associated documentation files (the "Software"), -* to deal in the Software without restriction, including without limitation -* the rights to use, copy, modify, merge, publish, distribute, sublicense, -* and/or sell copies of the Software, and to permit persons to whom the -* Software is furnished to do so, subject to the following conditions: -* -* The above copyright notice and this permission notice shall be included in -* all copies or substantial portions of the Software. -* -* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR -* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, -* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE -* AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER -* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING -* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER -* DEALINGS IN THE SOFTWARE. 
-* -*****************************************************************************/ -#ifndef TIM_VX_NATIVE_DEVICE_PRIVATE_H_ -#define TIM_VX_NATIVE_DEVICE_PRIVATE_H_ - -#include "tim/vx/platform/native.h" -#include "vip/virtual_device.h" -#include "graph_private.h" - -namespace tim { -namespace vx { - -class GraphImpl; - -namespace platform { - -class NativeDeviceImpl : public NativeDevice { - public: - NativeDeviceImpl(device_id_t id); - ~NativeDeviceImpl(){}; - - bool Submit(const std::shared_ptr& graph) override; - bool Trigger(bool async = false, async_callback cb = NULL) override; - bool DeviceExit() override; - void WaitDeviceIdle() override; - - protected: - std::unique_ptr vip_device_; - std::vector vsi_graph_v_; - -}; - -} // namespace platform -} // namespace vx -} // namespace tim - -#endif /* TIM_VX_NATIVE_DEVICE_PRIVATE_H_*/ \ No newline at end of file diff --git a/src/tim/vx/platform/native_private.h b/src/tim/vx/platform/native_private.h new file mode 100644 index 0000000..3d86a08 --- /dev/null +++ b/src/tim/vx/platform/native_private.h @@ -0,0 +1,106 @@ +/**************************************************************************** +* +* Copyright (c) 2020-2025 Vivante Corporation +* +* Permission is hereby granted, free of charge, to any person obtaining a +* copy of this software and associated documentation files (the "Software"), +* to deal in the Software without restriction, including without limitation +* the rights to use, copy, modify, merge, publish, distribute, sublicense, +* and/or sell copies of the Software, and to permit persons to whom the +* Software is furnished to do so, subject to the following conditions: +* +* The above copyright notice and this permission notice shall be included in +* all copies or substantial portions of the Software. 
+* +* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +* AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING +* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER +* DEALINGS IN THE SOFTWARE. +* +*****************************************************************************/ +#ifndef TIM_VX_NATIVE_DEVICE_PRIVATE_H_ +#define TIM_VX_NATIVE_DEVICE_PRIVATE_H_ + +#include "tim/vx/platform/native.h" +#include "vip/virtual_device.h" +#include "graph_private.h" + +namespace tim { +namespace vx { + +class GraphImpl; + +namespace platform { + +class NativeDeviceImpl : public NativeDevice, + public std::enable_shared_from_this{ + public: + NativeDeviceImpl(device_id_t id,uint32_t core_count); + ~NativeDeviceImpl(){}; + + bool Submit(const std::shared_ptr& graph) override; + bool Trigger(bool async = false, async_callback cb = NULL) override; + bool DeviceExit() override; + void WaitDeviceIdle() override; + std::shared_ptr CreateExecutor(const int32_t core_index = 0, + const int32_t core_count = -1, + const std::shared_ptr& context = nullptr) override; +}; + +class NativeExecutableImpl : public NativeExecutable { + public: + NativeExecutableImpl(const std::shared_ptr& executor, + const std::vector& nb_buf, size_t inputs, + size_t outputs); + ~NativeExecutableImpl() {}; + void SetInput(const std::shared_ptr& th) override; + void SetOutput(const std::shared_ptr& th) override; + void SetInputs(const std::vector>& ths) override; + void SetOutputs(const std::vector>& ths) override; + bool Submit(const std::shared_ptr& ref, bool after = true) override; + bool Trigger(bool async = false) override; + std::shared_ptr AllocateTensor(const TensorSpec& tensor_spec, + void* data = nullptr, uint32_t size 
= 0) override; + bool Verify() override; + + protected: + std::shared_ptr nb_node_; + std::vector nb_buf_; +}; + +class NativeExecutorImpl : public NativeExecutor, + public std::enable_shared_from_this { + public: + NativeExecutorImpl(const std::shared_ptr& device, + const int32_t core_count = -1, + const int32_t core_index = 0, + const std::shared_ptr& context = nullptr); + ~NativeExecutorImpl(){}; + bool Submit(const std::shared_ptr& executable, + const std::shared_ptr& ref, + bool after = true) override; + bool Trigger(bool async = false) override; + std::shared_ptr Compile(const std::shared_ptr& graph) override; + bool BindDevices(const std::shared_ptr& graph); + +private: +#ifdef VSI_DEVICE_SUPPORT + vsi_nn_device_t sub_devices_; +#endif +}; + +class NativeTensorHandleImpl : public NativeTensorHandle { + public: + NativeTensorHandleImpl(const std::shared_ptr& tensor); + bool CopyDataToTensor(const void* data, uint32_t size_in_bytes) override; + bool CopyDataFromTensor(void* data) override; +}; + +} // namespace platform +} // namespace vx +} // namespace tim + +#endif /* TIM_VX_NATIVE_DEVICE_PRIVATE_H_*/