diff --git a/cmake/local_sdk.cmake b/cmake/local_sdk.cmake index a74de46..bed2a21 100644 --- a/cmake/local_sdk.cmake +++ b/cmake/local_sdk.cmake @@ -9,7 +9,11 @@ list(APPEND OVXDRV_INCLUDE_DIRS if("${CONFIG}" STREQUAL "BUILDROOT") set(VIV_SDK_DRIVER_PREFIX "usr/lib") else() - set(VIV_SDK_DRIVER_PREFIX "drivers") + if(EXISTS ${EXTERNAL_VIV_SDK}/drivers) + set(VIV_SDK_DRIVER_PREFIX "drivers") + else() + set(VIV_SDK_DRIVER_PREFIX "lib") + endif() endif() message("using driver libs from ${EXTERNAL_VIV_SDK}/${VIV_SDK_DRIVER_PREFIX}") diff --git a/include/tim/vx/platform/lite/lite_native.h b/include/tim/vx/platform/lite/lite_native.h index b83d5fe..a9ed553 100644 --- a/include/tim/vx/platform/lite/lite_native.h +++ b/include/tim/vx/platform/lite/lite_native.h @@ -25,72 +25,58 @@ #define TIM_VX_LITE_NATIVE_H_ #include "tim/vx/platform/platform.h" -#include "vip_lite.h" -#include "nbg_linker.h" namespace tim { namespace vx { namespace platform { -class LiteNativeExecutor - : public IExecutor, - public std::enable_shared_from_this { +class LiteNativeDevice : public IDevice { public: - LiteNativeExecutor(const std::shared_ptr& device); - virtual ~LiteNativeExecutor(); - bool Submit(const std::shared_ptr& executable, - const std::shared_ptr& ref, - bool after = true) override; - bool Trigger(bool async = false) override; - std::shared_ptr Compile( - const std::shared_ptr& graph) override; - - private: - vip_task_descriptor_t* task_descriptor_; - vip_database database_; + virtual ~LiteNativeDevice() {}; + virtual bool Submit(const std::shared_ptr& graph) = 0; + virtual bool Trigger(bool async = false, async_callback cb = NULL) = 0; + virtual bool DeviceExit() = 0; + virtual void WaitDeviceIdle() = 0; + virtual std::shared_ptr CreateExecutor(const int32_t core_index = 0, + const int32_t core_count = -1, + const std::shared_ptr& context = nullptr) = 0; + static std::vector> Enumerate(); + static bool vip_initialized; +}; +class LiteNativeExecutor + : public IExecutor { + public: 
+ virtual ~LiteNativeExecutor() {}; + virtual bool Submit(const std::shared_ptr& executable, + const std::shared_ptr& ref, + bool after = true) = 0; + virtual bool Trigger(bool async = false) = 0; + virtual std::shared_ptr Compile( + const std::shared_ptr& graph) = 0; }; class LiteNativeExecutable : public IExecutable { public: - LiteNativeExecutable(const std::shared_ptr& executor, - const std::vector& nb_buf); - virtual ~LiteNativeExecutable(); - void SetInput(const std::shared_ptr& th) override; - void SetOutput(const std::shared_ptr& th) override; - void GetOutput( - const std::vector>& th) override; - bool Submit(const std::shared_ptr& ref, bool after) override; - bool Trigger(bool async) override; - bool Verify() override; - std::shared_ptr AllocateTensor( - const TensorSpec& tensor_spec) override; - - vip_network network_; - - private: - void SetBuffer(vip_memory_t* dst, gcvip_videomemory_t* src); - - int32_t input_count_; - int32_t output_count_; - - gcvip_videomemory_t* coeff_; - gcvip_videomemory_t* command_; - gcvip_videomemory_t* memory_pool_; - gcvip_videomemory_t* others_; - gcvip_videomemory_t* pre_command_; + virtual ~LiteNativeExecutable() {}; + virtual void SetInput(const std::shared_ptr& th) = 0; + virtual void SetOutput(const std::shared_ptr& th) = 0; + virtual void SetInputs(const std::vector>& ths) = 0; + virtual void SetOutputs(const std::vector>& ths) = 0; + virtual bool Submit(const std::shared_ptr& ref, bool after) = 0; + virtual bool Trigger(bool async) = 0; + virtual bool Verify() = 0; + virtual std::shared_ptr AllocateTensor(const TensorSpec& tensor_spec, + void* data = nullptr, uint32_t size = 0) = 0; }; class LiteNativeTensorHandle : public ITensorHandle { public: - LiteNativeTensorHandle(const std::shared_ptr& tensr); - virtual ~LiteNativeTensorHandle(); - bool CopyDataToTensor(const void* data, uint32_t size_in_bytes) override; - bool CopyDataFromTensor(void* data) override; - - gcvip_videomemory_t* tensor_buffer_; + virtual 
~LiteNativeTensorHandle() {}; + bool CopyDataToTensor(const void* data, uint32_t size_in_bytes) = 0; + bool CopyDataFromTensor(void* data) = 0; }; } // namespace platform } // namespace vx } // namespace tim -#endif \ No newline at end of file +#endif diff --git a/include/tim/vx/platform/native.h b/include/tim/vx/platform/native.h index cecf34a..8521731 100644 --- a/include/tim/vx/platform/native.h +++ b/include/tim/vx/platform/native.h @@ -37,51 +37,41 @@ class NativeDevice : public IDevice { virtual bool Trigger(bool async = false, async_callback cb = NULL) = 0; virtual bool DeviceExit() = 0; virtual void WaitDeviceIdle() = 0; + virtual std::shared_ptr CreateExecutor(const int32_t core_index = 0, + const int32_t core_count = -1, + const std::shared_ptr& context = nullptr) = 0; static std::vector> Enumerate(); }; class NativeExecutable : public IExecutable { public: - NativeExecutable(const std::shared_ptr& executor, - const std::vector& nb_buf, size_t inputs, - size_t outputs); - ~NativeExecutable(){}; - void SetInput(const std::shared_ptr& th) override; - void SetOutput(const std::shared_ptr& th) override; - void GetOutput( - const std::vector>& th) override; - bool Submit(const std::shared_ptr& ref, - bool after = true) override; - bool Trigger(bool async = false) override; - std::shared_ptr AllocateTensor( - const TensorSpec& tensor_spec) override; - bool Verify() override; - - protected: - std::shared_ptr nb_node_; - std::vector nb_buf_; + virtual ~NativeExecutable() {}; + virtual void SetInput(const std::shared_ptr& th) = 0; + virtual void SetOutput(const std::shared_ptr& th) = 0; + virtual void SetInputs(const std::vector>& ths) = 0; + virtual void SetOutputs(const std::vector>& ths) = 0; + virtual bool Submit(const std::shared_ptr& ref, + bool after = true) = 0; + virtual bool Trigger(bool async = false) = 0; + virtual std::shared_ptr AllocateTensor(const TensorSpec& tensor_spec, + void* data = nullptr, uint32_t size = 0) = 0; + virtual bool Verify() = 0; 
}; -class NativeExecutor : public IExecutor, - public std::enable_shared_from_this { +class NativeExecutor : public IExecutor { public: - NativeExecutor(const std::shared_ptr& device); - NativeExecutor(const std::shared_ptr& device, - const std::shared_ptr& context); - ~NativeExecutor(){}; - bool Submit(const std::shared_ptr& executable, - const std::shared_ptr& ref, - bool after = true) override; - bool Trigger(bool async = false) override; - std::shared_ptr Compile( - const std::shared_ptr& graph) override; + virtual ~NativeExecutor(){}; + virtual bool Submit(const std::shared_ptr& executable, + const std::shared_ptr& ref, + bool after = true) = 0; + virtual bool Trigger(bool async = false) = 0; + virtual std::shared_ptr Compile(const std::shared_ptr& graph) = 0; }; class NativeTensorHandle : public ITensorHandle { public: - NativeTensorHandle(const std::shared_ptr& tensor); - bool CopyDataToTensor(const void* data, uint32_t size_in_bytes) override; - bool CopyDataFromTensor(void* data) override; + virtual bool CopyDataToTensor(const void* data, uint32_t size_in_bytes) = 0; + virtual bool CopyDataFromTensor(void* data) = 0; }; } // namespace platform diff --git a/include/tim/vx/platform/platform.h b/include/tim/vx/platform/platform.h index 263042b..94ba61c 100644 --- a/include/tim/vx/platform/platform.h +++ b/include/tim/vx/platform/platform.h @@ -46,15 +46,12 @@ namespace platform { class IDevice; class IExecutable; -class ExecutableSet; class IExecutor; class ITensorHandle; std::shared_ptr Compile( const std::shared_ptr& graph, const std::shared_ptr& executor); -std::shared_ptr CreateExecutableSet( - const std::vector>& executables); class IDevice { public: @@ -68,17 +65,25 @@ class IDevice { virtual ~IDevice(){}; virtual bool Submit(const std::shared_ptr& graph) = 0; virtual bool Trigger(bool async = false, async_callback cb = NULL) = 0; - device_id_t Id() const; + device_id_t Id() const { return device_id_;}; virtual void WaitDeviceIdle() = 0; virtual bool 
DeviceExit() = 0; virtual void RemoteReset(); + uint32_t CoreCount() const {return core_count_;}; + virtual std::shared_ptr CreateExecutor(const int32_t core_index = 0, + const int32_t core_count = -1, + const std::shared_ptr& context = nullptr) = 0; + static std::vector> Enumerate(); protected: device_id_t device_id_; + uint32_t core_count_; + }; class IExecutor { public: + //using task = std::shared_ptr; using task = std::weak_ptr; virtual ~IExecutor(){}; virtual bool Submit(const std::shared_ptr& executable, @@ -87,13 +92,17 @@ class IExecutor { virtual bool Trigger(bool async = false) = 0; // todo: async=true virtual std::shared_ptr Compile( const std::shared_ptr& graph) = 0; - virtual std::shared_ptr Device() const; - virtual std::shared_ptr Contex() const; - + virtual std::shared_ptr Device() const {return device_;}; + virtual std::shared_ptr Contex() const {return context_;}; + virtual uint32_t CoreIndex() const {return core_index_; }; + virtual uint32_t CoreCount() const {return core_count_; }; protected: std::vector tasks_; std::shared_ptr device_; std::shared_ptr context_; + uint32_t core_index_; + uint32_t core_count_; + }; class IExecutable : public std::enable_shared_from_this { @@ -101,40 +110,24 @@ class IExecutable : public std::enable_shared_from_this { virtual ~IExecutable(){}; virtual void SetInput(const std::shared_ptr& th) = 0; virtual void SetOutput(const std::shared_ptr& th) = 0; - virtual void GetOutput( - const std::vector>& th) = 0; // for remote + virtual void SetInputs(const std::vector>& ths) = 0; + virtual void SetOutputs(const std::vector>& ths) = 0; + virtual std::vector> GetOutputs() { return output_handles_;}; /* returns the output-side handle list, not the input one */ + virtual std::vector> Getinputs() { return input_handles_;}; virtual bool Submit(const std::shared_ptr& ref, bool after = true) = 0; virtual bool Trigger(bool async = false) = 0; // todo: async=true virtual bool Verify() = 0; - virtual std::shared_ptr NBGraph() const; - virtual std::shared_ptr AllocateTensor( - const
TensorSpec& tensor_spec) = 0; - virtual std::shared_ptr Executor() const; + std::shared_ptr NBGraph() const {return nb_graph_;}; + virtual std::shared_ptr AllocateTensor(const TensorSpec& tensor_spec , + void* data = nullptr, uint32_t size = 0) = 0; protected: std::weak_ptr executor_; std::shared_ptr context_; std::shared_ptr nb_graph_; -}; - -class ExecutableSet : public IExecutable { - public: - ExecutableSet(const std::vector>& executables); - void SetInput(const std::shared_ptr& th) override; - void SetOutput(const std::shared_ptr& th) override; - void GetOutput( - const std::vector>& th) override; - bool Submit(const std::shared_ptr& ref, - bool after = true) override; - bool Trigger(bool async = false) override; - bool Verify() override; - std::shared_ptr AllocateTensor( - const TensorSpec& tensor_spec) override; - std::vector> Executables() const; - - protected: - std::vector> executables_; + std::vector> input_handles_; + std::vector> output_handles_; }; class ITensorHandle { @@ -142,13 +135,15 @@ class ITensorHandle { virtual ~ITensorHandle(){}; virtual bool CopyDataToTensor(const void* data, uint32_t size_in_bytes) = 0; virtual bool CopyDataFromTensor(void* data) = 0; - virtual std::shared_ptr GetTensor() const; + virtual std::shared_ptr GetTensor() const { return tensor_;}; + virtual TensorSpec& GetSpec() { return spec_;}; protected: std::shared_ptr tensor_; + TensorSpec spec_; }; } // namespace platform } // namespace vx } // namespace tim -#endif \ No newline at end of file +#endif diff --git a/samples/CMakeLists.txt b/samples/CMakeLists.txt index 919c0a5..301afdb 100644 --- a/samples/CMakeLists.txt +++ b/samples/CMakeLists.txt @@ -20,9 +20,7 @@ endif() if(TIM_VX_ENABLE_PLATFORM) add_subdirectory("lenet_multi_device") add_subdirectory("multi_device") - if(${TIM_VX_ENABLE_PLATFORM_LITE}) - add_subdirectory("lite_multi_device") - endif() + add_subdirectory("platform_sample") if(TIM_VX_ENABLE_GRPC) add_subdirectory("grpc") endif() diff --git 
a/samples/lenet_multi_device/CMakeLists.txt b/samples/lenet_multi_device/CMakeLists.txt index 6658898..e62787b 100644 --- a/samples/lenet_multi_device/CMakeLists.txt +++ b/samples/lenet_multi_device/CMakeLists.txt @@ -11,5 +11,10 @@ target_include_directories(${TARGET_NAME} PRIVATE ${PROJECT_SOURCE_DIR}/include ) +target_include_directories(${TARGET_NAME} PRIVATE + ${CMAKE_CURRENT_SOURCE_DIR} + ${PROJECT_SOURCE_DIR}/include +) + install(TARGETS ${TARGET_NAME} ${TARGET_NAME} DESTINATION ${CMAKE_INSTALL_PREFIX}/${CMAKE_INSTALL_BINDIR}) diff --git a/samples/lenet_multi_device/lenet_multi_device.cc b/samples/lenet_multi_device/lenet_multi_device.cc index 9ce79c9..1761e97 100644 --- a/samples/lenet_multi_device/lenet_multi_device.cc +++ b/samples/lenet_multi_device/lenet_multi_device.cc @@ -33,7 +33,6 @@ #include "tim/vx/context.h" #include "tim/vx/graph.h" #include "tim/vx/platform/platform.h" -#include "tim/vx/platform/native.h" std::vector input_data = { 0, 0, 0, 0, 0, 0, 0, 0, 6, 0, 2, 0, 0, 8, 0, @@ -108,17 +107,17 @@ static void printTopN(const T* prob, int outputCount, int topNum) { } } + int main(int argc, char** argv) { (void) argc, (void) argv; auto context0 = tim::vx::Context::Create(); auto graph0 = lenet(context0); auto graph1 = lenet(context0); - auto devices = tim::vx::platform::NativeDevice::Enumerate(); + auto devices = tim::vx::platform::IDevice::Enumerate(); auto device = devices[0]; - std::shared_ptr executor = std::make_shared (device); - - auto executable0 = tim::vx::platform::Compile(graph0, executor); // compile to nbg + auto executor = device->CreateExecutor(0,-1,context0); + auto executable0 = tim::vx::platform::Compile(graph0, executor); auto input_handle0 = executable0->AllocateTensor(graph0->InputsTensor()[0]->GetSpec()); auto output_handle0 = executable0->AllocateTensor(graph0->OutputsTensor()[0]->GetSpec()); executable0->SetInput(input_handle0); @@ -127,7 +126,18 @@ int main(int argc, char** argv) { 
assert(executable0->Submit(executable0)); executable0->Trigger(); - auto executable1 = tim::vx::platform::Compile(graph1, executor); // compile to nbg + std::vector output_data; + output_data.resize(1 * 10); + if (!output_handle0->CopyDataFromTensor(output_data.data())) { + std::cout << "Copy output data fail." << std::endl; + return -1; + } + std::cout << "executable0 out." << std::endl; + printTopN(output_data.data(), output_data.size(), 5); + output_data.assign(output_data.size(),0); + output_handle0->CopyDataToTensor(output_data.data(), output_data.size()); + + auto executable1 = tim::vx::platform::Compile(graph1, executor); auto input_handle1 = executable1->AllocateTensor(graph1->InputsTensor()[0]->GetSpec()); auto output_handle1 = executable1->AllocateTensor(graph1->OutputsTensor()[0]->GetSpec()); executable1->SetInput(input_handle1); @@ -136,34 +146,28 @@ int main(int argc, char** argv) { assert(executable1->Submit(executable0)); executable1->Trigger(); + std::vector output_data1; + output_data1.resize(1 * 10); + if (!output_handle1->CopyDataFromTensor(output_data1.data())) { + std::cout << "Copy output data fail." << std::endl; + return -1; + } + std::cout << "executable1 out." << std::endl; + printTopN(output_data1.data(), output_data1.size(), 5); + output_data1.assign(output_data1.size(),0); + output_handle1->CopyDataToTensor(output_data1.data(), output_data1.size()); + executor->Submit(executable0, executable0); executor->Submit(executable1, executable0); - std::vector> executables0; - executables0.push_back(executable0); - executables0.push_back(executable1); - auto executable_set0 = tim::vx::platform::CreateExecutableSet(executables0); - executor->Submit(executable_set0, executable_set0); executor->Trigger(); - - std::vector input_data0; - input_data0.resize(28 * 28); - if (!input_handle0->CopyDataFromTensor(input_data0.data())) { - std::cout << "Copy intput data fail." 
<< std::endl; - return -1; - } - printTopN(input_data0.data(), input_data0.size(), 5); - - std::vector output_data; - output_data.resize(1 * 10); + std::cout << "executor out." << std::endl; if (!output_handle0->CopyDataFromTensor(output_data.data())) { std::cout << "Copy output data fail." << std::endl; return -1; } printTopN(output_data.data(), output_data.size(), 5); - std::vector output_data1; - output_data1.resize(1 * 10); if (!output_handle1->CopyDataFromTensor(output_data1.data())) { std::cout << "Copy output data fail." << std::endl; return -1; diff --git a/samples/lite_multi_device/CMakeLists.txt b/samples/lite_multi_device/CMakeLists.txt deleted file mode 100644 index 0356eef..0000000 --- a/samples/lite_multi_device/CMakeLists.txt +++ /dev/null @@ -1,13 +0,0 @@ -message("samples/lite_multi_device") - -set(TARGET_NAME "lite_multi_device") - -add_executable(${TARGET_NAME} ${CMAKE_CURRENT_SOURCE_DIR}/lite_multi_device.cc) - -target_link_libraries(${TARGET_NAME} PRIVATE -Wl,--whole-archive tim-vx) -target_include_directories(${TARGET_NAME} PRIVATE - ${PROJECT_SOURCE_DIR}/include - ${PROJECT_SOURCE_DIR}/prebuilt-sdk/viplite/build/sdk/include) - -install(TARGETS ${TARGET_NAME} ${TARGET_NAME} - DESTINATION ${CMAKE_INSTALL_PREFIX}/${CMAKE_INSTALL_BINDIR}) \ No newline at end of file diff --git a/samples/multi_device/README b/samples/multi_device/README index 557e20d..890e417 100644 --- a/samples/multi_device/README +++ b/samples/multi_device/README @@ -1,15 +1,25 @@ ## brief -The multi_device demo uses some acuity exported tim-vx networks, and running on 4 devices of NPU using platform api. +The multi_device demo uses some acuity exported tim-vx networks, and running on multi-core devices of NPU using platform api. 
-## environment - export VSIMULATOR_CONFIG=VIP9400O_PID0XD9 - export VIV_MGPU_AFFINITY="1:0" - export VIV_OVX_USE_MULTI_DEVICE="1:1" - export TIM_VX_ROOT="${workspaceFolder}/tim-vx" +## note +Please note that if you have enabled the lite platform, a dedicated VIVANTE_SDK (NO_KERNEL) is required as the compiler for NBG. +The driver for the NPU is the VIPLITE driver. + +## requirements +Vivante SDK >= 6.4.22 +ovxlib >= 1.2.26 +viplite >= 2.0.0 ## build cd build -cmake .. -DCMAKE_BUILD_TYPE=Debug -DTIM_VX_BUILD_EXAMPLES=ON -DTIM_VX_ENABLE_PLATFORM=ON +cmake .. -DCMAKE_BUILD_TYPE=Release -DTIM_VX_USE_EXTERNAL_OVXLIB=ON -DEXTERNAL_VIV_SDK=${VIVANTE_NOKERNEL_SDK_DIR} -DOVXLIB_INC=${OVXLIB_DIR}/include \ + -DOVXLIB_LIB=/path/to/libovxlib.so -DTIM_VX_BUILD_EXAMPLES=ON -DTIM_VX_ENABLE_PLATFORM=ON -DTIM_VX_ENABLE_PLATFORM_LITE=ON -DVIP_LITE_SDK=${VIP_LITE_SDK} + +## environment +# Export VIV_GPU_FILE to specify the NPU hardware configuration file for the NBG compiler +# Export TIM_VX_ROOT to point at the tim-vx source tree; the demo loads its weight files from under it +export VIV_GPU_FILE="/path/to/VIP9400NANOQ_PLUS_PID0X10000055.config" +export TIM_VX_ROOT="${workspaceFolder}/tim-vx" ## run cd build diff --git a/samples/multi_device/multi_device.cc b/samples/multi_device/multi_device.cc index 6e1e772..c3f040a 100644 --- a/samples/multi_device/multi_device.cc +++ b/samples/multi_device/multi_device.cc @@ -35,7 +35,6 @@ #include "tim/vx/context.h" #include "tim/vx/graph.h" #include "tim/vx/platform/platform.h" -#include "tim/vx/platform/native.h" #include "vx_lenet.h" #include "vx_mobilenet.h" #include "vx_resnet50.h" @@ -59,7 +58,7 @@ static void printTopN(const T* prob, int outputCount, int topNum) { } template -void print_topN(std::size_t size, std::shared_ptr handle) { +void print_topN(std::size_t size, std::shared_ptr & handle) { std::vector output_data; output_data.resize(size); if (!handle->CopyDataFromTensor(output_data.data())) { @@ -94,7 +93,8 @@ void executor_trigger(std::shared_ptr executor) {
} auto context = tim::vx::Context::Create(); -std::pair, std::shared_ptr> generate_executable( +std::pair, std::shared_ptr> + generate_executable( std::shared_ptr executor, std::function, const char*)> construct_func, std::string weight_file, @@ -114,15 +114,17 @@ std::pair, std::shared_ptr executor0 = std::make_shared (device0); - auto device1 = devices[1]; - std::shared_ptr executor1 = std::make_shared (device1); - auto device2 = devices[2]; - std::shared_ptr executor2 = std::make_shared (device2); - auto device3 = devices[3]; - std::shared_ptr executor3 = std::make_shared (device3); + auto total_core_count = device0->CoreCount(); + uint32_t core_index = 0; + auto use_core_count = 1; + std::vector> executors; + + for(core_index = 0; core_index < total_core_count; core_index += use_core_count) { + auto executor = device0->CreateExecutor(core_index,use_core_count, context); + executors.push_back(executor); + } auto root = std::getenv("TIM_VX_ROOT"); assert(root != NULL); @@ -142,46 +144,57 @@ int main(int argc, char** argv) { auto resnet50_weight_file = ROOT + "/samples/multi_device/resnet50/resnet50.export.data"; std::function, const char*)> resnet50_construct_func = acuitylite::resnet50::construct_graph; - std::shared_ptr lenet_0, lenet_2, lenet_3, mobilenet_1, mobilenet_2, mobilenet_3, resnet50_0, resnet50_1; - std::shared_ptr lenet_0_outhandle, lenet_2_outhandle, lenet_3_outhandle, mobilenet_1_outhandle, mobilenet_2_outhandle, mobilenet_3_outhandle, - resnet50_0_outhandle, resnet50_1_outhandle; + auto excutor_cnt = executors.size(); - std::tie(lenet_0, lenet_0_outhandle) = generate_executable(executor0, lenet_construct_func, lenet_weight_file, lenet_input_files, lenet_input_bytes); - std::tie(resnet50_0, resnet50_0_outhandle) = generate_executable(executor0, resnet50_construct_func, resnet50_weight_file, resnet50_input_files, resnet50_input_bytes); - executor0->Submit(lenet_0, lenet_0); - executor0->Submit(resnet50_0, lenet_0); + //each excutor run 2 models. 
+ auto lenet = [&](std::shared_ptr executor) { + return generate_executable(executor, lenet_construct_func, lenet_weight_file, + lenet_input_files, lenet_input_bytes); + }; + auto resnet = [&](std::shared_ptr executor) { + return generate_executable(executor, resnet50_construct_func, resnet50_weight_file, + resnet50_input_files, resnet50_input_bytes); + }; + auto mobilenet = [&](std::shared_ptr executor) { + return generate_executable(executor, mobilenet_construct_func, mobilenet_weight_file, + mobilenet_input_files, mobilenet_input_bytes); + }; + std::vector, + std::shared_ptr>> nets; + for (size_t i = 0; i < excutor_cnt; i++) { + if(i % 3 == 0) { + //lenet + resnet + nets.push_back(lenet(executors[i])); + executors[i]->Submit(nets.back().first, nets.back().first); + nets.push_back(resnet(executors[i])); + executors[i]->Submit(nets.back().first, nets.back().first); + } + if(i % 3 == 1) { + //resnet + mobilenet + nets.push_back(resnet(executors[i])); + executors[i]->Submit(nets.back().first, nets.back().first); + nets.push_back(mobilenet(executors[i])); + executors[i]->Submit(nets.back().first, nets.back().first); + } + if(i % 3 == 2) { + //lenet + mobilenet + nets.push_back(mobilenet(executors[i])); + executors[i]->Submit(nets.back().first, nets.back().first); + nets.push_back(lenet(executors[i])); + executors[i]->Submit(nets.back().first, nets.back().first); + } + } + std::vector threads; + for(auto executor:executors) { + threads.push_back(std::thread(executor_trigger, executor)); + } + for(std::thread &t : threads) { + t.join(); + } - std::tie(mobilenet_1, mobilenet_1_outhandle) = generate_executable(executor1, mobilenet_construct_func, mobilenet_weight_file, mobilenet_input_files, mobilenet_input_bytes); - std::tie(resnet50_1, resnet50_1_outhandle) = generate_executable(executor1, resnet50_construct_func, resnet50_weight_file, resnet50_input_files, resnet50_input_bytes); - auto executable_set1 = tim::vx::platform::CreateExecutableSet({mobilenet_1, 
resnet50_1}); - executor1->Submit(executable_set1, executable_set1); - - std::tie(lenet_2, lenet_2_outhandle) = generate_executable(executor2, lenet_construct_func, lenet_weight_file, lenet_input_files, lenet_input_bytes); - std::tie(mobilenet_2, mobilenet_2_outhandle) = generate_executable(executor2, mobilenet_construct_func, mobilenet_weight_file, mobilenet_input_files, mobilenet_input_bytes); - auto executable_set2 = tim::vx::platform::CreateExecutableSet({lenet_2, mobilenet_2}); - executor2->Submit(executable_set2, executable_set2); - - std::tie(lenet_3, lenet_3_outhandle) = generate_executable(executor3, lenet_construct_func, lenet_weight_file, lenet_input_files, lenet_input_bytes); - std::tie(mobilenet_3, mobilenet_3_outhandle) = generate_executable(executor3, mobilenet_construct_func, mobilenet_weight_file, mobilenet_input_files, mobilenet_input_bytes); - auto executable_set3 = tim::vx::platform::CreateExecutableSet({lenet_3, mobilenet_3}); - executor3->Submit(executable_set3, executable_set3); - - std::thread t0(executor_trigger, executor0); - std::thread t1(executor_trigger, executor1); - std::thread t2(executor_trigger, executor2); - std::thread t3(executor_trigger, executor3); - t0.join(); - t1.join(); - t2.join(); - t3.join(); - - print_topN(1 * 10, lenet_0_outhandle); - print_topN(1 * 10, lenet_2_outhandle); - print_topN(1 * 10, lenet_3_outhandle); - print_topN(1 * 1001, mobilenet_1_outhandle); - print_topN(1 * 1001, mobilenet_2_outhandle); - print_topN(1 * 1001, mobilenet_3_outhandle); - print_topN(1 * 1000, resnet50_0_outhandle); - print_topN(1 * 1000, resnet50_1_outhandle); +for (auto net : nets) { + auto size = net.second->GetSpec().GetElementNum(); + print_topN(size, net.second); +} return 0; } diff --git a/samples/multi_device/multi_device_demo.cc b/samples/multi_device/multi_device_demo.cc index dd20c3c..369569b 100644 --- a/samples/multi_device/multi_device_demo.cc +++ b/samples/multi_device/multi_device_demo.cc @@ -29,7 +29,7 @@ #include 
"tim/vx/graph.h" #include "tim/vx/operation.h" #include "tim/vx/tensor.h" -#include "tim/vx/platform/native.h" +#include "tim/vx/platform/platform.h" static void printTopN() { } @@ -46,9 +46,9 @@ int demo(int argc, char** argv) { tim::vx::TensorSpec g0_input0, g0_output0, g1_output0, g2_output0, g3_output0, g4_output0, g5_output0; // query device and get executor of devcie - auto devices = tim::vx::platform::NativeDevice::Enumerate(); + auto devices = tim::vx::platform::IDevice::Enumerate(); auto device = devices[0]; - std::shared_ptr executor = std::make_shared (device); + auto executor = device->CreateExecutor(0,-1, context); // executable0 auto executable0 = executor->Compile(g0); // compile to nbg @@ -89,33 +89,6 @@ int demo(int argc, char** argv) { // trigger executor->Trigger(); // run all submitted executables - /* 2. another way to run */ - // executable_set0 - std::vector> executables0; - executables0.push_back(executable0); - auto executable_set0 = CreateExecutableSet(executables0); - // executable_set1 - std::vector> executables1; - executables1.push_back(executable1); - executables1.push_back(executable3); - auto executable_set1 = CreateExecutableSet(executables1); - // executable_set2 - std::vector> executables2; - executables2.push_back(executable2); - executables2.push_back(executable4); - auto executable_set2 = CreateExecutableSet(executables2); - // executable_set3 - std::vector> executables3; - executables3.push_back(executable5); - auto executable_set3 = CreateExecutableSet(executables3); - // submit executaleSets - executable_set0->Submit(executable_set0); - executable_set1->Submit(executable_set0); - executable_set2->Submit(executable_set1); - executable_set3->Submit(executable_set2); - // trigger - executor->Trigger(); // run all submitted executableSets - printTopN(); return 0; diff --git a/samples/multi_device/vx_resnet50.cc b/samples/multi_device/vx_resnet50.cc index 7011e3e..a15480e 100644 --- a/samples/multi_device/vx_resnet50.cc +++ 
b/samples/multi_device/vx_resnet50.cc @@ -1296,7 +1296,7 @@ void resnet50::construct_graph auto input_0 = graph->CreateTensor(input_0_spec); tim::vx::ShapeType output_229_shape({1000,1}); - tim::vx::TensorSpec output_229_spec(tim::vx::DataType::FLOAT16, output_229_shape, + tim::vx::TensorSpec output_229_spec(tim::vx::DataType::FLOAT32, output_229_shape, tim::vx::TensorAttribute::OUTPUT); auto output_229 = graph->CreateTensor(output_229_spec); diff --git a/samples/platform_sample/CMakeLists.txt b/samples/platform_sample/CMakeLists.txt new file mode 100644 index 0000000..1b91e85 --- /dev/null +++ b/samples/platform_sample/CMakeLists.txt @@ -0,0 +1,13 @@ +message("samples/platform_sample") + +set(TARGET_NAME "platform_sample") + +add_executable(${TARGET_NAME} ${CMAKE_CURRENT_SOURCE_DIR}/platform_sample.cc) + +target_link_libraries(${TARGET_NAME} PRIVATE -Wl,--whole-archive tim-vx) +target_include_directories(${TARGET_NAME} PRIVATE + ${CMAKE_CURRENT_SOURCE_DIR} + ${PROJECT_SOURCE_DIR}/include) + +install(TARGETS ${TARGET_NAME} ${TARGET_NAME} + DESTINATION ${CMAKE_INSTALL_PREFIX}/${CMAKE_INSTALL_BINDIR}) diff --git a/samples/platform_sample/README b/samples/platform_sample/README new file mode 100644 index 0000000..f1df921 --- /dev/null +++ b/samples/platform_sample/README @@ -0,0 +1,25 @@ +## brief +A minimal sample that shows how to use the platform API. + +## note +Please note that if you have enabled the lite platform, a dedicated VIVANTE_SDK (NO_KERNEL) is required as the compiler for NBG. +The driver for the NPU is the VIPLITE driver. + +## requirements +Vivante SDK >= 6.4.22 +ovxlib >= 1.2.26 +viplite >= 2.0.0 + +## build +cd build +cmake ..
-DCMAKE_BUILD_TYPE=Release -DTIM_VX_USE_EXTERNAL_OVXLIB=ON -DEXTERNAL_VIV_SDK=${VIVANTE_NOKERNEL_SDK_DIR} -DOVXLIB_INC=${OVXLIB_DIR}/include \ + -DOVXLIB_LIB=${VIVANTE_NOKERNEL_SDK_DIR}/drivers/libovxlib.so -DTIM_VX_BUILD_EXAMPLES=ON -DTIM_VX_ENABLE_PLATFORM=ON \ + -DTIM_VX_ENABLE_PLATFORM_LITE=ON -DVIP_LITE_SDK=${VIP_LITE_SDK} + +## environment +# Export VIV_GPU_FILE to specify the NPU hardware configuration file for the NBG compiler +export VIV_GPU_FILE="/path/to/VIP9000NANOQ_PLUS_PID0X100000XX.config" + +## run +cd build +./samples/platform_sample/platform_sample diff --git a/samples/lite_multi_device/lite_multi_device.cc b/samples/platform_sample/platform_sample.cc similarity index 87% rename from samples/lite_multi_device/lite_multi_device.cc rename to samples/platform_sample/platform_sample.cc index 51aec07..9506a9e 100644 --- a/samples/lite_multi_device/lite_multi_device.cc +++ b/samples/platform_sample/platform_sample.cc @@ -26,8 +26,8 @@ #include "tim/vx/graph.h" #include "tim/vx/ops.h" #include "tim/vx/types.h" -#include "tim/vx/platform/native.h" -#include "tim/vx/platform/lite/lite_native.h" +#include "tim/vx/platform/platform.h" + int main() { //construct tim-vx graph @@ -49,9 +49,15 @@ int main() { std::vector data_vec_i0({1, 2, 3, 4}); std::vector data_vec_i1({4, 3, 2, 1}); - auto devices = tim::vx::platform::NativeDevice::Enumerate(); + auto devices = tim::vx::platform::IDevice::Enumerate(); + + std::cout << "NPU device count: " << devices.size() <(device); + //run 1 core in device 0 + std::cout << "NPU device[0] has " << device->CoreCount() << "cores" <CreateExecutor(use_core_count); + auto executable = executor->Compile(graph); auto input0_handle = executable->AllocateTensor(input_spec); auto input1_handle = executable->AllocateTensor(input_spec); @@ -73,6 +79,10 @@ int main() { //each output value should be "5" in this demo for (int i = 0; i < 4; ++i) { std::cout << "output value: " << data[i] << std::endl; + if(data[i] != 5) { + std::cout << 
"test failed" << std::endl; + break; + } } free(data); return 0; diff --git a/src/tim/CMakeLists.txt b/src/tim/CMakeLists.txt index 8b38b33..f8f36c5 100644 --- a/src/tim/CMakeLists.txt +++ b/src/tim/CMakeLists.txt @@ -61,8 +61,10 @@ if(TIM_VX_ENABLE_PLATFORM) endif() list(APPEND LITE_EXTERNAL_LIBS ${VIP_LITE_SDK}/drivers/libNBGlinker.so - ${VIP_LITE_SDK}/drivers/libVIPlite.so) - list(APPEND LITE_INC_DIRS ${VIP_LITE_SDK}/include) + ${VIP_LITE_SDK}/drivers/libVIPhal.so) + list(APPEND LITE_INC_DIRS + ${VIP_LITE_SDK}/include + ${VIP_LITE_SDK}/include/nbg_linker) endif() if(TIM_VX_ENABLE_GRPC) diff --git a/src/tim/vx/platform/lite/lite_native.cc b/src/tim/vx/platform/lite/lite_native.cc index 6ab557c..21a45af 100644 --- a/src/tim/vx/platform/lite/lite_native.cc +++ b/src/tim/vx/platform/lite/lite_native.cc @@ -22,36 +22,202 @@ * *****************************************************************************/ #include "tim/vx/platform/lite/lite_native.h" +#include "lite_native_private.h" #include - #include "tim/vx/graph.h" #include "graph_private.h" -#include "vsi_nn_pub.h" +#include "context_private.h" namespace tim { namespace vx { namespace platform { -LiteNativeExecutor::LiteNativeExecutor(const std::shared_ptr& device) { + + LiteNetwork::LiteNetwork(vip_create_network_param_t& param) { + vip_create_network(¶m, sizeof(param), &network_); + } + vip_status_e LiteNetwork::Query(vip_enum property, void* value) { + return vip_query_network(network_, property, value); + } + vip_status_e LiteNetwork::Set(vip_enum property, void* value) { + return vip_set_network(network_, property, value); + } + vip_status_e LiteNetwork::Prepare() { + return vip_prepare_network(network_); + } + vip_status_e LiteNetwork::Run() {return vip_run_network(network_);} + + vip_status_e LiteNetwork::Trigger() {return vip_trigger_network(network_);} + + vip_status_e LiteNetwork::Wait() {return vip_wait_network(network_);} + + vip_status_e LiteNetwork::Cancel() {return vip_cancel_network(network_);} + 
+ vip_status_e LiteNetwork::QueryInput(vip_uint32_t index, vip_enum property, void* value) { + return vip_query_input(network_, index, property,value); + } + + vip_status_e LiteNetwork::QueryOutput(vip_uint32_t index, vip_enum property, void* value) { + return vip_query_output(network_, index, property, value); + } + + vip_status_e LiteNetwork::SetInput(vip_uint32_t index, std::shared_ptr input) { + vip_buffer buffer = + std::dynamic_pointer_cast(input)->GetBuffer(); + return vip_set_input(network_, index, buffer); + } + + vip_status_e LiteNetwork::SetOutput(vip_uint32_t index, std::shared_ptr output) { + vip_buffer buffer = + std::dynamic_pointer_cast(output)->GetBuffer(); + return vip_set_output(network_, index, buffer); + } + + LiteNetwork::~LiteNetwork(){ + vip_finish_network(network_); + vip_destroy_network(network_); + } + +bool LiteNativeDevice::vip_initialized = false; + +LiteNativeDeviceImpl::LiteNativeDeviceImpl(device_id_t id,uint32_t core_count) { + device_id_ = id; + core_count_ = core_count; + } + +bool LiteNativeDeviceImpl::Submit(const std::shared_ptr& graph) { + (void)graph; + return true; +} + +bool LiteNativeDeviceImpl::Trigger(bool async, async_callback cb) { + (void)async; + (void)cb; + return true; +} +void LiteNativeDeviceImpl::WaitDeviceIdle() {} + +bool LiteNativeDeviceImpl::DeviceExit() {return false;} + +std::shared_ptr LiteNativeDeviceImpl::CreateExecutor(const int32_t core_index, + const int32_t core_count, + const std::shared_ptr& context) { + std::shared_ptr this_sp = shared_from_this(); + auto executor = std::make_shared(this_sp, core_count, core_index, context); + return executor; +} + +std::vector> LiteNativeDevice::Enumerate() { + std::vector> device_v; + device_id_t deviceCount = 0; + std::vector core_count; + uint32_t version = 0; + if( !LiteNativeDevice::vip_initialized ) { + vip_status_e status = vip_init(); + if(status != VIP_SUCCESS) { + VSILOGE("Initialize viplite driver fail"); + return device_v; + } + 
LiteNativeDevice::vip_initialized = true; + } + version = vip_get_version(); + if (version >= 0x00010601 ) { + vip_query_hardware(VIP_QUERY_HW_PROP_DEVICE_COUNT, sizeof(uint32_t), &deviceCount); + core_count.resize(deviceCount); + vip_query_hardware(VIP_QUERY_HW_PROP_CORE_COUNT_EACH_DEVICE, + sizeof(uint32_t) * core_count.size(), core_count.data()); + } + + for (device_id_t i = 0; i < deviceCount; i++) { + auto local_device = std::make_shared(i, core_count.at(i)); + device_v.push_back(local_device); + } + return device_v; +} + +int LiteNativeExecutorImpl::executor_count = 0; + +// Builds an executor on `device`. Uses `context` when given, otherwise creates one. +// Initializes the VIPLite driver if needed, then validates core_index and adjusts +// core_count so that core_index + core_count stays within the device's core count. +// Throws a string literal when the driver cannot be initialized or core_index is out of range. +LiteNativeExecutorImpl::LiteNativeExecutorImpl(const std::shared_ptr& device, + const int32_t core_count, const int32_t core_index, const std::shared_ptr& context) + { device_ = device; - context_ = Context::Create(); - database_ = VIP_NULL; + context_ = context; + if(context_ == nullptr) { + context_ = tim::vx::Context::Create(); + } + auto fixed_core_count = core_count; + int32_t fixed_core_index = core_index; + vip_status_e status = VIP_SUCCESS; + if( !LiteNativeDevice::vip_initialized ) { + status = vip_init(); + if(status != VIP_SUCCESS){ + throw "Initialize viplite driver fail"; + } + // Record the successful init so the driver is not initialized a second time. + LiteNativeDevice::vip_initialized = true; + } + int32_t total_core_count = (int32_t)device->CoreCount(); + if (fixed_core_index < 0) + { + fixed_core_index = 0; + } + if (fixed_core_index > total_core_count - 1){ + throw "Core index is larger than total core count."; + } + // A non-positive core_count means "all cores from core_index onwards". + if (fixed_core_count <= 0 ) { + fixed_core_count = total_core_count - fixed_core_index; + } - vip_init(); - vip_query_database(&database_); - nbg_linker_init(database_); + if (fixed_core_index + fixed_core_count > total_core_count) { + fixed_core_count = total_core_count - fixed_core_index; + VSILOGW( + "Core_index + core_count is larger than total core count.
Fix core " + "count to %d", + fixed_core_count); + } + core_index_ = (uint32_t)fixed_core_index; + core_count_ = (uint32_t)fixed_core_count; + +#ifdef VSI_DEVICE_SUPPORT + vsi_nn_device_t vsi_devices[VSI_MAX_DEVICES] = {0}; + vsi_size_t num_devices = 0; + vsi_size_t available_core_count = 0; + auto ctx = dynamic_cast(context_.get()); + vsi_nn_GetDevices(ctx->context(), vsi_devices, &num_devices); + + //Always use device 0 to compile NBG. + vsi_nn_GetDeviceCoreCount(vsi_devices[0], &available_core_count); + + if(core_index_ + core_count_ > (uint32_t)available_core_count) { + VSILOGE("the used core count is larger than compiler available core count"); + assert(false); + } + vsi_nn_CreateSubDevice(vsi_devices[0], core_index_, core_count_, &sub_device_); +#else + VSILOGE("device is not supported!"); + assert(false); +#endif + + executor_count++; } -LiteNativeExecutor::~LiteNativeExecutor() { - nbg_destroy_task(task_descriptor_); - nbg_linker_destroy(); - vip_destroy(); +// Releases the sub device (when device support is compiled in) and shuts the VIPLite +// driver down once the last executor is gone. +LiteNativeExecutorImpl::~LiteNativeExecutorImpl() { +#ifdef VSI_DEVICE_SUPPORT + if(sub_device_) + vsi_nn_ReleaseDevice(&sub_device_); +#endif + executor_count--; + if(executor_count <1) { + vip_destroy(); + // The driver is gone: clear the flag so the next Enumerate() or executor calls vip_init() again. + LiteNativeDevice::vip_initialized = false; + } } -bool LiteNativeExecutor::Submit(const std::shared_ptr& executable, +bool LiteNativeExecutorImpl::Submit(const std::shared_ptr& executable, const std::shared_ptr& ref, bool after) { bool success = false; + success = executable->Verify(); + if (success == false) { + VSILOGE("Executable NBG compile failed"); + return false; + } if (executable == ref) { tasks_.push_back(executable); return true; @@ -72,239 +238,285 @@ bool LiteNativeExecutor::Submit(const std::shared_ptr& executable, return success; } -bool LiteNativeExecutor::Trigger(bool async) { +bool LiteNativeExecutorImpl::Trigger(bool async) { (void)async; - vip_status_e status = VIP_SUCCESS; - std::vector networks; - for (auto exe : tasks_) { - auto task = exe.lock(); - task->Verify(); - vip_network& network = -
std::dynamic_pointer_cast(task)->network_; - networks.push_back(std::move(network)); - } - status = nbg_create_task(networks.size(), networks.data(), &task_descriptor_); - if (status != VIP_SUCCESS) { - VSILOGE("create task descriptor fail"); - return false; - } - status = vip_trigger_task(task_descriptor_); - if (status != VIP_SUCCESS) { - VSILOGE("trigger task descriptor fail"); - return false; - } - status = vip_wait_task(task_descriptor_); - if (status != VIP_SUCCESS) { - VSILOGE("wait task descriptor fail"); - // nbg_gen_capture(networks.size(), networks.data()); - return false; + // Drains the task queue in order; the result is false if any task failed to run. + bool ret = true; + while (!tasks_.empty()) { + auto task = tasks_.front(); + tasks_.erase(tasks_.begin()); + auto task_tmp = task.lock(); + if (!task_tmp) { + VSILOGE("Task is empty"); + return false; + } + // Keep running the remaining tasks, but remember the failure. + ret = task_tmp->Trigger() && ret; } - return true; + return ret; } -std::shared_ptr LiteNativeExecutor::Compile( +// Compiles `graph` to a binary (NBG) and wraps it in an executable bound to this executor. +// Returns nullptr when either compilation step fails. +std::shared_ptr LiteNativeExecutorImpl::Compile( const std::shared_ptr& graph) { - GraphImpl* graphimp = dynamic_cast(graph.get()); - IDevice::device_id_t id = device_->Id(); - vxSetGraphAttribute(graphimp->graph()->g, VX_GRAPH_DEVICE_INDEX_VIV, - (void*)(&id), sizeof(id)); size_t bin_size = -1; - graph->CompileToBinary(nullptr, &bin_size); std::vector nb_buf; +#ifdef VSI_DEVICE_SUPPORT + GraphImpl* graphimp = dynamic_cast(graph.get()); + vsi_nn_BindDevices(graphimp->graph(), 1, &sub_device_); +#endif + // First call only queries the binary size. + auto ret = graph->CompileToBinary(nullptr, &bin_size); + // Stop before sizing the buffer: on failure bin_size may still hold its (size_t)-1 initial value. + if(!ret) { + VSILOGE("Compile fail"); + return nullptr; + } nb_buf.resize(bin_size); - graph->CompileToBinary(nb_buf.data(), &bin_size); - return std::make_shared(shared_from_this(), nb_buf); + // Second call writes the binary; it must succeed on its own. + ret = graph->CompileToBinary(nb_buf.data(), &bin_size); + if(!ret) { + VSILOGE("Compile fail"); + return nullptr; + } + + std::shared_ptr this_sp = shared_from_this(); + auto executable = std::make_shared(this_sp, nb_buf); + return executable; } -LiteNativeExecutable::LiteNativeExecutable( +LiteNativeExecutableImpl::LiteNativeExecutableImpl( const std::shared_ptr& executor, const std::vector& nb_buf) { executor_ = executor; -
context_ = executor->Contex(); - nb_graph_ = context_->CreateGraph(); - nbg_create_network(nb_buf.data(), nb_buf.size(), - VIP_CREATE_NETWORK_FROM_MEMORY, &network_); - input_count_ = 0; - output_count_ = 0; - coeff_ = nullptr; - command_ = nullptr; - memory_pool_ = nullptr; - others_ = nullptr; - pre_command_ = nullptr; + context_ = nullptr; + nb_graph_ = nullptr; + vip_status_e status = VIP_SUCCESS; + vip_create_network_param_t net_param; + device_id_ = executor_.lock()->Device()->Id(); + auto core_index = executor_.lock()->CoreIndex(); + net_param.device_index = device_id_; + net_param.prop = VIP_NET_CREATE_PROP_FROM_NBG; + net_param.nbg.type = VIP_NET_CREATE_NBG_FROM_MEMORY; + net_param.nbg.memory.nbg_memory = (void*)nb_buf.data(); + net_param.nbg.memory.nbg_size = nb_buf.size(); - /* prepare vip network */ - vip_status_e status = VIP_SUCCESS; - nbg_network_memory_size_t buffer_size; - nbg_network_memory_buffer_t buffer; - vip_memory_t coeff_buffer; - vip_memory_t cmd_buffer; - vip_memory_t pre_cmd_buffer; - vip_memory_t pool_buffer; - vip_memory_t others_buffer; - nbg_query_network(network_, VIP_NETWORK_PROP_MEMORY_SIZE, &buffer_size); + auto network(std::make_unique(net_param)); - vip_allocate_videomemory(buffer_size.coeff, &coeff_); - vip_allocate_videomemory(buffer_size.command, &command_); - vip_allocate_videomemory(buffer_size.memory_pool, &memory_pool_); - vip_allocate_videomemory(buffer_size.others, &others_); - vip_allocate_videomemory(buffer_size.pre_command, &pre_command_); - - SetBuffer(&coeff_buffer, coeff_); - SetBuffer(&cmd_buffer, command_); - SetBuffer(&pre_cmd_buffer, pre_command_); - SetBuffer(&pool_buffer, memory_pool_); - SetBuffer(&others_buffer, others_); - - buffer.coeff = &coeff_buffer; - buffer.command = &cmd_buffer; - buffer.memory_pool = &pool_buffer; - buffer.others = &others_buffer; - buffer.pre_command = &pre_cmd_buffer; - buffer.dma_command = nullptr; - status = nbg_prepare_network(network_, &buffer); - - 
vip_flush_videomemory(coeff_, VIP_BUFFER_OPER_TYPE_FLUSH); - vip_flush_videomemory(command_, VIP_BUFFER_OPER_TYPE_FLUSH); - vip_flush_videomemory(pre_command_, VIP_BUFFER_OPER_TYPE_FLUSH); - vip_flush_videomemory(memory_pool_, VIP_BUFFER_OPER_TYPE_FLUSH); - vip_flush_videomemory(others_, VIP_BUFFER_OPER_TYPE_FLUSH); + lite_network_ = std::move(network); + status = lite_network_->Query(VIP_NETWORK_PROP_INPUT_COUNT,&input_count_); + if (status != VIP_SUCCESS) { + VSILOGE("failed to query network inputs"); + assert(false); + } + status = lite_network_->Query(VIP_NETWORK_PROP_OUTPUT_COUNT,&output_count_); + if (status != VIP_SUCCESS) { + VSILOGE("failed to query network outputs"); + assert(false); + } + status = lite_network_->Set(VIP_NETWORK_PROP_SET_CORE_INDEX,&core_index); + if (status != VIP_SUCCESS) { + VSILOGE("failed to set core index"); + assert(false); + } + status = lite_network_->Prepare(); if (status != VIP_SUCCESS) { VSILOGE("failed to prepare network"); assert(false); } } -LiteNativeExecutable::~LiteNativeExecutable() { - nbg_finish_network(network_); - nbg_destroy_network(network_); - if (coeff_) { - vip_free_videomemory(coeff_); - coeff_ = nullptr; - } - if (command_) { - vip_free_videomemory(command_); - command_ = nullptr; - } - if (memory_pool_) { - vip_free_videomemory(memory_pool_); - memory_pool_ = nullptr; - } - if (others_) { - vip_free_videomemory(others_); - others_ = nullptr; - } - if (pre_command_) { - vip_free_videomemory(pre_command_); - pre_command_ = nullptr; - } -} - -void LiteNativeExecutable::SetInput(const std::shared_ptr& th) { +void LiteNativeExecutableImpl::SetInput(const std::shared_ptr& th) { vip_status_e status = VIP_SUCCESS; - gcvip_videomemory_t* mem = - std::dynamic_pointer_cast(th)->tensor_buffer_; - vip_memory_t buffer; - SetBuffer(&buffer, mem); - - status = nbg_set_input(network_, input_count_, &buffer); + int32_t input_index = input_handles_.size(); + status = lite_network_->SetInput(input_index, th); if (status != 
VIP_SUCCESS) { - VSILOGE("failed to set input: %d", input_count_); + VSILOGE("failed to set input: %d", input_index); assert(false); } - ++input_count_; + input_handles_.push_back(th); +} +void LiteNativeExecutableImpl::SetInputs(const std::vector>& ths) { + for (auto th : ths) { + SetInput(th); + } } -void LiteNativeExecutable::SetOutput(const std::shared_ptr& th) { +void LiteNativeExecutableImpl::SetOutput(const std::shared_ptr& th) { vip_status_e status = VIP_SUCCESS; - gcvip_videomemory_t* mem = - std::dynamic_pointer_cast(th)->tensor_buffer_; - vip_memory_t buffer; - SetBuffer(&buffer, mem); - - status = nbg_set_output(network_, output_count_, &buffer); + int32_t output_index = output_handles_.size(); + status = lite_network_->SetOutput(output_index,th); if (status != VIP_SUCCESS) { - VSILOGE("failed to set output: %d", output_count_); + VSILOGE("failed to set output: %d", output_index); assert(false); } - ++output_count_; + output_handles_.push_back(th); } -void LiteNativeExecutable::GetOutput( - const std::vector>& th) { - (void)th; +void LiteNativeExecutableImpl::SetOutputs(const std::vector>& ths) { + for (auto th : ths) { + SetOutput(th); + } } -bool LiteNativeExecutable::Submit(const std::shared_ptr& ref, +bool LiteNativeExecutableImpl::Submit(const std::shared_ptr& ref, bool after) { bool status = false; + std::shared_ptr executor = + std::dynamic_pointer_cast(executor_.lock()); std::shared_ptr executable = shared_from_this(); - status = Executor()->Submit(executable, ref, after); + status = executor->Submit(executable, ref, after); return status; } -bool LiteNativeExecutable::Trigger(bool async) { - (void)async; - return false; -} - -bool LiteNativeExecutable::Verify() { - int32_t input_count = 0; - nbg_query_network(network_, VIP_NETWORK_PROP_INPUT_COUNT, &input_count); - if (input_count != input_count_) { - VSILOGE("input count mismatch, required: %d, provided: %d", input_count, - input_count_); - return false; +bool 
LiteNativeExecutableImpl::Trigger(bool async) { + // Runs the prepared network once: Trigger() followed by Wait() when async is set, Run() otherwise. + // Returns false as soon as a driver call fails. + vip_status_e status = VIP_SUCCESS; + if (async) { + status = lite_network_->Trigger(); + // Check the trigger result on its own so a failed submission is not masked by Wait(). + if (status != VIP_SUCCESS) { + VSILOGE("trigger network fail"); + return false; + } + status = lite_network_->Wait(); + if (status != VIP_SUCCESS) { + VSILOGE("wait network fail"); + return false; + } + } else { + status = lite_network_->Run(); + if (status != VIP_SUCCESS) { + VSILOGE("run network fail"); + return false; + } } - int32_t output_count = 0; - nbg_query_network(network_, VIP_NETWORK_PROP_OUTPUT_COUNT, &output_count); - if (output_count != output_count_) { - VSILOGE("output count mismatch, required: %d, provided: %d", output_count, - output_count_); - return false; - } - return true; } -std::shared_ptr LiteNativeExecutable::AllocateTensor( - const TensorSpec& tensor_spec) { - auto tensor = nb_graph_->CreateTensor(tensor_spec); - return std::make_shared(tensor); +// Checks that the number of bound input/output handles matches what the network reported. +bool LiteNativeExecutableImpl::Verify() { + bool ret = true; + auto output_index = output_handles_.size(); + auto input_index = input_handles_.size(); + if(input_index != input_count_) { + // %d expects int; the handle count is a container size, so convert both arguments explicitly. + VSILOGE("Network need %d inputs but gaving %d.\n", (int)input_count_, (int)input_index); + ret = false; + } + if(output_index != output_count_) { + VSILOGE("Network need %d outputs but gaving %d.\n", (int)output_count_, (int)output_index); + ret = false; + } + + return ret; } -void LiteNativeExecutable::SetBuffer(vip_memory_t* dst, - gcvip_videomemory_t* src) { - if (dst && src) { - dst->cpu_logical = src->cpu_logical; - dst->npu_physical = src->npu_physical; - dst->size = src->size; +std::shared_ptr LiteNativeExecutableImpl::AllocateTensor(const TensorSpec& tensor_spec, + void* data, uint32_t size) { + return std::make_shared(tensor_spec, data, size, device_id_); +} + +LiteNativeTensorHandleImpl::LiteNativeTensorHandleImpl(const TensorSpec& tensor_spec, void* data, uint32_t size, + uint32_t device_id) { + vip_status_e status = VIP_ERROR_FAILURE; + spec_ = tensor_spec; + uint32_t tensor_size = tensor_spec.GetByteSize(); + vip_buffer_create_params_t tensor_param; + + uint32_t
block_aligned_size = 64; + memory_type_ = ALLOC_MEM_NONE; + handle_ = nullptr; + handle_size_ = 0; + if(size > 0 && !data && tensor_size > size ) { + VSILOGE("Buffer size is less than the memory size required by the tensor"); + assert(false); + } +#if 0 + uint32_t addr_aligned_size = 256; + if (!data) { + data = vsi_nn_MallocAlignedBuffer(tensor_size,addr_aligned_size,block_aligned_size); + size = ((tensor_size + block_aligned_size - 1) / block_aligned_size) * block_aligned_size; + memory_type_ = ALLOC_MEM_INTERNAL; + } else { + memory_type_ = ALLOC_MEM_EXTERNAL; + } + handle_ = data; + if(!vsi_nn_IsBufferAligned((uint8_t *)handle_, addr_aligned_size)) { + VSILOGE("The starting address of the buffer needs to be 64-byte aligned"); + assert(false); + } + if(size % 64 != 0) { + VSILOGE("The size of the buffer needs to be 64-byte aligned"); + assert(false); + } + handle_size_ = size; + tensor_param.type = VIP_BUFFER_CREATE_FROM_USER_MEM; + tensor_param.device_index = device_id ; + tensor_param.src.from_handle.memory_type = VIP_BUFFER_FROM_USER_MEM_TYPE_HOST; + tensor_param.src.from_handle.logical_addr = handle_; + tensor_param.src.from_handle.size = handle_size_; + status = vip_create_buffer(&tensor_param,sizeof(tensor_param),&tensor_buffer_); +#else + (void)data; + tensor_param.type = VIP_BUFFER_CREATE_ALLOC_MEM; + tensor_param.device_index = device_id ; + tensor_param.src.alloc_mem.size = tensor_size; + tensor_param.src.alloc_mem.align = block_aligned_size; + status = vip_create_buffer(&tensor_param,sizeof(tensor_param),&tensor_buffer_); + memory_type_ = ALLOC_MEM_VIDEOMEM; +#endif + if(status != VIP_SUCCESS) { + if(memory_type_ == ALLOC_MEM_INTERNAL) { + vsi_nn_FreeAlignedBuffer((uint8_t*)handle_); + } + VSILOGE("Fail to create vip buffer."); + assert(false); } } -LiteNativeTensorHandle::LiteNativeTensorHandle( - const std::shared_ptr& tensor) { - tensor_ = tensor; - uint32_t size = tensor->GetSpec().GetByteSize(); - vip_allocate_videomemory(size, &tensor_buffer_); 
-} - -LiteNativeTensorHandle::~LiteNativeTensorHandle() { +LiteNativeTensorHandleImpl::~LiteNativeTensorHandleImpl() { if (tensor_buffer_) { - vip_free_videomemory(tensor_buffer_); + vip_destroy_buffer(tensor_buffer_); tensor_buffer_ = nullptr; } + if(memory_type_ == ALLOC_MEM_INTERNAL && handle_) { + vsi_nn_FreeAlignedBuffer((uint8_t*)handle_); + + } } -bool LiteNativeTensorHandle::CopyDataToTensor(const void* data, - uint32_t size_in_bytes) { - memcpy(tensor_buffer_->cpu_logical, data, size_in_bytes); +// Copies up to size_in_bytes (capped at the buffer size) from data into the tensor buffer +// and flushes it. Returns false on a null source, a missing CPU address or a failed flush. +bool LiteNativeTensorHandleImpl::CopyDataToTensor(const void* data, + uint32_t size_in_bytes) { + if(!data) { + VSILOGE("Source data is null."); + return false; + } + void* handle = handle_; + if(memory_type_ == ALLOC_MEM_VIDEOMEM) { + handle = vip_map_buffer(tensor_buffer_); + } + // No CPU address to write to (for example the mapping failed): do not hand null to memcpy. + if(!handle) { + VSILOGE("Tensor buffer has no CPU address."); + return false; + } + auto buff_size = vip_get_buffer_size(tensor_buffer_); + memcpy(handle, data, buff_size > size_in_bytes ? size_in_bytes : buff_size); + if(memory_type_ == ALLOC_MEM_VIDEOMEM) { + vip_unmap_buffer(tensor_buffer_); + } + // Report a failed flush instead of unconditionally returning true. + return Flush(); - return true; } -bool LiteNativeTensorHandle::CopyDataFromTensor(void* data) { - memcpy(data, tensor_buffer_->cpu_logical, tensor_buffer_->size); - return true; +// Invalidates the tensor buffer and copies its whole content into data. +// Returns false on a null destination, a failed invalidate or a missing CPU address. +bool LiteNativeTensorHandleImpl::CopyDataFromTensor(void* data) { + if(!data) { + VSILOGE("Destination data is null."); + return false; + } + bool ret = Invalidate(); + if(ret) { + void* handle = handle_; + auto buff_size = vip_get_buffer_size(tensor_buffer_); + if(memory_type_ == ALLOC_MEM_VIDEOMEM) { + handle = vip_map_buffer(tensor_buffer_); + } + // No CPU address to read from (for example the mapping failed): do not hand null to memcpy. + if(!handle) { + VSILOGE("Tensor buffer has no CPU address."); + return false; + } + memcpy(data, handle, buff_size); + if(memory_type_ == ALLOC_MEM_VIDEOMEM) { + vip_unmap_buffer(tensor_buffer_); + } + } + + return ret; +} + +bool LiteNativeTensorHandleImpl::Flush() { + vip_status_e status = vip_flush_buffer(tensor_buffer_,VIP_BUFFER_OPER_TYPE_FLUSH); + if (status != VIP_SUCCESS) { + return false; + } + else{ + return true; + } +} +bool LiteNativeTensorHandleImpl::Invalidate() { + vip_status_e status = vip_flush_buffer(tensor_buffer_,VIP_BUFFER_OPER_TYPE_INVALIDATE); + if (status != VIP_SUCCESS) { + return false; + } + else{ + return true; + } } } // namespace platform diff --git
a/src/tim/vx/platform/lite/lite_native_private.h b/src/tim/vx/platform/lite/lite_native_private.h new file mode 100644 index 0000000..b371945 --- /dev/null +++ b/src/tim/vx/platform/lite/lite_native_private.h @@ -0,0 +1,147 @@ +/**************************************************************************** +* +* Copyright (c) 2020-2023 Vivante Corporation +* +* Permission is hereby granted, free of charge, to any person obtaining a +* copy of this software and associated documentation files (the "Software"), +* to deal in the Software without restriction, including without limitation +* the rights to use, copy, modify, merge, publish, distribute, sublicense, +* and/or sell copies of the Software, and to permit persons to whom the +* Software is furnished to do so, subject to the following conditions: +* +* The above copyright notice and this permission notice shall be included in +* all copies or substantial portions of the Software. +* +* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +* AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING +* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER +* DEALINGS IN THE SOFTWARE. 
+* +*****************************************************************************/ +#ifndef TIM_VX_LITE_NATIVE_DEVICE_PRIVATE_H_ +#define TIM_VX_LITE_NATIVE_DEVICE_PRIVATE_H_ + +#include "tim/vx/platform/lite/lite_native.h" +#include "vip_lite.h" +#include "vsi_nn_pub.h" + + +namespace tim { +namespace vx { + +namespace platform { + +// Thin owner of a VIPLite network handle: created in the constructor, finished and +// destroyed in the destructor; the methods forward to the matching vip_* calls. +class LiteNetwork +{ +public: + LiteNetwork(vip_create_network_param_t& param); + ~LiteNetwork(); + // The destructor destroys network_, so a copy would destroy the same handle twice. + LiteNetwork(const LiteNetwork&) = delete; + LiteNetwork& operator=(const LiteNetwork&) = delete; + vip_status_e Query(vip_enum property, void* value); + vip_status_e Set(vip_enum property, void* value); + vip_status_e Prepare(); + vip_status_e Run(); + vip_status_e Trigger(); + vip_status_e Wait(); + vip_status_e Cancel(); + vip_status_e QueryInput(vip_uint32_t index, vip_enum property, void* value); + vip_status_e QueryOutput(vip_uint32_t index, vip_enum property, void* value); + vip_status_e SetInput(vip_uint32_t index, std::shared_ptr input); + vip_status_e SetOutput(vip_uint32_t index, std::shared_ptr output); + +private: + // Value-initialized so the handle is never indeterminate if creation fails. + vip_network network_{}; +}; + +class LiteNativeDeviceImpl : public LiteNativeDevice, + public std::enable_shared_from_this { + public: + LiteNativeDeviceImpl(device_id_t id,uint32_t core_count); + ~LiteNativeDeviceImpl() {}; + + bool Submit(const std::shared_ptr& graph) override; + bool Trigger(bool async = false, async_callback cb = NULL) override; + bool DeviceExit() override; + void WaitDeviceIdle() override; + std::shared_ptr CreateExecutor(const int32_t core_index = 0, + const int32_t core_count = -1, + const std::shared_ptr& context = nullptr) override; +}; + +class LiteNativeExecutorImpl + : public LiteNativeExecutor, + public std::enable_shared_from_this { + public: + // Parameter order matches the out-of-line definition and the CreateExecutor() call: + // core_count first, then core_index. A non-positive core_count selects all cores + // from core_index onwards. + LiteNativeExecutorImpl(const std::shared_ptr& device, + const int32_t core_count = -1, + const int32_t core_index = 0, + const std::shared_ptr& context = nullptr); + virtual ~LiteNativeExecutorImpl(); + bool Submit(const std::shared_ptr& executable, + const std::shared_ptr& ref, + bool after = true) override; + bool Trigger(bool async =
false) override; + std::shared_ptr Compile(const std::shared_ptr& graph) override; + static int executor_count; + +private: +#ifdef VSI_DEVICE_SUPPORT + // Value-initialized: the destructor tests this handle even if sub device creation did not set it. + vsi_nn_device_t sub_device_{}; +#endif +}; + +class LiteNativeExecutableImpl : public LiteNativeExecutable { + public: + LiteNativeExecutableImpl(const std::shared_ptr& executor, + const std::vector& nb_buf); + virtual ~LiteNativeExecutableImpl() {}; + void SetInput(const std::shared_ptr& th) override; + void SetOutput(const std::shared_ptr& th) override; + void SetInputs(const std::vector>& ths) override; + void SetOutputs(const std::vector>& ths) override; + bool Submit(const std::shared_ptr& ref, bool after) override; + bool Trigger(bool async) override; + bool Verify() override; + std::shared_ptr AllocateTensor(const TensorSpec& tensor_spec, + void* data = nullptr, uint32_t size = 0) override; + + private: + uint32_t device_id_ = 0; + // Filled from the network queries in the constructor; zero until then so Verify() never reads garbage. + uint32_t input_count_ = 0; + uint32_t output_count_ = 0; + std::unique_ptr lite_network_; +}; + +class LiteNativeTensorHandleImpl : public LiteNativeTensorHandle { + public: + typedef enum { + ALLOC_MEM_NONE, + ALLOC_MEM_EXTERNAL, + ALLOC_MEM_INTERNAL, + ALLOC_MEM_VIDEOMEM, + ALLOC_MEM_PHYSICAL, + ALLOC_MEM_FD, + } alloc_mem_type; + + LiteNativeTensorHandleImpl(const TensorSpec& tensor_spec,void* data, uint32_t size,uint32_t device_id); + virtual ~LiteNativeTensorHandleImpl(); + bool CopyDataToTensor(const void* data, uint32_t size_in_bytes) override; + bool CopyDataFromTensor(void* data) override; + bool Flush(); + bool Invalidate(); + vip_buffer GetBuffer() {return tensor_buffer_;}; + +private: + // Value-initialized: the destructor tests this handle even when buffer creation failed. + vip_buffer tensor_buffer_{}; + void* handle_ = nullptr; + uint32_t handle_size_ = 0; + alloc_mem_type memory_type_ = ALLOC_MEM_NONE; +}; + +} // namespace platform +} // namespace vx +} // namespace tim + +#endif /* TIM_VX_LITE_NATIVE_DEVICE_PRIVATE_H_ */ diff --git a/src/tim/vx/platform/native.cc b/src/tim/vx/platform/native.cc index 81c9c3a..038f38d 100644 --- a/src/tim/vx/platform/native.cc +++ b/src/tim/vx/platform/native.cc @@ -22,9
+22,14 @@ * *****************************************************************************/ #include "tim/vx/platform/native.h" -#include "native_device_private.h" +#include "native_private.h" +#include "context_private.h" #include "tim/vx/ops/nbg.h" +#ifdef ENABLE_PLATFORM_LITE +#include "tim/vx/platform/lite/lite_native.h" +#endif +#include namespace tim { namespace vx { namespace platform { @@ -35,215 +40,203 @@ std::shared_ptr Compile( return executor->Compile(graph); } -std::shared_ptr CreateExecutableSet( - const std::vector>& executables) { - ExecutableSet* executable_set = new ExecutableSet(executables); - std::shared_ptr executable(executable_set); - return executable; +NativeDeviceImpl::NativeDeviceImpl(device_id_t id, uint32_t core_count) { + device_id_ = id; + core_count_ = core_count; +} +std::vector> IDevice::Enumerate() { +#ifdef ENABLE_PLATFORM_LITE + auto devices = tim::vx::platform::LiteNativeDevice::Enumerate(); +#else + auto devices = tim::vx::platform::NativeDevice::Enumerate(); +#endif + return devices; } - -IDevice::device_id_t IDevice::Id() const { return device_id_; } void IDevice::RemoteReset() {} -NativeDeviceImpl::NativeDeviceImpl(device_id_t id) { - vip_device_ = std::make_unique(id); - device_id_ = id; -} - bool NativeDeviceImpl::Submit(const std::shared_ptr& graph) { - GraphImpl* graphimp = - dynamic_cast(graph.get()); // hack to downcast - vsi_graph_v_.push_back(graphimp->graph()); + (void)graph; return true; } bool NativeDeviceImpl::Trigger(bool async, async_callback cb) { - // extract graph from tasks (void)async; - bool status = false; - while (!vsi_graph_v_.empty()) { - auto task = vsi_graph_v_.front(); - vsi_graph_v_.erase(vsi_graph_v_.begin()); - status = vip_device_->GraphSubmit(task, cb, NULL); - } - return status; + (void)cb; + return true; } -void NativeDeviceImpl::WaitDeviceIdle() { vip_device_->WaitThreadIdle(); } +void NativeDeviceImpl::WaitDeviceIdle() {} -bool NativeDeviceImpl::DeviceExit() { return 
vip_device_->ThreadExit(); } +bool NativeDeviceImpl::DeviceExit() { return true; } + +std::shared_ptr NativeDeviceImpl::CreateExecutor(const int32_t core_index, + const int32_t core_count, + const std::shared_ptr& context) { + std::shared_ptr this_sp = shared_from_this(); + auto executor = std::make_shared(this_sp, core_count,core_index,context); + return executor; +} +// Lists the devices visible through a temporary vsi_nn context. With VSI_DEVICE_SUPPORT each +// device carries its core count; otherwise the core count is reported as 0. +// The temporary context is released on every return path. std::vector> NativeDevice::Enumerate() { std::vector> device_v; - device_id_t deviceCount = 0; - vsi_nn_context_t context; - context = vsi_nn_CreateContext(); + vsi_nn_context_t context = vsi_nn_CreateContext(); + // Shared by both branches below; it must be declared only once in this scope. + vsi_size_t deviceCount = 0; +#ifdef VSI_DEVICE_SUPPORT + vsi_nn_device_t vsi_devices[VSI_MAX_DEVICES] = {0}; + vsi_status status = VSI_FAILURE; + + status = vsi_nn_GetDevices(context,vsi_devices,&deviceCount); + if(status != VSI_SUCCESS){ + VSILOGE("Get device count fail"); + // Do not leak the temporary context on the error path. + vsi_nn_ReleaseContext(&context); + return device_v; + } + + for (vsi_size_t i = 0; i < deviceCount; i++) { + vsi_size_t available_core_count = 0; + vsi_nn_GetDeviceCoreCount(vsi_devices[i],&available_core_count); + auto local_device = std::make_shared(i,available_core_count); + device_v.push_back(local_device); + } +#else vxQueryContext(context->c, VX_CONTEXT_DEVICE_COUNT_VIV, &deviceCount, sizeof(deviceCount)); - std::cout << "Device count = " << deviceCount << std::endl; for (device_id_t i = 0; i < deviceCount; i++) { - IDevice* local_device = new NativeDeviceImpl(i); - std::shared_ptr local_device_sp(local_device); - device_v.push_back(local_device_sp); + auto local_device = std::make_shared(i,0); + device_v.push_back(local_device); } + VSILOGE("VSI device API is not supportted, please upgrade Vivant SDK version >= 6.4.22 && ovxlib >= 1.2.26 !"); +#endif vsi_nn_ReleaseContext(&context); return device_v; } -std::shared_ptr IExecutable::NBGraph() const { return nb_graph_; } - -std::shared_ptr IExecutable::Executor() const { - auto executor = executor_.lock(); - if (!executor) { - std::cout << "Executor unable to lock
weak_ptr"; - } - return executor; -} - -NativeExecutable::NativeExecutable(const std::shared_ptr& executor, +NativeExecutableImpl::NativeExecutableImpl(const std::shared_ptr& executor, const std::vector& nb_buf, size_t inputs, size_t outputs) { - CompileOption opt; - opt.setDeviceId(executor->Device()->Id()); executor_ = executor; context_ = executor->Contex(); - nb_graph_ = context_->CreateGraph(opt); + nb_graph_ = context_->CreateGraph(); nb_buf_ = nb_buf; nb_node_ = nb_graph_->CreateOperation(nb_buf_.data(), inputs, outputs); } -void NativeExecutable::SetInput(const std::shared_ptr& th) { +void NativeExecutableImpl::SetInput(const std::shared_ptr& th) { nb_node_->BindInput(th->GetTensor()); + input_handles_.push_back(th); } -void NativeExecutable::SetOutput(const std::shared_ptr& th) { +void NativeExecutableImpl::SetInputs(const std::vector>& ths) { + for (auto& t : ths) { + SetInput(t); + } +} + +void NativeExecutableImpl::SetOutput(const std::shared_ptr& th) { nb_node_->BindOutput(th->GetTensor()); + output_handles_.push_back(th); } -void NativeExecutable::GetOutput( - const std::vector>& th) { - (void)th; +void NativeExecutableImpl::SetOutputs(const std::vector>& ths) { + for (auto& t : ths) { + SetOutput(t); + } + } -bool NativeExecutable::Submit(const std::shared_ptr& ref, +bool NativeExecutableImpl::Submit(const std::shared_ptr& ref, bool after) { bool status = false; std::shared_ptr executable = shared_from_this(); - status = Executor()->Submit(executable, ref, after); + std::shared_ptr executor = std::dynamic_pointer_cast(executor_.lock()); + status = executor->Submit(executable, ref, after); return status; } -bool NativeExecutable::Trigger(bool async) { +bool NativeExecutableImpl::Trigger(bool async) { (void)async; - bool status = false; - auto device = Executor()->Device(); - device->Submit(nb_graph_); - status = device->Trigger(); - device->WaitDeviceIdle(); + bool status = nb_graph_->Run(); return status; } -std::shared_ptr 
NativeExecutable::AllocateTensor( - const TensorSpec& tensor_spec) { - auto tensor = nb_graph_->CreateTensor(tensor_spec); - ITensorHandle* tensor_handle = new NativeTensorHandle(tensor); - std::shared_ptr tensor_handle_sp(tensor_handle); - return tensor_handle_sp; +std::shared_ptr NativeExecutableImpl::AllocateTensor(const TensorSpec& tensor_spec, + void* data, uint32_t size) { + (void)size; + auto tensor = nb_graph_->CreateTensor(tensor_spec,data); + return std::make_shared(tensor); } -bool NativeExecutable::Verify() { return nb_graph_->Compile(); } - -ExecutableSet::ExecutableSet( - const std::vector>& executables) { - executables_ = executables; - executor_ = executables[0]->Executor(); -} - -void ExecutableSet::SetInput(const std::shared_ptr& th) { - (void)th; -} - -void ExecutableSet::SetOutput(const std::shared_ptr& th) { - (void)th; -} - -void ExecutableSet::GetOutput( - const std::vector>& th) { - (void)th; -} - -bool ExecutableSet::Submit(const std::shared_ptr& ref, - bool after) { - bool status = false; - std::shared_ptr executable = shared_from_this(); - status = Executor()->Submit(executable, ref, after); - return status; -} - -bool ExecutableSet::Trigger(bool async) { - (void)async; - bool status = false; - auto device = Executor()->Device(); - for (auto executable : executables_) { - device->Submit(executable->NBGraph()); +bool NativeExecutableImpl::Verify() { + std::shared_ptr executor = std::dynamic_pointer_cast(executor_.lock()); + bool success = executor->BindDevices(NBGraph()); + if (success == false) { + VSILOGE("Executable bind device failed"); + return false; } - status = device->Trigger(); - device->WaitDeviceIdle(); - return status; -} - -std::shared_ptr ExecutableSet::AllocateTensor( - const TensorSpec& tensor_spec) { - std::shared_ptr tensor_handle_sp; - (void)tensor_spec; - return tensor_handle_sp; -} - -std::vector> ExecutableSet::Executables() const { - return executables_; -} - -bool ExecutableSet::Verify() { - bool status = false; - 
for (auto executable : executables_) { - status = executable->Verify(); + success = nb_graph_->Compile(); + return success; } - return status; -} -std::shared_ptr IExecutor::Contex() const { return context_; } - -NativeExecutor::NativeExecutor(const std::shared_ptr& device) { - device_ = device; - context_ = Context::Create(); -} - -NativeExecutor::NativeExecutor(const std::shared_ptr& device, +NativeExecutorImpl::NativeExecutorImpl(const std::shared_ptr& device, + const int32_t core_count, + const int32_t core_index, const std::shared_ptr& context) { device_ = device; - context_ = context; + if(!context) { + context_ = Context::Create(); + } else { + context_ = context; + } + auto fixed_core_count = core_count; + int32_t fixed_core_index = core_index; + int32_t total_core_count =(int32_t)device_->CoreCount(); + if (fixed_core_index < 0) { + fixed_core_index = 0; + } + if (fixed_core_index > total_core_count - 1) { + VSILOGE("Core index is larger than total core count"); + assert(false); + } + if (fixed_core_count <= 0 ) { + fixed_core_count = total_core_count - fixed_core_index; + } + + if (fixed_core_index + fixed_core_count > total_core_count) { + fixed_core_count = total_core_count - fixed_core_index; + VSILOGW( + "Core_index + core_count is larger than total core count. 
Fix core count to %d", fixed_core_count); + } + core_index_ = (uint32_t)fixed_core_index; + core_count_ = (uint32_t)fixed_core_count; +#ifdef VSI_DEVICE_SUPPORT + vsi_nn_device_t vsi_devices[VSI_MAX_DEVICES] = {0}; + vsi_size_t num_devices = 0; + auto ctx = dynamic_cast(context_.get()); + vsi_nn_GetDevices(ctx->context(),vsi_devices,&num_devices); + vsi_nn_CreateSubDevice(vsi_devices[device_->Id()],core_index_,core_count_,&sub_devices_); +#endif } -bool NativeExecutor::Submit(const std::shared_ptr& executable, +bool NativeExecutorImpl::Submit(const std::shared_ptr& executable, const std::shared_ptr& ref, bool after) { bool success = false; success = executable->Verify(); - if (success == false) { - std::cout << "Executable NBG compile failed"; + if(success == false) { + VSILOGE("Executable NBG compile failed"); return false; } - if (executable == ref) { + if(executable == ref) { tasks_.push_back(executable); return true; } - for (size_t i = 0; i < tasks_.size(); i++) { - if (tasks_[i].lock() == ref) { - if (after == true) { + for(size_t i = 0; i < tasks_.size(); i++) { + if(tasks_[i].lock() == ref) { + if(after == true) { tasks_.insert(tasks_.begin() + i + 1, executable); success = true; break; @@ -257,59 +250,81 @@ bool NativeExecutor::Submit(const std::shared_ptr& executable, return success; } -bool NativeExecutor::Trigger(bool async) { +bool NativeExecutorImpl::Trigger(bool async) { (void)async; - while (!tasks_.empty()) { + bool ret = false; + while(!tasks_.empty()) { auto task = tasks_.front(); tasks_.erase(tasks_.begin()); - auto task_ = task.lock(); - if (!task_) { - std::cout << "Task unable to lock weak_ptr"; + auto task_tmp = task.lock(); + if(!task_tmp) { + VSILOGE("Task unable to lock weak_ptr"); + return false; } - task_->Trigger(); + ret = task_tmp->Trigger(); } device_->WaitDeviceIdle(); - return true; + return ret; } -std::shared_ptr NativeExecutor::Compile( +std::shared_ptr NativeExecutorImpl::Compile( const std::shared_ptr& graph) { - - 
CompileOption option; - option.setDeviceId(device_->Id()); - graph->SetCompileOption(option); - + bool ret = BindDevices(graph); + if(!ret) { + return nullptr; + } size_t bin_size = -1; - graph->CompileToBinary(nullptr, &bin_size); + ret = graph->CompileToBinary(nullptr, &bin_size); + if(!ret) { + return nullptr; + } std::vector nb_buf; nb_buf.resize(bin_size); size_t inputs = graph->InputsTensor().size(); size_t outputs = graph->OutputsTensor().size(); - graph->CompileToBinary(nb_buf.data(), &bin_size); - std::shared_ptr this_sp = shared_from_this(); - IExecutable* executable = - new NativeExecutable(this_sp, nb_buf, inputs, outputs); - std::shared_ptr executable_sp(executable); - return executable_sp; + ret = graph->CompileToBinary(nb_buf.data(), &bin_size); + if(!ret) { + return nullptr; + } + std::shared_ptr this_sp = shared_from_this(); + auto executable = std::make_shared(this_sp, nb_buf,inputs,outputs); + return executable; } -std::shared_ptr IExecutor::Device() const { return device_; } -std::shared_ptr ITensorHandle::GetTensor() const { return tensor_; } +bool NativeExecutorImpl::BindDevices(const std::shared_ptr& graph){ + vsi_status status = VSI_SUCCESS; +#ifdef VSI_DEVICE_SUPPORT + GraphImpl* graphimp = dynamic_cast(graph.get()); + status = vsi_nn_BindDevices(graphimp->graph(), 1, &sub_devices_); +#else + CompileOption option; + option.setDeviceId(device_->Id()); + graph->SetCompileOption(option); +#endif + if(status == VSI_SUCCESS) { + return true; + } + else{ + return false; + } +} -NativeTensorHandle::NativeTensorHandle(const std::shared_ptr& tensor) { + +NativeTensorHandleImpl::NativeTensorHandleImpl(const std::shared_ptr& tensor) { tensor_ = tensor; + spec_ = tensor->GetSpec(); } -bool NativeTensorHandle::CopyDataToTensor(const void* data, +bool NativeTensorHandleImpl::CopyDataToTensor(const void* data, uint32_t size_in_bytes) { return tensor_->CopyDataToTensor(data, size_in_bytes); } -bool NativeTensorHandle::CopyDataFromTensor(void* data) { +bool 
NativeTensorHandleImpl::CopyDataFromTensor(void* data) { return tensor_->CopyDataFromTensor(data); } } // namespace platform } // namespace vx -} // namespace tim \ No newline at end of file +} // namespace tim diff --git a/src/tim/vx/platform/native_device_private.h b/src/tim/vx/platform/native_device_private.h deleted file mode 100644 index ad005f9..0000000 --- a/src/tim/vx/platform/native_device_private.h +++ /dev/null @@ -1,58 +0,0 @@ -/**************************************************************************** -* -* Copyright (c) 2020-2023 Vivante Corporation -* -* Permission is hereby granted, free of charge, to any person obtaining a -* copy of this software and associated documentation files (the "Software"), -* to deal in the Software without restriction, including without limitation -* the rights to use, copy, modify, merge, publish, distribute, sublicense, -* and/or sell copies of the Software, and to permit persons to whom the -* Software is furnished to do so, subject to the following conditions: -* -* The above copyright notice and this permission notice shall be included in -* all copies or substantial portions of the Software. -* -* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR -* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, -* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE -* AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER -* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING -* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER -* DEALINGS IN THE SOFTWARE. 
-* -*****************************************************************************/ -#ifndef TIM_VX_NATIVE_DEVICE_PRIVATE_H_ -#define TIM_VX_NATIVE_DEVICE_PRIVATE_H_ - -#include "tim/vx/platform/native.h" -#include "vip/virtual_device.h" -#include "graph_private.h" - -namespace tim { -namespace vx { - -class GraphImpl; - -namespace platform { - -class NativeDeviceImpl : public NativeDevice { - public: - NativeDeviceImpl(device_id_t id); - ~NativeDeviceImpl(){}; - - bool Submit(const std::shared_ptr& graph) override; - bool Trigger(bool async = false, async_callback cb = NULL) override; - bool DeviceExit() override; - void WaitDeviceIdle() override; - - protected: - std::unique_ptr vip_device_; - std::vector vsi_graph_v_; - -}; - -} // namespace platform -} // namespace vx -} // namespace tim - -#endif /* TIM_VX_NATIVE_DEVICE_PRIVATE_H_*/ \ No newline at end of file diff --git a/src/tim/vx/platform/native_private.h b/src/tim/vx/platform/native_private.h new file mode 100644 index 0000000..3d86a08 --- /dev/null +++ b/src/tim/vx/platform/native_private.h @@ -0,0 +1,106 @@ +/**************************************************************************** +* +* Copyright (c) 2020-2025 Vivante Corporation +* +* Permission is hereby granted, free of charge, to any person obtaining a +* copy of this software and associated documentation files (the "Software"), +* to deal in the Software without restriction, including without limitation +* the rights to use, copy, modify, merge, publish, distribute, sublicense, +* and/or sell copies of the Software, and to permit persons to whom the +* Software is furnished to do so, subject to the following conditions: +* +* The above copyright notice and this permission notice shall be included in +* all copies or substantial portions of the Software. 
+* +* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +* AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING +* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER +* DEALINGS IN THE SOFTWARE. +* +*****************************************************************************/ +#ifndef TIM_VX_NATIVE_DEVICE_PRIVATE_H_ +#define TIM_VX_NATIVE_DEVICE_PRIVATE_H_ + +#include "tim/vx/platform/native.h" +#include "vip/virtual_device.h" +#include "graph_private.h" + +namespace tim { +namespace vx { + +class GraphImpl; + +namespace platform { + +class NativeDeviceImpl : public NativeDevice, + public std::enable_shared_from_this{ + public: + NativeDeviceImpl(device_id_t id,uint32_t core_count); + ~NativeDeviceImpl(){}; + + bool Submit(const std::shared_ptr& graph) override; + bool Trigger(bool async = false, async_callback cb = NULL) override; + bool DeviceExit() override; + void WaitDeviceIdle() override; + std::shared_ptr CreateExecutor(const int32_t core_index = 0, + const int32_t core_count = -1, + const std::shared_ptr& context = nullptr) override; +}; + +class NativeExecutableImpl : public NativeExecutable { + public: + NativeExecutableImpl(const std::shared_ptr& executor, + const std::vector& nb_buf, size_t inputs, + size_t outputs); + ~NativeExecutableImpl() {}; + void SetInput(const std::shared_ptr& th) override; + void SetOutput(const std::shared_ptr& th) override; + void SetInputs(const std::vector>& ths) override; + void SetOutputs(const std::vector>& ths) override; + bool Submit(const std::shared_ptr& ref, bool after = true) override; + bool Trigger(bool async = false) override; + std::shared_ptr AllocateTensor(const TensorSpec& tensor_spec, + void* data = nullptr, uint32_t size 
= 0) override; + bool Verify() override; + + protected: + std::shared_ptr nb_node_; + std::vector nb_buf_; +}; + +class NativeExecutorImpl : public NativeExecutor, + public std::enable_shared_from_this { + public: + NativeExecutorImpl(const std::shared_ptr& device, + const int32_t core_count = -1, + const int32_t core_index = 0, + const std::shared_ptr& context = nullptr); + ~NativeExecutorImpl(){}; + bool Submit(const std::shared_ptr& executable, + const std::shared_ptr& ref, + bool after = true) override; + bool Trigger(bool async = false) override; + std::shared_ptr Compile(const std::shared_ptr& graph) override; + bool BindDevices(const std::shared_ptr& graph); + +private: +#ifdef VSI_DEVICE_SUPPORT + vsi_nn_device_t sub_devices_; +#endif +}; + +class NativeTensorHandleImpl : public NativeTensorHandle { + public: + NativeTensorHandleImpl(const std::shared_ptr& tensor); + bool CopyDataToTensor(const void* data, uint32_t size_in_bytes) override; + bool CopyDataFromTensor(void* data) override; +}; + +} // namespace platform +} // namespace vx +} // namespace tim + +#endif /* TIM_VX_NATIVE_DEVICE_PRIVATE_H_*/