Refine platform code and samples (#713)

* Refine platform code and samples

1. Support viplite v2 API
2. Unify the Lite and Native platform APIs so that the same code
   can run on different platforms through different compilation options
   (see the usage sketch below).
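For illustration, a minimal sketch of the unified usage this enables (a sketch only: graph construction is elided, the `run_once` helper is hypothetical, and the device/core arguments are example values mirroring the defaults declared in platform.h):

#include "tim/vx/context.h"
#include "tim/vx/graph.h"
#include "tim/vx/platform/platform.h"  // no Lite- or Native-specific header needed

int run_once(const std::shared_ptr<tim::vx::Context>& context,
             const std::shared_ptr<tim::vx::Graph>& graph) {
  // Enumerate() dispatches to LiteNativeDevice or NativeDevice depending on
  // whether ENABLE_PLATFORM_LITE was defined when the library was built.
  auto devices = tim::vx::platform::IDevice::Enumerate();
  auto device = devices[0];
  // core_index = 0, core_count = -1 (all remaining cores) are the declared defaults.
  auto executor = device->CreateExecutor(0, -1, context);
  auto executable = executor->Compile(graph);
  auto input = executable->AllocateTensor(graph->InputsTensor()[0]->GetSpec());
  auto output = executable->AllocateTensor(graph->OutputsTensor()[0]->GetSpec());
  executable->SetInput(input);
  executable->SetOutput(output);
  executable->Submit(executable);  // self-reference appends it to the task list
  return executable->Trigger() ? 0 : -1;
}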

Type: Code Improvement

Signed-off-by: Kee <xuke537@hotmail.com>

* Fix build error if VSI device API is not supported

Signed-off-by: Kee <xuke537@hotmail.com>

---------

Signed-off-by: Kee <xuke537@hotmail.com>
Kee 2025-10-13 13:15:31 +08:00 committed by GitHub
parent 6810d310d3
commit c4e75674fa
21 changed files with 1098 additions and 661 deletions

View File

@ -9,7 +9,11 @@ list(APPEND OVXDRV_INCLUDE_DIRS
if("${CONFIG}" STREQUAL "BUILDROOT")
set(VIV_SDK_DRIVER_PREFIX "usr/lib")
else()
set(VIV_SDK_DRIVER_PREFIX "drivers")
if(EXISTS ${EXTERNAL_VIV_SDK}/drivers)
set(VIV_SDK_DRIVER_PREFIX "drivers")
else()
set(VIV_SDK_DRIVER_PREFIX "lib")
endif()
endif()
message("using driver libs from ${EXTERNAL_VIV_SDK}/${VIV_SDK_DRIVER_PREFIX}")

View File

@ -25,72 +25,58 @@
#define TIM_VX_LITE_NATIVE_H_
#include "tim/vx/platform/platform.h"
#include "vip_lite.h"
#include "nbg_linker.h"
namespace tim {
namespace vx {
namespace platform {
class LiteNativeExecutor
: public IExecutor,
public std::enable_shared_from_this<LiteNativeExecutor> {
class LiteNativeDevice : public IDevice {
public:
LiteNativeExecutor(const std::shared_ptr<IDevice>& device);
virtual ~LiteNativeExecutor();
bool Submit(const std::shared_ptr<IExecutable>& executable,
const std::shared_ptr<IExecutable>& ref,
bool after = true) override;
bool Trigger(bool async = false) override;
std::shared_ptr<IExecutable> Compile(
const std::shared_ptr<Graph>& graph) override;
private:
vip_task_descriptor_t* task_descriptor_;
vip_database database_;
virtual ~LiteNativeDevice() {};
virtual bool Submit(const std::shared_ptr<Graph>& graph) = 0;
virtual bool Trigger(bool async = false, async_callback cb = NULL) = 0;
virtual bool DeviceExit() = 0;
virtual void WaitDeviceIdle() = 0;
virtual std::shared_ptr<IExecutor> CreateExecutor(const int32_t core_index = 0,
const int32_t core_count = -1,
const std::shared_ptr<Context>& context = nullptr) = 0;
static std::vector<std::shared_ptr<IDevice>> Enumerate();
static bool vip_initialized;
};
class LiteNativeExecutor
: public IExecutor {
public:
virtual ~LiteNativeExecutor() {};
virtual bool Submit(const std::shared_ptr<IExecutable>& executable,
const std::shared_ptr<IExecutable>& ref,
bool after = true) = 0;
virtual bool Trigger(bool async = false) = 0;
virtual std::shared_ptr<IExecutable> Compile(
const std::shared_ptr<Graph>& graph) = 0;
};
class LiteNativeExecutable : public IExecutable {
public:
LiteNativeExecutable(const std::shared_ptr<IExecutor>& executor,
const std::vector<char>& nb_buf);
virtual ~LiteNativeExecutable();
void SetInput(const std::shared_ptr<ITensorHandle>& th) override;
void SetOutput(const std::shared_ptr<ITensorHandle>& th) override;
void GetOutput(
const std::vector<std::shared_ptr<ITensorHandle>>& th) override;
bool Submit(const std::shared_ptr<IExecutable>& ref, bool after) override;
bool Trigger(bool async) override;
bool Verify() override;
std::shared_ptr<ITensorHandle> AllocateTensor(
const TensorSpec& tensor_spec) override;
vip_network network_;
private:
void SetBuffer(vip_memory_t* dst, gcvip_videomemory_t* src);
int32_t input_count_;
int32_t output_count_;
gcvip_videomemory_t* coeff_;
gcvip_videomemory_t* command_;
gcvip_videomemory_t* memory_pool_;
gcvip_videomemory_t* others_;
gcvip_videomemory_t* pre_command_;
virtual ~LiteNativeExecutable() {};
virtual void SetInput(const std::shared_ptr<ITensorHandle>& th) = 0;
virtual void SetOutput(const std::shared_ptr<ITensorHandle>& th) = 0;
virtual void SetInputs(const std::vector<std::shared_ptr<ITensorHandle>>& ths) = 0;
virtual void SetOutputs(const std::vector<std::shared_ptr<ITensorHandle>>& ths) = 0;
virtual bool Submit(const std::shared_ptr<IExecutable>& ref, bool after) = 0;
virtual bool Trigger(bool async) = 0;
virtual bool Verify() = 0;
virtual std::shared_ptr<ITensorHandle> AllocateTensor(const TensorSpec& tensor_spec,
void* data = nullptr, uint32_t size = 0) = 0;
};
class LiteNativeTensorHandle : public ITensorHandle {
public:
LiteNativeTensorHandle(const std::shared_ptr<Tensor>& tensr);
virtual ~LiteNativeTensorHandle();
bool CopyDataToTensor(const void* data, uint32_t size_in_bytes) override;
bool CopyDataFromTensor(void* data) override;
gcvip_videomemory_t* tensor_buffer_;
virtual ~LiteNativeTensorHandle() {};
bool CopyDataToTensor(const void* data, uint32_t size_in_bytes) = 0;
bool CopyDataFromTensor(void* data) = 0;
};
} // namespace platform
} // namespace vx
} // namespace tim
#endif
#endif

View File

@ -37,51 +37,41 @@ class NativeDevice : public IDevice {
virtual bool Trigger(bool async = false, async_callback cb = NULL) = 0;
virtual bool DeviceExit() = 0;
virtual void WaitDeviceIdle() = 0;
virtual std::shared_ptr<IExecutor> CreateExecutor(const int32_t core_index = 0,
const int32_t core_count = -1,
const std::shared_ptr<Context>& context = nullptr) = 0;
static std::vector<std::shared_ptr<IDevice>> Enumerate();
};
class NativeExecutable : public IExecutable {
public:
NativeExecutable(const std::shared_ptr<IExecutor>& executor,
const std::vector<char>& nb_buf, size_t inputs,
size_t outputs);
~NativeExecutable(){};
void SetInput(const std::shared_ptr<ITensorHandle>& th) override;
void SetOutput(const std::shared_ptr<ITensorHandle>& th) override;
void GetOutput(
const std::vector<std::shared_ptr<ITensorHandle>>& th) override;
bool Submit(const std::shared_ptr<IExecutable>& ref,
bool after = true) override;
bool Trigger(bool async = false) override;
std::shared_ptr<ITensorHandle> AllocateTensor(
const TensorSpec& tensor_spec) override;
bool Verify() override;
protected:
std::shared_ptr<tim::vx::ops::NBG> nb_node_;
std::vector<char> nb_buf_;
virtual ~NativeExecutable() {};
virtual void SetInput(const std::shared_ptr<ITensorHandle>& th) = 0;
virtual void SetOutput(const std::shared_ptr<ITensorHandle>& th) = 0;
virtual void SetInputs(const std::vector<std::shared_ptr<ITensorHandle>>& ths) = 0;
virtual void SetOutputs(const std::vector<std::shared_ptr<ITensorHandle>>& ths) = 0;
virtual bool Submit(const std::shared_ptr<IExecutable>& ref,
bool after = true) = 0;
virtual bool Trigger(bool async = false) = 0;
virtual std::shared_ptr<ITensorHandle> AllocateTensor(const TensorSpec& tensor_spec,
void* data = nullptr, uint32_t size = 0) = 0;
virtual bool Verify() = 0;
};
class NativeExecutor : public IExecutor,
public std::enable_shared_from_this<NativeExecutor> {
class NativeExecutor : public IExecutor {
public:
NativeExecutor(const std::shared_ptr<IDevice>& device);
NativeExecutor(const std::shared_ptr<IDevice>& device,
const std::shared_ptr<Context>& context);
~NativeExecutor(){};
bool Submit(const std::shared_ptr<IExecutable>& executable,
const std::shared_ptr<IExecutable>& ref,
bool after = true) override;
bool Trigger(bool async = false) override;
std::shared_ptr<IExecutable> Compile(
const std::shared_ptr<Graph>& graph) override;
virtual ~NativeExecutor(){};
virtual bool Submit(const std::shared_ptr<IExecutable>& executable,
const std::shared_ptr<IExecutable>& ref,
bool after = true) = 0;
virtual bool Trigger(bool async = false) = 0;
virtual std::shared_ptr<IExecutable> Compile(const std::shared_ptr<Graph>& graph) = 0;
};
class NativeTensorHandle : public ITensorHandle {
public:
NativeTensorHandle(const std::shared_ptr<Tensor>& tensor);
bool CopyDataToTensor(const void* data, uint32_t size_in_bytes) override;
bool CopyDataFromTensor(void* data) override;
virtual bool CopyDataToTensor(const void* data, uint32_t size_in_bytes) = 0;
virtual bool CopyDataFromTensor(void* data) = 0;
};
} // namespace platform

View File

@ -46,15 +46,12 @@ namespace platform {
class IDevice;
class IExecutable;
class ExecutableSet;
class IExecutor;
class ITensorHandle;
std::shared_ptr<IExecutable> Compile(
const std::shared_ptr<Graph>& graph,
const std::shared_ptr<IExecutor>& executor);
std::shared_ptr<IExecutable> CreateExecutableSet(
const std::vector<std::shared_ptr<IExecutable>>& executables);
class IDevice {
public:
@ -68,17 +65,25 @@ class IDevice {
virtual ~IDevice(){};
virtual bool Submit(const std::shared_ptr<Graph>& graph) = 0;
virtual bool Trigger(bool async = false, async_callback cb = NULL) = 0;
device_id_t Id() const;
device_id_t Id() const { return device_id_;};
virtual void WaitDeviceIdle() = 0;
virtual bool DeviceExit() = 0;
virtual void RemoteReset();
uint32_t CoreCount() const {return core_count_;};
virtual std::shared_ptr<IExecutor> CreateExecutor(const int32_t core_index = 0,
const int32_t core_count = -1,
const std::shared_ptr<Context>& context = nullptr) = 0;
static std::vector<std::shared_ptr<IDevice>> Enumerate();
protected:
device_id_t device_id_;
uint32_t core_count_;
};
class IExecutor {
public:
//using task = std::shared_ptr<IExecutable>;
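// weak_ptr (rather than shared_ptr) avoids the executor owning its
// executables and any reference cycles; callers presumably must keep the
// executables alive until Trigger() runs the queued tasks.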
using task = std::weak_ptr<IExecutable>;
virtual ~IExecutor(){};
virtual bool Submit(const std::shared_ptr<IExecutable>& executable,
@ -87,13 +92,17 @@ class IExecutor {
virtual bool Trigger(bool async = false) = 0; // todo: async=true
virtual std::shared_ptr<IExecutable> Compile(
const std::shared_ptr<Graph>& graph) = 0;
virtual std::shared_ptr<IDevice> Device() const;
virtual std::shared_ptr<Context> Contex() const;
virtual std::shared_ptr<IDevice> Device() const {return device_;};
virtual std::shared_ptr<Context> Contex() const {return context_;};
virtual uint32_t CoreIndex() const {return core_index_; };
virtual uint32_t CoreCount() const {return core_count_; };
protected:
std::vector<task> tasks_;
std::shared_ptr<IDevice> device_;
std::shared_ptr<Context> context_;
uint32_t core_index_;
uint32_t core_count_;
};
class IExecutable : public std::enable_shared_from_this<IExecutable> {
@ -101,40 +110,24 @@ class IExecutable : public std::enable_shared_from_this<IExecutable> {
virtual ~IExecutable(){};
virtual void SetInput(const std::shared_ptr<ITensorHandle>& th) = 0;
virtual void SetOutput(const std::shared_ptr<ITensorHandle>& th) = 0;
virtual void GetOutput(
const std::vector<std::shared_ptr<ITensorHandle>>& th) = 0; // for remote
virtual void SetInputs(const std::vector<std::shared_ptr<ITensorHandle>>& ths) = 0;
virtual void SetOutputs(const std::vector<std::shared_ptr<ITensorHandle>>& ths) = 0;
virtual std::vector<std::shared_ptr<ITensorHandle>> GetOutputs() { return output_handles_;};
virtual std::vector<std::shared_ptr<ITensorHandle>> GetInputs() { return input_handles_;};
virtual bool Submit(const std::shared_ptr<IExecutable>& ref,
bool after = true) = 0;
virtual bool Trigger(bool async = false) = 0; // todo: async=true
virtual bool Verify() = 0;
virtual std::shared_ptr<Graph> NBGraph() const;
virtual std::shared_ptr<ITensorHandle> AllocateTensor(
const TensorSpec& tensor_spec) = 0;
virtual std::shared_ptr<IExecutor> Executor() const;
std::shared_ptr<Graph> NBGraph() const {return nb_graph_;};
virtual std::shared_ptr<ITensorHandle> AllocateTensor(const TensorSpec& tensor_spec,
void* data = nullptr, uint32_t size = 0) = 0;
protected:
std::weak_ptr<IExecutor> executor_;
std::shared_ptr<Context> context_;
std::shared_ptr<Graph> nb_graph_;
};
class ExecutableSet : public IExecutable {
public:
ExecutableSet(const std::vector<std::shared_ptr<IExecutable>>& executables);
void SetInput(const std::shared_ptr<ITensorHandle>& th) override;
void SetOutput(const std::shared_ptr<ITensorHandle>& th) override;
void GetOutput(
const std::vector<std::shared_ptr<ITensorHandle>>& th) override;
bool Submit(const std::shared_ptr<IExecutable>& ref,
bool after = true) override;
bool Trigger(bool async = false) override;
bool Verify() override;
std::shared_ptr<ITensorHandle> AllocateTensor(
const TensorSpec& tensor_spec) override;
std::vector<std::shared_ptr<IExecutable>> Executables() const;
protected:
std::vector<std::shared_ptr<IExecutable>> executables_;
std::vector<std::shared_ptr<ITensorHandle>> input_handles_;
std::vector<std::shared_ptr<ITensorHandle>> output_handles_;
};
class ITensorHandle {
@ -142,13 +135,15 @@ class ITensorHandle {
virtual ~ITensorHandle(){};
virtual bool CopyDataToTensor(const void* data, uint32_t size_in_bytes) = 0;
virtual bool CopyDataFromTensor(void* data) = 0;
virtual std::shared_ptr<Tensor> GetTensor() const;
virtual std::shared_ptr<Tensor> GetTensor() const { return tensor_;};
virtual TensorSpec& GetSpec() { return spec_;};
protected:
std::shared_ptr<Tensor> tensor_;
TensorSpec spec_;
};
} // namespace platform
} // namespace vx
} // namespace tim
#endif
#endif

View File

@ -20,9 +20,7 @@ endif()
if(TIM_VX_ENABLE_PLATFORM)
add_subdirectory("lenet_multi_device")
add_subdirectory("multi_device")
if(${TIM_VX_ENABLE_PLATFORM_LITE})
add_subdirectory("lite_multi_device")
endif()
add_subdirectory("platform_sample")
if(TIM_VX_ENABLE_GRPC)
add_subdirectory("grpc")
endif()

View File

@ -11,5 +11,10 @@ target_include_directories(${TARGET_NAME} PRIVATE
${PROJECT_SOURCE_DIR}/include
)
target_include_directories(${TARGET_NAME} PRIVATE
${CMAKE_CURRENT_SOURCE_DIR}
${PROJECT_SOURCE_DIR}/include
)
install(TARGETS ${TARGET_NAME} ${TARGET_NAME}
DESTINATION ${CMAKE_INSTALL_PREFIX}/${CMAKE_INSTALL_BINDIR})

View File

@ -33,7 +33,6 @@
#include "tim/vx/context.h"
#include "tim/vx/graph.h"
#include "tim/vx/platform/platform.h"
#include "tim/vx/platform/native.h"
std::vector<uint8_t> input_data = {
0, 0, 0, 0, 0, 0, 0, 0, 6, 0, 2, 0, 0, 8, 0,
@ -108,17 +107,17 @@ static void printTopN(const T* prob, int outputCount, int topNum) {
}
}
int main(int argc, char** argv) {
(void) argc, (void) argv;
auto context0 = tim::vx::Context::Create();
auto graph0 = lenet(context0);
auto graph1 = lenet(context0);
auto devices = tim::vx::platform::NativeDevice::Enumerate();
auto devices = tim::vx::platform::IDevice::Enumerate();
auto device = devices[0];
std::shared_ptr<tim::vx::platform::IExecutor> executor = std::make_shared<tim::vx::platform::NativeExecutor> (device);
auto executable0 = tim::vx::platform::Compile(graph0, executor); // compile to nbg
auto executor = device->CreateExecutor(0,-1,context0);
auto executable0 = tim::vx::platform::Compile(graph0, executor);
auto input_handle0 = executable0->AllocateTensor(graph0->InputsTensor()[0]->GetSpec());
auto output_handle0 = executable0->AllocateTensor(graph0->OutputsTensor()[0]->GetSpec());
executable0->SetInput(input_handle0);
@ -127,7 +126,18 @@ int main(int argc, char** argv) {
assert(executable0->Submit(executable0));
executable0->Trigger();
auto executable1 = tim::vx::platform::Compile(graph1, executor); // compile to nbg
std::vector<float> output_data;
output_data.resize(1 * 10);
if (!output_handle0->CopyDataFromTensor(output_data.data())) {
std::cout << "Copy output data fail." << std::endl;
return -1;
}
std::cout << "executable0 out." << std::endl;
printTopN(output_data.data(), output_data.size(), 5);
output_data.assign(output_data.size(),0);
output_handle0->CopyDataToTensor(output_data.data(), output_data.size() * sizeof(float));
auto executable1 = tim::vx::platform::Compile(graph1, executor);
auto input_handle1 = executable1->AllocateTensor(graph1->InputsTensor()[0]->GetSpec());
auto output_handle1 = executable1->AllocateTensor(graph1->OutputsTensor()[0]->GetSpec());
executable1->SetInput(input_handle1);
@ -136,34 +146,28 @@ int main(int argc, char** argv) {
assert(executable1->Submit(executable0));
executable1->Trigger();
std::vector<float> output_data1;
output_data1.resize(1 * 10);
if (!output_handle1->CopyDataFromTensor(output_data1.data())) {
std::cout << "Copy output data fail." << std::endl;
return -1;
}
std::cout << "executable1 out." << std::endl;
printTopN(output_data1.data(), output_data1.size(), 5);
output_data1.assign(output_data1.size(),0);
output_handle1->CopyDataToTensor(output_data1.data(), output_data1.size() * sizeof(float));
executor->Submit(executable0, executable0);
executor->Submit(executable1, executable0);
std::vector<std::shared_ptr<tim::vx::platform::IExecutable>> executables0;
executables0.push_back(executable0);
executables0.push_back(executable1);
auto executable_set0 = tim::vx::platform::CreateExecutableSet(executables0);
executor->Submit(executable_set0, executable_set0);
executor->Trigger();
std::vector<uint8_t> input_data0;
input_data0.resize(28 * 28);
if (!input_handle0->CopyDataFromTensor(input_data0.data())) {
std::cout << "Copy intput data fail." << std::endl;
return -1;
}
printTopN(input_data0.data(), input_data0.size(), 5);
std::vector<float> output_data;
output_data.resize(1 * 10);
std::cout << "executor out." << std::endl;
if (!output_handle0->CopyDataFromTensor(output_data.data())) {
std::cout << "Copy output data fail." << std::endl;
return -1;
}
printTopN(output_data.data(), output_data.size(), 5);
std::vector<float> output_data1;
output_data1.resize(1 * 10);
if (!output_handle1->CopyDataFromTensor(output_data1.data())) {
std::cout << "Copy output data fail." << std::endl;
return -1;

View File

@ -1,13 +0,0 @@
message("samples/lite_multi_device")
set(TARGET_NAME "lite_multi_device")
add_executable(${TARGET_NAME} ${CMAKE_CURRENT_SOURCE_DIR}/lite_multi_device.cc)
target_link_libraries(${TARGET_NAME} PRIVATE -Wl,--whole-archive tim-vx)
target_include_directories(${TARGET_NAME} PRIVATE
${PROJECT_SOURCE_DIR}/include
${PROJECT_SOURCE_DIR}/prebuilt-sdk/viplite/build/sdk/include)
install(TARGETS ${TARGET_NAME} ${TARGET_NAME}
DESTINATION ${CMAKE_INSTALL_PREFIX}/${CMAKE_INSTALL_BINDIR})

View File

@ -1,15 +1,25 @@
## brief
The multi_device demo uses some acuity exported tim-vx networks, and running on 4 devices of NPU using platform api.
The multi_device demo uses some Acuity-exported tim-vx networks and runs them on multi-core NPU devices using the platform API.
## environment
export VSIMULATOR_CONFIG=VIP9400O_PID0XD9
export VIV_MGPU_AFFINITY="1:0"
export VIV_OVX_USE_MULTI_DEVICE="1:1"
export TIM_VX_ROOT="${workspaceFolder}/tim-vx"
## note
Please note that if you have enabled the lite platform, a dedicated VIVANTE_SDK (NO_KERNEL) is required as the NBG compiler.
The NPU driver is the VIPLITE driver.
## requirements
Vivante SDK >= 6.4.22
ovxlib >= 1.2.26
viplite >= 2.0.0
## build
cd build
cmake .. -DCMAKE_BUILD_TYPE=Debug -DTIM_VX_BUILD_EXAMPLES=ON -DTIM_VX_ENABLE_PLATFORM=ON
cmake .. -DCMAKE_BUILD_TYPE=Release -DTIM_VX_USE_EXTERNAL_OVXLIB=ON -DEXTERNAL_VIV_SDK=${VIVANTE_NOKERNEL_SDK_DIR} -DOVXLIB_INC=${OVXLIB_DIR}/include \
-DOVXLIB_LIB=/path/to/libovxlib.so -DTIM_VX_BUILD_EXAMPLES=ON -DTIM_VX_ENABLE_PLATFORM=ON -DTIM_VX_ENABLE_PLATFORM_LITE=ON -DVIP_LITE_SDK=${VIP_LITE_SDK}
## environment
# Export VIV_GPU_FILE to specify the NPU hardware configuration file for the NBG compiler
# VIV_GPU_FILE specifies the NPU hardware configuration file for the NBG compiler
export VIV_GPU_FILE="/path/to/VIP9400NANOQ_PLUS_PID0X10000055.config"
export TIM_VX_ROOT="${workspaceFolder}/tim-vx"
## run
cd build

View File

@ -35,7 +35,6 @@
#include "tim/vx/context.h"
#include "tim/vx/graph.h"
#include "tim/vx/platform/platform.h"
#include "tim/vx/platform/native.h"
#include "vx_lenet.h"
#include "vx_mobilenet.h"
#include "vx_resnet50.h"
@ -59,7 +58,7 @@ static void printTopN(const T* prob, int outputCount, int topNum) {
}
template <typename T>
void print_topN(std::size_t size, std::shared_ptr<tim::vx::platform::ITensorHandle> handle) {
void print_topN(std::size_t size, std::shared_ptr<tim::vx::platform::ITensorHandle> & handle) {
std::vector<T> output_data;
output_data.resize(size);
if (!handle->CopyDataFromTensor(output_data.data())) {
@ -94,7 +93,8 @@ void executor_trigger(std::shared_ptr<tim::vx::platform::IExecutor> executor) {
}
auto context = tim::vx::Context::Create();
std::pair<std::shared_ptr<tim::vx::platform::IExecutable>, std::shared_ptr<tim::vx::platform::ITensorHandle>> generate_executable(
std::pair<std::shared_ptr<tim::vx::platform::IExecutable>, std::shared_ptr<tim::vx::platform::ITensorHandle>>
generate_executable(
std::shared_ptr<tim::vx::platform::IExecutor> executor,
std::function<void(std::shared_ptr<tim::vx::Graph>, const char*)> construct_func,
std::string weight_file,
@ -114,15 +114,17 @@ std::pair<std::shared_ptr<tim::vx::platform::IExecutable>, std::shared_ptr<tim::
int main(int argc, char** argv) {
(void) argc, (void) argv;
auto devices = tim::vx::platform::NativeDevice::Enumerate();
auto devices = tim::vx::platform::IDevice::Enumerate();
auto device0 = devices[0];
std::shared_ptr<tim::vx::platform::IExecutor> executor0 = std::make_shared<tim::vx::platform::NativeExecutor> (device0);
auto device1 = devices[1];
std::shared_ptr<tim::vx::platform::IExecutor> executor1 = std::make_shared<tim::vx::platform::NativeExecutor> (device1);
auto device2 = devices[2];
std::shared_ptr<tim::vx::platform::IExecutor> executor2 = std::make_shared<tim::vx::platform::NativeExecutor> (device2);
auto device3 = devices[3];
std::shared_ptr<tim::vx::platform::IExecutor> executor3 = std::make_shared<tim::vx::platform::NativeExecutor> (device3);
auto total_core_count = device0->CoreCount();
uint32_t core_index = 0;
auto use_core_count = 1;
std::vector<std::shared_ptr<tim::vx::platform::IExecutor>> executors;
for(core_index = 0; core_index < total_core_count; core_index += use_core_count) {
auto executor = device0->CreateExecutor(core_index,use_core_count, context);
executors.push_back(executor);
}
auto root = std::getenv("TIM_VX_ROOT");
assert(root != NULL);
@ -142,46 +144,57 @@ int main(int argc, char** argv) {
auto resnet50_weight_file = ROOT + "/samples/multi_device/resnet50/resnet50.export.data";
std::function<void(std::shared_ptr<tim::vx::Graph>, const char*)> resnet50_construct_func = acuitylite::resnet50::construct_graph;
std::shared_ptr<tim::vx::platform::IExecutable> lenet_0, lenet_2, lenet_3, mobilenet_1, mobilenet_2, mobilenet_3, resnet50_0, resnet50_1;
std::shared_ptr<tim::vx::platform::ITensorHandle> lenet_0_outhandle, lenet_2_outhandle, lenet_3_outhandle, mobilenet_1_outhandle, mobilenet_2_outhandle, mobilenet_3_outhandle,
resnet50_0_outhandle, resnet50_1_outhandle;
auto executor_cnt = executors.size();
std::tie(lenet_0, lenet_0_outhandle) = generate_executable(executor0, lenet_construct_func, lenet_weight_file, lenet_input_files, lenet_input_bytes);
std::tie(resnet50_0, resnet50_0_outhandle) = generate_executable(executor0, resnet50_construct_func, resnet50_weight_file, resnet50_input_files, resnet50_input_bytes);
executor0->Submit(lenet_0, lenet_0);
executor0->Submit(resnet50_0, lenet_0);
//each executor runs 2 models.
auto lenet = [&](std::shared_ptr<tim::vx::platform::IExecutor> executor) {
return generate_executable(executor, lenet_construct_func, lenet_weight_file,
lenet_input_files, lenet_input_bytes);
};
auto resnet = [&](std::shared_ptr<tim::vx::platform::IExecutor> executor) {
return generate_executable(executor, resnet50_construct_func, resnet50_weight_file,
resnet50_input_files, resnet50_input_bytes);
};
auto mobilenet = [&](std::shared_ptr<tim::vx::platform::IExecutor> executor) {
return generate_executable(executor, mobilenet_construct_func, mobilenet_weight_file,
mobilenet_input_files, mobilenet_input_bytes);
};
std::vector<std::pair<std::shared_ptr<tim::vx::platform::IExecutable>,
std::shared_ptr<tim::vx::platform::ITensorHandle>>> nets;
for (size_t i = 0; i < executor_cnt; i++) {
if(i % 3 == 0) {
//lenet + resnet
nets.push_back(lenet(executors[i]));
executors[i]->Submit(nets.back().first, nets.back().first);
nets.push_back(resnet(executors[i]));
executors[i]->Submit(nets.back().first, nets.back().first);
}
if(i % 3 == 1) {
//resnet + mobilenet
nets.push_back(resnet(executors[i]));
executors[i]->Submit(nets.back().first, nets.back().first);
nets.push_back(mobilenet(executors[i]));
executors[i]->Submit(nets.back().first, nets.back().first);
}
if(i % 3 == 2) {
//lenet + mobilenet
nets.push_back(mobilenet(executors[i]));
executors[i]->Submit(nets.back().first, nets.back().first);
nets.push_back(lenet(executors[i]));
executors[i]->Submit(nets.back().first, nets.back().first);
}
}
std::vector<std::thread> threads;
for(auto executor:executors) {
threads.push_back(std::thread(executor_trigger, executor));
}
for(std::thread &t : threads) {
t.join();
}
std::tie(mobilenet_1, mobilenet_1_outhandle) = generate_executable(executor1, mobilenet_construct_func, mobilenet_weight_file, mobilenet_input_files, mobilenet_input_bytes);
std::tie(resnet50_1, resnet50_1_outhandle) = generate_executable(executor1, resnet50_construct_func, resnet50_weight_file, resnet50_input_files, resnet50_input_bytes);
auto executable_set1 = tim::vx::platform::CreateExecutableSet({mobilenet_1, resnet50_1});
executor1->Submit(executable_set1, executable_set1);
std::tie(lenet_2, lenet_2_outhandle) = generate_executable(executor2, lenet_construct_func, lenet_weight_file, lenet_input_files, lenet_input_bytes);
std::tie(mobilenet_2, mobilenet_2_outhandle) = generate_executable(executor2, mobilenet_construct_func, mobilenet_weight_file, mobilenet_input_files, mobilenet_input_bytes);
auto executable_set2 = tim::vx::platform::CreateExecutableSet({lenet_2, mobilenet_2});
executor2->Submit(executable_set2, executable_set2);
std::tie(lenet_3, lenet_3_outhandle) = generate_executable(executor3, lenet_construct_func, lenet_weight_file, lenet_input_files, lenet_input_bytes);
std::tie(mobilenet_3, mobilenet_3_outhandle) = generate_executable(executor3, mobilenet_construct_func, mobilenet_weight_file, mobilenet_input_files, mobilenet_input_bytes);
auto executable_set3 = tim::vx::platform::CreateExecutableSet({lenet_3, mobilenet_3});
executor3->Submit(executable_set3, executable_set3);
std::thread t0(executor_trigger, executor0);
std::thread t1(executor_trigger, executor1);
std::thread t2(executor_trigger, executor2);
std::thread t3(executor_trigger, executor3);
t0.join();
t1.join();
t2.join();
t3.join();
print_topN<float>(1 * 10, lenet_0_outhandle);
print_topN<float>(1 * 10, lenet_2_outhandle);
print_topN<float>(1 * 10, lenet_3_outhandle);
print_topN<float>(1 * 1001, mobilenet_1_outhandle);
print_topN<float>(1 * 1001, mobilenet_2_outhandle);
print_topN<float>(1 * 1001, mobilenet_3_outhandle);
print_topN<uint16_t>(1 * 1000, resnet50_0_outhandle);
print_topN<uint16_t>(1 * 1000, resnet50_1_outhandle);
for (auto net : nets) {
auto size = net.second->GetSpec().GetElementNum();
print_topN<float>(size, net.second);
}
return 0;
}

View File

@ -29,7 +29,7 @@
#include "tim/vx/graph.h"
#include "tim/vx/operation.h"
#include "tim/vx/tensor.h"
#include "tim/vx/platform/native.h"
#include "tim/vx/platform/platform.h"
static void printTopN() {
}
@ -46,9 +46,9 @@ int demo(int argc, char** argv) {
tim::vx::TensorSpec g0_input0, g0_output0, g1_output0, g2_output0, g3_output0, g4_output0, g5_output0;
// query devices and get an executor for the device
auto devices = tim::vx::platform::NativeDevice::Enumerate();
auto devices = tim::vx::platform::IDevice::Enumerate();
auto device = devices[0];
std::shared_ptr<tim::vx::platform::IExecutor> executor = std::make_shared<tim::vx::platform::NativeExecutor> (device);
auto executor = device->CreateExecutor(0,-1, context);
// executable0
auto executable0 = executor->Compile(g0); // compile to nbg
@ -89,33 +89,6 @@ int demo(int argc, char** argv) {
// trigger
executor->Trigger(); // run all submitted executables
/* 2. another way to run */
// executable_set0
std::vector<std::shared_ptr<tim::vx::platform::IExecutable>> executables0;
executables0.push_back(executable0);
auto executable_set0 = CreateExecutableSet(executables0);
// executable_set1
std::vector<std::shared_ptr<tim::vx::platform::IExecutable>> executables1;
executables1.push_back(executable1);
executables1.push_back(executable3);
auto executable_set1 = CreateExecutableSet(executables1);
// executable_set2
std::vector<std::shared_ptr<tim::vx::platform::IExecutable>> executables2;
executables2.push_back(executable2);
executables2.push_back(executable4);
auto executable_set2 = CreateExecutableSet(executables2);
// executable_set3
std::vector<std::shared_ptr<tim::vx::platform::IExecutable>> executables3;
executables3.push_back(executable5);
auto executable_set3 = CreateExecutableSet(executables3);
// submit executaleSets
executable_set0->Submit(executable_set0);
executable_set1->Submit(executable_set0);
executable_set2->Submit(executable_set1);
executable_set3->Submit(executable_set2);
// trigger
executor->Trigger(); // run all submitted executableSets
printTopN();
return 0;

View File

@ -1296,7 +1296,7 @@ void resnet50::construct_graph
auto input_0 = graph->CreateTensor(input_0_spec);
tim::vx::ShapeType output_229_shape({1000,1});
tim::vx::TensorSpec output_229_spec(tim::vx::DataType::FLOAT16, output_229_shape,
tim::vx::TensorSpec output_229_spec(tim::vx::DataType::FLOAT32, output_229_shape,
tim::vx::TensorAttribute::OUTPUT);
auto output_229 = graph->CreateTensor(output_229_spec);

View File

@ -0,0 +1,13 @@
message("samples/platform_sample")
set(TARGET_NAME "platform_sample")
add_executable(${TARGET_NAME} ${CMAKE_CURRENT_SOURCE_DIR}/platform_sample.cc)
target_link_libraries(${TARGET_NAME} PRIVATE -Wl,--whole-archive tim-vx)
target_include_directories(${TARGET_NAME} PRIVATE
${CMAKE_CURRENT_SOURCE_DIR}
${PROJECT_SOURCE_DIR}/include)
install(TARGETS ${TARGET_NAME} ${TARGET_NAME}
DESTINATION ${CMAKE_INSTALL_PREFIX}/${CMAKE_INSTALL_BINDIR})

View File

@ -0,0 +1,25 @@
## brief
This sample demonstrates basic usage of the platform API.
## note
Please note that if you have enabled the lite platform, a dedicated VIVANTE_SDK (NO_KERNEL) is required as the NBG compiler.
The NPU driver is the VIPLITE driver.
## requirements
Vivante SDK >= 6.4.22
ovxlib >= 1.2.26
viplite >= 2.0.0
## build
cd build
cmake .. -DCMAKE_BUILD_TYPE=Release -DTIM_VX_USE_EXTERNAL_OVXLIB=ON -DEXTERNAL_VIV_SDK=${VIVANTE_NOKERNEL_SDK_DIR} -DOVXLIB_INC=${OVXLIB_DIR}/include \
-DOVXLIB_LIB=${VIVANTE_NOKERNEL_SDK_DIR}/drivers/libovxlib.so -DTIM_VX_BUILD_EXAMPLES=ON -DTIM_VX_ENABLE_PLATFORM=ON \
-DTIM_VX_ENABLE_PLATFORM_LITE=ON -DVIP_LITE_SDK=${VIP_LITE_SDK}
## environment
# Export VIV_GPU_FILE to specify the NPU hardware configuration file for the NBG compiler
export VIV_GPU_FILE="/path/to/VIP9000NANOQ_PLUS_PID0X100000XX.config"
## run
cd build
./samples/platform_sample/platform_sample

View File

@ -26,8 +26,8 @@
#include "tim/vx/graph.h"
#include "tim/vx/ops.h"
#include "tim/vx/types.h"
#include "tim/vx/platform/native.h"
#include "tim/vx/platform/lite/lite_native.h"
#include "tim/vx/platform/platform.h"
int main() {
//construct tim-vx graph
@ -49,9 +49,15 @@ int main() {
std::vector<int> data_vec_i0({1, 2, 3, 4});
std::vector<int> data_vec_i1({4, 3, 2, 1});
auto devices = tim::vx::platform::NativeDevice::Enumerate();
auto devices = tim::vx::platform::IDevice::Enumerate();
std::cout << "NPU device count: " << devices.size() <<std::endl;
auto device = devices[0];
auto executor = std::make_shared<tim::vx::platform::LiteNativeExecutor>(device);
//use all cores of device 0 (core_count = -1 selects all remaining cores)
std::cout << "NPU device[0] has " << device->CoreCount() << " cores" << std::endl;
auto use_core_count = -1;
auto executor = device->CreateExecutor(0, use_core_count);
auto executable = executor->Compile(graph);
auto input0_handle = executable->AllocateTensor(input_spec);
auto input1_handle = executable->AllocateTensor(input_spec);
@ -73,6 +79,10 @@ int main() {
//each output value should be "5" in this demo
for (int i = 0; i < 4; ++i) {
std::cout << "output value: " << data[i] << std::endl;
if(data[i] != 5) {
std::cout << "test failed" << std::endl;
break;
}
}
free(data);
return 0;

View File

@ -61,8 +61,10 @@ if(TIM_VX_ENABLE_PLATFORM)
endif()
list(APPEND LITE_EXTERNAL_LIBS
${VIP_LITE_SDK}/drivers/libNBGlinker.so
${VIP_LITE_SDK}/drivers/libVIPlite.so)
list(APPEND LITE_INC_DIRS ${VIP_LITE_SDK}/include)
${VIP_LITE_SDK}/drivers/libVIPhal.so)
list(APPEND LITE_INC_DIRS
${VIP_LITE_SDK}/include
${VIP_LITE_SDK}/include/nbg_linker)
endif()
if(TIM_VX_ENABLE_GRPC)

View File

@ -22,36 +22,202 @@
*
*****************************************************************************/
#include "tim/vx/platform/lite/lite_native.h"
#include "lite_native_private.h"
#include <cassert>
#include "tim/vx/graph.h"
#include "graph_private.h"
#include "vsi_nn_pub.h"
#include "context_private.h"
namespace tim {
namespace vx {
namespace platform {
LiteNativeExecutor::LiteNativeExecutor(const std::shared_ptr<IDevice>& device) {
LiteNetwork::LiteNetwork(vip_create_network_param_t& param) {
vip_create_network(&param, sizeof(param), &network_);
}
vip_status_e LiteNetwork::Query(vip_enum property, void* value) {
return vip_query_network(network_, property, value);
}
vip_status_e LiteNetwork::Set(vip_enum property, void* value) {
return vip_set_network(network_, property, value);
}
vip_status_e LiteNetwork::Prepare() {
return vip_prepare_network(network_);
}
vip_status_e LiteNetwork::Run() {return vip_run_network(network_);}
vip_status_e LiteNetwork::Trigger() {return vip_trigger_network(network_);}
vip_status_e LiteNetwork::Wait() {return vip_wait_network(network_);}
vip_status_e LiteNetwork::Cancel() {return vip_cancel_network(network_);}
vip_status_e LiteNetwork::QueryInput(vip_uint32_t index, vip_enum property, void* value) {
return vip_query_input(network_, index, property,value);
}
vip_status_e LiteNetwork::QueryOutput(vip_uint32_t index, vip_enum property, void* value) {
return vip_query_output(network_, index, property, value);
}
vip_status_e LiteNetwork::SetInput(vip_uint32_t index, std::shared_ptr<ITensorHandle> input) {
vip_buffer buffer =
std::dynamic_pointer_cast<LiteNativeTensorHandleImpl>(input)->GetBuffer();
return vip_set_input(network_, index, buffer);
}
vip_status_e LiteNetwork::SetOutput(vip_uint32_t index, std::shared_ptr<ITensorHandle> output) {
vip_buffer buffer =
std::dynamic_pointer_cast<LiteNativeTensorHandleImpl>(output)->GetBuffer();
return vip_set_output(network_, index, buffer);
}
LiteNetwork::~LiteNetwork(){
vip_finish_network(network_);
vip_destroy_network(network_);
}
bool LiteNativeDevice::vip_initialized = false;
LiteNativeDeviceImpl::LiteNativeDeviceImpl(device_id_t id,uint32_t core_count) {
device_id_ = id;
core_count_ = core_count;
}
bool LiteNativeDeviceImpl::Submit(const std::shared_ptr<Graph>& graph) {
(void)graph;
return true;
}
bool LiteNativeDeviceImpl::Trigger(bool async, async_callback cb) {
(void)async;
(void)cb;
return true;
}
void LiteNativeDeviceImpl::WaitDeviceIdle() {}
bool LiteNativeDeviceImpl::DeviceExit() {return false;}
std::shared_ptr<IExecutor> LiteNativeDeviceImpl::CreateExecutor(const int32_t core_index,
const int32_t core_count,
const std::shared_ptr<Context>& context) {
std::shared_ptr<IDevice> this_sp = shared_from_this();
auto executor = std::make_shared<LiteNativeExecutorImpl>(this_sp, core_count, core_index, context);
return executor;
}
std::vector<std::shared_ptr<IDevice>> LiteNativeDevice::Enumerate() {
std::vector<std::shared_ptr<IDevice>> device_v;
device_id_t deviceCount = 0;
std::vector<uint32_t> core_count;
uint32_t version = 0;
if( !LiteNativeDevice::vip_initialized ) {
vip_status_e status = vip_init();
if(status != VIP_SUCCESS) {
VSILOGE("Initialize viplite driver fail");
return device_v;
}
LiteNativeDevice::vip_initialized = true;
}
version = vip_get_version();
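// The device/core-count queries below are only available on newer viplite
// drivers (0x00010601 presumably encodes version 1.6.1).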
if (version >= 0x00010601 ) {
vip_query_hardware(VIP_QUERY_HW_PROP_DEVICE_COUNT, sizeof(uint32_t), &deviceCount);
core_count.resize(deviceCount);
vip_query_hardware(VIP_QUERY_HW_PROP_CORE_COUNT_EACH_DEVICE,
sizeof(uint32_t) * core_count.size(), core_count.data());
}
for (device_id_t i = 0; i < deviceCount; i++) {
auto local_device = std::make_shared<LiteNativeDeviceImpl>(i, core_count.at(i));
device_v.push_back(local_device);
}
return device_v;
}
int LiteNativeExecutorImpl::executor_count = 0;
LiteNativeExecutorImpl::LiteNativeExecutorImpl(const std::shared_ptr<IDevice>& device,
const int32_t core_count, const int32_t core_index, const std::shared_ptr<Context>& context)
{
device_ = device;
context_ = Context::Create();
database_ = VIP_NULL;
context_ = context;
if(context_ == nullptr) {
context_ = tim::vx::Context::Create();
}
auto fixed_core_count = core_count;
int32_t fixed_core_index = core_index;
vip_status_e status = VIP_SUCCESS;
if( !LiteNativeDevice::vip_initialized ) {
status = vip_init();
if(status != VIP_SUCCESS){
throw "Initialize viplite driver fail";
}
}
int32_t total_core_count = (int32_t)device->CoreCount();
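// Clamp the requested cores to the device: a negative core_index falls back
// to 0, and a non-positive core_count selects all cores from core_index on.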
if (fixed_core_index < 0)
{
fixed_core_index = 0;
}
if (fixed_core_index > total_core_count - 1){
throw "Core index is larger than total core count.";
}
if (fixed_core_count <= 0 ) {
fixed_core_count = total_core_count - fixed_core_index;
}
vip_init();
vip_query_database(&database_);
nbg_linker_init(database_);
if (fixed_core_index + fixed_core_count > total_core_count) {
fixed_core_count = total_core_count - fixed_core_index;
VSILOGW(
"Core_index + core_count is larger than total core count. Fix core "
"count to %d",
fixed_core_count);
}
core_index_ = (uint32_t)fixed_core_index;
core_count_ = (uint32_t)fixed_core_count;
#ifdef VSI_DEVICE_SUPPORT
vsi_nn_device_t vsi_devices[VSI_MAX_DEVICES] = {0};
vsi_size_t num_devices = 0;
vsi_size_t available_core_count = 0;
auto ctx = dynamic_cast<ContextImpl*>(context_.get());
vsi_nn_GetDevices(ctx->context(), vsi_devices, &num_devices);
//Always use device 0 to compile NBG.
vsi_nn_GetDeviceCoreCount(vsi_devices[0], &available_core_count);
if(core_index_ + core_count_ > (uint32_t)available_core_count) {
VSILOGE("the used core count is larger than compiler available core count");
assert(false);
}
vsi_nn_CreateSubDevice(vsi_devices[0], core_index_, core_count_, &sub_device_);
#else
VSILOGE("device is not supported!");
assert(false);
#endif
executor_count++;
}
LiteNativeExecutor::~LiteNativeExecutor() {
nbg_destroy_task(task_descriptor_);
nbg_linker_destroy();
vip_destroy();
LiteNativeExecutorImpl::~LiteNativeExecutorImpl() {
#ifdef VSI_DEVICE_SUPPORT
if(sub_device_)
vsi_nn_ReleaseDevice(&sub_device_);
#endif
executor_count--;
if(executor_count <1)
vip_destroy();
}
bool LiteNativeExecutor::Submit(const std::shared_ptr<IExecutable>& executable,
bool LiteNativeExecutorImpl::Submit(const std::shared_ptr<IExecutable>& executable,
const std::shared_ptr<IExecutable>& ref,
bool after) {
bool success = false;
success = executable->Verify();
if (success == false) {
VSILOGE("Executable NBG compile failed");
return false;
}
if (executable == ref) {
tasks_.push_back(executable);
return true;
@ -72,239 +238,285 @@ bool LiteNativeExecutor::Submit(const std::shared_ptr<IExecutable>& executable,
return success;
}
bool LiteNativeExecutor::Trigger(bool async) {
bool LiteNativeExecutorImpl::Trigger(bool async) {
(void)async;
vip_status_e status = VIP_SUCCESS;
std::vector<vip_network> networks;
for (auto exe : tasks_) {
auto task = exe.lock();
task->Verify();
vip_network& network =
std::dynamic_pointer_cast<LiteNativeExecutable>(task)->network_;
networks.push_back(std::move(network));
}
status = nbg_create_task(networks.size(), networks.data(), &task_descriptor_);
if (status != VIP_SUCCESS) {
VSILOGE("create task descriptor fail");
return false;
}
status = vip_trigger_task(task_descriptor_);
if (status != VIP_SUCCESS) {
VSILOGE("trigger task descriptor fail");
return false;
}
status = vip_wait_task(task_descriptor_);
if (status != VIP_SUCCESS) {
VSILOGE("wait task descriptor fail");
// nbg_gen_capture(networks.size(), networks.data());
return false;
while (!tasks_.empty()) {
auto task = tasks_.front();
tasks_.erase(tasks_.begin());
auto task_tmp = task.lock();
if (!task_tmp) {
VSILOGE("Task is empty");
return false;
}
task_tmp->Trigger();
}
return true;
}
std::shared_ptr<IExecutable> LiteNativeExecutor::Compile(
std::shared_ptr<IExecutable> LiteNativeExecutorImpl::Compile(
const std::shared_ptr<Graph>& graph) {
GraphImpl* graphimp = dynamic_cast<GraphImpl*>(graph.get());
IDevice::device_id_t id = device_->Id();
vxSetGraphAttribute(graphimp->graph()->g, VX_GRAPH_DEVICE_INDEX_VIV,
(void*)(&id), sizeof(id));
size_t bin_size = -1;
graph->CompileToBinary(nullptr, &bin_size);
std::vector<char> nb_buf;
#ifdef VSI_DEVICE_SUPPORT
GraphImpl* graphimp = dynamic_cast<GraphImpl*>(graph.get());
vsi_nn_BindDevices(graphimp->graph(), 1, &sub_device_);
#endif
auto ret = graph->CompileToBinary(nullptr, &bin_size);
nb_buf.resize(bin_size);
graph->CompileToBinary(nb_buf.data(), &bin_size);
return std::make_shared<LiteNativeExecutable>(shared_from_this(), nb_buf);
ret = ret && graph->CompileToBinary(nb_buf.data(), &bin_size);
if(!ret) {
VSILOGE("Compile fail");
return nullptr;
}
std::shared_ptr<IExecutor> this_sp = shared_from_this();
auto executable = std::make_shared<LiteNativeExecutableImpl>(this_sp, nb_buf);
return executable;
}
LiteNativeExecutable::LiteNativeExecutable(
LiteNativeExecutableImpl::LiteNativeExecutableImpl(
const std::shared_ptr<IExecutor>& executor,
const std::vector<char>& nb_buf) {
executor_ = executor;
context_ = executor->Contex();
nb_graph_ = context_->CreateGraph();
nbg_create_network(nb_buf.data(), nb_buf.size(),
VIP_CREATE_NETWORK_FROM_MEMORY, &network_);
input_count_ = 0;
output_count_ = 0;
coeff_ = nullptr;
command_ = nullptr;
memory_pool_ = nullptr;
others_ = nullptr;
pre_command_ = nullptr;
context_ = nullptr;
nb_graph_ = nullptr;
vip_status_e status = VIP_SUCCESS;
vip_create_network_param_t net_param;
device_id_ = executor_.lock()->Device()->Id();
auto core_index = executor_.lock()->CoreIndex();
net_param.device_index = device_id_;
net_param.prop = VIP_NET_CREATE_PROP_FROM_NBG;
net_param.nbg.type = VIP_NET_CREATE_NBG_FROM_MEMORY;
net_param.nbg.memory.nbg_memory = (void*)nb_buf.data();
net_param.nbg.memory.nbg_size = nb_buf.size();
/* prepare vip network */
vip_status_e status = VIP_SUCCESS;
nbg_network_memory_size_t buffer_size;
nbg_network_memory_buffer_t buffer;
vip_memory_t coeff_buffer;
vip_memory_t cmd_buffer;
vip_memory_t pre_cmd_buffer;
vip_memory_t pool_buffer;
vip_memory_t others_buffer;
nbg_query_network(network_, VIP_NETWORK_PROP_MEMORY_SIZE, &buffer_size);
auto network(std::make_unique<LiteNetwork>(net_param));
vip_allocate_videomemory(buffer_size.coeff, &coeff_);
vip_allocate_videomemory(buffer_size.command, &command_);
vip_allocate_videomemory(buffer_size.memory_pool, &memory_pool_);
vip_allocate_videomemory(buffer_size.others, &others_);
vip_allocate_videomemory(buffer_size.pre_command, &pre_command_);
SetBuffer(&coeff_buffer, coeff_);
SetBuffer(&cmd_buffer, command_);
SetBuffer(&pre_cmd_buffer, pre_command_);
SetBuffer(&pool_buffer, memory_pool_);
SetBuffer(&others_buffer, others_);
buffer.coeff = &coeff_buffer;
buffer.command = &cmd_buffer;
buffer.memory_pool = &pool_buffer;
buffer.others = &others_buffer;
buffer.pre_command = &pre_cmd_buffer;
buffer.dma_command = nullptr;
status = nbg_prepare_network(network_, &buffer);
vip_flush_videomemory(coeff_, VIP_BUFFER_OPER_TYPE_FLUSH);
vip_flush_videomemory(command_, VIP_BUFFER_OPER_TYPE_FLUSH);
vip_flush_videomemory(pre_command_, VIP_BUFFER_OPER_TYPE_FLUSH);
vip_flush_videomemory(memory_pool_, VIP_BUFFER_OPER_TYPE_FLUSH);
vip_flush_videomemory(others_, VIP_BUFFER_OPER_TYPE_FLUSH);
lite_network_ = std::move(network);
status = lite_network_->Query(VIP_NETWORK_PROP_INPUT_COUNT,&input_count_);
if (status != VIP_SUCCESS) {
VSILOGE("failed to query network inputs");
assert(false);
}
status = lite_network_->Query(VIP_NETWORK_PROP_OUTPUT_COUNT,&output_count_);
if (status != VIP_SUCCESS) {
VSILOGE("failed to query network outputs");
assert(false);
}
status = lite_network_->Set(VIP_NETWORK_PROP_SET_CORE_INDEX,&core_index);
if (status != VIP_SUCCESS) {
VSILOGE("failed to set core index");
assert(false);
}
status = lite_network_->Prepare();
if (status != VIP_SUCCESS) {
VSILOGE("failed to prepare network");
assert(false);
}
}
LiteNativeExecutable::~LiteNativeExecutable() {
nbg_finish_network(network_);
nbg_destroy_network(network_);
if (coeff_) {
vip_free_videomemory(coeff_);
coeff_ = nullptr;
}
if (command_) {
vip_free_videomemory(command_);
command_ = nullptr;
}
if (memory_pool_) {
vip_free_videomemory(memory_pool_);
memory_pool_ = nullptr;
}
if (others_) {
vip_free_videomemory(others_);
others_ = nullptr;
}
if (pre_command_) {
vip_free_videomemory(pre_command_);
pre_command_ = nullptr;
}
}
void LiteNativeExecutable::SetInput(const std::shared_ptr<ITensorHandle>& th) {
void LiteNativeExecutableImpl::SetInput(const std::shared_ptr<ITensorHandle>& th) {
vip_status_e status = VIP_SUCCESS;
gcvip_videomemory_t* mem =
std::dynamic_pointer_cast<LiteNativeTensorHandle>(th)->tensor_buffer_;
vip_memory_t buffer;
SetBuffer(&buffer, mem);
status = nbg_set_input(network_, input_count_, &buffer);
int32_t input_index = input_handles_.size();
status = lite_network_->SetInput(input_index, th);
if (status != VIP_SUCCESS) {
VSILOGE("failed to set input: %d", input_count_);
VSILOGE("failed to set input: %d", input_index);
assert(false);
}
++input_count_;
input_handles_.push_back(th);
}
void LiteNativeExecutableImpl::SetInputs(const std::vector<std::shared_ptr<ITensorHandle>>& ths) {
for (auto th : ths) {
SetInput(th);
}
}
void LiteNativeExecutable::SetOutput(const std::shared_ptr<ITensorHandle>& th) {
void LiteNativeExecutableImpl::SetOutput(const std::shared_ptr<ITensorHandle>& th) {
vip_status_e status = VIP_SUCCESS;
gcvip_videomemory_t* mem =
std::dynamic_pointer_cast<LiteNativeTensorHandle>(th)->tensor_buffer_;
vip_memory_t buffer;
SetBuffer(&buffer, mem);
status = nbg_set_output(network_, output_count_, &buffer);
int32_t output_index = output_handles_.size();
status = lite_network_->SetOutput(output_index,th);
if (status != VIP_SUCCESS) {
VSILOGE("failed to set output: %d", output_count_);
VSILOGE("failed to set output: %d", output_index);
assert(false);
}
++output_count_;
output_handles_.push_back(th);
}
void LiteNativeExecutable::GetOutput(
const std::vector<std::shared_ptr<ITensorHandle>>& th) {
(void)th;
void LiteNativeExecutableImpl::SetOutputs(const std::vector<std::shared_ptr<ITensorHandle>>& ths) {
for (auto th : ths) {
SetOutput(th);
}
}
bool LiteNativeExecutable::Submit(const std::shared_ptr<IExecutable>& ref,
bool LiteNativeExecutableImpl::Submit(const std::shared_ptr<IExecutable>& ref,
bool after) {
bool status = false;
std::shared_ptr<LiteNativeExecutorImpl> executor =
std::dynamic_pointer_cast<LiteNativeExecutorImpl>(executor_.lock());
std::shared_ptr<IExecutable> executable = shared_from_this();
status = Executor()->Submit(executable, ref, after);
status = executor->Submit(executable, ref, after);
return status;
}
bool LiteNativeExecutable::Trigger(bool async) {
(void)async;
return false;
}
bool LiteNativeExecutable::Verify() {
int32_t input_count = 0;
nbg_query_network(network_, VIP_NETWORK_PROP_INPUT_COUNT, &input_count);
if (input_count != input_count_) {
VSILOGE("input count mismatch, required: %d, provided: %d", input_count,
input_count_);
return false;
bool LiteNativeExecutableImpl::Trigger(bool async) {
vip_status_e status = VIP_SUCCESS;
if (async) {
status = lite_network_->Trigger();
status = lite_network_->Wait();
if (status != VIP_SUCCESS) {
VSILOGE("trigger network fail");
return false;
}
} else {
status = lite_network_->Run();
if (status != VIP_SUCCESS) {
VSILOGE("run network fail");
return false;
}
}
int32_t output_count = 0;
nbg_query_network(network_, VIP_NETWORK_PROP_OUTPUT_COUNT, &output_count);
if (output_count != output_count_) {
VSILOGE("output count mismatch, required: %d, provided: %d", output_count,
output_count_);
return false;
}
return true;
}
std::shared_ptr<ITensorHandle> LiteNativeExecutable::AllocateTensor(
const TensorSpec& tensor_spec) {
auto tensor = nb_graph_->CreateTensor(tensor_spec);
return std::make_shared<LiteNativeTensorHandle>(tensor);
bool LiteNativeExecutableImpl::Verify() {
bool ret = true;
auto output_index = output_handles_.size();
auto input_index = input_handles_.size();
if(input_index != input_count_) {
VSILOGE("Network need %d inputs but gaving %d.\n", input_count_, input_index);
ret = false;
}
if(output_index != output_count_) {
VSILOGE("Network need %d outputs but gaving %d.\n", output_count_, output_index);
ret = false;
}
return ret;
}
void LiteNativeExecutable::SetBuffer(vip_memory_t* dst,
gcvip_videomemory_t* src) {
if (dst && src) {
dst->cpu_logical = src->cpu_logical;
dst->npu_physical = src->npu_physical;
dst->size = src->size;
std::shared_ptr<ITensorHandle> LiteNativeExecutableImpl::AllocateTensor(const TensorSpec& tensor_spec,
void* data, uint32_t size) {
return std::make_shared<LiteNativeTensorHandleImpl>(tensor_spec, data, size, device_id_);
}
LiteNativeTensorHandleImpl::LiteNativeTensorHandleImpl(const TensorSpec& tensor_spec, void* data, uint32_t size,
uint32_t device_id) {
vip_status_e status = VIP_ERROR_FAILURE;
spec_ = tensor_spec;
uint32_t tensor_size = tensor_spec.GetByteSize();
vip_buffer_create_params_t tensor_param;
uint32_t block_aligned_size = 64;
memory_type_ = ALLOC_MEM_NONE;
handle_ = nullptr;
handle_size_ = 0;
if(size > 0 && !data && tensor_size > size ) {
VSILOGE("Buffer size is less than the memory size required by the tensor");
assert(false);
}
#if 0
uint32_t addr_aligned_size = 256;
if (!data) {
data = vsi_nn_MallocAlignedBuffer(tensor_size,addr_aligned_size,block_aligned_size);
size = ((tensor_size + block_aligned_size - 1) / block_aligned_size) * block_aligned_size;
memory_type_ = ALLOC_MEM_INTERNAL;
} else {
memory_type_ = ALLOC_MEM_EXTERNAL;
}
handle_ = data;
if(!vsi_nn_IsBufferAligned((uint8_t *)handle_, addr_aligned_size)) {
VSILOGE("The starting address of the buffer needs to be 64-byte aligned");
assert(false);
}
if(size % 64 != 0) {
VSILOGE("The size of the buffer needs to be 64-byte aligned");
assert(false);
}
handle_size_ = size;
tensor_param.type = VIP_BUFFER_CREATE_FROM_USER_MEM;
tensor_param.device_index = device_id ;
tensor_param.src.from_handle.memory_type = VIP_BUFFER_FROM_USER_MEM_TYPE_HOST;
tensor_param.src.from_handle.logical_addr = handle_;
tensor_param.src.from_handle.size = handle_size_;
status = vip_create_buffer(&tensor_param,sizeof(tensor_param),&tensor_buffer_);
#else
(void)data;
tensor_param.type = VIP_BUFFER_CREATE_ALLOC_MEM;
tensor_param.device_index = device_id ;
tensor_param.src.alloc_mem.size = tensor_size;
tensor_param.src.alloc_mem.align = block_aligned_size;
status = vip_create_buffer(&tensor_param,sizeof(tensor_param),&tensor_buffer_);
memory_type_ = ALLOC_MEM_VIDEOMEM;
#endif
if(status != VIP_SUCCESS) {
if(memory_type_ == ALLOC_MEM_INTERNAL) {
vsi_nn_FreeAlignedBuffer((uint8_t*)handle_);
}
VSILOGE("Fail to create vip buffer.");
assert(false);
}
}
LiteNativeTensorHandle::LiteNativeTensorHandle(
const std::shared_ptr<Tensor>& tensor) {
tensor_ = tensor;
uint32_t size = tensor->GetSpec().GetByteSize();
vip_allocate_videomemory(size, &tensor_buffer_);
}
LiteNativeTensorHandle::~LiteNativeTensorHandle() {
LiteNativeTensorHandleImpl::~LiteNativeTensorHandleImpl() {
if (tensor_buffer_) {
vip_free_videomemory(tensor_buffer_);
vip_destroy_buffer(tensor_buffer_);
tensor_buffer_ = nullptr;
}
if(memory_type_ == ALLOC_MEM_INTERNAL && handle_) {
vsi_nn_FreeAlignedBuffer((uint8_t*)handle_);
}
}
bool LiteNativeTensorHandle::CopyDataToTensor(const void* data,
uint32_t size_in_bytes) {
memcpy(tensor_buffer_->cpu_logical, data, size_in_bytes);
bool LiteNativeTensorHandleImpl::CopyDataToTensor(const void* data,
uint32_t size_in_bytes) {
void* handle = handle_;
if(memory_type_ == ALLOC_MEM_VIDEOMEM) {
handle = vip_map_buffer(tensor_buffer_);
}
auto buff_size = vip_get_buffer_size(tensor_buffer_);
memcpy(handle, data, buff_size > size_in_bytes ? size_in_bytes : buff_size);
if(memory_type_ == ALLOC_MEM_VIDEOMEM) {
vip_unmap_buffer(tensor_buffer_);
}
Flush();
return true;
}
bool LiteNativeTensorHandle::CopyDataFromTensor(void* data) {
memcpy(data, tensor_buffer_->cpu_logical, tensor_buffer_->size);
return true;
bool LiteNativeTensorHandleImpl::CopyDataFromTensor(void* data) {
bool ret = Invalidate();
if(ret) {
void* handle = handle_;
auto buff_size = vip_get_buffer_size(tensor_buffer_);
if(memory_type_ == ALLOC_MEM_VIDEOMEM) {
handle = vip_map_buffer(tensor_buffer_);
}
memcpy(data, handle, buff_size);
if(memory_type_ == ALLOC_MEM_VIDEOMEM) {
vip_unmap_buffer(tensor_buffer_);
}
}
return ret;
}
bool LiteNativeTensorHandleImpl::Flush() {
vip_status_e status = vip_flush_buffer(tensor_buffer_, VIP_BUFFER_OPER_TYPE_FLUSH);
return status == VIP_SUCCESS;
}
bool LiteNativeTensorHandleImpl::Invalidate() {
vip_status_e status = vip_flush_buffer(tensor_buffer_, VIP_BUFFER_OPER_TYPE_INVALIDATE);
return status == VIP_SUCCESS;
}
} // namespace platform

View File

@ -0,0 +1,147 @@
/****************************************************************************
*
* Copyright (c) 2020-2023 Vivante Corporation
*
* Permission is hereby granted, free of charge, to any person obtaining a
* copy of this software and associated documentation files (the "Software"),
* to deal in the Software without restriction, including without limitation
* the rights to use, copy, modify, merge, publish, distribute, sublicense,
* and/or sell copies of the Software, and to permit persons to whom the
* Software is furnished to do so, subject to the following conditions:
*
* The above copyright notice and this permission notice shall be included in
* all copies or substantial portions of the Software.
*
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
* AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
* DEALINGS IN THE SOFTWARE.
*
*****************************************************************************/
#ifndef TIM_VX_LITE_NATIVE_DEVICE_PRIVATE_H_
#define TIM_VX_LITE_NATIVE_DEVICE_PRIVATE_H_
#include "tim/vx/platform/lite/lite_native.h"
#include "vip_lite.h"
#include "vsi_nn_pub.h"
namespace tim {
namespace vx {
namespace platform {
class LiteNetwork
{
public:
LiteNetwork(vip_create_network_param_t& param);
~LiteNetwork();
vip_status_e Query(vip_enum property, void* value);
vip_status_e Set(vip_enum property, void* value);
vip_status_e Prepare();
vip_status_e Run();
vip_status_e Trigger();
vip_status_e Wait();
vip_status_e Cancel();
vip_status_e QueryInput(vip_uint32_t index, vip_enum property, void* value);
vip_status_e QueryOutput(vip_uint32_t index, vip_enum property, void* value);
vip_status_e SetInput(vip_uint32_t index, std::shared_ptr<ITensorHandle> input);
vip_status_e SetOutput(vip_uint32_t index, std::shared_ptr<ITensorHandle> output);
private:
vip_network network_;
};
class LiteNativeDeviceImpl : public LiteNativeDevice,
public std::enable_shared_from_this<LiteNativeDeviceImpl> {
public:
LiteNativeDeviceImpl(device_id_t id,uint32_t core_count);
~LiteNativeDeviceImpl() {};
bool Submit(const std::shared_ptr<tim::vx::Graph>& graph) override;
bool Trigger(bool async = false, async_callback cb = NULL) override;
bool DeviceExit() override;
void WaitDeviceIdle() override;
std::shared_ptr<IExecutor> CreateExecutor(const int32_t core_index = 0,
const int32_t core_count = -1,
const std::shared_ptr<Context>& context = nullptr) override;
};
class LiteNativeExecutorImpl
: public LiteNativeExecutor,
public std::enable_shared_from_this<LiteNativeExecutorImpl> {
public:
LiteNativeExecutorImpl(const std::shared_ptr<IDevice>& device,
const int32_t core_count = -1,
const int32_t core_index = 0,
const std::shared_ptr<Context>& context = nullptr);
virtual ~LiteNativeExecutorImpl();
bool Submit(const std::shared_ptr<IExecutable>& executable,
const std::shared_ptr<IExecutable>& ref,
bool after = true) override;
bool Trigger(bool async = false) override;
std::shared_ptr<IExecutable> Compile(const std::shared_ptr<Graph>& graph) override;
static int executor_count;
private:
#ifdef VSI_DEVICE_SUPPORT
vsi_nn_device_t sub_device_;
#endif
};
class LiteNativeExecutableImpl : public LiteNativeExecutable {
public:
LiteNativeExecutableImpl(const std::shared_ptr<IExecutor>& executor,
const std::vector<char>& nb_buf);
virtual ~LiteNativeExecutableImpl() {};
void SetInput(const std::shared_ptr<ITensorHandle>& th) override;
void SetOutput(const std::shared_ptr<ITensorHandle>& th) override;
void SetInputs(const std::vector<std::shared_ptr<ITensorHandle>>& ths) override;
void SetOutputs(const std::vector<std::shared_ptr<ITensorHandle>>& ths) override;
bool Submit(const std::shared_ptr<IExecutable>& ref, bool after) override;
bool Trigger(bool async) override;
bool Verify() override;
std::shared_ptr<ITensorHandle> AllocateTensor(const TensorSpec& tensor_spec,
void* data = nullptr, uint32_t size = 0) override;
private:
uint32_t device_id_;
uint32_t input_count_;
uint32_t output_count_;
std::unique_ptr<LiteNetwork> lite_network_;
};
class LiteNativeTensorHandleImpl : public LiteNativeTensorHandle {
public:
typedef enum {
ALLOC_MEM_NONE,
ALLOC_MEM_EXTERNAL,
ALLOC_MEM_INTERNAL,
ALLOC_MEM_VIDEOMEM,
ALLOC_MEM_PHYSICAL,
ALLOC_MEM_FD,
} alloc_mem_type;
LiteNativeTensorHandleImpl(const TensorSpec& tensor_spec, void* data, uint32_t size, uint32_t device_id);
virtual ~LiteNativeTensorHandleImpl();
bool CopyDataToTensor(const void* data, uint32_t size_in_bytes) override;
bool CopyDataFromTensor(void* data) override;
bool Flush();
bool Invalidate();
vip_buffer GetBuffer() { return tensor_buffer_; }
private:
vip_buffer tensor_buffer_;
void* handle_;
uint32_t handle_size_;
alloc_mem_type memory_type_;
};
} // namespace platform
} // namespace vx
} // namespace tim
#endif /* TIM_VX_LITE_NATIVE_DEVICE_PRIVATE_H_ */
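
A minimal usage sketch of the unified platform API declared above, to show the intended device -> executor -> executable flow. The helper name RunOnFirstDevice, the call order, and the early-return error handling are illustrative assumptions based on the declarations in this commit, not an official sample; graph construction and TensorSpec setup are assumed to happen elsewhere.

#include "tim/vx/platform/platform.h"

bool RunOnFirstDevice(const std::shared_ptr<tim::vx::Graph>& graph,
                      const tim::vx::TensorSpec& in_spec,
                      const tim::vx::TensorSpec& out_spec,
                      const void* in_data, uint32_t in_size,
                      void* out_data) {
  using namespace tim::vx::platform;
  auto devices = IDevice::Enumerate();           // Lite or Native, per build option
  if (devices.empty()) return false;
  auto executor = devices[0]->CreateExecutor();  // defaults: core 0, all cores
  auto executable = executor->Compile(graph);    // exports the graph as an NBG
  if (!executable) return false;
  auto in = executable->AllocateTensor(in_spec);
  auto out = executable->AllocateTensor(out_spec);
  executable->SetInput(in);
  executable->SetOutput(out);
  if (!executable->Verify()) return false;       // binds device, compiles NBG graph
  in->CopyDataToTensor(in_data, in_size);
  if (!executable->Trigger(false)) return false; // synchronous run
  return out->CopyDataFromTensor(out_data);
}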

View File

@ -22,9 +22,14 @@
*
*****************************************************************************/
#include "tim/vx/platform/native.h"
#include "native_device_private.h"
#include "native_private.h"
#include "context_private.h"
#include "tim/vx/ops/nbg.h"
#ifdef ENABLE_PLATFORM_LITE
#include "tim/vx/platform/lite/lite_native.h"
#endif
#include <cassert>
namespace tim {
namespace vx {
namespace platform {
@ -35,215 +40,203 @@ std::shared_ptr<IExecutable> Compile(
return executor->Compile(graph);
}
std::shared_ptr<IExecutable> CreateExecutableSet(
const std::vector<std::shared_ptr<IExecutable>>& executables) {
ExecutableSet* executable_set = new ExecutableSet(executables);
std::shared_ptr<IExecutable> executable(executable_set);
return executable;
NativeDeviceImpl::NativeDeviceImpl(device_id_t id, uint32_t core_count) {
device_id_ = id;
core_count_ = core_count;
}
std::vector<std::shared_ptr<IDevice>> IDevice::Enumerate() {
#ifdef ENABLE_PLATFORM_LITE
auto devices = tim::vx::platform::LiteNativeDevice::Enumerate();
#else
auto devices = tim::vx::platform::NativeDevice::Enumerate();
#endif
return devices;
}
IDevice::device_id_t IDevice::Id() const { return device_id_; }
void IDevice::RemoteReset() {}
NativeDeviceImpl::NativeDeviceImpl(device_id_t id) {
vip_device_ = std::make_unique<vip::IDevice>(id);
device_id_ = id;
}
bool NativeDeviceImpl::Submit(const std::shared_ptr<Graph>& graph) {
GraphImpl* graphimp =
dynamic_cast<GraphImpl*>(graph.get()); // hack to downcast
vsi_graph_v_.push_back(graphimp->graph());
(void)graph;
return true;
}
bool NativeDeviceImpl::Trigger(bool async, async_callback cb) {
// extract graph from tasks
(void)async;
bool status = false;
while (!vsi_graph_v_.empty()) {
auto task = vsi_graph_v_.front();
vsi_graph_v_.erase(vsi_graph_v_.begin());
status = vip_device_->GraphSubmit(task, cb, NULL);
}
return status;
(void)cb;
return true;
}
void NativeDeviceImpl::WaitDeviceIdle() { vip_device_->WaitThreadIdle(); }
void NativeDeviceImpl::WaitDeviceIdle() {}
bool NativeDeviceImpl::DeviceExit() { return vip_device_->ThreadExit(); }
bool NativeDeviceImpl::DeviceExit() { return true; }
std::shared_ptr<IExecutor> NativeDeviceImpl::CreateExecutor(const int32_t core_index,
const int32_t core_count,
const std::shared_ptr<Context>& context) {
std::shared_ptr<IDevice> this_sp = shared_from_this();
auto executor = std::make_shared<NativeExecutorImpl>(this_sp, core_count, core_index, context);
return executor;
}
std::vector<std::shared_ptr<IDevice>> NativeDevice::Enumerate() {
std::vector<std::shared_ptr<IDevice>> device_v;
device_id_t deviceCount = 0;
vsi_nn_context_t context;
context = vsi_nn_CreateContext();
vsi_nn_context_t context = vsi_nn_CreateContext();
vsi_size_t deviceCount = 0;
#ifdef VSI_DEVICE_SUPPORT
vsi_nn_device_t vsi_devices[VSI_MAX_DEVICES] = {0};
vsi_status status = VSI_FAILURE;
status = vsi_nn_GetDevices(context, vsi_devices, &deviceCount);
if (status != VSI_SUCCESS) {
VSILOGE("Failed to get device count");
return device_v;
}
for (vsi_size_t i = 0; i < deviceCount; i++) {
vsi_size_t available_core_count = 0;
vsi_nn_GetDeviceCoreCount(vsi_devices[i], &available_core_count);
auto local_device = std::make_shared<NativeDeviceImpl>(i, available_core_count);
device_v.push_back(local_device);
}
#else
vxQueryContext(context->c, VX_CONTEXT_DEVICE_COUNT_VIV, &deviceCount,
sizeof(deviceCount));
std::cout << "Device count = " << deviceCount << std::endl;
for (device_id_t i = 0; i < deviceCount; i++) {
IDevice* local_device = new NativeDeviceImpl(i);
std::shared_ptr<IDevice> local_device_sp(local_device);
device_v.push_back(local_device_sp);
auto local_device = std::make_shared<NativeDeviceImpl>(i, 0);
device_v.push_back(local_device);
}
VSILOGE("VSI device API is not supportted, please upgrade Vivant SDK version >= 6.4.22 && ovxlib >= 1.2.26 !");
#endif
vsi_nn_ReleaseContext(&context);
return device_v;
}
std::shared_ptr<Graph> IExecutable::NBGraph() const { return nb_graph_; }
std::shared_ptr<IExecutor> IExecutable::Executor() const {
auto executor = executor_.lock();
if (!executor) {
std::cout << "Executor unable to lock weak_ptr";
}
return executor;
}
NativeExecutable::NativeExecutable(const std::shared_ptr<IExecutor>& executor,
NativeExecutableImpl::NativeExecutableImpl(const std::shared_ptr<IExecutor>& executor,
const std::vector<char>& nb_buf,
size_t inputs, size_t outputs) {
CompileOption opt;
opt.setDeviceId(executor->Device()->Id());
executor_ = executor;
context_ = executor->Contex();
nb_graph_ = context_->CreateGraph(opt);
nb_graph_ = context_->CreateGraph();
nb_buf_ = nb_buf;
nb_node_ = nb_graph_->CreateOperation<tim::vx::ops::NBG>(nb_buf_.data(),
inputs, outputs);
}
void NativeExecutable::SetInput(const std::shared_ptr<ITensorHandle>& th) {
void NativeExecutableImpl::SetInput(const std::shared_ptr<ITensorHandle>& th) {
nb_node_->BindInput(th->GetTensor());
input_handles_.push_back(th);
}
void NativeExecutable::SetOutput(const std::shared_ptr<ITensorHandle>& th) {
void NativeExecutableImpl::SetInputs(const std::vector<std::shared_ptr<ITensorHandle>>& ths) {
for (auto& t : ths) {
SetInput(t);
}
}
void NativeExecutableImpl::SetOutput(const std::shared_ptr<ITensorHandle>& th) {
nb_node_->BindOutput(th->GetTensor());
output_handles_.push_back(th);
}
void NativeExecutable::GetOutput(
const std::vector<std::shared_ptr<ITensorHandle>>& th) {
(void)th;
void NativeExecutableImpl::SetOutputs(const std::vector<std::shared_ptr<ITensorHandle>>& ths) {
for (auto& t : ths) {
SetOutput(t);
}
}
bool NativeExecutable::Submit(const std::shared_ptr<IExecutable>& ref,
bool NativeExecutableImpl::Submit(const std::shared_ptr<IExecutable>& ref,
bool after) {
bool status = false;
std::shared_ptr<IExecutable> executable = shared_from_this();
status = Executor()->Submit(executable, ref, after);
std::shared_ptr<NativeExecutorImpl> executor = std::dynamic_pointer_cast<NativeExecutorImpl>(executor_.lock());
status = executor->Submit(executable, ref, after);
return status;
}
bool NativeExecutable::Trigger(bool async) {
bool NativeExecutableImpl::Trigger(bool async) {
(void)async;
bool status = false;
auto device = Executor()->Device();
device->Submit(nb_graph_);
status = device->Trigger();
device->WaitDeviceIdle();
bool status = nb_graph_->Run();
return status;
}
std::shared_ptr<ITensorHandle> NativeExecutable::AllocateTensor(
const TensorSpec& tensor_spec) {
auto tensor = nb_graph_->CreateTensor(tensor_spec);
ITensorHandle* tensor_handle = new NativeTensorHandle(tensor);
std::shared_ptr<ITensorHandle> tensor_handle_sp(tensor_handle);
return tensor_handle_sp;
std::shared_ptr<ITensorHandle> NativeExecutableImpl::AllocateTensor(const TensorSpec& tensor_spec,
void* data, uint32_t size) {
(void)size;
auto tensor = nb_graph_->CreateTensor(tensor_spec, data);
return std::make_shared<NativeTensorHandleImpl>(tensor);
}
bool NativeExecutable::Verify() { return nb_graph_->Compile(); }
ExecutableSet::ExecutableSet(
const std::vector<std::shared_ptr<IExecutable>>& executables) {
executables_ = executables;
executor_ = executables[0]->Executor();
}
void ExecutableSet::SetInput(const std::shared_ptr<ITensorHandle>& th) {
(void)th;
}
void ExecutableSet::SetOutput(const std::shared_ptr<ITensorHandle>& th) {
(void)th;
}
void ExecutableSet::GetOutput(
const std::vector<std::shared_ptr<ITensorHandle>>& th) {
(void)th;
}
bool ExecutableSet::Submit(const std::shared_ptr<IExecutable>& ref,
bool after) {
bool status = false;
std::shared_ptr<IExecutable> executable = shared_from_this();
status = Executor()->Submit(executable, ref, after);
return status;
}
bool ExecutableSet::Trigger(bool async) {
(void)async;
bool status = false;
auto device = Executor()->Device();
for (auto executable : executables_) {
device->Submit(executable->NBGraph());
bool NativeExecutableImpl::Verify() {
std::shared_ptr<NativeExecutorImpl> executor = std::dynamic_pointer_cast<NativeExecutorImpl>(executor_.lock());
bool success = executor->BindDevices(NBGraph());
if (success == false) {
VSILOGE("Executable bind device failed");
return false;
}
status = device->Trigger();
device->WaitDeviceIdle();
return status;
}
std::shared_ptr<ITensorHandle> ExecutableSet::AllocateTensor(
const TensorSpec& tensor_spec) {
std::shared_ptr<ITensorHandle> tensor_handle_sp;
(void)tensor_spec;
return tensor_handle_sp;
}
std::vector<std::shared_ptr<IExecutable>> ExecutableSet::Executables() const {
return executables_;
}
bool ExecutableSet::Verify() {
bool status = false;
for (auto executable : executables_) {
status = executable->Verify();
success = nb_graph_->Compile();
return success;
}
return status;
}
std::shared_ptr<Context> IExecutor::Contex() const { return context_; }
NativeExecutor::NativeExecutor(const std::shared_ptr<IDevice>& device) {
device_ = device;
context_ = Context::Create();
}
NativeExecutor::NativeExecutor(const std::shared_ptr<IDevice>& device,
NativeExecutorImpl::NativeExecutorImpl(const std::shared_ptr<IDevice>& device,
const int32_t core_count,
const int32_t core_index,
const std::shared_ptr<Context>& context) {
device_ = device;
context_ = context;
if(!context) {
context_ = Context::Create();
} else {
context_ = context;
}
auto fixed_core_count = core_count;
int32_t fixed_core_index = core_index;
int32_t total_core_count = (int32_t)device_->CoreCount();
if (fixed_core_index < 0) {
fixed_core_index = 0;
}
if (fixed_core_index > total_core_count - 1) {
VSILOGE("Core index is larger than total core count");
assert(false);
}
if (fixed_core_count <= 0) {
fixed_core_count = total_core_count - fixed_core_index;
}
if (fixed_core_index + fixed_core_count > total_core_count) {
fixed_core_count = total_core_count - fixed_core_index;
VSILOGW(
"core_index + core_count is larger than the total core count; fixing core count to %d", fixed_core_count);
}
core_index_ = (uint32_t)fixed_core_index;
core_count_ = (uint32_t)fixed_core_count;
#ifdef VSI_DEVICE_SUPPORT
vsi_nn_device_t vsi_devices[VSI_MAX_DEVICES] = {0};
vsi_size_t num_devices = 0;
auto ctx = dynamic_cast<ContextImpl*>(context_.get());
vsi_nn_GetDevices(ctx->context(), vsi_devices, &num_devices);
vsi_nn_CreateSubDevice(vsi_devices[device_->Id()], core_index_, core_count_, &sub_devices_);
#endif
}
bool NativeExecutor::Submit(const std::shared_ptr<IExecutable>& executable,
bool NativeExecutorImpl::Submit(const std::shared_ptr<IExecutable>& executable,
const std::shared_ptr<IExecutable>& ref,
bool after) {
bool success = false;
success = executable->Verify();
if (success == false) {
std::cout << "Executable NBG compile failed";
if(success == false) {
VSILOGE("Executable NBG compile failed");
return false;
}
if (executable == ref) {
if(executable == ref) {
tasks_.push_back(executable);
return true;
}
for (size_t i = 0; i < tasks_.size(); i++) {
if (tasks_[i].lock() == ref) {
if (after == true) {
for(size_t i = 0; i < tasks_.size(); i++) {
if(tasks_[i].lock() == ref) {
if(after == true) {
tasks_.insert(tasks_.begin() + i + 1, executable);
success = true;
break;
@ -257,59 +250,81 @@ bool NativeExecutor::Submit(const std::shared_ptr<IExecutable>& executable,
return success;
}
bool NativeExecutor::Trigger(bool async) {
bool NativeExecutorImpl::Trigger(bool async) {
(void)async;
while (!tasks_.empty()) {
bool ret = false;
while(!tasks_.empty()) {
auto task = tasks_.front();
tasks_.erase(tasks_.begin());
auto task_ = task.lock();
if (!task_) {
std::cout << "Task unable to lock weak_ptr";
auto task_tmp = task.lock();
if(!task_tmp) {
VSILOGE("Task unable to lock weak_ptr");
return false;
}
task_->Trigger();
ret = task_tmp->Trigger();
}
device_->WaitDeviceIdle();
return true;
return ret;
}
std::shared_ptr<IExecutable> NativeExecutor::Compile(
std::shared_ptr<IExecutable> NativeExecutorImpl::Compile(
const std::shared_ptr<Graph>& graph) {
CompileOption option;
option.setDeviceId(device_->Id());
graph->SetCompileOption(option);
bool ret = BindDevices(graph);
if(!ret) {
return nullptr;
}
size_t bin_size = -1;
graph->CompileToBinary(nullptr, &bin_size);
ret = graph->CompileToBinary(nullptr, &bin_size);
if(!ret) {
return nullptr;
}
std::vector<char> nb_buf;
nb_buf.resize(bin_size);
size_t inputs = graph->InputsTensor().size();
size_t outputs = graph->OutputsTensor().size();
graph->CompileToBinary(nb_buf.data(), &bin_size);
std::shared_ptr<IExecutor> this_sp = shared_from_this();
IExecutable* executable =
new NativeExecutable(this_sp, nb_buf, inputs, outputs);
std::shared_ptr<IExecutable> executable_sp(executable);
return executable_sp;
ret = graph->CompileToBinary(nb_buf.data(), &bin_size);
if(!ret) {
return nullptr;
}
std::shared_ptr<NativeExecutorImpl> this_sp = shared_from_this();
auto executable = std::make_shared<NativeExecutableImpl>(this_sp, nb_buf, inputs, outputs);
return executable;
}
std::shared_ptr<IDevice> IExecutor::Device() const { return device_; }
std::shared_ptr<Tensor> ITensorHandle::GetTensor() const { return tensor_; }
bool NativeExecutorImpl::BindDevices(const std::shared_ptr<Graph>& graph) {
vsi_status status = VSI_SUCCESS;
#ifdef VSI_DEVICE_SUPPORT
GraphImpl* graphimp = dynamic_cast<GraphImpl*>(graph.get());
status = vsi_nn_BindDevices(graphimp->graph(), 1, &sub_devices_);
#else
CompileOption option;
option.setDeviceId(device_->Id());
graph->SetCompileOption(option);
#endif
return status == VSI_SUCCESS;
}
NativeTensorHandle::NativeTensorHandle(const std::shared_ptr<Tensor>& tensor) {
NativeTensorHandleImpl::NativeTensorHandleImpl(const std::shared_ptr<Tensor>& tensor) {
tensor_ = tensor;
spec_ = tensor->GetSpec();
}
bool NativeTensorHandle::CopyDataToTensor(const void* data,
bool NativeTensorHandleImpl::CopyDataToTensor(const void* data,
uint32_t size_in_bytes) {
return tensor_->CopyDataToTensor(data, size_in_bytes);
}
bool NativeTensorHandle::CopyDataFromTensor(void* data) {
bool NativeTensorHandleImpl::CopyDataFromTensor(void* data) {
return tensor_->CopyDataFromTensor(data);
}
} // namespace platform
} // namespace vx
} // namespace tim
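
As a hedged illustration of the core-partitioning logic above (the index/count clamping in NativeExecutorImpl and, with VSI_DEVICE_SUPPORT, the vsi_nn_CreateSubDevice path), splitting one multi-core device between two executors might look like the sketch below. The core numbers are assumptions, and it is assumed that CreateExecutor is exposed on the IDevice returned by Enumerate.

auto devices = tim::vx::platform::IDevice::Enumerate();
if (!devices.empty()) {
  auto dev = devices[0];
  // Executor A gets core 0 only.
  auto exec_a = dev->CreateExecutor(/*core_index=*/0, /*core_count=*/1);
  // Executor B gets the remaining cores: core_count = -1 means
  // "every core from core_index to the last one".
  auto exec_b = dev->CreateExecutor(/*core_index=*/1, /*core_count=*/-1);
  // Each executor then compiles and triggers its own executables
  // independently on its core subset.
}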

View File

@ -1,58 +0,0 @@
/****************************************************************************
*
* Copyright (c) 2020-2023 Vivante Corporation
*
* Permission is hereby granted, free of charge, to any person obtaining a
* copy of this software and associated documentation files (the "Software"),
* to deal in the Software without restriction, including without limitation
* the rights to use, copy, modify, merge, publish, distribute, sublicense,
* and/or sell copies of the Software, and to permit persons to whom the
* Software is furnished to do so, subject to the following conditions:
*
* The above copyright notice and this permission notice shall be included in
* all copies or substantial portions of the Software.
*
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
* AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
* DEALINGS IN THE SOFTWARE.
*
*****************************************************************************/
#ifndef TIM_VX_NATIVE_DEVICE_PRIVATE_H_
#define TIM_VX_NATIVE_DEVICE_PRIVATE_H_
#include "tim/vx/platform/native.h"
#include "vip/virtual_device.h"
#include "graph_private.h"
namespace tim {
namespace vx {
class GraphImpl;
namespace platform {
class NativeDeviceImpl : public NativeDevice {
public:
NativeDeviceImpl(device_id_t id);
~NativeDeviceImpl(){};
bool Submit(const std::shared_ptr<tim::vx::Graph>& graph) override;
bool Trigger(bool async = false, async_callback cb = NULL) override;
bool DeviceExit() override;
void WaitDeviceIdle() override;
protected:
std::unique_ptr<vip::IDevice> vip_device_;
std::vector<vsi_nn_graph_t*> vsi_graph_v_;
};
} // namespace platform
} // namespace vx
} // namespace tim
#endif /* TIM_VX_NATIVE_DEVICE_PRIVATE_H_*/

View File

@ -0,0 +1,106 @@
/****************************************************************************
*
* Copyright (c) 2020-2025 Vivante Corporation
*
* Permission is hereby granted, free of charge, to any person obtaining a
* copy of this software and associated documentation files (the "Software"),
* to deal in the Software without restriction, including without limitation
* the rights to use, copy, modify, merge, publish, distribute, sublicense,
* and/or sell copies of the Software, and to permit persons to whom the
* Software is furnished to do so, subject to the following conditions:
*
* The above copyright notice and this permission notice shall be included in
* all copies or substantial portions of the Software.
*
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
* AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
* DEALINGS IN THE SOFTWARE.
*
*****************************************************************************/
#ifndef TIM_VX_NATIVE_DEVICE_PRIVATE_H_
#define TIM_VX_NATIVE_DEVICE_PRIVATE_H_
#include "tim/vx/platform/native.h"
#include "vip/virtual_device.h"
#include "graph_private.h"
namespace tim {
namespace vx {
class GraphImpl;
namespace platform {
class NativeDeviceImpl : public NativeDevice,
public std::enable_shared_from_this<NativeDeviceImpl> {
public:
NativeDeviceImpl(device_id_t id, uint32_t core_count);
~NativeDeviceImpl(){};
bool Submit(const std::shared_ptr<tim::vx::Graph>& graph) override;
bool Trigger(bool async = false, async_callback cb = NULL) override;
bool DeviceExit() override;
void WaitDeviceIdle() override;
std::shared_ptr<IExecutor> CreateExecutor(const int32_t core_index = 0,
const int32_t core_count = -1,
const std::shared_ptr<Context>& context = nullptr) override;
};
class NativeExecutableImpl : public NativeExecutable {
public:
NativeExecutableImpl(const std::shared_ptr<IExecutor>& executor,
const std::vector<char>& nb_buf, size_t inputs,
size_t outputs);
~NativeExecutableImpl() {};
void SetInput(const std::shared_ptr<ITensorHandle>& th) override;
void SetOutput(const std::shared_ptr<ITensorHandle>& th) override;
void SetInputs(const std::vector<std::shared_ptr<ITensorHandle>>& ths) override;
void SetOutputs(const std::vector<std::shared_ptr<ITensorHandle>>& ths) override;
bool Submit(const std::shared_ptr<IExecutable>& ref, bool after = true) override;
bool Trigger(bool async = false) override;
std::shared_ptr<ITensorHandle> AllocateTensor(const TensorSpec& tensor_spec,
void* data = nullptr, uint32_t size = 0) override;
bool Verify() override;
protected:
std::shared_ptr<tim::vx::ops::NBG> nb_node_;
std::vector<char> nb_buf_;
};
class NativeExecutorImpl : public NativeExecutor,
public std::enable_shared_from_this<NativeExecutorImpl> {
public:
NativeExecutorImpl(const std::shared_ptr<IDevice>& device,
const int32_t core_count = -1,
const int32_t core_index = 0,
const std::shared_ptr<Context>& context = nullptr);
~NativeExecutorImpl(){};
bool Submit(const std::shared_ptr<IExecutable>& executable,
const std::shared_ptr<IExecutable>& ref,
bool after = true) override;
bool Trigger(bool async = false) override;
std::shared_ptr<IExecutable> Compile(const std::shared_ptr<Graph>& graph) override;
bool BindDevices(const std::shared_ptr<Graph>& graph);
private:
#ifdef VSI_DEVICE_SUPPORT
vsi_nn_device_t sub_devices_;
#endif
};
class NativeTensorHandleImpl : public NativeTensorHandle {
public:
NativeTensorHandleImpl(const std::shared_ptr<Tensor>& tensor);
bool CopyDataToTensor(const void* data, uint32_t size_in_bytes) override;
bool CopyDataFromTensor(void* data) override;
};
} // namespace platform
} // namespace vx
} // namespace tim
#endif /* TIM_VX_NATIVE_DEVICE_PRIVATE_H_ */
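
Finally, a minimal sketch of the two-pass NBG export pattern that NativeExecutorImpl::Compile relies on: the first CompileToBinary call with a null buffer only queries the binary size, and the second call fills a buffer of that size. The helper name ExportNBG and the include path are illustrative assumptions; `graph` is assumed to already have its compile options or sub-devices bound.

#include <memory>
#include <vector>
#include "tim/vx/graph.h"  // assumed public header for tim::vx::Graph

// Returns the NBG binary, or an empty vector on failure.
std::vector<char> ExportNBG(const std::shared_ptr<tim::vx::Graph>& graph) {
  size_t bin_size = 0;
  if (!graph->CompileToBinary(nullptr, &bin_size)) return {};  // size query
  std::vector<char> nb_buf(bin_size);
  if (!graph->CompileToBinary(nb_buf.data(), &bin_size)) return {};  // fill
  return nb_buf;
}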