diff --git a/cmake/local_sdk.cmake b/cmake/local_sdk.cmake
index a74de46..bed2a21 100644
--- a/cmake/local_sdk.cmake
+++ b/cmake/local_sdk.cmake
@@ -9,7 +9,11 @@ list(APPEND OVXDRV_INCLUDE_DIRS
 if("${CONFIG}" STREQUAL "BUILDROOT")
     set(VIV_SDK_DRIVER_PREFIX "usr/lib")
 else()
-    set(VIV_SDK_DRIVER_PREFIX "drivers")
+    if(EXISTS "${EXTERNAL_VIV_SDK}/drivers")  # quoted so a path with spaces or semicolons stays one argument
+       set(VIV_SDK_DRIVER_PREFIX "drivers")
+    else()
+       set(VIV_SDK_DRIVER_PREFIX "lib")  # fallback when the SDK has no drivers/ directory
+    endif()
 endif()
 
 message("using driver libs from ${EXTERNAL_VIV_SDK}/${VIV_SDK_DRIVER_PREFIX}")
diff --git a/include/tim/vx/platform/lite/lite_native.h b/include/tim/vx/platform/lite/lite_native.h
index b83d5fe..a9ed553 100644
--- a/include/tim/vx/platform/lite/lite_native.h
+++ b/include/tim/vx/platform/lite/lite_native.h
@@ -25,72 +25,58 @@
 #define TIM_VX_LITE_NATIVE_H_
 
 #include "tim/vx/platform/platform.h"
-#include "vip_lite.h"
-#include "nbg_linker.h"
 
 namespace tim {
 namespace vx {
 namespace platform {
 
-class LiteNativeExecutor
-    : public IExecutor,
-      public std::enable_shared_from_this<LiteNativeExecutor> {
+class LiteNativeDevice : public IDevice {
  public:
-  LiteNativeExecutor(const std::shared_ptr<IDevice>& device);
-  virtual ~LiteNativeExecutor();
-  bool Submit(const std::shared_ptr<IExecutable>& executable,
-              const std::shared_ptr<IExecutable>& ref,
-              bool after = true) override;
-  bool Trigger(bool async = false) override;
-  std::shared_ptr<IExecutable> Compile(
-      const std::shared_ptr<Graph>& graph) override;
-
- private:
-  vip_task_descriptor_t* task_descriptor_;
-  vip_database database_;
+  ~LiteNativeDevice() override = default;
+  bool Submit(const std::shared_ptr<Graph>& graph) override = 0;
+  bool Trigger(bool async = false, async_callback cb = NULL) override = 0;
+  bool DeviceExit() override = 0;
+  void WaitDeviceIdle() override = 0;
+  std::shared_ptr<IExecutor> CreateExecutor(
+      const int32_t core_index = 0, const int32_t core_count = -1,
+      const std::shared_ptr<Context>& context = nullptr) override = 0;
+  static std::vector<std::shared_ptr<IDevice>> Enumerate();  // declared here, defined out of line
+  static bool vip_initialized;  // declaration only; the definition lives outside this header
+};
+class LiteNativeExecutor : public IExecutor {
+ public:
+  ~LiteNativeExecutor() override = default;
+  // Lite-platform executor interface; every operation stays pure virtual here.
+  bool Submit(const std::shared_ptr<IExecutable>& executable,
+              const std::shared_ptr<IExecutable>& ref,
+              bool after = true) override = 0;
+  bool Trigger(bool async = false) override = 0;
+  std::shared_ptr<IExecutable> Compile(
+      const std::shared_ptr<Graph>& graph) override = 0;
 };
 
 class LiteNativeExecutable : public IExecutable {
  public:
-  LiteNativeExecutable(const std::shared_ptr<IExecutor>& executor,
-                       const std::vector<char>& nb_buf);
-  virtual ~LiteNativeExecutable();
-  void SetInput(const std::shared_ptr<ITensorHandle>& th) override;
-  void SetOutput(const std::shared_ptr<ITensorHandle>& th) override;
-  void GetOutput(
-      const std::vector<std::shared_ptr<ITensorHandle>>& th) override;
-  bool Submit(const std::shared_ptr<IExecutable>& ref, bool after) override;
-  bool Trigger(bool async) override;
-  bool Verify() override;
-  std::shared_ptr<ITensorHandle> AllocateTensor(
-      const TensorSpec& tensor_spec) override;
-
-  vip_network network_;
-
- private:
-  void SetBuffer(vip_memory_t* dst, gcvip_videomemory_t* src);
-
-  int32_t input_count_;
-  int32_t output_count_;
-
-  gcvip_videomemory_t* coeff_;
-  gcvip_videomemory_t* command_;
-  gcvip_videomemory_t* memory_pool_;
-  gcvip_videomemory_t* others_;
-  gcvip_videomemory_t* pre_command_;
+  ~LiteNativeExecutable() override = default;
+  void SetInput(const std::shared_ptr<ITensorHandle>& th) override = 0;
+  void SetOutput(const std::shared_ptr<ITensorHandle>& th) override = 0;
+  void SetInputs(const std::vector<std::shared_ptr<ITensorHandle>>& ths) override = 0;
+  void SetOutputs(const std::vector<std::shared_ptr<ITensorHandle>>& ths) override = 0;
+  bool Submit(const std::shared_ptr<IExecutable>& ref, bool after) override = 0;
+  bool Trigger(bool async) override = 0;
+  bool Verify() override = 0;
+  std::shared_ptr<ITensorHandle> AllocateTensor(
+      const TensorSpec& tensor_spec, void* data = nullptr, uint32_t size = 0) override = 0;
 };
 
 class LiteNativeTensorHandle : public ITensorHandle {
  public:
-  LiteNativeTensorHandle(const std::shared_ptr<Tensor>& tensr);
-  virtual ~LiteNativeTensorHandle();
-  bool CopyDataToTensor(const void* data, uint32_t size_in_bytes) override;
-  bool CopyDataFromTensor(void* data) override;
-
-  gcvip_videomemory_t* tensor_buffer_;
+  ~LiteNativeTensorHandle() override = default;
+  bool CopyDataToTensor(const void* data, uint32_t size_in_bytes) override = 0;
+  bool CopyDataFromTensor(void* data) override = 0;
 };
 }  // namespace platform
 }  // namespace vx
 }  // namespace tim
 
-#endif
\ No newline at end of file
+#endif
diff --git a/include/tim/vx/platform/native.h b/include/tim/vx/platform/native.h
index cecf34a..8521731 100644
--- a/include/tim/vx/platform/native.h
+++ b/include/tim/vx/platform/native.h
@@ -37,51 +37,41 @@ class NativeDevice : public IDevice {
   virtual bool Trigger(bool async = false, async_callback cb = NULL) = 0;
   virtual bool DeviceExit() = 0;
   virtual void WaitDeviceIdle() = 0;
+  virtual std::shared_ptr<IExecutor> CreateExecutor(const int32_t core_index = 0,
+                                                    const int32_t core_count = -1,
+                                                    const std::shared_ptr<Context>& context = nullptr) = 0;
   static std::vector<std::shared_ptr<IDevice>> Enumerate();
 };
 
 class NativeExecutable : public IExecutable {
  public:
-  NativeExecutable(const std::shared_ptr<IExecutor>& executor,
-                   const std::vector<char>& nb_buf, size_t inputs,
-                   size_t outputs);
-  ~NativeExecutable(){};
-  void SetInput(const std::shared_ptr<ITensorHandle>& th) override;
-  void SetOutput(const std::shared_ptr<ITensorHandle>& th) override;
-  void GetOutput(
-      const std::vector<std::shared_ptr<ITensorHandle>>& th) override;
-  bool Submit(const std::shared_ptr<IExecutable>& ref,
-              bool after = true) override;
-  bool Trigger(bool async = false) override;
-  std::shared_ptr<ITensorHandle> AllocateTensor(
-      const TensorSpec& tensor_spec) override;
-  bool Verify() override;
-
- protected:
-  std::shared_ptr<tim::vx::ops::NBG> nb_node_;
-  std::vector<char> nb_buf_;
+  ~NativeExecutable() override = default;
+  void SetInput(const std::shared_ptr<ITensorHandle>& th) override = 0;
+  void SetOutput(const std::shared_ptr<ITensorHandle>& th) override = 0;
+  void SetInputs(const std::vector<std::shared_ptr<ITensorHandle>>& ths) override = 0;
+  void SetOutputs(const std::vector<std::shared_ptr<ITensorHandle>>& ths) override = 0;
+  bool Submit(const std::shared_ptr<IExecutable>& ref,
+              bool after = true) override = 0;
+  bool Trigger(bool async = false) override = 0;
+  std::shared_ptr<ITensorHandle> AllocateTensor(
+      const TensorSpec& tensor_spec, void* data = nullptr, uint32_t size = 0) override = 0;
+  bool Verify() override = 0;
 };
 
-class NativeExecutor : public IExecutor,
-                       public std::enable_shared_from_this<NativeExecutor> {
+class NativeExecutor : public IExecutor {
  public:
-  NativeExecutor(const std::shared_ptr<IDevice>& device);
-  NativeExecutor(const std::shared_ptr<IDevice>& device,
-                 const std::shared_ptr<Context>& context);
-  ~NativeExecutor(){};
-  bool Submit(const std::shared_ptr<IExecutable>& executable,
-              const std::shared_ptr<IExecutable>& ref,
-              bool after = true) override;
-  bool Trigger(bool async = false) override;
-  std::shared_ptr<IExecutable> Compile(
-      const std::shared_ptr<Graph>& graph) override;
+  ~NativeExecutor() override = default;
+  bool Submit(const std::shared_ptr<IExecutable>& executable,
+              const std::shared_ptr<IExecutable>& ref,
+              bool after = true) override = 0;
+  bool Trigger(bool async = false) override = 0;
+  std::shared_ptr<IExecutable> Compile(const std::shared_ptr<Graph>& graph) override = 0;
 };
 
 class NativeTensorHandle : public ITensorHandle {
  public:
-  NativeTensorHandle(const std::shared_ptr<Tensor>& tensor);
-  bool CopyDataToTensor(const void* data, uint32_t size_in_bytes) override;
-  bool CopyDataFromTensor(void* data) override;
+  bool CopyDataToTensor(const void* data, uint32_t size_in_bytes) override = 0;
+  bool CopyDataFromTensor(void* data) override = 0;
 };
 
 }  // namespace platform
diff --git a/include/tim/vx/platform/platform.h b/include/tim/vx/platform/platform.h
index 263042b..94ba61c 100644
--- a/include/tim/vx/platform/platform.h
+++ b/include/tim/vx/platform/platform.h
@@ -46,15 +46,12 @@ namespace platform {
 
 class IDevice;
 class IExecutable;
-class ExecutableSet;
 class IExecutor;
 class ITensorHandle;
 
 std::shared_ptr<IExecutable> Compile(
     const std::shared_ptr<Graph>& graph,
     const std::shared_ptr<IExecutor>& executor);
-std::shared_ptr<IExecutable> CreateExecutableSet(
-    const std::vector<std::shared_ptr<IExecutable>>& executables);
 
 class IDevice {
  public:
@@ -68,17 +65,25 @@ class IDevice {
   virtual ~IDevice(){};
   virtual bool Submit(const std::shared_ptr<Graph>& graph) = 0;
   virtual bool Trigger(bool async = false, async_callback cb = NULL) = 0;
-  device_id_t Id() const;
+  device_id_t Id() const { return device_id_;};
   virtual void WaitDeviceIdle() = 0;
   virtual bool DeviceExit() = 0;
   virtual void RemoteReset();
+  uint32_t CoreCount() const {return core_count_;};
+  virtual std::shared_ptr<IExecutor> CreateExecutor(const int32_t core_index = 0,
+                                                    const int32_t core_count = -1,
+                                                    const std::shared_ptr<Context>& context = nullptr) = 0;
+  static std::vector<std::shared_ptr<IDevice>> Enumerate();
 
  protected:
   device_id_t device_id_;
+  uint32_t core_count_ = 0;  // default so CoreCount() never reads an indeterminate value
+
 };
 
 class IExecutor {
  public:
+  //using task = std::shared_ptr<IExecutable>;
   using task = std::weak_ptr<IExecutable>;
   virtual ~IExecutor(){};
   virtual bool Submit(const std::shared_ptr<IExecutable>& executable,
@@ -87,13 +92,17 @@ class IExecutor {
   virtual bool Trigger(bool async = false) = 0;  // todo: async=true
   virtual std::shared_ptr<IExecutable> Compile(
       const std::shared_ptr<Graph>& graph) = 0;
-  virtual std::shared_ptr<IDevice> Device() const;
-  virtual std::shared_ptr<Context> Contex() const;
-
+  virtual std::shared_ptr<IDevice> Device() const {return device_;};
+  virtual std::shared_ptr<Context> Contex() const {return context_;};
+  virtual uint32_t CoreIndex() const {return core_index_; };
+  virtual uint32_t CoreCount() const {return core_count_; };
  protected:
   std::vector<task> tasks_;
   std::shared_ptr<IDevice> device_;
   std::shared_ptr<Context> context_;
+  uint32_t core_index_ = 0;  // defaults keep CoreIndex()/CoreCount() from reading indeterminate values
+  uint32_t core_count_ = 0;
+
 };
 
 class IExecutable : public std::enable_shared_from_this<IExecutable> {
@@ -101,40 +110,24 @@ class IExecutable : public std::enable_shared_from_this<IExecutable> {
   virtual ~IExecutable(){};
   virtual void SetInput(const std::shared_ptr<ITensorHandle>& th) = 0;
   virtual void SetOutput(const std::shared_ptr<ITensorHandle>& th) = 0;
-  virtual void GetOutput(
-      const std::vector<std::shared_ptr<ITensorHandle>>& th) = 0;  // for remote
+  virtual void SetInputs(const std::vector<std::shared_ptr<ITensorHandle>>& ths) = 0;
+  virtual void SetOutputs(const std::vector<std::shared_ptr<ITensorHandle>>& ths) = 0;
+  virtual std::vector<std::shared_ptr<ITensorHandle>> GetOutputs() { return output_handles_;};  // was returning input_handles_
+  virtual std::vector<std::shared_ptr<ITensorHandle>> Getinputs() { return input_handles_;};
   virtual bool Submit(const std::shared_ptr<IExecutable>& ref,
                       bool after = true) = 0;
   virtual bool Trigger(bool async = false) = 0;  // todo: async=true
   virtual bool Verify() = 0;
-  virtual std::shared_ptr<Graph> NBGraph() const;
-  virtual std::shared_ptr<ITensorHandle> AllocateTensor(
-      const TensorSpec& tensor_spec) = 0;
-  virtual std::shared_ptr<IExecutor> Executor() const;
+  std::shared_ptr<Graph> NBGraph() const {return nb_graph_;};
+  virtual std::shared_ptr<ITensorHandle> AllocateTensor(const TensorSpec& tensor_spec ,
+                                                        void* data = nullptr, uint32_t size = 0) = 0;
 
  protected:
   std::weak_ptr<IExecutor> executor_;
   std::shared_ptr<Context> context_;
   std::shared_ptr<Graph> nb_graph_;
-};
-
-class ExecutableSet : public IExecutable {
- public:
-  ExecutableSet(const std::vector<std::shared_ptr<IExecutable>>& executables);
-  void SetInput(const std::shared_ptr<ITensorHandle>& th) override;
-  void SetOutput(const std::shared_ptr<ITensorHandle>& th) override;
-  void GetOutput(
-      const std::vector<std::shared_ptr<ITensorHandle>>& th) override;
-  bool Submit(const std::shared_ptr<IExecutable>& ref,
-              bool after = true) override;
-  bool Trigger(bool async = false) override;
-  bool Verify() override;
-  std::shared_ptr<ITensorHandle> AllocateTensor(
-      const TensorSpec& tensor_spec) override;
-  std::vector<std::shared_ptr<IExecutable>> Executables() const;
-
- protected:
-  std::vector<std::shared_ptr<IExecutable>> executables_;
+  std::vector<std::shared_ptr<ITensorHandle>> input_handles_;
+  std::vector<std::shared_ptr<ITensorHandle>> output_handles_;
 };
 
 class ITensorHandle {
@@ -142,13 +135,15 @@ class ITensorHandle {
   virtual ~ITensorHandle(){};
   virtual bool CopyDataToTensor(const void* data, uint32_t size_in_bytes) = 0;
   virtual bool CopyDataFromTensor(void* data) = 0;
-  virtual std::shared_ptr<Tensor> GetTensor() const;
+  virtual std::shared_ptr<Tensor> GetTensor() const { return tensor_;};
+  virtual TensorSpec& GetSpec() { return spec_;};
 
  protected:
   std::shared_ptr<Tensor> tensor_;
+  TensorSpec spec_;
 };
 
 }  // namespace platform
 }  // namespace vx
 }  // namespace tim
-#endif
\ No newline at end of file
+#endif
diff --git a/samples/CMakeLists.txt b/samples/CMakeLists.txt
index 919c0a5..301afdb 100644
--- a/samples/CMakeLists.txt
+++ b/samples/CMakeLists.txt
@@ -20,9 +20,7 @@ endif()
 if(TIM_VX_ENABLE_PLATFORM)
     add_subdirectory("lenet_multi_device")
     add_subdirectory("multi_device")
-    if(${TIM_VX_ENABLE_PLATFORM_LITE})
-        add_subdirectory("lite_multi_device")
-    endif()
+    add_subdirectory("platform_sample")
     if(TIM_VX_ENABLE_GRPC)
         add_subdirectory("grpc")
     endif()
diff --git a/samples/lenet_multi_device/CMakeLists.txt b/samples/lenet_multi_device/CMakeLists.txt
index 6658898..e62787b 100644
--- a/samples/lenet_multi_device/CMakeLists.txt
+++ b/samples/lenet_multi_device/CMakeLists.txt
@@ -11,5 +11,10 @@ target_include_directories(${TARGET_NAME} PRIVATE
     ${PROJECT_SOURCE_DIR}/include
 )
 
+target_include_directories(${TARGET_NAME} PRIVATE
+    ${CMAKE_CURRENT_SOURCE_DIR}
+    ${PROJECT_SOURCE_DIR}/include
+)
+
 install(TARGETS ${TARGET_NAME} ${TARGET_NAME}
     DESTINATION ${CMAKE_INSTALL_PREFIX}/${CMAKE_INSTALL_BINDIR})
diff --git a/samples/lenet_multi_device/lenet_multi_device.cc b/samples/lenet_multi_device/lenet_multi_device.cc
index 9ce79c9..1761e97 100644
--- a/samples/lenet_multi_device/lenet_multi_device.cc
+++ b/samples/lenet_multi_device/lenet_multi_device.cc
@@ -33,7 +33,6 @@
 #include "tim/vx/context.h"
 #include "tim/vx/graph.h"
 #include "tim/vx/platform/platform.h"
-#include "tim/vx/platform/native.h"
 
 std::vector<uint8_t> input_data = {
     0,   0,   0,   0,   0,   0,   0,   0,   6,   0,   2,   0,   0,   8,   0,
@@ -108,17 +107,17 @@ static void printTopN(const T* prob, int outputCount, int topNum) {
   }
 }
 
+
 int main(int argc, char** argv) {
   (void) argc, (void) argv;
   auto context0 = tim::vx::Context::Create();
   auto graph0 = lenet(context0);
   auto graph1 = lenet(context0);
 
-  auto devices = tim::vx::platform::NativeDevice::Enumerate();
+  auto devices = tim::vx::platform::IDevice::Enumerate();
   auto device = devices[0];
-  std::shared_ptr<tim::vx::platform::IExecutor> executor = std::make_shared<tim::vx::platform::NativeExecutor> (device);
-
-  auto executable0 = tim::vx::platform::Compile(graph0, executor);  // compile to nbg
+  auto executor = device->CreateExecutor(0,-1,context0);
+  auto executable0 = tim::vx::platform::Compile(graph0, executor);
   auto input_handle0 = executable0->AllocateTensor(graph0->InputsTensor()[0]->GetSpec());
   auto output_handle0 = executable0->AllocateTensor(graph0->OutputsTensor()[0]->GetSpec());
   executable0->SetInput(input_handle0);
@@ -127,7 +126,18 @@ int main(int argc, char** argv) {
   assert(executable0->Submit(executable0));
   executable0->Trigger();
 
-  auto executable1 = tim::vx::platform::Compile(graph1, executor);  // compile to nbg
+  std::vector<float> output_data;
+  output_data.resize(1 * 10);  // host buffer for ten float results
+  if (!output_handle0->CopyDataFromTensor(output_data.data())) {
+    std::cout << "Copy output data fail." << std::endl;
+    return -1;
+  }
+  std::cout << "executable0 out." << std::endl;
+  printTopN(output_data.data(), output_data.size(), 5);
+  output_data.assign(output_data.size(), 0);  // zero the host copy, then push the zeros to the tensor
+  output_handle0->CopyDataToTensor(output_data.data(), output_data.size() * sizeof(float));  // size is in bytes
+
+  auto executable1 = tim::vx::platform::Compile(graph1, executor);
   auto input_handle1 = executable1->AllocateTensor(graph1->InputsTensor()[0]->GetSpec());
   auto output_handle1 = executable1->AllocateTensor(graph1->OutputsTensor()[0]->GetSpec());
   executable1->SetInput(input_handle1);
@@ -136,34 +146,28 @@ int main(int argc, char** argv) {
   assert(executable1->Submit(executable0));
   executable1->Trigger();
 
+  std::vector<float> output_data1;
+  output_data1.resize(1 * 10);  // host buffer for ten float results
+  if (!output_handle1->CopyDataFromTensor(output_data1.data())) {
+    std::cout << "Copy output data fail." << std::endl;
+    return -1;
+  }
+  std::cout << "executable1 out." << std::endl;
+  printTopN(output_data1.data(), output_data1.size(), 5);
+  output_data1.assign(output_data1.size(), 0);  // zero the host copy, then push the zeros to the tensor
+  output_handle1->CopyDataToTensor(output_data1.data(), output_data1.size() * sizeof(float));  // size is in bytes
+
   executor->Submit(executable0, executable0);
   executor->Submit(executable1, executable0);
 
-  std::vector<std::shared_ptr<tim::vx::platform::IExecutable>> executables0;
-  executables0.push_back(executable0);
-  executables0.push_back(executable1);
-  auto executable_set0 = tim::vx::platform::CreateExecutableSet(executables0);
-  executor->Submit(executable_set0, executable_set0);
   executor->Trigger();
-
-  std::vector<uint8_t> input_data0;
-  input_data0.resize(28 * 28);
-  if (!input_handle0->CopyDataFromTensor(input_data0.data())) {
-    std::cout << "Copy intput data fail." << std::endl;
-    return -1;
-  }
-  printTopN(input_data0.data(), input_data0.size(), 5);
-
-  std::vector<float> output_data;
-  output_data.resize(1 * 10);
+  std::cout << "executor out." << std::endl;
   if (!output_handle0->CopyDataFromTensor(output_data.data())) {
     std::cout << "Copy output data fail." << std::endl;
     return -1;
   }
   printTopN(output_data.data(), output_data.size(), 5);
 
-  std::vector<float> output_data1;
-  output_data1.resize(1 * 10);
   if (!output_handle1->CopyDataFromTensor(output_data1.data())) {
     std::cout << "Copy output data fail." << std::endl;
     return -1;
diff --git a/samples/lite_multi_device/CMakeLists.txt b/samples/lite_multi_device/CMakeLists.txt
deleted file mode 100644
index 0356eef..0000000
--- a/samples/lite_multi_device/CMakeLists.txt
+++ /dev/null
@@ -1,13 +0,0 @@
-message("samples/lite_multi_device")
-
-set(TARGET_NAME "lite_multi_device")
-
-add_executable(${TARGET_NAME} ${CMAKE_CURRENT_SOURCE_DIR}/lite_multi_device.cc)
-
-target_link_libraries(${TARGET_NAME} PRIVATE -Wl,--whole-archive tim-vx)
-target_include_directories(${TARGET_NAME} PRIVATE
-    ${PROJECT_SOURCE_DIR}/include
-    ${PROJECT_SOURCE_DIR}/prebuilt-sdk/viplite/build/sdk/include)
-
-install(TARGETS ${TARGET_NAME} ${TARGET_NAME}
-    DESTINATION ${CMAKE_INSTALL_PREFIX}/${CMAKE_INSTALL_BINDIR})
\ No newline at end of file
diff --git a/samples/multi_device/README b/samples/multi_device/README
index 557e20d..890e417 100644
--- a/samples/multi_device/README
+++ b/samples/multi_device/README
@@ -1,15 +1,25 @@
 ## brief
-The multi_device demo uses some acuity exported tim-vx networks, and running on 4 devices of NPU using platform api.
+The multi_device demo takes several acuity-exported tim-vx networks and runs them on the cores of a multi-core NPU device using the platform API.
 
-## environment
-  export VSIMULATOR_CONFIG=VIP9400O_PID0XD9
-  export VIV_MGPU_AFFINITY="1:0"
-  export VIV_OVX_USE_MULTI_DEVICE="1:1"
-  export TIM_VX_ROOT="${workspaceFolder}/tim-vx"
+## note
+Note: if the lite platform is enabled, a dedicated VIVANTE_SDK (NO_KERNEL) is required as the NBG compiler,
+and the NPU itself is driven by the VIPLITE driver.
+
+## requirements
+Vivante SDK >= 6.4.22
+ovxlib >= 1.2.26
+viplite >= 2.0.0
 
 ## build
 cd build
-cmake .. -DCMAKE_BUILD_TYPE=Debug -DTIM_VX_BUILD_EXAMPLES=ON -DTIM_VX_ENABLE_PLATFORM=ON
+cmake .. -DCMAKE_BUILD_TYPE=Release -DTIM_VX_USE_EXTERNAL_OVXLIB=ON -DEXTERNAL_VIV_SDK=${VIVANTE_NOKERNEL_SDK_DIR} -DOVXLIB_INC=${OVXLIB_DIR}/include \
+         -DOVXLIB_LIB=/path/to/libovxlib.so -DTIM_VX_BUILD_EXAMPLES=ON -DTIM_VX_ENABLE_PLATFORM=ON -DTIM_VX_ENABLE_PLATFORM_LITE=ON -DVIP_LITE_SDK=${VIP_LITE_SDK}
+
+## environment
+# VIV_GPU_FILE specifies the NPU hardware configuration file for the NBG compiler.
+# Replace the placeholder path below with the location of your .config file.
+export VIV_GPU_FILE="/path/to/VIP9400NANOQ_PLUS_PID0X10000055.config"
+export TIM_VX_ROOT="${workspaceFolder}/tim-vx"
 
 ## run
 cd build
diff --git a/samples/multi_device/multi_device.cc b/samples/multi_device/multi_device.cc
index 6e1e772..c3f040a 100644
--- a/samples/multi_device/multi_device.cc
+++ b/samples/multi_device/multi_device.cc
@@ -35,7 +35,6 @@
 #include "tim/vx/context.h"
 #include "tim/vx/graph.h"
 #include "tim/vx/platform/platform.h"
-#include "tim/vx/platform/native.h"
 #include "vx_lenet.h"
 #include "vx_mobilenet.h"
 #include "vx_resnet50.h"
@@ -59,7 +58,7 @@ static void printTopN(const T* prob, int outputCount, int topNum) {
 }
 
 template <typename T>
-void print_topN(std::size_t size, std::shared_ptr<tim::vx::platform::ITensorHandle> handle) {
+void print_topN(std::size_t size, std::shared_ptr<tim::vx::platform::ITensorHandle> & handle) {
   std::vector<T> output_data;
   output_data.resize(size);
   if (!handle->CopyDataFromTensor(output_data.data())) {
@@ -94,7 +93,8 @@ void executor_trigger(std::shared_ptr<tim::vx::platform::IExecutor> executor) {
 }
 
 auto context = tim::vx::Context::Create();
-std::pair<std::shared_ptr<tim::vx::platform::IExecutable>, std::shared_ptr<tim::vx::platform::ITensorHandle>> generate_executable(
+std::pair<std::shared_ptr<tim::vx::platform::IExecutable>, std::shared_ptr<tim::vx::platform::ITensorHandle>>
+  generate_executable(
     std::shared_ptr<tim::vx::platform::IExecutor> executor,
     std::function<void(std::shared_ptr<tim::vx::Graph>, const char*)> construct_func,
     std::string weight_file,
@@ -114,15 +114,17 @@ std::pair<std::shared_ptr<tim::vx::platform::IExecutable>, std::shared_ptr<tim::
 
 int main(int argc, char** argv) {
   (void) argc, (void) argv;
-  auto devices = tim::vx::platform::NativeDevice::Enumerate();
+  auto devices = tim::vx::platform::IDevice::Enumerate();
   auto device0 = devices[0];
-  std::shared_ptr<tim::vx::platform::IExecutor> executor0 = std::make_shared<tim::vx::platform::NativeExecutor> (device0);
-  auto device1 = devices[1];
-  std::shared_ptr<tim::vx::platform::IExecutor> executor1 = std::make_shared<tim::vx::platform::NativeExecutor> (device1);
-  auto device2 = devices[2];
-  std::shared_ptr<tim::vx::platform::IExecutor> executor2 = std::make_shared<tim::vx::platform::NativeExecutor> (device2);
-  auto device3 = devices[3];
-  std::shared_ptr<tim::vx::platform::IExecutor> executor3 = std::make_shared<tim::vx::platform::NativeExecutor> (device3);
+  auto total_core_count = device0->CoreCount();
+  uint32_t core_index = 0;
+  auto use_core_count = 1;
+  std::vector<std::shared_ptr<tim::vx::platform::IExecutor>> executors;
+
+  for(core_index = 0; core_index < total_core_count; core_index += use_core_count) {
+    auto executor = device0->CreateExecutor(core_index,use_core_count, context);
+    executors.push_back(executor);
+  }
 
   auto root = std::getenv("TIM_VX_ROOT");
   assert(root != NULL);
@@ -142,46 +144,57 @@ int main(int argc, char** argv) {
   auto resnet50_weight_file = ROOT + "/samples/multi_device/resnet50/resnet50.export.data";
   std::function<void(std::shared_ptr<tim::vx::Graph>, const char*)> resnet50_construct_func = acuitylite::resnet50::construct_graph;
 
-  std::shared_ptr<tim::vx::platform::IExecutable> lenet_0, lenet_2, lenet_3, mobilenet_1, mobilenet_2, mobilenet_3, resnet50_0, resnet50_1;
-  std::shared_ptr<tim::vx::platform::ITensorHandle> lenet_0_outhandle, lenet_2_outhandle, lenet_3_outhandle, mobilenet_1_outhandle, mobilenet_2_outhandle, mobilenet_3_outhandle,
-    resnet50_0_outhandle, resnet50_1_outhandle;
+  auto excutor_cnt  = executors.size();
 
-  std::tie(lenet_0, lenet_0_outhandle) = generate_executable(executor0, lenet_construct_func, lenet_weight_file, lenet_input_files, lenet_input_bytes);
-  std::tie(resnet50_0, resnet50_0_outhandle) = generate_executable(executor0, resnet50_construct_func, resnet50_weight_file, resnet50_input_files, resnet50_input_bytes);
-  executor0->Submit(lenet_0, lenet_0);
-  executor0->Submit(resnet50_0, lenet_0);
+  // Each executor runs two models; the factories below build one (executable, tensor handle) pair per call.
+  auto make_lenet = [&](const std::shared_ptr<tim::vx::platform::IExecutor>& executor) {
+    return generate_executable(executor, lenet_construct_func, lenet_weight_file,
+                               lenet_input_files, lenet_input_bytes);
+  };
+  auto make_resnet = [&](const std::shared_ptr<tim::vx::platform::IExecutor>& executor) {
+    return generate_executable(executor, resnet50_construct_func, resnet50_weight_file,
+                               resnet50_input_files, resnet50_input_bytes);
+  };
+  auto make_mobilenet = [&](const std::shared_ptr<tim::vx::platform::IExecutor>& executor) {
+    return generate_executable(executor, mobilenet_construct_func, mobilenet_weight_file,
+                               mobilenet_input_files, mobilenet_input_bytes);
+  };
+  std::vector<std::pair<std::shared_ptr<tim::vx::platform::IExecutable>,
+              std::shared_ptr<tim::vx::platform::ITensorHandle>>> nets;
+  for (size_t i = 0; i < excutor_cnt; i++) {
+    // The model pairing rotates with the executor index.
+    switch (i % 3) {
+      case 0:  // lenet, then resnet
+        nets.push_back(make_lenet(executors[i]));
+        executors[i]->Submit(nets.back().first, nets.back().first);
+        nets.push_back(make_resnet(executors[i]));
+        executors[i]->Submit(nets.back().first, nets.back().first);
+        break;
+      case 1:  // resnet, then mobilenet
+        nets.push_back(make_resnet(executors[i]));
+        executors[i]->Submit(nets.back().first, nets.back().first);
+        nets.push_back(make_mobilenet(executors[i]));
+        executors[i]->Submit(nets.back().first, nets.back().first);
+        break;
+      case 2:  // mobilenet, then lenet
+        nets.push_back(make_mobilenet(executors[i]));
+        executors[i]->Submit(nets.back().first, nets.back().first);
+        nets.push_back(make_lenet(executors[i]));
+        executors[i]->Submit(nets.back().first, nets.back().first);
+        break;
+    }
+  }
+  std::vector<std::thread> threads;
+  for (const auto& executor : executors) {
+    threads.emplace_back(executor_trigger, executor);  // one trigger thread per executor
+  }
+  for (auto& worker : threads) {
+    worker.join();
+  }
 
-  std::tie(mobilenet_1, mobilenet_1_outhandle) = generate_executable(executor1, mobilenet_construct_func, mobilenet_weight_file, mobilenet_input_files, mobilenet_input_bytes);
-  std::tie(resnet50_1, resnet50_1_outhandle) = generate_executable(executor1, resnet50_construct_func, resnet50_weight_file, resnet50_input_files, resnet50_input_bytes);
-  auto executable_set1 = tim::vx::platform::CreateExecutableSet({mobilenet_1, resnet50_1});
-  executor1->Submit(executable_set1, executable_set1);
-
-  std::tie(lenet_2, lenet_2_outhandle) = generate_executable(executor2, lenet_construct_func, lenet_weight_file, lenet_input_files, lenet_input_bytes);
-  std::tie(mobilenet_2, mobilenet_2_outhandle) = generate_executable(executor2, mobilenet_construct_func, mobilenet_weight_file, mobilenet_input_files, mobilenet_input_bytes);
-  auto executable_set2 = tim::vx::platform::CreateExecutableSet({lenet_2, mobilenet_2});
-  executor2->Submit(executable_set2, executable_set2);
-
-  std::tie(lenet_3, lenet_3_outhandle) = generate_executable(executor3, lenet_construct_func, lenet_weight_file, lenet_input_files, lenet_input_bytes);
-  std::tie(mobilenet_3, mobilenet_3_outhandle) = generate_executable(executor3, mobilenet_construct_func, mobilenet_weight_file, mobilenet_input_files, mobilenet_input_bytes);
-  auto executable_set3 = tim::vx::platform::CreateExecutableSet({lenet_3, mobilenet_3});
-  executor3->Submit(executable_set3, executable_set3);
-
-  std::thread t0(executor_trigger, executor0);
-  std::thread t1(executor_trigger, executor1);
-  std::thread t2(executor_trigger, executor2);
-  std::thread t3(executor_trigger, executor3);
-  t0.join();
-  t1.join();
-  t2.join();
-  t3.join();
-
-  print_topN<float>(1 * 10, lenet_0_outhandle);
-  print_topN<float>(1 * 10, lenet_2_outhandle);
-  print_topN<float>(1 * 10, lenet_3_outhandle);
-  print_topN<float>(1 * 1001, mobilenet_1_outhandle);
-  print_topN<float>(1 * 1001, mobilenet_2_outhandle);
-  print_topN<float>(1 * 1001, mobilenet_3_outhandle);
-  print_topN<uint16_t>(1 * 1000, resnet50_0_outhandle);
-  print_topN<uint16_t>(1 * 1000, resnet50_1_outhandle);
+  for (auto& net : nets) {  // non-const reference: print_topN takes the handle by non-const reference
+    const auto element_count = net.second->GetSpec().GetElementNum();
+    print_topN<float>(element_count, net.second);
+  }
   return 0;
 }
diff --git a/samples/multi_device/multi_device_demo.cc b/samples/multi_device/multi_device_demo.cc
index dd20c3c..369569b 100644
--- a/samples/multi_device/multi_device_demo.cc
+++ b/samples/multi_device/multi_device_demo.cc
@@ -29,7 +29,7 @@
 #include "tim/vx/graph.h"
 #include "tim/vx/operation.h"
 #include "tim/vx/tensor.h"
-#include "tim/vx/platform/native.h"
+#include "tim/vx/platform/platform.h"
 
 static void printTopN() {
 }
@@ -46,9 +46,9 @@ int demo(int argc, char** argv) {
   tim::vx::TensorSpec g0_input0, g0_output0, g1_output0, g2_output0, g3_output0, g4_output0, g5_output0;
 
   // query device and get executor of devcie
-  auto devices = tim::vx::platform::NativeDevice::Enumerate();
+  auto devices = tim::vx::platform::IDevice::Enumerate();
   auto device = devices[0];
-  std::shared_ptr<tim::vx::platform::IExecutor> executor = std::make_shared<tim::vx::platform::NativeExecutor> (device);
+  auto executor = device->CreateExecutor(0,-1, context);
 
   // executable0
   auto executable0 = executor->Compile(g0);  // compile to nbg
@@ -89,33 +89,6 @@ int demo(int argc, char** argv) {
   // trigger
   executor->Trigger();  // run all submitted executables
 
-  /* 2. another way to run */
-  // executable_set0
-  std::vector<std::shared_ptr<tim::vx::platform::IExecutable>> executables0;
-  executables0.push_back(executable0);
-  auto executable_set0 = CreateExecutableSet(executables0);
-  // executable_set1
-  std::vector<std::shared_ptr<tim::vx::platform::IExecutable>> executables1;
-  executables1.push_back(executable1);
-  executables1.push_back(executable3);
-  auto executable_set1 = CreateExecutableSet(executables1);
-  // executable_set2
-  std::vector<std::shared_ptr<tim::vx::platform::IExecutable>> executables2;
-  executables2.push_back(executable2);
-  executables2.push_back(executable4);
-  auto executable_set2 = CreateExecutableSet(executables2);
-  // executable_set3
-  std::vector<std::shared_ptr<tim::vx::platform::IExecutable>> executables3;
-  executables3.push_back(executable5);
-  auto executable_set3 = CreateExecutableSet(executables3);
-  // submit executaleSets
-  executable_set0->Submit(executable_set0);
-  executable_set1->Submit(executable_set0);
-  executable_set2->Submit(executable_set1);
-  executable_set3->Submit(executable_set2);
-  // trigger
-  executor->Trigger();  // run all submitted executableSets
-
   printTopN();
 
   return 0;
diff --git a/samples/multi_device/vx_resnet50.cc b/samples/multi_device/vx_resnet50.cc
index 7011e3e..a15480e 100644
--- a/samples/multi_device/vx_resnet50.cc
+++ b/samples/multi_device/vx_resnet50.cc
@@ -1296,7 +1296,7 @@ void resnet50::construct_graph
     auto input_0 = graph->CreateTensor(input_0_spec);
 
     tim::vx::ShapeType output_229_shape({1000,1});
-    tim::vx::TensorSpec output_229_spec(tim::vx::DataType::FLOAT16, output_229_shape,
+    tim::vx::TensorSpec output_229_spec(tim::vx::DataType::FLOAT32, output_229_shape,
     tim::vx::TensorAttribute::OUTPUT);
     auto output_229 = graph->CreateTensor(output_229_spec);
 
diff --git a/samples/platform_sample/CMakeLists.txt b/samples/platform_sample/CMakeLists.txt
new file mode 100644
index 0000000..1b91e85
--- /dev/null
+++ b/samples/platform_sample/CMakeLists.txt
@@ -0,0 +1,13 @@
+message("samples/platform_sample")
+
+set(TARGET_NAME "platform_sample")
+
+add_executable(${TARGET_NAME} ${CMAKE_CURRENT_SOURCE_DIR}/platform_sample.cc)
+
+target_link_libraries(${TARGET_NAME} PRIVATE -Wl,--whole-archive tim-vx)
+target_include_directories(${TARGET_NAME} PRIVATE
+    ${CMAKE_CURRENT_SOURCE_DIR}
+    ${PROJECT_SOURCE_DIR}/include)
+
+install(TARGETS ${TARGET_NAME}  # install the sample binary; the target was listed twice before
+    DESTINATION ${CMAKE_INSTALL_PREFIX}/${CMAKE_INSTALL_BINDIR})
diff --git a/samples/platform_sample/README b/samples/platform_sample/README
new file mode 100644
index 0000000..f1df921
--- /dev/null
+++ b/samples/platform_sample/README
@@ -0,0 +1,25 @@
+## brief
+A sample showing how to use the platform API.
+
+## note
+Please note that if you have enabled the lite platform, a dedicated VIVANTE_SDK (NO_KERNEL) is required as the compiler for NBG.
+The driver for the NPU is the VIPLITE driver.
+
+## requirements
+Vivante SDK >= 6.4.22
+ovxlib >= 1.2.26
+viplite >= 2.0.0
+
+## build
+cd build
+cmake .. -DCMAKE_BUILD_TYPE=Release -DTIM_VX_USE_EXTERNAL_OVXLIB=ON -DEXTERNAL_VIV_SDK=${VIVANTE_NOKERNEL_SDK_DIR} -DOVXLIB_INC=${OVXLIB_DIR}/include \
+         -DOVXLIB_LIB=${VIVANTE_NOKERNEL_SDK_DIR}/drivers/libovxlib.so -DTIM_VX_BUILD_EXAMPLES=ON -DTIM_VX_ENABLE_PLATFORM=ON \
+         -DTIM_VX_ENABLE_PLATFORM_LITE=ON -DVIP_LITE_SDK=${VIP_LITE_SDK}
+
+## environment
+# Export VIV_GPU_FILE to specify the NPU hardware configuration file for the NBG compiler
+export VIV_GPU_FILE="/path/to/VIP9000NANOQ_PLUS_PID0X100000XX.config"
+
+## run
+cd build
+./samples/platform_sample/platform_sample
diff --git a/samples/lite_multi_device/lite_multi_device.cc b/samples/platform_sample/platform_sample.cc
similarity index 87%
rename from samples/lite_multi_device/lite_multi_device.cc
rename to samples/platform_sample/platform_sample.cc
index 51aec07..9506a9e 100644
--- a/samples/lite_multi_device/lite_multi_device.cc
+++ b/samples/platform_sample/platform_sample.cc
@@ -26,8 +26,8 @@
 #include "tim/vx/graph.h"
 #include "tim/vx/ops.h"
 #include "tim/vx/types.h"
-#include "tim/vx/platform/native.h"
-#include "tim/vx/platform/lite/lite_native.h"
+#include "tim/vx/platform/platform.h"
+
 
 int main() {
   //construct tim-vx graph
@@ -49,9 +49,15 @@ int main() {
   std::vector<int> data_vec_i0({1, 2, 3, 4});
   std::vector<int> data_vec_i1({4, 3, 2, 1});
 
-  auto devices = tim::vx::platform::NativeDevice::Enumerate();
+  auto devices = tim::vx::platform::IDevice::Enumerate();
+  std::cout << "NPU device count: " << devices.size() <<std::endl;
+  if (devices.empty()) { std::cout << "test failed: no NPU device found" << std::endl; return -1; }
   auto device = devices[0];
-  auto executor = std::make_shared<tim::vx::platform::LiteNativeExecutor>(device);
+  // core_count = -1: use every core of device 0, starting from core 0
+  std::cout << "NPU device[0] has " << device->CoreCount() << " cores" <<std::endl;
+  auto use_core_count = -1;
+  auto executor = device->CreateExecutor(0, use_core_count);
+
   auto executable = executor->Compile(graph);
   auto input0_handle = executable->AllocateTensor(input_spec);
   auto input1_handle = executable->AllocateTensor(input_spec);
@@ -73,6 +79,10 @@ int main() {
   //each output value should be "5" in this demo
   for (int i = 0; i < 4; ++i) {
     std::cout << "output value: " << data[i] << std::endl;
+    if(data[i] != 5) {
+      std::cout << "test failed" << std::endl;
+      break;
+    }
   }
   free(data);
   return 0;
diff --git a/src/tim/CMakeLists.txt b/src/tim/CMakeLists.txt
index 8b38b33..f8f36c5 100644
--- a/src/tim/CMakeLists.txt
+++ b/src/tim/CMakeLists.txt
@@ -61,8 +61,10 @@ if(TIM_VX_ENABLE_PLATFORM)
         endif()
         list(APPEND LITE_EXTERNAL_LIBS
             ${VIP_LITE_SDK}/drivers/libNBGlinker.so
-            ${VIP_LITE_SDK}/drivers/libVIPlite.so)
-        list(APPEND LITE_INC_DIRS ${VIP_LITE_SDK}/include)
+            ${VIP_LITE_SDK}/drivers/libVIPhal.so)
+        list(APPEND LITE_INC_DIRS
+            ${VIP_LITE_SDK}/include
+            ${VIP_LITE_SDK}/include/nbg_linker)
     endif()
 
     if(TIM_VX_ENABLE_GRPC)
diff --git a/src/tim/vx/platform/lite/lite_native.cc b/src/tim/vx/platform/lite/lite_native.cc
index 6ab557c..21a45af 100644
--- a/src/tim/vx/platform/lite/lite_native.cc
+++ b/src/tim/vx/platform/lite/lite_native.cc
@@ -22,36 +22,202 @@
 *
 *****************************************************************************/
 #include "tim/vx/platform/lite/lite_native.h"
+#include "lite_native_private.h"
 
 #include <cassert>
-
 #include "tim/vx/graph.h"
 #include "graph_private.h"
-#include "vsi_nn_pub.h"
+#include "context_private.h"
 
 namespace tim {
 namespace vx {
 namespace platform {
-LiteNativeExecutor::LiteNativeExecutor(const std::shared_ptr<IDevice>& device) {
+
+  LiteNetwork::LiteNetwork(vip_create_network_param_t& param) {
+    vip_create_network(&param, sizeof(param), &network_);
+  }
+  vip_status_e LiteNetwork::Query(vip_enum property, void* value) {
+    return vip_query_network(network_, property, value);
+  }
+  vip_status_e LiteNetwork::Set(vip_enum property, void* value) {
+     return vip_set_network(network_, property, value);
+  }
+  vip_status_e LiteNetwork::Prepare() {
+      return vip_prepare_network(network_);
+  }
+   vip_status_e LiteNetwork::Run() {return vip_run_network(network_);}
+
+   vip_status_e LiteNetwork::Trigger() {return vip_trigger_network(network_);}
+
+   vip_status_e LiteNetwork::Wait() {return vip_wait_network(network_);}
+
+   vip_status_e LiteNetwork::Cancel() {return vip_cancel_network(network_);}
+
+   vip_status_e LiteNetwork::QueryInput(vip_uint32_t index, vip_enum property, void* value) {
+    return vip_query_input(network_, index, property,value);
+  }
+
+  vip_status_e LiteNetwork::QueryOutput(vip_uint32_t index, vip_enum property, void* value) {
+    return vip_query_output(network_, index, property, value);
+  }
+
+  vip_status_e LiteNetwork::SetInput(vip_uint32_t index, std::shared_ptr<ITensorHandle> input) {
+    // Binds `input` as network input `index`; only lite tensor handles carry a vip_buffer.
+    auto lite_handle = std::dynamic_pointer_cast<LiteNativeTensorHandleImpl>(input);
+    if (!lite_handle) return VIP_ERROR_FAILURE;  // null or foreign handle: fail instead of dereferencing null
+    return vip_set_input(network_, index, lite_handle->GetBuffer()); }
+
+  vip_status_e LiteNetwork::SetOutput(vip_uint32_t index, std::shared_ptr<ITensorHandle> output) {
+    // Binds `output` as network output `index`; only lite tensor handles carry a vip_buffer.
+    auto lite_handle = std::dynamic_pointer_cast<LiteNativeTensorHandleImpl>(output);
+    if (!lite_handle) return VIP_ERROR_FAILURE;  // null or foreign handle: fail instead of dereferencing null
+    return vip_set_output(network_, index, lite_handle->GetBuffer()); }
+
+  LiteNetwork::~LiteNetwork(){
+    vip_finish_network(network_);
+    vip_destroy_network(network_);
+  }
+
+bool LiteNativeDevice::vip_initialized = false;
+
+LiteNativeDeviceImpl::LiteNativeDeviceImpl(device_id_t id,uint32_t core_count) {
+  device_id_ = id;
+  core_count_ = core_count;
+ }
+
+bool LiteNativeDeviceImpl::Submit(const std::shared_ptr<Graph>& graph) {
+  (void)graph;
+  return true;
+}
+
+bool LiteNativeDeviceImpl::Trigger(bool async, async_callback cb) {
+  (void)async;
+  (void)cb;
+  return true;
+}
+void LiteNativeDeviceImpl::WaitDeviceIdle() {}
+
+bool LiteNativeDeviceImpl::DeviceExit() {return false;}
+
+std::shared_ptr<IExecutor> LiteNativeDeviceImpl::CreateExecutor(const int32_t core_index,
+                                                    const int32_t core_count,
+                                                    const std::shared_ptr<Context>& context) {
+  std::shared_ptr<IDevice> this_sp = shared_from_this();
+  auto executor = std::make_shared<LiteNativeExecutorImpl>(this_sp, core_count, core_index, context);
+  return executor;
+}
+
+std::vector<std::shared_ptr<IDevice>> LiteNativeDevice::Enumerate() {  // builds one LiteNativeDeviceImpl per device reported by the viplite driver
+  std::vector<std::shared_ptr<IDevice>> device_v;
+  device_id_t deviceCount = 0;
+  std::vector<uint32_t> core_count;  // core_count[i] is handed to device i as its core count
+  uint32_t version = 0;
+  if( !LiteNativeDevice::vip_initialized ) {  // skip vip_init() when the static flag says it already ran
+    vip_status_e status = vip_init();
+    if(status != VIP_SUCCESS) {
+      VSILOGE("Initialize viplite driver fail");
+      return device_v;  // still empty here: callers see "no devices" when the driver cannot start
+    }
+    LiteNativeDevice::vip_initialized = true;
+  }
+  version = vip_get_version();
+  if (version >= 0x00010601 ) {  // hardware is only queried for versions >= 0x00010601; otherwise deviceCount stays 0
+      vip_query_hardware(VIP_QUERY_HW_PROP_DEVICE_COUNT, sizeof(uint32_t), &deviceCount);
+      core_count.resize(deviceCount);
+      vip_query_hardware(VIP_QUERY_HW_PROP_CORE_COUNT_EACH_DEVICE,
+      sizeof(uint32_t) * core_count.size(), core_count.data());  // NOTE(review): neither query's return value is checked
+  }
+
+  for (device_id_t i = 0; i < deviceCount; i++) {
+    auto  local_device = std::make_shared<LiteNativeDeviceImpl>(i, core_count.at(i));
+    device_v.push_back(local_device);
+  }
+  return device_v;
+}
+
+int LiteNativeExecutorImpl::executor_count = 0;
+
+LiteNativeExecutorImpl::LiteNativeExecutorImpl(const std::shared_ptr<IDevice>& device,
+  const int32_t core_count, const int32_t core_index, const std::shared_ptr<Context>& context)
+ {
   device_ = device;
-  context_ = Context::Create();
-  database_ = VIP_NULL;
+  context_ = context;
+  if(context_ == nullptr) {
+    context_ = tim::vx::Context::Create();
+  }
+  auto fixed_core_count = core_count;
+  int32_t fixed_core_index = core_index;
+  vip_status_e status  = VIP_SUCCESS;
+  // Bring the viplite driver up if nobody has done so yet (normally Enumerate() already did).
+  if( !LiteNativeDevice::vip_initialized ) {
+    status = vip_init();
+    if(status != VIP_SUCCESS) { throw "Initialize viplite driver fail"; }
+    LiteNativeDevice::vip_initialized = true;  // record it so later callers do not call vip_init() again
+  }
+  int32_t total_core_count  = (int32_t)device->CoreCount();
+  if (fixed_core_index < 0)
+  {
+    fixed_core_index = 0;
+  }
+  if (fixed_core_index > total_core_count - 1){
+     throw "Core index is larger than total core count.";
+  }
+  if (fixed_core_count <= 0 ) {
+    fixed_core_count = total_core_count - fixed_core_index;
+  }
 
-  vip_init();
-  vip_query_database(&database_);
-  nbg_linker_init(database_);
+  if (fixed_core_index + fixed_core_count > total_core_count) {
+    fixed_core_count = total_core_count - fixed_core_index;
+    VSILOGW(
+        "Core_index + core_count is larger than total core count. Fix core "
+        "count to %d",
+        fixed_core_count);
+  }
+  core_index_ = (uint32_t)fixed_core_index;
+  core_count_ = (uint32_t)fixed_core_count;
+
+#ifdef VSI_DEVICE_SUPPORT
+  vsi_nn_device_t  vsi_devices[VSI_MAX_DEVICES] = {0};
+  vsi_size_t num_devices = 0;
+  vsi_size_t available_core_count = 0;
+  auto ctx = dynamic_cast<ContextImpl*>(context_.get());
+  vsi_nn_GetDevices(ctx->context(), vsi_devices, &num_devices);
+
+  //Always use device 0 to compile NBG.
+  vsi_nn_GetDeviceCoreCount(vsi_devices[0], &available_core_count);
+
+  if(core_index_ + core_count_ > (uint32_t)available_core_count) {
+      VSILOGE("the used core count is larger than compiler available core count");
+      assert(false);
+  }
+  vsi_nn_CreateSubDevice(vsi_devices[0], core_index_, core_count_, &sub_device_);
+#else
+  VSILOGE("device is not supported!");
+  assert(false);
+#endif
+
+  executor_count++;
 }
 
-LiteNativeExecutor::~LiteNativeExecutor() {
-  nbg_destroy_task(task_descriptor_);
-  nbg_linker_destroy();
-  vip_destroy();
+// Releases this executor's sub-device and shuts the viplite driver down with the last executor.
+LiteNativeExecutorImpl::~LiteNativeExecutorImpl() {
+#ifdef VSI_DEVICE_SUPPORT
+  if(sub_device_) vsi_nn_ReleaseDevice(&sub_device_);
+#endif
+  executor_count--;
+  // Clear the flag as well; otherwise Enumerate()/new executors would skip vip_init() on a destroyed driver.
+  if(executor_count <1) { vip_destroy(); LiteNativeDevice::vip_initialized = false; }
 }
 
-bool LiteNativeExecutor::Submit(const std::shared_ptr<IExecutable>& executable,
+bool LiteNativeExecutorImpl::Submit(const std::shared_ptr<IExecutable>& executable,
                                 const std::shared_ptr<IExecutable>& ref,
                                 bool after) {
   bool success = false;
+  success = executable->Verify();
+  if (success == false) {
+    VSILOGE("Executable NBG compile failed");
+    return false;
+  }
   if (executable == ref) {
     tasks_.push_back(executable);
     return true;
@@ -72,239 +238,285 @@ bool LiteNativeExecutor::Submit(const std::shared_ptr<IExecutable>& executable,
   return success;
 }
 
-bool LiteNativeExecutor::Trigger(bool async) {
+bool LiteNativeExecutorImpl::Trigger(bool async) {
   (void)async;
-  vip_status_e status = VIP_SUCCESS;
-  std::vector<vip_network> networks;
-  for (auto exe : tasks_) {
-    auto task = exe.lock();
-    task->Verify();
-    vip_network& network =
-        std::dynamic_pointer_cast<LiteNativeExecutable>(task)->network_;
-    networks.push_back(std::move(network));
-  }
-  status = nbg_create_task(networks.size(), networks.data(), &task_descriptor_);
-  if (status != VIP_SUCCESS) {
-    VSILOGE("create task descriptor fail");
-    return false;
-  }
-  status = vip_trigger_task(task_descriptor_);
-  if (status != VIP_SUCCESS) {
-    VSILOGE("trigger task descriptor fail");
-    return false;
-  }
-  status = vip_wait_task(task_descriptor_);
-  if (status != VIP_SUCCESS) {
-    VSILOGE("wait task descriptor fail");
-    // nbg_gen_capture(networks.size(), networks.data());
-    return false;
+  while (!tasks_.empty()) {
+    // Pin the head task (the queue holds weak references) before dropping it from the queue.
+    auto task_tmp = tasks_.front().lock();
+    tasks_.erase(tasks_.begin());
+    if (!task_tmp) {
+      VSILOGE("Task is empty");
+      return false;
+    }
+    if (!task_tmp->Trigger()) return false;  // surface a failed run instead of always reporting success
   }
   return true;
 }
 
-std::shared_ptr<IExecutable> LiteNativeExecutor::Compile(
+std::shared_ptr<IExecutable> LiteNativeExecutorImpl::Compile(
     const std::shared_ptr<Graph>& graph) {
-  GraphImpl* graphimp = dynamic_cast<GraphImpl*>(graph.get());
-  IDevice::device_id_t id = device_->Id();
-  vxSetGraphAttribute(graphimp->graph()->g, VX_GRAPH_DEVICE_INDEX_VIV,
-                      (void*)(&id), sizeof(id));
   size_t bin_size = -1;
-  graph->CompileToBinary(nullptr, &bin_size);
   std::vector<char> nb_buf;
+#ifdef VSI_DEVICE_SUPPORT
+  GraphImpl* graphimp = dynamic_cast<GraphImpl*>(graph.get());
+  vsi_nn_BindDevices(graphimp->graph(), 1, &sub_device_);
+#endif
+  if(!graph->CompileToBinary(nullptr, &bin_size)) { VSILOGE("Compile fail"); return nullptr; }  // size query failed: bin_size is still (size_t)-1, do not resize
   nb_buf.resize(bin_size);
-  graph->CompileToBinary(nb_buf.data(), &bin_size);
-  return std::make_shared<LiteNativeExecutable>(shared_from_this(), nb_buf);
+  // Second pass fills the buffer; its own result decides success (the former `|=` masked a failure here).
+  if(!graph->CompileToBinary(nb_buf.data(), &bin_size)) {
+    VSILOGE("Compile fail");
+    return nullptr;
+  }
+
+  std::shared_ptr<IExecutor> this_sp = shared_from_this();
+  auto executable = std::make_shared<LiteNativeExecutableImpl>(this_sp, nb_buf);
+  return executable;
 }
 
-LiteNativeExecutable::LiteNativeExecutable(
+LiteNativeExecutableImpl::LiteNativeExecutableImpl(
     const std::shared_ptr<IExecutor>& executor,
     const std::vector<char>& nb_buf) {
   executor_ = executor;
-  context_ = executor->Contex();
-  nb_graph_ = context_->CreateGraph();
-  nbg_create_network(nb_buf.data(), nb_buf.size(),
-                     VIP_CREATE_NETWORK_FROM_MEMORY, &network_);
-  input_count_ = 0;
-  output_count_ = 0;
-  coeff_ = nullptr;
-  command_ = nullptr;
-  memory_pool_ = nullptr;
-  others_ = nullptr;
-  pre_command_ = nullptr;
+  context_ = nullptr;
+  nb_graph_ = nullptr;
+  vip_status_e  status = VIP_SUCCESS;
+  vip_create_network_param_t net_param;
+  device_id_ = executor_.lock()->Device()->Id();
+  auto core_index  = executor_.lock()->CoreIndex();
+  net_param.device_index = device_id_;
+  net_param.prop = VIP_NET_CREATE_PROP_FROM_NBG;
+  net_param.nbg.type = VIP_NET_CREATE_NBG_FROM_MEMORY;
+  net_param.nbg.memory.nbg_memory = (void*)nb_buf.data();
+  net_param.nbg.memory.nbg_size = nb_buf.size();
 
-  /* prepare vip network */
-  vip_status_e status = VIP_SUCCESS;
-  nbg_network_memory_size_t buffer_size;
-  nbg_network_memory_buffer_t buffer;
-  vip_memory_t coeff_buffer;
-  vip_memory_t cmd_buffer;
-  vip_memory_t pre_cmd_buffer;
-  vip_memory_t pool_buffer;
-  vip_memory_t others_buffer;
-  nbg_query_network(network_, VIP_NETWORK_PROP_MEMORY_SIZE, &buffer_size);
+  auto network(std::make_unique<LiteNetwork>(net_param));
 
-  vip_allocate_videomemory(buffer_size.coeff, &coeff_);
-  vip_allocate_videomemory(buffer_size.command, &command_);
-  vip_allocate_videomemory(buffer_size.memory_pool, &memory_pool_);
-  vip_allocate_videomemory(buffer_size.others, &others_);
-  vip_allocate_videomemory(buffer_size.pre_command, &pre_command_);
-
-  SetBuffer(&coeff_buffer, coeff_);
-  SetBuffer(&cmd_buffer, command_);
-  SetBuffer(&pre_cmd_buffer, pre_command_);
-  SetBuffer(&pool_buffer, memory_pool_);
-  SetBuffer(&others_buffer, others_);
-
-  buffer.coeff = &coeff_buffer;
-  buffer.command = &cmd_buffer;
-  buffer.memory_pool = &pool_buffer;
-  buffer.others = &others_buffer;
-  buffer.pre_command = &pre_cmd_buffer;
-  buffer.dma_command = nullptr;
-  status = nbg_prepare_network(network_, &buffer);
-
-  vip_flush_videomemory(coeff_, VIP_BUFFER_OPER_TYPE_FLUSH);
-  vip_flush_videomemory(command_, VIP_BUFFER_OPER_TYPE_FLUSH);
-  vip_flush_videomemory(pre_command_, VIP_BUFFER_OPER_TYPE_FLUSH);
-  vip_flush_videomemory(memory_pool_, VIP_BUFFER_OPER_TYPE_FLUSH);
-  vip_flush_videomemory(others_, VIP_BUFFER_OPER_TYPE_FLUSH);
+  lite_network_ = std::move(network);
+  status = lite_network_->Query(VIP_NETWORK_PROP_INPUT_COUNT,&input_count_);
+  if (status != VIP_SUCCESS) {
+    VSILOGE("failed to query network inputs");
+    assert(false);
+  }
+  status = lite_network_->Query(VIP_NETWORK_PROP_OUTPUT_COUNT,&output_count_);
+  if (status != VIP_SUCCESS) {
+    VSILOGE("failed to query network outputs");
+    assert(false);
+  }
 
+  status = lite_network_->Set(VIP_NETWORK_PROP_SET_CORE_INDEX,&core_index);
+  if (status != VIP_SUCCESS) {
+    VSILOGE("failed to set core index");
+    assert(false);
+  }
+   status = lite_network_->Prepare();
   if (status != VIP_SUCCESS) {
     VSILOGE("failed to prepare network");
     assert(false);
   }
 }
 
-LiteNativeExecutable::~LiteNativeExecutable() {
-  nbg_finish_network(network_);
-  nbg_destroy_network(network_);
-  if (coeff_) {
-    vip_free_videomemory(coeff_);
-    coeff_ = nullptr;
-  }
-  if (command_) {
-    vip_free_videomemory(command_);
-    command_ = nullptr;
-  }
-  if (memory_pool_) {
-    vip_free_videomemory(memory_pool_);
-    memory_pool_ = nullptr;
-  }
-  if (others_) {
-    vip_free_videomemory(others_);
-    others_ = nullptr;
-  }
-  if (pre_command_) {
-    vip_free_videomemory(pre_command_);
-    pre_command_ = nullptr;
-  }
-}
-
-void LiteNativeExecutable::SetInput(const std::shared_ptr<ITensorHandle>& th) {
+void LiteNativeExecutableImpl::SetInput(const std::shared_ptr<ITensorHandle>& th) {
   vip_status_e status = VIP_SUCCESS;
-  gcvip_videomemory_t* mem =
-      std::dynamic_pointer_cast<LiteNativeTensorHandle>(th)->tensor_buffer_;
-  vip_memory_t buffer;
-  SetBuffer(&buffer, mem);
-
-  status = nbg_set_input(network_, input_count_, &buffer);
+  int32_t input_index = input_handles_.size();
+  status = lite_network_->SetInput(input_index, th);
   if (status != VIP_SUCCESS) {
-    VSILOGE("failed to set input: %d", input_count_);
+    VSILOGE("failed to set input: %d", input_index);
     assert(false);
   }
-  ++input_count_;
+  input_handles_.push_back(th);
+}
+void LiteNativeExecutableImpl::SetInputs(const std::vector<std::shared_ptr<ITensorHandle>>& ths) {
+  for (auto th : ths) {
+    SetInput(th);
+  }
 }
 
-void LiteNativeExecutable::SetOutput(const std::shared_ptr<ITensorHandle>& th) {
+void LiteNativeExecutableImpl::SetOutput(const std::shared_ptr<ITensorHandle>& th) {
   vip_status_e status = VIP_SUCCESS;
-  gcvip_videomemory_t* mem =
-      std::dynamic_pointer_cast<LiteNativeTensorHandle>(th)->tensor_buffer_;
-  vip_memory_t buffer;
-  SetBuffer(&buffer, mem);
-
-  status = nbg_set_output(network_, output_count_, &buffer);
+  int32_t output_index = output_handles_.size();
+  status = lite_network_->SetOutput(output_index,th);
   if (status != VIP_SUCCESS) {
-    VSILOGE("failed to set output: %d", output_count_);
+    VSILOGE("failed to set output: %d", output_index);
     assert(false);
   }
-  ++output_count_;
+  output_handles_.push_back(th);
 }
 
-void LiteNativeExecutable::GetOutput(
-    const std::vector<std::shared_ptr<ITensorHandle>>& th) {
-  (void)th;
+void LiteNativeExecutableImpl::SetOutputs(const std::vector<std::shared_ptr<ITensorHandle>>& ths) {
+  for (auto th : ths) {
+    SetOutput(th);
+  }
 }
 
-bool LiteNativeExecutable::Submit(const std::shared_ptr<IExecutable>& ref,
+bool LiteNativeExecutableImpl::Submit(const std::shared_ptr<IExecutable>& ref,
                                   bool after) {
   bool status = false;
+  std::shared_ptr<LiteNativeExecutorImpl> executor =  // empty when the executor has expired or is not a lite executor
+        std::dynamic_pointer_cast<LiteNativeExecutorImpl>(executor_.lock());
   std::shared_ptr<IExecutable> executable = shared_from_this();
-  status = Executor()->Submit(executable, ref, after);
+  status = executor && executor->Submit(executable, ref, after);  // report failure instead of dereferencing null
   return status;
 }
 
-bool LiteNativeExecutable::Trigger(bool async) {
-  (void)async;
-  return false;
-}
-
-bool LiteNativeExecutable::Verify() {
-  int32_t input_count = 0;
-  nbg_query_network(network_, VIP_NETWORK_PROP_INPUT_COUNT, &input_count);
-  if (input_count != input_count_) {
-    VSILOGE("input count mismatch, required: %d, provided: %d", input_count,
-            input_count_);
-    return false;
+bool LiteNativeExecutableImpl::Trigger(bool async) {  // runs the network; true only if every driver call succeeded
+  vip_status_e status = VIP_SUCCESS;
+  if (async) {
+    status = lite_network_->Trigger();
+    if (status == VIP_SUCCESS) status = lite_network_->Wait();  // keep a trigger failure; it used to be overwritten by Wait()
+    if (status != VIP_SUCCESS) {
+      VSILOGE("trigger network fail");
+      return false;
+    }
+  } else {
+    status = lite_network_->Run();
+    if (status != VIP_SUCCESS) {
+      VSILOGE("run network fail");
+      return false;
+    }
+  }
-  int32_t output_count = 0;
-  nbg_query_network(network_, VIP_NETWORK_PROP_OUTPUT_COUNT, &output_count);
-  if (output_count != output_count_) {
-    VSILOGE("output count mismatch, required: %d, provided: %d", output_count,
-            output_count_);
-    return false;
-  }
-
   return true;
 }
 
-std::shared_ptr<ITensorHandle> LiteNativeExecutable::AllocateTensor(
-    const TensorSpec& tensor_spec) {
-  auto tensor = nb_graph_->CreateTensor(tensor_spec);
-  return std::make_shared<LiteNativeTensorHandle>(tensor);
+bool LiteNativeExecutableImpl::Verify() {  // true when the bound input/output handle counts match the network's
+  bool ret = true;
+  auto output_index = output_handles_.size();
+  auto input_index = input_handles_.size();
+  // size() yields size_t; cast the log arguments so they really match the "%d" conversions.
+  if(input_index != input_count_) {
+      VSILOGE("Network need %d inputs but gaving  %d.\n", static_cast<int>(input_count_), static_cast<int>(input_index));
+      ret = false;
+  }
+  if(output_index != output_count_) {
+     VSILOGE("Network need %d outputs but gaving  %d.\n", static_cast<int>(output_count_), static_cast<int>(output_index));
+      ret = false;
+  }
+  return ret;  // both checks always run, so one call logs every mismatch
+}
 
-void LiteNativeExecutable::SetBuffer(vip_memory_t* dst,
-                                     gcvip_videomemory_t* src) {
-  if (dst && src) {
-    dst->cpu_logical = src->cpu_logical;
-    dst->npu_physical = src->npu_physical;
-    dst->size = src->size;
+std::shared_ptr<ITensorHandle> LiteNativeExecutableImpl::AllocateTensor(const TensorSpec& tensor_spec,
+                                                                        void* data, uint32_t size) {
+  return std::make_shared<LiteNativeTensorHandleImpl>(tensor_spec, data, size, device_id_);
+}
+
+LiteNativeTensorHandleImpl::LiteNativeTensorHandleImpl(const TensorSpec& tensor_spec, void* data, uint32_t size,
+                                                       uint32_t device_id) {
+  vip_status_e  status  = VIP_ERROR_FAILURE;
+  spec_  = tensor_spec;
+  uint32_t tensor_size = tensor_spec.GetByteSize();
+  vip_buffer_create_params_t tensor_param;
+
+  uint32_t block_aligned_size = 64;
+  memory_type_ = ALLOC_MEM_NONE;
+  handle_ = nullptr;
+  handle_size_ = 0;
+  if(size > 0 && !data && tensor_size >  size ) {
+    VSILOGE("Buffer size is less than the memory size required by the tensor");
+    assert(false);
+  }
+#if 0
+  uint32_t addr_aligned_size = 256;
+  if (!data) {
+    data = vsi_nn_MallocAlignedBuffer(tensor_size,addr_aligned_size,block_aligned_size);
+    size = ((tensor_size + block_aligned_size - 1) / block_aligned_size) * block_aligned_size;
+    memory_type_ = ALLOC_MEM_INTERNAL;
+  } else {
+    memory_type_ = ALLOC_MEM_EXTERNAL;
+  }
+  handle_ = data;
+  if(!vsi_nn_IsBufferAligned((uint8_t *)handle_, addr_aligned_size)) {
+      VSILOGE("The starting address of the buffer needs to be 64-byte aligned");
+      assert(false);
+  }
+  if(size % 64 != 0) {
+      VSILOGE("The size of the buffer needs to be 64-byte aligned");
+      assert(false);
+  }
+  handle_size_ = size;
+  tensor_param.type = VIP_BUFFER_CREATE_FROM_USER_MEM;
+  tensor_param.device_index = device_id ;
+  tensor_param.src.from_handle.memory_type = VIP_BUFFER_FROM_USER_MEM_TYPE_HOST;
+  tensor_param.src.from_handle.logical_addr = handle_;
+  tensor_param.src.from_handle.size = handle_size_;
+  status = vip_create_buffer(&tensor_param,sizeof(tensor_param),&tensor_buffer_);
+#else
+  (void)data;
+  tensor_param.type = VIP_BUFFER_CREATE_ALLOC_MEM;
+  tensor_param.device_index = device_id ;
+  tensor_param.src.alloc_mem.size = tensor_size;
+  tensor_param.src.alloc_mem.align = block_aligned_size;
+  status = vip_create_buffer(&tensor_param,sizeof(tensor_param),&tensor_buffer_);
+  memory_type_ = ALLOC_MEM_VIDEOMEM;
+#endif
+  if(status != VIP_SUCCESS) {
+    if(memory_type_ == ALLOC_MEM_INTERNAL) {
+      vsi_nn_FreeAlignedBuffer((uint8_t*)handle_);
+    }
+    VSILOGE("Fail to create vip buffer.");
+    assert(false);
   }
 }
 
-LiteNativeTensorHandle::LiteNativeTensorHandle(
-    const std::shared_ptr<Tensor>& tensor) {
-  tensor_ = tensor;
-  uint32_t size = tensor->GetSpec().GetByteSize();
-  vip_allocate_videomemory(size, &tensor_buffer_);
-}
-
-LiteNativeTensorHandle::~LiteNativeTensorHandle() {
+LiteNativeTensorHandleImpl::~LiteNativeTensorHandleImpl() {
   if (tensor_buffer_) {
-    vip_free_videomemory(tensor_buffer_);
+    vip_destroy_buffer(tensor_buffer_);
     tensor_buffer_ = nullptr;
   }
+  if(memory_type_ == ALLOC_MEM_INTERNAL && handle_) {
+    vsi_nn_FreeAlignedBuffer((uint8_t*)handle_);
+
+  }
 }
 
-bool LiteNativeTensorHandle::CopyDataToTensor(const void* data,
-                                              uint32_t size_in_bytes) {
-  memcpy(tensor_buffer_->cpu_logical, data, size_in_bytes);
+bool LiteNativeTensorHandleImpl::CopyDataToTensor(const void* data,
+                                                  uint32_t size_in_bytes) {
+  if (!data) return false;  // nothing to copy from
+  const bool is_video_mem = (memory_type_ == ALLOC_MEM_VIDEOMEM);
+  // ALLOC_MEM_VIDEOMEM buffers are reached through vip_map_buffer(); other types use handle_.
+  void* handle = is_video_mem ? vip_map_buffer(tensor_buffer_) : handle_;
+  if (!handle) return false;  // no mapped address / no host buffer: do not memcpy into null
+  auto buff_size = vip_get_buffer_size(tensor_buffer_);
+  // Copy at most buff_size bytes so an oversized request cannot overrun the buffer.
+  memcpy(handle, data, buff_size > size_in_bytes ? size_in_bytes : buff_size);
+  if (is_video_mem) vip_unmap_buffer(tensor_buffer_);
+  if (!Flush()) return false;  // flush failed: report it instead of claiming success
   return true;
 }
 
-bool LiteNativeTensorHandle::CopyDataFromTensor(void* data) {
-  memcpy(data, tensor_buffer_->cpu_logical, tensor_buffer_->size);
-  return true;
+bool LiteNativeTensorHandleImpl::CopyDataFromTensor(void* data) {
+  // Copies the whole buffer (vip_get_buffer_size bytes) into `data`; Invalidate() must succeed first.
+  if (!data || !Invalidate()) {
+    return false;
+  }
+  const bool is_video_mem = (memory_type_ == ALLOC_MEM_VIDEOMEM);
+  auto buff_size = vip_get_buffer_size(tensor_buffer_);
+  // ALLOC_MEM_VIDEOMEM buffers are reached through vip_map_buffer(); other types use handle_.
+  void* handle = is_video_mem ? vip_map_buffer(tensor_buffer_) : handle_;
+  if (!handle) {
+    return false;  // no mapped address / no host buffer: do not memcpy from null
+  }
+  memcpy(data, handle, buff_size);
+  if (is_video_mem) vip_unmap_buffer(tensor_buffer_);
+  return true;
+}
+
+bool LiteNativeTensorHandleImpl::Flush() {
+  vip_status_e status = vip_flush_buffer(tensor_buffer_,VIP_BUFFER_OPER_TYPE_FLUSH);
+  if (status != VIP_SUCCESS) {
+     return false;
+  }
+  else{
+    return true;
+  }
+}
+bool LiteNativeTensorHandleImpl::Invalidate() {
+  vip_status_e status = vip_flush_buffer(tensor_buffer_,VIP_BUFFER_OPER_TYPE_INVALIDATE);
+  if (status != VIP_SUCCESS) {
+     return false;
+  }
+  else{
+    return true;
+  }
 }
 
 }  // namespace platform
diff --git a/src/tim/vx/platform/lite/lite_native_private.h b/src/tim/vx/platform/lite/lite_native_private.h
new file mode 100644
index 0000000..b371945
--- /dev/null
+++ b/src/tim/vx/platform/lite/lite_native_private.h
@@ -0,0 +1,147 @@
+/****************************************************************************
+*
+*    Copyright (c) 2020-2023 Vivante Corporation
+*
+*    Permission is hereby granted, free of charge, to any person obtaining a
+*    copy of this software and associated documentation files (the "Software"),
+*    to deal in the Software without restriction, including without limitation
+*    the rights to use, copy, modify, merge, publish, distribute, sublicense,
+*    and/or sell copies of the Software, and to permit persons to whom the
+*    Software is furnished to do so, subject to the following conditions:
+*
+*    The above copyright notice and this permission notice shall be included in
+*    all copies or substantial portions of the Software.
+*
+*    THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+*    IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+*    FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+*    AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+*    LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
+*    FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
+*    DEALINGS IN THE SOFTWARE.
+*
+*****************************************************************************/
+#ifndef TIM_VX_LITE_NATIVE_DEVICE_PRIVATE_H_
+#define TIM_VX_LITE_NATIVE_DEVICE_PRIVATE_H_
+
+#include "tim/vx/platform/lite/lite_native.h"
+#include "vip_lite.h"
+#include "vsi_nn_pub.h"
+
+
+namespace tim {
+namespace vx {
+
+namespace platform {
+
+// Holds one vip_network handle; each operation reports a vip_status_e.
+class LiteNetwork {
+ public:
+  LiteNetwork(vip_create_network_param_t& param);
+  ~LiteNetwork();
+  vip_status_e Query(vip_enum property, void* value);
+  vip_status_e Set(vip_enum property, void* value);
+  vip_status_e Prepare();
+  vip_status_e Run();
+  vip_status_e Trigger();
+  vip_status_e Wait();
+  vip_status_e Cancel();
+  // Per-index input/output queries and tensor-handle binding.
+  vip_status_e QueryInput(vip_uint32_t index, vip_enum property, void* value);
+  vip_status_e QueryOutput(vip_uint32_t index, vip_enum property, void* value);
+  vip_status_e SetInput(vip_uint32_t index, std::shared_ptr<ITensorHandle> input);
+  vip_status_e SetOutput(vip_uint32_t index, std::shared_ptr<ITensorHandle> output);
+ private:
+  vip_network network_;
+};
+
+// Private implementation of the LiteNativeDevice interface from lite_native.h.
+class LiteNativeDeviceImpl : public LiteNativeDevice,
+                             public std::enable_shared_from_this<LiteNativeDeviceImpl> {
+ public:
+  LiteNativeDeviceImpl(device_id_t id, uint32_t core_count);
+  ~LiteNativeDeviceImpl() {}
+  bool Submit(const std::shared_ptr<tim::vx::Graph>& graph) override;
+  bool Trigger(bool async = false, async_callback cb = NULL) override;
+  bool DeviceExit() override;
+  void WaitDeviceIdle() override;
+  std::shared_ptr<IExecutor> CreateExecutor(const int32_t core_index = 0,
+                                            const int32_t core_count = -1,
+                                            const std::shared_ptr<Context>& context = nullptr) override;
+};
+
+// Private LiteNativeExecutor implementation; built from a device, a core
+// index/count pair and an optional context.
+class LiteNativeExecutorImpl
+    : public LiteNativeExecutor,
+      public std::enable_shared_from_this<LiteNativeExecutorImpl> {
+ public:
+  LiteNativeExecutorImpl(const std::shared_ptr<IDevice>& device,
+                         const int32_t core_index = 0,
+                         const int32_t core_count = -1,
+                         const std::shared_ptr<Context>& context = nullptr);
+  virtual ~LiteNativeExecutorImpl();
+  bool Submit(const std::shared_ptr<IExecutable>& executable,
+              const std::shared_ptr<IExecutable>& ref, bool after = true) override;
+  bool Trigger(bool async = false) override;
+  std::shared_ptr<IExecutable> Compile(const std::shared_ptr<Graph>& graph) override;
+  static int executor_count;
+ private:
+#ifdef VSI_DEVICE_SUPPORT
+  vsi_nn_device_t sub_device_;
+#endif
+};
+
+// Private LiteNativeExecutable implementation that owns a LiteNetwork.
+class LiteNativeExecutableImpl : public LiteNativeExecutable {
+ public:
+  LiteNativeExecutableImpl(const std::shared_ptr<IExecutor>& executor,
+                           const std::vector<char>& nb_buf);
+  virtual ~LiteNativeExecutableImpl() {}
+  void SetInput(const std::shared_ptr<ITensorHandle>& th) override;
+  void SetInputs(const std::vector<std::shared_ptr<ITensorHandle>>& ths) override;
+  void SetOutput(const std::shared_ptr<ITensorHandle>& th) override;
+  void SetOutputs(const std::vector<std::shared_ptr<ITensorHandle>>& ths) override;
+  bool Submit(const std::shared_ptr<IExecutable>& ref, bool after) override;
+  bool Trigger(bool async) override;
+  bool Verify() override;
+  std::shared_ptr<ITensorHandle> AllocateTensor(const TensorSpec& tensor_spec,
+                                                void* data = nullptr, uint32_t size = 0) override;
+ private:
+  uint32_t device_id_;
+  uint32_t input_count_;
+  uint32_t output_count_;
+  std::unique_ptr<LiteNetwork> lite_network_;
+};
+
+// Private LiteNativeTensorHandle implementation backed by a vip_buffer.
+class LiteNativeTensorHandleImpl : public LiteNativeTensorHandle {
+ public:
+  // Memory kinds; the copy routines map/unmap only ALLOC_MEM_VIDEOMEM buffers.
+  enum alloc_mem_type {
+    ALLOC_MEM_NONE,
+    ALLOC_MEM_EXTERNAL,
+    ALLOC_MEM_INTERNAL,
+    ALLOC_MEM_VIDEOMEM,
+    ALLOC_MEM_PHYSICAL,
+    ALLOC_MEM_FD,
+  };
+  LiteNativeTensorHandleImpl(const TensorSpec& tensor_spec, void* data, uint32_t size, uint32_t device_id);
+  virtual ~LiteNativeTensorHandleImpl();
+  bool CopyDataToTensor(const void* data, uint32_t size_in_bytes) override;
+  bool CopyDataFromTensor(void* data) override;
+  bool Flush();
+  bool Invalidate();
+  vip_buffer GetBuffer() { return tensor_buffer_; }
+ private:
+  vip_buffer tensor_buffer_;
+  void* handle_;
+  uint32_t handle_size_;
+  alloc_mem_type memory_type_;
+};
+
+}  // namespace platform
+}  // namespace vx
+}  // namespace tim
+
+#endif /* TIM_VX_LITE_NATIVE_DEVICE_PRIVATE_H_ */
diff --git a/src/tim/vx/platform/native.cc b/src/tim/vx/platform/native.cc
index 81c9c3a..038f38d 100644
--- a/src/tim/vx/platform/native.cc
+++ b/src/tim/vx/platform/native.cc
@@ -22,9 +22,14 @@
 *
 *****************************************************************************/
 #include "tim/vx/platform/native.h"
-#include "native_device_private.h"
+#include "native_private.h"
+#include "context_private.h"
 #include "tim/vx/ops/nbg.h"
+#ifdef ENABLE_PLATFORM_LITE
+#include "tim/vx/platform/lite/lite_native.h"
+#endif
 
+#include <cassert>
 namespace tim {
 namespace vx {
 namespace platform {
@@ -35,215 +40,203 @@ std::shared_ptr<IExecutable> Compile(
   return executor->Compile(graph);
 }
 
-std::shared_ptr<IExecutable> CreateExecutableSet(
-    const std::vector<std::shared_ptr<IExecutable>>& executables) {
-  ExecutableSet* executable_set = new ExecutableSet(executables);
-  std::shared_ptr<IExecutable> executable(executable_set);
-  return executable;
+NativeDeviceImpl::NativeDeviceImpl(device_id_t id, uint32_t core_count) {
+  device_id_ = id;  // index the device was enumerated under
+  core_count_ = core_count;  // Enumerate passes 0 when VSI_DEVICE_SUPPORT is not defined
+}
+// Enumerates devices through whichever backend was selected at build time.
+std::vector<std::shared_ptr<IDevice>> IDevice::Enumerate() {
+#ifdef ENABLE_PLATFORM_LITE
+  return LiteNativeDevice::Enumerate();
+#else
+  return NativeDevice::Enumerate();
+#endif
 }
-
-IDevice::device_id_t IDevice::Id() const { return device_id_; }
 
 void IDevice::RemoteReset() {}
 
-NativeDeviceImpl::NativeDeviceImpl(device_id_t id) {
-  vip_device_ = std::make_unique<vip::IDevice>(id);
-  device_id_ = id;
-}
-
 bool NativeDeviceImpl::Submit(const std::shared_ptr<Graph>& graph) {
-  GraphImpl* graphimp =
-      dynamic_cast<GraphImpl*>(graph.get());  // hack to downcast
-  vsi_graph_v_.push_back(graphimp->graph());
+  (void)graph;  // no-op: nothing is queued on the device
   return true;
 }
 
 bool NativeDeviceImpl::Trigger(bool async, async_callback cb) {
-  // extract graph from tasks
   (void)async;
-  bool status = false;
-  while (!vsi_graph_v_.empty()) {
-    auto task = vsi_graph_v_.front();
-    vsi_graph_v_.erase(vsi_graph_v_.begin());
-    status = vip_device_->GraphSubmit(task, cb, NULL);
-  }
-  return status;
+  (void)cb;  // never invoked: Submit() queues no work on the device
+  return true;  // no-op that always reports success
 }
 
-void NativeDeviceImpl::WaitDeviceIdle() { vip_device_->WaitThreadIdle(); }
+void NativeDeviceImpl::WaitDeviceIdle() {}  // no-op: Submit/Trigger queue no work to wait for
 
-bool NativeDeviceImpl::DeviceExit() { return vip_device_->ThreadExit(); }
+bool NativeDeviceImpl::DeviceExit() { return true; }  // nothing to shut down; always reports success
+
+// NativeExecutorImpl takes (device, core_count, core_index, context), hence the swapped order below.
+std::shared_ptr<IExecutor> NativeDeviceImpl::CreateExecutor(const int32_t core_index,
+                                                            const int32_t core_count,
+                                                            const std::shared_ptr<Context>& context) {
+  const std::shared_ptr<IDevice> self = shared_from_this();
+  return std::make_shared<NativeExecutorImpl>(self, core_count, core_index, context);
+}
 
 std::vector<std::shared_ptr<IDevice>> NativeDevice::Enumerate() {
   std::vector<std::shared_ptr<IDevice>> device_v;
-  device_id_t deviceCount = 0;
-  vsi_nn_context_t context;
-  context = vsi_nn_CreateContext();
+  // A temporary context is only needed to query the devices; it is released below.
+  vsi_nn_context_t context = vsi_nn_CreateContext();
+  if (context == nullptr) return device_v;
+#ifdef VSI_DEVICE_SUPPORT
+  vsi_nn_device_t  vsi_devices[VSI_MAX_DEVICES] = {0};
+  vsi_size_t deviceCount = 0;
+  const vsi_status status = vsi_nn_GetDevices(context, vsi_devices, &deviceCount);
+  if (status != VSI_SUCCESS) {
+    VSILOGE("Get device count fail");
+    vsi_nn_ReleaseContext(&context);  // do not leak the context on the error path
+    return device_v;
+  }
+  for (vsi_size_t i = 0; i < deviceCount; i++) {
+    vsi_size_t available_core_count = 0;
+    vsi_nn_GetDeviceCoreCount(vsi_devices[i], &available_core_count);
+    device_v.push_back(std::make_shared<NativeDeviceImpl>(i, available_core_count));
+  }
+#else
+  // Only the device count is queried on this path, so devices are created with 0 cores.
+  device_id_t deviceCount = 0;
   vxQueryContext(context->c, VX_CONTEXT_DEVICE_COUNT_VIV, &deviceCount,
                  sizeof(deviceCount));
-  std::cout << "Device count = " << deviceCount << std::endl;
   for (device_id_t i = 0; i < deviceCount; i++) {
-    IDevice* local_device = new NativeDeviceImpl(i);
-    std::shared_ptr<IDevice> local_device_sp(local_device);
-    device_v.push_back(local_device_sp);
+    auto local_device = std::make_shared<NativeDeviceImpl>(i, 0);
+    device_v.push_back(local_device);
   }
+  VSILOGE("VSI device API is not supportted, please upgrade Vivant SDK version >= 6.4.22 && ovxlib >= 1.2.26 !");
+#endif
   vsi_nn_ReleaseContext(&context);
   return device_v;
 }
 
-std::shared_ptr<Graph> IExecutable::NBGraph() const { return nb_graph_; }
-
-std::shared_ptr<IExecutor> IExecutable::Executor() const {
-  auto executor = executor_.lock();
-  if (!executor) {
-    std::cout << "Executor unable to lock weak_ptr";
-  }
-  return executor;
-}
-
-NativeExecutable::NativeExecutable(const std::shared_ptr<IExecutor>& executor,
+NativeExecutableImpl::NativeExecutableImpl(const std::shared_ptr<IExecutor>& executor,
                                    const std::vector<char>& nb_buf,
                                    size_t inputs, size_t outputs) {
-  CompileOption opt;
-  opt.setDeviceId(executor->Device()->Id());
 
   executor_ = executor;
   context_ = executor->Contex();
-  nb_graph_ = context_->CreateGraph(opt);
+  nb_graph_ = context_->CreateGraph();  // no device option here; Verify() binds the device via BindDevices()
 
   nb_buf_ = nb_buf;
   nb_node_ = nb_graph_->CreateOperation<tim::vx::ops::NBG>(nb_buf_.data(),
                                                            inputs, outputs);
 }
 
-void NativeExecutable::SetInput(const std::shared_ptr<ITensorHandle>& th) {
+void NativeExecutableImpl::SetInput(const std::shared_ptr<ITensorHandle>& th) {
   nb_node_->BindInput(th->GetTensor());
+  input_handles_.push_back(th);  // remember the handle bound as an input
 }
 
-void NativeExecutable::SetOutput(const std::shared_ptr<ITensorHandle>& th) {
+// Binds every handle in `ths` as an input by forwarding each one, in order,
+// to SetInput.
+void NativeExecutableImpl::SetInputs(const std::vector<std::shared_ptr<ITensorHandle>>& ths) {
+  for (const auto& handle : ths) SetInput(handle);
+}
+
+void NativeExecutableImpl::SetOutput(const std::shared_ptr<ITensorHandle>& th) {
   nb_node_->BindOutput(th->GetTensor());
+  output_handles_.push_back(th);  // remember the handle bound as an output
 }
 
-void NativeExecutable::GetOutput(
-    const std::vector<std::shared_ptr<ITensorHandle>>& th) {
-  (void)th;
+// Binds every handle in `ths` as an output, in order, via SetOutput.
+void NativeExecutableImpl::SetOutputs(const std::vector<std::shared_ptr<ITensorHandle>>& ths) {
+  for (const auto& handle : ths) {
+    SetOutput(handle);
+  }
 }
 
-bool NativeExecutable::Submit(const std::shared_ptr<IExecutable>& ref,
+bool NativeExecutableImpl::Submit(const std::shared_ptr<IExecutable>& ref,
                               bool after) {
   bool status = false;
   std::shared_ptr<IExecutable> executable = shared_from_this();
-  status = Executor()->Submit(executable, ref, after);
+  auto executor = std::dynamic_pointer_cast<NativeExecutorImpl>(executor_.lock());
+  if (executor) status = executor->Submit(executable, ref, after);  // expired or foreign executor -> false
   return status;
 }
 
-bool NativeExecutable::Trigger(bool async) {
+bool NativeExecutableImpl::Trigger(bool async) {
   (void)async;
-  bool status = false;
-  auto device = Executor()->Device();
-  device->Submit(nb_graph_);
-  status = device->Trigger();
-  device->WaitDeviceIdle();
+  bool status = nb_graph_->Run();  // runs the NBG graph directly rather than through the device
   return status;
 }
 
-std::shared_ptr<ITensorHandle> NativeExecutable::AllocateTensor(
-    const TensorSpec& tensor_spec) {
-  auto tensor = nb_graph_->CreateTensor(tensor_spec);
-  ITensorHandle* tensor_handle = new NativeTensorHandle(tensor);
-  std::shared_ptr<ITensorHandle> tensor_handle_sp(tensor_handle);
-  return tensor_handle_sp;
+// Creates a tensor in the NBG graph from `tensor_spec`/`data` and wraps it in a handle; `size` is unused.
+std::shared_ptr<ITensorHandle> NativeExecutableImpl::AllocateTensor(const TensorSpec& tensor_spec,
+                                                                    void* data, uint32_t size) {
+  (void)size;
+  return std::make_shared<NativeTensorHandleImpl>(nb_graph_->CreateTensor(tensor_spec, data));
 }
 
-bool NativeExecutable::Verify() { return nb_graph_->Compile(); }
-
-ExecutableSet::ExecutableSet(
-    const std::vector<std::shared_ptr<IExecutable>>& executables) {
-  executables_ = executables;
-  executor_ = executables[0]->Executor();
-}
-
-void ExecutableSet::SetInput(const std::shared_ptr<ITensorHandle>& th) {
-  (void)th;
-}
-
-void ExecutableSet::SetOutput(const std::shared_ptr<ITensorHandle>& th) {
-  (void)th;
-}
-
-void ExecutableSet::GetOutput(
-    const std::vector<std::shared_ptr<ITensorHandle>>& th) {
-  (void)th;
-}
-
-bool ExecutableSet::Submit(const std::shared_ptr<IExecutable>& ref,
-                           bool after) {
-  bool status = false;
-  std::shared_ptr<IExecutable> executable = shared_from_this();
-  status = Executor()->Submit(executable, ref, after);
-  return status;
-}
-
-bool ExecutableSet::Trigger(bool async) {
-  (void)async;
-  bool status = false;
-  auto device = Executor()->Device();
-  for (auto executable : executables_) {
-    device->Submit(executable->NBGraph());
+bool NativeExecutableImpl::Verify() {
+  auto executor = std::dynamic_pointer_cast<NativeExecutorImpl>(executor_.lock());
+  // An expired or non-NativeExecutorImpl executor is treated as a bind failure.
+  if (executor == nullptr || !executor->BindDevices(NBGraph())) {
+    VSILOGE("Executable bind device failed");
+    return false;
   }
-  status = device->Trigger();
-  device->WaitDeviceIdle();
-  return status;
-}
-
-std::shared_ptr<ITensorHandle> ExecutableSet::AllocateTensor(
-    const TensorSpec& tensor_spec) {
-  std::shared_ptr<ITensorHandle> tensor_handle_sp;
-  (void)tensor_spec;
-  return tensor_handle_sp;
-}
-
-std::vector<std::shared_ptr<IExecutable>> ExecutableSet::Executables() const {
-  return executables_;
-}
-
-bool ExecutableSet::Verify() {
-  bool status = false;
-  for (auto executable : executables_) {
-    status = executable->Verify();
+  // Device bound: the outcome of compiling the NBG graph is the result.
+  return nb_graph_->Compile();
   }
-  return status;
-}
 
-std::shared_ptr<Context> IExecutor::Contex() const { return context_; }
-
-NativeExecutor::NativeExecutor(const std::shared_ptr<IDevice>& device) {
-  device_ = device;
-  context_ = Context::Create();
-}
-
-NativeExecutor::NativeExecutor(const std::shared_ptr<IDevice>& device,
+NativeExecutorImpl::NativeExecutorImpl(const std::shared_ptr<IDevice>& device,
+                               const int32_t core_count,
+                               const int32_t core_index,
                                const std::shared_ptr<Context>& context) {
   device_ = device;
-  context_ = context;
+  // Use the caller's context when given, otherwise create a private one.
+  context_ = context ? context : Context::Create();
+  const int32_t total_core_count = static_cast<int32_t>(device_->CoreCount());
+  int32_t fixed_core_index = core_index < 0 ? 0 : core_index;  // negative -> first core
+  int32_t fixed_core_count = core_count;
+  if (fixed_core_index > total_core_count - 1) {
+    VSILOGE("Core index is larger than total core count");
+    assert(false);
+    // NDEBUG builds get here: clamp so the count below cannot go negative and wrap in the uint32_t cast.
+    fixed_core_index = total_core_count > 0 ? total_core_count - 1 : 0;
+  }
+  // A non-positive count means "all cores from core_index onwards".
+  if (fixed_core_count <= 0) {
+    fixed_core_count = total_core_count - fixed_core_index;
+  }
+  if (fixed_core_index + fixed_core_count > total_core_count) {
+    fixed_core_count = total_core_count - fixed_core_index;
+    VSILOGW(
+        "Core_index + core_count is larger than total core count. Fix core count to %d", fixed_core_count);
+  }
+  core_index_ = static_cast<uint32_t>(fixed_core_index);
+  core_count_ = static_cast<uint32_t>(fixed_core_count);
+#ifdef VSI_DEVICE_SUPPORT
+  vsi_nn_device_t  vsi_devices[VSI_MAX_DEVICES] = {0};
+  vsi_size_t num_devices = 0;
+  auto ctx = dynamic_cast<ContextImpl*>(context_.get());
+  // Only create the sub-device when the context and its device list are usable.
+  if (ctx != nullptr && vsi_nn_GetDevices(ctx->context(), vsi_devices, &num_devices) == VSI_SUCCESS) {
+    vsi_nn_CreateSubDevice(vsi_devices[device_->Id()], core_index_, core_count_, &sub_devices_);
+  } else {
+    VSILOGE("Get devices for sub device creation fail");
+  }
+#endif
 }
 
-bool NativeExecutor::Submit(const std::shared_ptr<IExecutable>& executable,
+bool NativeExecutorImpl::Submit(const std::shared_ptr<IExecutable>& executable,
                             const std::shared_ptr<IExecutable>& ref,
                             bool after) {
   bool success = false;
   success = executable->Verify();
-  if (success == false) {
-    std::cout << "Executable NBG compile failed";
+  if(success == false) {
+    VSILOGE("Executable NBG compile failed");
     return false;
   }
-  if (executable == ref) {
+  if(executable == ref) {
     tasks_.push_back(executable);
     return true;
   }
-  for (size_t i = 0; i < tasks_.size(); i++) {
-    if (tasks_[i].lock() == ref) {
-      if (after == true) {
+  for(size_t i = 0; i < tasks_.size(); i++) {
+    if(tasks_[i].lock() == ref) {
+      if(after == true) {
         tasks_.insert(tasks_.begin() + i + 1, executable);
         success = true;
         break;
@@ -257,59 +250,81 @@ bool NativeExecutor::Submit(const std::shared_ptr<IExecutable>& executable,
   return success;
 }
 
-bool NativeExecutor::Trigger(bool async) {
+bool NativeExecutorImpl::Trigger(bool async) {
   (void)async;
-  while (!tasks_.empty()) {
+  bool ret = true;  // an empty queue is a successful no-op
+  while(!tasks_.empty()) {
     auto task = tasks_.front();
     tasks_.erase(tasks_.begin());
-    auto task_ = task.lock();
-    if (!task_) {
-      std::cout << "Task unable to lock weak_ptr";
+    auto task_tmp = task.lock();
+    if(!task_tmp) {
+      VSILOGE("Task unable to lock weak_ptr");
+      return false;
     }
-    task_->Trigger();
+    ret = task_tmp->Trigger() && ret;  // run every task, but remember any earlier failure
   }
   device_->WaitDeviceIdle();
-  return true;
+  return ret;
 }
 
-std::shared_ptr<IExecutable> NativeExecutor::Compile(
+std::shared_ptr<IExecutable> NativeExecutorImpl::Compile(
     const std::shared_ptr<Graph>& graph) {
-
-  CompileOption option;
-  option.setDeviceId(device_->Id());
-  graph->SetCompileOption(option);
-
+  // Bind the graph to this executor's device before compiling it.
+  if (!BindDevices(graph)) {
+    return nullptr;
+  }
   size_t bin_size = -1;
-  graph->CompileToBinary(nullptr, &bin_size);
+  // First call passes no buffer; bin_size is then used to size nb_buf.
+  if (!graph->CompileToBinary(nullptr, &bin_size)) {
+    return nullptr;
+  }
   std::vector<char> nb_buf;
   nb_buf.resize(bin_size);
   size_t inputs = graph->InputsTensor().size();
   size_t outputs = graph->OutputsTensor().size();
-  graph->CompileToBinary(nb_buf.data(), &bin_size);
-  std::shared_ptr<IExecutor> this_sp = shared_from_this();
-  IExecutable* executable =
-      new NativeExecutable(this_sp, nb_buf, inputs, outputs);
-  std::shared_ptr<IExecutable> executable_sp(executable);
-  return executable_sp;
+  // Second call receives the sized buffer.
+  if (!graph->CompileToBinary(nb_buf.data(), &bin_size)) {
+    return nullptr;
+  }
+  // The executable is handed this executor together with the binary.
+  const std::shared_ptr<NativeExecutorImpl> self = shared_from_this();
+  return std::make_shared<NativeExecutableImpl>(self, nb_buf, inputs, outputs);
 }
 
-std::shared_ptr<IDevice> IExecutor::Device() const { return device_; }
 
-std::shared_ptr<Tensor> ITensorHandle::GetTensor() const { return tensor_; }
+// Attaches `graph` to this executor's device: through the VSI sub-device when
+// VSI_DEVICE_SUPPORT is defined, otherwise through the device-id compile
+// option. Returns false for a null or non-GraphImpl graph, or a failed bind.
+bool NativeExecutorImpl::BindDevices(const std::shared_ptr<Graph>& graph){
+  if (!graph) return false;
+#ifdef VSI_DEVICE_SUPPORT
+  GraphImpl* graphimp = dynamic_cast<GraphImpl*>(graph.get());
+  if (graphimp == nullptr) return false;
+  return vsi_nn_BindDevices(graphimp->graph(), 1, &sub_devices_) == VSI_SUCCESS;
+#else
+  CompileOption option;
+  option.setDeviceId(device_->Id());
+  graph->SetCompileOption(option);
+  // No status is produced on this path, so it always reports success.
+  return true;
+#endif
+}
 
-NativeTensorHandle::NativeTensorHandle(const std::shared_ptr<Tensor>& tensor) {
+// Wraps `tensor` and caches its spec.
+NativeTensorHandleImpl::NativeTensorHandleImpl(const std::shared_ptr<Tensor>& tensor) {
   tensor_ = tensor;
+  spec_ = tensor->GetSpec();
 }
 
-bool NativeTensorHandle::CopyDataToTensor(const void* data,
+bool NativeTensorHandleImpl::CopyDataToTensor(const void* data,  // forwards to the wrapped Tensor
                                           uint32_t size_in_bytes) {
   return tensor_->CopyDataToTensor(data, size_in_bytes);
 }
 
-bool NativeTensorHandle::CopyDataFromTensor(void* data) {
+bool NativeTensorHandleImpl::CopyDataFromTensor(void* data) {  // forwards to the wrapped Tensor
   return tensor_->CopyDataFromTensor(data);
 }
 
 }  // namespace platform
 }  // namespace vx
-}  // namespace tim
\ No newline at end of file
+}  // namespace tim
diff --git a/src/tim/vx/platform/native_device_private.h b/src/tim/vx/platform/native_device_private.h
deleted file mode 100644
index ad005f9..0000000
--- a/src/tim/vx/platform/native_device_private.h
+++ /dev/null
@@ -1,58 +0,0 @@
-/****************************************************************************
-*
-*    Copyright (c) 2020-2023 Vivante Corporation
-*
-*    Permission is hereby granted, free of charge, to any person obtaining a
-*    copy of this software and associated documentation files (the "Software"),
-*    to deal in the Software without restriction, including without limitation
-*    the rights to use, copy, modify, merge, publish, distribute, sublicense,
-*    and/or sell copies of the Software, and to permit persons to whom the
-*    Software is furnished to do so, subject to the following conditions:
-*
-*    The above copyright notice and this permission notice shall be included in
-*    all copies or substantial portions of the Software.
-*
-*    THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
-*    IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
-*    FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
-*    AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
-*    LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
-*    FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
-*    DEALINGS IN THE SOFTWARE.
-*
-*****************************************************************************/
-#ifndef TIM_VX_NATIVE_DEVICE_PRIVATE_H_
-#define TIM_VX_NATIVE_DEVICE_PRIVATE_H_
-
-#include "tim/vx/platform/native.h"
-#include "vip/virtual_device.h"
-#include "graph_private.h"
-
-namespace tim {
-namespace vx {
-
-class GraphImpl;
-
-namespace platform {
-
-class NativeDeviceImpl : public NativeDevice {
- public:
-  NativeDeviceImpl(device_id_t id);
-  ~NativeDeviceImpl(){};
-
-  bool Submit(const std::shared_ptr<tim::vx::Graph>& graph) override;
-  bool Trigger(bool async = false, async_callback cb = NULL) override;
-  bool DeviceExit() override;
-  void WaitDeviceIdle() override;
-
- protected:
-  std::unique_ptr<vip::IDevice> vip_device_;
-  std::vector<vsi_nn_graph_t*> vsi_graph_v_;
-
-};
-
-}  // namespace platform
-}  // namespace vx
-}  // namespace tim
-
-#endif /* TIM_VX_NATIVE_DEVICE_PRIVATE_H_*/
\ No newline at end of file
diff --git a/src/tim/vx/platform/native_private.h b/src/tim/vx/platform/native_private.h
new file mode 100644
index 0000000..3d86a08
--- /dev/null
+++ b/src/tim/vx/platform/native_private.h
@@ -0,0 +1,106 @@
+/****************************************************************************
+*
+*    Copyright (c) 2020-2025 Vivante Corporation
+*
+*    Permission is hereby granted, free of charge, to any person obtaining a
+*    copy of this software and associated documentation files (the "Software"),
+*    to deal in the Software without restriction, including without limitation
+*    the rights to use, copy, modify, merge, publish, distribute, sublicense,
+*    and/or sell copies of the Software, and to permit persons to whom the
+*    Software is furnished to do so, subject to the following conditions:
+*
+*    The above copyright notice and this permission notice shall be included in
+*    all copies or substantial portions of the Software.
+*
+*    THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+*    IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+*    FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+*    AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+*    LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
+*    FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
+*    DEALINGS IN THE SOFTWARE.
+*
+*****************************************************************************/
+#ifndef TIM_VX_NATIVE_DEVICE_PRIVATE_H_
+#define TIM_VX_NATIVE_DEVICE_PRIVATE_H_
+
+#include "tim/vx/platform/native.h"
+#include "vip/virtual_device.h"
+#include "graph_private.h"
+
+namespace tim {
+namespace vx {
+
+class GraphImpl;
+
+namespace platform {
+
+// Private NativeDevice implementation; CreateExecutor uses shared_from_this().
+class NativeDeviceImpl : public NativeDevice,
+                         public std::enable_shared_from_this<NativeDeviceImpl> {
+ public:
+  NativeDeviceImpl(device_id_t id, uint32_t core_count);
+  ~NativeDeviceImpl() {}
+  bool Submit(const std::shared_ptr<tim::vx::Graph>& graph) override;
+  bool Trigger(bool async = false, async_callback cb = NULL) override;
+  bool DeviceExit() override;
+  void WaitDeviceIdle() override;
+  std::shared_ptr<IExecutor> CreateExecutor(const int32_t core_index = 0,
+                                            const int32_t core_count = -1,
+                                            const std::shared_ptr<Context>& context = nullptr) override;
+};
+
+// Private NativeExecutable implementation wrapping a compiled binary in an NBG node.
+class NativeExecutableImpl : public NativeExecutable {
+ public:
+  NativeExecutableImpl(const std::shared_ptr<IExecutor>& executor,
+                       const std::vector<char>& nb_buf, size_t inputs,
+                       size_t outputs);
+  ~NativeExecutableImpl() {}
+  void SetInput(const std::shared_ptr<ITensorHandle>& th) override;
+  void SetInputs(const std::vector<std::shared_ptr<ITensorHandle>>& ths) override;
+  void SetOutput(const std::shared_ptr<ITensorHandle>& th) override;
+  void SetOutputs(const std::vector<std::shared_ptr<ITensorHandle>>& ths) override;
+  bool Submit(const std::shared_ptr<IExecutable>& ref, bool after = true) override;
+  bool Trigger(bool async = false) override;
+  bool Verify() override;
+  std::shared_ptr<ITensorHandle> AllocateTensor(const TensorSpec& tensor_spec,
+                                                void* data = nullptr, uint32_t size = 0) override;
+ protected:
+  std::shared_ptr<tim::vx::ops::NBG> nb_node_;
+  std::vector<char> nb_buf_;
+};
+
+// Private NativeExecutor implementation. Note: core_count precedes core_index in the constructor.
+class NativeExecutorImpl : public NativeExecutor,
+                           public std::enable_shared_from_this<NativeExecutorImpl> {
+ public:
+  NativeExecutorImpl(const std::shared_ptr<IDevice>& device,
+                     const int32_t core_count = -1,
+                     const int32_t core_index = 0,
+                     const std::shared_ptr<Context>& context = nullptr);
+  ~NativeExecutorImpl() {}
+  bool Submit(const std::shared_ptr<IExecutable>& executable,
+              const std::shared_ptr<IExecutable>& ref,
+              bool after = true) override;
+  bool Trigger(bool async = false) override;
+  std::shared_ptr<IExecutable> Compile(const std::shared_ptr<Graph>& graph) override;
+  bool BindDevices(const std::shared_ptr<Graph>& graph);
+ private:
+#ifdef VSI_DEVICE_SUPPORT
+  vsi_nn_device_t sub_devices_;
+#endif
+};
+
+class NativeTensorHandleImpl : public NativeTensorHandle {  // handle that forwards copies to a wrapped Tensor
+ public:
+  NativeTensorHandleImpl(const std::shared_ptr<Tensor>& tensor);
+  bool CopyDataToTensor(const void* data, uint32_t size_in_bytes) override;
+  bool CopyDataFromTensor(void* data) override;
+};
+
+}  // namespace platform
+}  // namespace vx
+}  // namespace tim
+
+#endif /* TIM_VX_NATIVE_DEVICE_PRIVATE_H_*/