From 27890719b617ad3539446b71b36a322adce1afc8 Mon Sep 17 00:00:00 2001 From: liyuenan <37231553+liyuenan2333@users.noreply.github.com> Date: Tue, 28 Mar 2023 09:51:23 +0800 Subject: [PATCH] Support remote platform by gRPC (#561) * Support remote platform by gRPC Signed-off-by: yuenan.li --- CMakeLists.txt | 17 +- README.md | 5 + cmake/gRPC.cmake | 27 ++ include/tim/vx/platform/grpc/grpc_remote.h | 99 +++++ include/tim/vx/platform/lite/lite_native.h | 96 +++++ include/tim/vx/platform/native.h | 32 +- include/tim/vx/platform/platform.h | 45 ++- samples/CMakeLists.txt | 6 + samples/grpc/CMakeLists.txt | 11 + samples/grpc/README.txt | 2 + samples/grpc/grpc_multi_device.cc | 87 +++++ samples/lite_multi_device/CMakeLists.txt | 13 + .../lite_multi_device/lite_multi_device.cc | 79 ++++ src/tim/CMakeLists.txt | 136 +++++-- src/tim/vx/platform/grpc/README.md | 51 +++ src/tim/vx/platform/grpc/grpc_platform.proto | 120 ++++++ .../vx/platform/grpc/grpc_platform_client.cc | 240 ++++++++++++ .../vx/platform/grpc/grpc_platform_client.h | 77 ++++ .../vx/platform/grpc/grpc_platform_server.cc | 348 ++++++++++++++++++ src/tim/vx/platform/grpc/grpc_remote.cc | 176 +++++++++ src/tim/vx/platform/lite/lite_native.cc | 312 ++++++++++++++++ src/tim/vx/platform/native.cc | 124 ++++--- 22 files changed, 1983 insertions(+), 120 deletions(-) create mode 100644 cmake/gRPC.cmake create mode 100644 include/tim/vx/platform/grpc/grpc_remote.h create mode 100644 include/tim/vx/platform/lite/lite_native.h create mode 100644 samples/grpc/CMakeLists.txt create mode 100644 samples/grpc/README.txt create mode 100644 samples/grpc/grpc_multi_device.cc create mode 100644 samples/lite_multi_device/CMakeLists.txt create mode 100644 samples/lite_multi_device/lite_multi_device.cc create mode 100644 src/tim/vx/platform/grpc/README.md create mode 100644 src/tim/vx/platform/grpc/grpc_platform.proto create mode 100644 src/tim/vx/platform/grpc/grpc_platform_client.cc create mode 100644 src/tim/vx/platform/grpc/grpc_platform_client.h create mode 100644 src/tim/vx/platform/grpc/grpc_platform_server.cc create mode 100644 src/tim/vx/platform/grpc/grpc_remote.cc create mode 100644 src/tim/vx/platform/lite/lite_native.cc diff --git a/CMakeLists.txt b/CMakeLists.txt index ab9f6a1..1e388a6 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -12,6 +12,8 @@ option(TIM_VX_BUILD_EXAMPLES "Build demos show general usage" option(TIM_VX_ENABLE_VIPLITE "Enable lite driver api support" OFF) option(TIM_VX_ENABLE_40BIT "Enable large memory support" OFF) option(TIM_VX_ENABLE_PLATFORM "Enable multi devices support" OFF) +option(TIM_VX_ENABLE_PLATFORM_LITE "Enable lite multi-device support" OFF) +option(TIM_VX_ENABLE_GRPC "Enable gRPC support" OFF) option(TIM_VX_DBG_ENABLE_TENSOR_HNDL "Enable built-in tensor from handle: use malloced memory instead of VideoMemory by kernel driver" ON) set(CMAKE_CXX_STANDARD 14) @@ -24,6 +26,10 @@ if(${TIM_VX_CODE_COVERAGE}) set(CMAKE_C_FLAGS "-g -O0 --coverage -fprofile-arcs -ftest-coverage") endif() +if(${TIM_VX_ENABLE_PLATFORM_LITE}) + set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -DENABLE_PLATFORM_LITE") +endif() + if(${TIM_VX_DBG_ENABLE_TENSOR_HNDL}) set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -DENABLE_TENSOR_HNDL=1") else() @@ -69,10 +75,9 @@ endif() if(TIM_VX_ENABLE_TEST) include(FetchContent) FetchContent_Declare( - googletest - GIT_REPOSITORY https://github.com/google/googletest.git - GIT_TAG release-1.12.0 - ) + googletest + GIT_REPOSITORY https://github.com/google/googletest.git + GIT_TAG release-1.12.0)
set(gtest_force_shared_crt ON CACHE BOOL "" FORCE) set(INSTALL_GTEST OFF CACHE BOOL "" FORCE) @@ -84,6 +89,10 @@ if(TIM_VX_ENABLE_TEST) endif() endif() +if(TIM_VX_ENABLE_GRPC) + include(cmake/gRPC.cmake) +endif() + add_subdirectory("src/tim") if(TIM_VX_BUILD_EXAMPLES) diff --git a/README.md b/README.md index a069977..5967a3f 100644 --- a/README.md +++ b/README.md @@ -79,6 +79,11 @@ cmake options: |`EXTERNAL_VIV_SDK`| Give external vivante openvx driver libraries | Not set| |`TIM_VX_BUILD_EXAMPLES`| Build example applications | OFF | |`TIM_VX_ENABLE_40BIT` | Enable large memory (over 4G) support in NPU driver | OFF | +|`TIM_VX_ENABLE_PLATFORM` | Enable multi-device support | OFF | +|`TIM_VX_ENABLE_PLATFORM_LITE` | Enable lite multi-device support, only works when `TIM_VX_ENABLE_PLATFORM`=ON | OFF | +|`VIP_LITE_SDK` | Full path to the VIPLite SDK, required when `TIM_VX_ENABLE_PLATFORM_LITE`=ON | Not set | +|`TIM_VX_ENABLE_GRPC` | Enable gRPC support, only works when `TIM_VX_ENABLE_PLATFORM`=ON | OFF | +|`TIM_VX_DBG_ENABLE_TENSOR_HNDL` | Enable built-in tensor from handle | ON | ---- Run unit test: diff --git a/cmake/gRPC.cmake b/cmake/gRPC.cmake new file mode 100644 index 0000000..5d739e7 --- /dev/null +++ b/cmake/gRPC.cmake @@ -0,0 +1,27 @@ +find_package(Threads REQUIRED) + +# Find Protobuf installation +# Looks for protobuf-config.cmake file installed by Protobuf's cmake installation. +set(protobuf_MODULE_COMPATIBLE TRUE) +find_package(Protobuf CONFIG REQUIRED) +message(STATUS "Using protobuf ${Protobuf_VERSION}") + +set(PROTOBUF_LIBPROTOBUF protobuf::libprotobuf) +set(GRPCPP_REFLECTION gRPC::grpc++_reflection) +if(CMAKE_CROSSCOMPILING) + find_program(PROTOBUF_PROTOC protoc) +else() + set(PROTOBUF_PROTOC $<TARGET_FILE:protobuf::protoc>) +endif() + +# Find gRPC installation +# Looks for gRPCConfig.cmake file installed by gRPC's cmake installation. +find_package(gRPC CONFIG REQUIRED) +message(STATUS "Using gRPC ${gRPC_VERSION}") + +set(GRPC_GRPCPP gRPC::grpc++) +if(CMAKE_CROSSCOMPILING) + find_program(GRPC_CPP_PLUGIN_EXECUTABLE grpc_cpp_plugin) +else() + set(GRPC_CPP_PLUGIN_EXECUTABLE $<TARGET_FILE:gRPC::grpc_cpp_plugin>) +endif() diff --git a/include/tim/vx/platform/grpc/grpc_remote.h b/include/tim/vx/platform/grpc/grpc_remote.h new file mode 100644 index 0000000..63d8e32 --- /dev/null +++ b/include/tim/vx/platform/grpc/grpc_remote.h @@ -0,0 +1,99 @@ +/**************************************************************************** +* +* Copyright (c) 2023 Vivante Corporation +* +* Permission is hereby granted, free of charge, to any person obtaining a +* copy of this software and associated documentation files (the "Software"), +* to deal in the Software without restriction, including without limitation +* the rights to use, copy, modify, merge, publish, distribute, sublicense, +* and/or sell copies of the Software, and to permit persons to whom the +* Software is furnished to do so, subject to the following conditions: +* +* The above copyright notice and this permission notice shall be included in +* all copies or substantial portions of the Software. +* +* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +* AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING +* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER +* DEALINGS IN THE SOFTWARE.
+* +*****************************************************************************/ +#ifndef TIM_VX_GRPC_REMOTE_H_ +#define TIM_VX_GRPC_REMOTE_H_ + +#include "tim/vx/platform/platform.h" + +namespace tim { +namespace vx { +namespace platform { + +class GRPCPlatformClient; + +class GRPCRemoteDevice : public IDevice { + public: + GRPCRemoteDevice(int32_t id, std::shared_ptr<GRPCPlatformClient> client); + bool Submit(const std::shared_ptr<Graph>& graph) override; + bool Trigger(bool async = false, async_callback cb = NULL) override; + bool DeviceExit() override; + void WaitDeviceIdle() override; + void RemoteReset() override; + static std::vector<std::shared_ptr<IDevice>> Enumerate( + const std::string& port); + + std::shared_ptr<GRPCPlatformClient> client_; +}; + +class GRPCRemoteExecutor : public IExecutor { + public: + GRPCRemoteExecutor(std::shared_ptr<IDevice> device); + bool Submit(const std::shared_ptr<IExecutable>& executable, + const std::shared_ptr<IExecutable>& ref, + bool after = true) override; + bool Trigger(bool async = false) override; + std::shared_ptr<IExecutable> Compile( + const std::shared_ptr<Graph>& graph) override; + int32_t Id() const; + + private: + int32_t executor_id_; + std::shared_ptr<IDevice> device_; +}; + +class GRPCRemoteExecutable : public IExecutable { + public: + GRPCRemoteExecutable(int32_t id, std::shared_ptr<IDevice> device); + void SetInput(const std::shared_ptr<ITensorHandle>& th) override; + void SetOutput(const std::shared_ptr<ITensorHandle>& th) override; + void GetOutput( + const std::vector<std::shared_ptr<ITensorHandle>>& th) override; + bool Submit(const std::shared_ptr<IExecutable>& ref, bool after) override; + bool Trigger(bool async) override; + bool Verify() override; + std::shared_ptr<ITensorHandle> AllocateTensor( + const TensorSpec& tensor_spec) override; + int32_t Id() const; + + private: + int32_t executable_id_; + std::shared_ptr<IDevice> device_; +}; + +class GRPCRemoteTensorHandle : public ITensorHandle { + public: + GRPCRemoteTensorHandle(int32_t id, std::shared_ptr<IDevice> device); + bool CopyDataToTensor(const void* data, uint32_t size_in_bytes) override; + bool CopyDataFromTensor(void* data) override; + int32_t Id() const; + + private: + int32_t tensor_id_; + std::shared_ptr<IDevice> device_; +}; + +} // namespace platform +} // namespace vx +} // namespace tim +#endif
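A minimal usage sketch of the proxy classes declared above, assuming a grpc_platform_server is already listening on the given address; graph construction and error handling are elided, and the complete runnable version is the grpc_multi_device sample later in this patch:

```cpp
// Sketch: each GRPCRemote* object only holds an integer id; the real
// device, executor, executable and tensors live in the server process.
auto devices = tim::vx::platform::GRPCRemoteDevice::Enumerate("0.0.0.0:50051");
auto executor =
    std::make_shared<tim::vx::platform::GRPCRemoteExecutor>(devices[0]);
auto executable = executor->Compile(graph);  // ships the compiled NBG
auto in = executable->AllocateTensor(input_spec);
auto out = executable->AllocateTensor(output_spec);
executable->SetInput(in);
executable->SetOutput(out);
in->CopyDataToTensor(input_data, input_bytes);
executable->Submit(executable);
executor->Trigger();        // runs on the remote device
out->CopyDataFromTensor(output_data);
devices[0]->RemoteReset();  // release the server-side state
```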
diff --git a/include/tim/vx/platform/lite/lite_native.h b/include/tim/vx/platform/lite/lite_native.h new file mode 100644 index 0000000..b83d5fe --- /dev/null +++ b/include/tim/vx/platform/lite/lite_native.h @@ -0,0 +1,96 @@ +/**************************************************************************** +* +* Copyright (c) 2023 Vivante Corporation +* +* Permission is hereby granted, free of charge, to any person obtaining a +* copy of this software and associated documentation files (the "Software"), +* to deal in the Software without restriction, including without limitation +* the rights to use, copy, modify, merge, publish, distribute, sublicense, +* and/or sell copies of the Software, and to permit persons to whom the +* Software is furnished to do so, subject to the following conditions: +* +* The above copyright notice and this permission notice shall be included in +* all copies or substantial portions of the Software. +* +* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +* AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING +* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER +* DEALINGS IN THE SOFTWARE. +* +*****************************************************************************/ +#ifndef TIM_VX_LITE_NATIVE_H_ +#define TIM_VX_LITE_NATIVE_H_ + +#include "tim/vx/platform/platform.h" +#include "vip_lite.h" +#include "nbg_linker.h" + +namespace tim { +namespace vx { +namespace platform { + +class LiteNativeExecutor + : public IExecutor, + public std::enable_shared_from_this<LiteNativeExecutor> { + public: + LiteNativeExecutor(const std::shared_ptr<IDevice>& device); + virtual ~LiteNativeExecutor(); + bool Submit(const std::shared_ptr<IExecutable>& executable, + const std::shared_ptr<IExecutable>& ref, + bool after = true) override; + bool Trigger(bool async = false) override; + std::shared_ptr<IExecutable> Compile( + const std::shared_ptr<Graph>& graph) override; + + private: + vip_task_descriptor_t* task_descriptor_; + vip_database database_; +}; + +class LiteNativeExecutable : public IExecutable { + public: + LiteNativeExecutable(const std::shared_ptr<IExecutor>& executor, + const std::vector<char>& nb_buf); + virtual ~LiteNativeExecutable(); + void SetInput(const std::shared_ptr<ITensorHandle>& th) override; + void SetOutput(const std::shared_ptr<ITensorHandle>& th) override; + void GetOutput( + const std::vector<std::shared_ptr<ITensorHandle>>& th) override; + bool Submit(const std::shared_ptr<IExecutable>& ref, bool after) override; + bool Trigger(bool async) override; + bool Verify() override; + std::shared_ptr<ITensorHandle> AllocateTensor( + const TensorSpec& tensor_spec) override; + + vip_network network_; + + private: + void SetBuffer(vip_memory_t* dst, gcvip_videomemory_t* src); + + int32_t input_count_; + int32_t output_count_; + + gcvip_videomemory_t* coeff_; + gcvip_videomemory_t* command_; + gcvip_videomemory_t* memory_pool_; + gcvip_videomemory_t* others_; + gcvip_videomemory_t* pre_command_; +}; + +class LiteNativeTensorHandle : public ITensorHandle { + public: + LiteNativeTensorHandle(const std::shared_ptr<Tensor>& tensor); + virtual ~LiteNativeTensorHandle(); + bool CopyDataToTensor(const void* data, uint32_t size_in_bytes) override; + bool CopyDataFromTensor(void* data) override; + + gcvip_videomemory_t* tensor_buffer_; +}; +} // namespace platform +} // namespace vx +} // namespace tim + +#endif \ No newline at end of file
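Both the lite executor above and the native executor below implement the IExecutor/IExecutable contract from platform.h, so compiled executables can also be grouped through CreateExecutableSet and driven as one unit. A short sketch under that assumption, where exe_a and exe_b are hypothetical names for executables already compiled on the same executor and already bound to their tensor handles:

```cpp
// Sketch: batching two compiled executables with CreateExecutableSet.
auto set = tim::vx::platform::CreateExecutableSet({exe_a, exe_b});
set->Submit(set);  // queue the whole set like a single executable
set->Trigger();    // run the members in their submission order
```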
diff --git a/include/tim/vx/platform/native.h b/include/tim/vx/platform/native.h index 18adb4c..cecf34a 100644 --- a/include/tim/vx/platform/native.h +++ b/include/tim/vx/platform/native.h @@ -38,36 +38,43 @@ class NativeDevice : public IDevice { virtual bool DeviceExit() = 0; virtual void WaitDeviceIdle() = 0; static std::vector<std::shared_ptr<IDevice>> Enumerate(); - }; -class NativeExecutable : public IExecutable{ +class NativeExecutable : public IExecutable { public: - NativeExecutable(const std::shared_ptr<IExecutor>& executor, const std::vector<char>& nb_buf, size_t inputs, size_t outputs); + NativeExecutable(const std::shared_ptr<IExecutor>& executor, + const std::vector<char>& nb_buf, size_t inputs, + size_t outputs); ~NativeExecutable(){}; void SetInput(const std::shared_ptr<ITensorHandle>& th) override; void SetOutput(const std::shared_ptr<ITensorHandle>& th) override; - void GetOutput(const std::vector<std::shared_ptr<ITensorHandle>>& th) override; - bool Submit(const std::shared_ptr<IExecutable>& ref, bool after = true) override; + void GetOutput( + const std::vector<std::shared_ptr<ITensorHandle>>& th) override; + bool Submit(const std::shared_ptr<IExecutable>& ref, + bool after = true) override; bool Trigger(bool async = false) override; - std::shared_ptr<ITensorHandle> AllocateTensor(const TensorSpec& tensor_spec) override; + std::shared_ptr<ITensorHandle> AllocateTensor( + const TensorSpec& tensor_spec) override; bool Verify() override; protected: std::shared_ptr<tim::vx::ops::NBG> nb_node_; std::vector<char> nb_buf_; - }; -class NativeExecutor : public IExecutor, public std::enable_shared_from_this<NativeExecutor>{ +class NativeExecutor : public IExecutor, + public std::enable_shared_from_this<NativeExecutor> { public: NativeExecutor(const std::shared_ptr<IDevice>& device); - NativeExecutor(const std::shared_ptr<IDevice>& device, const std::shared_ptr<Context>& context); + NativeExecutor(const std::shared_ptr<IDevice>& device, + const std::shared_ptr<Context>& context); ~NativeExecutor(){}; - bool Submit(const std::shared_ptr<IExecutable>& executable, const std::shared_ptr<IExecutable>& ref, bool after = true) override; + bool Submit(const std::shared_ptr<IExecutable>& executable, + const std::shared_ptr<IExecutable>& ref, + bool after = true) override; bool Trigger(bool async = false) override; - std::shared_ptr<IExecutable> Compile(const std::shared_ptr<Graph>& graph) override; - + std::shared_ptr<IExecutable> Compile( + const std::shared_ptr<Graph>& graph) override; }; class NativeTensorHandle : public ITensorHandle { @@ -75,7 +82,6 @@ class NativeTensorHandle : public ITensorHandle { NativeTensorHandle(const std::shared_ptr<Tensor>& tensor); bool CopyDataToTensor(const void* data, uint32_t size_in_bytes) override; bool CopyDataFromTensor(void* data) override; - }; } // namespace platform diff --git a/include/tim/vx/platform/platform.h b/include/tim/vx/platform/platform.h index 9a3c853..e90bce7 100644 --- a/include/tim/vx/platform/platform.h +++ b/include/tim/vx/platform/platform.h @@ -39,7 +39,7 @@ namespace vx { class Graph; class Context; -namespace ops{ +namespace ops { class NBG; } @@ -51,13 +51,16 @@ class ExecutableSet; class IExecutor; class ITensorHandle; -std::shared_ptr<IExecutable> Compile(const std::shared_ptr<Graph>& graph, const std::shared_ptr<IExecutor>& executor); -std::shared_ptr<ExecutableSet> CreateExecutableSet(const std::vector<std::shared_ptr<IExecutable>>& executables); +std::shared_ptr<IExecutable> Compile( + const std::shared_ptr<Graph>& graph, + const std::shared_ptr<IExecutor>& executor); +std::shared_ptr<ExecutableSet> CreateExecutableSet( + const std::vector<std::shared_ptr<IExecutable>>& executables); class IDevice { public: using device_id_t = uint32_t; - using async_callback = std::function<bool (const void*)>; + using async_callback = std::function<bool(const void*)>; using data_t = const void*; virtual ~IDevice(){}; virtual bool Submit(const std::shared_ptr<Graph>& graph) = 0; @@ -65,6 +68,7 @@ class IDevice { device_id_t Id() const; virtual void WaitDeviceIdle() = 0; virtual bool DeviceExit() = 0; + virtual void RemoteReset(); protected: device_id_t device_id_; @@ -74,9 +78,12 @@ class IExecutor { public: using task = std::weak_ptr<IExecutable>; virtual ~IExecutor(){}; - virtual bool Submit(const std::shared_ptr<IExecutable>& executable, const std::shared_ptr<IExecutable>& ref, bool after=true) = 0; + virtual bool Submit(const std::shared_ptr<IExecutable>& executable, + const std::shared_ptr<IExecutable>& ref, + bool after = true) = 0; virtual bool Trigger(bool async = false) = 0; // todo: async=true - virtual std::shared_ptr<IExecutable> Compile(const std::shared_ptr<Graph>& graph) = 0; + virtual std::shared_ptr<IExecutable> Compile( + const std::shared_ptr<Graph>& graph) = 0; virtual std::shared_ptr<IDevice> Device() const; virtual std::shared_ptr<Context> Contex() const; @@ -86,17 +93,20 @@ class IExecutor { std::shared_ptr<Context> context_; }; -class IExecutable : public std::enable_shared_from_this<IExecutable>{ +class IExecutable : public std::enable_shared_from_this<IExecutable> { public: virtual ~IExecutable(){}; virtual void SetInput(const std::shared_ptr<ITensorHandle>& th) = 0; virtual void SetOutput(const std::shared_ptr<ITensorHandle>& th) = 0; - virtual void GetOutput(const std::vector<std::shared_ptr<ITensorHandle>>& th) = 0; // for remote - virtual bool Submit(const std::shared_ptr<IExecutable>& ref, bool after = true) = 0; + virtual void GetOutput( + const std::vector<std::shared_ptr<ITensorHandle>>& th) = 0; // for remote + virtual bool Submit(const std::shared_ptr<IExecutable>& ref, + bool after = true) = 0; virtual bool Trigger(bool async = false) = 0; // todo: async=true virtual bool Verify() = 0; virtual std::shared_ptr<Graph> NBGraph() const; - virtual std::shared_ptr<ITensorHandle> AllocateTensor(const TensorSpec&
tensor_spec) = 0; + virtual std::shared_ptr<ITensorHandle> AllocateTensor( + const TensorSpec& tensor_spec) = 0; virtual std::shared_ptr<IExecutor> Executor() const; protected: @@ -105,21 +115,23 @@ class IExecutable : public std::enable_shared_from_this<IExecutable>{ std::shared_ptr<Graph> nb_graph_; }; -class ExecutableSet : public IExecutable{ +class ExecutableSet : public IExecutable { public: ExecutableSet(const std::vector<std::shared_ptr<IExecutable>>& executables); void SetInput(const std::shared_ptr<ITensorHandle>& th) override; void SetOutput(const std::shared_ptr<ITensorHandle>& th) override; - void GetOutput(const std::vector<std::shared_ptr<ITensorHandle>>& th) override; - bool Submit(const std::shared_ptr<IExecutable>& ref, bool after = true) override; + void GetOutput( + const std::vector<std::shared_ptr<ITensorHandle>>& th) override; + bool Submit(const std::shared_ptr<IExecutable>& ref, + bool after = true) override; bool Trigger(bool async = false) override; bool Verify() override; - std::shared_ptr<ITensorHandle> AllocateTensor(const TensorSpec& tensor_spec) override; + std::shared_ptr<ITensorHandle> AllocateTensor( + const TensorSpec& tensor_spec) override; std::vector<std::shared_ptr<IExecutable>> Executables() const; - + protected: std::vector<std::shared_ptr<IExecutable>> executables_; - }; class ITensorHandle { @@ -131,7 +143,6 @@ class ITensorHandle { protected: std::shared_ptr<Tensor> tensor_; - }; } // namespace platform diff --git a/samples/CMakeLists.txt b/samples/CMakeLists.txt index e403dc8..919c0a5 100644 --- a/samples/CMakeLists.txt +++ b/samples/CMakeLists.txt @@ -20,4 +20,10 @@ endif() if(TIM_VX_ENABLE_PLATFORM) add_subdirectory("lenet_multi_device") add_subdirectory("multi_device") + if(${TIM_VX_ENABLE_PLATFORM_LITE}) + add_subdirectory("lite_multi_device") + endif() + if(TIM_VX_ENABLE_GRPC) + add_subdirectory("grpc") + endif() endif() diff --git a/samples/grpc/CMakeLists.txt b/samples/grpc/CMakeLists.txt new file mode 100644 index 0000000..326f275 --- /dev/null +++ b/samples/grpc/CMakeLists.txt @@ -0,0 +1,11 @@ +message("samples/grpc") + +set(TARGET_NAME "grpc_multi_device") + +add_executable(${TARGET_NAME} ${CMAKE_CURRENT_SOURCE_DIR}/grpc_multi_device.cc) + +target_link_libraries(${TARGET_NAME} PRIVATE -Wl,--whole-archive tim-vx) +target_include_directories(${TARGET_NAME} PRIVATE ${PROJECT_SOURCE_DIR}/include) + +install(TARGETS ${TARGET_NAME} ${TARGET_NAME} + DESTINATION ${CMAKE_INSTALL_PREFIX}/${CMAKE_INSTALL_BINDIR}) \ No newline at end of file diff --git a/samples/grpc/README.txt b/samples/grpc/README.txt new file mode 100644 index 0000000..d456382 --- /dev/null +++ b/samples/grpc/README.txt @@ -0,0 +1,2 @@ +Run grpc_multi_device with the server address and port, for example: +./grpc_multi_device 0.0.0.0:50051 \ No newline at end of file diff --git a/samples/grpc/grpc_multi_device.cc b/samples/grpc/grpc_multi_device.cc new file mode 100644 index 0000000..42efa24 --- /dev/null +++ b/samples/grpc/grpc_multi_device.cc @@ -0,0 +1,87 @@ + +/**************************************************************************** +* +* Copyright (c) 2023 Vivante Corporation +* +* Permission is hereby granted, free of charge, to any person obtaining a +* copy of this software and associated documentation files (the "Software"), +* to deal in the Software without restriction, including without limitation +* the rights to use, copy, modify, merge, publish, distribute, sublicense, +* and/or sell copies of the Software, and to permit persons to whom the +* Software is furnished to do so, subject to the following conditions: +* +* The above copyright notice and this permission notice shall be included in +* all copies or substantial portions of the Software.
+* +* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +* AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING +* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER +* DEALINGS IN THE SOFTWARE. +* +*****************************************************************************/ +#include <cstdlib> +#include <iostream> +#include <vector> + +#include "tim/vx/context.h" +#include "tim/vx/graph.h" +#include "tim/vx/ops.h" +#include "tim/vx/types.h" +#include "tim/vx/platform/grpc/grpc_remote.h" + +int main(int argc, char** argv) { + if (argc < 2) { + std::cout << "error: need a server address to connect, e.g. 0.0.0.0:50051." << std::endl; + return -1; + } + // construct the tim-vx graph + auto ctx = tim::vx::Context::Create(); + auto graph = ctx->CreateGraph(); + + tim::vx::ShapeType io_shape({2, 2}); + tim::vx::TensorSpec input_spec(tim::vx::DataType::INT32, io_shape, + tim::vx::TensorAttribute::INPUT); + tim::vx::TensorSpec output_spec(tim::vx::DataType::INT32, io_shape, + tim::vx::TensorAttribute::OUTPUT); + auto input_t0 = graph->CreateTensor(input_spec); + auto input_t1 = graph->CreateTensor(input_spec); + auto output_t = graph->CreateTensor(output_spec); + + auto add = graph->CreateOperation<tim::vx::ops::Add>(); + (*add).BindInputs({input_t0, input_t1}).BindOutputs({output_t}); + + std::vector<int32_t> data_vec_i0({1, 2, 3, 4}); + std::vector<int32_t> data_vec_i1({4, 3, 2, 1}); + + // use gRPC with the platform remote API + std::string port(argv[1]); + auto devices = tim::vx::platform::GRPCRemoteDevice::Enumerate(port); + auto device = devices[0]; + auto executor = std::make_shared<tim::vx::platform::GRPCRemoteExecutor>(device); + auto executable = executor->Compile(graph); + auto input0_handle = executable->AllocateTensor(input_spec); + auto input1_handle = executable->AllocateTensor(input_spec); + auto output_handle = executable->AllocateTensor(output_spec); + executable->SetInput(input0_handle); + executable->SetInput(input1_handle); + executable->SetOutput(output_handle); + input0_handle->CopyDataToTensor(data_vec_i0.data(), + data_vec_i0.size() * sizeof(int32_t)); + input1_handle->CopyDataToTensor(data_vec_i1.data(), + data_vec_i1.size() * sizeof(int32_t)); + executable->Submit(executable); + executor->Trigger(); + + int* data = (int*)malloc(4 * sizeof(int)); + + output_handle->CopyDataFromTensor(data); + + // each output value should be "5" in this demo + for (int i = 0; i < 4; ++i) { + std::cout << "output value: " << data[i] << std::endl; + } + free(data); + + // important: reset the remote server state after use + device->RemoteReset(); + return 0; +}
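The sample above drives execution with the two-step Submit/Trigger pattern from platform.h: Submit(ref, after) only queues an executable relative to a reference executable (passing itself, as here, simply queues it), and Trigger() then launches everything queued on the executor. A hedged sketch of chaining two hypothetical executables, assuming the `after` flag orders relative to the reference as the signature suggests:

```cpp
// Sketch: run `post` after `pre` on the same executor; `after=false`
// would instead ask for ordering ahead of the reference executable.
pre->Submit(pre);
post->Submit(pre, true);  // queue post after pre
executor->Trigger();      // launch the whole queue
```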
diff --git a/samples/lite_multi_device/CMakeLists.txt b/samples/lite_multi_device/CMakeLists.txt new file mode 100644 index 0000000..0356eef --- /dev/null +++ b/samples/lite_multi_device/CMakeLists.txt @@ -0,0 +1,13 @@ +message("samples/lite_multi_device") + +set(TARGET_NAME "lite_multi_device") + +add_executable(${TARGET_NAME} ${CMAKE_CURRENT_SOURCE_DIR}/lite_multi_device.cc) + +target_link_libraries(${TARGET_NAME} PRIVATE -Wl,--whole-archive tim-vx) +target_include_directories(${TARGET_NAME} PRIVATE + ${PROJECT_SOURCE_DIR}/include + ${PROJECT_SOURCE_DIR}/prebuilt-sdk/viplite/build/sdk/include) + +install(TARGETS ${TARGET_NAME} ${TARGET_NAME} + DESTINATION ${CMAKE_INSTALL_PREFIX}/${CMAKE_INSTALL_BINDIR}) \ No newline at end of file diff --git a/samples/lite_multi_device/lite_multi_device.cc b/samples/lite_multi_device/lite_multi_device.cc new file mode 100644 index 0000000..51aec07 --- /dev/null +++ b/samples/lite_multi_device/lite_multi_device.cc @@ -0,0 +1,79 @@ + +/**************************************************************************** +* +* Copyright (c) 2023 Vivante Corporation +* +* Permission is hereby granted, free of charge, to any person obtaining a +* copy of this software and associated documentation files (the "Software"), +* to deal in the Software without restriction, including without limitation +* the rights to use, copy, modify, merge, publish, distribute, sublicense, +* and/or sell copies of the Software, and to permit persons to whom the +* Software is furnished to do so, subject to the following conditions: +* +* The above copyright notice and this permission notice shall be included in +* all copies or substantial portions of the Software. +* +* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +* AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING +* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER +* DEALINGS IN THE SOFTWARE. +* +*****************************************************************************/ +#include <cstdlib> +#include <iostream> +#include <vector> + +#include "tim/vx/context.h" +#include "tim/vx/graph.h" +#include "tim/vx/ops.h" +#include "tim/vx/types.h" +#include "tim/vx/platform/native.h" +#include "tim/vx/platform/lite/lite_native.h" + +int main() { + // construct the tim-vx graph + auto ctx = tim::vx::Context::Create(); + auto graph = ctx->CreateGraph(); + + tim::vx::ShapeType io_shape({2, 2}); + tim::vx::TensorSpec input_spec(tim::vx::DataType::INT32, io_shape, + tim::vx::TensorAttribute::INPUT); + tim::vx::TensorSpec output_spec(tim::vx::DataType::INT32, io_shape, + tim::vx::TensorAttribute::OUTPUT); + auto input_t0 = graph->CreateTensor(input_spec); + auto input_t1 = graph->CreateTensor(input_spec); + auto output_t = graph->CreateTensor(output_spec); + + auto add = graph->CreateOperation<tim::vx::ops::Add>(); + (*add).BindInputs({input_t0, input_t1}).BindOutputs({output_t}); + + std::vector<int32_t> data_vec_i0({1, 2, 3, 4}); + std::vector<int32_t> data_vec_i1({4, 3, 2, 1}); + + auto devices = tim::vx::platform::NativeDevice::Enumerate(); + auto device = devices[0]; + auto executor = std::make_shared<tim::vx::platform::LiteNativeExecutor>(device); + auto executable = executor->Compile(graph); + auto input0_handle = executable->AllocateTensor(input_spec); + auto input1_handle = executable->AllocateTensor(input_spec); + auto output_handle = executable->AllocateTensor(output_spec); + executable->SetInput(input0_handle); + executable->SetInput(input1_handle); + executable->SetOutput(output_handle); + input0_handle->CopyDataToTensor(data_vec_i0.data(), + data_vec_i0.size() * sizeof(int32_t)); + input1_handle->CopyDataToTensor(data_vec_i1.data(), + data_vec_i1.size() * sizeof(int32_t)); + executable->Submit(executable); + executor->Trigger(); + + int* data = (int*)malloc(4 * sizeof(int)); + + output_handle->CopyDataFromTensor(data); + + // each output value should be "5" in this demo + for (int i = 0; i < 4; ++i) { + std::cout << "output value: " << data[i] << std::endl; + } + free(data); + return 0; +}
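Note that both samples index devices[0] without a guard; on a host without a visible NPU, Enumerate() presumably returns an empty vector, so a real application should check before dereferencing. A defensive variant of the lookup:

```cpp
// Defensive variant of the device lookup used by the samples.
auto devices = tim::vx::platform::NativeDevice::Enumerate();
if (devices.empty()) {
  std::cerr << "no NPU device available" << std::endl;
  return -1;
}
```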
diff --git a/src/tim/CMakeLists.txt b/src/tim/CMakeLists.txt index b1dbc34..1172297 100644 --- a/src/tim/CMakeLists.txt +++ b/src/tim/CMakeLists.txt @@ -12,8 +12,7 @@ endif() set(${TARGET_NAME}_SRCS) list(APPEND ${TARGET_NAME}_SRCS ${VX_SRC} - ${OPS_SRC} -) + ${OPS_SRC}) if(${TIM_VX_USE_EXTERNAL_OVXLIB}) find_library(OVXLIB_LIB NAMES "ovxlib") @@ -41,16 +40,60 @@ if(TIM_VX_ENABLE_LAYOUT_INFER) list(APPEND ${TARGET_NAME}_SRCS ${LAYOUT_INFER_FRAMEWORK_SRCS} - ${LAYOUT_INFER_OP_SRCS} - ) + ${LAYOUT_INFER_OP_SRCS}) endif() if(TIM_VX_ENABLE_PLATFORM) - message(STATUS "Using paltform") + message(STATUS "Using platform") aux_source_directory(./vx/platform PLATFORM_SRC) list(APPEND ${TARGET_NAME}_SRCS - ${PLATFORM_SRC} - ) + ${PLATFORM_SRC}) + + if(${TIM_VX_ENABLE_PLATFORM_LITE}) + message(STATUS "Using lite platform") + list(APPEND ${TARGET_NAME}_SRCS + ${CMAKE_CURRENT_SOURCE_DIR}/vx/platform/lite/lite_native.cc) + if(NOT VIP_LITE_SDK) + message(FATAL_ERROR "Please set VIP_LITE_SDK when using the lite platform (TIM_VX_ENABLE_PLATFORM_LITE)") + endif() + list(APPEND EXTERNAL_LIBS + ${VIP_LITE_SDK}/drivers/libNBGlinker.so + ${VIP_LITE_SDK}/drivers/libVIPlite.so) + list(APPEND INC_DIRS ${VIP_LITE_SDK}/include) + endif() + + if(TIM_VX_ENABLE_GRPC) + list(APPEND ${TARGET_NAME}_SRCS + "${CMAKE_CURRENT_SOURCE_DIR}/vx/platform/grpc/grpc_platform_client.cc" + "${CMAKE_CURRENT_SOURCE_DIR}/vx/platform/grpc/grpc_remote.cc") + + # Proto file + get_filename_component(gp_proto "${CMAKE_CURRENT_SOURCE_DIR}/vx/platform/grpc/grpc_platform.proto" ABSOLUTE) + get_filename_component(gp_proto_path "${gp_proto}" PATH) + + # Generate sources + set(gp_proto_srcs "${CMAKE_CURRENT_BINARY_DIR}/grpc_platform.pb.cc") + set(gp_proto_hdrs "${CMAKE_CURRENT_BINARY_DIR}/grpc_platform.pb.h") + set(gp_grpc_srcs "${CMAKE_CURRENT_BINARY_DIR}/grpc_platform.grpc.pb.cc") + set(gp_grpc_hdrs "${CMAKE_CURRENT_BINARY_DIR}/grpc_platform.grpc.pb.h") + add_custom_command( + OUTPUT "${gp_proto_srcs}" "${gp_proto_hdrs}" "${gp_grpc_srcs}" "${gp_grpc_hdrs}" + COMMAND ${PROTOBUF_PROTOC} + ARGS --grpc_out "${CMAKE_CURRENT_BINARY_DIR}" + --cpp_out "${CMAKE_CURRENT_BINARY_DIR}" + -I "${gp_proto_path}" + --plugin=protoc-gen-grpc="${GRPC_CPP_PLUGIN_EXECUTABLE}" + "${gp_proto}" + DEPENDS "${gp_proto}") + + include_directories(${CMAKE_CURRENT_BINARY_DIR}) + + list(APPEND ${TARGET_NAME}_SRCS + ${gp_grpc_srcs} + ${gp_grpc_hdrs} + ${gp_proto_srcs} + ${gp_proto_hdrs}) + endif() endif() foreach(src_file ${${TARGET_NAME}_SRCS}) @@ -67,8 +110,7 @@ list(APPEND INC_DIRS ${CMAKE_CURRENT_SOURCE_DIR}/vx ${CMAKE_CURRENT_SOURCE_DIR}/transform ${OVXLIB_INCLUDE_DIR} - ${OVXDRV_INCLUDE_DIRS} -) + ${OVXDRV_INCLUDE_DIRS}) if(${TIM_VX_ENABLE_VIPLITE}) aux_source_directory(./lite LITE_SRC) @@ -77,10 +119,9 @@ if(${TIM_VX_ENABLE_VIPLITE}) list(APPEND EXTERNAL_LIBS ${VIPLITE_DRV_LIBRARIES}) list(APPEND INC_DIRS ${CMAKE_CURRENT_SOURCE_DIR}/lite - ${VIPLITE_DRV_INCLUDE_DIR} - ) + ${VIPLITE_DRV_INCLUDE_DIR}) endif() - +include_directories(${INC_DIRS}) # convert op list as compile flags so that we can implement compile compatable easier if(${TIM_VX_USE_EXTERNAL_OVXLIB}) file(STRINGS "${OVXLIB_INC}/interface/ops.def" ops_file_content) @@ -97,49 +138,88 @@ set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} ${op_as_flags}") add_library(${TARGET_NAME} ${${TARGET_NAME}_SRCS}) target_include_directories(${TARGET_NAME} PRIVATE ${INC_DIRS}) target_link_libraries(${TARGET_NAME} PUBLIC - -Wl,--no-whole-archive ${EXTERNAL_LIBS} ${OVXDRV_LIBRARIES}) + -Wl,--no-whole-archive ${OVXDRV_LIBRARIES} ${EXTERNAL_LIBS}) if(${TIM_VX_USE_EXTERNAL_OVXLIB}) #-Wl,--whole-archive should not applied to external library, but only for shared library - target_link_libraries(${TARGET_NAME} PUBLIC tim_internal) + target_link_libraries(${TARGET_NAME} PUBLIC
tim_internal) endif() -if (NOT CMAKE_INSTALL_LIBDIR) +if(TIM_VX_ENABLE_PLATFORM AND TIM_VX_ENABLE_GRPC) + target_link_libraries(${TARGET_NAME} PUBLIC + ${GRPCPP_REFLECTION} + ${GRPC_GRPCPP} + ${PROTOBUF_LIBPROTOBUF}) + + add_executable(grpc_platform_server + ${CMAKE_CURRENT_SOURCE_DIR}/vx/platform/grpc/grpc_platform_server.cc) + target_link_libraries(grpc_platform_server -Wl,--whole-archive ${TARGET_NAME}) + install(TARGETS grpc_platform_server grpc_platform_server + DESTINATION ${CMAKE_INSTALL_PREFIX}/${CMAKE_INSTALL_BINDIR}) +endif() +if(NOT CMAKE_INSTALL_LIBDIR) set(CMAKE_INSTALL_LIBDIR "lib") endif() +# Install install(TARGETS ${TARGET_NAME} ${TARGET_NAME} DESTINATION ${CMAKE_INSTALL_PREFIX}/${CMAKE_INSTALL_LIBDIR}) -install(DIRECTORY ${CMAKE_SOURCE_DIR}/include/tim/vx - DESTINATION ${CMAKE_INSTALL_PREFIX}/${CMAKE_INSTALL_INCLUDEDIR}/tim) +install( + FILES + ${CMAKE_SOURCE_DIR}/include/tim/vx/builtin_op.h + ${CMAKE_SOURCE_DIR}/include/tim/vx/compile_option.h + ${CMAKE_SOURCE_DIR}/include/tim/vx/context.h + ${CMAKE_SOURCE_DIR}/include/tim/vx/graph.h + ${CMAKE_SOURCE_DIR}/include/tim/vx/operation.h + ${CMAKE_SOURCE_DIR}/include/tim/vx/ops.h + ${CMAKE_SOURCE_DIR}/include/tim/vx/tensor.h + ${CMAKE_SOURCE_DIR}/include/tim/vx/types.h + DESTINATION ${CMAKE_INSTALL_PREFIX}/${CMAKE_INSTALL_INCLUDEDIR}/tim/vx) -install(DIRECTORY ${CMAKE_SOURCE_DIR}/include/tim/lite +install(DIRECTORY ${CMAKE_SOURCE_DIR}/include/tim/vx/ops + DESTINATION ${CMAKE_INSTALL_PREFIX}/${CMAKE_INSTALL_INCLUDEDIR}/tim/vx) + +if(TIM_VX_ENABLE_VIPLITE) + install(DIRECTORY ${CMAKE_SOURCE_DIR}/include/tim/lite DESTINATION ${CMAKE_INSTALL_PREFIX}/${CMAKE_INSTALL_INCLUDEDIR}/tim) +endif() if(TIM_VX_ENABLE_LAYOUT_INFER) - install(DIRECTORY ${CMAKE_SOURCE_DIR}/include/tim/transform - DESTINATION ${CMAKE_INSTALL_PREFIX}/${CMAKE_INSTALL_INCLUDEDIR}/tim) + install(DIRECTORY ${CMAKE_SOURCE_DIR}/include/tim/transform + DESTINATION ${CMAKE_INSTALL_PREFIX}/${CMAKE_INSTALL_INCLUDEDIR}/tim) +endif() + +if(TIM_VX_ENABLE_PLATFORM) + install( + FILES + ${CMAKE_SOURCE_DIR}/include/tim/vx/platform/platform.h + ${CMAKE_SOURCE_DIR}/include/tim/vx/platform/native.h + DESTINATION ${CMAKE_INSTALL_PREFIX}/${CMAKE_INSTALL_INCLUDEDIR}/tim/vx/platform) + if(TIM_VX_ENABLE_PLATFORM_LITE) + install(DIRECTORY ${CMAKE_SOURCE_DIR}/include/tim/vx/platform/lite + DESTINATION ${CMAKE_INSTALL_PREFIX}/${CMAKE_INSTALL_INCLUDEDIR}/tim/vx/platform) + endif() + if(TIM_VX_ENABLE_GRPC) + install(DIRECTORY ${CMAKE_SOURCE_DIR}/include/tim/vx/platform/grpc + DESTINATION ${CMAKE_INSTALL_PREFIX}/${CMAKE_INSTALL_INCLUDEDIR}/tim/vx/platform) + endif() endif() if(TIM_VX_ENABLE_TEST) include(GoogleTest) add_executable(unit_test ${${TARGET_NAME}_TEST_SRCS}) - target_link_libraries(unit_test PRIVATE gtest gtest_main gmock gmock_main ${TARGET_NAME} ${OVXDRV_LIBRARIES}) + target_link_libraries(unit_test PRIVATE + -Wl,--whole-archive ${TARGET_NAME} + -Wl,--no-whole-archive gtest gtest_main gmock gmock_main ${OVXDRV_LIBRARIES}) target_include_directories(unit_test PRIVATE ${PROJECT_SOURCE_DIR}/include ${CMAKE_CURRENT_SOURCE_DIR}/vx ${OVXLIB_INCLUDE_DIR} - ${INC_DIRS} - ) + ${INC_DIRS}) install(TARGETS unit_test DESTINATION ${CMAKE_INSTALL_PREFIX}/${CMAKE_INSTALL_BINDIR}) endif() -if(TIM_VX_ENABLE_PLATFORM) - install(DIRECTORY ${CMAKE_SOURCE_DIR}/include/tim/vx/platform - DESTINATION ${CMAKE_INSTALL_PREFIX}/${CMAKE_INSTALL_INCLUDEDIR}/tim/vx) -endif() - add_subdirectory("utils") diff --git a/src/tim/vx/platform/grpc/README.md b/src/tim/vx/platform/grpc/README.md new file mode 100644
index 0000000..22e5dc3 --- /dev/null +++ b/src/tim/vx/platform/grpc/README.md @@ -0,0 +1,51 @@ +tim-vx uses gRPC to provide a remote service, by which you can deploy your model on a remote device. +## Build and run on host +1. Build and install gRPC, see [build gRPC C++](https://github.com/grpc/grpc/blob/master/BUILDING.md) +2. Build tim-vx with gRPC +```shell +$ cd ${tim_vx_root} +$ mkdir host_build && cd host_build +$ cmake .. \ + -DTIM_VX_ENABLE_PLATFORM=ON \ + -DTIM_VX_ENABLE_GRPC=ON \ + -DTIM_VX_BUILD_EXAMPLES=ON \ + -DCMAKE_PREFIX_PATH=${grpc_host_install_path} +$ make -j4 +$ make install +``` +3. Start server +```shell +$ export LD_LIBRARY_PATH=${tim_vx_root}/host_build/install/lib:${tim_vx_root}/prebuilt-sdk/x86_64_linux/lib:$LD_LIBRARY_PATH +$ cd ${tim_vx_root}/host_build/install/bin +$ ./grpc_platform_server 0.0.0.0:50051 +``` +4. Run demo + +Open a new terminal +```shell +$ export LD_LIBRARY_PATH=${tim_vx_root}/host_build/install/lib:${tim_vx_root}/prebuilt-sdk/x86_64_linux/lib:$LD_LIBRARY_PATH +$ cd ${tim_vx_root}/host_build/install/bin +$ ./grpc_multi_device 0.0.0.0:50051 +``` +## Build for device +1. Cross-compile gRPC, see [Cross-compile gRPC](https://github.com/grpc/grpc/blob/master/BUILDING.md#cross-compiling) + +Note: you should keep both gRPC installations, one for the host and one for the device. + +2. Build tim-vx with the host gRPC and the device gRPC +```shell +$ cd ${tim_vx_root} +$ mkdir device_build && cd device_build +$ cmake .. \ + -DTIM_VX_ENABLE_PLATFORM=ON \ + -DTIM_VX_ENABLE_GRPC=ON \ + -DTIM_VX_BUILD_EXAMPLES=ON \ + -DCMAKE_PREFIX_PATH=${grpc_host_install_path} \ + -DCMAKE_TOOLCHAIN_FILE=${path_to_tool_chain_file} \ + -DEXTERNAL_VIV_SDK=${tim_vx_root}/prebuilt-sdk/x86_64_linux \ + -DProtobuf_DIR=${grpc_device_install_path}/lib/cmake/protobuf \ + -DgRPC_DIR=${grpc_device_install_path}/lib/cmake/grpc \ + -Dabsl_DIR=${grpc_device_install_path}/lib/cmake/absl +$ make -j4 +$ make install +``` \ No newline at end of file diff --git a/src/tim/vx/platform/grpc/grpc_platform.proto b/src/tim/vx/platform/grpc/grpc_platform.proto new file mode 100644 index 0000000..75111f4 --- /dev/null +++ b/src/tim/vx/platform/grpc/grpc_platform.proto @@ -0,0 +1,120 @@ +syntax = "proto3"; + +package rpc; + +service GRPCPlatform { + rpc Enumerate(EmptyMsg) returns (DeviceCount) {} + + rpc CreateExecutor(Device) returns (Executor) {} + + rpc CreateExecutable(GraphInfo) returns (Executable) {} + + rpc AllocateTensor(TensorInfo) returns (Tensor) {} + + rpc SetInput(IOTensor) returns (Status) {} + + rpc SetOutput(IOTensor) returns (Status) {} + + rpc Submit(Executable) returns (Status) {} + + rpc Trigger(Executor) returns (Status) {} + + rpc CopyDataToTensor(TensorData) returns (Status) {} + + rpc CopyDataFromTensor(Tensor) returns (Data) {} + + rpc Clean(EmptyMsg) returns (Status) {} +} + +message EmptyMsg {} + +message DeviceCount { + int32 count = 1; +} + +message Device { + int32 device = 1; +} + +message Executor { + int32 executor = 1; +} + +message Executable { + int32 executable = 1; +} + +message Tensor { + int32 tensor = 1; +} + +message Data { + bytes data = 1; +} + +message GraphInfo { + int32 executor = 1; + int32 input_size = 2; + int32 output_size = 3; + bytes nbg = 4; +} + +enum DataType { + UNKNOWN = 0; + INT8 = 1; + UINT8 = 2; + INT16 = 3; + UINT16 = 4; + INT32 = 5; + UINT32 = 6; + FLOAT16 = 7; + FLOAT32 = 8; + INT64 = 9; + BOOL8 = 10; +} + +enum TensorAttr { + INVALID = 0; + INPUT = 1; + OUTPUT = 2; +} + +enum QuantType { + NONE = 0; + ASYMMETRIC = 1; + SYMMETRIC_PER_CHANNEL = 2; +} +
+message Quantization { + QuantType quant_type = 1; + int32 channel_dim = 2; + repeated int32 scales = 3; + repeated int32 zero_points = 4; +} + +message TensorSpec { + DataType data_type = 1; + repeated int32 shape = 2; + TensorAttr tensor_attr = 3; + Quantization quant = 4; + +} + +message TensorInfo { + int32 executable = 1; + TensorSpec tensor_spec = 2; +} + +message IOTensor { + int32 tensor = 1; + int32 executable = 2; +} + +message TensorData { + int32 tensor = 1; + bytes data = 2; +} + +message Status { + bool status = 1; +} \ No newline at end of file diff --git a/src/tim/vx/platform/grpc/grpc_platform_client.cc b/src/tim/vx/platform/grpc/grpc_platform_client.cc new file mode 100644 index 0000000..e80a414 --- /dev/null +++ b/src/tim/vx/platform/grpc/grpc_platform_client.cc @@ -0,0 +1,240 @@ +/**************************************************************************** +* +* Copyright (c) 2023 Vivante Corporation +* +* Permission is hereby granted, free of charge, to any person obtaining a +* copy of this software and associated documentation files (the "Software"), +* to deal in the Software without restriction, including without limitation +* the rights to use, copy, modify, merge, publish, distribute, sublicense, +* and/or sell copies of the Software, and to permit persons to whom the +* Software is furnished to do so, subject to the following conditions: +* +* The above copyright notice and this permission notice shall be included in +* all copies or substantial portions of the Software. +* +* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +* AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING +* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER +* DEALINGS IN THE SOFTWARE. 
+* +*****************************************************************************/ +#include "grpc_platform_client.h" + +#include <cstring> + +namespace { +::rpc::DataType MapDataType(tim::vx::DataType type) { + ::rpc::DataType rpc_type; + switch (type) { + case tim::vx::DataType::FLOAT32: + rpc_type = ::rpc::DataType::FLOAT32; + break; + case tim::vx::DataType::FLOAT16: + rpc_type = ::rpc::DataType::FLOAT16; + break; + case tim::vx::DataType::INT64: + rpc_type = ::rpc::DataType::INT64; + break; + case tim::vx::DataType::INT32: + rpc_type = ::rpc::DataType::INT32; + break; + case tim::vx::DataType::INT16: + rpc_type = ::rpc::DataType::INT16; + break; + case tim::vx::DataType::INT8: + rpc_type = ::rpc::DataType::INT8; + break; + case tim::vx::DataType::UINT32: + rpc_type = ::rpc::DataType::UINT32; + break; + case tim::vx::DataType::UINT16: + rpc_type = ::rpc::DataType::UINT16; + break; + case tim::vx::DataType::UINT8: + rpc_type = ::rpc::DataType::UINT8; + break; + case tim::vx::DataType::BOOL8: + rpc_type = ::rpc::DataType::BOOL8; + break; + default: + std::cout << "unknown tim vx data type" << std::endl; + assert(false); + } + return rpc_type; +} + +::rpc::TensorAttr MapTensorAttr(tim::vx::TensorAttribute attr) { + ::rpc::TensorAttr rpc_attr; + switch (attr) { + case tim::vx::TensorAttribute::INPUT: + rpc_attr = ::rpc::TensorAttr::INPUT; + break; + case tim::vx::TensorAttribute::OUTPUT: + rpc_attr = ::rpc::TensorAttr::OUTPUT; + break; + default: + std::cout << "invalid tim vx tensor attr" << std::endl; + assert(false); + } + return rpc_attr; +} + +::rpc::QuantType MapQuantType(tim::vx::QuantType quant) { + ::rpc::QuantType rpc_quant; + switch (quant) { + case tim::vx::QuantType::NONE: + rpc_quant = ::rpc::QuantType::NONE; + break; + case tim::vx::QuantType::ASYMMETRIC: + rpc_quant = ::rpc::QuantType::ASYMMETRIC; + break; + case tim::vx::QuantType::SYMMETRIC_PER_CHANNEL: + rpc_quant = ::rpc::QuantType::SYMMETRIC_PER_CHANNEL; + break; + default: + std::cout << "invalid tim vx quant type" << std::endl; + assert(false); + } + return rpc_quant; +} + +} // namespace +namespace tim { +namespace vx { +namespace platform { +int32_t GRPCPlatformClient::Enumerate() { + ::grpc::ClientContext context; + ::rpc::EmptyMsg emsg; + ::rpc::DeviceCount device_count; + stub_->Enumerate(&context, emsg, &device_count); + + return device_count.count(); +} + +int32_t GRPCPlatformClient::CreateExecutor(int32_t device) { + ::grpc::ClientContext context; + ::rpc::Device device_msg; + device_msg.set_device(device); + ::rpc::Executor executor_msg; + stub_->CreateExecutor(&context, device_msg, &executor_msg); + + return executor_msg.executor(); +} + +int32_t GRPCPlatformClient::CreateExecutable(int32_t executor, + const std::vector<char>& nbg, + int32_t input_size, + int32_t output_size) { + ::grpc::ClientContext context; + ::rpc::GraphInfo graph_info_msg; + graph_info_msg.set_executor(executor); + graph_info_msg.set_input_size(input_size); + graph_info_msg.set_output_size(output_size); + std::string nbg_str(nbg.data(), nbg.size()); + graph_info_msg.set_nbg(nbg_str); + ::rpc::Executable executable_msg; + stub_->CreateExecutable(&context, graph_info_msg, &executable_msg); + + return executable_msg.executable(); +} + +int32_t GRPCPlatformClient::AllocateTensor(int32_t executable, + const tim::vx::TensorSpec& spec) { + ::grpc::ClientContext context; + ::rpc::TensorInfo tensor_info_msg; + ::rpc::Tensor tensor_msg; + tensor_info_msg.set_executable(executable); + tensor_info_msg.mutable_tensor_spec()->set_data_type( + MapDataType(spec.datatype_)); +
tensor_info_msg.mutable_tensor_spec()->set_tensor_attr( + MapTensorAttr(spec.attr_)); + tensor_info_msg.mutable_tensor_spec()->mutable_quant()->set_quant_type( + MapQuantType(spec.quantization_.Type())); + for (uint32_t s : spec.shape_) { + tensor_info_msg.mutable_tensor_spec()->add_shape(s); + } + + stub_->AllocateTensor(&context, tensor_info_msg, &tensor_msg); + return tensor_msg.tensor(); +} + +bool GRPCPlatformClient::SetInput(int32_t executable, int32_t tensor) { + ::grpc::ClientContext context; + ::rpc::IOTensor in_tensor_msg; + ::rpc::Status status_msg; + in_tensor_msg.set_executable(executable); + in_tensor_msg.set_tensor(tensor); + + stub_->SetInput(&context, in_tensor_msg, &status_msg); + return status_msg.status(); +} + +bool GRPCPlatformClient::SetOutput(int32_t executable, int32_t tensor) { + ::grpc::ClientContext context; + ::rpc::IOTensor out_tensor_msg; + ::rpc::Status status_msg; + out_tensor_msg.set_executable(executable); + out_tensor_msg.set_tensor(tensor); + + stub_->SetOutput(&context, out_tensor_msg, &status_msg); + return status_msg.status(); +} + +bool GRPCPlatformClient::Submit(int32_t executable) { + ::grpc::ClientContext context; + ::rpc::Executable executable_msg; + ::rpc::Status status_msg; + executable_msg.set_executable(executable); + + stub_->Submit(&context, executable_msg, &status_msg); + return status_msg.status(); +} + +bool GRPCPlatformClient::Trigger(int32_t executor) { + ::grpc::ClientContext context; + ::rpc::Executor executor_msg; + ::rpc::Status status_msg; + executor_msg.set_executor(executor); + + stub_->Trigger(&context, executor_msg, &status_msg); + return status_msg.status(); +} + +bool GRPCPlatformClient::CopyDataToTensor(int32_t tensor, const void* data, + int32_t length) { + ::grpc::ClientContext context; + ::rpc::TensorData tensor_data_msg; + ::rpc::Status status_msg; + tensor_data_msg.set_tensor(tensor); + std::string data_str(reinterpret_cast<const char*>(data), length); + tensor_data_msg.set_data(data_str); + + stub_->CopyDataToTensor(&context, tensor_data_msg, &status_msg); + return status_msg.status(); +} + +bool GRPCPlatformClient::CopyDataFromTensor(int32_t tensor, void* data) { + ::grpc::ClientContext context; + ::rpc::Tensor tensor_msg; + ::rpc::Data data_msg; + tensor_msg.set_tensor(tensor); + + stub_->CopyDataFromTensor(&context, tensor_msg, &data_msg); + std::string data_str = data_msg.data(); + memcpy(data, data_str.data(), data_str.size()); + return (data != nullptr); +} + +void GRPCPlatformClient::Clean() { + ::grpc::ClientContext context; + ::rpc::EmptyMsg emsg; + ::rpc::Status status_msg; + + stub_->Clean(&context, emsg, &status_msg); +} + +} // namespace platform +} // namespace vx +} // namespace tim diff --git a/src/tim/vx/platform/grpc/grpc_platform_client.h b/src/tim/vx/platform/grpc/grpc_platform_client.h new file mode 100644 index 0000000..f29ebed --- /dev/null +++ b/src/tim/vx/platform/grpc/grpc_platform_client.h @@ -0,0 +1,77 @@ +/**************************************************************************** +* +* Copyright (c) 2023 Vivante Corporation +* +* Permission is hereby granted, free of charge, to any person obtaining a +* copy of this software and associated documentation files (the "Software"), +* to deal in the Software without restriction, including without limitation +* the rights to use, copy, modify, merge, publish, distribute, sublicense, +* and/or sell copies of the Software, and to permit persons to whom the +* Software is furnished to do so, subject to the following conditions: +*
+* The above copyright notice and this permission notice shall be included in +* all copies or substantial portions of the Software. +* +* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +* AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING +* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER +* DEALINGS IN THE SOFTWARE. +* +*****************************************************************************/ +#ifndef _GRPC_PLATFORM_CLIENT_ +#define _GRPC_PLATFORM_CLIENT_ + +#include <grpcpp/grpcpp.h> +#include <iostream> +#include <memory> +#include <string> +#include <vector> + +#include "tim/vx/context.h" +#include "tim/vx/graph.h" +#include "tim/vx/ops.h" +#include "tim/vx/types.h" +#include "grpc_platform.grpc.pb.h" + +namespace tim { +namespace vx { +namespace platform { +class GRPCPlatformClient { + public: + GRPCPlatformClient(const std::string& port) + : stub_(rpc::GRPCPlatform::NewStub( + grpc::CreateChannel(port, grpc::InsecureChannelCredentials()))) {} + + int32_t Enumerate(); + + int32_t CreateExecutor(int32_t device); + + int32_t CreateExecutable(int32_t executor, const std::vector<char>& nbg, + int32_t input_size, int32_t output_size); + + int32_t AllocateTensor(int32_t executable, const tim::vx::TensorSpec& spec); + + bool SetInput(int32_t executable, int32_t tensor); + + bool SetOutput(int32_t executable, int32_t tensor); + + bool Submit(int32_t executable); + + bool Trigger(int32_t executor); + + bool CopyDataToTensor(int32_t tensor, const void* data, int32_t length); + + bool CopyDataFromTensor(int32_t tensor, void* data); + + void Clean(); + + private: + std::unique_ptr<rpc::GRPCPlatform::Stub> stub_; +}; +} // namespace platform +} // namespace vx +} // namespace tim +#endif \ No newline at end of file
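GRPCPlatformClient maps the GRPCPlatform service one call per method, and every remote object is addressed by the integer id the server hands back. A minimal sketch of the full call sequence; the nbg buffer, the specs, and the data pointers are hypothetical names, and error checking is omitted:

```cpp
// Sketch: the id-based wire protocol behind the GRPCRemote* proxies,
// assuming `nbg` holds a compiled network binary graph (NBG).
tim::vx::platform::GRPCPlatformClient client("0.0.0.0:50051");
int32_t device_count = client.Enumerate();     // devices on the server
int32_t executor = client.CreateExecutor(0);   // executor on device 0
int32_t executable = client.CreateExecutable(executor, nbg, /*inputs=*/1,
                                             /*outputs=*/1);
int32_t in = client.AllocateTensor(executable, input_spec);
int32_t out = client.AllocateTensor(executable, output_spec);
client.SetInput(executable, in);
client.SetOutput(executable, out);
client.CopyDataToTensor(in, in_data, in_bytes);
client.Submit(executable);
client.Trigger(executor);
client.CopyDataFromTensor(out, out_data);
client.Clean();  // drop all executor/executable/tensor state on the server
```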
diff --git a/src/tim/vx/platform/grpc/grpc_platform_server.cc b/src/tim/vx/platform/grpc/grpc_platform_server.cc new file mode 100644 index 0000000..943fff2 --- /dev/null +++ b/src/tim/vx/platform/grpc/grpc_platform_server.cc @@ -0,0 +1,348 @@ +/**************************************************************************** +* +* Copyright (c) 2023 Vivante Corporation +* +* Permission is hereby granted, free of charge, to any person obtaining a +* copy of this software and associated documentation files (the "Software"), +* to deal in the Software without restriction, including without limitation +* the rights to use, copy, modify, merge, publish, distribute, sublicense, +* and/or sell copies of the Software, and to permit persons to whom the +* Software is furnished to do so, subject to the following conditions: +* +* The above copyright notice and this permission notice shall be included in +* all copies or substantial portions of the Software. +* +* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +* AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING +* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER +* DEALINGS IN THE SOFTWARE. +* +*****************************************************************************/ +#include <cstring> +#include <grpcpp/grpcpp.h> +#include <iostream> +#include <memory> +#include <unordered_map> +#include <vector> + +#include "grpc_platform.grpc.pb.h" +#include "tim/vx/platform/native.h" +#include "vsi_nn_pub.h" +#ifdef ENABLE_PLATFORM_LITE +#include "tim/vx/platform/lite/lite_native.h" +#endif + +std::unordered_map<int32_t, std::shared_ptr<tim::vx::platform::IDevice>> + device_table; +std::unordered_map<int32_t, std::shared_ptr<tim::vx::platform::IExecutor>> + executor_table; +std::vector<std::shared_ptr<tim::vx::platform::IExecutable>> executable_table; +std::vector<std::shared_ptr<tim::vx::platform::ITensorHandle>> tensor_table; + +namespace { +tim::vx::DataType MapDataType(::rpc::DataType type) { + tim::vx::DataType vx_type; + switch (type) { + case ::rpc::DataType::FLOAT32: + vx_type = tim::vx::DataType::FLOAT32; + break; + case ::rpc::DataType::FLOAT16: + vx_type = tim::vx::DataType::FLOAT16; + break; + case ::rpc::DataType::INT64: + vx_type = tim::vx::DataType::INT64; + break; + case ::rpc::DataType::INT32: + vx_type = tim::vx::DataType::INT32; + break; + case ::rpc::DataType::INT16: + vx_type = tim::vx::DataType::INT16; + break; + case ::rpc::DataType::INT8: + vx_type = tim::vx::DataType::INT8; + break; + case ::rpc::DataType::UINT32: + vx_type = tim::vx::DataType::UINT32; + break; + case ::rpc::DataType::UINT16: + vx_type = tim::vx::DataType::UINT16; + break; + case ::rpc::DataType::UINT8: + vx_type = tim::vx::DataType::UINT8; + break; + case ::rpc::DataType::BOOL8: + vx_type = tim::vx::DataType::BOOL8; + break; + default: + std::cout << "unknown data type" << std::endl; + assert(false); + } + return vx_type; +} + +tim::vx::TensorAttribute MapTensorAttr(::rpc::TensorAttr attr) { + tim::vx::TensorAttribute vx_attr; + switch (attr) { + case ::rpc::TensorAttr::INPUT: + vx_attr = tim::vx::TensorAttribute::INPUT; + break; + case ::rpc::TensorAttr::OUTPUT: + vx_attr = tim::vx::TensorAttribute::OUTPUT; + break; + default: + std::cout << "invalid tensor attr" << std::endl; + assert(false); + } + return vx_attr; +} + +tim::vx::QuantType MapQuantType(::rpc::QuantType quant) { + tim::vx::QuantType vx_quant; + switch (quant) { + case ::rpc::QuantType::NONE: + vx_quant = tim::vx::QuantType::NONE; + break; + case ::rpc::QuantType::ASYMMETRIC: + vx_quant = tim::vx::QuantType::ASYMMETRIC; + break; + case ::rpc::QuantType::SYMMETRIC_PER_CHANNEL: + vx_quant = tim::vx::QuantType::SYMMETRIC_PER_CHANNEL; + break; + default: + std::cout << "invalid quant type" << std::endl; + assert(false); + } + return vx_quant; +} +} // namespace + +class GRPCPlatformService final : public ::rpc::GRPCPlatform::Service { + public: + ::grpc::Status Enumerate(::grpc::ServerContext* context, + const ::rpc::EmptyMsg* request, + ::rpc::DeviceCount* response) override { + VSILOGD("------ Calling gRPC Enumerate ------"); + (void)context; + (void)request; + auto devices = tim::vx::platform::NativeDevice::Enumerate(); + response->set_count(devices.size()); + for (int i = 0; i < static_cast<int>(devices.size()); ++i) { + device_table.insert({i, devices[i]}); + } + return ::grpc::Status::OK; + } + + ::grpc::Status CreateExecutor(::grpc::ServerContext* context, + const ::rpc::Device* request, + ::rpc::Executor* response) override { + VSILOGD("------ Calling gRPC CreateExecutor ------"); + (void)context; + int32_t id = request->device(); + auto device = device_table[id]; +#ifdef ENABLE_PLATFORM_LITE + auto executor = + std::make_shared<tim::vx::platform::LiteNativeExecutor>(device); +#else + auto executor = std::make_shared<tim::vx::platform::NativeExecutor>(device); +#endif + executor_table.insert({id, executor}); + response->set_executor(id); + return ::grpc::Status::OK; + } + + ::grpc::Status CreateExecutable(::grpc::ServerContext* context, + const ::rpc::GraphInfo* request, +
::rpc::Executable* response) override { + VSILOGD("------ Calling gRPC CreateExecutable ------"); + (void)context; + int32_t id = request->executor(); + auto executor = executor_table[id]; + std::string nbg_str = request->nbg(); + std::vector<char> nbg_vec(nbg_str.size()); + memcpy(nbg_vec.data(), nbg_str.data(), nbg_str.size()); +#ifdef ENABLE_PLATFORM_LITE + auto executable = std::make_shared<tim::vx::platform::LiteNativeExecutable>( + executor, nbg_vec); +#else + int32_t input_size = request->input_size(); + int32_t output_size = request->output_size(); + auto executable = std::make_shared<tim::vx::platform::NativeExecutable>( + executor, nbg_vec, input_size, output_size); +#endif + executable_table.push_back(executable); + response->set_executable(executable_table.size() - 1); + return ::grpc::Status::OK; + } + + ::grpc::Status AllocateTensor(::grpc::ServerContext* context, + const ::rpc::TensorInfo* request, + ::rpc::Tensor* response) override { + VSILOGD("------ Calling gRPC AllocateTensor ------"); + (void)context; + int32_t id = request->executable(); + auto executable = executable_table[id]; + tim::vx::DataType data_type = + MapDataType(request->tensor_spec().data_type()); + tim::vx::TensorAttribute tensor_attr = + MapTensorAttr(request->tensor_spec().tensor_attr()); + tim::vx::QuantType quant_type = + MapQuantType(request->tensor_spec().quant().quant_type()); + auto shape = request->tensor_spec().shape(); + tim::vx::ShapeType vx_shape(shape.size()); + for (int i = 0; i < shape.size(); ++i) vx_shape[i] = shape[i]; + tim::vx::TensorSpec tensor_spec; + if (quant_type == tim::vx::QuantType::NONE) { + tensor_spec = tim::vx::TensorSpec(data_type, vx_shape, tensor_attr); + } else { + tim::vx::Quantization quantization; + quantization.SetType(quant_type); + quantization.SetChannelDim(request->tensor_spec().quant().channel_dim()); + auto scales = request->tensor_spec().quant().scales(); + auto zero_points = request->tensor_spec().quant().zero_points(); + std::vector<float> vx_scales(scales.size()); + std::vector<int32_t> vx_zero_points(zero_points.size()); + for (int i = 0; i < scales.size(); ++i) vx_scales[i] = scales[i]; + for (int i = 0; i < zero_points.size(); ++i) { + vx_zero_points[i] = zero_points[i]; + } + quantization.SetScales(vx_scales); + quantization.SetZeroPoints(vx_zero_points); + + tensor_spec = + tim::vx::TensorSpec(data_type, vx_shape, tensor_attr, quantization); + } + + auto tensor_handle = executable->AllocateTensor(tensor_spec); + tensor_table.push_back(tensor_handle); + response->set_tensor(tensor_table.size() - 1); + + return ::grpc::Status::OK; + } + + ::grpc::Status SetInput(::grpc::ServerContext* context, + const ::rpc::IOTensor* request, + ::rpc::Status* response) override { + VSILOGD("------ Calling gRPC SetInput ------"); + (void)context; + int32_t tensor_id = request->tensor(); + int32_t executable_id = request->executable(); + auto executable = executable_table[executable_id]; + auto tensor_handle = tensor_table[tensor_id]; + if (tensor_handle->GetTensor()->GetSpec().attr_ != + tim::vx::TensorAttribute::INPUT) { + VSILOGE("You are setting a non-input tensor as graph input"); + } + executable->SetInput(tensor_handle); + response->set_status(true); + return ::grpc::Status::OK; + } + + ::grpc::Status SetOutput(::grpc::ServerContext* context, + const ::rpc::IOTensor* request, + ::rpc::Status* response) override { + VSILOGD("------ Calling gRPC SetOutput ------"); + (void)context; + int32_t tensor_id = request->tensor(); + int32_t executable_id = request->executable(); + auto executable = executable_table[executable_id]; + auto tensor_handle =
+    if (tensor_handle->GetTensor()->GetSpec().attr_ !=
+        tim::vx::TensorAttribute::OUTPUT) {
+      VSILOGE("You are setting a non-output tensor as graph output");
+    }
+    executable->SetOutput(tensor_handle);
+    response->set_status(true);
+    return ::grpc::Status::OK;
+  }
+
+  ::grpc::Status Submit(::grpc::ServerContext* context,
+                        const ::rpc::Executable* request,
+                        ::rpc::Status* response) override {
+    VSILOGD("------ Calling gRPC Submit ------");
+    (void)context;
+    int32_t id = request->executable();
+    auto executable = executable_table[id];
+    executable->Submit(executable);
+    response->set_status(true);
+    return ::grpc::Status::OK;
+  }
+
+  ::grpc::Status Trigger(::grpc::ServerContext* context,
+                         const ::rpc::Executor* request,
+                         ::rpc::Status* response) override {
+    VSILOGD("------ Calling gRPC Trigger ------");
+    (void)context;
+    int32_t id = request->executor();
+    auto executor = executor_table[id];
+    executor->Trigger();
+    response->set_status(true);
+    return ::grpc::Status::OK;
+  }
+
+  ::grpc::Status CopyDataToTensor(::grpc::ServerContext* context,
+                                  const ::rpc::TensorData* request,
+                                  ::rpc::Status* response) override {
+    VSILOGD("------ Calling gRPC CopyDataToTensor ------");
+    (void)context;
+    int32_t id = request->tensor();
+    auto tensor_handle = tensor_table[id];
+    std::string data_str = request->data();
+    bool status =
+        tensor_handle->CopyDataToTensor(data_str.data(), data_str.size());
+    response->set_status(status);
+    return ::grpc::Status::OK;
+  }
+
+  ::grpc::Status CopyDataFromTensor(::grpc::ServerContext* context,
+                                    const ::rpc::Tensor* request,
+                                    ::rpc::Data* response) override {
+    VSILOGD("------ Calling gRPC CopyDataFromTensor ------");
+    (void)context;
+    int32_t id = request->tensor();
+    auto tensor_handle = tensor_table[id];
+    size_t data_size = tensor_handle->GetTensor()->GetSpec().GetByteSize();
+    void* ptr = malloc(data_size);
+    bool status = tensor_handle->CopyDataFromTensor(ptr);
+    if (!status) {
+      VSILOGE("CopyDataFromTensor failed");
+      free(ptr);
+      return ::grpc::Status::CANCELLED;
+    }
+    std::string data_str(reinterpret_cast<char*>(ptr), data_size);
+    response->set_data(std::move(data_str));
+    free(ptr);
+    return ::grpc::Status::OK;
+  }
+
+  ::grpc::Status Clean(::grpc::ServerContext* context,
+                       const ::rpc::EmptyMsg* request,
+                       ::rpc::Status* response) override {
+    VSILOGD("------ Calling gRPC Clean ------");
+    (void)context;
+    (void)request;
+    executor_table.clear();
+    executable_table.clear();
+    tensor_table.clear();
+    response->set_status(true);
+    return ::grpc::Status::OK;
+  }
+};
+
+int main(int argc, char** argv) {
+  if (argc < 2) {
+    std::cout << "error: need a listening address (host:port)." 
<< std::endl; + return -1; + } + std::string port(argv[1]); + GRPCPlatformService service; + ::grpc::ServerBuilder builder; + builder.AddListeningPort(port, grpc::InsecureServerCredentials()); + builder.RegisterService(&service); + std::unique_ptr<::grpc::Server> server(builder.BuildAndStart()); + std::cout << "Server listening on " << port << std::endl; + server->Wait(); + + return 0; +} \ No newline at end of file diff --git a/src/tim/vx/platform/grpc/grpc_remote.cc b/src/tim/vx/platform/grpc/grpc_remote.cc new file mode 100644 index 0000000..5bba14d --- /dev/null +++ b/src/tim/vx/platform/grpc/grpc_remote.cc @@ -0,0 +1,176 @@ +/**************************************************************************** +* +* Copyright (c) 2023 Vivante Corporation +* +* Permission is hereby granted, free of charge, to any person obtaining a +* copy of this software and associated documentation files (the "Software"), +* to deal in the Software without restriction, including without limitation +* the rights to use, copy, modify, merge, publish, distribute, sublicense, +* and/or sell copies of the Software, and to permit persons to whom the +* Software is furnished to do so, subject to the following conditions: +* +* The above copyright notice and this permission notice shall be included in +* all copies or substantial portions of the Software. +* +* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +* AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING +* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER +* DEALINGS IN THE SOFTWARE. 
+*
+*****************************************************************************/
+#include "tim/vx/platform/grpc/grpc_remote.h"
+
+#include "tim/vx/platform/platform.h"
+#include "grpc_platform_client.h"
+
+namespace tim {
+namespace vx {
+namespace platform {
+
+std::vector<std::shared_ptr<IDevice>> GRPCRemoteDevice::Enumerate(
+    const std::string& port) {
+  auto client = std::make_shared<GRPCPlatformClient>(port);
+  int32_t count = client->Enumerate();
+  std::vector<std::shared_ptr<IDevice>> devices;
+  for (int i = 0; i < count; ++i) {
+    devices.push_back(std::make_shared<GRPCRemoteDevice>(i, client));
+  }
+  return devices;
+}
+
+GRPCRemoteDevice::GRPCRemoteDevice(int32_t id,
+                                   std::shared_ptr<GRPCPlatformClient> client)
+    : client_(client) {
+  device_id_ = id;
+}
+
+bool GRPCRemoteDevice::Submit(const std::shared_ptr<Graph>& graph) {
+  (void)graph;
+  return false;
+}
+
+bool GRPCRemoteDevice::Trigger(bool async, async_callback cb) {
+  (void)async;
+  (void)cb;
+  return false;
+}
+
+bool GRPCRemoteDevice::DeviceExit() { return false; }
+
+void GRPCRemoteDevice::WaitDeviceIdle() {}
+
+void GRPCRemoteDevice::RemoteReset() { client_->Clean(); }
+
+GRPCRemoteExecutor::GRPCRemoteExecutor(std::shared_ptr<IDevice> device)
+    : device_(device) {
+  executor_id_ = std::dynamic_pointer_cast<GRPCRemoteDevice>(device)
+                     ->client_->CreateExecutor(device->Id());
+}
+
+bool GRPCRemoteExecutor::Submit(const std::shared_ptr<IExecutable>& executable,
+                                const std::shared_ptr<IExecutable>& ref,
+                                bool after) {
+  (void)executable;
+  (void)ref;
+  (void)after;
+  return false;
+}
+
+bool GRPCRemoteExecutor::Trigger(bool async) {
+  (void)async;
+  return std::dynamic_pointer_cast<GRPCRemoteDevice>(device_)
+      ->client_->Trigger(executor_id_);
+}
+
+std::shared_ptr<IExecutable> GRPCRemoteExecutor::Compile(
+    const std::shared_ptr<Graph>& graph) {
+  size_t inputs_num = graph->InputsTensor().size();
+  size_t outputs_num = graph->OutputsTensor().size();
+  size_t nbg_size = -1;
+
+  graph->CompileToBinary(nullptr, &nbg_size);
+  std::vector<char> nbg_buf(nbg_size);
+  graph->CompileToBinary(nbg_buf.data(), &nbg_size);
+
+  int32_t executable_id =
+      std::dynamic_pointer_cast<GRPCRemoteDevice>(device_)
+          ->client_->CreateExecutable(executor_id_, nbg_buf, inputs_num,
+                                      outputs_num);
+
+  return std::make_shared<GRPCRemoteExecutable>(executable_id, device_);
+}
+
+int32_t GRPCRemoteExecutor::Id() const { return executor_id_; }
+
+GRPCRemoteExecutable::GRPCRemoteExecutable(int32_t id,
+                                           std::shared_ptr<IDevice> device)
+    : executable_id_(id), device_(device) {}
+
+void GRPCRemoteExecutable::SetInput(const std::shared_ptr<ITensorHandle>& th) {
+  int32_t tensor_id =
+      std::dynamic_pointer_cast<GRPCRemoteTensorHandle>(th)->Id();
+  std::dynamic_pointer_cast<GRPCRemoteDevice>(device_)->client_->SetInput(
+      executable_id_, tensor_id);
+}
+
+void GRPCRemoteExecutable::SetOutput(
+    const std::shared_ptr<ITensorHandle>& th) {
+  int32_t tensor_id =
+      std::dynamic_pointer_cast<GRPCRemoteTensorHandle>(th)->Id();
+  std::dynamic_pointer_cast<GRPCRemoteDevice>(device_)->client_->SetOutput(
+      executable_id_, tensor_id);
+}
+
+void GRPCRemoteExecutable::GetOutput(
+    const std::vector<std::shared_ptr<ITensorHandle>>& th) {
+  (void)th;
+}
+
+bool GRPCRemoteExecutable::Submit(const std::shared_ptr<IExecutable>& ref,
+                                  bool after) {
+  (void)after;
+  int32_t executable_id =
+      std::dynamic_pointer_cast<GRPCRemoteExecutable>(ref)->Id();
+  return std::dynamic_pointer_cast<GRPCRemoteDevice>(device_)
+      ->client_->Submit(executable_id);
+}
+
+bool GRPCRemoteExecutable::Trigger(bool async) {
+  (void)async;
+  return false;
+}
+
+bool GRPCRemoteExecutable::Verify() { return false; }
+
+std::shared_ptr<ITensorHandle> GRPCRemoteExecutable::AllocateTensor(
+    const TensorSpec& tensor_spec) {
+  int32_t tensor_id =
+      std::dynamic_pointer_cast<GRPCRemoteDevice>(device_)
+          ->client_->AllocateTensor(executable_id_, tensor_spec);
+
+  return std::make_shared<GRPCRemoteTensorHandle>(tensor_id, device_);
+}
+
+int32_t GRPCRemoteExecutable::Id() const { return executable_id_; }
+
+GRPCRemoteTensorHandle::GRPCRemoteTensorHandle(int32_t id,
+                                               std::shared_ptr<IDevice> device)
+    : tensor_id_(id), device_(device) {}
+
+bool GRPCRemoteTensorHandle::CopyDataToTensor(const void* data,
+                                              uint32_t size_in_bytes) {
+  return std::dynamic_pointer_cast<GRPCRemoteDevice>(device_)
+      ->client_->CopyDataToTensor(tensor_id_, data, size_in_bytes);
+}
+
+bool GRPCRemoteTensorHandle::CopyDataFromTensor(void* data) {
+  return std::dynamic_pointer_cast<GRPCRemoteDevice>(device_)
+      ->client_->CopyDataFromTensor(tensor_id_, data);
+}
+
+int32_t GRPCRemoteTensorHandle::Id() const { return tensor_id_; }
+}  // namespace platform
+}  // namespace vx
+}  // namespace tim
diff --git a/src/tim/vx/platform/lite/lite_native.cc b/src/tim/vx/platform/lite/lite_native.cc
new file mode 100644
index 0000000..137823b
--- /dev/null
+++ b/src/tim/vx/platform/lite/lite_native.cc
@@ -0,0 +1,312 @@
+/****************************************************************************
+*
+* Copyright (c) 2023 Vivante Corporation
+*
+* Permission is hereby granted, free of charge, to any person obtaining a
+* copy of this software and associated documentation files (the "Software"),
+* to deal in the Software without restriction, including without limitation
+* the rights to use, copy, modify, merge, publish, distribute, sublicense,
+* and/or sell copies of the Software, and to permit persons to whom the
+* Software is furnished to do so, subject to the following conditions:
+*
+* The above copyright notice and this permission notice shall be included in
+* all copies or substantial portions of the Software.
+*
+* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+* AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
+* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
+* DEALINGS IN THE SOFTWARE. 
+*
+*****************************************************************************/
+#include "tim/vx/platform/lite/lite_native.h"
+
+#include <cassert>
+#include <cstring>
+
+#include "tim/vx/graph.h"
+#include "graph_private.h"
+#include "vsi_nn_pub.h"
+
+namespace tim {
+namespace vx {
+namespace platform {
+LiteNativeExecutor::LiteNativeExecutor(
+    const std::shared_ptr<IDevice>& device) {
+  device_ = device;
+  context_ = Context::Create();
+  database_ = VIP_NULL;
+
+  vip_init();
+  vip_query_database(&database_);
+  nbg_linker_init(database_);
+}
+
+LiteNativeExecutor::~LiteNativeExecutor() {
+  nbg_destroy_task(task_descriptor_);
+  nbg_linker_destroy();
+  vip_destroy();
+}
+
+bool LiteNativeExecutor::Submit(const std::shared_ptr<IExecutable>& executable,
+                                const std::shared_ptr<IExecutable>& ref,
+                                bool after) {
+  bool success = false;
+  if (executable == ref) {
+    tasks_.push_back(executable);
+    return true;
+  }
+  // Insert the executable immediately before or after `ref` in the task list.
+  for (size_t i = 0; i < tasks_.size(); i++) {
+    if (tasks_[i].lock() == ref) {
+      if (after == true) {
+        tasks_.insert(tasks_.begin() + i + 1, executable);
+        success = true;
+        break;
+      } else {
+        tasks_.insert(tasks_.begin() + i, executable);
+        success = true;
+        break;
+      }
+    }
+  }
+  return success;
+}
+
+bool LiteNativeExecutor::Trigger(bool async) {
+  (void)async;
+  vip_status_e status = VIP_SUCCESS;
+  std::vector<vip_network> networks;
+  for (auto exe : tasks_) {
+    auto task = exe.lock();
+    task->Verify();
+    vip_network& network =
+        std::dynamic_pointer_cast<LiteNativeExecutable>(task)->network_;
+    networks.push_back(std::move(network));
+  }
+  status =
+      nbg_create_task(networks.size(), networks.data(), &task_descriptor_);
+  if (status != VIP_SUCCESS) {
+    VSILOGE("create task descriptor failed");
+    return false;
+  }
+  status = vip_trigger_task(task_descriptor_);
+  if (status != VIP_SUCCESS) {
+    VSILOGE("trigger task descriptor failed");
+    return false;
+  }
+  status = vip_wait_task(task_descriptor_);
+  if (status != VIP_SUCCESS) {
+    VSILOGE("wait task descriptor failed");
+    nbg_gen_capture(networks.size(), networks.data());
+    return false;
+  }
+  return true;
+}
+
+std::shared_ptr<IExecutable> LiteNativeExecutor::Compile(
+    const std::shared_ptr<Graph>& graph) {
+  GraphImpl* graphimp = dynamic_cast<GraphImpl*>(graph.get());
+  IDevice::device_id_t id = device_->Id();
+  vxSetGraphAttribute(graphimp->graph()->g, VX_GRAPH_DEVICE_INDEX_VIV,
+                      (void*)(&id), sizeof(id));
+  size_t bin_size = -1;
+  graph->CompileToBinary(nullptr, &bin_size);
+  std::vector<char> nb_buf;
+  nb_buf.resize(bin_size);
+  graph->CompileToBinary(nb_buf.data(), &bin_size);
+  return std::make_shared<LiteNativeExecutable>(shared_from_this(), nb_buf);
+}
+
+LiteNativeExecutable::LiteNativeExecutable(
+    const std::shared_ptr<IExecutor>& executor,
+    const std::vector<char>& nb_buf) {
+  executor_ = executor;
+  context_ = executor->Contex();
+  nb_graph_ = context_->CreateGraph();
+  nbg_create_network(nb_buf.data(), nb_buf.size(),
+                     VIP_CREATE_NETWORK_FROM_MEMORY, &network_);
+  input_count_ = 0;
+  output_count_ = 0;
+  coeff_ = nullptr;
+  command_ = nullptr;
+  memory_pool_ = nullptr;
+  others_ = nullptr;
+  pre_command_ = nullptr;
+
+  /* prepare vip network */
+  vip_status_e status = VIP_SUCCESS;
+  nbg_network_memory_size_t buffer_size;
+  nbg_network_memory_buffer_t buffer;
+  vip_memory_t coeff_buffer;
+  vip_memory_t cmd_buffer;
+  vip_memory_t pre_cmd_buffer;
+  vip_memory_t pool_buffer;
+  vip_memory_t others_buffer;
+  nbg_query_network(network_, VIP_NETWORK_PROP_MEMORY_SIZE, &buffer_size);
+
+  vip_allocate_videomemory(buffer_size.coeff, &coeff_);
+  vip_allocate_videomemory(buffer_size.command, &command_);
+  vip_allocate_videomemory(buffer_size.memory_pool, &memory_pool_);
+  vip_allocate_videomemory(buffer_size.others, &others_);
+  vip_allocate_videomemory(buffer_size.pre_command, &pre_command_);
+
+  SetBuffer(&coeff_buffer, coeff_);
+  SetBuffer(&cmd_buffer, command_);
+  SetBuffer(&pre_cmd_buffer, pre_command_);
+  SetBuffer(&pool_buffer, memory_pool_);
+  SetBuffer(&others_buffer, others_);
+
+  buffer.coeff = &coeff_buffer;
+  buffer.command = &cmd_buffer;
+  buffer.memory_pool = &pool_buffer;
+  buffer.others = &others_buffer;
+  buffer.pre_command = &pre_cmd_buffer;
+  buffer.dma_command = nullptr;
+  status = nbg_prepare_network(network_, &buffer);
+
+  vip_flush_videomemory(coeff_, VIP_BUFFER_OPER_TYPE_FLUSH);
+  vip_flush_videomemory(command_, VIP_BUFFER_OPER_TYPE_FLUSH);
+  vip_flush_videomemory(pre_command_, VIP_BUFFER_OPER_TYPE_FLUSH);
+  vip_flush_videomemory(memory_pool_, VIP_BUFFER_OPER_TYPE_FLUSH);
+  vip_flush_videomemory(others_, VIP_BUFFER_OPER_TYPE_FLUSH);
+
+  if (status != VIP_SUCCESS) {
+    VSILOGE("failed to prepare network");
+    assert(false);
+  }
+}
+
+LiteNativeExecutable::~LiteNativeExecutable() {
+  nbg_finish_network(network_);
+  nbg_destroy_network(network_);
+  if (coeff_) {
+    vip_free_videomemory(coeff_);
+    coeff_ = nullptr;
+  }
+  if (command_) {
+    vip_free_videomemory(command_);
+    command_ = nullptr;
+  }
+  if (memory_pool_) {
+    vip_free_videomemory(memory_pool_);
+    memory_pool_ = nullptr;
+  }
+  if (others_) {
+    vip_free_videomemory(others_);
+    others_ = nullptr;
+  }
+  if (pre_command_) {
+    vip_free_videomemory(pre_command_);
+    pre_command_ = nullptr;
+  }
+}
+
+void LiteNativeExecutable::SetInput(const std::shared_ptr<ITensorHandle>& th) {
+  vip_status_e status = VIP_SUCCESS;
+  gcvip_videomemory_t* mem =
+      std::dynamic_pointer_cast<LiteNativeTensorHandle>(th)->tensor_buffer_;
+  vip_memory_t buffer;
+  SetBuffer(&buffer, mem);
+
+  status = nbg_set_input(network_, input_count_, &buffer);
+  if (status != VIP_SUCCESS) {
+    VSILOGE("failed to set input: %d", input_count_);
+    assert(false);
+  }
+  ++input_count_;
+}
+
+void LiteNativeExecutable::SetOutput(
+    const std::shared_ptr<ITensorHandle>& th) {
+  vip_status_e status = VIP_SUCCESS;
+  gcvip_videomemory_t* mem =
+      std::dynamic_pointer_cast<LiteNativeTensorHandle>(th)->tensor_buffer_;
+  vip_memory_t buffer;
+  SetBuffer(&buffer, mem);
+
+  status = nbg_set_output(network_, output_count_, &buffer);
+  if (status != VIP_SUCCESS) {
+    VSILOGE("failed to set output: %d", output_count_);
+    assert(false);
+  }
+  ++output_count_;
+}
+
+void LiteNativeExecutable::GetOutput(
+    const std::vector<std::shared_ptr<ITensorHandle>>& th) {
+  (void)th;
+}
+
+bool LiteNativeExecutable::Submit(const std::shared_ptr<IExecutable>& ref,
+                                  bool after) {
+  bool status = false;
+  std::shared_ptr<IExecutable> executable = shared_from_this();
+  status = Executor()->Submit(executable, ref, after);
+  return status;
+}
+
+bool LiteNativeExecutable::Trigger(bool async) {
+  (void)async;
+  return false;
+}
+
+bool LiteNativeExecutable::Verify() {
+  int32_t input_count = 0;
+  nbg_query_network(network_, VIP_NETWORK_PROP_INPUT_COUNT, &input_count);
+  if (input_count != input_count_) {
+    VSILOGE("input count mismatch, required: %d, provided: %d", input_count,
+            input_count_);
+    return false;
+  }
+  int32_t output_count = 0;
+  nbg_query_network(network_, VIP_NETWORK_PROP_OUTPUT_COUNT, &output_count);
+  if (output_count != output_count_) {
+    VSILOGE("output count mismatch, required: %d, provided: %d", output_count,
+            output_count_);
+    return false;
+  }
+
+  return true;
+}
+
+std::shared_ptr<ITensorHandle> LiteNativeExecutable::AllocateTensor(
+    const TensorSpec& tensor_spec) {
+  auto tensor = nb_graph_->CreateTensor(tensor_spec);
+  return std::make_shared<LiteNativeTensorHandle>(tensor);
+}
+
+void LiteNativeExecutable::SetBuffer(vip_memory_t* dst,
+                                     gcvip_videomemory_t* src) {
+  if (dst && src) {
+    dst->cpu_logical = src->cpu_logical;
+    dst->npu_physical = src->npu_physical;
+    dst->size = src->size;
+  }
+}
+
+LiteNativeTensorHandle::LiteNativeTensorHandle(
+    const std::shared_ptr<Tensor>& tensor) {
+  tensor_ = tensor;
+  uint32_t size = tensor->GetSpec().GetByteSize();
+  vip_allocate_videomemory(size, &tensor_buffer_);
+}
+
+LiteNativeTensorHandle::~LiteNativeTensorHandle() {
+  if (tensor_buffer_) {
+    vip_free_videomemory(tensor_buffer_);
+    tensor_buffer_ = nullptr;
+  }
+}
+
+bool LiteNativeTensorHandle::CopyDataToTensor(const void* data,
+                                              uint32_t size_in_bytes) {
+  memcpy(tensor_buffer_->cpu_logical, data, size_in_bytes);
+  return true;
+}
+
+bool LiteNativeTensorHandle::CopyDataFromTensor(void* data) {
+  memcpy(data, tensor_buffer_->cpu_logical, tensor_buffer_->size);
+  return true;
+}
+
+}  // namespace platform
+}  // namespace vx
+}  // namespace tim
diff --git a/src/tim/vx/platform/native.cc b/src/tim/vx/platform/native.cc
index 45e352e..86c2ab2 100644
--- a/src/tim/vx/platform/native.cc
+++ b/src/tim/vx/platform/native.cc
@@ -28,33 +28,37 @@ namespace tim {
 namespace vx {
 namespace platform {
 
-std::shared_ptr<IExecutable> Compile(const std::shared_ptr<Graph>& graph, const std::shared_ptr<IExecutor>& executor) {
+std::shared_ptr<IExecutable> Compile(
+    const std::shared_ptr<Graph>& graph,
+    const std::shared_ptr<IExecutor>& executor) {
   return executor->Compile(graph);
 }
 
-std::shared_ptr<IExecutable> CreateExecutableSet(const std::vector<std::shared_ptr<IExecutable>>& executables) {
+std::shared_ptr<IExecutable> CreateExecutableSet(
+    const std::vector<std::shared_ptr<IExecutable>>& executables) {
   ExecutableSet* executable_set = new ExecutableSet(executables);
   std::shared_ptr<IExecutable> executable(executable_set);
   return executable;
 }
 
-IDevice::device_id_t IDevice::Id() const {
-  return device_id_;
-}
+IDevice::device_id_t IDevice::Id() const { return device_id_; }
+
+void IDevice::RemoteReset() {}
 
 NativeDeviceImpl::NativeDeviceImpl(device_id_t id) {
-  vip_device_ = std::make_unique<vip::IDevice> (id);
+  vip_device_ = std::make_unique<vip::IDevice>(id);
   device_id_ = id;
 }
 
 bool NativeDeviceImpl::Submit(const std::shared_ptr<Graph>& graph) {
-  GraphImpl* graphimp = dynamic_cast<GraphImpl*> (graph.get()); // hack to downcast
+  GraphImpl* graphimp =
+      dynamic_cast<GraphImpl*>(graph.get());  // hack to downcast
   vsi_graph_v_.push_back(graphimp->graph());
   return true;
 }
 
 bool NativeDeviceImpl::Trigger(bool async, async_callback cb) {
-// extract graph from tasks
+  // extract graph from tasks
   (void)async;
   bool status = false;
   while (!vsi_graph_v_.empty()) {
@@ -65,21 +69,18 @@ bool NativeDeviceImpl::Trigger(bool async, async_callback cb) {
   return status;
 }
 
-void NativeDeviceImpl::WaitDeviceIdle() {
-  vip_device_->WaitThreadIdle();
-}
+void NativeDeviceImpl::WaitDeviceIdle() { vip_device_->WaitThreadIdle(); }
 
-bool NativeDeviceImpl::DeviceExit() {
-  return vip_device_->ThreadExit();
-}
+bool NativeDeviceImpl::DeviceExit() { return vip_device_->ThreadExit(); }
 
 std::vector<std::shared_ptr<IDevice>> NativeDevice::Enumerate() {
   std::vector<std::shared_ptr<IDevice>> device_v;
   device_id_t deviceCount = 0;
   vsi_nn_context_t context;
   context = vsi_nn_CreateContext();
-  vxQueryContext(context->c, VX_CONTEXT_DEVICE_COUNT_VIV, &deviceCount, sizeof(deviceCount));
-  std::cout<< "Device count = "<< deviceCount <<std::endl;
+  vxQueryContext(context->c, VX_CONTEXT_DEVICE_COUNT_VIV, &deviceCount,
+                 sizeof(deviceCount));
+  std::cout << "Device count = " << deviceCount << std::endl;
   for (device_id_t i = 0; i < deviceCount; i++) {
     IDevice* local_device = new NativeDeviceImpl(i);
     std::shared_ptr<IDevice> local_device_sp(local_device);
@@ -89,24 +90,25 @@ std::vector<std::shared_ptr<IDevice>> NativeDevice::Enumerate() {
   return device_v;
 }
 
-std::shared_ptr<Graph> IExecutable::NBGraph() const {
-  return nb_graph_;
-}
+std::shared_ptr<Graph> IExecutable::NBGraph() const { return nb_graph_; }
 
 std::shared_ptr<IExecutor> IExecutable::Executor() const {
   auto executor = executor_.lock();
   if (!executor) {
-    std::cout<< "Executor unable to lock weak_ptr";
+    std::cout << "Executor unable to lock weak_ptr";
   }
   return executor;
 }
 
-NativeExecutable::NativeExecutable(const std::shared_ptr<IExecutor>& executor, const std::vector<char>& nb_buf, size_t inputs, size_t outputs) {
+NativeExecutable::NativeExecutable(const std::shared_ptr<IExecutor>& executor,
+                                   const std::vector<char>& nb_buf,
+                                   size_t inputs, size_t outputs) {
   executor_ = executor;
   context_ = executor->Contex();
   nb_graph_ = context_->CreateGraph();
   nb_buf_ = nb_buf;
-  nb_node_ = nb_graph_->CreateOperation<tim::vx::ops::NBG>(nb_buf_.data(), inputs, outputs);
+  nb_node_ = nb_graph_->CreateOperation<tim::vx::ops::NBG>(nb_buf_.data(),
+                                                           inputs, outputs);
 }
 
 void NativeExecutable::SetInput(const std::shared_ptr<ITensorHandle>& th) {
@@ -117,11 +119,13 @@ void NativeExecutable::SetOutput(const std::shared_ptr<ITensorHandle>& th) {
   nb_node_->BindOutput(th->GetTensor());
 }
 
-void NativeExecutable::GetOutput(const std::vector<std::shared_ptr<ITensorHandle>>& th) {
+void NativeExecutable::GetOutput(
+    const std::vector<std::shared_ptr<ITensorHandle>>& th) {
   (void)th;
 }
 
-bool NativeExecutable::Submit(const std::shared_ptr<IExecutable>& ref, bool after) {
+bool NativeExecutable::Submit(const std::shared_ptr<IExecutable>& ref,
+                              bool after) {
   bool status = false;
   std::shared_ptr<IExecutable> executable = shared_from_this();
   status = Executor()->Submit(executable, ref, after);
@@ -138,18 +142,18 @@ bool NativeExecutable::Trigger(bool async) {
   return status;
 }
 
-std::shared_ptr<ITensorHandle> NativeExecutable::AllocateTensor(const TensorSpec& tensor_spec) {
+std::shared_ptr<ITensorHandle> NativeExecutable::AllocateTensor(
+    const TensorSpec& tensor_spec) {
   auto tensor = nb_graph_->CreateTensor(tensor_spec);
   ITensorHandle* tensor_handle = new NativeTensorHandle(tensor);
-  std::shared_ptr<ITensorHandle> tensor_handle_sp (tensor_handle);
+  std::shared_ptr<ITensorHandle> tensor_handle_sp(tensor_handle);
   return tensor_handle_sp;
 }
 
-bool NativeExecutable::Verify() {
-  return nb_graph_->Compile();
-}
+bool NativeExecutable::Verify() { return nb_graph_->Compile(); }
 
-ExecutableSet::ExecutableSet(const std::vector<std::shared_ptr<IExecutable>>& executables) {
+ExecutableSet::ExecutableSet(
+    const std::vector<std::shared_ptr<IExecutable>>& executables) {
   executables_ = executables;
   executor_ = executables[0]->Executor();
 }
@@ -162,11 +166,13 @@ void ExecutableSet::SetOutput(const std::shared_ptr<ITensorHandle>& th) {
   (void)th;
 }
 
-void ExecutableSet::GetOutput(const std::vector<std::shared_ptr<ITensorHandle>>& th) {
+void ExecutableSet::GetOutput(
+    const std::vector<std::shared_ptr<ITensorHandle>>& th) {
   (void)th;
 }
 
-bool ExecutableSet::Submit(const std::shared_ptr<IExecutable>& ref, bool after) {
+bool ExecutableSet::Submit(const std::shared_ptr<IExecutable>& ref,
+                           bool after) {
   bool status = false;
   std::shared_ptr<IExecutable> executable = shared_from_this();
   status = Executor()->Submit(executable, ref, after);
@@ -177,7 +183,7 @@ bool ExecutableSet::Trigger(bool async) {
   (void)async;
   bool status = false;
   auto device = Executor()->Device();
-  for ( auto executable : executables_ ) {
+  for (auto executable : executables_) {
     device->Submit(executable->NBGraph());
   }
   status = device->Trigger();
@@ -185,9 +191,10 @@ bool ExecutableSet::Trigger(bool async) {
   return status;
 }
 
-std::shared_ptr<ITensorHandle> ExecutableSet::AllocateTensor(const TensorSpec& tensor_spec) {
+std::shared_ptr<ITensorHandle> ExecutableSet::AllocateTensor(
+    const TensorSpec& tensor_spec) {
  std::shared_ptr<ITensorHandle> tensor_handle_sp;
-  (void) tensor_spec;
+  (void)tensor_spec;
   return tensor_handle_sp;
 }
 
@@ -197,31 +204,32 @@ std::vector<std::shared_ptr<IExecutable>> ExecutableSet::Executables() const {
 
 bool ExecutableSet::Verify() {
   bool status = false;
-  for ( auto executable : executables_ ) {
+  for (auto executable : executables_) {
     status = executable->Verify();
   }
   return status;
 }
 
-std::shared_ptr<Context> IExecutor::Contex() const {
-  return context_;
-}
+std::shared_ptr<Context> IExecutor::Contex() const { return context_; }
 
 NativeExecutor::NativeExecutor(const std::shared_ptr<IDevice>& device) {
   device_ = device;
   context_ = Context::Create();
 }
 
-NativeExecutor::NativeExecutor(const std::shared_ptr<IDevice>& device, const std::shared_ptr<Context>& context) {
+NativeExecutor::NativeExecutor(const std::shared_ptr<IDevice>& device,
+                               const std::shared_ptr<Context>& context) {
   device_ = device;
   context_ = context;
 }
 
-bool NativeExecutor::Submit(const std::shared_ptr<IExecutable>& executable, const std::shared_ptr<IExecutable>& ref, bool after) {
+bool NativeExecutor::Submit(const std::shared_ptr<IExecutable>& executable,
+                            const std::shared_ptr<IExecutable>& ref,
+                            bool after) {
   bool success = false;
   success = executable->Verify();
   if (success == false) {
-    std::cout<<"Executable NBG compile failed";
+    std::cout << "Executable NBG compile failed";
    return false;
   }
   if (executable == ref) {
@@ -231,12 +239,11 @@ bool NativeExecutor::Submit(const std::shared_ptr<IExecutable>& executable, cons
   for (size_t i = 0; i < tasks_.size(); i++) {
     if (tasks_[i].lock() == ref) {
       if (after == true) {
-        tasks_.insert(tasks_.begin()+i+1, executable);
+        tasks_.insert(tasks_.begin() + i + 1, executable);
         success = true;
         break;
-      }
-      else{
-        tasks_.insert(tasks_.begin()+i, executable);
+      } else {
+        tasks_.insert(tasks_.begin() + i, executable);
         success = true;
         break;
       }
@@ -252,7 +259,7 @@ bool NativeExecutor::Trigger(bool async) {
     tasks_.erase(tasks_.begin());
     auto task_ = task.lock();
     if (!task_) {
-      std::cout<< "Task unable to lock weak_ptr";
+      std::cout << "Task unable to lock weak_ptr";
     }
     task_->Trigger();
   }
@@ -260,10 +267,13 @@ bool NativeExecutor::Trigger(bool async) {
   return true;
 }
 
-std::shared_ptr<IExecutable> NativeExecutor::Compile(const std::shared_ptr<Graph>& graph) {
-  GraphImpl* graphimp= dynamic_cast<GraphImpl*> (graph.get()); // hack to downcast
+std::shared_ptr<IExecutable> NativeExecutor::Compile(
+    const std::shared_ptr<Graph>& graph) {
+  GraphImpl* graphimp =
+      dynamic_cast<GraphImpl*>(graph.get());  // hack to downcast
   IDevice::device_id_t id = device_->Id();
-  vxSetGraphAttribute(graphimp->graph()->g, VX_GRAPH_DEVICE_INDEX_VIV, (void*)(&id), sizeof(id));
+  vxSetGraphAttribute(graphimp->graph()->g, VX_GRAPH_DEVICE_INDEX_VIV,
+                      (void*)(&id), sizeof(id));
   size_t bin_size = -1;
   graph->CompileToBinary(nullptr, &bin_size);
   std::vector<char> nb_buf;
@@ -272,24 +282,22 @@ std::shared_ptr<IExecutable> NativeExecutor::Compile(const std::shared_ptr<Grap
   nb_buf.resize(bin_size);
   size_t inputs = graph->InputsTensor().size();
   size_t outputs = graph->OutputsTensor().size();
   graph->CompileToBinary(nb_buf.data(), &bin_size);
   std::shared_ptr<IExecutor> this_sp = shared_from_this();
-  IExecutable* executable = new NativeExecutable(this_sp, nb_buf, inputs, outputs);
+  IExecutable* executable =
+      new NativeExecutable(this_sp, nb_buf, inputs, outputs);
   std::shared_ptr<IExecutable> executable_sp(executable);
   return executable_sp;
 }
 
-std::shared_ptr<IDevice> IExecutor::Device() const {
-  return device_;
-}
+std::shared_ptr<IDevice> IExecutor::Device() const { return device_; }
 
-std::shared_ptr<Tensor> ITensorHandle::GetTensor() const {
-  return tensor_;
-}
+std::shared_ptr<Tensor> ITensorHandle::GetTensor() const { return tensor_; }
 
 NativeTensorHandle::NativeTensorHandle(const std::shared_ptr<Tensor>& tensor) {
   tensor_ = tensor;
 }
 
-bool NativeTensorHandle::CopyDataToTensor(const void* data, uint32_t size_in_bytes) {
+bool NativeTensorHandle::CopyDataToTensor(const void* data,
+                                          uint32_t size_in_bytes) {
   return tensor_->CopyDataToTensor(data, size_in_bytes);
 }
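
For reference, the client-side flow that the service above implements can be
exercised roughly as follows. This is a minimal sketch, not part of the patch
(the shipped sample is samples/grpc/grpc_multi_device.cc): the address
"localhost:50051", the Add graph, and the tensor shapes are illustrative
assumptions; the classes and calls are the ones added in grpc_remote.cc.

#include <memory>
#include <vector>

#include "tim/vx/context.h"
#include "tim/vx/graph.h"
#include "tim/vx/ops/elementwise.h"
#include "tim/vx/tensor.h"
#include "tim/vx/platform/grpc/grpc_remote.h"

int main() {
  // Each id-returning RPC (Enumerate / CreateExecutor / CreateExecutable /
  // AllocateTensor) is hidden behind the GRPCRemote* wrappers; the client
  // only ever sees integer handles into the server-side tables.
  auto devices =
      tim::vx::platform::GRPCRemoteDevice::Enumerate("localhost:50051");
  auto executor =
      std::make_shared<tim::vx::platform::GRPCRemoteExecutor>(devices[0]);

  // Build a graph locally; Compile() serializes it to an NBG and ships the
  // binary to the server via CreateExecutable.
  auto ctx = tim::vx::Context::Create();
  auto graph = ctx->CreateGraph();
  tim::vx::ShapeType shape({2, 2});
  tim::vx::TensorSpec in_spec(tim::vx::DataType::FLOAT32, shape,
                              tim::vx::TensorAttribute::INPUT);
  tim::vx::TensorSpec out_spec(tim::vx::DataType::FLOAT32, shape,
                               tim::vx::TensorAttribute::OUTPUT);
  auto in0 = graph->CreateTensor(in_spec);
  auto in1 = graph->CreateTensor(in_spec);
  auto out = graph->CreateTensor(out_spec);
  auto add = graph->CreateOperation<tim::vx::ops::Add>();
  (*add).BindInputs({in0, in1}).BindOutputs({out});

  auto executable = executor->Compile(graph);

  // Remote tensors are allocated on the server; data crosses the wire only
  // through the CopyDataToTensor / CopyDataFromTensor RPCs.
  auto in0_handle = executable->AllocateTensor(in_spec);
  auto in1_handle = executable->AllocateTensor(in_spec);
  auto out_handle = executable->AllocateTensor(out_spec);
  executable->SetInput(in0_handle);
  executable->SetInput(in1_handle);
  executable->SetOutput(out_handle);

  std::vector<float> a = {1, 2, 3, 4}, b = {5, 6, 7, 8}, c(4);
  in0_handle->CopyDataToTensor(a.data(), a.size() * sizeof(float));
  in1_handle->CopyDataToTensor(b.data(), b.size() * sizeof(float));

  executable->Submit(executable, true);  // enqueue relative to itself
  executor->Trigger();                   // run queued tasks on the server
  out_handle->CopyDataFromTensor(c.data());
  return 0;
}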