Update internal to 1.1.37_preview (#254)

* update internal to V1.1.37

Signed-off-by: xiang.zhang <xiang.zhang@verisilicon.com>

* Update VSimulator V6.4.9 for linux x86_64

Signed-off-by: xiang.zhang <xiang.zhang@verisilicon.com>
This commit is contained in:
Sven 2022-01-10 01:56:00 +08:00 committed by GitHub
parent 7c63ba621e
commit ed47c5c24c
No known key found for this signature in database
GPG Key ID: 4AEE18F83AFDEB23
290 changed files with 14691 additions and 2901 deletions

View File

@ -1 +1 @@
REL/6.4.8
REL/6.4.9

View File

@ -349,75 +349,74 @@ enum eVXC_ERROR
#define VXC_OP1(Op, Dest, Src0) _viv_asm(INTRINSIC, Dest, VXC_OP_##Op, Src0)
#define VXC_OP2(Op, Dest, Src0, Src1) \
do { \
{ \
int _t1; \
_viv_asm(PARAM_CHAIN, _t1, Src0, Src1); \
_viv_asm(INTRINSIC, Dest, VXC_OP_##Op, _t1); \
} while(0)
_viv_asm(PARAM_CHAIN, _t1, Src0, Src1); \
_viv_asm(INTRINSIC, Dest, VXC_OP_##Op, _t1); \
}
#define VXC_OP3(Op, Dest, Src0, Src1, Src2) \
do { \
{ \
int _t1, _t2; \
_viv_asm(PARAM_CHAIN, _t1, Src0, Src1); \
_viv_asm(PARAM_CHAIN, _t2, _t1, Src2); \
_viv_asm(INTRINSIC, Dest, VXC_OP_##Op, _t2); \
} while(0)
}
#define VXC_OP3_NoDest(Op, Src0, Src1, Src2) \
do { \
{ \
int _t1, _t2, _t3; \
_viv_asm(PARAM_CHAIN, _t1, Src0, Src1); \
_viv_asm(PARAM_CHAIN, _t2, _t1, Src2); \
_viv_asm(INTRINSIC_ST, _t3, VXC_OP_##Op, _t2); \
} while(0)
}
#define VXC_OP4(Op, Dest, Src0, Src1, Src2, Src3) \
do { \
{ \
int _t1, _t2, _t3; \
_viv_asm(PARAM_CHAIN, _t1, Src0, Src1); \
_viv_asm(PARAM_CHAIN, _t2, _t1, Src2); \
_viv_asm(PARAM_CHAIN, _t3, _t2, Src3); \
_viv_asm(INTRINSIC, Dest, VXC_OP_##Op, _t3); \
} while(0)
}
#define VXC_OP4_NoDest(Op, Src0, Src1, Src2, Src3) \
do { \
{ \
int _t1, _t2, _t3, _t4; \
_viv_asm(PARAM_CHAIN, _t1, Src0, Src1); \
_viv_asm(PARAM_CHAIN, _t2, _t1, Src2); \
_viv_asm(PARAM_CHAIN, _t3, _t2, Src3); \
_viv_asm(INTRINSIC_ST, _t4, VXC_OP_##Op, _t3); \
} while(0)
}
#define VXC_OP4_ST(Op, Dest, Src0, Src1, Src2, Src3) \
do { \
{ \
int _t1, _t2, _t3; \
_viv_asm(PARAM_CHAIN, _t1, Src0, Src1); \
_viv_asm(PARAM_CHAIN, _t2, _t1, Src2); \
_viv_asm(PARAM_CHAIN, _t3, _t2, Src3); \
_viv_asm(INTRINSIC_ST, Dest, VXC_OP_##Op, _t3);\
} while(0)
}
#define VXC_OP5(Op, Dest, Src0, Src1, Src2, Src3, Src4) \
do { \
{ \
int _t1, _t2, _t3, _t4; \
_viv_asm(PARAM_CHAIN, _t1, Src0, Src1); \
_viv_asm(PARAM_CHAIN, _t2, _t1, Src2); \
_viv_asm(PARAM_CHAIN, _t3, _t2, Src3); \
_viv_asm(PARAM_CHAIN, _t4, _t3, Src4); \
_viv_asm(INTRINSIC, Dest, VXC_OP_##Op, _t4); \
} while(0)
}
#define VXC_OP5_NoDest(Op, Src0, Src1, Src2, Src3, Src4) \
do { \
{ \
int _t1, _t2, _t3, _t4, _t5; \
_viv_asm(PARAM_CHAIN, _t1, Src0, Src1); \
_viv_asm(PARAM_CHAIN, _t2, _t1, Src2); \
_viv_asm(PARAM_CHAIN, _t3, _t2, Src3); \
_viv_asm(PARAM_CHAIN, _t4, _t3, Src4); \
_viv_asm(INTRINSIC_ST, _t5, VXC_OP_##Op, _t4); \
} while(0)
}
/* make sure the immediate value offsetX and offsetY are in range of [-16, 15] */
#define VXC_5BITOFFSET_XY(offsetX, offsetY) ((((offsetY) & 0x1F) << 5) | ((offsetX) & 0x1F))
@ -515,41 +514,34 @@ enum eVXC_ERROR
* Offset should be composed by using VXC_5BITOFFSET_XY(x, y)
* Coord must be type of int4 or float4
*/
#define VXC_ReadImage2DArray(Dest, Image, Coord, Offset, Info) \
do { \
int8 desc; \
_viv_asm(COPY, desc, Image, sizeof(desc)); \
_viv_asm(CLAMP0MAX, (Coord).w, (Coord).z, desc.s5 - 1); \
int baseAddr = (int)(Coord).w *desc.s4 + desc.s0; \
_viv_asm(MOV, (Coord).w, baseAddr); \
VXC_OP4(img_load_3d, Dest, Image, (Coord).xyww, Offset, Info); \
} while (0)
#define VXC_WriteImage2DArray(Image, Coord, Color, Info) \
do { \
int8 desc; \
_viv_asm(COPY, desc, Image, sizeof(desc)); \
_viv_asm(CLAMP0MAX, (Coord).w, (Coord).z, desc.s5 - 1); \
int baseAddr = (int)(Coord).w *(desc).s4 + desc.s0; \
_viv_asm(MOV, (Coord).w, baseAddr); \
VXC_OP4_NoDest(img_store_3d, Image, (Coord).xyww, Color, Info); \
} while (0)
#define VXC_ReadImage2DArray(Dest, Image, OrigCoord, Offset, Info) \
{ \
int8 desc; \
int4 tempCoord = (int4)(OrigCoord.xyzz); \
_viv_asm(COPY, desc, Image, sizeof(desc)); \
_viv_asm(CLAMP0MAX, tempCoord.z, tempCoord.z, desc.s5 - 1); \
tempCoord.z = tempCoord.z *desc.s4 + desc.s0; \
VXC_OP4(img_load_3d, Dest, Image, tempCoord, Offset, Info); \
}
#define VXC_WriteImage2DArray(Image, OrigCoord, Color, Info) \
{ \
int8 desc; \
int4 tempCoord = (int4)(OrigCoord.xyzz); \
_viv_asm(COPY, desc, Image, sizeof(desc)); \
_viv_asm(CLAMP0MAX, tempCoord.z, tempCoord.z, desc.s5 - 1); \
tempCoord.z = tempCoord.z *desc.s4 + desc.s0; \
VXC_OP4_NoDest(img_store_3d, Image, tempCoord, Color, Info); \
}
/* image load/store for image3d_t,
* offset should be composed by using VXC_5BITOFFSET_XY(x, y)
* Coord must be type of int4 or float4
*/
#define VXC_ReadImage3D(Dest, Image, Coord, Offset, Info) VXC_OP4(img_read_3d, Dest, Image, Coord, Offset, Info)
#define VXC_WriteImage3D(Image, Coord, Color, Info) VXC_OP4_NoDest(img_write_3d, Image, Coord, Color, Info)
#define VXC_Vload2(Dest, Pointer, Offset) { int byteOffset = ((int)sizeof((Dest)))*(Offset); VXC_OP2(vload2, Dest, Pointer, byteOffset); }
#define VXC_Vload4(Dest, Pointer, Offset) { int byteOffset = ((int)sizeof((Dest)))*(Offset); VXC_OP2(vload4, Dest, Pointer, byteOffset); }
#define VXC_Vload8(Dest, Pointer, Offset) { int byteOffset = ((int)sizeof((Dest)))*(Offset); VXC_OP2(vload8, Dest, Pointer, byteOffset); }
#define VXC_Vload16(Dest, Pointer, Offset) { int byteOffset = ((int)sizeof((Dest)))*(Offset); VXC_OP2(vload16, Dest, Pointer, byteOffset); }
#define VXC_Vload2(Dest, Pointer, Offset) do { int byteOffset = ((int)sizeof((Dest)))*(Offset); VXC_OP2(vload2, Dest, Pointer, byteOffset); } while(0)
#define VXC_Vload4(Dest, Pointer, Offset) do { int byteOffset = ((int)sizeof((Dest)))*(Offset); VXC_OP2(vload4, Dest, Pointer, byteOffset); } while(0)
#define VXC_Vload8(Dest, Pointer, Offset) do { int byteOffset = ((int)sizeof((Dest)))*(Offset); VXC_OP2(vload8, Dest, Pointer, byteOffset); } while(0)
#define VXC_Vload16(Dest, Pointer, Offset) do { int byteOffset = ((int)sizeof((Dest)))*(Offset); VXC_OP2(vload16, Dest, Pointer, byteOffset); } while(0)
#define VXC_Vstore2(Pointer, Offset, Data) do { int byteOffset = ((int)sizeof((Data)))*(Offset); VXC_OP3_NoDest(vstore2, Pointer, byteOffset, Data); } while(0)
#define VXC_Vstore4(Pointer, Offset, Data) do { int byteOffset = ((int)sizeof((Data)))*(Offset); VXC_OP3_NoDest(vstore4, Pointer, byteOffset, Data); } while(0)
#define VXC_Vstore8(Pointer, Offset, Data) do { int byteOffset = ((int)sizeof((Data)))*(Offset); VXC_OP3_NoDest(vstore8, Pointer, byteOffset, Data); } while(0)
#define VXC_Vstore16(Pointer, Offset, Data) do { int byteOffset = ((int)sizeof((Data)))*(Offset); VXC_OP3_NoDest(vstore16, Pointer, byteOffset, Data); } while(0)
#define VXC_Vstore2(Pointer, Offset, Data) { int byteOffset = ((int)sizeof((Data)))*(Offset); VXC_OP3_NoDest(vstore2, Pointer, byteOffset, Data); }
#define VXC_Vstore4(Pointer, Offset, Data) { int byteOffset = ((int)sizeof((Data)))*(Offset); VXC_OP3_NoDest(vstore4, Pointer, byteOffset, Data); }
#define VXC_Vstore8(Pointer, Offset, Data) { int byteOffset = ((int)sizeof((Data)))*(Offset); VXC_OP3_NoDest(vstore8, Pointer, byteOffset, Data); }
#define VXC_Vstore16(Pointer, Offset, Data) { int byteOffset = ((int)sizeof((Data)))*(Offset); VXC_OP3_NoDest(vstore16, Pointer, byteOffset, Data); }
/* VX2 only instructions*/
#define VXC_IndexAdd(Dest, Src0, Src1, Src2, Info) VXC_OP4(index_add, Dest, Src0, Src1, Src2, Info)
@ -562,7 +554,7 @@ enum eVXC_ERROR
#if (VX_VERSION == 2)
#define VXC_BiLinear(Dest, Src0, Src1, Src2, Info) \
do { \
{ \
int endBin = ((Info) & VXC_END_BIN_BITMASK) >> 8; \
int roundMode = ((Info) & VXC_ROUNDING_MODE_BITMASK) >> 2; \
int clamp = ((Info) & VXC_CLAMP_BITMASK) >> 22; \
@ -576,7 +568,7 @@ enum eVXC_ERROR
_viv_asm(PARAM_CHAIN, bi4, bi3, 8); \
_viv_asm(INTRINSIC, bi2, OP_bit_extract, bi4); \
VXC_Lerp(Dest, bi2!<f:UCHAR>, bi2.y!<f:UCHAR>, (Src2).x, Info); \
} while (0)
}
#define VXC_BitReplace(Dest, Src0, Src1, Src2, Info) /* BitReplace definition here */
#define VXC_IAdd(Dest, Src0, Src1, Src2, Info) /* IAdd definition here */
@ -592,7 +584,8 @@ enum eVXC_ERROR
#define VXC_Filter_Max(Dest, Src0, Src1, Src2, Info) /* Max filter definition here */
#define VXC_Filter_Min(Dest, Src0, Src1, Src2, Info) /* Min filter definition here */
#define VXC_Filter_Median(Dest, Src0, Src1, Src2, Info) /* Median filter definition here */
#define VXC_Filter(Dest, Src0, Src1, Src2, Info) do { \
#define VXC_Filter(Dest, Src0, Src1, Src2, Info) \
{ \
int filter = (((Info) >> 16)&0x0F); \
if (filter == VXC_FM_BOX) { VXC_Filter_Box(Dest, Src0, Src1, Src2, Info); } \
if (filter == VXC_FM_Guassian) { VXC_Filter_Guassian(Dest, Src0, Src1, Src2, Info); } \
@ -603,7 +596,7 @@ enum eVXC_ERROR
if (filter == VXC_FM_Max) { VXC_Filter_Max(Dest, Src0, Src1, Src2, Info); } \
if (filter == VXC_FM_Min) { VXC_Filter_Min(Dest, Src0, Src1, Src2, Info); } \
if (filter == VXC_FM_Median) { VXC_Filter_Median(Dest, Src0, Src1, Src2, Info); } \
} while (0)
}
#else /* VX1 */

View File

@ -98,7 +98,9 @@ vxCreateTensor_11(
vx_enum data_format,
vx_int8 fixed_point_pos
);
#if !VX_VA40_EXT_SUPPORT
#define vxCreateTensor vxCreateTensor_11
#endif
/* keep the backward compatibility with spec 1.1 for vxCreateVirtualTensor */
VX_API_ENTRY vx_tensor VX_API_CALL
@ -108,8 +110,11 @@ vxCreateVirtualTensor_11(
vx_uint32 *sizes,
vx_enum data_format,
vx_int8 fixed_point_pos
);
);
#if !VX_VA40_EXT_SUPPORT
#define vxCreateVirtualTensor vxCreateVirtualTensor_11
#endif
/* keep the backward compatibility with spec 1.1 for vxCreateTensorFromView */
VX_API_ENTRY vx_tensor VX_API_CALL

View File

@ -496,6 +496,8 @@ enum vx_kernel_e {
VX_KERNEL_NN_BATCH_GEMM = VX_KERNEL_BASE(VX_ID_VIVANTE, VX_LIBRARY_KHR_BASE) + 0x2F,
VX_KERNEL_NN_CONV_3D_LAYER = VX_KERNEL_BASE(VX_ID_VIVANTE, VX_LIBRARY_KHR_BASE) + 0x30,
VX_KERNEL_MAX_1_2, /*!< \internal Used for VX1.2 bounds checking in the conformance test. */
};

View File

@ -33,44 +33,58 @@
0: weight_layout is whnc
1: weight_layout is whcn
*/
#ifndef VX_DECONVOLUTION_WEIGHT_LAYOUT_COMPATIBLE_KHRONOS
#define VX_DECONVOLUTION_WEIGHT_LAYOUT_COMPATIBLE_KHRONOS 1
#endif
/*
VX_CONVERT_POLICY_WRAP_ENABLE is used to differentiate two overflow_policys(VX_CONVERT_POLICY_WRAP and VX_CONVERT_POLICY_SAT)
[value]
0: both overflow_policys considered as VX_CONVERT_POLICY_SAT
1: overflow_policy is determined by arguments.
*/
#ifndef VX_CONVERT_POLICY_WRAP_ENABLE
#define VX_CONVERT_POLICY_WRAP_ENABLE 1
#endif
#ifndef VX_13_NN_COMPATIBLITY
#define VX_13_NN_COMPATIBLITY 1
#endif
/*
VX_L2NORM_AXIS_PARAMETER_SUPPORT is used to declare that L2NORMALIZE can support axis parameter
[value]
0: not support
1: support
*/
#ifndef VX_L2NORM_AXIS_PARAMETER_SUPPORT
#define VX_L2NORM_AXIS_PARAMETER_SUPPORT 1
#endif
/*
VX_SOFTMAX_AXIS_PARAMETER_SUPPORT is used to declare that SOFTMAX can support axis parameter
[value]
0: not support
1: support
*/
#ifndef VX_SOFTMAX_AXIS_PARAMETER_SUPPORT
#define VX_SOFTMAX_AXIS_PARAMETER_SUPPORT 1
#endif
/*
VX_NORMALIZATION_AXIS_PARAMETER_SUPPORT is used to declare that NORMALIZATION can support axis parameter
[value]
0: not support
1: support
*/
#ifndef VX_NORMALIZATION_AXIS_PARAMETER_SUPPORT
#define VX_NORMALIZATION_AXIS_PARAMETER_SUPPORT 1
#endif
/*
VX_ACTIVATION_EXT_SUPPORT is used to declare that ACTIVATION can support swish and hswish
[value]
0: not support
1: support
*/
#ifndef VX_ACTIVATION_EXT_SUPPORT
#define VX_ACTIVATION_EXT_SUPPORT 1
#endif
/*
VX_HARDWARE_CAPS_PARAMS_EXT_SUPPORT is used to query more hardware parameter such as shader sub-group size.
@ -78,7 +92,19 @@
0: not support
1: support
*/
#ifndef VX_HARDWARE_CAPS_PARAMS_EXT_SUPPORT
#define VX_HARDWARE_CAPS_PARAMS_EXT_SUPPORT 1
#endif
/*
VX_VA40_EXT_SUPPORT is used to declare that openvx can support VA40.
[value]
0: not support
1: support
*/
#ifndef VX_VA40_EXT_SUPPORT
#define VX_VA40_EXT_SUPPORT 0
#endif
/*
VX_USER_LOOKUP_TABLE_SUPPORT is used to declare that openvx can support user lookuptable.
@ -86,7 +112,9 @@
0: not support
1: support
*/
#ifndef VX_USER_LOOKUP_TABLE_SUPPORT
#define VX_USER_LOOKUP_TABLE_SUPPORT 1
#endif
/*
VX_PRELOAD_CONST_TENSOR_SUPPORT is used to declare that openvx can support preload weight/bias and const tensor
@ -94,7 +122,9 @@ VX_PRELOAD_CONST_TENSOR_SUPPORT is used to declare that openvx can support prelo
0: not support
1: support(NN conv and TP FC weightbias, and SH const tensor)
*/
#ifndef VX_PRELOAD_CONST_TENSOR_SUPPORT
#define VX_PRELOAD_CONST_TENSOR_SUPPORT 1
#endif
/*
VX_CREATE_TENSOR_SUPPORT_PHYSICAL is used to declare that openvx can support physical address for vxCreateTensorFromHandle
@ -102,7 +132,9 @@ VX_CREATE_TENSOR_SUPPORT_PHYSICAL is used to declare that openvx can support phy
0: not support
1: support
*/
#ifndef VX_CREATE_TENSOR_SUPPORT_PHYSICAL
#define VX_CREATE_TENSOR_SUPPORT_PHYSICAL 1
#endif
/*
VX_GRAPH_PREEMPTION_SUPPORT is used to declare that openvx can support different graph preemption function.
@ -110,7 +142,9 @@ VX_CREATE_TENSOR_SUPPORT_PHYSICAL is used to declare that openvx can support phy
0: not support
1: support
*/
#ifndef VX_GRAPH_PREEMPTION_SUPPORT
#define VX_GRAPH_PREEMPTION_SUPPORT 1
#endif
/*
VX_BATCH_GEMM_API_SUPPORT is used to declare that vsi openvx driver can support vxBatchGemmNode API to transform gemm to convolution
@ -118,6 +152,18 @@ VX_BATCH_GEMM_API_SUPPORT is used to declare that vsi openvx driver can support
0: not support
1: support
*/
#ifndef VX_BATCH_GEMM_API_SUPPORT
#define VX_BATCH_GEMM_API_SUPPORT 1
#endif
/*
VX_CONV_3D_API_SUPPORT is used to declare that vsi openvx driver can support conv3d by vxConv3dLayer API.
[value]
0: not support
1: support
*/
#ifndef VX_CONV_3D_API_SUPPORT
#define VX_CONV_3D_API_SUPPORT 1
#endif
#endif /* __VX_KHR_COMPATIBLE_H__ */

View File

@ -29,6 +29,7 @@
#define OPENVX_KHR_NN "vx_khr_nn"
#include <VX/vx.h>
#include <VX/vx_khr_compatible.h>
#include <VX/vx_khr_nn_internal.h>
@ -310,10 +311,47 @@ enum vx_tensor_lifetime_type_e
VX_TENSOR_LIFE_TIME_DYNAMIC,
};
/*! \brief Convolution parameters consumed by <tt>\ref vxConv3dLayer</tt> (padding, strides,
 *  dilation, pad mode/const, conversion/rounding policies and depthwise multiplier).
 *  \ingroup group_cnn
 */
typedef struct _vx_nn_convolution_3d_params_t
{
vx_int32 padding_w_left; /*!< \brief Number of elements added at each side in the left of w dimension of the input. */
vx_int32 padding_w_right; /*!< \brief Number of elements added at each side in the right of w dimension of the input. */
vx_int32 padding_h_top; /*!< \brief Number of elements added at each side in the top of h dimension of the input. */
vx_int32 padding_h_bottom; /*!< \brief Number of elements added at each side in the bottom of h dimension of the input. */
vx_int32 padding_d_front; /*!< \brief Number of elements added at each side in the front of d dimension of the input. */
vx_int32 padding_d_rear; /*!< \brief Number of elements added at each side in the rear of d dimension of the input. */
vx_int32 stride_w; /*!< \brief skip w jump for down scale. */
vx_int32 stride_h; /*!< \brief skip h jump for down scale. */
vx_int32 stride_d; /*!< \brief skip d jump for down scale. */
vx_int32 dilation_w; /*!< \brief "inflate" the kernel by inserting zeros between the kernel elements in the w direction. The value is the number of zeros to insert.*/
vx_int32 dilation_h; /*!< \brief "inflate" the kernel by inserting zeros between the kernel elements in the h direction. The value is the number of zeros to insert.*/
vx_int32 dilation_d; /*!< \brief "inflate" the kernel by inserting zeros between the kernel elements in the d direction. The value is the number of zeros to insert.*/
vx_enum pad_mode; /*!< \brief A VX_TYPE_ENUM of the <tt> \ref vx_pad_mode_e </tt> enumeration. */
vx_scalar pad_const; /*!< \brief pad const value if setting pad mode to const, the const value is base value, not quantized value. */
vx_enum overflow_policy; /*!< \brief A <tt> VX_TYPE_ENUM</tt> of the <tt> vx_convert_policy_e</tt> enumeration. */
vx_enum rounding_policy; /*!< \brief A <tt> VX_TYPE_ENUM</tt> of the <tt> vx_round_policy_e</tt> enumeration. */
vx_enum down_scale_size_rounding; /*!< \brief Rounding method for calculating output dimensions. See <tt>\ref vx_nn_rounding_type_e</tt> */
vx_int32 depth_multiplier; /*!< \brief depthwise multiplier value, if 0, means convolution, elsewise(>=1), the convolution is depthwiseconvolution. */
}vx_nn_convolution_3d_params_t;
/*==============================================================================
TENSOR DATA FUNCTIONS
=============================================================================*/
#if VX_VA40_EXT_SUPPORT
/*! \brief Create an opaque reference to a tensor view object.
* \details Not guaranteed to exist until the <tt>vx_graph</tt> containing it has been verified.
* \param [in] context The reference to the implementation context.
* \param [in] view_array_start a vx_size array of start values of the view.
* \param [in] view_array_end a vx_size array of end values of the view.
* \param [in] numViewDimensions number of dimensions of view_array_start and view_array_end.
* \return A tensor data view reference or zero when an error is encountered.
* \ingroup group_tensor
*/
VX_API_ENTRY vx_tensor_view VX_API_CALL vxCreateTensorView(vx_context context, vx_size* view_array_start, vx_size* view_array_end, vx_size numViewDimensions);
#else
/*! \brief Create an opaque reference to a tensor view object.
* \details Not guaranteed to exist until the <tt>vx_graph</tt> containing it has been verified.
* \param [in] context The reference to the implementation context.
@ -324,6 +362,7 @@ enum vx_tensor_lifetime_type_e
* \ingroup group_tensor
*/
VX_API_ENTRY vx_tensor_view VX_API_CALL vxCreateTensorView(vx_context context, vx_uint32 *view_array_start, vx_uint32 * view_array_end, vx_uint8 numViewDimensions);
#endif
/*! \brief Releases a reference to a tensor data view object.
* The object may not be garbage collected until its total reference count is zero.
@ -337,6 +376,18 @@ VX_API_ENTRY vx_tensor_view VX_API_CALL vxCreateTensorView(vx_context context, v
*/
VX_API_ENTRY vx_status VX_API_CALL vxReleaseTensorView(vx_tensor_view *tensor_view);
#if VX_VA40_EXT_SUPPORT
/*! \brief Create an opaque reference to a tensor addressing object.
* \details Not guaranteed to exist until the <tt>vx_graph</tt> containing it has been verified.
* \param [in] context The reference to the implementation context.
* \param [in] addressing_array_dimension a vx_size array of lengths of the patch in all dimensions, in elements.
* \param [in] addressing_array_stride a vx_size array of strides in all dimensions, in bytes.
* \param [in] numViewDimensions number of dimensions of addressing_array_dimension and addressing_array_stride.
* \return A tensor data view reference or zero when an error is encountered.
* \ingroup group_tensor
*/
VX_API_ENTRY vx_tensor_addressing VX_API_CALL vxCreateTensorAddressing(vx_context context, vx_size* addressing_array_dimension, vx_size* addressing_array_stride, vx_size numViewDimensions);
#else
/*! \brief Create an opaque reference to a tensor addressing object.
* \details Not guaranteed to exist until the <tt>vx_graph</tt> containing it has been verified.
* \param [in] context The reference to the implementation context.
@ -346,7 +397,8 @@ VX_API_ENTRY vx_status VX_API_CALL vxReleaseTensorView(vx_tensor_view *tensor_vi
* \return A tensor data view reference or zero when an error is encountered.
* \ingroup group_tensor
*/
VX_API_ENTRY vx_tensor_addressing VX_API_CALL vxCreateTensorAddressing(vx_context context, vx_uint32 *addressing_array_dimension, vx_uint32 * addressing_array_stride, vx_uint8 numViewDimensions);
VX_API_ENTRY vx_tensor_addressing VX_API_CALL vxCreateTensorAddressing(vx_context context, vx_uint32 * addressing_array_dimension, vx_uint32 * addressing_array_stride, vx_uint8 numViewDimensions);
#endif
/*! \brief Releases a reference to a tensor data addressing object.
* The object may not be garbage collected until its total reference count is zero.
@ -402,7 +454,11 @@ typedef union _vx_tensor_quant_param
typedef struct _vx_tensor_create_params_t
{
vx_uint32 num_of_dims; /*!< \brief The number of dimensions specified in *sizes*/
#if VX_VA40_EXT_SUPPORT
vx_size * sizes; /*!< \brief The pointer to an array of dimension */
#else
vx_uint32 * sizes; /*!< \brief The pointer to an array of dimension */
#endif
vx_enum data_format; /*!< \brief Data format for the tensor */
vx_enum quant_format; /*!< \brief Quantized format <tt>\ref vx_quantized_format_e </tt>. */
vx_tensor_quant_param quant_data;
@ -482,7 +538,19 @@ VX_API_ENTRY vx_tensor VX_API_CALL vxCreateTensorFromHandle2(
*/
VX_API_ENTRY vx_status VX_API_CALL vxFlushHandle(vx_reference ref);
#if VX_VA40_EXT_SUPPORT
/*! \brief Return a new tensor referencing the same memory location but with different shape.
* \param [in] tensor The input tensor data to reshape.
* \param [in] num_of_dims Size of each dimension. If one component is special value -1,
* the size of that dimension is computed so that the total size remains the same as input tensor.
* If it is [-1], then flatten is performed which turns tensor into 1-D.
* \param [in] sizes The size of the container to which \a num_of_dims points.
* \return a vx_tensor that has shaped.
* \return VX_NULL if an error occurred.
* \ingroup group_tensor
*/
VX_API_ENTRY vx_tensor VX_API_CALL vxReshapeTensor(vx_tensor tensor, vx_size* num_of_dims, vx_size sizes);
#else
/*! \brief Return a new tensor referencing the same memory location but with different shape.
* \param [in] tensor The input tensor data to reshape.
* \param [in] num_of_dims Size of each dimension. If one component is special value -1,
@ -494,6 +562,7 @@ VX_API_ENTRY vx_status VX_API_CALL vxFlushHandle(vx_reference ref);
* \ingroup group_tensor
*/
VX_API_ENTRY vx_tensor VX_API_CALL vxReshapeTensor(vx_tensor tensor, vx_int32* num_of_dims, vx_uint32 sizes);
#endif
/*! \brief Allows setting attributes on the tensor.
* \param [in] tensor The reference to the tensor on which to set the attribute.
@ -1961,6 +2030,7 @@ typedef struct _vx_hardware_caps_params_ext_t
{
vx_hardware_caps_params_t base;
vx_uint32 subGroupSize; /*!< \brief shader sub-group size.*/
vx_bool supportVA40; /*!< \brief support 40bit virtual address.*/
} vx_hardware_caps_params_ext_t;
/*! \brief Queries hardware caps information.
@ -1979,6 +2049,29 @@ VX_API_ENTRY vx_status VX_API_CALL vxQueryHardwareCaps(
vx_size size_of_hardware_caps_param
);
/*! \brief [Graph] Creates a Convolutional-3d Network Convolution Layer Node.
* \details This function implements the Convolutional-3d Network Convolution layer.
* For fixed-point data types, a fixed point calculation is performed with round and saturate according to the number of accumulator bits. The number of the accumulator bits are implementation defined,
* and should be at least 16.\n
* round: rounding according the <tt>vx_round_policy_e</tt> enumeration. \n
* saturate: A saturation according the <tt>vx_convert_policy_e</tt> enumeration.
* \param [in] graph The handle to the graph.
* \param [in] inputs The input tensor data. 4 lower dimensions represent a single input, all following dimensions represent number of batches, possibly nested.
* The dimension order is [width, height, depth, #IFM, #batches].\n
* \param [in] weights [*static] Weights are 5d tensor with dimensions [kernel_x, kernel_y, kernel_d, #IFM, #OFM].
* see <tt>\ref vxCreateTensor2</tt> and <tt>\ref vxCreateVirtualTensor2</tt> \n Weights data type must match the data type of the inputs. (Kernel parameter #1)
* \param [in] biases [*static] Optional, ignored if NULL. The biases, which may be shared (one per ofm) or unshared (one per ofm * output location). The possible layouts are
* either [#OFM] or [width, height, #OFM]. Biases data type must match the data type of the inputs.
* \param [in] convolution_params [static] Pointer to parameters of type <tt>\ref vx_nn_convolution_3d_params_t</tt>.
* \param [in] size_of_convolution_params [static] Size in bytes of convolution_params. Note that this parameter is not counted as one of the kernel parameters.
* \param [out] outputs The output tensor data. Output will have the same number and structure of dimensions as input. Output tensor data type must be same as the inputs.
* \return <tt> vx_node</tt>.
* \returns A node reference <tt>\ref vx_node</tt>. Any possible errors preventing a
* successful creation should be checked using <tt>\ref vxGetStatus</tt>.
* \ingroup group_cnn
*/
VX_API_ENTRY vx_node VX_API_CALL vxConv3dLayer(vx_graph graph, vx_tensor inputs, vx_tensor weights, vx_tensor biases, const vx_nn_convolution_3d_params_t *convolution_params, vx_size size_of_convolution_params, vx_tensor outputs);
#ifdef __cplusplus
}
#endif

View File

@ -289,6 +289,169 @@ typedef struct _vx_weights_biases_parameter_optimizations_ext2_t {
vx_int8 output_fpp_dw; /*depthwise conv output fix-point*/
} vx_weights_biases_parameter_optimizations_ext2_t;
#if VX_VA40_EXT_SUPPORT
/*!
* \brief Creates a reference to a vx_weights_biases_parameter opaque object.
*
* \param [in] layer_type The network type of objects to hold. Types allowed are:
* \arg VX_CONVOLUTIONAL_NETWORK_CONVOLUTION_LAYER for convolution layer.
* \arg VX_CONVOLUTIONAL_NETWORK_FULLYCONNECTED_LAYER for fullyconnected layer.
* \param [in] num_of_dims The dimension number of input & output image tensor.
* \param [in] inputs_dims The input tensor's dimension size.
* \param [in] pad_x The number of elements subtracted at each side in the x dimension of the input.
* \param [in] pad_y The number of elements subtracted at each side in the y dimension of the input.
* \param [in] pooling_size_x The size of the pooling region in the x dimension, 0 means no pooling operation.
* \param [in] pooling_size_y The size of the pooling region in the y dimension, 0 means no pooling operation.
* \param [in] down_scale_size_rounding A <tt> VX_TYPE_ENUM</tt> of the <tt> vx_round_policy_e</tt> enumeration.
* \param [in] convolution_outputs_dims The output's dimension size after convolution operation.
* \param [in] pool_outputs_dims The output's dimension size after pooling operation.
* \param [in] optimizations An optional param for <tt>\ref vx_weights_biases_parameter_optimizations_t</tt>.
* \param [in] weights The weights tensor which need be compressed.
* \param [in] biases The biases tensor which need be compressed.
*
* \returns An opaque vx_weights_biases_parameter reference with compressed kernel data. Any possible errors preventing a
* successful creation should be checked using <tt>\ref vxGetStatus</tt>.
*
* \ingroup group_cnn
*/
VX_API_ENTRY vx_weights_biases_parameter VX_API_CALL
vxCreateWeightsBiasesParameterFromTensors(
vx_enum layer_type,
vx_size num_of_dims,
vx_size * inputs_dims,
vx_uint32 pad_x,
vx_uint32 pad_y,
vx_uint32 pooling_size_x,
vx_uint32 pooling_size_y,
vx_enum down_scale_size_rounding,
vx_size * convolution_outputs_dims,
vx_size * pool_outputs_dims,
vx_weights_biases_parameter_optimizations_t *optimizations,
vx_tensor weights,
vx_tensor biases);
/*!
* \brief Creates a reference to an opaque vx_weights_biases_parameter object.
*
* \param [in] layer_type The network type of objects to hold. Types allowed are:
* \arg VX_CONVOLUTIONAL_NETWORK_CONVOLUTION_LAYER for convolution layer.
* \arg VX_CONVOLUTIONAL_NETWORK_FULLYCONNECTED_LAYER for fullyconnected layer.
* \param [in] num_of_dims The dimension number of input & output image tensor.
* \param [in] inputs_dims The input tensor's dimension size.
* \param [in] convolution_outputs_dims The output's dimension size after convolution operation.
* \param [in] pool_outputs_dims The output's dimension size after pooling operation.
* \param [in] output_format The output tensor element type.
* \param [in] convolution_relu_pooling_params Pointer to parameters of type <tt>\ref vx_nn_convolution_relu_pooling_params_t</tt>
* \param [in] size_of_convolution_relu_pooling_params The size in bytes of convolution_relu_pooling_params.
* \param [in] optimizations An optional param for <tt>\ref vx_weights_biases_parameter_optimizations_t</tt>.
* \param [in] weights The weights tensor which need be compressed.
* \param [in] biases The biases tensor which need be compressed.
*
* \returns An opaque vx_weights_biases_parameter reference with compressed kernel data. Any possible errors preventing a
* successful creation should be checked using <tt>\ref vxGetStatus</tt>.
*
* \ingroup group_cnn
*/
VX_API_ENTRY vx_weights_biases_parameter VX_API_CALL vxCreateWeightsBiasesParameterFromTensors2(
vx_enum layer_type,
vx_size num_of_dims,
vx_size * inputs_dims,
vx_size * convolution_outputs_dims,
vx_size * pool_outputs_dims,
vx_enum output_format,
const vx_nn_convolution_relu_pooling_params convolution_relu_pooling_params,
vx_size size_of_convolution_relu_pooling_params,
vx_weights_biases_parameter_optimizations_t *optimizations,
vx_tensor weights,
vx_tensor biases);
/*!
* \brief Creates a reference to an opaque vx_weights_biases_parameter object.
*
* \param [in] layer_type The network type of objects to hold. Types allowed are:
* \arg VX_CONVOLUTIONAL_NETWORK_CONVOLUTION_LAYER for convolution layer.
* \arg VX_CONVOLUTIONAL_NETWORK_FULLYCONNECTED_LAYER for fullyconnected layer.
* \param [in] inputs_dims The input tensor's dimension size.
* \param [in] convolution_outputs_dims The output's dimension size after convolution operation.
* \param [in] pool_outputs_dims The output's dimension size after pooling operation.
* \param [in] convolution_relu_pooling_params Pointer to parameters of type <tt>\ref vx_nn_convolution_relu_pooling_params_t</tt>
* \param [in] size_of_convolution_relu_pooling_params The size in bytes of convolution_relu_pooling_params.
* \param [in] optimizations An optional param for <tt>\ref vx_weights_biases_parameter_optimizations_t</tt>.
* \param [in] size_of_optimizations The size in bytes of optimizations.
* \param [in] weights The weights tensor which need be compressed.
* \param [in] biases The biases tensor which need be compressed.
*
* \returns An opaque vx_weights_biases_parameter reference with compressed kernel data. Any possible errors preventing a
* successful creation should be checked using <tt>\ref vxGetStatus</tt>.
*
* \ingroup group_cnn
*/
VX_API_ENTRY vx_weights_biases_parameter VX_API_CALL vxCreateWeightsBiasesParameterFromTensors3(
vx_enum layer_type,
vx_size * inputs_dims,
vx_size * convolution_outputs_dims,
vx_size * pool_outputs_dims,
const vx_nn_convolution_relu_pooling_params convolution_relu_pooling_params,
vx_size size_of_convolution_relu_pooling_params,
vx_weights_biases_parameter_optimizations_t *optimizations,
vx_size size_of_optimizations,
vx_tensor weights,
vx_tensor biases);
/*!
* \brief Creates a reference to an vx_weights_biases_parameter object.
* \param [in] context The OpenVX context object.
* \param [in] layer_type The network type of objects to hold. Types allowed are:
* \arg VX_CONVOLUTIONAL_NETWORK_CONVOLUTION_LAYER for convolution layer.
* \arg VX_CONVOLUTIONAL_NETWORK_FULLYCONNECTED_LAYER for fullyconnected layer.
 * \param [in] num_of_dims The dimension number of input & output image tensor.
 * \param [in] inputs_dims The input tensor's dimension size.
 * \param [in] pad_x The number of elements subtracted at each side in the x dimension of the input.
 * \param [in] pad_y The number of elements subtracted at each side in the y dimension of the input.
 * \param [in] pooling_size_x The size of the pooling region in the x dimension, 0 means no pooling operation.
 * \param [in] pooling_size_y The size of the pooling region in the y dimension, 0 means no pooling operation.
 * \param [in] down_scale_size_rounding A <tt> VX_TYPE_ENUM</tt> of the <tt> vx_round_policy_e</tt> enumeration.
 * \param [in] convolution_outputs_dims The output's dimension size after convolution operation.
 * \param [in] pool_outputs_dims The output's dimension size after pooling operation.
 * \param [in] weights_num_of_dims The dimension number of weights tensor.
 * \param [in] weights_dims The dimension size of weights tensor.
 * \param [in] weights_data_format The format of weights tensor.
 * \param [in] weights_fixed_point_pos The fixed point position when the weights element type is int16/int8, if 0 calculations are performed in integer math.
 * \param [in] biases_num_of_dims The dimension number of biases tensor.
 * \param [in] biases_dims The dimension size of biases tensor.
 * \param [in] biases_data_format The format of biases tensor.
 * \param [in] biases_fixed_point_pos The fixed point position when the biases element type is int16/int8, if 0 calculations are performed in integer math.
* \param [in] raw_data_size The data size of compressed data.
*
* \returns A weightsbiases reference without compressed kernel data <tt>vx_weights_biases_parameter</tt>. Any possible errors preventing a
* successful creation should be checked using <tt>\ref vxGetStatus</tt>.
*
* \ingroup group_cnn
*/
VX_API_ENTRY vx_weights_biases_parameter VX_API_CALL
vxCreateWeightsBiasesParameter(
vx_context context,
vx_enum layer_type,
vx_size num_of_dims,
vx_size * inputs_dims,
vx_uint32 pad_x,
vx_uint32 pad_y,
vx_uint32 pooling_size_x,
vx_uint32 pooling_size_y,
vx_enum down_scale_size_rounding,
vx_size * convolution_outputs_dims,
vx_size * pool_outputs_dims,
vx_size weights_num_of_dims,
vx_size * weights_dims,
vx_enum weights_data_format,
vx_int8 weights_fixed_point_pos,
vx_size biases_num_of_dims,
vx_size * biases_dims,
vx_enum biases_data_format,
vx_int8 biases_fixed_point_pos,
vx_uint32 raw_data_size
);
#else
/*!
* \brief Creates a reference to a vx_weights_biases_parameter opaque object.
*
@ -397,17 +560,6 @@ VX_API_ENTRY vx_weights_biases_parameter VX_API_CALL vxCreateWeightsBiasesParame
vx_tensor weights,
vx_tensor biases);
/*! \brief Releases the OpenVX object vx_weights_biases_parameter.
* \param [in] weights_bias The pointer to the reference to the vx_weights_biases_parameter.
* \post After returning from this function the reference is zeroed.
* \return A <tt>\ref vx_status_e</tt> enumeration.
* \retval VX_SUCCESS No errors.
* \retval VX_ERROR_INVALID_REFERENCE If weights_bias is not a <tt> vx_weights_biases_parameter</tt>.
* \pre <tt>\ref vxCreateWeightsBiasesParameterFromTensors / vxCreateWeightsBiasesParameterFromTensors2/ vxCreateWeightsBiasesParameter / vxCreateWeightsBiasesParameterFromStream</tt>
* \ingroup group_cnn
*/
VX_API_ENTRY vx_status VX_API_CALL vxReleaseWeightsBiasesParameter(vx_weights_biases_parameter *weights_bias);
/*!
* \brief Creates a reference to an vx_weights_biases_parameter object.
* \param [in] context The OpenVX context object.
@ -461,7 +613,18 @@ vxCreateWeightsBiasesParameter(
vx_int8 biases_fixed_point_pos,
vx_uint32 raw_data_size
);
#endif
/*! \brief Releases the OpenVX object vx_weights_biases_parameter.
* \param [in] weights_bias The pointer to the reference to the vx_weights_biases_parameter.
* \post After returning from this function the reference is zeroed.
* \return A <tt>\ref vx_status_e</tt> enumeration.
* \retval VX_SUCCESS No errors.
* \retval VX_ERROR_INVALID_REFERENCE If weights_bias is not a <tt> vx_weights_biases_parameter</tt>.
* \pre <tt>\ref vxCreateWeightsBiasesParameterFromTensors / vxCreateWeightsBiasesParameterFromTensors2/ vxCreateWeightsBiasesParameter / vxCreateWeightsBiasesParameterFromStream</tt>
* \ingroup group_cnn
*/
VX_API_ENTRY vx_status VX_API_CALL vxReleaseWeightsBiasesParameter(vx_weights_biases_parameter *weights_bias);
/*! \brief Input parameters for a gru operation.
* \ingroup group_cnn
* \version 0.5
@ -900,6 +1063,7 @@ VX_API_ENTRY vx_node VX_API_CALL vxTensorTableLookupLayer(
vx_lut InLut,
vx_lut OutLut,
vx_tensor output);
#ifdef __cplusplus
}
#endif

View File

@ -444,6 +444,11 @@ enum vx_type_e {
* \ingroup group_basic_features
*/
enum vx_status_e {
VX_ERROR_VENDOR_VSI_END = -2000, /*!< \brief A vendor defined error status end base. */
/* add new error here*/
VX_ERROR_CANCEL_JOB = -1001, /*!< \brief Indicates that a VIP job was cancelled. */
VX_ERROR_VENDOR_VSI_START = -1000, /*!< \brief A vendor defined error status start base. */
VX_STATUS_MIN = -25,/*!< \brief Indicates the lower bound of status codes in VX. Used for bounds checks only. */
/* add new codes here */
VX_ERROR_REFERENCE_NONZERO = -24,/*!< \brief Indicates that an operation did not complete due to a reference count being non-zero. */
@ -718,6 +723,8 @@ enum vx_graph_state_e {
VX_GRAPH_STATE_ABANDONED = VX_ENUM_BASE(VX_ID_KHRONOS, VX_ENUM_GRAPH_STATE) + 0x3,
/*! \brief The graph execution is completed and the graph is not scheduled for execution */
VX_GRAPH_STATE_COMPLETED = VX_ENUM_BASE(VX_ID_KHRONOS, VX_ENUM_GRAPH_STATE) + 0x4,
/*! \brief The graph execution was cancelled */
VX_GRAPH_STATE_CANCELLED = VX_ENUM_BASE(VX_ID_KHRONOS, VX_ENUM_GRAPH_STATE) + 0x5,
};
/*! \brief The graph attributes list.

View File

@ -53,6 +53,19 @@ VX_API_ENTRY vx_status VX_API_CALL vxSysSetVipFrequency(
vx_uint32 shaderFscaleValue
);
/*! \brief cancel all VIP processing jobs.
* \param [in] context The reference to the implementation context.
* \return A <tt>\ref vx_status_e</tt> enumeration.
 * \retval VX_SUCCESS Cancelled all VIP processing jobs successfully
 * and user can check return of vxProcessGraph() to get cancelled status.
 * \retval VX_ERROR_INVALID_PARAMETERS Invalid context reference.
 * \retval VX_ERROR_NOT_SUPPORTED Hardware does not support job cancellation.
 * \retval VX_FAILURE Failed to cancel VIP processing job.
*/
VX_API_ENTRY vx_status VX_API_CALL vxSysCancelJob(
vx_context context
);
#ifdef __cplusplus
}
#endif

View File

@ -25,7 +25,6 @@ filegroup(
srcs = glob([
"include/kernel/cl/*.h",
"include/kernel/evis/*.h",
"include/kernel/cpu/*.h",
])
)
@ -34,7 +33,6 @@ filegroup(
srcs = glob([
"src/kernel/cl/*.c",
"src/kernel/evis/*.c",
"src/kernel/cpu/*.c",
"src/kernel/vx/*.c",
])
)
@ -137,6 +135,7 @@ cc_library(
"include/kernel/vsi_nn_kernel_eltwise.h",
"include/kernel/vsi_nn_kernel_node.h",
"include/kernel/vsi_nn_kernel_gpu_shape_optimize.h",
"include/kernel/vsi_nn_kernel_lut.h",
"include/vsi_nn_error.h",
# libnnext
@ -193,6 +192,7 @@ cc_library(
"src/kernel/vsi_nn_kernel_selector.c",
"src/kernel/vsi_nn_kernel_node.c",
"src/kernel/vsi_nn_kernel_param.c",
"src/kernel/vsi_nn_kernel_lut.c",
"src/kernel/vsi_nn_gpu.c",
"src/kernel/vsi_nn_kernel_gpu_shape_optimize.c",
"src/libnnext/vsi_nn_libnnext_resource.c",

View File

@ -163,3 +163,5 @@ DEF_OP(CONV2D_LSTM_CELL)
DEF_OP(GRU)
DEF_OP(GRUCELL)
DEF_OP(GRUCELL_ACTIVATION)
DEF_OP(RESHAPE2)
DEF_OP(CONV3D)

View File

@ -17,3 +17,5 @@ DEF_OP(GRUCELL_ACTIVATION_INTERNAL_SMA)
DEF_OP(RESIZE_1D_BILINEAR_INTERNAL)
DEF_OP(RESIZE_1D_NEAREST_INTERNAL)
DEF_OP(SPACE2DEPTH_INTERNAL)
DEF_OP(GRUCELL_H_TIMES_ACTIVATION_R)
DEF_OP(GRUCELL_ACTIVATION_Z_H)

View File

@ -48,6 +48,7 @@ typedef enum
VSI_NN_KERNEL_TYPE_EVIS,
VSI_NN_KERNEL_TYPE_CL,
VSI_NN_KERNEL_TYPE_VX,
VSI_NN_KERNEL_TYPE_SP,
VSI_NN_KERNEL_TYPE_NUM,
VSI_NN_KERNEL_TYPE_NONE = VSI_NN_KERNEL_TYPE_NUM
} vsi_nn_kernel_type_e;
@ -75,7 +76,9 @@ typedef enum
F32,
F64,
BF16,
BOOL8
BOOL8,
I4,
U4,
} vsi_nn_kernel_dtype_e;
typedef enum
@ -303,6 +306,8 @@ const void * vsi_nn_kernel_param_get_const_buffer
REGISTER_KERNEL_BACKEND(operation, CPU, func)
#define REGISTER_BACKEND_OPENVX(operation, func) \
REGISTER_KERNEL_BACKEND(operation, VX, func)
#define REGISTER_BACKEND_STREAM_PROCESSOR(operation, func) \
REGISTER_KERNEL_BACKEND(operation, SP, func)
#define DEF_KERNEL_BASE_CALLBACK( NAME ) \
static vsi_status NAME##_impl( vsi_nn_kernel_node_t node, \
@ -478,6 +483,10 @@ static inline vsi_nn_kernel_dtype_e vsi_nn_kernel_map_dtype
{
switch( dtype )
{
case VSI_NN_TYPE_INT4:
return I4;
case VSI_NN_TYPE_UINT4:
return U4;
case VSI_NN_TYPE_INT8:
return I8;
case VSI_NN_TYPE_BOOL8:
@ -514,6 +523,10 @@ static inline vsi_nn_type_e vsi_nn_dtype_map_kernel
{
switch( dtype )
{
case I4:
return VSI_NN_TYPE_INT4;
case U4:
return VSI_NN_TYPE_UINT4;
case I8:
return VSI_NN_TYPE_INT8;
case BOOL8:
@ -572,6 +585,38 @@ static inline size_t vsi_nn_kernel_dtype_get_bytes
return 0;
} /* vsi_nn_kernel_dtype_get_bytes() */
/*
 * Map a kernel element type to its width in bits.
 * Returns 0 (after logging an error) for an unrecognized type.
 */
static inline vsi_size_t vsi_nn_kernel_dtype_get_bits
    (
    vsi_nn_kernel_dtype_e dtype
    )
{
    vsi_size_t bits = 0;

    switch( dtype )
    {
        case I4:
        case U4:
            bits = 4;
            break;
        case I8:
        case U8:
        case BOOL8:
            bits = 8;
            break;
        case I16:
        case U16:
        case F16:
        case BF16:
            bits = 16;
            break;
        case I32:
        case U32:
        case F32:
            bits = 32;
            break;
        case I64:
            bits = 64;
            break;
        default:
            VSILOGE("Error data type %d", dtype);
            break;
    }

    return bits;
} /* vsi_nn_kernel_dtype_get_bits() */
static inline vsi_nn_kernel_quant_type_e vsi_nn_kernel_map_quant_type
( vsi_nn_qnt_type_e quant_type )
{
@ -615,6 +660,12 @@ static inline void vsi_nn_kernel_scalar_release
}
} /* vsi_nn_kernel_scalar_relase() */
vsi_status vsi_nn_kernel_scalar_read_uint4
( vsi_nn_kernel_scalar_t scalar, uint8_t * out_data );
vsi_status vsi_nn_kernel_scalar_read_int4
( vsi_nn_kernel_scalar_t scalar, int8_t * out_data );
vsi_status vsi_nn_kernel_scalar_read_int8
( vsi_nn_kernel_scalar_t scalar, int8_t * out_data );
@ -751,25 +802,90 @@ static inline vsi_size_t vsi_nn_kernel_tensor_attr_get_size
static inline vsi_size_t vsi_nn_kernel_tensor_attr_get_bytes
( const vsi_nn_kernel_tensor_attr_t * attr )
{
vsi_size_t size;
vsi_size_t type_bytes;
vsi_size_t i = 0;
vsi_size_t bytes;
vsi_size_t bits_num;
vsi_size_t * shape = NULL;
if( !attr )
{
return 0;
}
size = vsi_nn_kernel_tensor_attr_get_size( attr );
type_bytes = (vsi_size_t)vsi_nn_kernel_dtype_get_bytes( attr->dtype );
return size * type_bytes;
shape = attr->shape->data;
bits_num = vsi_nn_kernel_dtype_get_bits( attr->dtype );
if ( bits_num < BITS_PER_BYTE )
{
if (shape[0] % 2 == 0)
{
bytes = shape[0] / 2;
}
else
{
bytes = shape[0] / 2 + shape[0] % 2;
}
}
else
{
bytes = shape[0] * bits_num / BITS_PER_BYTE;
}
for ( i = 1; i < (vsi_size_t)attr->shape->size; i ++ )
{
bytes *= shape[i];
}
return bytes;
} /* vsi_nn_kernel_tensor_attr_get_bytes() */
static inline void vsi_nn_kernel_tensor_attr_get_stride
( const vsi_nn_kernel_tensor_attr_t * attr, vsi_size_t * out_stride)
{
vsi_size_t type_bits;
vsi_size_t total_bytes;
vsi_size_t * shape = NULL;
if( !attr || !out_stride )
{
return;
}
vsi_nn_shape_get_stride( attr->shape->data, (vsi_size_t)attr->shape->size, out_stride );
shape = attr->shape->data;
type_bits = vsi_nn_kernel_dtype_get_bits( attr->dtype );
if ( type_bits < BITS_PER_BYTE )
{
vsi_size_t i;
out_stride[0] = type_bits / BITS_PER_BYTE;
total_bytes = out_stride[0];
total_bytes = 1;
if ( shape[0] % (BITS_PER_BYTE / type_bits) == 0 )
{
out_stride[1] = shape[0] * type_bits / BITS_PER_BYTE;
}
else
{
out_stride[1] = shape[0] * type_bits / BITS_PER_BYTE + 1;
}
total_bytes *= out_stride[1];
for (i = 2; i < (vsi_size_t)attr->shape->size; i++)
{
out_stride[i] = shape[i - 1] * out_stride[i - 1];
total_bytes *= shape[i];
}
total_bytes *= shape[1];
for( i = (vsi_size_t)attr->shape->size; i < VSI_NN_MAX_DIM_NUM; i ++ )
{
out_stride[i] = total_bytes;
}
}
else
{
vsi_nn_shape_get_stride( attr->shape->data, (vsi_size_t)attr->shape->size, out_stride );
}
} /* vsi_nn_kernel_tensor_attr_get_size() */
static inline vsi_bool vsi_nn_kernel_tensor_attr_is_quantized
@ -903,12 +1019,115 @@ static inline const char* vsi_nn_kernel_type_str
return "CL";
case VSI_NN_KERNEL_TYPE_VX:
return "OPENVX";
case VSI_NN_KERNEL_TYPE_SP:
return "STERAM_PROCESSOR";
default:
break;
}
return "None";
} /* vsi_nn_kernel_type_str() */
/*
 * Unpack 4-bit tensor data into one byte per element.
 *
 * `src` holds two 4-bit elements per byte (low nibble is emitted first,
 * then the high nibble, per the copy order below); `dest` receives one
 * element per byte. When dtype == I4 each nibble in [8, 15] is
 * sign-extended to a negative signed 8-bit value.
 *
 * NOTE(review): assumes each row of shape[0] elements is padded to a
 * whole byte when shape[0] is odd (see
 * vsi_nn_kernel_tensor_attr_get_stride): the `(i+1) % stride[1]` branch
 * drops the unused high nibble at the end of every odd-length row --
 * confirm against the packed-tensor layout used by callers.
 *
 * Always returns VSI_SUCCESS.
 */
static inline vsi_status vsi_nn_kernel_unpack_4bit_data
    (
    const vsi_nn_kernel_tensor_attr_t * attr,
    uint8_t * src,
    uint8_t * dest,
    vsi_nn_kernel_dtype_e dtype
    )
{
    vsi_status status;
    uint32_t i = 0, j = 0;
    uint8_t high = 0, low = 0;
    vsi_size_t stride[VSI_NN_MAX_DIM_NUM] = {0};
    vsi_size_t src_size;
    status = VSI_SUCCESS;

    vsi_nn_kernel_tensor_attr_get_stride( attr, stride );
    /* stride[shape->size] holds the total packed size in bytes. */
    src_size = stride[attr->shape->size];

    for ( i = 0 ; i < src_size; i++)
    {
        high = src[i] >> 4;
        low = src[i] & 0x0F;
        if ( dtype == I4 )
        {
            /* Sign-extend nibbles greater than 7 to negative int8 values. */
            if( high > 7)
            {
                high = high | 0xF0;
            }
            if( low > 7)
            {
                low = low | 0xF0;
            }
        }
        if ( attr->shape->data[0] % stride[1] == 0 )
        {
            if ( attr->shape->data[0] == 1 )
            {
                /* Single-element rows: only the low nibble is valid data. */
                dest[j] = low;
                j++;
            }
            else
            {
                dest[j] = low;
                dest[j+1] = high;
                j += 2;
            }
        }
        else
        {
            if ( (i+1) % stride[1] == 0 )
            {
                /* Last byte of an odd-length row: high nibble is padding. */
                dest[j] = low;
                j++;
            }
            else
            {
                dest[j] = low;
                dest[j+1] = high;
                j += 2;
            }
        }
    }

    return status;
}
/*
 * Pack one-byte-per-element 4-bit tensor data into two elements per byte
 * (low nibble first). Each row of shape[0] elements starts on a fresh
 * byte; an odd row length leaves the final high nibble as zero padding.
 *
 * NOTE(review): the `(i+1) % attr->shape->data[0] == 0` test only pads
 * when a row boundary lands on the element being packed; assumes `dest`
 * is sized per vsi_nn_kernel_tensor_attr_get_bytes -- confirm.
 *
 * Always returns VSI_SUCCESS.
 */
static inline vsi_status vsi_nn_kernel_pack_4bit_data
    (
    const vsi_nn_kernel_tensor_attr_t * attr,
    uint8_t * src,
    uint8_t * dest
    )
{
    vsi_status status;
    uint32_t i = 0, j = 0;
    uint8_t high = 0, low = 0;
    vsi_size_t src_size;
    status = VSI_SUCCESS;

    /* Total element count across all dimensions. */
    src_size = vsi_nn_kernel_tensor_attr_get_size( attr );

    for ( i = 0; i < src_size; i++ )
    {
        if ( (i+1) % attr->shape->data[0] == 0)
        {
            /* Row ends on this element: high nibble stays zero padding. */
            high = 0;
            low = src[i];
        }
        else
        {
            /* Pair this element with the next one; consume both. */
            high = src[i+1];
            low = src[i];
            i++;
        }
        dest[j] = (high << 4) | (low & 0xF);
        j++;
    }

    return status;
}
__END_DECLS
#endif

View File

@ -0,0 +1,75 @@
/****************************************************************************
*
* Copyright (c) 2021 Vivante Corporation
*
* Permission is hereby granted, free of charge, to any person obtaining a
* copy of this software and associated documentation files (the "Software"),
* to deal in the Software without restriction, including without limitation
* the rights to use, copy, modify, merge, publish, distribute, sublicense,
* and/or sell copies of the Software, and to permit persons to whom the
* Software is furnished to do so, subject to the following conditions:
*
* The above copyright notice and this permission notice shall be included in
* all copies or substantial portions of the Software.
*
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
* AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
* DEALINGS IN THE SOFTWARE.
*
*****************************************************************************/
#ifndef _VSI_NN_KERNEL_LUT_H
#define _VSI_NN_KERNEL_LUT_H
#include <stdint.h>
__BEGIN_DECLS
typedef int32_t vsi_nn_kernel_lut_act_e; enum
{
VSI_NN_KERNEL_LUT_NONE = 0,
VSI_NN_KERNEL_LUT_MISH = 1,
VSI_NN_KERNEL_LUT_LOG = 2,
VSI_NN_KERNEL_LUT_EXP = 3,
VSI_NN_KERNEL_LUT_ELU = 4,
VSI_NN_KERNEL_LUT_NEG = 5,
VSI_NN_KERNEL_LUT_HSIGMOID = 6,
VSI_NN_KERNEL_LUT_SOFT_PLUS = 7,
VSI_NN_KERNEL_LUT_ERF = 8,
VSI_NN_KERNEL_LUT_GELU = 9,
VSI_NN_KERNEL_LUT_HGELU = 10,
VSI_NN_KERNEL_LUT_RELU_KERAS = 11,
VSI_NN_KERNEL_LUT_CLIP = 12,
VSI_NN_KERNEL_LUT_SQUARE = 13,
};
#define VSI_NN_KERNEL_LUT_MAX_SIZE (1024)
#define VSI_NN_KERNEL_LUT_FP16_MAX (57344)
#define VSI_NN_KERNEL_LUT_FP16_MIN (-57344)
typedef struct _vsi_nn_kernel_lut_
{
float index;
float val;
} vsi_nn_kernel_lut_t;
typedef struct _vsi_nn_kernel_lut_params
{
vsi_enum act_type;
float params[16];
} vsi_nn_kernel_lut_params;
vsi_status vsi_nn_kernel_lut
(
vx_lut index_lut,
vx_lut output_lut,
vsi_nn_kernel_lut_params *param
);
__END_DECLS
#endif

View File

@ -26,6 +26,9 @@
#include "vsi_nn_types.h"
#ifdef __cplusplus
extern "C" {
#endif
#define VSI_NN_ARGMIN_SH_KERNEL_IDX(_AXIS, _INPUT_TYPE, _OUTPUT_TYPE, _IMAGE_DIMS) \
VSI_NN_ARGMIN_AXIS##_AXIS##_##_INPUT_TYPE##TO##_OUTPUT_TYPE##_##_IMAGE_DIMS##_KERNEL,
@ -110,5 +113,8 @@ typedef struct _vsi_nn_argmin_param
int32_t axis;
} vsi_nn_argmin_param;
#ifdef __cplusplus
}
#endif
#endif

View File

@ -26,10 +26,17 @@
#include "vsi_nn_types.h"
#ifdef __cplusplus
extern "C" {
#endif
typedef struct _vsi_nn_axis_aligned_bbox_transform_param
{
vsi_enum type;
} vsi_nn_axis_aligned_bbox_transform_param;
#ifdef __cplusplus
}
#endif
#endif

View File

@ -27,6 +27,10 @@
#include "vsi_nn_types.h"
#ifdef __cplusplus
extern "C" {
#endif
/* enum for inputs/outputs */
enum
{
@ -50,5 +54,8 @@ typedef struct _vsi_nn_batchnorm_single_param
float eps;
} vsi_nn_batchnorm_single_param;
#ifdef __cplusplus
}
#endif
#endif

View File

@ -26,6 +26,10 @@
#include "vsi_nn_types.h"
#ifdef __cplusplus
extern "C" {
#endif
enum
{
BI_LSTM_INPUT_INPUT = 0,
@ -132,5 +136,8 @@ typedef struct _vsi_nn_bidirectional_sequence_lstm_param
vsi_nn_dtype_t *internal_dtype;
} vsi_nn_bidirectional_sequence_lstm_param;
#ifdef __cplusplus
}
#endif
#endif

View File

@ -27,6 +27,10 @@
#include "vsi_nn_types.h"
#include "vsi_nn_op_rnn.h"
#ifdef __cplusplus
extern "C" {
#endif
/* enum for inputs/outputs */
enum
{
@ -62,5 +66,8 @@ typedef struct _vsi_nn_bidirectional_sequence_rnn_param
vsi_nn_dtype_t* internal_dtype;
} vsi_nn_bidirectional_sequence_rnn_param;
#ifdef __cplusplus
}
#endif
#endif

View File

@ -26,6 +26,10 @@
#include "vsi_nn_types.h"
#ifdef __cplusplus
extern "C" {
#endif
typedef struct _vsi_nn_box_with_nms_limit_param
{
float score_threshold;
@ -36,5 +40,8 @@ typedef struct _vsi_nn_box_with_nms_limit_param
float nms_score_threshold;
} vsi_nn_box_with_nms_limit_param;
#ifdef __cplusplus
}
#endif
#endif

View File

@ -27,11 +27,18 @@
#include "vsi_nn_types.h"
#ifdef __cplusplus
extern "C" {
#endif
typedef struct _vsi_nn_cast_param
{
// Add parameters here
int32_t nothing;
} vsi_nn_cast_param;
#ifdef __cplusplus
}
#endif
#endif

View File

@ -26,6 +26,10 @@
#include "vsi_nn_types.h"
#ifdef __cplusplus
extern "C" {
#endif
enum
{
CONV2D_LSTM_IN_INPUT = 0,
@ -73,4 +77,8 @@ typedef struct _vsi_nn_conv2d_lstm_param
vsi_nn_conv2d_param conv2d;
} vsi_nn_conv2d_lstm_param;
#ifdef __cplusplus
}
#endif
#endif

View File

@ -26,6 +26,10 @@
#include "vsi_nn_types.h"
#ifdef __cplusplus
extern "C" {
#endif
#define CONV2D_LSTM_CELL_GATE_NUM 4 // i,f,c,o
enum
@ -73,4 +77,8 @@ typedef struct _vsi_nn_conv2d_lstm_cell_param
vsi_nn_conv2d_param conv2d;
} vsi_nn_conv2d_lstm_cell_param;
#ifdef __cplusplus
}
#endif
#endif

View File

@ -0,0 +1,58 @@
/****************************************************************************
*
* Copyright (c) 2020 Vivante Corporation
*
* Permission is hereby granted, free of charge, to any person obtaining a
* copy of this software and associated documentation files (the "Software"),
* to deal in the Software without restriction, including without limitation
* the rights to use, copy, modify, merge, publish, distribute, sublicense,
* and/or sell copies of the Software, and to permit persons to whom the
* Software is furnished to do so, subject to the following conditions:
*
* The above copyright notice and this permission notice shall be included in
* all copies or substantial portions of the Software.
*
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
* AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
* DEALINGS IN THE SOFTWARE.
*
*****************************************************************************/
#ifndef _VSI_NN_OP_CONV3D_H
#define _VSI_NN_OP_CONV3D_H
#include "vsi_nn_types.h"
#ifdef __cplusplus
extern "C" {
#endif
typedef struct _vsi_nn_conv3d_param
{
struct _conv3d_local_data_t* local;
// Add parameters here
/*w, h, d*/
int32_t ksize[3];
int32_t stride[3];
int32_t dilation[3];
/* Pad left, right, top, bottom, front, rear*/
int32_t pad[6];
/* Pad type default value shall be AUTO */
vsi_nn_pad_e pad_type;
int32_t weights;
int32_t multiplier;
} vsi_nn_conv3d_param;
_compiler_assert(offsetof(vsi_nn_conv3d_param, local) == 0, \
vsi_nn_conv3d_h );
#ifdef __cplusplus
}
#endif
#endif

View File

@ -26,6 +26,10 @@
#include "vsi_nn_types.h"
#ifdef __cplusplus
extern "C" {
#endif
typedef struct _vsi_nn_detection_postprocess_param
{
float dy;
@ -41,5 +45,8 @@ typedef struct _vsi_nn_detection_postprocess_param
int32_t is_bg_in_label;
} vsi_nn_detection_postprocess_param;
#ifdef __cplusplus
}
#endif
#endif

View File

@ -26,6 +26,9 @@
#include "vsi_nn_types.h"
#ifdef __cplusplus
extern "C" {
#endif
#define _VSI_NN_EXP_LOCAL_TENSOR_NUM 2
@ -42,5 +45,8 @@ typedef struct _vsi_nn_exp_param
vsi_nn_exp_lcl_data local;
} vsi_nn_exp_param;
#ifdef __cplusplus
}
#endif
#endif

View File

@ -26,6 +26,10 @@
#include "vsi_nn_types.h"
#ifdef __cplusplus
extern "C" {
#endif
#define _VSI_NN_EXTRA_ENDING_LOCAL_TENSOR_NUM 3
typedef struct _vsi_nn_extra_ending_lcl_data
@ -44,5 +48,8 @@ typedef struct _vsi_nn_extra_ending_param
int length;
} vsi_nn_extra_ending_param;
#ifdef __cplusplus
}
#endif
#endif

View File

@ -26,10 +26,17 @@
#include "vsi_nn_types.h"
#ifdef __cplusplus
extern "C" {
#endif
typedef struct _vsi_nn_floor_param
{
vsi_enum type;
} vsi_nn_floor_param;
#ifdef __cplusplus
}
#endif
#endif

View File

@ -27,11 +27,17 @@
#include "vsi_nn_types.h"
#ifdef __cplusplus
extern "C" {
#endif
typedef struct _vsi_nn_gelu_param
{
vsi_bool approximate;
} vsi_nn_gelu_param;
#ifdef __cplusplus
}
#endif
#endif

View File

@ -26,6 +26,10 @@
#include "vsi_nn_types.h"
#ifdef __cplusplus
extern "C" {
#endif
typedef struct _vsi_nn_generate_proposals_param
{
float height_stride;
@ -37,5 +41,8 @@ typedef struct _vsi_nn_generate_proposals_param
int32_t type;
} vsi_nn_generate_proposals_param;
#ifdef __cplusplus
}
#endif
#endif

View File

@ -27,11 +27,14 @@
#include "vsi_nn_types.h"
#ifdef __cplusplus
extern "C" {
#endif
typedef struct _grouped_conv1d_local_data_t {
vsi_nn_tensor_t* input;
vsi_nn_tensor_t* weight;
vsi_nn_tensor_t* output;
} grouped_conv1d_local_data_t;
typedef struct _vsi_nn_grouped_conv1d_param
@ -50,6 +53,8 @@ typedef struct _vsi_nn_grouped_conv1d_param
int32_t multiplier;
} vsi_nn_grouped_conv1d_param;
#ifdef __cplusplus
}
#endif
#endif

View File

@ -26,6 +26,10 @@
#include "vsi_nn_types.h"
#ifdef __cplusplus
extern "C" {
#endif
typedef struct _vsi_nn_grouped_conv2d_param
{
uint32_t ksize[2];
@ -41,5 +45,8 @@ typedef struct _vsi_nn_grouped_conv2d_param
void* local;
} vsi_nn_grouped_conv2d_param;
#ifdef __cplusplus
}
#endif
#endif

View File

@ -26,6 +26,10 @@
#include "vsi_nn_types.h"
#ifdef __cplusplus
extern "C" {
#endif
/* Define the inputs and outputs for GRU Layer */
enum
{
@ -74,5 +78,8 @@ typedef struct _vsi_nn_gru_param
_compiler_assert(offsetof(vsi_nn_gru_param, local) == 0, \
vsi_nn_gru_h );
#ifdef __cplusplus
}
#endif
#endif

View File

@ -27,6 +27,10 @@
#include "vsi_nn_types.h"
#include "vsi_nn_op_grucell_ovxlib.h"
#ifdef __cplusplus
extern "C" {
#endif
/* enum for inputs/outputs */
enum
{
@ -74,5 +78,8 @@ typedef struct _vsi_nn_gru_ovxlib_param
uint32_t cudnn_implementation_version;
} vsi_nn_gru_ovxlib_param;
#ifdef __cplusplus
}
#endif
#endif

View File

@ -26,6 +26,10 @@
#include "vsi_nn_types.h"
#ifdef __cplusplus
extern "C" {
#endif
enum
{
GRUCELL_GATES_Z = 0,
@ -81,4 +85,8 @@ typedef struct _vsi_nn_grucell_param
_compiler_assert(offsetof(vsi_nn_grucell_param, local) == 0, \
vsi_nn_conv1d_h );
#ifdef __cplusplus
}
#endif
#endif

View File

@ -26,11 +26,18 @@
#include "vsi_nn_types.h"
#ifdef __cplusplus
extern "C" {
#endif
enum {
GRUCELL_ACT_IN_H_STATE = 0,
GRUCELL_ACT_IN_INPUT_FC_H = 1,
GRUCELL_ACT_IN_H_T = 2,
GRUCELL_ACT_IN_Z_T = 3,
GRUCELL_ACT_H_STATE = 0,
GRUCELL_ACT_I_FC_Z = 1,
GRUCELL_ACT_I_FC_R = 2,
GRUCELL_ACT_I_FC_H = 3,
GRUCELL_ACT_H_FC_Z = 4,
GRUCELL_ACT_H_FC_R = 5,
GRUCELL_ACT_H_FC_H = 6,
GRUCELL_ACT_IN_CNT,
@ -45,8 +52,13 @@ typedef struct _vsi_nn_grucell_activation_param
struct _vsi_nn_grucell_activation_local * local;
vsi_nn_activation_e activation;
vsi_nn_activation_e recurrent_activation;
} vsi_nn_grucell_activation_param;
_compiler_assert(offsetof(vsi_nn_grucell_activation_param, local) == 0, \
vsi_nn_grucell_activation_h );
#ifdef __cplusplus
}
#endif
#endif

View File

@ -27,6 +27,10 @@
#include "vsi_nn_types.h"
#ifdef __cplusplus
extern "C" {
#endif
enum {
GRUCELL_ACTIVATION_INPUT_ZT_ = 0,
GRUCELL_ACTIVATION_INPUT_HT__ = 1,
@ -83,5 +87,8 @@ typedef struct _vsi_nn_grucell_activation_internal_param
grucell_activation_input_layout_e input_layout;
} vsi_nn_grucell_activation_internal_param;
#ifdef __cplusplus
}
#endif
#endif

View File

@ -27,6 +27,10 @@
#include "vsi_nn_types.h"
#ifdef __cplusplus
extern "C" {
#endif
enum {
GRUCELL_ACTIVATION_SMA_INPUT_H_STATE = 0,
GRUCELL_ACTIVATION_SMA_INPUT_H_T_ = 1,
@ -47,5 +51,8 @@ typedef struct _vsi_nn_grucell_activation_internal_sma_param
vsi_nn_grucell_activation_internal_sma_local* local;
} vsi_nn_grucell_activation_internal_sma_param;
#ifdef __cplusplus
}
#endif
#endif

View File

@ -0,0 +1,63 @@
/****************************************************************************
*
* Copyright (c) 2020 Vivante Corporation
*
* Permission is hereby granted, free of charge, to any person obtaining a
* copy of this software and associated documentation files (the "Software"),
* to deal in the Software without restriction, including without limitation
* the rights to use, copy, modify, merge, publish, distribute, sublicense,
* and/or sell copies of the Software, and to permit persons to whom the
* Software is furnished to do so, subject to the following conditions:
*
* The above copyright notice and this permission notice shall be included in
* all copies or substantial portions of the Software.
*
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
* AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
* DEALINGS IN THE SOFTWARE.
*
*****************************************************************************/
#ifndef _VSI_NN_OP_GRUCELL_ACTIVATION_Z_H_H
#define _VSI_NN_OP_GRUCELL_ACTIVATION_Z_H_H
#include "vsi_nn_types.h"
#ifdef __cplusplus
extern "C" {
#endif
enum {
GRUCELL_ACT_Z_H_HSTATE = 0,
GRUCELL_ACT_Z_H_I_FC_Z = 1,
GRUCELL_ACT_Z_H_I_FC_H = 2,
GRUCELL_ACT_Z_H_H_FC_Z = 3,
GRUCELL_ACT_Z_H_H_FC_H = 4,
GRUCELL_ACT_Z_H_IN_CNT,
GRUCELL_ACT_Z_H_OUT_OUTPUT = 0,
GRUCELL_ACT_Z_H_OUT_HSTATE = 1,
GRUCELL_ACT_Z_H_OUT_CNT
};
typedef struct _vsi_nn_grucell_activation_z_h_param
{
struct _grucell_activation_z_h_local_data_t* local;
// Add parameters here
vsi_nn_activation_e activation;
vsi_nn_activation_e recurrent_activation;
} vsi_nn_grucell_activation_z_h_param;
_compiler_assert(offsetof(vsi_nn_grucell_activation_z_h_param, local) == 0, \
vsi_nn_grucell_activation_z_h_h );
#ifdef __cplusplus
}
#endif
#endif

View File

@ -0,0 +1,47 @@
/****************************************************************************
*
* Copyright (c) 2020 Vivante Corporation
*
* Permission is hereby granted, free of charge, to any person obtaining a
* copy of this software and associated documentation files (the "Software"),
* to deal in the Software without restriction, including without limitation
* the rights to use, copy, modify, merge, publish, distribute, sublicense,
* and/or sell copies of the Software, and to permit persons to whom the
* Software is furnished to do so, subject to the following conditions:
*
* The above copyright notice and this permission notice shall be included in
* all copies or substantial portions of the Software.
*
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
* AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
* DEALINGS IN THE SOFTWARE.
*
*****************************************************************************/
#ifndef _VSI_NN_OP_GRUCELL_H_TIMES_ACTIVATION_R_H
#define _VSI_NN_OP_GRUCELL_H_TIMES_ACTIVATION_R_H
#include "vsi_nn_types.h"
#ifdef __cplusplus
extern "C" {
#endif
typedef struct _vsi_nn_grucell_h_times_activation_r_param
{
struct _grucell_h_times_activation_r_local_data_t* local;
vsi_nn_activation_e recurrent_activation;
} vsi_nn_grucell_h_times_activation_r_param;
_compiler_assert(offsetof(vsi_nn_grucell_h_times_activation_r_param, local) == 0, \
vsi_nn_grucell_h_times_activation_r_h );
#ifdef __cplusplus
}
#endif
#endif

View File

@ -27,6 +27,10 @@
#include "vsi_nn_types.h"
#include "vsi_nn_op_grucell_ovxlib.h"
#ifdef __cplusplus
extern "C" {
#endif
#define GRUCELL_RZ_GATE_COUNT 2
/* enum for inputs/outputs */
@ -103,4 +107,8 @@ typedef struct _vsi_nn_grucell_ovxlib_param
_compiler_assert(offsetof(vsi_nn_grucell_ovxlib_param, local) == 0, \
vsi_nn_vsi_nn_grucell_ovxlib_h );
#ifdef __cplusplus
}
#endif
#endif

View File

@ -0,0 +1,46 @@
/****************************************************************************
*
* Copyright (c) 2020 Vivante Corporation
*
* Permission is hereby granted, free of charge, to any person obtaining a
* copy of this software and associated documentation files (the "Software"),
* to deal in the Software without restriction, including without limitation
* the rights to use, copy, modify, merge, publish, distribute, sublicense,
* and/or sell copies of the Software, and to permit persons to whom the
* Software is furnished to do so, subject to the following conditions:
*
* The above copyright notice and this permission notice shall be included in
* all copies or substantial portions of the Software.
*
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
* AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
* DEALINGS IN THE SOFTWARE.
*
*****************************************************************************/
#ifndef _VSI_NN_OP_HARD_SIGMOID_H
#define _VSI_NN_OP_HARD_SIGMOID_H
#include "vsi_nn_types.h"
#ifdef __cplusplus
extern "C" {
#endif
typedef struct _vsi_nn_hard_sigmoid_param
{
void* local;
// Add parameters here
float alpha;
float beta;
} vsi_nn_hard_sigmoid_param;
#ifdef __cplusplus
}
#endif
#endif

View File

@ -26,10 +26,17 @@
#include "vsi_nn_types.h"
#ifdef __cplusplus
extern "C" {
#endif
typedef struct _vsi_nn_heatmap_max_keypoint_param
{
vsi_enum type;
} vsi_nn_heatmap_max_keypoint_param;
#ifdef __cplusplus
}
#endif
#endif

View File

@ -27,6 +27,10 @@
#include "vsi_nn_types.h"
#ifdef __cplusplus
extern "C" {
#endif
typedef struct _vsi_nn_interp_param
{
struct _interp_local_data_t* local;
@ -38,7 +42,8 @@ typedef struct _vsi_nn_interp_param
int32_t pad_end; //padding at end of intput
} vsi_nn_interp_param;
#ifdef __cplusplus
}
#endif
#endif

View File

@ -26,6 +26,9 @@
#include "vsi_nn_types.h"
#ifdef __cplusplus
extern "C" {
#endif
#define VSI_NN_LOG_SH_KERNEL_IDX(_INPUT_TYPE, _OUTPUT_TYPE, _IMAGE_DIMS) \
VSI_NN_LOG_##_INPUT_TYPE##TO##_OUTPUT_TYPE##_##_IMAGE_DIMS##_KERNEL,
@ -113,6 +116,8 @@ typedef struct _vsi_nn_log_param
vsi_nn_log_lcl_data local;
} vsi_nn_log_param;
#ifdef __cplusplus
}
#endif
#endif

View File

@ -26,6 +26,10 @@
#include "vsi_nn_types.h"
#ifdef __cplusplus
extern "C" {
#endif
#define VSI_NN_LOGSOFTMAX_SH_KERNEL_IDX(_AXIS, _INPUT_TYPE, _OUTPUT_TYPE, _IMAGE_DIMS) \
VSI_NN_LOGSOFTMAX_AXIS##_AXIS##_##_INPUT_TYPE##TO##_OUTPUT_TYPE##_##_IMAGE_DIMS##_KERNEL,
@ -150,5 +154,8 @@ typedef struct _vsi_nn_log_softmax_param
int32_t axis;
} vsi_nn_log_softmax_param;
#ifdef __cplusplus
}
#endif
#endif

View File

@ -26,6 +26,10 @@
#include "vsi_nn_types.h"
#ifdef __cplusplus
extern "C" {
#endif
typedef enum
{
VSI_NN_LSH_PROJECTION_SPARSE = 1,
@ -37,5 +41,8 @@ typedef struct _vsi_nn_lsh_projection_param
vsi_nn_lsh_projection_type_e type;
} vsi_nn_lsh_projection_param;
#ifdef __cplusplus
}
#endif
#endif

View File

@ -27,6 +27,10 @@
#include "vsi_nn_types.h"
#include "vsi_nn_op_lstmunit.h"
#ifdef __cplusplus
extern "C" {
#endif
enum
{
LSTM_INPUT_INPUT = 0,
@ -100,5 +104,8 @@ typedef struct _vsi_nn_lstm_ovxlib_param
uint32_t weights; /* compatible with LSTM, NOT used */
} vsi_nn_lstm_ovxlib_param;
#ifdef __cplusplus
}
#endif
#endif

View File

@ -27,6 +27,10 @@
#include "vsi_nn_types.h"
#include "vsi_nn_op_lstmunit.h"
#ifdef __cplusplus
extern "C" {
#endif
/* c -> cifg, l -> layer norm, p -> projection, h -> peephole, b -> hybrid bias fp32, s -> standard*/
enum {
@ -96,5 +100,8 @@ typedef struct _vsi_nn_lstmunit_activation_param
vsi_nn_activation_e recurrent_activation;
} vsi_nn_lstmunit_activation_param;
#ifdef __cplusplus
}
#endif
#endif

View File

@ -28,6 +28,10 @@
#include "vsi_nn_types.h"
#include "vsi_nn_op_lstmunit.h"
#ifdef __cplusplus
extern "C" {
#endif
#define LSTMUNIT_IFCO_GATE_COUNT 4
/* enum for inputs/outputs */
@ -274,4 +278,8 @@ typedef struct _vsi_nn_lstmunit_ovxlib_param
vsi_nn_dtype_t *internal_dtype_aux;
} vsi_nn_lstmunit_ovxlib_param;
#ifdef __cplusplus
}
#endif
#endif

View File

@ -26,6 +26,9 @@
#include "vsi_nn_types.h"
#ifdef __cplusplus
extern "C" {
#endif
#define _VSI_NN_ELU_LOCAL_TENSOR_NUM 2
@ -34,5 +37,8 @@ typedef struct _vsi_nn_neg_param
vx_tensor local_tensor[_VSI_NN_ELU_LOCAL_TENSOR_NUM];
} vsi_nn_neg_param;
#ifdef __cplusplus
}
#endif
#endif

View File

@ -27,6 +27,10 @@
#include "vsi_nn_types.h"
#ifdef __cplusplus
extern "C" {
#endif
typedef struct _vsi_nn_nms_param
{
int32_t max_output_size;
@ -35,4 +39,8 @@ typedef struct _vsi_nn_nms_param
float soft_nms_sigma;
} vsi_nn_nms_param;
#ifdef __cplusplus
}
#endif
#endif

View File

@ -27,6 +27,10 @@
#include "vsi_nn_types.h"
#ifdef __cplusplus
extern "C" {
#endif
typedef struct _vsi_nn_one_hot_param
{
struct _one_hot_local_data_t* local;
@ -39,4 +43,8 @@ typedef struct _vsi_nn_one_hot_param
_compiler_assert(offsetof(vsi_nn_one_hot_param, local) == 0, \
vsi_nn_one_hot_h );
#ifdef __cplusplus
}
#endif
#endif

View File

@ -26,6 +26,10 @@
#include "vsi_nn_types.h"
#ifdef __cplusplus
extern "C" {
#endif
enum
{
POST_PROCESS_INPUT = 0,
@ -53,5 +57,8 @@ typedef struct _vsi_nn_post_process_param
vsi_nn_post_process_lcl_data local;
} vsi_nn_post_process_param;
#ifdef __cplusplus
}
#endif
#endif

View File

@ -27,6 +27,10 @@
#include "vsi_nn_types.h"
#include "vsi_nn_pre_post_process.h"
#ifdef __cplusplus
extern "C" {
#endif
typedef vsi_nn_preprocess_source_format_e vsi_nn_pre_process_type_e;
enum
@ -80,5 +84,9 @@ typedef struct _vsi_nn_pre_process_param
vsi_nn_pre_process_lcl_data *local;
} vsi_nn_pre_process_param;
#ifdef __cplusplus
}
#endif
#endif

View File

@ -26,6 +26,10 @@
#include "vsi_nn_types.h"
#ifdef __cplusplus
extern "C" {
#endif
typedef struct _vsi_nn_pre_process_bgra_lcl_data
{
int32_t scale_x;
@ -65,5 +69,8 @@ typedef struct _vsi_nn_pre_process_bgra_param
vsi_nn_pre_process_bgra_lcl_data local;
} vsi_nn_pre_process_bgra_param;
#ifdef __cplusplus
}
#endif
#endif

View File

@ -26,6 +26,10 @@
#include "vsi_nn_types.h"
#ifdef __cplusplus
extern "C" {
#endif
enum
{
PRE_PROCESS_GRAY_INPUT = 0,
@ -67,5 +71,8 @@ typedef struct _vsi_nn_pre_process_gray_param
vsi_nn_pre_process_gray_lcl_data local;
} vsi_nn_pre_process_gray_param;
#ifdef __cplusplus
}
#endif
#endif

View File

@ -26,6 +26,9 @@
#include "vsi_nn_types.h"
#ifdef __cplusplus
extern "C" {
#endif
enum
{
@ -77,5 +80,8 @@ typedef struct _vsi_nn_pre_process_rgb_param
vsi_nn_pre_process_rgb_lcl_data local;
} vsi_nn_pre_process_rgb_param;
#ifdef __cplusplus
}
#endif
#endif

View File

@ -26,6 +26,10 @@
#include "vsi_nn_types.h"
#ifdef __cplusplus
extern "C" {
#endif
enum
{
PRE_PROCESS_TENSOR_INPUT = 0,
@ -53,5 +57,8 @@ typedef struct _vsi_nn_pre_process_tensor_param
vsi_nn_pre_process_tensor_lcl_data local;
} vsi_nn_pre_process_tensor_param;
#ifdef __cplusplus
}
#endif
#endif

View File

@ -26,6 +26,10 @@
#include "vsi_nn_types.h"
#ifdef __cplusplus
extern "C" {
#endif
enum
{
Q16_LSTM_INPUT_INPUT = 0,
@ -60,5 +64,8 @@ typedef struct _vsi_nn_quantized_16bit_lstm_param
void* local;
} vsi_nn_quantized_16bit_lstm_param;
#ifdef __cplusplus
}
#endif
#endif

View File

@ -26,10 +26,17 @@
#include "vsi_nn_types.h"
#ifdef __cplusplus
extern "C" {
#endif
typedef struct _vsi_nn_random_multinomial_param
{
int32_t sample_num;
} vsi_nn_random_multinomial_param;
#ifdef __cplusplus
}
#endif
#endif

View File

@ -26,6 +26,10 @@
#include "vsi_nn_types.h"
#ifdef __cplusplus
extern "C" {
#endif
#define VSI_NN_REDUCEALL_SH_KERNEL_IDX(_AXIS, _INPUT_TYPE, _OUTPUT_TYPE, _IMAGE_DIMS) \
VSI_NN_REDUCEALL_AXIS##_AXIS##_##_INPUT_TYPE##TO##_OUTPUT_TYPE##_##_IMAGE_DIMS##_KERNEL,
@ -68,5 +72,8 @@ typedef struct _vsi_nn_reduceall_internal_param
vx_bool keep_dim;
} vsi_nn_reduceall_internal_param;
#ifdef __cplusplus
}
#endif
#endif

View File

@ -26,6 +26,10 @@
#include "vsi_nn_types.h"
#ifdef __cplusplus
extern "C" {
#endif
#define VSI_NN_REDUCEANY_SH_KERNEL_IDX(_AXIS, _INPUT_TYPE, _OUTPUT_TYPE, _IMAGE_DIMS) \
VSI_NN_REDUCEANY_AXIS##_AXIS##_##_INPUT_TYPE##TO##_OUTPUT_TYPE##_##_IMAGE_DIMS##_KERNEL,
@ -68,5 +72,8 @@ typedef struct _vsi_nn_reduceany_internal_param
vx_bool keep_dim;
} vsi_nn_reduceany_internal_param;
#ifdef __cplusplus
}
#endif
#endif

View File

@ -26,6 +26,10 @@
#include "vsi_nn_types.h"
#ifdef __cplusplus
extern "C" {
#endif
#define VSI_NN_REDUCEMAX_SH_KERNEL_IDX(_AXIS, _INPUT_TYPE, _OUTPUT_TYPE, _IMAGE_DIMS) \
VSI_NN_REDUCEMAX_AXIS##_AXIS##_##_INPUT_TYPE##TO##_OUTPUT_TYPE##_##_IMAGE_DIMS##_KERNEL,
@ -113,5 +117,8 @@ typedef struct _vsi_nn_reducemax_internal_param
vx_bool keep_dim;
} vsi_nn_reducemax_internal_param;
#ifdef __cplusplus
}
#endif
#endif

View File

@ -26,6 +26,10 @@
#include "vsi_nn_types.h"
#ifdef __cplusplus
extern "C" {
#endif
#define VSI_NN_REDUCEMIN_SH_KERNEL_IDX(_AXIS, _INPUT_TYPE, _OUTPUT_TYPE, _IMAGE_DIMS) \
VSI_NN_REDUCEMIN_AXIS##_AXIS##_##_INPUT_TYPE##TO##_OUTPUT_TYPE##_##_IMAGE_DIMS##_KERNEL,
@ -113,5 +117,8 @@ typedef struct _vsi_nn_reducemin_internal_param
vx_bool keep_dim;
} vsi_nn_reducemin_internal_param;
#ifdef __cplusplus
}
#endif
#endif

View File

@ -26,6 +26,10 @@
#include "vsi_nn_types.h"
#ifdef __cplusplus
extern "C" {
#endif
#define VSI_NN_REDUCEPROD_SH_KERNEL_IDX(_AXIS, _INPUT_TYPE, _OUTPUT_TYPE, _IMAGE_DIMS) \
VSI_NN_REDUCEPROD_AXIS##_AXIS##_##_INPUT_TYPE##TO##_OUTPUT_TYPE##_##_IMAGE_DIMS##_KERNEL,
@ -118,5 +122,8 @@ typedef struct _vsi_nn_reduceprod_internal_param
vx_bool keep_dim;
} vsi_nn_reduceprod_internal_param;
#ifdef __cplusplus
}
#endif
#endif

View File

@ -26,6 +26,10 @@
#include "vsi_nn_types.h"
#ifdef __cplusplus
extern "C" {
#endif
typedef struct _vsi_nn_reducesum_lcl_data_t
{
vsi_nn_tensor_t *reshaped_input;
@ -40,5 +44,8 @@ typedef struct _vsi_nn_reducesum_internal_param
vsi_nn_reducesum_lcl_data_t* local;
} vsi_nn_reducesum_internal_param;
#ifdef __cplusplus
}
#endif
#endif

View File

@ -26,6 +26,10 @@
#include "vsi_nn_types.h"
#ifdef __cplusplus
extern "C" {
#endif
typedef struct _vsi_nn_relu_keras_param
{
float alpha;
@ -33,5 +37,8 @@ typedef struct _vsi_nn_relu_keras_param
float threshold;
} vsi_nn_relu_keras_param;
#ifdef __cplusplus
}
#endif
#endif

View File

@ -26,6 +26,10 @@
#include "vsi_nn_types.h"
#ifdef __cplusplus
extern "C" {
#endif
#define _VSI_NN_RELU_KERAS_INTERNAL_LOCAL_TENSOR_NUM 2
typedef struct _vsi_nn_relu_keras_internal_lcl_data
@ -44,5 +48,8 @@ typedef struct _vsi_nn_relu_keras_internal_param
float threshold;
} vsi_nn_relu_keras_internal_param;
#ifdef __cplusplus
}
#endif
#endif

View File

@ -37,7 +37,7 @@ typedef struct _vsi_nn_reshape_lcl_data
typedef struct _vsi_nn_reshape_param
{
const vsi_size_t * size;
const uint32_t * size;
uint32_t dim_num;
/* reshape layer local data structure */

View File

@ -0,0 +1,53 @@
/****************************************************************************
*
* Copyright (c) 2020 Vivante Corporation
*
* Permission is hereby granted, free of charge, to any person obtaining a
* copy of this software and associated documentation files (the "Software"),
* to deal in the Software without restriction, including without limitation
* the rights to use, copy, modify, merge, publish, distribute, sublicense,
* and/or sell copies of the Software, and to permit persons to whom the
* Software is furnished to do so, subject to the following conditions:
*
* The above copyright notice and this permission notice shall be included in
* all copies or substantial portions of the Software.
*
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
* AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
* DEALINGS IN THE SOFTWARE.
*
*****************************************************************************/
#ifndef _VSI_NN_OP_RESHAPE2_H
#define _VSI_NN_OP_RESHAPE2_H
#include "vsi_nn_types.h"
#ifdef __cplusplus
extern "C" {
#endif
typedef struct _vsi_nn_reshape2_local_data
{
vsi_bool initialized;
} vsi_nn_reshape2_local_data;
typedef struct _vsi_nn_reshape2_param
{
vsi_nn_reshape2_local_data* local;
// Add parameters here
const vsi_size_t * size;
uint32_t dim_num;
} vsi_nn_reshape2_param;
_compiler_assert(offsetof(vsi_nn_reshape2_param, local) == 0, \
vsi_nn_reshape2_h );
#ifdef __cplusplus
}
#endif
#endif

View File

@ -41,10 +41,15 @@ typedef uint32_t vsi_nn_interpolation_type_t; enum
VSI_NN_INTERPOLATION_AREA
};
typedef struct _vsi_nn_resize_lcl_data
typedef uint32_t vsi_nn_resize_layout_type_t; enum
{
vx_tensor local_tensor[_VSI_NN_RESIZE_LOCAL_TENSOR_NUM];
} vsi_nn_resize_lcl_data;
VSI_NN_RESIZE_LAYOUT_NCHW = 0,
VSI_NN_RESIZE_LAYOUT_NHWC
};
typedef struct _vsi_nn_resize_local_data {
vsi_bool use_internal_node;
} vsi_nn_resize_local_data;
typedef struct _vsi_nn_resize_param
{
@ -53,9 +58,16 @@ typedef struct _vsi_nn_resize_param
int32_t size[2];
/* resize layer local data structure */
vsi_nn_resize_lcl_data local;
union
{
vsi_nn_resize_local_data *lcl_data;
struct {
vx_tensor local_tensor[_VSI_NN_RESIZE_LOCAL_TENSOR_NUM];
} reserved;
};
vsi_bool align_corners;
vsi_bool half_pixel_centers;
vsi_enum layout;
} vsi_nn_resize_param;
#ifdef __cplusplus

View File

@ -27,6 +27,10 @@
#include "vsi_nn_types.h"
#ifdef __cplusplus
extern "C" {
#endif
typedef struct _vsi_nn_resize_1d_param
{
struct _resize_1d_local_data_t* local;
@ -40,5 +44,8 @@ typedef struct _vsi_nn_resize_1d_param
_compiler_assert(offsetof(vsi_nn_resize_1d_param, local) == 0, \
vsi_nn_resize_1d_h );
#ifdef __cplusplus
}
#endif
#endif

View File

@ -27,6 +27,10 @@
#include "vsi_nn_types.h"
#ifdef __cplusplus
extern "C" {
#endif
typedef struct _vsi_nn_resize_1d_bilinear_internal_param
{
struct _resize_1d_bilinear_internal_local_data_t* local;
@ -38,5 +42,8 @@ typedef struct _vsi_nn_resize_1d_bilinear_internal_param
_compiler_assert(offsetof(vsi_nn_resize_1d_bilinear_internal_param, local) == 0, \
vsi_nn_resize_1d_bilinear_internal_h );
#ifdef __cplusplus
}
#endif
#endif

View File

@ -27,6 +27,10 @@
#include "vsi_nn_types.h"
#ifdef __cplusplus
extern "C" {
#endif
typedef struct _vsi_nn_resize_1d_nearest_internal_param
{
struct _resize_1d_nearest_internal_local_data_t* local;
@ -38,5 +42,8 @@ typedef struct _vsi_nn_resize_1d_nearest_internal_param
_compiler_assert(offsetof(vsi_nn_resize_1d_nearest_internal_param, local) == 0, \
vsi_nn_resize_1d_nearest_internal_h );
#ifdef __cplusplus
}
#endif
#endif

View File

@ -26,6 +26,9 @@
#include "vsi_nn_types.h"
#ifdef __cplusplus
extern "C" {
#endif
typedef struct _vsi_nn_resize_in_lcl_data
{
@ -38,8 +41,12 @@ typedef struct _vsi_nn_resize_internal_param
vsi_nn_resize_in_lcl_data *lcl_data_ptr;
vsi_bool align_corners;
vsi_bool half_pixel_centers;
float factor;
float factor;
vsi_enum layout;
} vsi_nn_resize_internal_param;
#ifdef __cplusplus
}
#endif
#endif

View File

@ -26,6 +26,10 @@
#include "vsi_nn_types.h"
#ifdef __cplusplus
extern "C" {
#endif
typedef struct _vsi_nn_resize_nearest_in_lcl_data
{
uint32_t hash_idx;
@ -40,6 +44,8 @@ typedef struct _vsi_nn_resize_nearest_internal_param
float factor;
} vsi_nn_resize_nearest_internal_param;
#ifdef __cplusplus
}
#endif
#endif

View File

@ -27,6 +27,10 @@
#include "vsi_nn_types.h"
#include "vsi_nn_op_rnn.h"
#ifdef __cplusplus
extern "C" {
#endif
typedef struct _vsi_nn_rnncell_ovxlib_lcl_data_t
{
vsi_bool multi_batch;
@ -40,5 +44,8 @@ typedef struct _vsi_nn_rnncell_ovxlib_param
vsi_nn_dtype_t* internal_dtype;
} vsi_nn_rnncell_ovxlib_param;
#ifdef __cplusplus
}
#endif
#endif

View File

@ -26,6 +26,10 @@
#include "vsi_nn_types.h"
#ifdef __cplusplus
extern "C" {
#endif
typedef struct _vsi_nn_roi_align_param
{
int32_t output_height;
@ -36,5 +40,8 @@ typedef struct _vsi_nn_roi_align_param
int32_t width_sample_num;
} vsi_nn_roi_align_param;
#ifdef __cplusplus
}
#endif
#endif

View File

@ -26,6 +26,9 @@
#include "vsi_nn_types.h"
#ifdef __cplusplus
extern "C" {
#endif
#define _VSI_NN_SIN_LOCAL_TENSOR_NUM 2
@ -42,5 +45,8 @@ typedef struct _vsi_nn_sin_param
vsi_nn_sin_lcl_data local;
} vsi_nn_sin_param;
#ifdef __cplusplus
}
#endif
#endif

View File

@ -28,6 +28,10 @@
#include "vsi_nn_platform.h"
#include "utils/vsi_nn_link_list.h"
#ifdef __cplusplus
extern "C" {
#endif
typedef struct _vsi_nn_softmax_internal_lcl_data
{
vsi_nn_link_list_t link_list;
@ -40,7 +44,11 @@ typedef struct _vsi_nn_softmax_internal_param
{
vsi_nn_softmax_internal_lcl_data *data;
float beta;
int32_t axis;
} vsi_nn_softmax_internal_param;
#ifdef __cplusplus
}
#endif
#endif

View File

@ -27,6 +27,10 @@
#include "vsi_nn_types.h"
#ifdef __cplusplus
extern "C" {
#endif
typedef struct _vsi_nn_squeeze_param
{
// Add parameters here
@ -34,5 +38,8 @@ typedef struct _vsi_nn_squeeze_param
vx_uint32 axis_num;
} vsi_nn_squeeze_param;
#ifdef __cplusplus
}
#endif
#endif

View File

@ -25,6 +25,11 @@
#define _VSI_NN_OP_STACK_H
#include "vsi_nn_types.h"
#ifdef __cplusplus
extern "C" {
#endif
#define VSI_NN_STACK_MAX_INPUTS (16)
typedef struct _vsi_nn_stack_lcl_data
@ -63,5 +68,8 @@ typedef struct _vsi_nn_stack_param
uint32_t axis;
} vsi_nn_stack_param;
#ifdef __cplusplus
}
#endif
#endif

View File

@ -26,6 +26,10 @@
#include "vsi_nn_types.h"
#ifdef __cplusplus
extern "C" {
#endif
#define _VSI_NN_TENSORADD_MEANSTDNORM_LOCAL_TENSOR_NUM 3
typedef struct _vsi_nn_tensoradd_meanstdnorm_lcl_data
@ -39,5 +43,8 @@ typedef struct _vsi_nn_tensor_add_mean_stddev_norm_param
float eps;
} vsi_nn_tensor_add_mean_stddev_norm_param;
#ifdef __cplusplus
}
#endif
#endif

View File

@ -26,6 +26,10 @@
#include "vsi_nn_types.h"
#ifdef __cplusplus
extern "C" {
#endif
#define _VSI_NN_TILE_LOCAL_TENSOR_NUM 2
typedef struct _vsi_nn_tile_lcl_data_t
@ -43,5 +47,8 @@ typedef struct _vsi_nn_tile_param
uint32_t multiples_num;
} vsi_nn_tile_param;
#ifdef __cplusplus
}
#endif
#endif

View File

@ -26,10 +26,17 @@
#include "vsi_nn_types.h"
#ifdef __cplusplus
extern "C" {
#endif
typedef struct _vsi_nn_topk_param
{
uint32_t k;
} vsi_nn_topk_param;
#ifdef __cplusplus
}
#endif
#endif

View File

@ -27,6 +27,10 @@
#include "vsi_nn_types.h"
#include "vsi_nn_op_rnn.h"
#ifdef __cplusplus
extern "C" {
#endif
/* enum for inputs/outputs */
enum
{
@ -48,5 +52,8 @@ typedef struct _vsi_nn_unidirectional_sequence_rnn_param
vsi_nn_dtype_t internal_dtype[RNNCELL_QUANTIZE_PARAM_COUNT];
} vsi_nn_unidirectional_sequence_rnn_param;
#ifdef __cplusplus
}
#endif
#endif

View File

@ -25,6 +25,11 @@
#define _VSI_NN_OP_UNSTACK_H
#include "vsi_nn_types.h"
#ifdef __cplusplus
extern "C" {
#endif
#define VSI_NN_UNSTACK_MAX_OUTPUTS (16)
typedef struct _vsi_nn_unstack_lcl_data
@ -39,5 +44,8 @@ typedef struct _vsi_nn_unstack_param
uint32_t axis;
} vsi_nn_unstack_param;
#ifdef __cplusplus
}
#endif
#endif

View File

@ -27,6 +27,10 @@
#include "vsi_nn_types.h"
#ifdef __cplusplus
extern "C" {
#endif
typedef struct _vsi_nn_upsamplescale_param
{
struct _upsamplescale_local_data_t* local;
@ -35,5 +39,8 @@ typedef struct _vsi_nn_upsamplescale_param
float scale;
} vsi_nn_upsamplescale_param;
#ifdef __cplusplus
}
#endif
#endif

View File

@ -50,7 +50,9 @@ enum {
D_F32 = VSI_NN_TYPE_FLOAT32,
D_F64 = VSI_NN_TYPE_FLOAT64,
D_BF16 = VSI_NN_TYPE_BFLOAT16,
D_BOOL8 = VSI_NN_TYPE_BOOL8
D_BOOL8 = VSI_NN_TYPE_BOOL8,
D_I4 = VSI_NN_TYPE_INT4,
D_U4 = VSI_NN_TYPE_UINT4
};
/* short alias for qtype */

View File

@ -72,6 +72,16 @@ OVXLIB_API uint32_t vsi_nn_TypeGetBytes
const vsi_nn_type_e type
);
OVXLIB_API uint32_t vsi_nn_TypeGetBytesExt
(
const vsi_nn_type_e type
);
OVXLIB_API uint32_t vsi_nn_TypeGetBits
(
const vsi_nn_type_e type
);
OVXLIB_API uint16_t vsi_nn_Fp32ToFp16
(
float in

Some files were not shown because too many files have changed in this diff Show More