diff --git a/prebuilt-sdk/x86_64_linux/VERSION b/prebuilt-sdk/x86_64_linux/VERSION
index effb0ed..40da7fc 100644
--- a/prebuilt-sdk/x86_64_linux/VERSION
+++ b/prebuilt-sdk/x86_64_linux/VERSION
@@ -1 +1 @@
-REL/6.4.6
+REL/6.4.8
diff --git a/prebuilt-sdk/x86_64_linux/include/CL/cl_viv_vx_ext.h b/prebuilt-sdk/x86_64_linux/include/CL/cl_viv_vx_ext.h
index f5e2df1..02286d8 100644
--- a/prebuilt-sdk/x86_64_linux/include/CL/cl_viv_vx_ext.h
+++ b/prebuilt-sdk/x86_64_linux/include/CL/cl_viv_vx_ext.h
@@ -300,9 +300,13 @@ enum VXC_OP {
     VXC_OP_dp4x8_b,
     VXC_OP_dp2x16_b,
     VXC_OP_img_load,
+    VXC_OP_img_read,
     VXC_OP_img_load_3d,
+    VXC_OP_img_read_3d,
     VXC_OP_img_store,
+    VXC_OP_img_write,
     VXC_OP_img_store_3d,
+    VXC_OP_img_write_3d,
     VXC_OP_vload2,
     VXC_OP_vload3,
     VXC_OP_vload4,
@@ -534,8 +538,8 @@ enum eVXC_ERROR
  * offset should be composed by using VXC_5BITOFFSET_XY(x, y)
  * Coord must be type of int4 or float4 
  */
-#define VXC_ReadImage3D(Dest, Image, Coord, Offset, Info)       VXC_OP4(img_load_3d, Dest, Image, Coord, Offset, Info)
-#define VXC_WriteImage3D(Image, Coord, Color, Info)             VXC_OP4_NoDest(img_store_3d, Image, Coord, Color, Info)
+#define VXC_ReadImage3D(Dest, Image, Coord, Offset, Info)       VXC_OP4(img_read_3d, Dest, Image, Coord, Offset, Info)
+#define VXC_WriteImage3D(Image, Coord, Color, Info)             VXC_OP4_NoDest(img_write_3d, Image, Coord, Color, Info)
 
 #define VXC_Vload2(Dest, Pointer, Offset)    do { int byteOffset = ((int)sizeof((Dest)))*(Offset); VXC_OP2(vload2, Dest, Pointer, byteOffset); } while(0)
 #define VXC_Vload4(Dest, Pointer, Offset)    do { int byteOffset = ((int)sizeof((Dest)))*(Offset); VXC_OP2(vload4, Dest, Pointer,  byteOffset); } while(0)
diff --git a/prebuilt-sdk/x86_64_linux/include/VX/vx_kernels.h b/prebuilt-sdk/x86_64_linux/include/VX/vx_kernels.h
index 6e8c9a0..6c3671e 100644
--- a/prebuilt-sdk/x86_64_linux/include/VX/vx_kernels.h
+++ b/prebuilt-sdk/x86_64_linux/include/VX/vx_kernels.h
@@ -494,6 +494,8 @@ enum vx_kernel_e {
 
     VX_KERNEL_NN_CONVOLUTION_RELU_POOLING_MULTIPLY_LAYER2 = VX_KERNEL_BASE(VX_ID_VIVANTE, VX_LIBRARY_KHR_BASE) + 0x2E,
 
+    VX_KERNEL_NN_BATCH_GEMM = VX_KERNEL_BASE(VX_ID_VIVANTE, VX_LIBRARY_KHR_BASE) + 0x2F,
+
     VX_KERNEL_MAX_1_2, /*!< \internal Used for VX1.2 bounds checking in the conformance test. */
 };
 
diff --git a/prebuilt-sdk/x86_64_linux/include/VX/vx_khr_compatible.h b/prebuilt-sdk/x86_64_linux/include/VX/vx_khr_compatible.h
index e682779..e3baa23 100644
--- a/prebuilt-sdk/x86_64_linux/include/VX/vx_khr_compatible.h
+++ b/prebuilt-sdk/x86_64_linux/include/VX/vx_khr_compatible.h
@@ -112,4 +112,12 @@ VX_CREATE_TENSOR_SUPPORT_PHYSICAL is used to declare that openvx can support phy
 */
 #define VX_GRAPH_PREEMPTION_SUPPORT 1
 
+/*
+VX_BATCH_GEMM_API_SUPPORT is used to declare that the VSI OpenVX driver supports the vxBatchGemmNode API, which transforms GEMM to convolution
+ [value]
+ 0: not supported
+ 1: supported
+*/
+#define VX_BATCH_GEMM_API_SUPPORT 1
+
 #endif /* __VX_KHR_COMPATIBLE_H__ */
diff --git a/prebuilt-sdk/x86_64_linux/include/VX/vx_khr_nn_internal.h b/prebuilt-sdk/x86_64_linux/include/VX/vx_khr_nn_internal.h
index 71c2932..41e1653 100644
--- a/prebuilt-sdk/x86_64_linux/include/VX/vx_khr_nn_internal.h
+++ b/prebuilt-sdk/x86_64_linux/include/VX/vx_khr_nn_internal.h
@@ -216,6 +216,7 @@ typedef struct _vx_nn_convolution_relu_pooling_params_ext4_t
     vx_uint32       poolingPadRight;
     vx_uint32       poolingPadTop;
     vx_uint32       poolingPadBottom;
+    vx_bool         enable_nn_tensor_add_relu;  /*!< \brief  Enable Relu function after tensor add. */
 } vx_nn_convolution_relu_pooling_params_ext4_t, * vx_nn_convolution_relu_pooling_params_ext4;
 
 /*! \brief [Graph] Creates a Convolutional Network Convolution and Activation(Relu) and Pooling Layer Node, this fucntion match kronos NN Extension 1.2 verion.
@@ -816,7 +817,8 @@ VX_API_ENTRY vx_node VX_API_CALL vxL2NormalizeLayer(vx_graph graph, vx_tensor in
  * \param [in] weights_biases [static] Point to WeightBiasesParameter data, vx_weights_biases_parameter is an opaque reference. 
  * \param [in] convolution_relu_pooling_params [static] Pointer to parameters of type <tt>\ref vx_nn_convolution_relu_pooling_params_t</tt>
  * \param [in] size_of_convolution_relu_pooling_params [static] Size in bytes of convolution_relu_pooling_params.
- * \param [out] outputs_conv The convolution output tensor data. Output will have the same number and structure of dimensions as inputs_conv. 
+ * \param [in] outputs_conv The convolution output tensor data. Output will have the same number and structure of dimensions as inputs_conv.
+ * This tensor is used only to provide format information of the convolution output data to hardware; the convolution output data is not actually returned.
  * \param [out] outputs_add The final add output tensor data. Output will have the same number and structure of dimensions as input. 
  * \return <tt> vx_node</tt>.
  * \returns A node reference <tt>\ref vx_node</tt>. Any possible errors preventing a
@@ -863,7 +865,8 @@ VX_API_ENTRY vx_node VX_API_CALL vxConvolutionReluPoolingAddLayer2(
  * \param [in] weights_biases [static] Point to WeightBiasesParameter data, vx_weights_biases_parameter is an opaque reference. 
  * \param [in] convolution_relu_pooling_params [static] Pointer to parameters of type <tt>\ref vx_nn_convolution_relu_pooling_params_t</tt>
  * \param [in] size_of_convolution_relu_pooling_params [static] Size in bytes of convolution_relu_pooling_params.
- * \param [out] outputs_conv The convolution output tensor data. Output will have the same number and structure of dimensions as inputs_conv. 
+ * \param [in] outputs_conv The convolution output tensor data. Output will have the same number and structure of dimensions as inputs_conv.
+ * This tensor is used only to provide format information of the convolution output data to hardware; the convolution output data is not actually returned.
  * \param [out] outputs_mul The final mul output tensor data. Output will have the same number and structure of dimensions as input. 
  * \return <tt> vx_node</tt>.
  * \returns A node reference <tt>\ref vx_node</tt>. Any possible errors preventing a
diff --git a/prebuilt-sdk/x86_64_linux/include/VX/vx_nodes.h b/prebuilt-sdk/x86_64_linux/include/VX/vx_nodes.h
index 3bfb7f2..bf513b5 100644
--- a/prebuilt-sdk/x86_64_linux/include/VX/vx_nodes.h
+++ b/prebuilt-sdk/x86_64_linux/include/VX/vx_nodes.h
@@ -940,6 +940,29 @@ VX_API_ENTRY vx_node VX_API_CALL vxTensorMatrixMultiplyNode(vx_graph graph, vx_t
  */
 VX_API_ENTRY vx_node VX_API_CALL vxCopyNode(vx_graph graph, vx_reference input, vx_reference output);
 
+/*! \brief Creates a batch GEMM node. The calculation formula is output = matrix_a * matrix_b + matrix_c.
+ * \param [in] graph The reference to the graph.
+ * \param [in] matrix_a The first input tensor.
+ * \param [in] matrix_b The second input tensor. Must be in the same data type and batch count as first input tensor.
+ * \param [in] matrix_c The third input tensor. Must be in the same data type and batch count as first input tensor. [optional]
+ * \param [in] trans_a If true, the matrix_a has been transposed before calculation.
+ * \param [in] trans_b If true, the matrix_b has been transposed before calculation.
+ * \param [in] trans_c If true, the matrix_c has been transposed before calculation. [optional]
+ * \param [out] output The output tensor. The output dimensions must agree with the formula in the description.
+ * \return <tt>\ref vx_node</tt>.
+ * \retval vx_node A node reference. Any possible errors preventing a successful creation
+ * should be checked using <tt>\ref vxGetStatus</tt>
+ * \ingroup group_vision_function_gemm
+ */
+VX_API_ENTRY vx_node VX_API_CALL vxBatchGemmNode(vx_graph graph,
+                                                 vx_tensor matrix_a,
+                                                 vx_tensor matrix_b,
+                                                 vx_tensor matrix_c,
+                                                 vx_scalar trans_a,
+                                                 vx_scalar trans_b,
+                                                 vx_scalar trans_c,
+                                                 vx_tensor output);
+
 #ifdef __cplusplus
 }
 #endif
diff --git a/prebuilt-sdk/x86_64_linux/include/VX/vx_types.h b/prebuilt-sdk/x86_64_linux/include/VX/vx_types.h
index cbff50c..0881c15 100644
--- a/prebuilt-sdk/x86_64_linux/include/VX/vx_types.h
+++ b/prebuilt-sdk/x86_64_linux/include/VX/vx_types.h
@@ -1288,6 +1288,8 @@ enum vx_channel_e {
     VX_CHANNEL_U = VX_ENUM_BASE(VX_ID_KHRONOS, VX_ENUM_CHANNEL) + 0x15,
     /*! \brief Use to extract the Cr/V/Value channel, no matter the byte or packing order. */
     VX_CHANNEL_V = VX_ENUM_BASE(VX_ID_KHRONOS, VX_ENUM_CHANNEL) + 0x16,
+
+    VX_CHANNEL_UV = VX_ENUM_BASE(VX_ID_VIVANTE, VX_ENUM_CHANNEL) + 0x0,
 };
 
 /*! \brief An enumeration of memory import types.
diff --git a/prebuilt-sdk/x86_64_linux/lib/libArchModelSw.so b/prebuilt-sdk/x86_64_linux/lib/libArchModelSw.so
index 2780651..575b344 100755
Binary files a/prebuilt-sdk/x86_64_linux/lib/libArchModelSw.so and b/prebuilt-sdk/x86_64_linux/lib/libArchModelSw.so differ
diff --git a/prebuilt-sdk/x86_64_linux/lib/libCLC.so b/prebuilt-sdk/x86_64_linux/lib/libCLC.so
index 032cf6f..ff87c25 100755
Binary files a/prebuilt-sdk/x86_64_linux/lib/libCLC.so and b/prebuilt-sdk/x86_64_linux/lib/libCLC.so differ
diff --git a/prebuilt-sdk/x86_64_linux/lib/libEmulator.so b/prebuilt-sdk/x86_64_linux/lib/libEmulator.so
index 783dccf..dbd7197 100755
Binary files a/prebuilt-sdk/x86_64_linux/lib/libEmulator.so and b/prebuilt-sdk/x86_64_linux/lib/libEmulator.so differ
diff --git a/prebuilt-sdk/x86_64_linux/lib/libGAL.so b/prebuilt-sdk/x86_64_linux/lib/libGAL.so
index b1d825f..0439666 100755
Binary files a/prebuilt-sdk/x86_64_linux/lib/libGAL.so and b/prebuilt-sdk/x86_64_linux/lib/libGAL.so differ
diff --git a/prebuilt-sdk/x86_64_linux/lib/libNNArchPerf.so b/prebuilt-sdk/x86_64_linux/lib/libNNArchPerf.so
index 6e87b6e..9a4e15c 100755
Binary files a/prebuilt-sdk/x86_64_linux/lib/libNNArchPerf.so and b/prebuilt-sdk/x86_64_linux/lib/libNNArchPerf.so differ
diff --git a/prebuilt-sdk/x86_64_linux/lib/libOpenVX.so.1.3.0 b/prebuilt-sdk/x86_64_linux/lib/libOpenVX.so.1.3.0
index 622ef8f..99ec9c8 100755
Binary files a/prebuilt-sdk/x86_64_linux/lib/libOpenVX.so.1.3.0 and b/prebuilt-sdk/x86_64_linux/lib/libOpenVX.so.1.3.0 differ
diff --git a/prebuilt-sdk/x86_64_linux/lib/libOpenVXC.so b/prebuilt-sdk/x86_64_linux/lib/libOpenVXC.so
index 64d2d68..44e37de 100755
Binary files a/prebuilt-sdk/x86_64_linux/lib/libOpenVXC.so and b/prebuilt-sdk/x86_64_linux/lib/libOpenVXC.so differ
diff --git a/prebuilt-sdk/x86_64_linux/lib/libVSC.so b/prebuilt-sdk/x86_64_linux/lib/libVSC.so
index c398988..07646f8 100755
Binary files a/prebuilt-sdk/x86_64_linux/lib/libVSC.so and b/prebuilt-sdk/x86_64_linux/lib/libVSC.so differ