From 3c59694025387fda56474e882f3f55e6fc5d46b4 Mon Sep 17 00:00:00 2001 From: Kainan Cha Date: Wed, 23 Jun 2021 15:26:25 +0800 Subject: [PATCH] Update internal to 1.1.32 SHA: 9aa0b0f Signed-off-by: Kainan Cha --- src/tim/vx/internal/BUILD | 11 +- src/tim/vx/internal/CMakeLists.txt | 2 - src/tim/vx/internal/include/interface/ops.def | 9 + .../internal/include/kernel/vsi_nn_kernel.h | 6 + .../{client => libnnext}/vsi_nn_vxkernel.h | 0 .../internal/include/ops/vsi_nn_op_conv1d.h | 20 +- .../include/ops/vsi_nn_op_grouped_conv1d.h | 55 + .../include/ops/vsi_nn_op_groupnormalize.h | 53 + .../internal/include/ops/vsi_nn_op_moments.h | 6 +- .../{vsi_nn_post.h => ops/vsi_nn_op_nms.h} | 18 +- .../internal/include/ops/vsi_nn_op_one_hot.h | 42 + .../vx/internal/include/ops/vsi_nn_op_pool.h | 12 +- .../internal/include/ops/vsi_nn_op_repeat.h | 54 + .../include/ops/vsi_nn_op_sequence_mask.h | 43 + .../include/ops/vsi_nn_op_strided_slice.h | 19 + .../internal/include/ops/vsi_nn_op_upsample.h | 2 +- .../include/utils/vsi_nn_constraint_check.h | 2 +- .../include/utils/vsi_nn_dtype_util_prv.h | 1 + src/tim/vx/internal/include/vsi_nn_graph.h | 12 + .../vx/internal/include/vsi_nn_node_type.h | 12 + src/tim/vx/internal/include/vsi_nn_version.h | 2 +- .../ops/kernel/cl/custom_softmax.cl} | 4 +- .../ops/kernel/cpu/custom_softmax_cpu.c | 194 + .../evis/custom_softmax.vx} | 7 +- .../ops/kernel/evis/custom_softmax_evis.c | 202 + .../ops/kernel/vsi_nn_kernel_custom_softmax.c | 231 - .../src/custom/ops/op_custom_softmax.c | 102 + .../src/custom/ops/vsi_nn_op_custom_softmax.c | 299 - src/tim/vx/internal/src/kernel/cl/argmax_cl.c | 21 +- src/tim/vx/internal/src/kernel/cl/argmin_cl.c | 20 +- src/tim/vx/internal/src/kernel/cl/cast_cl.c | 2 +- .../internal/src/kernel/cl/comparisons_cl.c | 6 + .../internal/src/kernel/cl/eltwise_unary_cl.c | 13 +- src/tim/vx/internal/src/kernel/cl/erf_cl.c | 328 + .../vx/internal/src/kernel/cl/floordiv_cl.c | 5 +- .../src/kernel/cl/group_normalization_cl.c | 760 ++ .../vx/internal/src/kernel/cl/moments_cl.c | 88 +- .../vx/internal/src/kernel/cl/one_hot_cl.c | 332 + .../src/kernel/cl/reducemax_internal_cl.c | 8 + src/tim/vx/internal/src/kernel/cl/repeat_cl.c | 407 + .../internal/src/kernel/cl/sequence_mask_cl.c | 354 + src/tim/vx/internal/src/kernel/cl/slice_cl.c | 308 + .../vx/internal/src/kernel/cpu/argmax_cpu.c | 2 +- .../vx/internal/src/kernel/cpu/argmin_cpu.c | 2 +- .../cpu/axis_aligned_bbox_transform_cpu.c | 279 + .../src/kernel/cpu/batchnorm_single_cpu.c | 2 +- .../internal/src/kernel/cpu/comparisons_cpu.c | 6 +- .../src/kernel/cpu/conv1d_ovxlib_cpu.c | 264 + .../src/kernel/cpu/depth2space_internal_cpu.c | 2 +- .../src/kernel/cpu/eltwise_unary_cpu.c | 14 +- src/tim/vx/internal/src/kernel/cpu/erf_cpu.c | 229 + .../vx/internal/src/kernel/cpu/gather_cpu.c | 2 +- .../internal/src/kernel/cpu/gather_nd_cpu.c | 2 +- .../src/kernel/cpu/group_normalization_cpu.c | 315 + .../kernel/cpu/instance_normalization_cpu.c | 2 +- .../src/kernel/cpu/layer_normalization_cpu.c | 6 +- .../internal/src/kernel/cpu/log_softmax_cpu.c | 2 +- .../internal/src/kernel/cpu/matrixmul_cpu.c | 2 +- .../vx/internal/src/kernel/cpu/maximum_cpu.c | 2 +- .../vx/internal/src/kernel/cpu/minimum_cpu.c | 2 +- .../vx/internal/src/kernel/cpu/moments_cpu.c | 2 +- src/tim/vx/internal/src/kernel/cpu/nms_cpu.c | 441 + .../vx/internal/src/kernel/cpu/one_hot_cpu.c | 252 + src/tim/vx/internal/src/kernel/cpu/pow_cpu.c | 2 +- .../src/kernel/cpu/pre_process_bgra_cpu.c | 2 +- .../src/kernel/cpu/pre_process_gray_cpu.c | 2 +- 
.../src/kernel/cpu/pre_process_nv12_cpu.c | 2 +- .../src/kernel/cpu/pre_process_rgb_cpu.c | 2 +- .../src/kernel/cpu/pre_process_yuv420_cpu.c | 2 +- .../src/kernel/cpu/pre_process_yuv444_cpu.c | 2 +- .../vx/internal/src/kernel/cpu/prelu_cpu.c | 2 +- .../src/kernel/cpu/random_multinomial_cpu.c | 2 +- .../vx/internal/src/kernel/cpu/repeat_cpu.c | 286 + .../internal/src/kernel/cpu/scatter_nd_cpu.c | 2 +- .../src/kernel/cpu/sequence_mask_cpu.c | 248 + .../vx/internal/src/kernel/cpu/slice_cpu.c | 246 + .../src/kernel/cpu/space2depth_internal_cpu.c | 2 +- src/tim/vx/internal/src/kernel/cpu/tile_cpu.c | 2 +- src/tim/vx/internal/src/kernel/cpu/topk_cpu.c | 297 + .../src/kernel/evis/batchnorm_single_evis.c | 152 +- .../vx/internal/src/kernel/evis/cast_evis.c | 1 + .../src/kernel/evis/comparisons_evis.c | 1 + .../src/kernel/evis/conv1d_ovxlib_evis.c | 702 ++ .../kernel/evis/depth2space_internal_evis.c | 166 +- .../src/kernel/evis/depthwise_conv1d_evis.c | 24 +- .../src/kernel/evis/eltwise_unary_evis.c | 29 + .../vx/internal/src/kernel/evis/erf_evis.c | 428 + .../vx/internal/src/kernel/evis/gather_evis.c | 109 +- .../internal/src/kernel/evis/gather_nd_evis.c | 9 + .../kernel/evis/group_normalization_evis.c | 1219 +++ .../kernel/evis/instance_normalization_evis.c | 355 +- .../kernel/evis/layer_normalization_evis.c | 189 +- .../internal/src/kernel/evis/maximum_evis.c | 2 +- .../internal/src/kernel/evis/minimum_evis.c | 2 +- .../internal/src/kernel/evis/one_hot_evis.c | 460 + .../src/kernel/evis/pre_process_rgb_evis.c | 85 +- .../src/kernel/evis/pre_process_yuv420_evis.c | 11 +- .../src/kernel/evis/pre_process_yuv444_evis.c | 11 +- .../vx/internal/src/kernel/evis/repeat_evis.c | 609 ++ .../src/kernel/evis/resize_bilinear_evis.c | 232 +- .../src/kernel/evis/sequence_mask_evis.c | 393 + .../vx/internal/src/kernel/evis/slice_evis.c | 451 + .../vx/internal/src/kernel/evis/tile_evis.c | 96 +- .../internal/src/kernel/vsi_nn_kernel_param.c | 41 + .../src/kernel/vsi_nn_kernel_selector.c | 16 + src/tim/vx/internal/src/kernel/vx/erf_vx.c | 216 + .../src/libnnext/ops/cl/eltwise_ops_helper.cl | 56 + .../src/libnnext/ops/cl/eltwise_unary.cl | 10 +- .../vx/internal/src/libnnext/ops/cl/erf.cl | 113 + .../internal/src/libnnext/ops/cl/floordiv.cl | 84 + .../ops/cl/group_normalization_f32.cl | 248 + .../ops/cl/group_normalization_i32.cl | 278 + .../libnnext/ops/cl/group_normalization_u8.cl | 287 + .../internal/src/libnnext/ops/cl/matrixmul.cl | 4 +- .../internal/src/libnnext/ops/cl/one_hot.cl | 130 + .../vx/internal/src/libnnext/ops/cl/repeat.cl | 176 + .../src/libnnext/ops/cl/sequence_mask.cl | 72 + .../vx/internal/src/libnnext/ops/cl/slice.cl | 144 + ...si_nn_kernel_axis_aligned_bbox_transform.c | 275 - .../kernel/vsi_nn_kernel_box_with_nms_limit.c | 2 +- .../ops/kernel/vsi_nn_kernel_extra_ending.c | 2 +- .../kernel/vsi_nn_kernel_generate_proposals.c | 483 - .../vsi_nn_kernel_heatmap_max_keypoint.c | 2 +- .../ops/kernel/vsi_nn_kernel_imageprocess.c | 2 +- .../ops/kernel/vsi_nn_kernel_signalframe.c | 2 +- .../vsi_nn_kernel_spatial_transformer.c | 2 +- .../ops/kernel/vsi_nn_kernel_sync_host.c | 2 +- .../kernel/vsi_nn_kernel_tensorstackconcat.c | 2 +- .../libnnext/ops/kernel/vsi_nn_kernel_topk.c | 266 - .../src/libnnext/ops/vx/batchnorm_single.vx | 8 +- .../libnnext/ops/vx/batchnorm_single_f32.vx | 267 + .../src/libnnext/ops/vx/conv1d_ovxlib.vx | 151 + .../libnnext/ops/vx/conv1d_ovxlib_k1024.vx | 167 + .../src/libnnext/ops/vx/depth2space_crd.vx | 242 +- .../src/libnnext/ops/vx/eltwise_unary_2d.vx | 19 +- 
.../src/libnnext/ops/vx/eltwise_unary_3d.vx | 20 +- .../vx/internal/src/libnnext/ops/vx/erf.vx | 174 + .../src/libnnext/ops/vx/gather_array.vx | 157 + .../internal/src/libnnext/ops/vx/gather_nd.vx | 20 +- .../src/libnnext/ops/vx/gather_nd_2d.vx | 20 +- .../src/libnnext/ops/vx/gather_nd_2d_mix.vx | 10 +- .../src/libnnext/ops/vx/gather_nd_3d.vx | 22 +- .../src/libnnext/ops/vx/gather_nd_3d_mix.vx | 10 +- .../src/libnnext/ops/vx/gather_nd_mix.vx | 10 +- .../ops/vx/group_normalization_f16.vx | 306 + .../ops/vx/group_normalization_i16.vx | 339 + .../libnnext/ops/vx/group_normalization_i8.vx | 317 + .../libnnext/ops/vx/group_normalization_u8.vx | 261 + .../ops/vx/group_normalization_u8_f16.vx | 114 + .../ops/vx/instance_normalization_f16.vx | 65 +- .../ops/vx/instance_normalization_i16.vx | 72 +- .../ops/vx/instance_normalization_i8.vx | 234 +- .../vx/instance_normalization_scale_f32.vx | 285 + .../instance_normalization_scale_f32_bf16.vx | 253 + .../instance_normalization_scale_f32_f16.vx | 143 + .../ops/vx/instance_normalization_u8.vx | 287 +- .../ops/vx/instance_normalization_u8_f16.vx | 147 + .../ops/vx/layer_normalization_scale_f32.vx | 275 + .../vx/layer_normalization_scale_f32_2d.vx | 237 + .../vx/layer_normalization_scale_f32_bf16.vx | 159 + .../internal/src/libnnext/ops/vx/one_hot.vx | 205 + .../libnnext/ops/vx/pre_process_rgb_copy.vx | 65 +- .../ops/vx/pre_process_yuv420_copy_u8.vx | 108 +- .../ops/vx/pre_process_yuv444_copy_u8.vx | 121 +- .../vx/internal/src/libnnext/ops/vx/repeat.vx | 224 + .../src/libnnext/ops/vx/repeat_axis1.vx | 232 + .../ops/vx/resize_bilinear_U8_UP_2X.vx | 65 - .../resize_bilinear_U8_half_pixel_centers.vx | 229 + .../src/libnnext/ops/vx/sequence_mask.vx | 150 + .../vx/internal/src/libnnext/ops/vx/slice.vx | 239 + .../vx/internal/src/libnnext/ops/vx/tile.vx | 37 + ...i_nn_kernel_axis_aligned_bbox_transform.vx | 8 - .../vx/vsi_nn_kernel_generate_proposals.vx | 8 - .../libnnext/ops/vx/vsi_nn_kernel_header.vx | 56 + .../src/libnnext/vsi_nn_libnnext_resource.c | 8145 +++++++++++++++-- .../{client => libnnext}/vsi_nn_vxkernel.c | 5 +- src/tim/vx/internal/src/makefile.linux | 16 +- .../vx/internal/src/ops/vsi_nn_op_argmaxmin.c | 1 + .../vsi_nn_op_axis_aligned_bbox_transform.c | 192 +- .../internal/src/ops/vsi_nn_op_batch_norm.c | 202 +- .../src/ops/vsi_nn_op_batchnorm_single.c | 26 +- .../vsi_nn_op_bidirectional_sequence_lstm.c | 10 +- .../vsi_nn_op_bidirectional_sequence_rnn.c | 2 +- .../src/ops/vsi_nn_op_box_with_nms_limit.c | 2 +- src/tim/vx/internal/src/ops/vsi_nn_op_cast.c | 99 +- src/tim/vx/internal/src/ops/vsi_nn_op_ceil.c | 110 + src/tim/vx/internal/src/ops/vsi_nn_op_clip.c | 3 +- .../vx/internal/src/ops/vsi_nn_op_conv1d.c | 183 +- .../vx/internal/src/ops/vsi_nn_op_conv2d.c | 1 + src/tim/vx/internal/src/ops/vsi_nn_op_crop.c | 2 +- .../internal/src/ops/vsi_nn_op_dataconvert.c | 131 +- .../src/ops/vsi_nn_op_deconvolution.c | 10 +- .../src/ops/vsi_nn_op_deconvolution1d.c | 28 +- .../vx/internal/src/ops/vsi_nn_op_dropout.c | 2 +- .../vx/internal/src/ops/vsi_nn_op_eltwise.c | 114 +- .../src/ops/vsi_nn_op_eltwise_unary.c | 1 + src/tim/vx/internal/src/ops/vsi_nn_op_erf.c | 128 + .../internal/src/ops/vsi_nn_op_extra_ending.c | 2 +- .../vx/internal/src/ops/vsi_nn_op_floordiv.c | 6 +- .../internal/src/ops/vsi_nn_op_fullconnect.c | 7 +- .../internal/src/ops/vsi_nn_op_fullconnect2.c | 2 +- .../vx/internal/src/ops/vsi_nn_op_gather.c | 14 +- .../vx/internal/src/ops/vsi_nn_op_gather_nd.c | 9 +- .../src/ops/vsi_nn_op_generate_proposals.c | 205 +- 
.../src/ops/vsi_nn_op_grouped_conv1d.c | 207 + .../src/ops/vsi_nn_op_grouped_conv2d.c | 8 +- .../src/ops/vsi_nn_op_groupnormalize.c | 297 + .../internal/src/ops/vsi_nn_op_gru_ovxlib.c | 13 +- .../src/ops/vsi_nn_op_grucell_ovxlib.c | 93 +- .../src/ops/vsi_nn_op_heatmap_max_keypoint.c | 2 +- .../internal/src/ops/vsi_nn_op_imageprocess.c | 2 +- .../src/ops/vsi_nn_op_instancenormalize.c | 9 +- .../vx/internal/src/ops/vsi_nn_op_interp.c | 1 + .../src/ops/vsi_nn_op_l2normalizescale.c | 133 +- .../src/ops/vsi_nn_op_layernormalize.c | 27 +- .../internal/src/ops/vsi_nn_op_lstm_ovxlib.c | 2 +- .../src/ops/vsi_nn_op_lstmunit_activation.c | 8 +- .../src/ops/vsi_nn_op_lstmunit_ovxlib.c | 8 +- .../vx/internal/src/ops/vsi_nn_op_matrixmul.c | 3 + .../vx/internal/src/ops/vsi_nn_op_moments.c | 118 +- src/tim/vx/internal/src/ops/vsi_nn_op_nms.c | 136 + .../vx/internal/src/ops/vsi_nn_op_one_hot.c | 176 + src/tim/vx/internal/src/ops/vsi_nn_op_pool.c | 276 +- .../src/ops/vsi_nn_op_poolwithargmax.c | 14 +- .../internal/src/ops/vsi_nn_op_post_process.c | 2 +- .../internal/src/ops/vsi_nn_op_pre_process.c | 2 +- .../src/ops/vsi_nn_op_pre_process_bgra.c | 3 +- .../src/ops/vsi_nn_op_pre_process_gray.c | 6 +- .../src/ops/vsi_nn_op_pre_process_nv12.c | 4 + .../src/ops/vsi_nn_op_pre_process_rgb.c | 6 +- .../src/ops/vsi_nn_op_pre_process_tensor.c | 2 +- .../src/ops/vsi_nn_op_pre_process_yuv420.c | 4 + .../src/ops/vsi_nn_op_pre_process_yuv444.c | 4 + src/tim/vx/internal/src/ops/vsi_nn_op_prelu.c | 2 +- .../src/ops/vsi_nn_op_quantized_16bit_lstm.c | 2 +- .../vx/internal/src/ops/vsi_nn_op_reduce.c | 2 +- .../src/ops/vsi_nn_op_reduce_internal.c | 2 + .../src/ops/vsi_nn_op_reducesum_internal.c | 2 +- .../src/ops/vsi_nn_op_relational_ops.c | 15 + .../internal/src/ops/vsi_nn_op_relu_keras.c | 2 +- .../src/ops/vsi_nn_op_relu_keras_internal.c | 2 +- .../vx/internal/src/ops/vsi_nn_op_repeat.c | 340 + .../vx/internal/src/ops/vsi_nn_op_resize.c | 2 +- .../src/ops/vsi_nn_op_resize_internal.c | 2 +- .../ops/vsi_nn_op_resize_nearest_internal.c | 2 +- .../vx/internal/src/ops/vsi_nn_op_reverse.c | 336 +- .../src/ops/vsi_nn_op_rnncell_ovxlib.c | 2 +- .../vx/internal/src/ops/vsi_nn_op_roi_align.c | 2 +- src/tim/vx/internal/src/ops/vsi_nn_op_scale.c | 2 +- .../src/ops/vsi_nn_op_sequence_mask.c | 176 + .../src/ops/vsi_nn_op_shufflechannel.c | 2 +- .../internal/src/ops/vsi_nn_op_signalframe.c | 2 +- src/tim/vx/internal/src/ops/vsi_nn_op_slice.c | 107 +- .../src/ops/vsi_nn_op_softmax_internal.c | 2 +- .../internal/src/ops/vsi_nn_op_space2depth.c | 2 +- .../src/ops/vsi_nn_op_spatial_transformer.c | 2 +- src/tim/vx/internal/src/ops/vsi_nn_op_split.c | 28 +- src/tim/vx/internal/src/ops/vsi_nn_op_stack.c | 2 +- .../src/ops/vsi_nn_op_strided_slice.c | 236 +- src/tim/vx/internal/src/ops/vsi_nn_op_svdf.c | 18 +- .../vx/internal/src/ops/vsi_nn_op_sync_host.c | 2 +- .../vsi_nn_op_tensor_add_mean_stddev_norm.c | 2 +- .../src/ops/vsi_nn_op_tensorstackconcat.c | 2 +- src/tim/vx/internal/src/ops/vsi_nn_op_topk.c | 203 +- .../vsi_nn_op_unidirectional_sequence_rnn.c | 2 +- .../vx/internal/src/ops/vsi_nn_op_unstack.c | 2 +- .../vx/internal/src/ops/vsi_nn_op_upsample.c | 2 +- .../src/utils/vsi_nn_code_generator.c | 9 + .../src/utils/vsi_nn_constraint_check.c | 19 +- src/tim/vx/internal/src/utils/vsi_nn_dtype.c | 3 + src/tim/vx/internal/src/vsi_nn_graph.c | 155 +- .../internal/src/vsi_nn_graph_optimization.c | 21 +- .../vx/internal/src/vsi_nn_internal_node.c | 3 +- src/tim/vx/internal/src/vsi_nn_node.c | 2 +- .../internal/src/vsi_nn_node_attr_template.c | 3 + 
.../vx/internal/src/vsi_nn_pre_post_process.c | 32 +- src/tim/vx/internal/src/vsi_nn_tensor.c | 55 +- 277 files changed, 30752 insertions(+), 4475 deletions(-) rename src/tim/vx/internal/include/{client => libnnext}/vsi_nn_vxkernel.h (100%) create mode 100644 src/tim/vx/internal/include/ops/vsi_nn_op_grouped_conv1d.h create mode 100644 src/tim/vx/internal/include/ops/vsi_nn_op_groupnormalize.h rename src/tim/vx/internal/include/{vsi_nn_post.h => ops/vsi_nn_op_nms.h} (84%) create mode 100644 src/tim/vx/internal/include/ops/vsi_nn_op_one_hot.h create mode 100644 src/tim/vx/internal/include/ops/vsi_nn_op_repeat.h create mode 100644 src/tim/vx/internal/include/ops/vsi_nn_op_sequence_mask.h rename src/tim/vx/internal/src/{libnnext/ops/vx/vsi_nn_kernel_topk.vx => custom/ops/kernel/cl/custom_softmax.cl} (63%) create mode 100644 src/tim/vx/internal/src/custom/ops/kernel/cpu/custom_softmax_cpu.c rename src/tim/vx/internal/src/custom/ops/{vx/vsi_nn_kernel_custom_softmax.vx => kernel/evis/custom_softmax.vx} (87%) create mode 100644 src/tim/vx/internal/src/custom/ops/kernel/evis/custom_softmax_evis.c delete mode 100644 src/tim/vx/internal/src/custom/ops/kernel/vsi_nn_kernel_custom_softmax.c create mode 100644 src/tim/vx/internal/src/custom/ops/op_custom_softmax.c delete mode 100644 src/tim/vx/internal/src/custom/ops/vsi_nn_op_custom_softmax.c create mode 100644 src/tim/vx/internal/src/kernel/cl/erf_cl.c create mode 100644 src/tim/vx/internal/src/kernel/cl/group_normalization_cl.c create mode 100644 src/tim/vx/internal/src/kernel/cl/one_hot_cl.c create mode 100644 src/tim/vx/internal/src/kernel/cl/repeat_cl.c create mode 100644 src/tim/vx/internal/src/kernel/cl/sequence_mask_cl.c create mode 100644 src/tim/vx/internal/src/kernel/cl/slice_cl.c create mode 100644 src/tim/vx/internal/src/kernel/cpu/axis_aligned_bbox_transform_cpu.c create mode 100644 src/tim/vx/internal/src/kernel/cpu/conv1d_ovxlib_cpu.c create mode 100644 src/tim/vx/internal/src/kernel/cpu/erf_cpu.c create mode 100644 src/tim/vx/internal/src/kernel/cpu/group_normalization_cpu.c create mode 100644 src/tim/vx/internal/src/kernel/cpu/nms_cpu.c create mode 100644 src/tim/vx/internal/src/kernel/cpu/one_hot_cpu.c create mode 100644 src/tim/vx/internal/src/kernel/cpu/repeat_cpu.c create mode 100644 src/tim/vx/internal/src/kernel/cpu/sequence_mask_cpu.c create mode 100644 src/tim/vx/internal/src/kernel/cpu/slice_cpu.c create mode 100644 src/tim/vx/internal/src/kernel/cpu/topk_cpu.c create mode 100644 src/tim/vx/internal/src/kernel/evis/conv1d_ovxlib_evis.c create mode 100644 src/tim/vx/internal/src/kernel/evis/erf_evis.c create mode 100644 src/tim/vx/internal/src/kernel/evis/group_normalization_evis.c create mode 100644 src/tim/vx/internal/src/kernel/evis/one_hot_evis.c create mode 100644 src/tim/vx/internal/src/kernel/evis/repeat_evis.c create mode 100644 src/tim/vx/internal/src/kernel/evis/sequence_mask_evis.c create mode 100644 src/tim/vx/internal/src/kernel/evis/slice_evis.c create mode 100644 src/tim/vx/internal/src/kernel/vx/erf_vx.c create mode 100644 src/tim/vx/internal/src/libnnext/ops/cl/erf.cl create mode 100644 src/tim/vx/internal/src/libnnext/ops/cl/group_normalization_f32.cl create mode 100644 src/tim/vx/internal/src/libnnext/ops/cl/group_normalization_i32.cl create mode 100644 src/tim/vx/internal/src/libnnext/ops/cl/group_normalization_u8.cl create mode 100644 src/tim/vx/internal/src/libnnext/ops/cl/one_hot.cl create mode 100644 src/tim/vx/internal/src/libnnext/ops/cl/repeat.cl create mode 100644 
src/tim/vx/internal/src/libnnext/ops/cl/sequence_mask.cl create mode 100644 src/tim/vx/internal/src/libnnext/ops/cl/slice.cl delete mode 100644 src/tim/vx/internal/src/libnnext/ops/kernel/vsi_nn_kernel_axis_aligned_bbox_transform.c delete mode 100644 src/tim/vx/internal/src/libnnext/ops/kernel/vsi_nn_kernel_generate_proposals.c delete mode 100644 src/tim/vx/internal/src/libnnext/ops/kernel/vsi_nn_kernel_topk.c create mode 100644 src/tim/vx/internal/src/libnnext/ops/vx/batchnorm_single_f32.vx create mode 100644 src/tim/vx/internal/src/libnnext/ops/vx/conv1d_ovxlib.vx create mode 100644 src/tim/vx/internal/src/libnnext/ops/vx/conv1d_ovxlib_k1024.vx create mode 100644 src/tim/vx/internal/src/libnnext/ops/vx/erf.vx create mode 100644 src/tim/vx/internal/src/libnnext/ops/vx/gather_array.vx create mode 100644 src/tim/vx/internal/src/libnnext/ops/vx/group_normalization_f16.vx create mode 100644 src/tim/vx/internal/src/libnnext/ops/vx/group_normalization_i16.vx create mode 100644 src/tim/vx/internal/src/libnnext/ops/vx/group_normalization_i8.vx create mode 100644 src/tim/vx/internal/src/libnnext/ops/vx/group_normalization_u8.vx create mode 100644 src/tim/vx/internal/src/libnnext/ops/vx/group_normalization_u8_f16.vx create mode 100644 src/tim/vx/internal/src/libnnext/ops/vx/instance_normalization_scale_f32.vx create mode 100644 src/tim/vx/internal/src/libnnext/ops/vx/instance_normalization_scale_f32_bf16.vx create mode 100644 src/tim/vx/internal/src/libnnext/ops/vx/instance_normalization_scale_f32_f16.vx create mode 100644 src/tim/vx/internal/src/libnnext/ops/vx/instance_normalization_u8_f16.vx create mode 100644 src/tim/vx/internal/src/libnnext/ops/vx/layer_normalization_scale_f32.vx create mode 100644 src/tim/vx/internal/src/libnnext/ops/vx/layer_normalization_scale_f32_2d.vx create mode 100644 src/tim/vx/internal/src/libnnext/ops/vx/layer_normalization_scale_f32_bf16.vx create mode 100644 src/tim/vx/internal/src/libnnext/ops/vx/one_hot.vx create mode 100644 src/tim/vx/internal/src/libnnext/ops/vx/repeat.vx create mode 100644 src/tim/vx/internal/src/libnnext/ops/vx/repeat_axis1.vx delete mode 100644 src/tim/vx/internal/src/libnnext/ops/vx/resize_bilinear_U8_UP_2X.vx create mode 100644 src/tim/vx/internal/src/libnnext/ops/vx/resize_bilinear_U8_half_pixel_centers.vx create mode 100644 src/tim/vx/internal/src/libnnext/ops/vx/sequence_mask.vx create mode 100644 src/tim/vx/internal/src/libnnext/ops/vx/slice.vx delete mode 100644 src/tim/vx/internal/src/libnnext/ops/vx/vsi_nn_kernel_axis_aligned_bbox_transform.vx delete mode 100644 src/tim/vx/internal/src/libnnext/ops/vx/vsi_nn_kernel_generate_proposals.vx rename src/tim/vx/internal/src/{client => libnnext}/vsi_nn_vxkernel.c (98%) create mode 100644 src/tim/vx/internal/src/ops/vsi_nn_op_ceil.c create mode 100644 src/tim/vx/internal/src/ops/vsi_nn_op_erf.c create mode 100644 src/tim/vx/internal/src/ops/vsi_nn_op_grouped_conv1d.c create mode 100644 src/tim/vx/internal/src/ops/vsi_nn_op_groupnormalize.c create mode 100644 src/tim/vx/internal/src/ops/vsi_nn_op_nms.c create mode 100644 src/tim/vx/internal/src/ops/vsi_nn_op_one_hot.c create mode 100644 src/tim/vx/internal/src/ops/vsi_nn_op_repeat.c create mode 100644 src/tim/vx/internal/src/ops/vsi_nn_op_sequence_mask.c diff --git a/src/tim/vx/internal/BUILD b/src/tim/vx/internal/BUILD index 1803e76..97e4591 100644 --- a/src/tim/vx/internal/BUILD +++ b/src/tim/vx/internal/BUILD @@ -69,7 +69,9 @@ filegroup( name = "custom_srcs", srcs = glob([ "src/custom/ops/*.c", - "src/custom/ops/kernel/*.c", + 
"src/custom/ops/kernel/evis/*.c", + "src/custom/ops/kernel/cl/*.c", + "src/custom/ops/kernel/cpu/*.c", ]) ) @@ -128,7 +130,6 @@ cc_library( "include/quantization/vsi_nn_asymmetric_affine.h", "include/quantization/vsi_nn_dynamic_fixed_point.h", "include/quantization/vsi_nn_perchannel_symmetric_affine.h", - "include/client/vsi_nn_vxkernel.h", "include/interface/ops.def", "include/kernel/vsi_nn_kernel.h", "include/kernel/vsi_nn_gpu.h", @@ -139,6 +140,7 @@ cc_library( "include/vsi_nn_error.h", # libnnext + "include/libnnext/vsi_nn_vxkernel.h", "include/libnnext/vx_lib_nnext.h", "include/libnnext/vsi_nn_libnnext_resource.h", @@ -167,7 +169,6 @@ cc_library( "src/vsi_nn_daemon.c", "src/vsi_nn_graph_optimization.c", "src/vsi_nn_pre_post_process.c", - "src/client/vsi_nn_vxkernel.c", "src/utils/vsi_nn_link_list.c", "src/utils/vsi_nn_util.c", "src/utils/vsi_nn_math.c", @@ -200,12 +201,10 @@ cc_library( "src/libnnext/ops/kernel/vsi_nn_kernel_sync_host.c", "src/libnnext/ops/kernel/vsi_nn_kernel_spatial_transformer.c", "src/libnnext/ops/kernel/vsi_nn_kernel_extra_ending.c", - "src/libnnext/ops/kernel/vsi_nn_kernel_topk.c", "src/libnnext/ops/kernel/vsi_nn_kernel_heatmap_max_keypoint.c", - "src/libnnext/ops/kernel/vsi_nn_kernel_axis_aligned_bbox_transform.c", "src/libnnext/ops/kernel/vsi_nn_kernel_box_with_nms_limit.c", - "src/libnnext/ops/kernel/vsi_nn_kernel_generate_proposals.c", "src/libnnext/vsi_nn_libnnext_resource.c", + "src/libnnext/vsi_nn_vxkernel.c", ] + [":kernel_srcs"] + [":operation_srcs"] + [":custom_srcs"], diff --git a/src/tim/vx/internal/CMakeLists.txt b/src/tim/vx/internal/CMakeLists.txt index f09bb9a..ee9fc3f 100644 --- a/src/tim/vx/internal/CMakeLists.txt +++ b/src/tim/vx/internal/CMakeLists.txt @@ -12,7 +12,6 @@ aux_source_directory(src/kernel/cpu INTERNAL_KERNEL_CPU) aux_source_directory(src/kernel/evis INTERNAL_KERNEL_EVIS) aux_source_directory(src/kernel/vx INTERNAL_KERNEL_VX) aux_source_directory(src/ops INTERNAL_OPS) -aux_source_directory(src/client INTERNAL_CLIENT) aux_source_directory(src/libnnext INTERNAL_LIBNNEXT) aux_source_directory(src/libnnext/ops/kernel INTERNAL_LIBNNEXT_OPS_KERNEL) aux_source_directory(src/quantization INTERNAL_QUANTIZATION) @@ -29,7 +28,6 @@ list(APPEND SRC ${INTERNAL_KERNEL_EVIS} ${INTERNAL_KERNEL_VX} ${INTERNAL_OPS} - ${INTERNAL_CLIENT} ${INTERNAL_LIBNNEXT} ${INTERNAL_LIBNNEXT_OPS_KERNEL} ${INTERNAL_QUANTIZATION} diff --git a/src/tim/vx/internal/include/interface/ops.def b/src/tim/vx/internal/include/interface/ops.def index 523f299..6315513 100644 --- a/src/tim/vx/internal/include/interface/ops.def +++ b/src/tim/vx/internal/include/interface/ops.def @@ -147,3 +147,12 @@ DEF_OP(DECONVOLUTION1D) DEF_OP(INTERP) DEF_OP(RESIZE_1D) DEF_OP(UPSAMPLESCALE) +DEF_OP(GROUP_NORM) +DEF_OP(ROUND) +DEF_OP(CEIL) +DEF_OP(SEQUENCE_MASK) +DEF_OP(REPEAT) +DEF_OP(ERF) +DEF_OP(ONE_HOT) +DEF_OP(NMS) +DEF_OP(GROUPED_CONV1D) diff --git a/src/tim/vx/internal/include/kernel/vsi_nn_kernel.h b/src/tim/vx/internal/include/kernel/vsi_nn_kernel.h index c5c8b2c..9d89a4a 100644 --- a/src/tim/vx/internal/include/kernel/vsi_nn_kernel.h +++ b/src/tim/vx/internal/include/kernel/vsi_nn_kernel.h @@ -244,6 +244,12 @@ vsi_bool vsi_nn_kernel_param_add_buffer void * vsi_nn_kernel_param_get_buffer ( const vsi_nn_kernel_param_t * params, const char * key, size_t * size); +vsi_bool vsi_nn_kernel_param_add_const_buffer + ( vsi_nn_kernel_param_t * params, const char * key, const void * buf, size_t size); + +const void * vsi_nn_kernel_param_get_const_buffer + ( const vsi_nn_kernel_param_t * params, 
const char * key, size_t * size); + /** Kernel register */ #define REGISTER_KERNEL_BACKEND(kernel_name, kernel_type, func) \ _INITIALIZER(_register_kernel_##kernel_name##_##kernel_type) \ diff --git a/src/tim/vx/internal/include/client/vsi_nn_vxkernel.h b/src/tim/vx/internal/include/libnnext/vsi_nn_vxkernel.h similarity index 100% rename from src/tim/vx/internal/include/client/vsi_nn_vxkernel.h rename to src/tim/vx/internal/include/libnnext/vsi_nn_vxkernel.h diff --git a/src/tim/vx/internal/include/ops/vsi_nn_op_conv1d.h b/src/tim/vx/internal/include/ops/vsi_nn_op_conv1d.h index 9d216ff..5fa5041 100644 --- a/src/tim/vx/internal/include/ops/vsi_nn_op_conv1d.h +++ b/src/tim/vx/internal/include/ops/vsi_nn_op_conv1d.h @@ -30,17 +30,19 @@ extern "C" { #endif -typedef struct _vsi_nn_conv1d_lcl_data_t -{ - vx_tensor input_tensor; - vx_tensor weight_tensor; - vx_tensor output_tensor; -} vsi_nn_conv1d_lcl_data_t; - typedef struct _vsi_nn_conv1d_param { /* local data must be the first. */ - vsi_nn_conv1d_lcl_data_t local; + union + { + struct _conv1d_local_data_t *local; + + struct { + vx_tensor input_tensor; + vx_tensor weight_tensor; + vx_tensor output_tensor; + } reserved; + }; uint32_t ksize; uint32_t stride; @@ -53,6 +55,8 @@ typedef struct _vsi_nn_conv1d_param uint32_t dilation; int32_t multiplier; } vsi_nn_conv1d_param; +_compiler_assert(offsetof(vsi_nn_conv1d_param, local) == 0, \ + vsi_nn_vsi_nn_conv1d_h ); #ifdef __cplusplus } diff --git a/src/tim/vx/internal/include/ops/vsi_nn_op_grouped_conv1d.h b/src/tim/vx/internal/include/ops/vsi_nn_op_grouped_conv1d.h new file mode 100644 index 0000000..f9470ee --- /dev/null +++ b/src/tim/vx/internal/include/ops/vsi_nn_op_grouped_conv1d.h @@ -0,0 +1,55 @@ +/**************************************************************************** +* +* Copyright (c) 2020 Vivante Corporation +* +* Permission is hereby granted, free of charge, to any person obtaining a +* copy of this software and associated documentation files (the "Software"), +* to deal in the Software without restriction, including without limitation +* the rights to use, copy, modify, merge, publish, distribute, sublicense, +* and/or sell copies of the Software, and to permit persons to whom the +* Software is furnished to do so, subject to the following conditions: +* +* The above copyright notice and this permission notice shall be included in +* all copies or substantial portions of the Software. +* +* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +* AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING +* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER +* DEALINGS IN THE SOFTWARE. 
+* +*****************************************************************************/ + +#ifndef _VSI_NN_OP_GROUPED_CONV1D_H +#define _VSI_NN_OP_GROUPED_CONV1D_H + +#include "vsi_nn_types.h" + +typedef struct _grouped_conv1d_local_data_t { + vsi_nn_tensor_t* input; + vsi_nn_tensor_t* weight; + vsi_nn_tensor_t* output; + +} grouped_conv1d_local_data_t; + +typedef struct _vsi_nn_grouped_conv1d_param +{ + grouped_conv1d_local_data_t *local; + + uint32_t ksize; + uint32_t stride; + /* Pad left, right, top, bottom */ + uint32_t pad[2]; + /* Pad type default value shall be AUTO */ + vsi_nn_pad_e pad_type; + uint32_t weights; + uint32_t group; + uint32_t dilation; + int32_t multiplier; +} vsi_nn_grouped_conv1d_param; + + +#endif + diff --git a/src/tim/vx/internal/include/ops/vsi_nn_op_groupnormalize.h b/src/tim/vx/internal/include/ops/vsi_nn_op_groupnormalize.h new file mode 100644 index 0000000..417a4cf --- /dev/null +++ b/src/tim/vx/internal/include/ops/vsi_nn_op_groupnormalize.h @@ -0,0 +1,53 @@ +/**************************************************************************** +* +* Copyright (c) 2020 Vivante Corporation +* +* Permission is hereby granted, free of charge, to any person obtaining a +* copy of this software and associated documentation files (the "Software"), +* to deal in the Software without restriction, including without limitation +* the rights to use, copy, modify, merge, publish, distribute, sublicense, +* and/or sell copies of the Software, and to permit persons to whom the +* Software is furnished to do so, subject to the following conditions: +* +* The above copyright notice and this permission notice shall be included in +* all copies or substantial portions of the Software. +* +* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +* AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING +* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER +* DEALINGS IN THE SOFTWARE. +* +*****************************************************************************/ +#ifndef _VSI_NN_OP_CLIENT_GROUPNORMALIZE_H +#define _VSI_NN_OP_CLIENT_GROUPNORMALIZE_H + +#include "vsi_nn_types.h" +#include "vsi_nn_platform.h" + +#ifdef __cplusplus +extern "C" { +#endif + +typedef struct _vsi_nn_groupnorm_lcl_data +{ + /* handle 3D group norm */ + vsi_nn_tensor_t *reshaped_input; + vsi_nn_tensor_t *reshaped_output; +} vsi_nn_groupnorm_lcl_data; + +typedef struct _vsi_nn_groupnormalize_param +{ + /* local data must be the first. 
*/ + vsi_nn_groupnorm_lcl_data* lcl_data; + float eps; + int32_t group_num; +} vsi_nn_groupnormalize_param; + +#ifdef __cplusplus +} +#endif + +#endif diff --git a/src/tim/vx/internal/include/ops/vsi_nn_op_moments.h b/src/tim/vx/internal/include/ops/vsi_nn_op_moments.h index c9f39ed..fd6427a 100644 --- a/src/tim/vx/internal/include/ops/vsi_nn_op_moments.h +++ b/src/tim/vx/internal/include/ops/vsi_nn_op_moments.h @@ -32,9 +32,9 @@ extern "C" { typedef struct _vsi_nn_moments_param { - int32_t* axis; - int32_t axis_num; - vsi_bool keep_dim; + const int32_t* axis; + int32_t axis_num; + vsi_bool keep_dim; } vsi_nn_moments_param; #ifdef __cplusplus diff --git a/src/tim/vx/internal/include/vsi_nn_post.h b/src/tim/vx/internal/include/ops/vsi_nn_op_nms.h similarity index 84% rename from src/tim/vx/internal/include/vsi_nn_post.h rename to src/tim/vx/internal/include/ops/vsi_nn_op_nms.h index 61fe75f..174bb10 100644 --- a/src/tim/vx/internal/include/vsi_nn_post.h +++ b/src/tim/vx/internal/include/ops/vsi_nn_op_nms.h @@ -21,10 +21,18 @@ * DEALINGS IN THE SOFTWARE. * *****************************************************************************/ -#ifndef _VSI_NN_POST_H -#define _VSI_NN_POST_H -#include "post/vsi_nn_post_fasterrcnn.h" -#include "post/vsi_nn_post_cmupose.h" +#ifndef _VSI_NN_OP_NMS_H +#define _VSI_NN_OP_NMS_H -#endif \ No newline at end of file +#include "vsi_nn_types.h" + +typedef struct _vsi_nn_nms_param +{ + int32_t max_output_size; + float iou_threshold; + float score_threshold; + float soft_nms_sigma; +} vsi_nn_nms_param; + +#endif diff --git a/src/tim/vx/internal/include/ops/vsi_nn_op_one_hot.h b/src/tim/vx/internal/include/ops/vsi_nn_op_one_hot.h new file mode 100644 index 0000000..5cad574 --- /dev/null +++ b/src/tim/vx/internal/include/ops/vsi_nn_op_one_hot.h @@ -0,0 +1,42 @@ +/**************************************************************************** +* +* Copyright (c) 2020 Vivante Corporation +* +* Permission is hereby granted, free of charge, to any person obtaining a +* copy of this software and associated documentation files (the "Software"), +* to deal in the Software without restriction, including without limitation +* the rights to use, copy, modify, merge, publish, distribute, sublicense, +* and/or sell copies of the Software, and to permit persons to whom the +* Software is furnished to do so, subject to the following conditions: +* +* The above copyright notice and this permission notice shall be included in +* all copies or substantial portions of the Software. +* +* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +* AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING +* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER +* DEALINGS IN THE SOFTWARE. 
+* +*****************************************************************************/ + +#ifndef _VSI_NN_OP_ONE_HOT_H +#define _VSI_NN_OP_ONE_HOT_H + +#include "vsi_nn_types.h" + +typedef struct _vsi_nn_one_hot_param +{ + struct _one_hot_local_data_t* local; + + int32_t depth; + float on_value; + float off_value; + int32_t axis; +} vsi_nn_one_hot_param; +_compiler_assert(offsetof(vsi_nn_one_hot_param, local) == 0, \ + vsi_nn_one_hot_h ); + +#endif diff --git a/src/tim/vx/internal/include/ops/vsi_nn_op_pool.h b/src/tim/vx/internal/include/ops/vsi_nn_op_pool.h index 979d22c..ee32df3 100644 --- a/src/tim/vx/internal/include/ops/vsi_nn_op_pool.h +++ b/src/tim/vx/internal/include/ops/vsi_nn_op_pool.h @@ -30,12 +30,12 @@ extern "C" { #endif -#define _VSI_NN_POOLWITHARGMAX_LOCAL_TENSOR_NUM 3 - -typedef struct _vsi_nn_poolwithargmax_lcl_data +typedef struct _vsi_nn_pool_lcl_data { - vx_tensor local_tensor[_VSI_NN_POOLWITHARGMAX_LOCAL_TENSOR_NUM]; -} vsi_nn_poolwithargmax_lcl_data; + /* handle pool1d */ + vsi_nn_tensor_t *reshaped_input; + vsi_nn_tensor_t *reshaped_output; +} vsi_nn_pool_lcl_data; typedef struct _vsi_nn_pool_param { @@ -49,7 +49,7 @@ typedef struct _vsi_nn_pool_param /* Pad type default value shall be AUTO */ vsi_nn_pad_e pad_type; /* poolwithargmax layer local data structure */ - vsi_nn_poolwithargmax_lcl_data local; + vsi_nn_pool_lcl_data *local; } vsi_nn_pool_param; #ifdef __cplusplus diff --git a/src/tim/vx/internal/include/ops/vsi_nn_op_repeat.h b/src/tim/vx/internal/include/ops/vsi_nn_op_repeat.h new file mode 100644 index 0000000..973570e --- /dev/null +++ b/src/tim/vx/internal/include/ops/vsi_nn_op_repeat.h @@ -0,0 +1,54 @@ +/**************************************************************************** +* +* Copyright (c) 2020 Vivante Corporation +* +* Permission is hereby granted, free of charge, to any person obtaining a +* copy of this software and associated documentation files (the "Software"), +* to deal in the Software without restriction, including without limitation +* the rights to use, copy, modify, merge, publish, distribute, sublicense, +* and/or sell copies of the Software, and to permit persons to whom the +* Software is furnished to do so, subject to the following conditions: +* +* The above copyright notice and this permission notice shall be included in +* all copies or substantial portions of the Software. +* +* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +* AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING +* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER +* DEALINGS IN THE SOFTWARE. 
+* +*****************************************************************************/ +#ifndef _VSI_NN_OP_REPEAT_H +#define _VSI_NN_OP_REPEAT_H + +#include "vsi_nn_types.h" + +#ifdef __cplusplus +extern "C" { +#endif + +typedef struct _vsi_nn_repeat_lcl_data +{ + vsi_nn_tensor_t *repeat_tensor; + vsi_nn_tensor_t *reshaped_input; + vsi_nn_tensor_t *reshaped_output; +} vsi_nn_repeat_lcl_data; + +typedef struct _vsi_nn_repeat__param +{ + vsi_nn_repeat_lcl_data* local; + int32_t axis; + int32_t maxlen; // default max repeat number + int32_t* repeat_host; // host repeat array + int32_t repeat_len; // length of host repeat array +} vsi_nn_repeat_param; + +#ifdef __cplusplus +} +#endif + +#endif + diff --git a/src/tim/vx/internal/include/ops/vsi_nn_op_sequence_mask.h b/src/tim/vx/internal/include/ops/vsi_nn_op_sequence_mask.h new file mode 100644 index 0000000..e8bb2f1 --- /dev/null +++ b/src/tim/vx/internal/include/ops/vsi_nn_op_sequence_mask.h @@ -0,0 +1,43 @@ +/**************************************************************************** +* +* Copyright (c) 2020 Vivante Corporation +* +* Permission is hereby granted, free of charge, to any person obtaining a +* copy of this software and associated documentation files (the "Software"), +* to deal in the Software without restriction, including without limitation +* the rights to use, copy, modify, merge, publish, distribute, sublicense, +* and/or sell copies of the Software, and to permit persons to whom the +* Software is furnished to do so, subject to the following conditions: +* +* The above copyright notice and this permission notice shall be included in +* all copies or substantial portions of the Software. +* +* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +* AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING +* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER +* DEALINGS IN THE SOFTWARE. 
+* +*****************************************************************************/ +#ifndef _VSI_NN_OP_SEQUENCE_MASK_H +#define _VSI_NN_OP_SEQUENCE_MASK_H + +#include "vsi_nn_types.h" + +#ifdef __cplusplus +extern "C" { +#endif + +typedef struct _vsi_nn_sequence_mask__param +{ + int32_t maxlen; +} vsi_nn_sequence_mask_param; + +#ifdef __cplusplus +} +#endif + +#endif + diff --git a/src/tim/vx/internal/include/ops/vsi_nn_op_strided_slice.h b/src/tim/vx/internal/include/ops/vsi_nn_op_strided_slice.h index ec5b6c9..d7bb3c7 100644 --- a/src/tim/vx/internal/include/ops/vsi_nn_op_strided_slice.h +++ b/src/tim/vx/internal/include/ops/vsi_nn_op_strided_slice.h @@ -32,6 +32,22 @@ extern "C" { #endif +typedef struct _strided_slice_param +{ + int32_t *begin_dims; + int32_t begin_dims_num; + int32_t *end_dims; + int32_t end_dims_num; + int32_t *stride_dims; + int32_t stride_dims_num; + int32_t begin_mask; + int32_t end_mask; + int32_t shrink_axis_mask; + int32_t new_axis_mask; + + int32_t num_add_axis; +} strided_slice_param; + typedef struct _vsi_nn_strided_slice_lcl_data2 { vsi_nn_link_list_t link_list; @@ -55,6 +71,8 @@ typedef struct _vsi_nn_strided_slice_lcl_data2 vsi_bool is_dataconvert_op; vsi_bool is_optimized; + + strided_slice_param params; } vsi_nn_strided_slice_lcl_data2; typedef struct _vsi_nn_strided_slice_lcl_data_t @@ -78,6 +96,7 @@ typedef struct _vsi_nn_strided_slice_param vx_int32 begin_mask; vx_int32 end_mask; vx_int32 shrink_axis_mask; + int32_t new_axis_mask; vsi_nn_strided_slice_lcl_data2 * lcl2_data; } vsi_nn_strided_slice_param; diff --git a/src/tim/vx/internal/include/ops/vsi_nn_op_upsample.h b/src/tim/vx/internal/include/ops/vsi_nn_op_upsample.h index 112f633..ef191bf 100644 --- a/src/tim/vx/internal/include/ops/vsi_nn_op_upsample.h +++ b/src/tim/vx/internal/include/ops/vsi_nn_op_upsample.h @@ -34,7 +34,7 @@ extern "C" { typedef struct _vsi_nn_upsample_lcl_data { - vx_tensor local_tensor[_VSI_NN_POOLWITHARGMAX_LOCAL_TENSOR_NUM]; + vx_tensor local_tensor[_VSI_NN_UPSAMPLE_LOCAL_TENSOR_NUM]; } vsi_nn_upsample_lcl_data; typedef struct _vsi_nn_upsample_param diff --git a/src/tim/vx/internal/include/utils/vsi_nn_constraint_check.h b/src/tim/vx/internal/include/utils/vsi_nn_constraint_check.h index 3bb7c5d..a491adc 100644 --- a/src/tim/vx/internal/include/utils/vsi_nn_constraint_check.h +++ b/src/tim/vx/internal/include/utils/vsi_nn_constraint_check.h @@ -119,7 +119,7 @@ vsi_bool is_item_in_array enum { NAME##_INPUT_COUNT = INPUT_COUNT, \ NAME##_OUTPUT_COUNT = OUTPUT_COUNT, \ NAME##_IO_COUNT = NAME##_INPUT_COUNT + NAME##_OUTPUT_COUNT}; \ -static const struct {vsi_nn_type_e types[NAME##_IO_COUNT];} \ +static const struct {int types[NAME##_IO_COUNT];} \ NAME##_supported_io_types[] = { #define DECL_OP_CONSTRAINT_REG(NAME) \ diff --git a/src/tim/vx/internal/include/utils/vsi_nn_dtype_util_prv.h b/src/tim/vx/internal/include/utils/vsi_nn_dtype_util_prv.h index d85fafd..334c7a0 100644 --- a/src/tim/vx/internal/include/utils/vsi_nn_dtype_util_prv.h +++ b/src/tim/vx/internal/include/utils/vsi_nn_dtype_util_prv.h @@ -438,6 +438,7 @@ static inline vsi_status float32_to_dtype case VSI_NN_TYPE_UINT8: case VSI_NN_TYPE_INT16: case VSI_NN_TYPE_INT32: + case VSI_NN_TYPE_UINT32: { int32_t dst_value = 0; switch( dst_dtype->qnt_type ) diff --git a/src/tim/vx/internal/include/vsi_nn_graph.h b/src/tim/vx/internal/include/vsi_nn_graph.h index 584bdd8..ffb5dd0 100644 --- a/src/tim/vx/internal/include/vsi_nn_graph.h +++ b/src/tim/vx/internal/include/vsi_nn_graph.h @@ -165,6 +165,8 @@ struct _vsi_nn_graph 
* so please keep it NULL.*/ vsi_nn_tensor_t* tensor; } complete_signal; + + vsi_bool isAllowFastMode; }; /** @@ -716,6 +718,16 @@ OVXLIB_API vsi_status vsi_nn_SetGraphPriority uint32_t priority ); +OVXLIB_API vsi_status vsi_nn_SetGraphFastMode + ( + vsi_nn_graph_t* graph, + vsi_bool fastmode + ); + +OVXLIB_API vsi_bool vsi_nn_IsGraphFastMode + ( + const vsi_nn_graph_t* graph + ); #ifdef __cplusplus } #endif diff --git a/src/tim/vx/internal/include/vsi_nn_node_type.h b/src/tim/vx/internal/include/vsi_nn_node_type.h index 89cd104..f9a4606 100644 --- a/src/tim/vx/internal/include/vsi_nn_node_type.h +++ b/src/tim/vx/internal/include/vsi_nn_node_type.h @@ -164,6 +164,12 @@ #include "ops/vsi_nn_op_resize_1d_bilinear_internal.h" #include "ops/vsi_nn_op_resize_1d_nearest_internal.h" #include "ops/vsi_nn_op_upsamplescale.h" +#include "ops/vsi_nn_op_groupnormalize.h" +#include "ops/vsi_nn_op_sequence_mask.h" +#include "ops/vsi_nn_op_repeat.h" +#include "ops/vsi_nn_op_one_hot.h" +#include "ops/vsi_nn_op_nms.h" +#include "ops/vsi_nn_op_grouped_conv1d.h" /* custom node head define define */ #include "custom/vsi_nn_custom_node_type.h" @@ -314,6 +320,12 @@ typedef union _vsi_nn_nn_param vsi_nn_resize_1d_bilinear_internal_param resize_1d_bilinear_internal; vsi_nn_resize_1d_nearest_internal_param resize_1d_nearest_internal; vsi_nn_upsamplescale_param upsamplescale; + vsi_nn_groupnormalize_param groupnorm; + vsi_nn_sequence_mask_param sequence_mask; + vsi_nn_repeat_param repeat; + vsi_nn_one_hot_param one_hot; + vsi_nn_nms_param nms; + vsi_nn_grouped_conv1d_param grouped_conv1d; uint8_t client_param[128]; /* custom node data struct define */ diff --git a/src/tim/vx/internal/include/vsi_nn_version.h b/src/tim/vx/internal/include/vsi_nn_version.h index da62e48..5e544c2 100644 --- a/src/tim/vx/internal/include/vsi_nn_version.h +++ b/src/tim/vx/internal/include/vsi_nn_version.h @@ -33,7 +33,7 @@ extern "C"{ #define VSI_NN_VERSION_MAJOR 1 #define VSI_NN_VERSION_MINOR 1 -#define VSI_NN_VERSION_PATCH 30 +#define VSI_NN_VERSION_PATCH 32 #define VSI_NN_VERSION \ (VSI_NN_VERSION_MAJOR * 10000 + VSI_NN_VERSION_MINOR * 100 + VSI_NN_VERSION_PATCH) diff --git a/src/tim/vx/internal/src/libnnext/ops/vx/vsi_nn_kernel_topk.vx b/src/tim/vx/internal/src/custom/ops/kernel/cl/custom_softmax.cl similarity index 63% rename from src/tim/vx/internal/src/libnnext/ops/vx/vsi_nn_kernel_topk.vx rename to src/tim/vx/internal/src/custom/ops/kernel/cl/custom_softmax.cl index fdacd41..05587c8 100644 --- a/src/tim/vx/internal/src/libnnext/ops/vx/vsi_nn_kernel_topk.vx +++ b/src/tim/vx/internal/src/custom/ops/kernel/cl/custom_softmax.cl @@ -1,6 +1,4 @@ -#include "cl_viv_vx_ext.h" - -__kernel void vxcTopk( +__kernel void testop( __read_only image2d_array_t input, __write_only image2d_array_t output) { diff --git a/src/tim/vx/internal/src/custom/ops/kernel/cpu/custom_softmax_cpu.c b/src/tim/vx/internal/src/custom/ops/kernel/cpu/custom_softmax_cpu.c new file mode 100644 index 0000000..abedba1 --- /dev/null +++ b/src/tim/vx/internal/src/custom/ops/kernel/cpu/custom_softmax_cpu.c @@ -0,0 +1,194 @@ +/**************************************************************************** +* +* Copyright (c) 2020 Vivante Corporation +* +* Permission is hereby granted, free of charge, to any person obtaining a +* copy of this software and associated documentation files (the "Software"), +* to deal in the Software without restriction, including without limitation +* the rights to use, copy, modify, merge, publish, distribute, sublicense, +* and/or sell copies of 
the Software, and to permit persons to whom the +* Software is furnished to do so, subject to the following conditions: +* +* The above copyright notice and this permission notice shall be included in +* all copies or substantial portions of the Software. +* +* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +* AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING +* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER +* DEALINGS IN THE SOFTWARE. +* +*****************************************************************************/ +#include +#include +#include "vsi_nn_types.h" +#include "vsi_nn_platform.h" +#include "vsi_nn_graph.h" +#include "vsi_nn_node.h" +#include "vsi_nn_log.h" +#include "vsi_nn_test.h" +#include "vsi_nn_error.h" +#include "vsi_nn_tensor_util.h" +#include "utils/vsi_nn_util.h" +#include "utils/vsi_nn_dtype_util.h" +#include "kernel/vsi_nn_kernel.h" +#include "libnnext/vsi_nn_vxkernel.h" +//#include "libnnext/vx_lib_nnext.h" + +#define _CPU_ARG_NUM (1) +#define _CPU_INPUT_NUM (1) +#define _CPU_OUTPUT_NUM (1) +#define _CPU_IO_NUM (_CPU_INPUT_NUM + _CPU_OUTPUT_NUM) +#define _CPU_PARAM_NUM (_CPU_ARG_NUM + _CPU_IO_NUM) +#define _KERNEL_NAME ("com.vivantecorp.extension.CustomSoftmaxVXC") + +#define SCALAR_INPUT_AXIS (2) + +__BEGIN_DECLS + +DEF_KERNEL_EXECUTOR(_softmax_exec) + ( + vsi_nn_kernel_node_t node, + const vsi_nn_kernel_node_param_t* param, + size_t param_size + ) +{ + vsi_status status = VX_SUCCESS; + float* buffer[_CPU_IO_NUM] = { NULL }; + vsi_nn_kernel_tensor_t tensors[_CPU_IO_NUM] = { NULL }; + vsi_nn_kernel_tensor_attr_t* attr[_CPU_IO_NUM] = { NULL }; + uint32_t i = 0; + uint32_t out_elements; + int32_t sf_axis; + float fMax = 0.0; + float fProbSum = 0.0f; + + tensors[0] = (vsi_nn_kernel_tensor_t)param[0]; + tensors[1] = (vsi_nn_kernel_tensor_t)param[1]; + + attr[0] = vsi_nn_kernel_tensor_attr_create( tensors[0] ); + attr[1] = vsi_nn_kernel_tensor_attr_create( tensors[1] ); + + status = vsi_nn_kernel_scalar_read_int32((vsi_nn_kernel_scalar_t)param[2], &sf_axis); + CHECK_STATUS_FAIL_GOTO(status, final ); + + out_elements = (uint32_t)vsi_nn_kernel_tensor_attr_get_size( attr[1] ); + + /* alloc the float32 data buffer */ + buffer[1] = (float *)malloc(out_elements * sizeof(float)); + CHECK_PTR_FAIL_GOTO( buffer[0], "Create input buffer fail.", final ); + memset(buffer[1], 0, out_elements * sizeof(float)); + + buffer[0] = (float*)vsi_nn_kernel_tensor_create_buffer( tensors[0], attr[0], TRUE ); + CHECK_PTR_FAIL_GOTO( buffer[0], "Create input buffer fail.", final ); + + /* Softmax implement */ + for ( i = 0; i < out_elements; i++) + { + fMax = buffer[0][i] > fMax ? 
buffer[0][i] : fMax; + } + + for ( i = 0; i < out_elements; i++) + { + buffer[1][i] = (float)expf(buffer[0][i] - fMax); + fProbSum += buffer[1][i]; + } + for ( i = 0; i < out_elements; i++) + { + buffer[1][i] = buffer[1][i] / fProbSum; + } + status = vsi_nn_kernel_tensor_write_from_float( + tensors[1], attr[1], buffer[1], out_elements ); + +final: + for( i = 0; i < _CPU_IO_NUM; i ++ ) + { + if( buffer[i] ) + { + free( buffer[i] ); + } + vsi_nn_kernel_tensor_attr_release( &attr[i] ); + } + return status; +} + +static vx_param_description_t kernel_param_def[] = +{ + {VX_INPUT, VX_TYPE_TENSOR, VX_PARAMETER_STATE_REQUIRED}, + {VX_OUTPUT, VX_TYPE_TENSOR, VX_PARAMETER_STATE_REQUIRED}, + {VX_INPUT, VX_TYPE_SCALAR, VX_PARAMETER_STATE_REQUIRED} +}; + +static const vx_kernel_description_t _kernel_info = +{ + KERNEL_ID_PLACEHOLDER, + _KERNEL_NAME, + _softmax_exec, + kernel_param_def, + _cnt_of_array( kernel_param_def ), + vsi_nn_KernelValidator, + NULL, + NULL, + vsi_nn_KernelInitializer, + vsi_nn_KernelDeinitializer +}; + +static vsi_status _query_kernel + ( + vsi_nn_tensor_t* const* const inputs, + vsi_nn_tensor_t* const* const outputs, + vsi_nn_kernel_t* kernel + ) +{ + memmove( &kernel->info, &_kernel_info, sizeof(vx_kernel_description_t) ); + return VSI_SUCCESS; +} + +static vsi_nn_kernel_node_t _setup + ( + vsi_nn_graph_t * graph, + vsi_nn_tensor_t ** inputs, + size_t input_num, + vsi_nn_tensor_t ** outputs, + size_t output_num, + const vsi_nn_kernel_param_t * params, + vsi_nn_kernel_t * kernel + ) +{ + vsi_status status = VSI_SUCCESS; + vsi_nn_kernel_node_param_t backend_params[_CPU_PARAM_NUM] = {NULL}; + vsi_nn_kernel_node_t node = NULL; + int32_t axis = 0; + + axis = vsi_nn_kernel_param_get_int32(params, "axis"); + + status = _query_kernel( inputs, outputs, kernel ); + if( VSI_SUCCESS == status) + { + node = vsi_nn_kernel_create_node( graph, kernel ); + if( node ) + { + /* Set inputs and outputs */ + vsi_nn_kernel_node_pack_io( backend_params, _CPU_PARAM_NUM, + inputs, _CPU_INPUT_NUM, outputs, _CPU_OUTPUT_NUM ); + backend_params[SCALAR_INPUT_AXIS] = vsi_nn_kernel_scalar_create( + graph, I32, &axis ); + + /* Pass parameters to node. 
*/ + status = vsi_nn_kernel_node_pass_param( node, backend_params, _CPU_PARAM_NUM ); + vsi_nn_kernel_scalar_release( &backend_params[SCALAR_INPUT_AXIS] ); + } + else + { + status = VSI_FAILURE; + } + } + return node; +} /* _setup() */ + + +__END_DECLS + +REGISTER_BACKEND_CPU( custom_softmax, _setup ) diff --git a/src/tim/vx/internal/src/custom/ops/vx/vsi_nn_kernel_custom_softmax.vx b/src/tim/vx/internal/src/custom/ops/kernel/evis/custom_softmax.vx similarity index 87% rename from src/tim/vx/internal/src/custom/ops/vx/vsi_nn_kernel_custom_softmax.vx rename to src/tim/vx/internal/src/custom/ops/kernel/evis/custom_softmax.vx index fce529d..305f666 100644 --- a/src/tim/vx/internal/src/custom/ops/vx/vsi_nn_kernel_custom_softmax.vx +++ b/src/tim/vx/internal/src/custom/ops/kernel/evis/custom_softmax.vx @@ -34,6 +34,7 @@ __kernel void Softmax2VXC } float fProbSum = 0.0f; + vxc_short8 dst; for (int i = 0; i < sf_size; i++) { vxc_char8 val; @@ -47,7 +48,8 @@ __kernel void Softmax2VXC fProbSum += fOut; half hVal; _viv_asm(CONV,hVal,fOut); - VXC_WriteImage2DArray(output, coord_in, hVal, VXC_MODIFIER(0, 0, 0, VXC_RM_TowardZero, 0)); + _viv_asm(COPY,dst,hVal, 4); + VXC_WriteImage2DArray(output, coord_in, dst, VXC_MODIFIER(0, 0, 0, VXC_RM_TowardZero, 0)); } for (int i = 0; i < sf_size; i++) @@ -63,7 +65,8 @@ __kernel void Softmax2VXC float fOut =fval/fProbSum; half hVal; _viv_asm(CONV,hVal,fOut); - VXC_WriteImage2DArray(output, coord_in, hVal, VXC_MODIFIER(0, 0, 0, VXC_RM_TowardZero, 0)); + _viv_asm(COPY,dst,hVal, 4); + VXC_WriteImage2DArray(output, coord_in, dst, VXC_MODIFIER(0, 0, 0, VXC_RM_TowardZero, 0)); } } diff --git a/src/tim/vx/internal/src/custom/ops/kernel/evis/custom_softmax_evis.c b/src/tim/vx/internal/src/custom/ops/kernel/evis/custom_softmax_evis.c new file mode 100644 index 0000000..34d679b --- /dev/null +++ b/src/tim/vx/internal/src/custom/ops/kernel/evis/custom_softmax_evis.c @@ -0,0 +1,202 @@ +/**************************************************************************** +* +* Copyright (c) 2020 Vivante Corporation +* +* Permission is hereby granted, free of charge, to any person obtaining a +* copy of this software and associated documentation files (the "Software"), +* to deal in the Software without restriction, including without limitation +* the rights to use, copy, modify, merge, publish, distribute, sublicense, +* and/or sell copies of the Software, and to permit persons to whom the +* Software is furnished to do so, subject to the following conditions: +* +* The above copyright notice and this permission notice shall be included in +* all copies or substantial portions of the Software. +* +* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +* AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING +* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER +* DEALINGS IN THE SOFTWARE. 
+* +*****************************************************************************/ +#include +#include +#include "vsi_nn_types.h" +#include "vsi_nn_platform.h" +#include "vsi_nn_graph.h" +#include "vsi_nn_node.h" +#include "vsi_nn_log.h" +#include "vsi_nn_test.h" +#include "vsi_nn_error.h" +#include "vsi_nn_tensor_util.h" +#include "utils/vsi_nn_util.h" +#include "utils/vsi_nn_dtype_util.h" +#include "kernel/vsi_nn_kernel.h" +#include "libnnext/vsi_nn_vxkernel.h" +//#include "libnnext/vx_lib_nnext.h" + +#define _CPU_ARG_NUM (1) +#define _CPU_INPUT_NUM (1) +#define _CPU_OUTPUT_NUM (1) +#define _CPU_IO_NUM (_CPU_INPUT_NUM + _CPU_OUTPUT_NUM) +#define _CPU_PARAM_NUM (_CPU_ARG_NUM + _CPU_IO_NUM) +#define _KERNEL_NAME ("com.vivantecorp.extension.Softmax2VXC") + +#define SCALAR_INPUT_AXIS (2) + +__BEGIN_DECLS + +static vx_param_description_t kernel_param_def[] = +{ + {VX_INPUT, VX_TYPE_TENSOR, VX_PARAMETER_STATE_REQUIRED}, + {VX_OUTPUT, VX_TYPE_TENSOR, VX_PARAMETER_STATE_REQUIRED}, + {VX_INPUT, VX_TYPE_SCALAR, VX_PARAMETER_STATE_REQUIRED} +}; +#define _EVIS_PARAM_NUM _cnt_of_array(kernel_param_def) + +DEF_KERNEL_INITIALIZER(_softmax_initializer) + ( + vsi_nn_kernel_node_t node, + const vsi_nn_kernel_node_param_t* param, + size_t param_size + ) +{ + vsi_status status = VSI_FAILURE; + int sf_size = 0; + vsi_nn_kernel_tensor_attr_t* attr = NULL; + // Alignment with a power of two value. + gpu_param_t gpu_param = { + 2, // workdim + {0, 0, 0}, // global_offset: control the start location be processed in the image + {0, 0, 0}, // global_scale: how many pixels could be processed by a single thread + {0, 0, 0}, // local_size: local group size in thread + {0, 0, 0}}; // global_size: image size in thread + + attr = vsi_nn_kernel_tensor_attr_create( (vsi_nn_kernel_tensor_t)param[0] ); + if (!attr) + { + VSILOGE("Query failure! 
at line"); + return status; + } + + sf_size = attr->shape->data[0]; + + gpu_param.global_offset[0] = 0; + gpu_param.global_offset[1] = 0; + gpu_param.global_scale[0] = 1; + gpu_param.global_scale[1] = 1; + gpu_param.local_size[0] = 1; + gpu_param.local_size[1] = 1; + gpu_param.global_size[0] = + gpu_align_p2((1 + gpu_param.global_scale[0] - 1) / gpu_param.global_scale[0], + gpu_param.local_size[0]); + gpu_param.global_size[1] = + gpu_align_p2((1 + gpu_param.global_scale[1] - 1) / gpu_param.global_scale[1], + gpu_param.local_size[1]); + { + gpu_dp_inst_t Uni4x4_Fp16ToFp32 = {{ + 0x01010101, // TCfg + 0x00000000, // ASelt + 0x00010000, 0x00030002, // ABin + 0x02020202, // BSelt + 0x00000000, 0x00000000, // BBin + 0x00000400, // AccumType, ConstantType, and PostShift + 0x00000001, 0x00000000, 0x00000001, 0x00000000, + 0x00000001, 0x00000000, 0x00000001, 0x00000000 // Constant + }, GPU_DP_TYPE_16}; + + status = vsi_nn_kernel_gpu_add_param( node, + "Uni4x4_Fp16ToFp32", &Uni4x4_Fp16ToFp32 ); + vsi_nn_kernel_gpu_add_param(node, + "sf_size", &sf_size); + } + + status = vsi_nn_kernel_gpu_config( node, &gpu_param ); + + if(status != VSI_SUCCESS) + { + VSILOGE("Initializer failure!"); + } + if (attr) vsi_nn_kernel_tensor_attr_release( &attr ); + + return status; +} + +static const vx_kernel_description_t _kernel_info = +{ + KERNEL_ID_PLACEHOLDER, + _KERNEL_NAME, + NULL, + kernel_param_def, + _cnt_of_array( kernel_param_def ), + vsi_nn_KernelValidator, + NULL, + NULL, + _softmax_initializer, + vsi_nn_KernelDeinitializer +}; + +static vsi_status _query_kernel + ( + vsi_nn_tensor_t* const* const inputs, + vsi_nn_tensor_t* const* const outputs, + vsi_nn_kernel_t* kernel + ) +{ + memmove( &kernel->info, &_kernel_info, sizeof(vx_kernel_description_t) ); + + vsi_nn_kernel_add_source( kernel, VSI_NN_GPU_SOURCE_FMT_CODE, 2, + "vsi_nn_kernel_header", + "custom_softmax" ); + vsi_nn_kernel_add_source( kernel, VSI_NN_GPU_SOURCE_FMT_EXECUTABLE, 1, + "custom_softmax" ); + return VSI_SUCCESS; +} + +static vsi_nn_kernel_node_t _setup + ( + vsi_nn_graph_t * graph, + vsi_nn_tensor_t ** inputs, + size_t input_num, + vsi_nn_tensor_t ** outputs, + size_t output_num, + const vsi_nn_kernel_param_t * params, + vsi_nn_kernel_t * kernel + ) +{ + vsi_status status = VSI_SUCCESS; + vsi_nn_kernel_node_param_t backend_params[_CPU_PARAM_NUM] = {NULL}; + vsi_nn_kernel_node_t node = NULL; + int32_t axis = 0; + + axis = vsi_nn_kernel_param_get_int32(params, "axis"); + + status = _query_kernel( inputs, outputs, kernel ); + if( VSI_SUCCESS == status) + { + node = vsi_nn_kernel_create_node( graph, kernel ); + if( node ) + { + /* Set inputs and outputs */ + vsi_nn_kernel_node_pack_io( backend_params, _CPU_PARAM_NUM, + inputs, _CPU_INPUT_NUM, outputs, _CPU_OUTPUT_NUM ); + backend_params[SCALAR_INPUT_AXIS] = vsi_nn_kernel_scalar_create( + graph, I32, &axis ); + + /* Pass parameters to node. 
*/ + status = vsi_nn_kernel_node_pass_param( node, backend_params, _CPU_PARAM_NUM ); + vsi_nn_kernel_scalar_release( &backend_params[SCALAR_INPUT_AXIS] ); + } + else + { + status = VSI_FAILURE; + } + } + return node; +} /* _setup() */ + +__END_DECLS + +REGISTER_BACKEND_EVIS( custom_softmax, _setup ) diff --git a/src/tim/vx/internal/src/custom/ops/kernel/vsi_nn_kernel_custom_softmax.c b/src/tim/vx/internal/src/custom/ops/kernel/vsi_nn_kernel_custom_softmax.c deleted file mode 100644 index 0230420..0000000 --- a/src/tim/vx/internal/src/custom/ops/kernel/vsi_nn_kernel_custom_softmax.c +++ /dev/null @@ -1,231 +0,0 @@ -/**************************************************************************** -* -* Copyright (c) 2020 Vivante Corporation -* -* Permission is hereby granted, free of charge, to any person obtaining a -* copy of this software and associated documentation files (the "Software"), -* to deal in the Software without restriction, including without limitation -* the rights to use, copy, modify, merge, publish, distribute, sublicense, -* and/or sell copies of the Software, and to permit persons to whom the -* Software is furnished to do so, subject to the following conditions: -* -* The above copyright notice and this permission notice shall be included in -* all copies or substantial portions of the Software. -* -* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR -* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, -* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE -* AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER -* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING -* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER -* DEALINGS IN THE SOFTWARE. 
-* -*****************************************************************************/ -#include -#include -#include "vsi_nn_types.h" -#include "vsi_nn_platform.h" -#include "vsi_nn_graph.h" -#include "vsi_nn_node.h" -#include "vsi_nn_log.h" -#include "vsi_nn_test.h" -#include "vsi_nn_tensor_util.h" -#include "utils/vsi_nn_util.h" -#include "utils/vsi_nn_dtype_util.h" -#include "client/vsi_nn_vxkernel.h" -#include "libnnext/vx_lib_nnext.h" - -#define _VX_KERNEL_ID VX_KERNEL_ID(CUSTOM_SOFTMAX) -#define _VX_KERNEL_VAR_CPU (vx_client_kernel_CUSTOM_SOFTMAX_CPU) -#define _VX_KERNEL_VAR_VX (vx_client_kernel_CUSTOM_SOFTMAX_VX) -#define _VX_KERNEL_NAME ("com.vivantecorp.extension.CustomSoftmaxVXC") -#define _VX_KERNEL_FUNC_KERNEL (vxCustomSoftmaxKernel) - -static vsi_status VX_CALLBACK vxCustomSoftmaxKernel - ( - vx_node node, - const vx_reference* paramObj, - uint32_t paramNum - ) -{ - vsi_status status = VX_SUCCESS; - vx_tensor input = NULL,output = NULL; - float *f32_in_buffer = NULL,*f32_out_buffer=NULL; - vx_context context = NULL; - vsi_nn_tensor_attr_t in_attr,out_attr; - uint32_t i,in_elements,out_elements; - int32_t sf_axis; - float fMax = 0.0; - float fProbSum = 0.0f; - - context = vxGetContext((vx_reference)node); - input = (vx_tensor)paramObj[0]; - output = (vx_tensor)paramObj[1]; - vxCopyScalar((vx_scalar)paramObj[2], &(sf_axis),VX_READ_ONLY, VX_MEMORY_TYPE_HOST); - - /* Fill input & output attribute data struct */ - status = vsi_nn_vxGetTensorAttr(input, &in_attr); - TEST_CHECK_STATUS(status, final); - status = vsi_nn_vxGetTensorAttr(output, &out_attr); - TEST_CHECK_STATUS(status, final); - - in_elements = vsi_nn_vxGetTensorElementNum(&in_attr); - out_elements = vsi_nn_vxGetTensorElementNum(&out_attr); - - /* alloc the float32 data buffer */ - f32_in_buffer = (float *)malloc(in_elements * sizeof(float)); - f32_out_buffer= (float *)malloc(out_elements * sizeof(float)); - memset(f32_in_buffer, 0, in_elements * sizeof(float)); - memset(f32_out_buffer, 0, out_elements * sizeof(float)); - - /* Copy tensor to buffer, and convert bufer to float32 format */ - status = vsi_nn_vxConvertTensorToFloat32Data( - context, input, &in_attr, f32_in_buffer, in_elements * sizeof(float)); - TEST_CHECK_STATUS(status, final); - - /* Softmax implement */ - for ( i = 0; i < out_elements; i++) - { - fMax = f32_in_buffer[i] > fMax ? f32_in_buffer[i] : fMax; - } - - for ( i = 0; i < out_elements; i++) - { - f32_out_buffer[i] = (float)expf(f32_in_buffer[i] - fMax); - fProbSum += f32_out_buffer[i]; - } - for ( i = 0; i < out_elements; i++) - { - f32_out_buffer[i] = f32_out_buffer[i]/ fProbSum; - } - status = vsi_nn_vxConvertFloat32DataToTensor( - context, output, &out_attr, f32_out_buffer, out_elements * sizeof(float)); - -final: - if(f32_in_buffer)free(f32_in_buffer); - if(f32_out_buffer)free(f32_out_buffer); - return status; -} - -static vx_status VX_CALLBACK vxCustomSoftmaxInitializer - ( - vx_node nodObj, - const vx_reference *paramObj, - vx_uint32 paraNum - ) -{ - vx_status status = VX_SUCCESS; - /*TODO: Add initial code for VX program*/ - // Alignment with a power of two value. 
-#define gcmALIGN(n, align) ((n) + ((align) - 1)) & ~((align) - 1) - vx_kernel_execution_parameters_t shaderParam = { - 2, // workdim - {0, 0, 0}, // globalWorkOffset: control the start location be processed in the image - {0, 0, 0}, // globalWorkScale: how many pixels could be processed by a single thread - {0, 0, 0}, // localWorkSize: local group size in thread - {0, 0, 0}}; // globalWorkSize: image size in thread - int input_size[6] = {1, 1, 1, 1, 1, 1}; - int sf_size; - uint32_t input_dims; - uint32_t i; - vsi_nn_tensor_attr_t input_attr; - - memset(&input_attr, 0, sizeof(vsi_nn_tensor_attr_t)); - - status = vsi_nn_vxGetTensorAttr((vx_tensor)paramObj[0], &input_attr); - if (status != VX_SUCCESS) - { - VSILOGE("vsi_nn_vxGetTensorAttr failure! at line %d\n", __LINE__); - return status; - } - - input_dims = input_attr.dim_num; - for (i = 0; i < input_dims; i++) - { - input_size[i] = input_attr.size[i]; - } - - sf_size = input_size[0]; - - shaderParam.globalWorkOffset[0] = 0; - shaderParam.globalWorkOffset[1] = 0; - shaderParam.globalWorkScale[0] = 1; - shaderParam.globalWorkScale[1] = 1; - shaderParam.localWorkSize[0] = 1; - shaderParam.localWorkSize[1] = 1; - shaderParam.globalWorkSize[0] = - gcmALIGN((1 + shaderParam.globalWorkScale[0] - 1) / shaderParam.globalWorkScale[0], shaderParam.localWorkSize[0]); - shaderParam.globalWorkSize[1] = - gcmALIGN((1 + shaderParam.globalWorkScale[1] - 1) / shaderParam.globalWorkScale[1], shaderParam.localWorkSize[1]); - { - vx_uint32 Uni4x4_Fp16ToFp32[16] = { - 0x01010101, // TCfg - 0x00000000, // ASelt - 0x00010000, 0x00030002, // ABin - 0x02020202, // BSelt - 0x00000000, 0x00000000, // BBin - 0x00000400, // AccumType, ConstantType, and PostShift - 0x00000001, 0x00000000, 0x00000001, 0x00000000, 0x00000001, 0x00000000, 0x00000001, 0x00000000 // Constant - }; - - vxSetNodeUniform(nodObj, "Uni4x4_Fp16ToFp32", 1, Uni4x4_Fp16ToFp32); - vxSetNodeUniform(nodObj, "sf_size", 1, &sf_size); - } - status |= vxSetNodeAttribute(nodObj, VX_NODE_ATTRIBUTE_KERNEL_EXECUTION_PARAMETERS, - &shaderParam, sizeof(vx_kernel_execution_parameters_t)); - - if(status < 0) - { - VSILOGE("Initializer failure!"); - } - - return status; -} - -static vx_param_description_t s_params[] = -{ - { VX_INPUT, VX_TYPE_TENSOR, VX_PARAMETER_STATE_REQUIRED }, - { VX_OUTPUT, VX_TYPE_TENSOR, VX_PARAMETER_STATE_REQUIRED }, - { VX_INPUT, VX_TYPE_SCALAR, VX_PARAMETER_STATE_REQUIRED }, -}; - -#ifdef __cplusplus -extern "C" { -#endif -vx_kernel_description_t _VX_KERNEL_VAR_CPU = -{ - _VX_KERNEL_ID, - _VX_KERNEL_NAME, - _VX_KERNEL_FUNC_KERNEL, - s_params, - _cnt_of_array( s_params ), - vsi_nn_KernelValidator, - NULL, - NULL, - vsi_nn_KernelInitializer, - vsi_nn_KernelDeinitializer -}; - -vx_kernel_description_t _VX_KERNEL_VAR_VX = -{ - _VX_KERNEL_ID, - _VX_KERNEL_NAME, - NULL, - s_params, - _cnt_of_array( s_params ), - vsi_nn_KernelValidator, - NULL, - NULL, - vxCustomSoftmaxInitializer, - vsi_nn_KernelDeinitializer -}; - -vx_kernel_description_t * vx_kernel_CUSTOM_SOFTMAX_list[] = -{ - &_VX_KERNEL_VAR_CPU, - &_VX_KERNEL_VAR_VX, - NULL -}; -#ifdef __cplusplus -} -#endif diff --git a/src/tim/vx/internal/src/custom/ops/op_custom_softmax.c b/src/tim/vx/internal/src/custom/ops/op_custom_softmax.c new file mode 100644 index 0000000..3aa9835 --- /dev/null +++ b/src/tim/vx/internal/src/custom/ops/op_custom_softmax.c @@ -0,0 +1,102 @@ +/**************************************************************************** +* +* Copyright (c) 2020 Vivante Corporation +* +* Permission is hereby granted, free of charge, 
to any person obtaining a +* copy of this software and associated documentation files (the "Software"), +* to deal in the Software without restriction, including without limitation +* the rights to use, copy, modify, merge, publish, distribute, sublicense, +* and/or sell copies of the Software, and to permit persons to whom the +* Software is furnished to do so, subject to the following conditions: +* +* The above copyright notice and this permission notice shall be included in +* all copies or substantial portions of the Software. +* +* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +* AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING +* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER +* DEALINGS IN THE SOFTWARE. +* +*****************************************************************************/ +#include +#include "vsi_nn_types.h" +#include "vsi_nn_platform.h" +#include "vsi_nn_graph.h" +#include "vsi_nn_node.h" +#include "vsi_nn_ops.h" +#include "vsi_nn_log.h" +#include "kernel/vsi_nn_kernel.h" + +static vsi_status op_compute + ( + vsi_nn_node_t * self, + vsi_nn_tensor_t ** inputs, + vsi_nn_tensor_t ** outputs + ) +{ + vsi_nn_kernel_param_t * param = NULL; + vsi_nn_custom_softmax_param * p; + p = &(self->nn_param.custom_softmax); + + param = vsi_nn_kernel_param_create(); + vsi_nn_kernel_param_add_int32( param, "axis", p->axis ); + + self->n = (vx_node)vsi_nn_kernel_selector( self->graph, + "custom_softmax", + inputs, 1, + outputs, 1, param ); + + vsi_nn_kernel_param_release( ¶m ); + + return VSI_SUCCESS; +} /* op_compute() */ + +static vsi_bool op_check + ( + vsi_nn_node_t * self, + vsi_nn_tensor_t ** inputs, + vsi_nn_tensor_t ** outputs + ) +{ + /*TODO: Check params. 
*/ + return TRUE; +} /* op_check() */ + +static vsi_bool op_setup + ( + vsi_nn_node_t * node, + vsi_nn_tensor_t ** inputs, + vsi_nn_tensor_t ** outputs + ) +{ + if( VSI_NN_DIM_AUTO == outputs[0]->attr.dim_num ) + { + outputs[0]->attr.dim_num = inputs[0]->attr.dim_num; + memmove(outputs[0]->attr.size, inputs[0]->attr.size, + inputs[0]->attr.dim_num * sizeof(uint32_t)); + } + return TRUE; +} /* op_setup() */ + +#ifdef __cplusplus +extern "C" { +#endif +/* Registrar */ +DEF_OP_REG + ( + /* op_name */ CUSTOM_SOFTMAX, + /* init */ NULL, + /* compute */ op_compute, + /* deinit */ vsi_nn_op_common_deinit, + /* check */ op_check, + /* setup */ op_setup, + /* optimize */ NULL, + /* input_num */ 1, + /* output_num */ 1 + ); +#ifdef __cplusplus +} +#endif diff --git a/src/tim/vx/internal/src/custom/ops/vsi_nn_op_custom_softmax.c b/src/tim/vx/internal/src/custom/ops/vsi_nn_op_custom_softmax.c deleted file mode 100644 index 215334e..0000000 --- a/src/tim/vx/internal/src/custom/ops/vsi_nn_op_custom_softmax.c +++ /dev/null @@ -1,299 +0,0 @@ -/**************************************************************************** -* -* Copyright (c) 2020 Vivante Corporation -* -* Permission is hereby granted, free of charge, to any person obtaining a -* copy of this software and associated documentation files (the "Software"), -* to deal in the Software without restriction, including without limitation -* the rights to use, copy, modify, merge, publish, distribute, sublicense, -* and/or sell copies of the Software, and to permit persons to whom the -* Software is furnished to do so, subject to the following conditions: -* -* The above copyright notice and this permission notice shall be included in -* all copies or substantial portions of the Software. -* -* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR -* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, -* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE -* AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER -* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING -* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER -* DEALINGS IN THE SOFTWARE. 
-* -*****************************************************************************/ -#include -#include "vsi_nn_types.h" -#include "vsi_nn_platform.h" -#include "vsi_nn_graph.h" -#include "vsi_nn_node.h" -#include "vsi_nn_ops.h" -#include "vsi_nn_log.h" -#include "client/vsi_nn_vxkernel.h" - -#define _ARG_NUM (1) -#define _INPUT_NUM (1) -#define _OUTPUT_NUM (1) -#define _IO_NUM (_INPUT_NUM + _OUTPUT_NUM) -#define _PARAM_NUM (_ARG_NUM + _IO_NUM) - -extern vx_kernel_description_t * vx_kernel_CUSTOM_SOFTMAX_list[]; - -static void _set_inputs_outputs - ( - vx_reference * params, - vsi_nn_tensor_t ** inputs, - vsi_nn_tensor_t ** outputs - ) -{ - uint32_t i; - uint32_t cnt; - - /* Set inputs */ - cnt = 0; - for( i = 0; i < _INPUT_NUM; i ++, cnt ++ ) - { - params[cnt] = (vx_reference)inputs[i]->t; - } - - /* Set outputs */ - for( i = 0; i < _OUTPUT_NUM; i ++, cnt ++ ) - { - params[cnt] = (vx_reference)outputs[i]->t; - } -} /* _set_inputs_outputs() */ - -static vsi_status _create_params - ( - vsi_nn_node_t * node, - vx_reference * params, - uint32_t num - ) -{ - vsi_status status; - vx_context ctx; - vsi_nn_custom_softmax_param * p; - if( 0 == num ) - { - return VSI_SUCCESS; - } - memset( params, 0, sizeof( vx_reference * ) * num ); - p = &(node->nn_param.custom_softmax); - ctx = vxGetContext( (vx_reference)node->graph->g ); - /* Init parameters */ -#define _SET_PARAM( i, type, arg ) do{ \ - params[i] = (vx_reference)vxCreateScalar( ctx, type, &p->arg ); \ - status = vxGetStatus( params[i] ); \ - if( VSI_SUCCESS != status ) { \ - goto set_param_error; \ - } \ - } while(0) - _SET_PARAM( 0, VX_TYPE_INT32, axis ); -#undef _SET_PARAM -set_param_error: - - return status; -} /* _create_params */ - -static void _release_params - ( - vx_reference * params, - uint32_t num - ) -{ - uint32_t i; - vx_scalar scalar; - for( i = 0; i < num; i ++ ) - { - scalar = (vx_scalar)params[i]; - vxReleaseScalar( &scalar ); - } -} /* _release_params() */ - -static vsi_status cpu_op_compute - ( - vsi_nn_node_t * self, - vsi_nn_tensor_t ** inputs, - vsi_nn_tensor_t ** outputs - ) -{ - vsi_status status = VSI_SUCCESS; - vx_reference params[_PARAM_NUM]; - vx_reference * args; - - args = ¶ms[_IO_NUM]; - - if( NULL == self->n ) - { - return VSI_FAILURE; - } - - /* Set inputs and outputs */ - _set_inputs_outputs( params, inputs, outputs ); - - /* Init parameters. */ - _create_params( self, args, _ARG_NUM ); - - /* Pass parameters to node. */ - status = vsi_nn_ClientNodePassParameters( self->n, params, _PARAM_NUM ); - - _release_params( args, _ARG_NUM ); - - return status; -} - -static vsi_status vx_op_compute - ( - vsi_nn_node_t * self, - vsi_nn_tensor_t ** inputs, - vsi_nn_tensor_t ** outputs - ) -{ - vsi_status status = VSI_SUCCESS; - vx_reference params[_PARAM_NUM]; - vx_reference * args; - //vsi_nn_tensor_attr_t attr; - - args = ¶ms[_IO_NUM]; - - if( NULL == self->n ) - { - return VSI_FAILURE; - } - - /* Set inputs and outputs */ - _set_inputs_outputs( params, inputs, outputs ); - - /*TODO: Add code if need to change your parameter*/ - /* Init parameters. */ - _create_params( self, args, _ARG_NUM ); -#if 0 - memcpy(&attr, &(inputs[0]->attr), sizeof(vsi_nn_tensor_attr_t)); - attr.size[0] = attr.size[0]; - attr.size[1] = 1; - attr.dim_num = 2; - params[0] = (vx_reference)vxReshapeTensor(inputs[0]->t, (int32_t*)(attr.size), attr.dim_num); - params[1] = (vx_reference)vxReshapeTensor(outputs[0]->t, (int32_t*)(attr.size), attr.dim_num); -#endif - /* Init parameters. 
*/ - _create_params( self, args, _ARG_NUM ); - - /* Pass parameters to node. */ - status = vsi_nn_ClientNodePassParameters( self->n, params, _PARAM_NUM ); - - _release_params( args, _ARG_NUM ); -#if 0 - vxReleaseTensor((vx_tensor*)¶ms[0]); - vxReleaseTensor((vx_tensor*)¶ms[1]); -#endif - return status; -} - -static vsi_nn_op_compute_t op_compute_list[] = -{ - cpu_op_compute, - vx_op_compute, - NULL -}; - -static vsi_status op_compute - ( - vsi_nn_node_t * self, - vsi_nn_tensor_t ** inputs, - vsi_nn_tensor_t ** outputs - ) -{ - vsi_status status; - vsi_nn_kernel_info_t kernel_info; - char *path = NULL; - - memset(&kernel_info, 0x0, sizeof(vsi_nn_kernel_info_t)); - status = VSI_FAILURE; - kernel_info.type = VX_KERNEL_TYPE_CPU; - kernel_info.kernel = vx_kernel_CUSTOM_SOFTMAX_list; - kernel_info.resource_num = 1; - kernel_info.resource_name = (char **)malloc(kernel_info.resource_num * sizeof(char *)); - kernel_info.resource_name[0] = "vsi_nn_kernel_custom_softmax"; - path = getenv("USER_VX_SOURCE_PATH"); - if(path) - { - vsi_nn_VxResourceSetPath(path); - } - - if( kernel_info.type == VX_KERNEL_TYPE_VX) - { - kernel_info.kernel_index = 1; - kernel_info.init_index = 1; - } - else /*kernel_info.type = VX_KERNEL_TYPE_CPU;*/ - { - kernel_info.kernel_index = 0; - kernel_info.init_index = 0; - } - - self->n = vsi_nn_RegisterClientKernelAndNewNode( - self->graph, &kernel_info); - if (kernel_info.resource_name) - { - free(kernel_info.resource_name); - } - if( NULL == self->n ) - { - return VSI_FAILURE; - } - if (NULL != op_compute_list[kernel_info.init_index]) - { - status = op_compute_list[kernel_info.init_index](self, inputs, outputs); - } - - return status; -} /* op_compute() */ - -static vsi_bool op_check - ( - vsi_nn_node_t * self, - vsi_nn_tensor_t ** inputs, - vsi_nn_tensor_t ** outputs - ) -{ - /*TODO: Check input tensor shapes. */ - return TRUE; -} /* op_check() */ - -static vsi_bool op_setup - ( - vsi_nn_node_t * node, - vsi_nn_tensor_t ** inputs, - vsi_nn_tensor_t ** outputs - ) -{ - /* TODO: Compute output tensor shape. 
*/ - if( VSI_NN_DIM_AUTO == outputs[0]->attr.dim_num ) - { - outputs[0]->attr.dim_num = inputs[0]->attr.dim_num; - outputs[0]->attr.size[0] = inputs[0]->attr.size[0]; - outputs[0]->attr.size[1] = inputs[0]->attr.size[1]; - outputs[0]->attr.size[2] = inputs[0]->attr.size[2]; - outputs[0]->attr.size[3] = inputs[0]->attr.size[3]; - } - return TRUE; -} /* op_setup() */ - -#ifdef __cplusplus -extern "C" { -#endif -/* Registrar */ -DEF_OP_REG - ( - /* op_name */ CUSTOM_SOFTMAX, - /* init */ NULL, - /* compute */ op_compute, - /* deinit */ vsi_nn_op_common_deinit, - /* check */ op_check, - /* setup */ op_setup, - /* optimize */ NULL, - /* input_num */ _INPUT_NUM, - /* output_num */ _OUTPUT_NUM - ); -#ifdef __cplusplus -} -#endif diff --git a/src/tim/vx/internal/src/kernel/cl/argmax_cl.c b/src/tim/vx/internal/src/kernel/cl/argmax_cl.c index 6311201..5855db8 100644 --- a/src/tim/vx/internal/src/kernel/cl/argmax_cl.c +++ b/src/tim/vx/internal/src/kernel/cl/argmax_cl.c @@ -183,26 +183,31 @@ static vsi_status _query_kernel vsi_nn_kernel_dtype_e output_dtype; vsi_status status = VSI_FAILURE; uint32_t key; - int i; + int32_t i; input_dtype = vsi_nn_kernel_map_dtype( inputs[0]->attr.dtype.vx_type ); output_dtype = vsi_nn_kernel_map_dtype( outputs[0]->attr.dtype.vx_type ); - if(input_dtype == I8) + if (input_dtype == I8) { input_dtype = I32; } + if (output_dtype == I16) + { + output_dtype = I32; + } + key = HASH_ARGMAX_KEY( axis, input_dtype, output_dtype, image_2d ); - for( i = 0; i < _cnt_of_array(kernel_map); i ++ ) + for ( i = 0; i < _cnt_of_array(kernel_map); i ++ ) { - if( kernel_map[i].key == key ) + if ( kernel_map[i].key == key ) { break; } } - if( i < _cnt_of_array(kernel_map) ) + if ( i < _cnt_of_array(kernel_map) ) { snprintf( kernel->info.name, VX_MAX_KERNEL_NAME, "%s", kernel_map[i].function_name ); kernel->info.parameters = kernel_param_def; @@ -237,7 +242,7 @@ static vsi_nn_kernel_node_t _setup axis = vsi_nn_kernel_param_get_int32(params, "axis"); - if( !vsi_nn_kernel_gpu_check_shape( (int32_t*)inputs[0]->attr.size, + if ( !vsi_nn_kernel_gpu_check_shape( (int32_t*)inputs[0]->attr.size, inputs[0]->attr.dim_num ) || !vsi_nn_kernel_gpu_check_shape( (int32_t*)outputs[0]->attr.size, outputs[0]->attr.dim_num ) @@ -250,11 +255,11 @@ static vsi_nn_kernel_node_t _setup image_2d = (inputs[0]->attr.dim_num == 2 || inputs[0]->attr.size[2] == 1); status = _query_kernel( inputs, outputs, axis, image_2d, kernel ); - if( VSI_SUCCESS == status) + if ( VSI_SUCCESS == status) { node = vsi_nn_kernel_create_node( graph, kernel ); - if( node ) + if ( node ) { vsi_nn_kernel_node_pack_io( node_params, _CL_PARAM_NUM, inputs, 1, outputs, 1 ); diff --git a/src/tim/vx/internal/src/kernel/cl/argmin_cl.c b/src/tim/vx/internal/src/kernel/cl/argmin_cl.c index 7afa3b6..399e496 100644 --- a/src/tim/vx/internal/src/kernel/cl/argmin_cl.c +++ b/src/tim/vx/internal/src/kernel/cl/argmin_cl.c @@ -183,20 +183,26 @@ static vsi_status _query_kernel vsi_nn_kernel_dtype_e output_dtype; vsi_status status = VSI_FAILURE; uint32_t key; - int i; + int32_t i; input_dtype = vsi_nn_kernel_map_dtype( inputs[0]->attr.dtype.vx_type ); output_dtype = vsi_nn_kernel_map_dtype( outputs[0]->attr.dtype.vx_type ); + + if (output_dtype == I16) + { + output_dtype = I32; + } + key = HASH_ARGMIN_KEY( axis, input_dtype, output_dtype, image_2d ); - for( i = 0; i < _cnt_of_array(kernel_map); i ++ ) + for ( i = 0; i < _cnt_of_array(kernel_map); i ++ ) { - if( kernel_map[i].key == key ) + if ( kernel_map[i].key == key ) { break; } } - if( i < 
_cnt_of_array(kernel_map) ) + if ( i < _cnt_of_array(kernel_map) ) { snprintf( kernel->info.name, VX_MAX_KERNEL_NAME, "%s", kernel_map[i].function_name ); kernel->info.parameters = kernel_param_def; @@ -231,7 +237,7 @@ static vsi_nn_kernel_node_t _setup axis = vsi_nn_kernel_param_get_int32(params, "axis"); - if( !vsi_nn_kernel_gpu_check_shape( (int32_t*)inputs[0]->attr.size, + if ( !vsi_nn_kernel_gpu_check_shape( (int32_t*)inputs[0]->attr.size, inputs[0]->attr.dim_num ) || !vsi_nn_kernel_gpu_check_shape( (int32_t*)outputs[0]->attr.size, outputs[0]->attr.dim_num ) @@ -244,11 +250,11 @@ static vsi_nn_kernel_node_t _setup image_2d = (inputs[0]->attr.dim_num == 2 || inputs[0]->attr.size[2] == 1); status = _query_kernel( inputs, outputs, axis, image_2d, kernel ); - if( VSI_SUCCESS == status) + if ( VSI_SUCCESS == status) { node = vsi_nn_kernel_create_node( graph, kernel ); - if( node ) + if ( node ) { vsi_nn_kernel_node_pack_io( node_params, _CL_PARAM_NUM, inputs, 1, outputs, 1 ); diff --git a/src/tim/vx/internal/src/kernel/cl/cast_cl.c b/src/tim/vx/internal/src/kernel/cl/cast_cl.c index d89849a..112cfca 100644 --- a/src/tim/vx/internal/src/kernel/cl/cast_cl.c +++ b/src/tim/vx/internal/src/kernel/cl/cast_cl.c @@ -186,7 +186,7 @@ static vsi_status _query_kernel { in_dtype = F32; } - else if ((I8 == in_dtype) || (I16 == in_dtype)) + else if ((I8 == in_dtype) || (BOOL8 == in_dtype) || (I16 == in_dtype)) { in_dtype = I32; } diff --git a/src/tim/vx/internal/src/kernel/cl/comparisons_cl.c b/src/tim/vx/internal/src/kernel/cl/comparisons_cl.c index 856042d..62bb0f4 100644 --- a/src/tim/vx/internal/src/kernel/cl/comparisons_cl.c +++ b/src/tim/vx/internal/src/kernel/cl/comparisons_cl.c @@ -289,6 +289,12 @@ static vsi_status _query_kernel input0_dtype = vsi_nn_kernel_map_dtype( inputs[0]->attr.dtype.vx_type ); input1_dtype = vsi_nn_kernel_map_dtype( inputs[0]->attr.dtype.vx_type ); output_dtype = vsi_nn_kernel_map_dtype( outputs[0]->attr.dtype.vx_type ); + + if (outputs[0]->attr.dtype.qnt_type == VSI_NN_QNT_TYPE_NONE && output_dtype == I8) + { + output_dtype = BOOL8; + } + key = HASH_COMPARISONS_KEY( operation, input0_dtype, input1_dtype, output_dtype, image_2d ); for( i = 0; i < _cnt_of_array(_comparisons_cl_kernel_map); i ++ ) diff --git a/src/tim/vx/internal/src/kernel/cl/eltwise_unary_cl.c b/src/tim/vx/internal/src/kernel/cl/eltwise_unary_cl.c index c0de129..6b0d6d5 100644 --- a/src/tim/vx/internal/src/kernel/cl/eltwise_unary_cl.c +++ b/src/tim/vx/internal/src/kernel/cl/eltwise_unary_cl.c @@ -48,6 +48,7 @@ typedef enum UNARY_NEG, UNARY_HSIGMOID, UNARY_MISH, + UNARY_ROUND, } unary_type_e; /* @@ -91,7 +92,8 @@ typedef enum #define ELU_OPERATION elu #define NEG_OPERATION neg #define HSIGMOID_OPERATION hard_sigmoid -#define MISH_OPERATION mish +#define MISH_OPERATION mish +#define ROUND_OPERATION round static const struct { uint32_t key; @@ -113,6 +115,8 @@ static const struct { TENSOR_UNARY_KERNELS_FLOAT(HSIGMOID_OPERATION, UNARY_HSIGMOID, F16, F16) TENSOR_UNARY_KERNELS_FLOAT(MISH_OPERATION, UNARY_MISH, F32, F32) TENSOR_UNARY_KERNELS_FLOAT(MISH_OPERATION, UNARY_MISH, F16, F16) + TENSOR_UNARY_KERNELS_FLOAT(ROUND_OPERATION, UNARY_ROUND, F32, F32) + TENSOR_UNARY_KERNELS_FLOAT(ROUND_OPERATION, UNARY_ROUND, F16, F16) TENSOR_UNARY_KERNELS_FLOAT_2D(SIN_OPERATION, UNARY_SIN, F32, F32) TENSOR_UNARY_KERNELS_FLOAT_2D(SIN_OPERATION, UNARY_SIN, F16, F16) @@ -128,6 +132,8 @@ static const struct { TENSOR_UNARY_KERNELS_FLOAT_2D(HSIGMOID_OPERATION, UNARY_HSIGMOID, F16, F16) TENSOR_UNARY_KERNELS_FLOAT_2D(MISH_OPERATION, 
UNARY_MISH, F32, F32) TENSOR_UNARY_KERNELS_FLOAT_2D(MISH_OPERATION, UNARY_MISH, F16, F16) + TENSOR_UNARY_KERNELS_FLOAT_2D(ROUND_OPERATION, UNARY_ROUND, F32, F32) + TENSOR_UNARY_KERNELS_FLOAT_2D(ROUND_OPERATION, UNARY_ROUND, F16, F16) TENSOR_UNARY_KERNELS(SIN_OPERATION, UNARY_SIN, U8, U8) TENSOR_UNARY_KERNELS(EXP_OPERATION, UNARY_EXP, U8, U8) @@ -136,6 +142,7 @@ static const struct { TENSOR_UNARY_KERNELS(NEG_OPERATION, UNARY_NEG, U8, U8) TENSOR_UNARY_KERNELS(HSIGMOID_OPERATION, UNARY_HSIGMOID, U8, U8) TENSOR_UNARY_KERNELS(MISH_OPERATION, UNARY_MISH, U8, U8) + TENSOR_UNARY_KERNELS(ROUND_OPERATION, UNARY_ROUND, U8, U8) TENSOR_UNARY_KERNELS_2D(SIN_OPERATION, UNARY_SIN, U8, U8) TENSOR_UNARY_KERNELS_2D(EXP_OPERATION, UNARY_EXP, U8, U8) @@ -144,6 +151,7 @@ static const struct { TENSOR_UNARY_KERNELS_2D(NEG_OPERATION, UNARY_NEG, U8, U8) TENSOR_UNARY_KERNELS_2D(HSIGMOID_OPERATION, UNARY_HSIGMOID, U8, U8) TENSOR_UNARY_KERNELS_2D(MISH_OPERATION, UNARY_MISH, U8, U8) + TENSOR_UNARY_KERNELS_2D(ROUND_OPERATION, UNARY_ROUND, U8, U8) TENSOR_UNARY_KERNELS(NEG_OPERATION, UNARY_NEG, I32, I32) @@ -157,6 +165,7 @@ static const struct { #undef NEG_OPERATION #undef HSIGMOID_OPERATION #undef MISH_OPERATION +#undef ROUND_OPERATION /* * Kernel params */ @@ -407,5 +416,5 @@ REGISTER_ELTWISE_UNARY_BACKEND_CL( elu, UNARY_ELU ) REGISTER_ELTWISE_UNARY_BACKEND_CL( neg, UNARY_NEG ) REGISTER_ELTWISE_UNARY_BACKEND_CL( hard_sigmoid, UNARY_HSIGMOID ) REGISTER_ELTWISE_UNARY_BACKEND_CL( mish, UNARY_MISH ) - +REGISTER_ELTWISE_UNARY_BACKEND_CL( round, UNARY_ROUND ) __END_DECLS diff --git a/src/tim/vx/internal/src/kernel/cl/erf_cl.c b/src/tim/vx/internal/src/kernel/cl/erf_cl.c new file mode 100644 index 0000000..e817d19 --- /dev/null +++ b/src/tim/vx/internal/src/kernel/cl/erf_cl.c @@ -0,0 +1,328 @@ +/**************************************************************************** +* +* Copyright (c) 2020 Vivante Corporation +* +* Permission is hereby granted, free of charge, to any person obtaining a +* copy of this software and associated documentation files (the "Software"), +* to deal in the Software without restriction, including without limitation +* the rights to use, copy, modify, merge, publish, distribute, sublicense, +* and/or sell copies of the Software, and to permit persons to whom the +* Software is furnished to do so, subject to the following conditions: +* +* The above copyright notice and this permission notice shall be included in +* all copies or substantial portions of the Software. +* +* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +* AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING +* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER +* DEALINGS IN THE SOFTWARE. +* +*****************************************************************************/ + + +#include +#include +#include +#include "vsi_nn_types.h" +#include "vsi_nn_tensor.h" +#include "vsi_nn_graph.h" +#include "vsi_nn_log.h" +#include "vsi_nn_error.h" +#include "vsi_nn_prv.h" +#include "vsi_nn_tensor_util.h" +#include "utils/vsi_nn_util.h" +#include "kernel/vsi_nn_kernel.h" +#include "kernel/vsi_nn_kernel_gpu_shape_optimize.h" + +__BEGIN_DECLS + +/* + * Define kernel meta. 
+ */ +#define HASH_UNARY_KEY(_input_type, _output_type, _image_2d) \ + ( (_input_type << 12) | (_output_type << 4) | (_image_2d)) + + #define VSI_NN_GEN_UNARY_KERNEL_SOURCE_NAME() \ + "erf" + +#define HASH_UNARY_SH_KERNEL_NAME( SRC_TYPE, DST_TYPE) \ + CVIVANTE_NAMESPACE("cl.erf_"#SRC_TYPE"to"#DST_TYPE) + +#define TENSOR_UNARY_KERNELS(SRC_TYPE, OUT_TYPE) \ + { HASH_UNARY_KEY(SRC_TYPE, OUT_TYPE, 0), \ + HASH_UNARY_SH_KERNEL_NAME(SRC_TYPE, OUT_TYPE), \ + VSI_NN_GEN_UNARY_KERNEL_SOURCE_NAME() }, + +#define HASH_UNARY_SH_KERNEL_2D_NAME(SRC_TYPE, DST_TYPE) \ + CVIVANTE_NAMESPACE("cl.erf_"#SRC_TYPE"to"#DST_TYPE"_2D") + +#define TENSOR_UNARY_KERNELS_2D(SRC_TYPE, OUT_TYPE) \ + { HASH_UNARY_KEY(SRC_TYPE, OUT_TYPE, 1), \ + HASH_UNARY_SH_KERNEL_2D_NAME(SRC_TYPE, OUT_TYPE), \ + VSI_NN_GEN_UNARY_KERNEL_SOURCE_NAME() }, + +#define TENSOR_UNARY_KERNELS_FLOAT(SRC_TYPE, OUT_TYPE) \ + { HASH_UNARY_KEY(SRC_TYPE, OUT_TYPE, 0), \ + HASH_UNARY_SH_KERNEL_NAME(F32, F32), \ + VSI_NN_GEN_UNARY_KERNEL_SOURCE_NAME() }, + +#define TENSOR_UNARY_KERNELS_FLOAT_2D(SRC_TYPE, OUT_TYPE) \ + { HASH_UNARY_KEY(SRC_TYPE, OUT_TYPE, 1), \ + HASH_UNARY_SH_KERNEL_2D_NAME(F32, F32), \ + VSI_NN_GEN_UNARY_KERNEL_SOURCE_NAME() }, + +typedef struct +{ + uint32_t key; + char * function_name; + const char * source_name; +} _kernel_map_type; + +static const _kernel_map_type _erf_kernel_map[] = +{ + // Register kernel here + TENSOR_UNARY_KERNELS_FLOAT(F32, F32) + TENSOR_UNARY_KERNELS_FLOAT(F16, F16) + + TENSOR_UNARY_KERNELS_FLOAT_2D(F32, F32) + TENSOR_UNARY_KERNELS_FLOAT_2D(F16, F16) + + TENSOR_UNARY_KERNELS(U8, U8) + + TENSOR_UNARY_KERNELS_2D(U8, U8) +}; + +/* + * Kernel params + */ +static vx_param_description_t _erf_kernel_param_def[] = +{ + {VX_INPUT, VX_TYPE_TENSOR, VX_PARAMETER_STATE_REQUIRED}, + {VX_OUTPUT, VX_TYPE_TENSOR, VX_PARAMETER_STATE_REQUIRED}, + {VX_INPUT, VX_TYPE_SCALAR, VX_PARAMETER_STATE_REQUIRED}, + {VX_INPUT, VX_TYPE_SCALAR, VX_PARAMETER_STATE_REQUIRED}, + {VX_INPUT, VX_TYPE_SCALAR, VX_PARAMETER_STATE_REQUIRED}, + {VX_INPUT, VX_TYPE_SCALAR, VX_PARAMETER_STATE_REQUIRED}, +}; +#define SCALAR_INPUT_SCALE (2) +#define SCALAR_INPUT_TAIL (3) +#define SCALAR_OUTPUT_SCALE (4) +#define SCALAR_OUTPUT_ZP (5) +#define _ERF_PARAM_NUM _cnt_of_array( _erf_kernel_param_def ) + +/* + * Kernel initializer + */ +DEF_KERNEL_INITIALIZER(_erf_initializer) + ( + vsi_nn_kernel_node_t node, + const vsi_nn_kernel_node_param_t * param, + size_t param_size + ) +{ + gpu_param_t gpu_param = { + 3, // workdim + {0, 0, 0}, // globalWorkOffset: control the start location be processed in the image + {0, 0, 0}, // globalWorkScale: how many pixels could be processed by a single thread + {0, 0, 0}, // localWorkSize: local group size in thread + {0, 0, 0} // globalWorkSize: image size in thread + }; + + vsi_status status = VSI_FAILURE; + vsi_nn_kernel_tensor_attr_t * attr[2] = { NULL }; + vsi_int_array_t * out_shape = NULL; + + attr[0] = vsi_nn_kernel_tensor_attr_create( (vsi_nn_kernel_tensor_t)param[0] ); + CHECK_PTR_FAIL_GOTO( attr[0], "Create tensor attr buffer fail.", final ); + attr[1] = vsi_nn_kernel_tensor_attr_create( (vsi_nn_kernel_tensor_t)param[1] ); + CHECK_PTR_FAIL_GOTO( attr[1], "Create tensor attr buffer fail.", final ); + + out_shape = attr[1]->shape; + + gpu_param.global_scale[0] = 1; + gpu_param.global_scale[1] = 1; + gpu_param.global_scale[2] = 1; + gpu_param.global_size[0] = gpu_align_p2( + (out_shape->data[0] + gpu_param.global_scale[0] - 1) + / gpu_param.global_scale[0], 4); + gpu_param.global_size[1] = ( + (out_shape->data[1] + 
gpu_param.global_scale[1] - 1) + / gpu_param.global_scale[1]); + gpu_param.global_size[2] = out_shape->size > 2 ? out_shape->data[2] : 1; + + status = vsi_nn_kernel_gpu_config( node, &gpu_param ); + +final: +#define SAFE_FREE_TENSOR_ATTR(_PTR) if ( _PTR ) { vsi_nn_kernel_tensor_attr_release( &_PTR ); _PTR = NULL; } + SAFE_FREE_TENSOR_ATTR(attr[0]); + SAFE_FREE_TENSOR_ATTR(attr[1]); +#undef SAFE_FREE_TENSOR_ATTR + + return status; +} /* _erf_initializer() */ + + +/* + * Query kernel + */ +static vsi_status _query_kernel + ( + vsi_nn_kernel_t * kernel, + vsi_nn_tensor_t * const * const inputs, + vsi_nn_tensor_t * const * const outputs, + vsi_bool image_2d + ) +{ + vsi_status status = VSI_FAILURE; + vsi_nn_kernel_dtype_e in_dtype; + vsi_nn_kernel_dtype_e out_dtype; + const _kernel_map_type * kernel_map = _erf_kernel_map; + size_t kernel_map_size = _cnt_of_array( _erf_kernel_map ); + vx_param_description_t * param_def = _erf_kernel_param_def; + vx_kernel_initialize_f initializer = _erf_initializer; + + uint32_t key; + uint32_t i; + + in_dtype = vsi_nn_kernel_map_dtype( inputs[0]->attr.dtype.vx_type ); + out_dtype = vsi_nn_kernel_map_dtype( outputs[0]->attr.dtype.vx_type ); + + key = HASH_UNARY_KEY( in_dtype, out_dtype, image_2d ); + + for ( i = 0; i < (uint32_t)kernel_map_size; i ++ ) + { + if ( kernel_map[i].key == key ) + { + break; + } + } + + if ( i < (uint32_t)kernel_map_size ) + { + snprintf( kernel->info.name, VX_MAX_KERNEL_NAME, "%s", kernel_map[i].function_name ); + kernel->info.parameters = param_def; + kernel->info.numParams = _cnt_of_array( _erf_kernel_param_def ); + kernel->info.initialize = initializer; + // Register code source + vsi_nn_kernel_add_source( kernel, VSI_NN_GPU_SOURCE_FMT_CODE, 1, + kernel_map[i].source_name ); + // Register binary source + vsi_nn_kernel_add_source( kernel, VSI_NN_GPU_SOURCE_FMT_EXECUTABLE, 1, + kernel_map[i].source_name ); + status = VSI_SUCCESS; + } + return status; +} /* _query_kernel() */ + + +static vsi_nn_kernel_node_t _setup + ( + vsi_nn_graph_t * graph, + vsi_nn_tensor_t ** inputs, + size_t input_num, + vsi_nn_tensor_t ** outputs, + size_t output_num, + const vsi_nn_kernel_param_t * params, + vsi_nn_kernel_t * kernel + ) +{ + vsi_status status = VSI_FAILURE; + vsi_nn_kernel_node_param_t node_params[_ERF_PARAM_NUM] = {NULL}; + vsi_nn_kernel_node_t node = NULL; + vsi_nn_tensor_t* rs_tensors[2] = { NULL }; + int32_t shape[VSI_NN_MAX_DIM_NUM] = { 0 }; + int32_t new_rank = 0; + vsi_bool ret = FALSE; + vsi_bool image_2d = FALSE; + + float inputScale = inputs[0]->attr.dtype.scale; + float inputTail = (float)inputs[0]->attr.dtype.zero_point * inputScale; + float outputScale = outputs[0]->attr.dtype.scale; + float outputZP = (float)outputs[0]->attr.dtype.zero_point + 0.5f; + + ret = vsi_nn_kernel_optimize_element_shape( + (int32_t *)inputs[0]->attr.size, inputs[0]->attr.dim_num, + shape, &new_rank ); + if ( ret ) + { + rs_tensors[0] = vsi_nn_reshape_tensor( graph, + inputs[0], (uint32_t*)shape, new_rank ); + rs_tensors[1] = vsi_nn_reshape_tensor( graph, + outputs[0], (uint32_t*)shape, new_rank ); + } + + if ( !vsi_nn_kernel_gpu_check_shape( (int32_t*)rs_tensors[0]->attr.size, + rs_tensors[0]->attr.dim_num ) ) + { + return NULL; + } + + outputScale = vsi_abs(outputScale) < 1e-5 ? 
0.0f : 1.0f / outputScale; + + image_2d = (rs_tensors[0]->attr.dim_num == 2 || rs_tensors[0]->attr.size[2] == 1); + + status = _query_kernel( kernel, inputs, outputs, image_2d ); + if ( VSI_SUCCESS == status ) + { + node = vsi_nn_kernel_create_node( graph, kernel ); + if ( node ) + { + /* Set inputs and outputs */ + vsi_nn_kernel_node_pack_io( node_params, _ERF_PARAM_NUM, + rs_tensors, 1, &rs_tensors[1], 1 ); + + node_params[SCALAR_INPUT_SCALE] = vsi_nn_kernel_scalar_create( + graph, F32, &inputScale ); + node_params[SCALAR_INPUT_TAIL] = vsi_nn_kernel_scalar_create( + graph, F32, &inputTail ); + node_params[SCALAR_OUTPUT_SCALE] = vsi_nn_kernel_scalar_create( + graph, F32, &outputScale ); + node_params[SCALAR_OUTPUT_ZP] = vsi_nn_kernel_scalar_create( + graph, F32, &outputZP ); + + /* Pass parameters to node. */ + status = vsi_nn_kernel_node_pass_param( node, node_params, _ERF_PARAM_NUM ); + CHECK_STATUS_FAIL_GOTO( status, OnError ); + } + } + +OnError: + if (rs_tensors[0]) + { + vsi_nn_ReleaseTensor( &rs_tensors[0] ); + } + + if (rs_tensors[1]) + { + vsi_nn_ReleaseTensor( &rs_tensors[1] ); + } + + if (node_params[SCALAR_INPUT_SCALE]) + { + vsi_nn_kernel_scalar_release( &node_params[SCALAR_INPUT_SCALE] ); + } + + if (node_params[SCALAR_INPUT_TAIL]) + { + vsi_nn_kernel_scalar_release( &node_params[SCALAR_INPUT_TAIL] ); + } + + if (node_params[SCALAR_OUTPUT_SCALE]) + { + vsi_nn_kernel_scalar_release( &node_params[SCALAR_OUTPUT_SCALE] ); + } + + if (node_params[SCALAR_OUTPUT_ZP]) + { + vsi_nn_kernel_scalar_release( &node_params[SCALAR_OUTPUT_ZP] ); + } + + return node; +} /* _setup() */ + +__END_DECLS + +REGISTER_BACKEND_CL( erf, _setup ) diff --git a/src/tim/vx/internal/src/kernel/cl/floordiv_cl.c b/src/tim/vx/internal/src/kernel/cl/floordiv_cl.c index 831e27c..a500383 100644 --- a/src/tim/vx/internal/src/kernel/cl/floordiv_cl.c +++ b/src/tim/vx/internal/src/kernel/cl/floordiv_cl.c @@ -68,11 +68,15 @@ static const _kernel_map_type _floordiv_kernel_map[] = // Register kernel here FLOORDIV_KERNELS( F32, F32, F32 ) FLOORDIV_KERNELS( I32, I32, I32 ) + FLOORDIV_KERNELS( I32, I32, U8 ) FLOORDIV_KERNELS( U8, U8, U8 ) + FLOORDIV_KERNELS( U8, I32, U8 ) FLOORDIV_KERNELS_2D( F32, F32, F32 ) FLOORDIV_KERNELS_2D( I32, I32, I32 ) + FLOORDIV_KERNELS_2D( I32, I32, U8 ) FLOORDIV_KERNELS_2D( U8, U8, U8 ) + FLOORDIV_KERNELS_2D( U8, I32, U8 ) }; @@ -311,4 +315,3 @@ static vsi_nn_kernel_node_t _setup __END_DECLS REGISTER_BACKEND_CL( floordiv, _setup ) - diff --git a/src/tim/vx/internal/src/kernel/cl/group_normalization_cl.c b/src/tim/vx/internal/src/kernel/cl/group_normalization_cl.c new file mode 100644 index 0000000..f4ecf0e --- /dev/null +++ b/src/tim/vx/internal/src/kernel/cl/group_normalization_cl.c @@ -0,0 +1,760 @@ +/**************************************************************************** +* +* Copyright (c) 2019 Vivante Corporation +* +* Permission is hereby granted, free of charge, to any person obtaining a +* copy of this software and associated documentation files (the "Software"), +* to deal in the Software without restriction, including without limitation +* the rights to use, copy, modify, merge, publish, distribute, sublicense, +* and/or sell copies of the Software, and to permit persons to whom the +* Software is furnished to do so, subject to the following conditions: +* +* The above copyright notice and this permission notice shall be included in +* all copies or substantial portions of the Software. 
+* +* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +* AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING +* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER +* DEALINGS IN THE SOFTWARE. +* +*****************************************************************************/ + + +#include +#include +#include +#include "vsi_nn_types.h" +#include "vsi_nn_tensor.h" +#include "vsi_nn_graph.h" +#include "vsi_nn_log.h" +#include "vsi_nn_prv.h" +#include "vsi_nn_error.h" +#include "vsi_nn_tensor_util.h" +#include "utils/vsi_nn_util.h" +#include "kernel/vsi_nn_kernel.h" +#include "kernel/vsi_nn_kernel_eltwise.h" +#include "kernel/vsi_nn_kernel_gpu_shape_optimize.h" + +__BEGIN_DECLS + +/* + * Define kernel meta. + */ +typedef enum +{ + INTERNAL_KERNEL_SUM_SQR, + INTERNAL_KERNEL_MEAN_VARI, + INTERNAL_KERNEL_NORM, +} _internal_kernel_e; + +#define KERNEL_SOURCE_1 "group_normalization_u8" +#define KERNEL_SOURCE_2 "group_normalization_f32" +#define KERNEL_SOURCE_3 "group_normalization_i32" + +// Add kernel hashtable here +#define HASH_GROUPNORM_SUM_SQR_KERNEL_NAME(SRC0_TYPE) \ + CVIVANTE_NAMESPACE("cl.group_norm_sumsqr_"#SRC0_TYPE) + +#define HASH_GROUPNORM_SUM_SQR_KERNEL_2D_NAME(SRC0_TYPE) \ + CVIVANTE_NAMESPACE("cl.group_norm_sumsqr_"#SRC0_TYPE"_2D") + +#define HASH_GROUPNORM_MEAN_VARI_KERNEL_NAME \ + CVIVANTE_NAMESPACE("cl.group_norm_meanvari") + +#define HASH_GROUPNORM_KERNEL_NAME(SRC0_TYPE, DST_TYPE) \ + CVIVANTE_NAMESPACE("cl.group_norm_"#SRC0_TYPE"to"#DST_TYPE) + +#define HASH_GROUPNORM_KERNEL_2D_NAME(SRC0_TYPE, DST_TYPE) \ + CVIVANTE_NAMESPACE("cl.group_norm_"#SRC0_TYPE"to"#DST_TYPE"_2D") + +// Add kernel hashtable here +// sum sqr +#define HASH_GROUPNORM_SUM_SQR_KEY(_input0_type, _output_type, _reshape_flag) \ + ((_input0_type << 24) | (_output_type << 16) | (_reshape_flag << 8)) + +#define TENSOR_GROUPNORM_SUM_SQR_KERNELS(IN0_TYPE, OUT_TYPE, SOURCE) \ + { HASH_GROUPNORM_SUM_SQR_KEY(IN0_TYPE, OUT_TYPE, 0), \ + HASH_GROUPNORM_SUM_SQR_KERNEL_NAME(IN0_TYPE), \ + SOURCE }, + +#define TENSOR_GROUPNORM_SUM_SQR_KERNELS_2D(IN0_TYPE, OUT_TYPE, SOURCE) \ + { HASH_GROUPNORM_SUM_SQR_KEY(IN0_TYPE, OUT_TYPE, 1), \ + HASH_GROUPNORM_SUM_SQR_KERNEL_2D_NAME(IN0_TYPE), \ + SOURCE }, + +#define HASH_GROUPNORM_MEAN_VARI_KEY(_input0_type, _output_type) \ + ((_input0_type << 24) | (_output_type << 16)) + +#define TENSOR_GROUPNORM_MEAN_VARI_KERNELS(SOURCE) \ + { HASH_GROUPNORM_MEAN_VARI_KEY(F32, F32), \ + HASH_GROUPNORM_MEAN_VARI_KERNEL_NAME, \ + SOURCE }, + +// normalization +#define HASH_GROUPNORM_KEY(_input0_type, _output_type, _reshape_flag) \ + ((_input0_type << 24) | (_output_type << 16) | (_reshape_flag << 8)) + +#define TENSOR_GROUPNORM_KERNELS(IN0_TYPE, OUT_TYPE, SOURCE) \ + { HASH_GROUPNORM_KEY(IN0_TYPE, OUT_TYPE, 0), \ + HASH_GROUPNORM_KERNEL_NAME(IN0_TYPE, OUT_TYPE), \ + SOURCE }, + +#define TENSOR_GROUPNORM_KERNELS_2D(IN0_TYPE, OUT_TYPE, SOURCE) \ + { HASH_GROUPNORM_KEY(IN0_TYPE, OUT_TYPE, 1), \ + HASH_GROUPNORM_KERNEL_2D_NAME(IN0_TYPE, OUT_TYPE), \ + SOURCE }, + +typedef struct +{ + uint32_t key; + char * function_name; + const char * source_name; +} _kernel_map_type; + +static const _kernel_map_type _groupnorm_sum_sqr_kernel_map[] = +{ + // Register kernel here + TENSOR_GROUPNORM_SUM_SQR_KERNELS( U8, F32, 
KERNEL_SOURCE_1 ) + TENSOR_GROUPNORM_SUM_SQR_KERNELS_2D( U8, F32, KERNEL_SOURCE_1 ) + TENSOR_GROUPNORM_SUM_SQR_KERNELS( F32, F32, KERNEL_SOURCE_2 ) + TENSOR_GROUPNORM_SUM_SQR_KERNELS_2D( F32, F32, KERNEL_SOURCE_2 ) + TENSOR_GROUPNORM_SUM_SQR_KERNELS( I32, F32, KERNEL_SOURCE_3 ) + TENSOR_GROUPNORM_SUM_SQR_KERNELS_2D( I32, F32, KERNEL_SOURCE_3 ) +}; + +static const _kernel_map_type _groupnorm_mean_vari_kernel_map[] = +{ + // Register kernel here + TENSOR_GROUPNORM_MEAN_VARI_KERNELS( KERNEL_SOURCE_2 ) +}; + +static const _kernel_map_type _groupnorm_kernel_map[] = +{ + // Register kernel here + TENSOR_GROUPNORM_KERNELS( U8, U8, KERNEL_SOURCE_1 ) + TENSOR_GROUPNORM_KERNELS_2D( U8, U8, KERNEL_SOURCE_1 ) + TENSOR_GROUPNORM_KERNELS( U8, F32, KERNEL_SOURCE_1 ) + TENSOR_GROUPNORM_KERNELS_2D( U8, F32, KERNEL_SOURCE_1 ) + + TENSOR_GROUPNORM_KERNELS( F32, F32, KERNEL_SOURCE_2 ) + TENSOR_GROUPNORM_KERNELS_2D( F32, F32, KERNEL_SOURCE_2 ) + + TENSOR_GROUPNORM_KERNELS( I32, I32, KERNEL_SOURCE_3 ) + TENSOR_GROUPNORM_KERNELS_2D( I32, I32, KERNEL_SOURCE_3 ) + TENSOR_GROUPNORM_KERNELS( I32, F32, KERNEL_SOURCE_3 ) + TENSOR_GROUPNORM_KERNELS_2D( I32, F32, KERNEL_SOURCE_3 ) +}; + +/* + * Kernel params + */ +static vx_param_description_t _groupnorm_sum_sqr_kernel_param_def[] = +{ + {VX_INPUT, VX_TYPE_TENSOR, VX_PARAMETER_STATE_REQUIRED}, + {VX_OUTPUT, VX_TYPE_TENSOR, VX_PARAMETER_STATE_REQUIRED}, + {VX_INPUT, VX_TYPE_SCALAR, VX_PARAMETER_STATE_REQUIRED}, + {VX_INPUT, VX_TYPE_SCALAR, VX_PARAMETER_STATE_REQUIRED}, + {VX_INPUT, VX_TYPE_SCALAR, VX_PARAMETER_STATE_REQUIRED}, + {VX_INPUT, VX_TYPE_SCALAR, VX_PARAMETER_STATE_REQUIRED}, + {VX_INPUT, VX_TYPE_SCALAR, VX_PARAMETER_STATE_REQUIRED}, + {VX_INPUT, VX_TYPE_SCALAR, VX_PARAMETER_STATE_REQUIRED}, + // Add kererl parameters here +}; +#define _GROUPNORM_SUM_SQR_PARAM_NUM _cnt_of_array( _groupnorm_sum_sqr_kernel_param_def ) + +static vx_param_description_t _groupnorm_mean_vari_kernel_param_def[] = +{ + {VX_INPUT, VX_TYPE_TENSOR, VX_PARAMETER_STATE_REQUIRED}, + {VX_OUTPUT, VX_TYPE_TENSOR, VX_PARAMETER_STATE_REQUIRED}, + {VX_INPUT, VX_TYPE_SCALAR, VX_PARAMETER_STATE_REQUIRED}, + {VX_INPUT, VX_TYPE_SCALAR, VX_PARAMETER_STATE_REQUIRED}, + {VX_INPUT, VX_TYPE_SCALAR, VX_PARAMETER_STATE_REQUIRED}, + // Add kererl parameters here +}; +#define _GROUPNORM_MEAN_VARI_PARAM_NUM _cnt_of_array( _groupnorm_mean_vari_kernel_param_def ) + +static vx_param_description_t _groupnorm_kernel_param_def[] = +{ + {VX_INPUT, VX_TYPE_TENSOR, VX_PARAMETER_STATE_REQUIRED}, + {VX_INPUT, VX_TYPE_TENSOR, VX_PARAMETER_STATE_REQUIRED}, + {VX_INPUT, VX_TYPE_TENSOR, VX_PARAMETER_STATE_REQUIRED}, + {VX_INPUT, VX_TYPE_TENSOR, VX_PARAMETER_STATE_REQUIRED}, + {VX_OUTPUT, VX_TYPE_TENSOR, VX_PARAMETER_STATE_REQUIRED}, + {VX_INPUT, VX_TYPE_SCALAR, VX_PARAMETER_STATE_REQUIRED}, + {VX_INPUT, VX_TYPE_SCALAR, VX_PARAMETER_STATE_REQUIRED}, + {VX_INPUT, VX_TYPE_SCALAR, VX_PARAMETER_STATE_REQUIRED}, + {VX_INPUT, VX_TYPE_SCALAR, VX_PARAMETER_STATE_REQUIRED}, + {VX_INPUT, VX_TYPE_SCALAR, VX_PARAMETER_STATE_REQUIRED}, + {VX_INPUT, VX_TYPE_SCALAR, VX_PARAMETER_STATE_REQUIRED}, + {VX_INPUT, VX_TYPE_SCALAR, VX_PARAMETER_STATE_REQUIRED}, + {VX_INPUT, VX_TYPE_SCALAR, VX_PARAMETER_STATE_REQUIRED}, + {VX_INPUT, VX_TYPE_SCALAR, VX_PARAMETER_STATE_REQUIRED}, + {VX_INPUT, VX_TYPE_SCALAR, VX_PARAMETER_STATE_REQUIRED}, + // Add kererl parameters here +}; +#define _GROUPNORM_PARAM_NUM _cnt_of_array( _groupnorm_kernel_param_def ) + +/* + * Kernel initializer + */ +DEF_KERNEL_INITIALIZER(_groupnorm_sum_sqr_initializer) + ( + 
vsi_nn_kernel_node_t node, + const vsi_nn_kernel_node_param_t * param, + size_t param_size + ) +{ + vsi_status status = VSI_FAILURE; + gpu_param_t gpu_param = { + 3, + {0, 0, 0}, + {0, 0, 0}, + {0, 0, 0}, + {0, 0, 0} + }; + + vsi_nn_kernel_tensor_attr_t * attr[2] = { NULL }; + vsi_int_array_t * input_shape = NULL; + int32_t width = 0; + int32_t chn = 0; + + attr[0] = vsi_nn_kernel_tensor_attr_create( (vsi_nn_kernel_tensor_t)param[0] ); + CHECK_PTR_FAIL_GOTO( attr[0], "Create tensor attr buffer fail.", final ); + attr[1] = vsi_nn_kernel_tensor_attr_create( (vsi_nn_kernel_tensor_t)param[1] ); + CHECK_PTR_FAIL_GOTO( attr[1], "Create tensor attr buffer fail.", final ); + + input_shape = attr[0]->shape; + width = input_shape->data[0]; + chn = attr[1]->shape->data[1]; + + gpu_param.global_scale[0] = 1; + gpu_param.global_scale[1] = 1; + gpu_param.global_scale[2] = 1; + gpu_param.local_size[0] = 16; + gpu_param.local_size[1] = 1; + gpu_param.local_size[2] = 1; + gpu_param.global_size[0] = (width + 15) / 16 * 16; + gpu_param.global_size[1] = chn; + gpu_param.global_size[2] = 1; + + status = vsi_nn_kernel_gpu_config( node, &gpu_param ); + CHECK_STATUS_FAIL_GOTO(status, final); + +final: + if (attr[0]) + { + vsi_nn_kernel_tensor_attr_release( &attr[0] ); + attr[0] = NULL; + } + if (attr[1]) + { + vsi_nn_kernel_tensor_attr_release( &attr[1] ); + attr[1] = NULL; + } + return status; +} /* _group_normalization_sum_sqr_initializer() */ + +DEF_KERNEL_INITIALIZER(_groupnorm_mean_vari_initializer) + ( + vsi_nn_kernel_node_t node, + const vsi_nn_kernel_node_param_t * param, + size_t param_size + ) +{ + vsi_status status = VSI_FAILURE; + gpu_param_t gpu_param = { + 3, + {0, 0, 0}, + {0, 0, 0}, + {0, 0, 0}, + {0, 0, 0} + }; + + vsi_nn_kernel_tensor_attr_t * attr[1] = { NULL }; + int32_t chn = 0; + + attr[0] = vsi_nn_kernel_tensor_attr_create( (vsi_nn_kernel_tensor_t)param[0] ); + CHECK_PTR_FAIL_GOTO( attr[0], "Create tensor attr buffer fail.", final ); + + chn = attr[0]->shape->data[1]; + + gpu_param.global_scale[0] = 4; + gpu_param.global_scale[1] = 1; + gpu_param.global_scale[2] = 1; + gpu_param.local_size[0] = 16; + gpu_param.local_size[1] = 1; + gpu_param.local_size[2] = 1; + gpu_param.global_size[0] = 16; + gpu_param.global_size[1] = chn; + gpu_param.global_size[2] = 1; + + status = vsi_nn_kernel_gpu_config( node, &gpu_param ); + CHECK_STATUS_FAIL_GOTO(status, final); + +final: + if (attr[0]) + { + vsi_nn_kernel_tensor_attr_release( &attr[0] ); + attr[0] = NULL; + } + return status; +} /* _group_normalization_sum_sqr_initializer() */ + +DEF_KERNEL_INITIALIZER(_groupnorm_initializer) + ( + vsi_nn_kernel_node_t node, + const vsi_nn_kernel_node_param_t * param, + size_t param_size + ) +{ + vsi_status status = VSI_FAILURE; + gpu_param_t gpu_param = { + 3, + {0, 0, 0}, + {0, 0, 0}, + {0, 0, 0}, + {0, 0, 0} + }; + + vsi_nn_kernel_tensor_attr_t * attr[2] = { NULL }; + vsi_int_array_t * input_shape = NULL; + int32_t width = 0; + int32_t height = 0; + int32_t chn = 0; + int32_t is2D = 0; + + attr[0] = vsi_nn_kernel_tensor_attr_create( (vsi_nn_kernel_tensor_t)param[0] ); + CHECK_PTR_FAIL_GOTO( attr[0], "Create tensor attr buffer fail.", final ); + attr[1] = vsi_nn_kernel_tensor_attr_create( (vsi_nn_kernel_tensor_t)param[3] ); + CHECK_PTR_FAIL_GOTO( attr[1], "Create tensor attr buffer fail.", final ); + + status = vsi_nn_kernel_scalar_read_int32((vsi_nn_kernel_scalar_t)param[6], &is2D); + CHECK_STATUS_FAIL_GOTO(status, final ); + + input_shape = attr[0]->shape; + width = input_shape->data[0]; + height = 
input_shape->data[1]; + chn = attr[1]->shape->data[1]; + + gpu_param.global_scale[0] = 1; + gpu_param.global_scale[1] = 1; + gpu_param.global_scale[2] = 1; + gpu_param.local_size[0] = 16; + gpu_param.local_size[1] = 1; + gpu_param.local_size[2] = 1; + gpu_param.global_size[0] = (width + 15) / 16 * 16; + gpu_param.global_size[1] = height; + gpu_param.global_size[2] = chn; + if (is2D) + { + gpu_param.global_size[0] = (width + 15) / 16 * 16; + gpu_param.global_size[1] = chn; + gpu_param.global_size[2] = 1; + } + + status = vsi_nn_kernel_gpu_config( node, &gpu_param ); + CHECK_STATUS_FAIL_GOTO(status, final); + +final: + if (attr[0]) + { + vsi_nn_kernel_tensor_attr_release( &attr[0] ); + attr[0] = NULL; + } + if (attr[1]) + { + vsi_nn_kernel_tensor_attr_release( &attr[1] ); + attr[1] = NULL; + } + return status; +} /* _groupnorm_initializer() */ + +/* + * Query kernel + */ +static vsi_status _query_kernel + ( + vsi_nn_kernel_t * kernel, + const uint32_t hashkey, + _internal_kernel_e kernel_id + /* Add extra params */ + ) +{ + vsi_status status = VSI_FAILURE; + vx_kernel_initialize_f initializer = NULL; + vx_param_description_t * param_def = NULL; + const _kernel_map_type* kernel_map; + size_t kernel_map_size = 0; + size_t param_size = 0; + uint32_t i = 0; + + switch( kernel_id ) + { + case INTERNAL_KERNEL_SUM_SQR: + initializer = _groupnorm_sum_sqr_initializer; + kernel_map = _groupnorm_sum_sqr_kernel_map; + kernel_map_size = _cnt_of_array( _groupnorm_sum_sqr_kernel_map ); + param_def = _groupnorm_sum_sqr_kernel_param_def; + param_size = _GROUPNORM_SUM_SQR_PARAM_NUM; + break; + case INTERNAL_KERNEL_MEAN_VARI: + initializer = _groupnorm_mean_vari_initializer; + kernel_map = _groupnorm_mean_vari_kernel_map; + kernel_map_size = _cnt_of_array( _groupnorm_mean_vari_kernel_map ); + param_def = _groupnorm_mean_vari_kernel_param_def; + param_size = _GROUPNORM_MEAN_VARI_PARAM_NUM; + break; + case INTERNAL_KERNEL_NORM: + initializer = _groupnorm_initializer; + kernel_map = _groupnorm_kernel_map; + kernel_map_size = _cnt_of_array( _groupnorm_kernel_map ); + param_def = _groupnorm_kernel_param_def; + param_size = _GROUPNORM_PARAM_NUM; + break; + default: + VSI_ASSERT( FALSE ); + return VSI_FAILURE; + } + + for( i = 0; i < kernel_map_size; i ++ ) + { + if ( kernel_map[i].key == hashkey ) + { + break; + } + } + if ( i < kernel_map_size ) + { + snprintf( kernel->info.name, VX_MAX_KERNEL_NAME, "%s", kernel_map[i].function_name ); + kernel->info.parameters = param_def; + kernel->info.numParams = (uint32_t)param_size; + kernel->info.initialize = initializer; + // Register code source + vsi_nn_kernel_add_source( kernel, VSI_NN_GPU_SOURCE_FMT_CODE, 2, + "eltwise_ops_helper", + kernel_map[i].source_name ); + // Register binary source + vsi_nn_kernel_add_source( kernel, VSI_NN_GPU_SOURCE_FMT_EXECUTABLE, 1, + kernel_map[i].source_name ); + status = VSI_SUCCESS; + } + + return status; +} /* _query_kernel() */ + +static int32_t _optimize_gn_shape_cl + ( + vsi_nn_tensor_t ** inputs, + int32_t group_size, + int32_t group_num, + int32_t* opt_shape, + int32_t* is2D_flg + ) +{ + vsi_status status = VSI_SUCCESS; + int32_t group_shape[VSI_NN_MAX_DIM_NUM] = {0}; + int32_t new_rank = 0; + group_shape[0] = inputs[0]->attr.size[0]; + group_shape[1] = inputs[0]->attr.size[1]; + group_shape[2] = group_size; + + vsi_nn_kernel_optimize_element_shape(group_shape, 3, opt_shape, &new_rank ); + + if (opt_shape[1] == 1) + { + opt_shape[1] = group_num; + opt_shape[2] = 1; + opt_shape[3] = inputs[0]->attr.dim_num > 3 ? 
inputs[0]->attr.size[3] : 1; + is2D_flg[0] = 1; + } + else if (new_rank == 2) + { + opt_shape[2] = group_num; + opt_shape[3] = inputs[0]->attr.dim_num > 3 ? inputs[0]->attr.size[3] : 1; + } + else + { + status = VSI_FAILURE; + } + + return status; +} + + +static vsi_nn_kernel_node_t _setup + ( + vsi_nn_graph_t * graph, + vsi_nn_tensor_t ** inputs, + size_t input_num, + vsi_nn_tensor_t ** outputs, + size_t output_num, + const vsi_nn_kernel_param_t * params, + vsi_nn_kernel_t * kernel + ) +{ +#define INTERNAL_KERNEL_SIZE (2) +#define SUM_SQR_INDEX (0) +#define MEAN_VARI_INDEX (1) + vsi_status status = VSI_FAILURE; + vsi_nn_kernel_node_param_t sum_sqr_node_params[_GROUPNORM_SUM_SQR_PARAM_NUM] = { NULL }; + vsi_nn_kernel_node_param_t mean_vari_node_params[_GROUPNORM_MEAN_VARI_PARAM_NUM] = { NULL }; + vsi_nn_kernel_node_param_t node_params[_GROUPNORM_PARAM_NUM] = { NULL }; + vsi_nn_kernel_node_t tmp_node = NULL, tmp_node1 = NULL; + vsi_nn_kernel_node_t node = NULL; + vsi_nn_kernel_dtype_e in0_dtype = U8; + vsi_nn_kernel_dtype_e out_dtype = U8; + vsi_nn_tensor_attr_t attr; + vsi_nn_kernel_t * ikernels[INTERNAL_KERNEL_SIZE] = { NULL }; + vsi_nn_tensor_t * tensors[INTERNAL_KERNEL_SIZE] = { NULL }; + vsi_nn_kernel_tensor_t rs_input = NULL, rs_output = NULL; + int32_t new_shape[VSI_NN_MAX_DIM_NUM] = { 1, 1, 1, 1 }; + int32_t is2D_flg = 0; + uint32_t hashkeys[INTERNAL_KERNEL_SIZE] = { 0 }; + uint32_t hashkey = 0; + int32_t i = 0; + float eps = vsi_nn_kernel_param_get_float32( params, "eps" ); + int32_t group_num = vsi_nn_kernel_param_get_int32( params, "group_num" ); + int32_t group_size = inputs[0]->attr.size[2] / group_num; + + int32_t width = inputs[0]->attr.size[0]; + int32_t height = inputs[0]->attr.size[1]; + int32_t group_stride = 1; + float input_zp = 0; + float input_scale = 1.0f; + int32_t input_fl = 0; + float output_zp = 0; + float output_scale = 1.0f; + int32_t output_fl = 0; + float rSpaceOrg = 1.0f / (width * height); + float group_ratio = 1.0f / (inputs[0]->attr.size[0] * inputs[0]->attr.size[1] * group_size); + + if ( !vsi_nn_kernel_gpu_check_shape( (int32_t*)outputs[0]->attr.size, + outputs[0]->attr.dim_num ) ) + { + return NULL; + } + + status = _optimize_gn_shape_cl(inputs, group_size, group_num, new_shape, &is2D_flg); + if ( VSI_SUCCESS != status ) + { + goto final; + } + rs_input = vsi_nn_kernel_tensor_reshape(inputs[0]->t, new_shape, 4); + rs_output = vsi_nn_kernel_tensor_reshape(outputs[0]->t, new_shape, 4); + + width = new_shape[0]; + height = is2D_flg > 0 ? 
1 : new_shape[1]; + group_stride = ((width + 15) / 16) * 4; + + if (inputs[0]->attr.dtype.qnt_type == VSI_NN_QNT_TYPE_AFFINE_ASYMMETRIC) + { + input_zp = (float)inputs[0]->attr.dtype.zero_point; + input_scale = inputs[0]->attr.dtype.scale; + } + else if (inputs[0]->attr.dtype.qnt_type == VSI_NN_QNT_TYPE_DFP) + { + input_fl = inputs[0]->attr.dtype.fl; + if (input_fl > 0) + { + input_scale = (1.0f / ((float) ((int64_t)1 << input_fl))); + } + else + { + input_scale = ((float) ((int64_t)1 << -input_fl)); + } + input_zp = 0.0f; + } + + if (outputs[0]->attr.dtype.qnt_type == VSI_NN_QNT_TYPE_AFFINE_ASYMMETRIC) + { + output_zp = (float)outputs[0]->attr.dtype.zero_point; + output_scale = 1.0f / outputs[0]->attr.dtype.scale; + } + else if (outputs[0]->attr.dtype.qnt_type == VSI_NN_QNT_TYPE_DFP) + { + output_fl = outputs[0]->attr.dtype.fl; + if (output_fl > 0) + { + output_scale = (float)((int64_t)1 << output_fl); + } + else + { + output_scale = (1.0f / (float)((int64_t)1 << -output_fl)); + } + output_zp = 0.0f; + } + + for( i = 0; i < INTERNAL_KERNEL_SIZE; i ++ ) + { + ikernels[i] = vsi_nn_kernel_create( VSI_NN_KERNEL_TYPE_CL ); + // Assign unique_id + ikernels[i]->unique_id = kernel->unique_id; + } + + memset( &attr, 0, sizeof(vsi_nn_tensor_attr_t) ); + attr.dtype.vx_type = VSI_NN_TYPE_FLOAT32; + attr.is_const = FALSE; + attr.vtl = TRUE; + attr.size[0] = ((new_shape[0] + 15) / 16) * 4; + attr.size[1] = group_num; + attr.size[2] = 1; + attr.size[3] = inputs[0]->attr.dim_num > 3 ? inputs[0]->attr.size[3] : 1; + attr.dim_num = 4; + tensors[SUM_SQR_INDEX] = vsi_nn_CreateTensor( graph, &attr ); + + attr.size[0] = 4; + tensors[MEAN_VARI_INDEX] = vsi_nn_CreateTensor( graph, &attr ); + + in0_dtype = vsi_nn_kernel_map_dtype( inputs[0]->attr.dtype.vx_type ); + out_dtype = vsi_nn_kernel_map_dtype( outputs[0]->attr.dtype.vx_type ); + if (in0_dtype == F16) + { + in0_dtype = F32; + } + if (out_dtype == F16) + { + out_dtype = F32; + } + + hashkeys[SUM_SQR_INDEX]= HASH_GROUPNORM_SUM_SQR_KEY( in0_dtype, F32, is2D_flg ); + hashkeys[MEAN_VARI_INDEX]= HASH_GROUPNORM_MEAN_VARI_KEY( F32, F32 ); + hashkey = HASH_GROUPNORM_KEY( in0_dtype, out_dtype, is2D_flg ); + + status = _query_kernel( ikernels[SUM_SQR_INDEX], hashkeys[SUM_SQR_INDEX], INTERNAL_KERNEL_SUM_SQR ); + if ( VSI_SUCCESS != status ) + { + goto final; + } + status = _query_kernel( ikernels[MEAN_VARI_INDEX], hashkeys[MEAN_VARI_INDEX], INTERNAL_KERNEL_MEAN_VARI ); + if ( VSI_SUCCESS != status ) + { + goto final; + } + status = _query_kernel( kernel, hashkey, INTERNAL_KERNEL_NORM ); + if ( VSI_SUCCESS != status ) + { + goto final; + } + + // Sum Sqr + tmp_node = vsi_nn_kernel_create_node( graph, ikernels[SUM_SQR_INDEX] ); + if (tmp_node) + { + uint32_t index = 0; + sum_sqr_node_params[index++] = rs_input; + sum_sqr_node_params[index++] = (vsi_nn_kernel_node_param_t)tensors[SUM_SQR_INDEX]->t; + sum_sqr_node_params[index++] = vsi_nn_kernel_scalar_create( graph, F32, &eps ); + sum_sqr_node_params[index++] = vsi_nn_kernel_scalar_create( graph, I32, &is2D_flg ); + sum_sqr_node_params[index++] = vsi_nn_kernel_scalar_create( graph, F32, &input_zp ); + sum_sqr_node_params[index++] = vsi_nn_kernel_scalar_create( graph, F32, &input_scale ); + sum_sqr_node_params[index++] = vsi_nn_kernel_scalar_create( graph, I32, &width ); + sum_sqr_node_params[index++] = vsi_nn_kernel_scalar_create( graph, I32, &height ); + + status = vsi_nn_kernel_node_pass_param( tmp_node, sum_sqr_node_params, + _GROUPNORM_SUM_SQR_PARAM_NUM ); + CHECK_STATUS(status); + vsi_nn_kernel_scalar_release( 
&sum_sqr_node_params[2] ); + vsi_nn_kernel_scalar_release( &sum_sqr_node_params[3] ); + vsi_nn_kernel_scalar_release( &sum_sqr_node_params[4] ); + vsi_nn_kernel_scalar_release( &sum_sqr_node_params[5] ); + vsi_nn_kernel_scalar_release( &sum_sqr_node_params[6] ); + vsi_nn_kernel_scalar_release( &sum_sqr_node_params[7] ); + vsi_nn_kernel_node_release( &tmp_node ); + } + + // mean vari + tmp_node1 = vsi_nn_kernel_create_node( graph, ikernels[MEAN_VARI_INDEX] ); + if (tmp_node1) + { + uint32_t index = 0; + mean_vari_node_params[index++] = (vsi_nn_kernel_node_param_t)tensors[SUM_SQR_INDEX]->t; + mean_vari_node_params[index++] = (vsi_nn_kernel_node_param_t)tensors[MEAN_VARI_INDEX]->t; + mean_vari_node_params[index++] = vsi_nn_kernel_scalar_create( graph, F32, &eps ); + mean_vari_node_params[index++] = vsi_nn_kernel_scalar_create( graph, F32, &group_ratio ); + mean_vari_node_params[index++] = vsi_nn_kernel_scalar_create( graph, I32, &group_stride ); + + status = vsi_nn_kernel_node_pass_param( tmp_node1, mean_vari_node_params, + _GROUPNORM_MEAN_VARI_PARAM_NUM ); + CHECK_STATUS(status); + vsi_nn_kernel_scalar_release( &mean_vari_node_params[2] ); + vsi_nn_kernel_scalar_release( &mean_vari_node_params[3] ); + vsi_nn_kernel_scalar_release( &mean_vari_node_params[4] ); + vsi_nn_kernel_node_release( &tmp_node1 ); + } + + // Normalization + node = vsi_nn_kernel_create_node( graph, kernel ); + if (node) + { + uint32_t index = 0; + int32_t pStride = 0; + if (!is2D_flg) + { + pStride = inputs[1]->attr.size[0] / new_shape[1]; + rSpaceOrg = 1.0f / (new_shape[0] / pStride); + } + node_params[index++] = rs_input; + node_params[index++] = (vsi_nn_kernel_node_param_t)inputs[1]->t; + node_params[index++] = (vsi_nn_kernel_node_param_t)inputs[2]->t; + node_params[index++] = (vsi_nn_kernel_node_param_t)tensors[MEAN_VARI_INDEX]->t; + node_params[index++] = rs_output; + node_params[index++] = vsi_nn_kernel_scalar_create( graph, F32, &eps ); + node_params[index++] = vsi_nn_kernel_scalar_create( graph, I32, &is2D_flg ); + node_params[index++] = vsi_nn_kernel_scalar_create( graph, F32, &input_zp ); + node_params[index++] = vsi_nn_kernel_scalar_create( graph, F32, &input_scale ); + node_params[index++] = vsi_nn_kernel_scalar_create( graph, F32, &output_zp ); + node_params[index++] = vsi_nn_kernel_scalar_create( graph, F32, &output_scale ); + node_params[index++] = vsi_nn_kernel_scalar_create( graph, F32, &rSpaceOrg ); + node_params[index++] = vsi_nn_kernel_scalar_create( graph, I32, &width ); + node_params[index++] = vsi_nn_kernel_scalar_create( graph, I32, &height ); + node_params[index++] = vsi_nn_kernel_scalar_create( graph, I32, &pStride ); + + status = vsi_nn_kernel_node_pass_param( node, node_params, + _GROUPNORM_PARAM_NUM ); + CHECK_STATUS(status); + vsi_nn_kernel_scalar_release( &node_params[5] ); + vsi_nn_kernel_scalar_release( &node_params[6] ); + vsi_nn_kernel_scalar_release( &node_params[7] ); + vsi_nn_kernel_scalar_release( &node_params[8] ); + vsi_nn_kernel_scalar_release( &node_params[9] ); + vsi_nn_kernel_scalar_release( &node_params[10] ); + vsi_nn_kernel_scalar_release( &node_params[11] ); + vsi_nn_kernel_scalar_release( &node_params[12] ); + vsi_nn_kernel_scalar_release( &node_params[13] ); + vsi_nn_kernel_scalar_release( &node_params[14] ); + } + + /* Pass parameters to node.
*/ +final: + if (rs_input) + { + vsi_nn_kernel_tensor_release( &rs_input ); + } + if (rs_output) + { + vsi_nn_kernel_tensor_release( &rs_output ); + } + for( i = 0; i < INTERNAL_KERNEL_SIZE; i ++ ) + { + if ( ikernels[i] ) + { + vsi_nn_kernel_release( &ikernels[i] ); + } + if ( tensors[i] ) + { + vsi_nn_ReleaseTensor( &tensors[i] ); + } + } +#undef INTERNAL_KERNEL_SIZE +#undef SUM_SQR_INDEX +#undef MEAN_VARI_INDEX + return node; +} /* _setup() */ + +__END_DECLS + +REGISTER_BACKEND_CL( group_norm, _setup ) + diff --git a/src/tim/vx/internal/src/kernel/cl/moments_cl.c b/src/tim/vx/internal/src/kernel/cl/moments_cl.c index d258e39..59e3efa 100644 --- a/src/tim/vx/internal/src/kernel/cl/moments_cl.c +++ b/src/tim/vx/internal/src/kernel/cl/moments_cl.c @@ -176,19 +176,19 @@ static int32_t get_moments_output_reshape_size } sizes[3] = out_dims_num > 3 ? output_size[3] : 1; - if(axis_num == 1 && axis[0] == 0) + if (axis_num == 1 && axis[0] == 0) { sizes[0] = output_size[1]; sizes[1] = out_dims_num > 2 ? output_size[2] : 1; out_rs_flg = 1; } - else if(axis_num == 1 && axis[0] == 1) + else if (axis_num == 1 && axis[0] == 1) { sizes[0] = output_size[0]; sizes[1] = out_dims_num > 2 ? output_size[2] : 1; out_rs_flg = 1; } - else if(axis_num == 2 && axis[0] == 0 && axis[1] == 1) + else if (axis_num == 2 && axis[0] == 0 && axis[1] == 1) { sizes[0] = out_dims_num > 2 ? output_size[2] : 1; out_rs_flg = 1; @@ -240,25 +240,25 @@ DEF_KERNEL_INITIALIZER(_moments_initializer) gpu_param.global_scale[0] = 1; gpu_param.global_scale[1] = 1; gpu_param.global_scale[2] = 1; - if(axis_num == 1 && axis == 0) + if (axis_num == 1 && axis == 0) { gpu_param.global_size[0] = gpu_align_p2((height + gpu_param.global_scale[0] - 1) / gpu_param.global_scale[0], 4); gpu_param.global_size[1] = chn; } - else if(axis_num == 1 && axis == 1) + else if (axis_num == 1 && axis == 1) { gpu_param.global_size[0] = gpu_align_p2((width + gpu_param.global_scale[0] - 1) / gpu_param.global_scale[0], 4); gpu_param.global_size[1] = chn; } - else if(axis_num == 1 && axis == 2) + else if (axis_num == 1 && axis == 2) { gpu_param.global_size[0] = gpu_align_p2((width + gpu_param.global_scale[0] - 1) / gpu_param.global_scale[0], 4); gpu_param.global_size[1] = height; } - else if(axis_num == 2) + else if (axis_num == 2) { gpu_param.local_size[0] = 16; gpu_param.local_size[1] = 1; @@ -266,7 +266,7 @@ DEF_KERNEL_INITIALIZER(_moments_initializer) gpu_param.global_size[0] = 16; gpu_param.global_size[1] = chn; } - else if(axis_num == 3) + else if (axis_num == 3) { gpu_param.local_size[0] = 16; gpu_param.local_size[1] = 1; @@ -315,13 +315,13 @@ static vsi_status _query_kernel for( i = 0; i < _cnt_of_array(moments_map); i ++ ) { - if( moments_map[i].key == key ) + if ( moments_map[i].key == key ) { break; } } - if( i < _cnt_of_array(moments_map) ) + if ( i < _cnt_of_array(moments_map) ) { snprintf( kernel->info.name, VX_MAX_KERNEL_NAME, "%s", moments_map[i].function_name ); kernel->info.parameters = _moments_kernel_param_def; @@ -354,6 +354,7 @@ static vsi_nn_kernel_node_t _setup vsi_nn_kernel_node_param_t node_params[_MOMENTS_PARAM_NUM] = { NULL }; vsi_nn_kernel_node_t node = NULL; int32_t out_shape[VSI_NN_MAX_DIM_NUM] = {0}; + int32_t shape[VSI_NN_MAX_DIM_NUM] = {0}; int32_t out_rs_flg = 0; int32_t axis_num = 0; size_t axis_num_temp = 0; @@ -362,6 +363,7 @@ static vsi_nn_kernel_node_t _setup int32_t first_axis = axis[0]; int32_t i = 0; vsi_nn_kernel_scalar_t scalar_list[INTERNAL_MOMENTS_SCALAR_NUM] = {NULL}; + vsi_nn_kernel_tensor_t reshape_tensors[3] = { NULL 
}; int32_t width = inputs[0]->attr.size[0]; int32_t height = inputs[0]->attr.size[1]; @@ -372,7 +374,7 @@ static vsi_nn_kernel_node_t _setup axis_num = (int32_t)axis_num_temp; - if(inputs[0]->attr.dtype.qnt_type == VSI_NN_QNT_TYPE_DFP) + if (inputs[0]->attr.dtype.qnt_type == VSI_NN_QNT_TYPE_DFP) { if (inputs[0]->attr.dtype.fl > 0) { @@ -385,38 +387,52 @@ static vsi_nn_kernel_node_t _setup input_zp = 0; } - if(axis_num == 1 && axis[0] == 0) + if (axis_num == 1 && axis[0] == 0) { dim_ratio = (float)1.0 / (float)(width); } - else if(axis_num == 1 && axis[0] == 1) + else if (axis_num == 1 && axis[0] == 1) { dim_ratio = (float)1.0 / (float)(height); } - else if(axis_num == 1 && axis[0] == 2) + else if (axis_num == 1 && axis[0] == 2) { dim_ratio = (float)1.0 / (float)(chn); } - else if(axis_num == 2 && axis[0] == 0 && axis[1] == 1) + else if (axis_num == 2 && axis[0] == 0 && axis[1] == 1) { dim_ratio = (float)1.0 / (float)(width * height); } - else if(axis_num == 3) + else if (axis_num == 3) { dim_ratio = (float)1.0 / (float)(width * height * chn); } - if( !vsi_nn_kernel_gpu_check_shape( (int32_t*)outputs[0]->attr.size, + if ( !vsi_nn_kernel_gpu_check_shape( (int32_t*)outputs[0]->attr.size, outputs[0]->attr.dim_num ) ) { return NULL; } - if(keep_dim) + if (keep_dim) { out_rs_flg = get_moments_output_reshape_size(&outputs[0], out_shape, axis, axis_num); } + if (inputs[0]->attr.dim_num < 2) + { + shape[0] = inputs[0]->attr.size[0]; + shape[1] = 1; + reshape_tensors[0] = vsi_nn_kernel_tensor_reshape( inputs[0]->t, shape, 2 ); + } + if (outputs[0]->attr.dim_num < 2) + { + shape[0] = outputs[0]->attr.size[0]; + shape[1] = 1; + reshape_tensors[1] = vsi_nn_kernel_tensor_reshape( outputs[0]->t, shape, 2 ); + reshape_tensors[2] = vsi_nn_kernel_tensor_reshape( outputs[1]->t, shape, 2 ); + } + scalar_list[AXIS] = vsi_nn_kernel_scalar_create( graph, I32, &first_axis ); scalar_list[AXIS_NUM] = vsi_nn_kernel_scalar_create( graph, I32, &axis_num ); scalar_list[ZP] = vsi_nn_kernel_scalar_create( graph, I32, &input_zp ); @@ -427,19 +443,31 @@ static vsi_nn_kernel_node_t _setup scalar_list[DIMRATIO] = vsi_nn_kernel_scalar_create( graph, F32, &dim_ratio ); status = _query_kernel( inputs, outputs, kernel, params, axis, axis_num, 0 ); - if( VSI_SUCCESS == status) + if ( VSI_SUCCESS == status) { node = vsi_nn_kernel_create_node( graph, kernel ); - if( node ) + if ( node ) { uint32_t index = 0; /* Pass parameters to node. 
*/ - node_params[index++] = (vsi_nn_kernel_node_param_t)(inputs[0]->t); - if(out_rs_flg) + if (reshape_tensors[0]) + { + node_params[index++] = reshape_tensors[0]; + } + else + { + node_params[index++] = (vsi_nn_kernel_node_param_t)(inputs[0]->t); + } + if (out_rs_flg) { node_params[index++] = vsi_nn_kernel_tensor_reshape( outputs[0]->t, out_shape, 4 ); node_params[index++] = vsi_nn_kernel_tensor_reshape( outputs[1]->t, out_shape, 4 ); } + else if (reshape_tensors[1]) + { + node_params[index++] = reshape_tensors[1]; + node_params[index++] = reshape_tensors[2]; + } else { node_params[index++] = (vsi_nn_kernel_node_param_t)(outputs[0]->t); @@ -455,7 +483,7 @@ static vsi_nn_kernel_node_t _setup node_params[index++] = scalar_list[DIMRATIO]; status = vsi_nn_kernel_node_pass_param( node, node_params, _MOMENTS_PARAM_NUM ); CHECK_STATUS(status); - if(out_rs_flg) + if (out_rs_flg) { vsi_nn_kernel_tensor_release( &node_params[1] ); vsi_nn_kernel_tensor_release( &node_params[2] ); @@ -465,10 +493,22 @@ static vsi_nn_kernel_node_t _setup } } + if (reshape_tensors[0]) + { + vsi_nn_kernel_tensor_release( &reshape_tensors[0] ); + } + if (reshape_tensors[1]) + { + vsi_nn_kernel_tensor_release( &reshape_tensors[1] ); + } + if (reshape_tensors[2]) + { + vsi_nn_kernel_tensor_release( &reshape_tensors[2] ); + } /* Pass parameters to node. */ for( i = 0; i < INTERNAL_MOMENTS_SCALAR_NUM; i ++ ) { - if(scalar_list[i]) + if (scalar_list[i]) { vsi_nn_kernel_scalar_release( &scalar_list[i] ); } diff --git a/src/tim/vx/internal/src/kernel/cl/one_hot_cl.c b/src/tim/vx/internal/src/kernel/cl/one_hot_cl.c new file mode 100644 index 0000000..bfbb653 --- /dev/null +++ b/src/tim/vx/internal/src/kernel/cl/one_hot_cl.c @@ -0,0 +1,332 @@ +/**************************************************************************** +* +* Copyright (c) 2020 Vivante Corporation +* +* Permission is hereby granted, free of charge, to any person obtaining a +* copy of this software and associated documentation files (the "Software"), +* to deal in the Software without restriction, including without limitation +* the rights to use, copy, modify, merge, publish, distribute, sublicense, +* and/or sell copies of the Software, and to permit persons to whom the +* Software is furnished to do so, subject to the following conditions: +* +* The above copyright notice and this permission notice shall be included in +* all copies or substantial portions of the Software. +* +* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +* AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING +* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER +* DEALINGS IN THE SOFTWARE. +* +*****************************************************************************/ + + +#include +#include +#include +#include "vsi_nn_types.h" +#include "vsi_nn_tensor.h" +#include "vsi_nn_graph.h" +#include "vsi_nn_log.h" +#include "vsi_nn_error.h" +#include "vsi_nn_prv.h" +#include "vsi_nn_tensor_util.h" +#include "utils/vsi_nn_util.h" +#include "kernel/vsi_nn_kernel.h" +#include "utils/vsi_nn_dtype_util.h" + +__BEGIN_DECLS + +/* + * Define kernel meta. 
+ */ +typedef enum +{ + INTERNAL_KERNEL_ONE_HOT, +} _internal_kernel_e; + +#define _ONE_HOT_KERNEL_SOURCE "one_hot" + +// Add kernel hashtable here +#define HASH_ONE_HOT_SH_KERNEL_NAME(SRC_TYPE, DST_TYPE) \ + CVIVANTE_NAMESPACE("cl.one_hot_"#SRC_TYPE"to"#DST_TYPE) + +#define ONE_HOT_HASH_KEY( IN_DTYPE, OUT_DTYPE ) \ + (( IN_DTYPE << 8 ) | ( OUT_DTYPE )) + +#define PACK_ONE_HOT_KERNEL_MAP( IN_DTYPE, OUT_DTYPE ) \ +{ ONE_HOT_HASH_KEY( IN_DTYPE, OUT_DTYPE ), \ + HASH_ONE_HOT_SH_KERNEL_NAME( IN_DTYPE, OUT_DTYPE ), \ + _ONE_HOT_KERNEL_SOURCE } + +typedef struct +{ + uint32_t key; + char * function_name; + const char * source_name; +} _kernel_map_type; + +static const _kernel_map_type _one_hot_kernel_map[] = +{ + // Register kernel here + PACK_ONE_HOT_KERNEL_MAP( F32, F32 ), + PACK_ONE_HOT_KERNEL_MAP( I32, I32 ), + PACK_ONE_HOT_KERNEL_MAP( I32, F32 ), + PACK_ONE_HOT_KERNEL_MAP( I32, U8 ), + PACK_ONE_HOT_KERNEL_MAP( U8, U8 ), +}; + + +/* + * Kernel params + */ +static vx_param_description_t _one_hot_kernel_param_def[] = +{ + {VX_INPUT, VX_TYPE_TENSOR, VX_PARAMETER_STATE_REQUIRED}, + {VX_OUTPUT, VX_TYPE_TENSOR, VX_PARAMETER_STATE_REQUIRED}, + {VX_INPUT, VX_TYPE_SCALAR, VX_PARAMETER_STATE_REQUIRED}, + {VX_INPUT, VX_TYPE_SCALAR, VX_PARAMETER_STATE_REQUIRED}, + {VX_INPUT, VX_TYPE_SCALAR, VX_PARAMETER_STATE_REQUIRED}, + {VX_INPUT, VX_TYPE_SCALAR, VX_PARAMETER_STATE_REQUIRED}, + {VX_INPUT, VX_TYPE_SCALAR, VX_PARAMETER_STATE_REQUIRED}, +}; +#define SCALAR_INPUT_DEPTH (2) +#define SCALAR_INPUT_ON_VALUE (3) +#define SCALAR_INPUT_OFF_VALUE (4) +#define SCALAR_INPUT_SCALE (5) +#define SCALAR_INPUT_TAIL (6) +#define _ONE_HOT_PARAM_NUM _cnt_of_array( _one_hot_kernel_param_def ) + +/* + * Kernel initializer + */ +DEF_KERNEL_INITIALIZER(_one_hot_initializer) + ( + vsi_nn_kernel_node_t node, + const vsi_nn_kernel_node_param_t * param, + size_t param_size + ) +{ + gpu_param_t gpu_param = { + 2, // workdim + {0, 0, 0}, // globalWorkOffset: control the start location be processed in the image + {0, 0, 0}, // globalWorkScale: how many pixels could be processed by a single thread + {0, 0, 0}, // localWorkSize: local group size in thread + {0, 0, 0} // globalWorkSize: image size in thread + }; + + vsi_status status = VSI_FAILURE; + vsi_nn_kernel_tensor_attr_t * attr[2] = { NULL }; + vsi_int_array_t * in_shape = NULL; + + attr[0] = vsi_nn_kernel_tensor_attr_create( (vsi_nn_kernel_tensor_t)param[0] ); + CHECK_PTR_FAIL_GOTO( attr[0], "Create tensor attr buffer fail.", final ); + attr[1] = vsi_nn_kernel_tensor_attr_create( (vsi_nn_kernel_tensor_t)param[1] ); + CHECK_PTR_FAIL_GOTO( attr[1], "Create tensor attr buffer fail.", final ); + + in_shape = attr[0]->shape; + + gpu_param.global_scale[0] = 1; + gpu_param.global_scale[1] = 1; + gpu_param.global_size[0] = gpu_align_p2( + (in_shape->data[0] + gpu_param.global_scale[0] - 1) + / gpu_param.global_scale[0], 4); + gpu_param.global_size[1] = in_shape->data[1]; + + status = vsi_nn_kernel_gpu_config( node, &gpu_param ); + +final: +#define SAFE_FREE_TENSOR_ATTR(_PTR) if ( _PTR ) { vsi_nn_kernel_tensor_attr_release( &_PTR ); _PTR = NULL; } + SAFE_FREE_TENSOR_ATTR(attr[0]); + SAFE_FREE_TENSOR_ATTR(attr[1]); +#undef SAFE_FREE_TENSOR_ATTR + + return status; +} /* _one_hot_initializer() */ + + +/* + * Query kernel + */ +static vsi_status _query_kernel + ( + vsi_nn_kernel_t * kernel, + vsi_nn_tensor_t * const * const inputs, + vsi_nn_tensor_t * const * const outputs + /* Add extra params */ + ) +{ + vsi_status status = VSI_FAILURE; + vsi_nn_kernel_dtype_e in_dtype; + 
vsi_nn_kernel_dtype_e out_dtype; + const _kernel_map_type * kernel_map = _one_hot_kernel_map; + size_t kernel_map_size = _cnt_of_array( _one_hot_kernel_map ); + vx_param_description_t * param_def = _one_hot_kernel_param_def; + vx_kernel_initialize_f initializer = _one_hot_initializer; + + uint32_t key; + uint32_t i; + + in_dtype = vsi_nn_kernel_map_dtype( inputs[0]->attr.dtype.vx_type ); + out_dtype = vsi_nn_kernel_map_dtype( outputs[0]->attr.dtype.vx_type ); + + if (in_dtype == F16) + { + in_dtype = F32; + } + + if (out_dtype == F16) + { + out_dtype = F32; + } + else if (out_dtype == I16 || out_dtype == I8) + { + out_dtype = I32; + } + + key = ONE_HOT_HASH_KEY( in_dtype, out_dtype ); + + for ( i = 0; i < (uint32_t)kernel_map_size; i ++ ) + { + if ( kernel_map[i].key == key ) + { + break; + } + } + if ( i < (uint32_t)kernel_map_size ) + { + snprintf( kernel->info.name, VX_MAX_KERNEL_NAME, "%s", kernel_map[i].function_name ); + kernel->info.parameters = param_def; + kernel->info.numParams = _cnt_of_array( _one_hot_kernel_param_def ); + kernel->info.initialize = initializer; + // Register code source + vsi_nn_kernel_add_source( kernel, VSI_NN_GPU_SOURCE_FMT_CODE, 1, + kernel_map[i].source_name ); + // Register binary source + vsi_nn_kernel_add_source( kernel, VSI_NN_GPU_SOURCE_FMT_EXECUTABLE, 1, + kernel_map[i].source_name ); + status = VSI_SUCCESS; + } + return status; +} /* _query_kernel() */ + + +static vsi_nn_kernel_node_t _setup + ( + vsi_nn_graph_t * graph, + vsi_nn_tensor_t ** inputs, + size_t input_num, + vsi_nn_tensor_t ** outputs, + size_t output_num, + const vsi_nn_kernel_param_t * params, + vsi_nn_kernel_t * kernel + ) +{ + vsi_status status = VSI_FAILURE; + vsi_nn_kernel_node_param_t node_params[_ONE_HOT_PARAM_NUM] = {NULL}; + vsi_nn_kernel_node_t node = NULL; + vsi_nn_tensor_t* rs_tensors[2] = { NULL }; + int32_t shape[2][VSI_NN_MAX_DIM_NUM] = {{ 0 }}; + int32_t i = 0; + int32_t num_elements = vsi_nn_vxGetTensorElementNum(&inputs[0]->attr); + int32_t prefix_dim_size = 1; + int32_t suffix_dim_size = 0; + int32_t depth = vsi_nn_kernel_param_get_int32( params, "depth" ); + vsi_nn_kernel_dtype_e out_dtype; + uint32_t data[2] = {0}; + float on_value = vsi_nn_kernel_param_get_float32( params, "on_value" ); + float off_value = vsi_nn_kernel_param_get_float32( params, "off_value" ); + int32_t axis = vsi_nn_kernel_param_get_int32( params, "axis" ); + float inputScale = inputs[0]->attr.dtype.scale; + float inputTail = (float)inputs[0]->attr.dtype.zero_point * inputScale; + + out_dtype = vsi_nn_kernel_map_dtype( outputs[0]->attr.dtype.vx_type ); + + if (out_dtype != F32 && out_dtype != F16) + { + vsi_nn_Float32ToDtype(on_value, (uint8_t*)&data[0], &outputs[0]->attr.dtype); + vsi_nn_Float32ToDtype(off_value, (uint8_t*)&data[1], &outputs[0]->attr.dtype); + } + else + { + data[0] = *(uint32_t*)&on_value; + data[1] = *(uint32_t*)&off_value; + } + + axis = axis == -1 ? 
(int32_t)inputs[0]->attr.dim_num : (int32_t)inputs[0]->attr.dim_num - axis; + for (i = 0; i < axis; i++) + { + prefix_dim_size *= inputs[0]->attr.size[i]; + } + + suffix_dim_size = num_elements / prefix_dim_size; + + shape[0][0] = suffix_dim_size; + shape[0][1] = prefix_dim_size; + shape[1][0] = suffix_dim_size; + shape[1][1] = depth; + shape[1][2] = prefix_dim_size; + + rs_tensors[0] = vsi_nn_reshape_tensor( graph, + inputs[0], (uint32_t*)shape[0], 2 ); + rs_tensors[1] = vsi_nn_reshape_tensor( graph, + outputs[0], (uint32_t*)shape[1], 3 ); + + if ( !vsi_nn_kernel_gpu_check_shape( (int32_t*)rs_tensors[1]->attr.size, + rs_tensors[1]->attr.dim_num ) ) + { + return NULL; + } + + status = _query_kernel( kernel, inputs, outputs ); + if ( VSI_SUCCESS == status) + { + node = vsi_nn_kernel_create_node( graph, kernel ); + if ( node ) + { + /* Set inputs and outputs */ + vsi_nn_kernel_node_pack_io( node_params, _ONE_HOT_PARAM_NUM, + &rs_tensors[0], input_num, &rs_tensors[1], output_num ); + node_params[SCALAR_INPUT_DEPTH] = vsi_nn_kernel_scalar_create( + graph, I32, &depth ); + node_params[SCALAR_INPUT_ON_VALUE] = vsi_nn_kernel_scalar_create( + graph, U32, &data[0] ); + node_params[SCALAR_INPUT_OFF_VALUE] = vsi_nn_kernel_scalar_create( + graph, U32, &data[1] ); + node_params[SCALAR_INPUT_SCALE] = vsi_nn_kernel_scalar_create( + graph, F32, &inputScale ); + node_params[SCALAR_INPUT_TAIL] = vsi_nn_kernel_scalar_create( + graph, F32, &inputTail ); + /* Pass parameters to node. */ + status = vsi_nn_kernel_node_pass_param( node, node_params, _ONE_HOT_PARAM_NUM ); + CHECK_STATUS_FAIL_GOTO( status, final ); + } + } + +final: + if (rs_tensors[0]) + { + vsi_nn_ReleaseTensor( &rs_tensors[0] ); + } + + if (rs_tensors[1]) + { + vsi_nn_ReleaseTensor( &rs_tensors[1] ); + } + + for (i = SCALAR_INPUT_DEPTH; i < _ONE_HOT_PARAM_NUM; i++) + { + if (node_params[i]) + { + vsi_nn_kernel_scalar_release( &node_params[i] ); + } + } + + return node; +} /* _setup() */ + +__END_DECLS + +REGISTER_BACKEND_CL( one_hot, _setup ) diff --git a/src/tim/vx/internal/src/kernel/cl/reducemax_internal_cl.c b/src/tim/vx/internal/src/kernel/cl/reducemax_internal_cl.c index ee5b0a4..6f112bd 100644 --- a/src/tim/vx/internal/src/kernel/cl/reducemax_internal_cl.c +++ b/src/tim/vx/internal/src/kernel/cl/reducemax_internal_cl.c @@ -178,11 +178,19 @@ static vsi_status _query_kernel { in_dtype = F32; } + else if (I16 == in_dtype && inputs[0]->attr.dtype.qnt_type == VSI_NN_QNT_TYPE_NONE) + { + in_dtype = I32; + } if (F16 == out_dtype) { out_dtype = F32; } + else if (I16 == out_dtype && outputs[0]->attr.dtype.qnt_type == VSI_NN_QNT_TYPE_NONE) + { + out_dtype = I32; + } key = HASH_REDUCEMAX_HASH_KEY( axis, in_dtype, out_dtype, image_2d ); diff --git a/src/tim/vx/internal/src/kernel/cl/repeat_cl.c b/src/tim/vx/internal/src/kernel/cl/repeat_cl.c new file mode 100644 index 0000000..d133782 --- /dev/null +++ b/src/tim/vx/internal/src/kernel/cl/repeat_cl.c @@ -0,0 +1,407 @@ +/**************************************************************************** +* +* Copyright (c) 2019 Vivante Corporation +* +* Permission is hereby granted, free of charge, to any person obtaining a +* copy of this software and associated documentation files (the "Software"), +* to deal in the Software without restriction, including without limitation +* the rights to use, copy, modify, merge, publish, distribute, sublicense, +* and/or sell copies of the Software, and to permit persons to whom the +* Software is furnished to do so, subject to the following conditions: +* +* The above 
copyright notice and this permission notice shall be included in +* all copies or substantial portions of the Software. +* +* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +* AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING +* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER +* DEALINGS IN THE SOFTWARE. +* +*****************************************************************************/ + + +#include +#include +#include +#include "vsi_nn_types.h" +#include "vsi_nn_tensor.h" +#include "vsi_nn_graph.h" +#include "vsi_nn_log.h" +#include "vsi_nn_prv.h" +#include "vsi_nn_error.h" +#include "vsi_nn_tensor_util.h" +#include "utils/vsi_nn_util.h" +#include "kernel/vsi_nn_kernel.h" +#include "kernel/vsi_nn_kernel_eltwise.h" +#include "kernel/vsi_nn_kernel_gpu_shape_optimize.h" + +__BEGIN_DECLS + +/* + * Define kernel meta. + */ +#define KERNEL_SOURCE_1 "repeat" + +// Add kernel hashtable here + +#define HASH_REPEAT_KERNEL_NAME(SRC0_TYPE, AXIS) \ + CVIVANTE_NAMESPACE("cl.repeat_"#SRC0_TYPE"_axis"#AXIS) + +#define HASH_REPEAT_KERNEL_1D_NAME(SRC0_TYPE) \ + CVIVANTE_NAMESPACE("cl.repeat_"#SRC0_TYPE"_1D") + +// Add kernel hashtable here +#define HASH_REPEAT_KEY(_input0_type, _output_type, _is1d, _axis) \ + ((_input0_type << 24) | (_output_type << 16) | (_is1d << 8) | _axis) + +#define TENSOR_REPEAT_KERNELS(IN0_TYPE, OUT_TYPE, AXIS, SOURCE) \ + { HASH_REPEAT_KEY(IN0_TYPE, OUT_TYPE, 0, AXIS), \ + HASH_REPEAT_KERNEL_NAME(IN0_TYPE, AXIS), \ + SOURCE }, + +#define TENSOR_REPEAT_1D_KERNELS(IN0_TYPE, OUT_TYPE, SOURCE) \ + { HASH_REPEAT_KEY(IN0_TYPE, OUT_TYPE, 1, 0), \ + HASH_REPEAT_KERNEL_1D_NAME(IN0_TYPE), \ + SOURCE }, + +typedef struct +{ + uint32_t key; + char * function_name; + const char * source_name; +} _kernel_map_type; + +static const _kernel_map_type _repeat_kernel_map[] = +{ + // Register kernel here + TENSOR_REPEAT_KERNELS( I32, I32, 0, KERNEL_SOURCE_1 ) + TENSOR_REPEAT_KERNELS( I32, I32, 1, KERNEL_SOURCE_1 ) + TENSOR_REPEAT_KERNELS( I32, I32, 2, KERNEL_SOURCE_1 ) + TENSOR_REPEAT_KERNELS( F32, F32, 0, KERNEL_SOURCE_1 ) + TENSOR_REPEAT_KERNELS( F32, F32, 1, KERNEL_SOURCE_1 ) + TENSOR_REPEAT_KERNELS( F32, F32, 2, KERNEL_SOURCE_1 ) + + TENSOR_REPEAT_1D_KERNELS( I32, I32, KERNEL_SOURCE_1 ) + TENSOR_REPEAT_1D_KERNELS( F32, F32, KERNEL_SOURCE_1 ) +}; + +/* + * Kernel params + */ +static vx_param_description_t _repeat_kernel_param_def[] = +{ + {VX_INPUT, VX_TYPE_TENSOR, VX_PARAMETER_STATE_REQUIRED}, + {VX_INPUT, VX_TYPE_TENSOR, VX_PARAMETER_STATE_REQUIRED}, + {VX_OUTPUT, VX_TYPE_TENSOR, VX_PARAMETER_STATE_REQUIRED}, + {VX_INPUT, VX_TYPE_SCALAR, VX_PARAMETER_STATE_REQUIRED}, + {VX_INPUT, VX_TYPE_SCALAR, VX_PARAMETER_STATE_REQUIRED}, + {VX_INPUT, VX_TYPE_SCALAR, VX_PARAMETER_STATE_REQUIRED}, + {VX_INPUT, VX_TYPE_SCALAR, VX_PARAMETER_STATE_REQUIRED}, + // Add kernel parameters here +}; +#define _REPEAT_PARAM_NUM _cnt_of_array( _repeat_kernel_param_def ) + +/* + * Kernel initializer + */ +DEF_KERNEL_INITIALIZER(_repeat_initializer) + ( + vsi_nn_kernel_node_t node, + const vsi_nn_kernel_node_param_t * param, + size_t param_size + ) +{ + vsi_status status = VSI_FAILURE; + gpu_param_t gpu_param = { + 3, + {0, 0, 0}, + {0, 0, 0}, + {0, 0, 0}, + {0, 0, 0} + }; + + vsi_nn_kernel_tensor_attr_t * attr[1] = { NULL }; +
vsi_int_array_t * input_shape = NULL; + int32_t height = 0, width = 0, chn = 0; + int32_t is1d = 0; + int32_t axis = 0; + + attr[0] = vsi_nn_kernel_tensor_attr_create( (vsi_nn_kernel_tensor_t)param[0] ); + CHECK_PTR_FAIL_GOTO( attr[0], "Create tensor attr buffer fail.", final ); + status = vsi_nn_kernel_scalar_read_int32((vsi_nn_kernel_scalar_t)param[6], &axis); + CHECK_STATUS_FAIL_GOTO(status, final ); + + input_shape = attr[0]->shape; + width = input_shape->data[0]; + height = input_shape->data[1]; + if (height == 1 && input_shape->size == 2) + { + is1d = 1; + } + chn = input_shape->size > 2 ? input_shape->data[2] : 1; + + gpu_param.global_scale[0] = 1; + gpu_param.global_scale[1] = 1; + gpu_param.global_scale[2] = 1; + gpu_param.global_size[0] = width; + gpu_param.global_size[1] = height; + gpu_param.global_size[2] = chn; + if (is1d || axis == 1) + { + gpu_param.global_size[0] = 1; + } + else if (axis == 0) + { + gpu_param.global_size[1] = 1; + } + else if (axis == 2) + { + gpu_param.global_size[2] = 1; + } + + status = vsi_nn_kernel_gpu_config( node, &gpu_param ); + CHECK_STATUS_FAIL_GOTO(status, final); + +final: + if (attr[0]) + { + vsi_nn_kernel_tensor_attr_release( &attr[0] ); + attr[0] = NULL; + } + return status; +} /* _repeat_initializer() */ + +/* + * Query kernel + */ +static vsi_status _query_kernel + ( + vsi_nn_kernel_t * kernel, + vsi_nn_tensor_t * const * const inputs, + vsi_nn_tensor_t * const * const outputs, + int32_t axis + ) +{ + vsi_status status = VSI_FAILURE; + vsi_nn_kernel_dtype_e input0_dtype = U8; + vsi_nn_kernel_dtype_e output_dtype = U8; + int32_t is1d = inputs[0]->attr.dim_num == 1 ? 1 : 0; + uint32_t key = 0; + int i = 0; + + input0_dtype = vsi_nn_kernel_map_dtype( inputs[0]->attr.dtype.vx_type ); + output_dtype = vsi_nn_kernel_map_dtype( outputs[0]->attr.dtype.vx_type ); + + if (input0_dtype == F16) + { + input0_dtype = F32; + } + if (output_dtype == F16) + { + output_dtype = F32; + } + + key = HASH_REPEAT_KEY( input0_dtype, output_dtype, is1d, axis ); + + for( i = 0; i < _cnt_of_array(_repeat_kernel_map); i ++ ) + { + if ( _repeat_kernel_map[i].key == key ) + { + break; + } + } + + if ( i < _cnt_of_array(_repeat_kernel_map) ) + { + snprintf( kernel->info.name, VX_MAX_KERNEL_NAME, "%s", _repeat_kernel_map[i].function_name ); + kernel->info.parameters = _repeat_kernel_param_def; + kernel->info.numParams = _REPEAT_PARAM_NUM; + kernel->info.initialize = _repeat_initializer; + // Register code source + vsi_nn_kernel_add_source( kernel, VSI_NN_GPU_SOURCE_FMT_CODE, 2, + "eltwise_ops_helper", + _repeat_kernel_map[i].source_name ); + // Register binary source + vsi_nn_kernel_add_source( kernel, VSI_NN_GPU_SOURCE_FMT_EXECUTABLE, 1, + _repeat_kernel_map[i].source_name ); + status = VSI_SUCCESS; + } + return status; +} /* _query_kernel() */ + +static int32_t _optimize_repeat_shape + ( + vsi_nn_tensor_t ** inputs, + vsi_nn_tensor_t ** outputs, + int32_t* axis, + int32_t* opt_shape_in, + int32_t* opt_shape_out, + int32_t* new_rank + ) +{ + vsi_status status = VSI_SUCCESS; + + if (inputs[0]->attr.dim_num == 1) + { + opt_shape_in[0] = inputs[0]->attr.size[0]; + opt_shape_in[1] = 1; + opt_shape_out[0] = outputs[0]->attr.size[0]; + opt_shape_out[1] = 1; + new_rank[0] = 2; + new_rank[1] = 2; + } + else if (axis[0] == 3) + { + vsi_nn_kernel_optimize_element_shape( (int32_t*)inputs[0]->attr.size, 3, opt_shape_in, new_rank ); + if (opt_shape_in[1] == 1) + { + opt_shape_in[1] = inputs[0]->attr.size[3]; + opt_shape_out[0] = opt_shape_in[0]; + opt_shape_out[1] = 
outputs[0]->attr.size[3]; + axis[0] = 0; + new_rank[0] = 2; + new_rank[1] = 2; + } + else if (new_rank[0] == 2) + { + opt_shape_in[2] = inputs[0]->attr.size[3]; + opt_shape_out[0] = opt_shape_in[0]; + opt_shape_out[1] = opt_shape_in[1]; + opt_shape_out[2] = outputs[0]->attr.size[3]; + axis[0] = 2; + new_rank[0] = 3; + new_rank[1] = 3; + } + else + { + status = VSI_FAILURE; + } + } + + return status; +} + + +static vsi_nn_kernel_node_t _setup + ( + vsi_nn_graph_t * graph, + vsi_nn_tensor_t ** inputs, + size_t input_num, + vsi_nn_tensor_t ** outputs, + size_t output_num, + const vsi_nn_kernel_param_t * params, + vsi_nn_kernel_t * kernel + ) +{ + vsi_status status = VSI_FAILURE; + vsi_nn_kernel_node_param_t node_params[_REPEAT_PARAM_NUM] = { NULL }; + vsi_nn_kernel_node_t node = NULL; + vsi_nn_kernel_tensor_t rs_input = NULL, rs_input1 = NULL, rs_output = NULL; + int32_t new_shape[2][VSI_NN_MAX_DIM_NUM] = {{ 1, 1, 1, 1 }, { 1, 1, 1, 1 }}; + int32_t new_rank[2] = {0, 0}; + int32_t axis = vsi_nn_kernel_param_get_int32( params, "axis" ); + + int32_t width = inputs[0]->attr.size[0]; + int32_t height = inputs[0]->attr.dim_num > 1 ? inputs[0]->attr.size[1] : 1; + int32_t channel = inputs[0]->attr.dim_num > 2 ? inputs[0]->attr.size[2] : 1; + + if ( !vsi_nn_kernel_gpu_check_shape( (int32_t*)outputs[0]->attr.size, + outputs[0]->attr.dim_num ) ) + { + return NULL; + } + + if (axis > 2 || outputs[0]->attr.dim_num == 1) + { + status = _optimize_repeat_shape(inputs, outputs, &axis, new_shape[0], new_shape[1], new_rank); + if ( VSI_SUCCESS != status ) + { + goto final; + } + rs_input = vsi_nn_kernel_tensor_reshape(inputs[0]->t, new_shape[0], new_rank[0]); + rs_output = vsi_nn_kernel_tensor_reshape(outputs[0]->t, new_shape[1], new_rank[1]); + + width = new_shape[0][0]; + height = new_shape[0][1]; + channel = new_rank[0] > 2 ? new_shape[0][2]: 1; + } + + if (inputs[1]->attr.dim_num == 1) + { + new_shape[0][0] = inputs[1]->attr.size[0]; + new_shape[0][1] = 1; + rs_input1 = vsi_nn_kernel_tensor_reshape(inputs[1]->t, new_shape[0], 2); + } + + status = _query_kernel( kernel, inputs, outputs, axis ); + if ( VSI_SUCCESS != status ) + { + goto final; + } + + node = vsi_nn_kernel_create_node( graph, kernel ); + if (node) + { + uint32_t index = 0; + if (rs_input) + { + node_params[index++] = rs_input; + } + else + { + node_params[index++] = (vsi_nn_kernel_node_param_t)inputs[0]->t; + } + if (rs_input1) + { + node_params[index++] = rs_input1; + } + else + { + node_params[index++] = (vsi_nn_kernel_node_param_t)inputs[1]->t; + } + if (rs_output) + { + node_params[index++] = rs_output; + } + else + { + node_params[index++] = (vsi_nn_kernel_node_param_t)outputs[0]->t; + } + node_params[index++] = vsi_nn_kernel_scalar_create( graph, I32, &width ); + node_params[index++] = vsi_nn_kernel_scalar_create( graph, I32, &height ); + node_params[index++] = vsi_nn_kernel_scalar_create( graph, I32, &channel ); + node_params[index++] = vsi_nn_kernel_scalar_create( graph, I32, &axis ); + + status = vsi_nn_kernel_node_pass_param( node, node_params, + _REPEAT_PARAM_NUM ); + CHECK_STATUS(status); + vsi_nn_kernel_scalar_release( &node_params[3] ); + vsi_nn_kernel_scalar_release( &node_params[4] ); + vsi_nn_kernel_scalar_release( &node_params[5] ); + vsi_nn_kernel_scalar_release( &node_params[6] ); + } + + /* Pass parameters to node. 
*/ +final: + if (rs_input) + { + vsi_nn_kernel_tensor_release( &rs_input ); + } + if (rs_input1) + { + vsi_nn_kernel_tensor_release( &rs_input1 ); + } + if (rs_output) + { + vsi_nn_kernel_tensor_release( &rs_output ); + } + return node; +} /* _setup() */ + +__END_DECLS + +REGISTER_BACKEND_CL( repeat, _setup ) + diff --git a/src/tim/vx/internal/src/kernel/cl/sequence_mask_cl.c b/src/tim/vx/internal/src/kernel/cl/sequence_mask_cl.c new file mode 100644 index 0000000..45e606e --- /dev/null +++ b/src/tim/vx/internal/src/kernel/cl/sequence_mask_cl.c @@ -0,0 +1,354 @@ +/**************************************************************************** +* +* Copyright (c) 2020 Vivante Corporation +* +* Permission is hereby granted, free of charge, to any person obtaining a +* copy of this software and associated documentation files (the "Software"), +* to deal in the Software without restriction, including without limitation +* the rights to use, copy, modify, merge, publish, distribute, sublicense, +* and/or sell copies of the Software, and to permit persons to whom the +* Software is furnished to do so, subject to the following conditions: +* +* The above copyright notice and this permission notice shall be included in +* all copies or substantial portions of the Software. +* +* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +* AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING +* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER +* DEALINGS IN THE SOFTWARE. +* +*****************************************************************************/ + +#include +#include +#include +#include "vsi_nn_types.h" +#include "vsi_nn_tensor.h" +#include "vsi_nn_graph.h" +#include "vsi_nn_log.h" +#include "vsi_nn_prv.h" +#include "vsi_nn_error.h" +#include "vsi_nn_tensor_util.h" +#include "math.h" +#include "utils/vsi_nn_util.h" +#include "kernel/vsi_nn_kernel.h" +#include "kernel/vsi_nn_kernel_eltwise.h" +#include "kernel/vsi_nn_kernel_gpu_shape_optimize.h" + +__BEGIN_DECLS + +/* + * Define kernel meta. 
+ */ +#define KERNEL_SOURCE_1 "sequence_mask" + +#define HASH_SEQUENCE_MASK_KEY(_input0_type, _output_type, _image_2d) \ + ((_input0_type << 24) | (_output_type << 8) | (_image_2d)) + +#define HASH_SEQUENCE_MASK_SH_KERNEL_NAME(SRC0_TYPE, DST_TYPE) \ + CVIVANTE_NAMESPACE("cl.sequence_mask_"#SRC0_TYPE"to"#DST_TYPE) + + #define HASH_SEQUENCE_MASK_SH_2DKERNEL_NAME(SRC0_TYPE, DST_TYPE) \ + CVIVANTE_NAMESPACE("cl.sequence_mask_"#SRC0_TYPE"to"#DST_TYPE"_2D") + +#define TENSOR_SEQUENCE_MASK_KERNELS(IN0_TYPE, OUT_TYPE, SOURCE) \ + { HASH_SEQUENCE_MASK_KEY(IN0_TYPE, OUT_TYPE, 0), \ + HASH_SEQUENCE_MASK_SH_KERNEL_NAME(IN0_TYPE, OUT_TYPE), \ + SOURCE }, + + #define TENSOR_SEQUENCE_MASK_2D_KERNELS(IN0_TYPE, OUT_TYPE, SOURCE) \ + { HASH_SEQUENCE_MASK_KEY(IN0_TYPE, OUT_TYPE, 1), \ + HASH_SEQUENCE_MASK_SH_2DKERNEL_NAME(IN0_TYPE, OUT_TYPE), \ + SOURCE }, + +static const struct { + uint32_t key; + char* function_name; + const char* source_name; + } kernel_map[] = +{ + TENSOR_SEQUENCE_MASK_KERNELS(I32, U8, KERNEL_SOURCE_1) + TENSOR_SEQUENCE_MASK_KERNELS(I32, I32, KERNEL_SOURCE_1) + TENSOR_SEQUENCE_MASK_KERNELS(I32, F32, KERNEL_SOURCE_1) + TENSOR_SEQUENCE_MASK_2D_KERNELS(I32, U8, KERNEL_SOURCE_1) + TENSOR_SEQUENCE_MASK_2D_KERNELS(I32, I32, KERNEL_SOURCE_1) + TENSOR_SEQUENCE_MASK_2D_KERNELS(I32, F32, KERNEL_SOURCE_1) +}; + +/* + * Kernel params + */ +static vx_param_description_t kernel_param_def[] = +{ + {VX_INPUT, VX_TYPE_TENSOR, VX_PARAMETER_STATE_REQUIRED}, + {VX_OUTPUT, VX_TYPE_TENSOR, VX_PARAMETER_STATE_REQUIRED}, + {VX_INPUT, VX_TYPE_SCALAR, VX_PARAMETER_STATE_REQUIRED}, + {VX_INPUT, VX_TYPE_SCALAR, VX_PARAMETER_STATE_REQUIRED}, + {VX_INPUT, VX_TYPE_SCALAR, VX_PARAMETER_STATE_REQUIRED}, + {VX_INPUT, VX_TYPE_SCALAR, VX_PARAMETER_STATE_REQUIRED}, + {VX_INPUT, VX_TYPE_SCALAR, VX_PARAMETER_STATE_REQUIRED}, +}; + +#define _CL_PARAM_NUM _cnt_of_array(kernel_param_def) + +/* + * Kernel initializer + */ +DEF_KERNEL_INITIALIZER(_sequence_mask_initializer) + ( + vsi_nn_kernel_node_t node, + const vsi_nn_kernel_node_param_t * param, + size_t param_size + ) +{ + gpu_param_t gpu_param = { + 3, + {0, 0, 0}, + {0, 0, 0}, + {0, 0, 0}, + {0, 0, 0} + }; + + vsi_status status = VSI_FAILURE; + vsi_nn_kernel_tensor_attr_t * attr[1] = { NULL }; + vsi_int_array_t * out_shape = NULL; + + attr[0] = vsi_nn_kernel_tensor_attr_create( (vsi_nn_kernel_tensor_t)param[1] ); + CHECK_PTR_FAIL_GOTO( attr[0], "Create tensor attr buffer fail.", final ); + + out_shape = attr[0]->shape; + + gpu_param.global_scale[0] = 1; + gpu_param.global_scale[1] = 1; + gpu_param.global_scale[2] = 1; + + gpu_param.global_size[0] = gpu_align_p2((out_shape->data[0] + gpu_param.global_scale[0] - 1) + / gpu_param.global_scale[0], 4); + gpu_param.global_size[1] = (out_shape->data[1] + gpu_param.global_scale[1] - 1) + / gpu_param.global_scale[1]; + gpu_param.global_size[2] = out_shape->size > 2 ? 
out_shape->data[2] : 1; + + status = vsi_nn_kernel_gpu_config( node, &gpu_param ); + +final: + if (attr[0]) + { + vsi_nn_kernel_tensor_attr_release( &attr[0] ); + attr[0] = NULL; + } + + return status; +} /* _sequence_mask_initializer() */ + +static vsi_status _query_kernel + ( + vsi_nn_tensor_t* const* const inputs, + vsi_nn_tensor_t* const* const outputs, + vsi_nn_kernel_t* kernel, + int32_t is2Dflg + ) +{ + vsi_nn_kernel_dtype_e input0_dtype = I32; + vsi_nn_kernel_dtype_e output_dtype = U8; + vsi_status status = VSI_FAILURE; + uint32_t key = 0; + int i = 0; + + input0_dtype = vsi_nn_kernel_map_dtype( inputs[0]->attr.dtype.vx_type ); + output_dtype = vsi_nn_kernel_map_dtype( outputs[0]->attr.dtype.vx_type ); + if (output_dtype == BOOL8) + { + output_dtype= U8; + } + + key = HASH_SEQUENCE_MASK_KEY( input0_dtype, output_dtype, is2Dflg ); + + for( i = 0; i < _cnt_of_array(kernel_map); i ++ ) + { + if ( kernel_map[i].key == key ) + { + break; + } + } + if ( i < _cnt_of_array(kernel_map) ) + { + snprintf( kernel->info.name, VX_MAX_KERNEL_NAME, "%s", kernel_map[i].function_name ); + kernel->info.parameters = kernel_param_def; + kernel->info.numParams = _cnt_of_array( kernel_param_def ); + kernel->info.initialize = _sequence_mask_initializer; + vsi_nn_kernel_add_source( kernel, VSI_NN_GPU_SOURCE_FMT_CODE, 2, + "eltwise_ops_helper", + kernel_map[i].source_name ); + vsi_nn_kernel_add_source( kernel, VSI_NN_GPU_SOURCE_FMT_EXECUTABLE, 1, + kernel_map[i].source_name ); + status = VSI_SUCCESS; + } + return status; +} /* _query_kernel() */ + +static int32_t _optimize_mask_shape + ( + vsi_nn_tensor_t ** inputs, + vsi_nn_tensor_t ** outputs, + int32_t max_len, + int32_t* opt_shape_in, + int32_t* opt_shape_out, + int32_t* is2Dflg + ) +{ + vsi_status status = VSI_SUCCESS; + int32_t in_shape[VSI_NN_MAX_DIM_NUM] = {0}; + int32_t new_rank = 0; + uint32_t i = 0; + + for(i = 0; i < inputs[0]->attr.dim_num; i++) + { + in_shape[i] = inputs[0]->attr.size[i]; + } + + vsi_nn_kernel_optimize_element_shape( in_shape, inputs[0]->attr.dim_num, opt_shape_in, &new_rank ); + if (new_rank > 2) + { + return VSI_FAILURE; + } + + opt_shape_out[0] = max_len; + for(i = 0; i < (uint32_t)new_rank; i++) + { + opt_shape_out[i + 1] = opt_shape_in[i]; + } + if (opt_shape_out[2] == 1) + { + is2Dflg[0] = 1; + } + + return status; +} + +static vsi_nn_kernel_node_t _setup + ( + vsi_nn_graph_t * graph, + vsi_nn_tensor_t ** inputs, + size_t input_num, + vsi_nn_tensor_t ** outputs, + size_t output_num, + const vsi_nn_kernel_param_t * params, + vsi_nn_kernel_t * kernel + ) +{ + vsi_status status = VSI_FAILURE; + vsi_nn_kernel_node_param_t node_params[_CL_PARAM_NUM] = {NULL}; + vsi_nn_kernel_tensor_t rs_input = NULL, rs_output = NULL; + int32_t new_shape[2][VSI_NN_MAX_DIM_NUM] = {{ 1, 1, 1, 1 }, { 1, 1, 1, 1 }}; + int32_t max_len = vsi_nn_kernel_param_get_int32( params, "max_len" ); + vsi_nn_kernel_node_t node = NULL; + int32_t is2Dflg = 0; + float input_zp = 0; + float input_scale = 1.0f; + int32_t output_zp = 0; + float output_scale = 1.0f; + float input_zpScale = 0; + float outputVal1 = 1.0f; + int32_t input_fl = 0; + int32_t output_fl = 0; + + if ( !vsi_nn_kernel_gpu_check_shape( (int32_t*)outputs[0]->attr.size, + outputs[0]->attr.dim_num ) ) + { + return NULL; + } + + status = _optimize_mask_shape(inputs, outputs, max_len, new_shape[0], new_shape[1], &is2Dflg); + if ( VSI_SUCCESS != status ) + { + goto final; + } + rs_input = vsi_nn_kernel_tensor_reshape(inputs[0]->t, new_shape[0], 2); + rs_output = 
vsi_nn_kernel_tensor_reshape(outputs[0]->t, new_shape[1], 4); + + if (inputs[0]->attr.dtype.qnt_type == VSI_NN_QNT_TYPE_AFFINE_ASYMMETRIC) + { + input_zp = (float)inputs[0]->attr.dtype.zero_point; + input_scale = inputs[0]->attr.dtype.scale; + } + else if (inputs[0]->attr.dtype.qnt_type == VSI_NN_QNT_TYPE_DFP) + { + input_fl = inputs[0]->attr.dtype.fl; + if (input_fl > 0) + { + input_scale = (1.0f / ((float) ((int64_t)1 << input_fl))); + } + else + { + input_scale = ((float) ((int64_t)1 << -input_fl)); + } + input_zp = 0.0f; + } + + if (outputs[0]->attr.dtype.qnt_type == VSI_NN_QNT_TYPE_AFFINE_ASYMMETRIC) + { + output_zp = outputs[0]->attr.dtype.zero_point; + output_scale = 1.0f / outputs[0]->attr.dtype.scale; + } + else if (outputs[0]->attr.dtype.qnt_type == VSI_NN_QNT_TYPE_DFP) + { + output_fl = outputs[0]->attr.dtype.fl; + if (output_fl > 0) + { + output_scale = (float)((int64_t)1 << output_fl); + } + else + { + output_scale = (1.0f / (float)((int64_t)1 << -output_fl)); + } + output_zp = 0; + } + input_zpScale = input_scale * input_zp; + outputVal1 = output_scale + (float)output_zp; + + status = _query_kernel( inputs, outputs, kernel, is2Dflg ); + if ( VSI_SUCCESS == status) + { + node = vsi_nn_kernel_create_node( graph, kernel ); + + if ( node ) + { + uint32_t index = 0; + node_params[index++] = rs_input; + node_params[index++] = rs_output; + node_params[index++] = vsi_nn_kernel_scalar_create( graph, I32, &max_len ); + node_params[index++] = vsi_nn_kernel_scalar_create( graph, F32, &input_scale ); + node_params[index++] = vsi_nn_kernel_scalar_create( graph, F32, &input_zpScale ); + node_params[index++] = vsi_nn_kernel_scalar_create( graph, F32, &outputVal1 ); + node_params[index++] = vsi_nn_kernel_scalar_create( graph, I32, &output_zp ); + + /* Pass parameters to node. */ + status = vsi_nn_kernel_node_pass_param( node, node_params, _CL_PARAM_NUM ); + VSI_ASSERT( status == VSI_SUCCESS ); + vsi_nn_kernel_scalar_release( &node_params[2] ); + vsi_nn_kernel_scalar_release( &node_params[3] ); + vsi_nn_kernel_scalar_release( &node_params[4] ); + vsi_nn_kernel_scalar_release( &node_params[5] ); + vsi_nn_kernel_scalar_release( &node_params[6] ); + } + } + +final: + if (rs_input) + { + vsi_nn_kernel_tensor_release( &rs_input ); + } + if (rs_output) + { + vsi_nn_kernel_tensor_release( &rs_output ); + } + return node; +} /* _setup() */ + +__END_DECLS + +REGISTER_BACKEND_CL( sequence_mask, _setup ) + diff --git a/src/tim/vx/internal/src/kernel/cl/slice_cl.c b/src/tim/vx/internal/src/kernel/cl/slice_cl.c new file mode 100644 index 0000000..d05a32e --- /dev/null +++ b/src/tim/vx/internal/src/kernel/cl/slice_cl.c @@ -0,0 +1,308 @@ +/**************************************************************************** +* +* Copyright (c) 2020 Vivante Corporation +* +* Permission is hereby granted, free of charge, to any person obtaining a +* copy of this software and associated documentation files (the "Software"), +* to deal in the Software without restriction, including without limitation +* the rights to use, copy, modify, merge, publish, distribute, sublicense, +* and/or sell copies of the Software, and to permit persons to whom the +* Software is furnished to do so, subject to the following conditions: +* +* The above copyright notice and this permission notice shall be included in +* all copies or substantial portions of the Software. 
+* +* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +* AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING +* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER +* DEALINGS IN THE SOFTWARE. +* +*****************************************************************************/ + + +#include +#include +#include +#include "vsi_nn_types.h" +#include "vsi_nn_tensor.h" +#include "vsi_nn_graph.h" +#include "vsi_nn_log.h" +#include "vsi_nn_error.h" +#include "vsi_nn_prv.h" +#include "vsi_nn_tensor_util.h" +#include "utils/vsi_nn_util.h" +#include "kernel/vsi_nn_kernel.h" +#include "kernel/vsi_nn_kernel_gpu_shape_optimize.h" + +__BEGIN_DECLS + + /* + * Define kernel meta. + */ + typedef enum +{ + INTERNAL_KERNEL_SLICE, +} _internal_kernel_e; + +#define _SLICE_KERNEL_SOURCE "slice" +#define SLICE_SH_KERNEL_NAME(IN0_DTYPE, IN1_DTYPE, OUT_DTYPE) \ + CVIVANTE_NAMESPACE("cl.slice_"#IN0_DTYPE"_"#IN1_DTYPE"to"#OUT_DTYPE) + +// Add kernel hashtable here +#define SLICE_HASH_KEY( IN0_DTYPE, IN1_DTYPE, OUT_DTYPE , _IMAGE_2D) \ + (( IN1_DTYPE << 18 ) | ( IN0_DTYPE << 10 ) | ( OUT_DTYPE << 2 ) | (_IMAGE_2D)) + +#define PACK_KERNEL_MAP( IN0_DTYPE, IN1_DTYPE, OUT_DTYPE, SOURCE ) \ +{ SLICE_HASH_KEY( IN0_DTYPE, IN1_DTYPE, OUT_DTYPE, 0 ), \ + SLICE_SH_KERNEL_NAME( IN0_DTYPE, IN1_DTYPE, OUT_DTYPE ), SOURCE } + +#define SLICE_SH_KERNEL_2D_NAME(IN0_DTYPE, IN1_DTYPE, OUT_DTYPE) \ + CVIVANTE_NAMESPACE("cl.slice_"#IN0_DTYPE"_"#IN1_DTYPE"to"#OUT_DTYPE"_2D") + +#define PACK_KERNEL_MAP_2D( IN0_DTYPE, IN1_DTYPE, OUT_DTYPE, SOURCE ) \ +{ SLICE_HASH_KEY( IN0_DTYPE, IN1_DTYPE, OUT_DTYPE, 1 ), \ + SLICE_SH_KERNEL_2D_NAME( IN0_DTYPE, IN1_DTYPE, OUT_DTYPE ), SOURCE } + + +typedef struct +{ + uint32_t key; + char * function_name; + const char * source_name; +} _kernel_map_type; + +static const _kernel_map_type _slice_kernel_map[] = +{ + // Register kernel here + PACK_KERNEL_MAP( F32, I32, F32, _SLICE_KERNEL_SOURCE ), + PACK_KERNEL_MAP( I32, I32, I32, _SLICE_KERNEL_SOURCE ), + PACK_KERNEL_MAP( U8, I32, U8, _SLICE_KERNEL_SOURCE ), + + PACK_KERNEL_MAP_2D( F32, I32, F32, _SLICE_KERNEL_SOURCE ), + PACK_KERNEL_MAP_2D( I32, I32, I32, _SLICE_KERNEL_SOURCE ), + PACK_KERNEL_MAP_2D( U8, I32, U8, _SLICE_KERNEL_SOURCE ), +}; + +#define _INPUT_NUM (2) +#define _OUTPUT_NUM (1) +#define _IO_NUM (_INPUT_NUM + _OUTPUT_NUM) + +/* +* Kernel params +*/ +static vx_param_description_t _slice_kernel_param_def[] = +{ + {VX_INPUT, VX_TYPE_TENSOR, VX_PARAMETER_STATE_REQUIRED}, + {VX_INPUT, VX_TYPE_TENSOR, VX_PARAMETER_STATE_REQUIRED}, + {VX_OUTPUT, VX_TYPE_TENSOR, VX_PARAMETER_STATE_REQUIRED}, + {VX_INPUT, VX_TYPE_SCALAR, VX_PARAMETER_STATE_REQUIRED}, + {VX_INPUT, VX_TYPE_SCALAR, VX_PARAMETER_STATE_REQUIRED}, + {VX_INPUT, VX_TYPE_SCALAR, VX_PARAMETER_STATE_REQUIRED}, + {VX_INPUT, VX_TYPE_SCALAR, VX_PARAMETER_STATE_REQUIRED}, + // Add kernel parameters here +}; +#define _SLICE_PARAM_NUM _cnt_of_array( _slice_kernel_param_def ) +#define SCALAR_INPUT_SCALE (3) +#define SCALAR_INPUT_TAIL (4) +#define SCALAR_OUTPUT_SCALE (5) +#define SCALAR_OUTPUT_ZP (6) +/* +* Kernel initializer +*/ +DEF_KERNEL_INITIALIZER(_slice_initializer) + ( + vsi_nn_kernel_node_t node, + const vsi_nn_kernel_node_param_t * param, + size_t param_size + ) +{ + vsi_status status = VSI_FAILURE; + gpu_param_t
gpu_param = { + 3, + {0, 0, 0}, + {0, 0, 0}, + {0, 0, 0}, + {0, 0, 0} + }; + vsi_nn_kernel_tensor_attr_t * output_attr = NULL; + vsi_int_array_t * out_shape = NULL; + + output_attr = vsi_nn_kernel_tensor_attr_create( (vsi_nn_kernel_tensor_t)param[2] ); + CHECK_PTR_FAIL_GOTO( output_attr, "Create tensor attr buffer fail.", final ); + + out_shape = output_attr->shape; + + gpu_param.global_scale[0] = 1; + gpu_param.global_scale[1] = 1; + gpu_param.global_scale[2] = 1; + + gpu_param.dim = (out_shape->size < 3 || 1 == out_shape->data[2]) ? 2 : 3; + gpu_param.global_size[0] = gpu_align_p2( + (out_shape->data[0] + gpu_param.global_scale[0] - 1) + / gpu_param.global_scale[0], 4); + gpu_param.global_size[1] = ( + (out_shape->data[1] + gpu_param.global_scale[1] - 1) + / gpu_param.global_scale[1]); + gpu_param.global_size[2] = out_shape->size > 2 ? out_shape->data[2] : 1; + status = vsi_nn_kernel_gpu_config( node, &gpu_param ); + + final: +#define SAFE_FREE_TENSOR_ATTR(_PTR) if( _PTR ) { vsi_nn_kernel_tensor_attr_release( &_PTR ); _PTR = NULL; } + SAFE_FREE_TENSOR_ATTR(output_attr); + return status; +} /* _slice_initializer() */ + +/* +* Query kernel +*/ +static vsi_status _query_kernel + ( + vsi_nn_kernel_t * kernel, + vsi_nn_tensor_t ** inputs, + vsi_nn_tensor_t ** outputs, + vsi_bool image_2d + ) +{ + vsi_status status = VSI_FAILURE; + vsi_nn_kernel_dtype_e in0_dtype; + vsi_nn_kernel_dtype_e in1_dtype; + vsi_nn_kernel_dtype_e out_dtype; + const _kernel_map_type * kernel_map = _slice_kernel_map; + size_t kernel_map_size = _cnt_of_array( _slice_kernel_map ); + vx_param_description_t * param_def = _slice_kernel_param_def; + size_t param_def_size = _cnt_of_array( _slice_kernel_param_def ); + vx_kernel_initialize_f initializer = _slice_initializer; + + uint32_t key; + uint32_t i; + + in0_dtype = vsi_nn_kernel_map_dtype( inputs[0]->attr.dtype.vx_type ); + in1_dtype = vsi_nn_kernel_map_dtype( inputs[1]->attr.dtype.vx_type ); + out_dtype = vsi_nn_kernel_map_dtype( outputs[0]->attr.dtype.vx_type ); + + if (F16 == in0_dtype) + { + in0_dtype = F32; + } + + if (F16 == out_dtype) + { + out_dtype = F32; + } + + key = SLICE_HASH_KEY( in0_dtype, in1_dtype, out_dtype, image_2d ); + + for ( i = 0; i < (uint32_t)kernel_map_size; i ++ ) + { + if ( kernel_map[i].key == key ) + { + break; + } + } + if ( i < kernel_map_size ) + { + snprintf( kernel->info.name, VX_MAX_KERNEL_NAME, "%s", kernel_map[i].function_name ); + kernel->info.parameters = param_def; + kernel->info.numParams = (uint32_t)param_def_size; + kernel->info.initialize = initializer; + // Register code source + vsi_nn_kernel_add_source( kernel, VSI_NN_GPU_SOURCE_FMT_CODE, 2, + "eltwise_ops_helper", + kernel_map[i].source_name ); + // Register binary source + vsi_nn_kernel_add_source( kernel, VSI_NN_GPU_SOURCE_FMT_EXECUTABLE, 1, + kernel_map[i].source_name ); + status = VSI_SUCCESS; + } + + return status; +} /* _query_kernel() */ + + +static vsi_nn_kernel_node_t _setup + ( + vsi_nn_graph_t * graph, + vsi_nn_tensor_t ** inputs, + size_t input_num, + vsi_nn_tensor_t ** outputs, + size_t output_num, + const vsi_nn_kernel_param_t * params, + vsi_nn_kernel_t * kernel + ) +{ + vsi_status status = VSI_FAILURE; + vsi_nn_kernel_node_param_t node_params[_SLICE_PARAM_NUM]; + vsi_nn_kernel_node_t node = NULL; + vsi_bool image_2d = FALSE; + uint32_t rank[_IO_NUM] = {0}; + int32_t shapes[_IO_NUM][VSI_NN_MAX_DIM_NUM] = {{ 1 }}; + vsi_nn_tensor_t* reshape_tensors[_IO_NUM] = { NULL }; + int32_t i = 0; + int32_t input_batch = inputs[0]->attr.dim_num > 3 ? 
inputs[0]->attr.size[3] : 1; + int32_t output_batch = outputs[0]->attr.dim_num > 3 ? outputs[0]->attr.size[3] : 1; + float inputScale = inputs[0]->attr.dtype.scale; + float inputTail = (float)inputs[0]->attr.dtype.zero_point * inputScale; + float outputScale = outputs[0]->attr.dtype.scale; + float outputZP = (float)outputs[0]->attr.dtype.zero_point + 0.5f; + + outputScale = vsi_abs(outputScale) < 1e-5 ? 0.0f : 1.0f / outputScale; + + vsi_nn_kernel_optimize_1d_tensor_shape( (const int32_t*)inputs[0]->attr.size, inputs[0]->attr.dim_num, + shapes[0], &rank[0]); + vsi_nn_kernel_optimize_1d_tensor_shape( (const int32_t*)inputs[1]->attr.size, inputs[1]->attr.dim_num, + shapes[1], &rank[1]); + vsi_nn_kernel_optimize_1d_tensor_shape( (const int32_t*)outputs[0]->attr.size, outputs[0]->attr.dim_num, + shapes[2], &rank[2]); + + for (i = 0; i < _INPUT_NUM; i++) + { + reshape_tensors[i] = vsi_nn_reshape_tensor( graph, + inputs[i], (uint32_t*)shapes[i], rank[i] ); + } + reshape_tensors[_INPUT_NUM] = vsi_nn_reshape_tensor( graph, + outputs[0], (uint32_t*)shapes[_INPUT_NUM], rank[_INPUT_NUM] ); + + if ( !vsi_nn_kernel_gpu_check_shape( (int32_t*)reshape_tensors[0]->attr.size, + inputs[0]->attr.dim_num ) || input_batch != output_batch ) + { + return NULL; + } + + image_2d = (rank[0] < 3 || shapes[0][2] == 1); + + status = _query_kernel( kernel, inputs, outputs, image_2d ); + if ( VSI_SUCCESS == status) + { + node = vsi_nn_kernel_create_node( graph, kernel ); + if ( node ) + { + /* Set inputs and outputs */ + vsi_nn_kernel_node_pack_io( node_params, _SLICE_PARAM_NUM, + reshape_tensors, input_num, &reshape_tensors[_INPUT_NUM], output_num ); + node_params[SCALAR_INPUT_SCALE] = vsi_nn_kernel_scalar_create( + graph, F32, &inputScale ); + node_params[SCALAR_INPUT_TAIL] = vsi_nn_kernel_scalar_create( + graph, F32, &inputTail ); + node_params[SCALAR_OUTPUT_SCALE] = vsi_nn_kernel_scalar_create( + graph, F32, &outputScale ); + node_params[SCALAR_OUTPUT_ZP] = vsi_nn_kernel_scalar_create( + graph, F32, &outputZP ); + + /* Pass parameters to node. 
*/ + status = vsi_nn_kernel_node_pass_param( node, node_params, _SLICE_PARAM_NUM ); + VSI_ASSERT( status == VSI_SUCCESS ); + vsi_nn_kernel_scalar_release( &node_params[SCALAR_INPUT_SCALE] ); + vsi_nn_kernel_scalar_release( &node_params[SCALAR_INPUT_TAIL] ); + vsi_nn_kernel_scalar_release( &node_params[SCALAR_OUTPUT_SCALE] ); + vsi_nn_kernel_scalar_release( &node_params[SCALAR_OUTPUT_ZP] ); + } + } + return node; +} /* _setup() */ + +__END_DECLS + + REGISTER_BACKEND_CL( slice, _setup ) diff --git a/src/tim/vx/internal/src/kernel/cpu/argmax_cpu.c b/src/tim/vx/internal/src/kernel/cpu/argmax_cpu.c index be3424b..e9cb96f 100644 --- a/src/tim/vx/internal/src/kernel/cpu/argmax_cpu.c +++ b/src/tim/vx/internal/src/kernel/cpu/argmax_cpu.c @@ -36,7 +36,7 @@ #include "utils/vsi_nn_dtype_util.h" #include "kernel/vsi_nn_kernel.h" #include "kernel/vsi_nn_kernel_eltwise.h" -#include "client/vsi_nn_vxkernel.h" +#include "libnnext/vsi_nn_vxkernel.h" __BEGIN_DECLS #define _CPU_ARG_NUM (1) diff --git a/src/tim/vx/internal/src/kernel/cpu/argmin_cpu.c b/src/tim/vx/internal/src/kernel/cpu/argmin_cpu.c index 7b0f1dd..f6b092b 100644 --- a/src/tim/vx/internal/src/kernel/cpu/argmin_cpu.c +++ b/src/tim/vx/internal/src/kernel/cpu/argmin_cpu.c @@ -36,7 +36,7 @@ #include "utils/vsi_nn_dtype_util.h" #include "kernel/vsi_nn_kernel.h" #include "kernel/vsi_nn_kernel_eltwise.h" -#include "client/vsi_nn_vxkernel.h" +#include "libnnext/vsi_nn_vxkernel.h" __BEGIN_DECLS diff --git a/src/tim/vx/internal/src/kernel/cpu/axis_aligned_bbox_transform_cpu.c b/src/tim/vx/internal/src/kernel/cpu/axis_aligned_bbox_transform_cpu.c new file mode 100644 index 0000000..448eb33 --- /dev/null +++ b/src/tim/vx/internal/src/kernel/cpu/axis_aligned_bbox_transform_cpu.c @@ -0,0 +1,279 @@ +/**************************************************************************** +* +* Copyright (c) 2020 Vivante Corporation +* +* Permission is hereby granted, free of charge, to any person obtaining a +* copy of this software and associated documentation files (the "Software"), +* to deal in the Software without restriction, including without limitation +* the rights to use, copy, modify, merge, publish, distribute, sublicense, +* and/or sell copies of the Software, and to permit persons to whom the +* Software is furnished to do so, subject to the following conditions: +* +* The above copyright notice and this permission notice shall be included in +* all copies or substantial portions of the Software. +* +* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +* AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING +* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER +* DEALINGS IN THE SOFTWARE. +* +*****************************************************************************/ + + +#include +#include +#include +#include "vsi_nn_types.h" +#include "vsi_nn_tensor.h" +#include "vsi_nn_graph.h" +#include "vsi_nn_log.h" +#include "vsi_nn_error.h" +#include "vsi_nn_prv.h" +#include "vsi_nn_tensor_util.h" +#include "utils/vsi_nn_util.h" +#include "kernel/vsi_nn_kernel.h" + +__BEGIN_DECLS + +/* + * Define kernel meta. 
+ */ +#define _INPUT_NUM (4) +#define _OUTPUT_NUM (1) +#define _KERNEL_NAME CVIVANTE_NAMESPACE("cpu.axis_aligned_bbox_transform") + +typedef struct vsi_nn_box_encoding_corner_t +{ + float x1, y1, x2, y2; +}vsi_nn_box_encoding_corner; + +typedef struct vsi_nn_box_encoding_center_t +{ + float w, h, x, y; +}vsi_nn_box_encoding_center; + +/* + * Kernel params + */ +static vx_param_description_t _axis_aligned_bbox_transform_kernel_param_def[] = +{ + {VX_INPUT, VX_TYPE_TENSOR, VX_PARAMETER_STATE_REQUIRED}, + {VX_INPUT, VX_TYPE_TENSOR, VX_PARAMETER_STATE_REQUIRED}, + {VX_INPUT, VX_TYPE_TENSOR, VX_PARAMETER_STATE_REQUIRED}, + {VX_INPUT, VX_TYPE_TENSOR, VX_PARAMETER_STATE_REQUIRED}, + {VX_OUTPUT, VX_TYPE_TENSOR, VX_PARAMETER_STATE_REQUIRED}, +}; +#define _AXIS_ALIGNED_BBOX_TRANSFORM_PARAM_NUM _cnt_of_array( _axis_aligned_bbox_transform_kernel_param_def ) + + +static void _to_box_encoding_corner + ( + vsi_nn_box_encoding_center* ctr, + vsi_nn_box_encoding_corner* cnr + ) +{ + cnr->x1 = ctr->x - ctr->w / 2; + cnr->y1 = ctr->y - ctr->h / 2; + cnr->x2 = ctr->x + ctr->w / 2; + cnr->y2 = ctr->y + ctr->h / 2; +} + +static void _to_box_encoding_center + ( + vsi_nn_box_encoding_corner* cnr, + vsi_nn_box_encoding_center* ctr + ) +{ + ctr->w = cnr->x2 - cnr->x1; + ctr->h = cnr->y2 - cnr->y1; + ctr->x = (cnr->x1 + cnr->x2) / 2; + ctr->y = (cnr->y1 + cnr->y2) / 2; +} + +/* + * Kernel function + */ +DEF_KERNEL_EXECUTOR(_compute) + ( + vsi_nn_kernel_node_t node, + const vsi_nn_kernel_node_param_t * param, + size_t param_size + ) +{ + vsi_status status = VSI_FAILURE; + vsi_nn_kernel_tensor_t input[_INPUT_NUM] = {NULL}; + vsi_nn_kernel_tensor_t output[_OUTPUT_NUM] = {NULL}; + float *f32_in_buffer[_INPUT_NUM] = {NULL}; + float *f32_out_buffer[_OUTPUT_NUM] = {NULL}; + vsi_nn_kernel_tensor_attr_t *in_attr[_INPUT_NUM]; + vsi_nn_kernel_tensor_attr_t *out_attr[_OUTPUT_NUM]; + size_t out_stride_size[_OUTPUT_NUM][VSI_NN_MAX_DIM_NUM] = {{1}}; + size_t out_elements[_OUTPUT_NUM] = {0}; + size_t out_bytes[_OUTPUT_NUM] = {0}; + uint32_t i; + const uint32_t roiLength = 4; + const uint32_t imageLength = 2; + uint32_t numClasses = 0; + uint32_t numRois = 0; + uint32_t j; + uint32_t roiIndex; + + /* prepare data */ + for (i = 0; i < _INPUT_NUM; i ++) + { + input[i] = (vsi_nn_kernel_tensor_t)param[i]; + in_attr[i] = vsi_nn_kernel_tensor_attr_create( input[i] ); + f32_in_buffer[i] = (float*)vsi_nn_kernel_tensor_create_buffer( input[i], in_attr[i], TRUE ); + CHECK_PTR_FAIL_GOTO( f32_in_buffer[i], "Create input0 buffer fail.", final ); + + } + for (i = 0; i < _OUTPUT_NUM; i ++) + { + output[i] = (vsi_nn_kernel_tensor_t)param[i + _INPUT_NUM]; + out_attr[i] = vsi_nn_kernel_tensor_attr_create( output[i] ); + vsi_nn_kernel_tensor_attr_get_stride( out_attr[i], out_stride_size[i] ); + out_elements[i] = vsi_nn_kernel_tensor_attr_get_size( out_attr[i] ); + out_bytes[i] = out_elements[i] * sizeof(float); + f32_out_buffer[i] = (float *)malloc( out_bytes[i] ); + CHECK_PTR_FAIL_GOTO( f32_out_buffer[i], "Create output buffer fail.", final ); + memset( f32_out_buffer[i], 0, out_bytes[i] ); + } + + numClasses = in_attr[1]->shape->data[0] / roiLength; + numRois = in_attr[0]->shape->data[1]; + + for (roiIndex = 0; roiIndex < numRois; roiIndex++) + { + uint32_t batchIndex = (uint32_t)f32_in_buffer[2][roiIndex]; + float imageHeight = f32_in_buffer[3][batchIndex * imageLength]; + float imageWidth = f32_in_buffer[3][batchIndex * imageLength + 1]; + vsi_nn_box_encoding_corner roi_cnr; + vsi_nn_box_encoding_center roiBefore; + roi_cnr.x1 = 
f32_in_buffer[0][roiIndex * roiLength]; + roi_cnr.y1 = f32_in_buffer[0][roiIndex * roiLength + 1]; + roi_cnr.x2 = f32_in_buffer[0][roiIndex * roiLength + 2]; + roi_cnr.y2 = f32_in_buffer[0][roiIndex * roiLength + 3]; + _to_box_encoding_center(&roi_cnr, &roiBefore); + + for (j = 0; j < numClasses; j++) + { + vsi_nn_box_encoding_center roi_ctr; + vsi_nn_box_encoding_corner roiAfter; + vsi_nn_box_encoding_corner cliped; + uint32_t index = (roiIndex * numClasses + j) * roiLength; + + roi_ctr.w = (float)(exp(f32_in_buffer[1][index + 2]) * roiBefore.w); + roi_ctr.h = (float)(exp(f32_in_buffer[1][index + 3]) * roiBefore.h); + roi_ctr.x = roiBefore.x + f32_in_buffer[1][index] * roiBefore.w; + roi_ctr.y = roiBefore.y + f32_in_buffer[1][index + 1] * roiBefore.h; + _to_box_encoding_corner(&roi_ctr, &roiAfter); + + cliped.x1 = vsi_nn_min(vsi_nn_max(roiAfter.x1, 0.0f), imageWidth); + cliped.y1 = vsi_nn_min(vsi_nn_max(roiAfter.y1, 0.0f), imageHeight); + cliped.x2 = vsi_nn_min(vsi_nn_max(roiAfter.x2, 0.0f), imageWidth); + cliped.y2 = vsi_nn_min(vsi_nn_max(roiAfter.y2, 0.0f), imageHeight); + f32_out_buffer[0][index] = cliped.x1; + f32_out_buffer[0][index + 1] = cliped.y1; + f32_out_buffer[0][index + 2] = cliped.x2; + f32_out_buffer[0][index + 3] = cliped.y2; + } + } + + /* save data */ + for(i = 0; i < _OUTPUT_NUM; i++) + { + status = vsi_nn_kernel_tensor_write_from_float( output[i], out_attr[i], + f32_out_buffer[i], out_elements[i] ); + CHECK_STATUS_FAIL_GOTO( status, final ); + } + +final: + for (i = 0; i < _INPUT_NUM; i++) + { + if (f32_in_buffer[i]) + { + free(f32_in_buffer[i]); + f32_in_buffer[i] = NULL; + } + if (in_attr[i]) + { + vsi_nn_kernel_tensor_attr_release( &in_attr[i] ); + } + } + for (i = 0; i < _OUTPUT_NUM; i++) + { + if (f32_out_buffer[i]) + { + free(f32_out_buffer[i]); + f32_out_buffer[i] = NULL; + } + if (out_attr[i]) + { + vsi_nn_kernel_tensor_attr_release( &out_attr[i] ); + } + } + + return status; +} /* _compute() */ + + +/* + * Query kernel + */ +static vsi_status _query_kernel + ( + vsi_nn_kernel_t * kernel, + vsi_nn_tensor_t * const * const inputs, + vsi_nn_tensor_t * const * const outputs + /* Add extra params */ + ) +{ + vsi_status status = VSI_FAILURE; + snprintf( kernel->info.name, VX_MAX_KERNEL_NAME, "%s", _KERNEL_NAME ); + kernel->info.function = _compute; + kernel->info.parameters = _axis_aligned_bbox_transform_kernel_param_def; + kernel->info.numParams = _cnt_of_array( _axis_aligned_bbox_transform_kernel_param_def ); + status = VSI_SUCCESS; + + return status; +} /* _query_kernel() */ + + +static vsi_nn_kernel_node_t _setup + ( + vsi_nn_graph_t * graph, + vsi_nn_tensor_t ** inputs, + size_t input_num, + vsi_nn_tensor_t ** outputs, + size_t output_num, + const vsi_nn_kernel_param_t * params, + vsi_nn_kernel_t * kernel + ) +{ + vsi_status status = VSI_FAILURE; + vsi_nn_kernel_node_param_t node_params[_AXIS_ALIGNED_BBOX_TRANSFORM_PARAM_NUM]; + vsi_nn_kernel_node_t node = NULL; + + status = _query_kernel( kernel, inputs, outputs /* Add extra params */ ); + if( VSI_SUCCESS == status) + { + node = vsi_nn_kernel_create_node( graph, kernel ); + if( node ) + { + /* Set inputs and outputs */ + vsi_nn_kernel_node_pack_io( node_params, _AXIS_ALIGNED_BBOX_TRANSFORM_PARAM_NUM, + inputs, input_num, outputs, output_num ); + /* Pass parameters to node. 
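The compute loop above decodes each class delta against the ROI in center form and clips the result to the image bounds. A self-contained sketch of the same arithmetic, using hypothetical names and plain math.h calls instead of the ovxlib helpers:

/* Illustrative only, not part of the patched sources. */
#include <math.h>

typedef struct { float x1, y1, x2, y2; } corner_box;

static corner_box decode_and_clip(corner_box prior, const float delta[4],
                                  float image_w, float image_h)
{
    /* corners -> center/size */
    float w  = prior.x2 - prior.x1;
    float h  = prior.y2 - prior.y1;
    float cx = 0.5f * (prior.x1 + prior.x2);
    float cy = 0.5f * (prior.y1 + prior.y2);

    /* apply deltas: translation scaled by the prior size, size scaled by exp() */
    float nx = cx + delta[0] * w;
    float ny = cy + delta[1] * h;
    float nw = (float)exp(delta[2]) * w;
    float nh = (float)exp(delta[3]) * h;

    /* center/size -> corners, clipped to the image */
    corner_box out;
    out.x1 = fminf(fmaxf(nx - nw * 0.5f, 0.0f), image_w);
    out.y1 = fminf(fmaxf(ny - nh * 0.5f, 0.0f), image_h);
    out.x2 = fminf(fmaxf(nx + nw * 0.5f, 0.0f), image_w);
    out.y2 = fminf(fmaxf(ny + nh * 0.5f, 0.0f), image_h);
    return out;
}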
*/ + status = vsi_nn_kernel_node_pass_param( node, node_params, _AXIS_ALIGNED_BBOX_TRANSFORM_PARAM_NUM ); + } + } + return node; +} /* _setup() */ + +__END_DECLS + +REGISTER_BACKEND_CPU( axis_aligned_bbox_transform, _setup ) + diff --git a/src/tim/vx/internal/src/kernel/cpu/batchnorm_single_cpu.c b/src/tim/vx/internal/src/kernel/cpu/batchnorm_single_cpu.c index cd4f594..ca6164b 100644 --- a/src/tim/vx/internal/src/kernel/cpu/batchnorm_single_cpu.c +++ b/src/tim/vx/internal/src/kernel/cpu/batchnorm_single_cpu.c @@ -34,7 +34,7 @@ #include "vsi_nn_tensor.h" #include "vsi_nn_tensor_util.h" #include "kernel/vsi_nn_kernel.h" -#include "client/vsi_nn_vxkernel.h" +#include "libnnext/vsi_nn_vxkernel.h" #include "kernel/vsi_nn_kernel_eltwise.h" __BEGIN_DECLS diff --git a/src/tim/vx/internal/src/kernel/cpu/comparisons_cpu.c b/src/tim/vx/internal/src/kernel/cpu/comparisons_cpu.c index f64f102..8397b30 100644 --- a/src/tim/vx/internal/src/kernel/cpu/comparisons_cpu.c +++ b/src/tim/vx/internal/src/kernel/cpu/comparisons_cpu.c @@ -32,7 +32,7 @@ #include "vsi_nn_prv.h" #include "vsi_nn_error.h" #include "kernel/vsi_nn_kernel.h" -#include "client/vsi_nn_vxkernel.h" +#include "libnnext/vsi_nn_vxkernel.h" __BEGIN_DECLS @@ -164,8 +164,8 @@ DEF_KERNEL_EXECUTOR(_comparisons_exec) buffer[2][i] = (float)data; } - status = vsi_nn_kernel_tensor_write_from_float( tensors[1], attr[1], - buffer[1], out_elements ); + status = vsi_nn_kernel_tensor_write_from_float( tensors[2], attr[2], + buffer[2], out_elements ); CHECK_STATUS_FAIL_GOTO( status, final ); final: diff --git a/src/tim/vx/internal/src/kernel/cpu/conv1d_ovxlib_cpu.c b/src/tim/vx/internal/src/kernel/cpu/conv1d_ovxlib_cpu.c new file mode 100644 index 0000000..f1b1b9e --- /dev/null +++ b/src/tim/vx/internal/src/kernel/cpu/conv1d_ovxlib_cpu.c @@ -0,0 +1,264 @@ +/**************************************************************************** +* +* Copyright (c) 2020 Vivante Corporation +* +* Permission is hereby granted, free of charge, to any person obtaining a +* copy of this software and associated documentation files (the "Software"), +* to deal in the Software without restriction, including without limitation +* the rights to use, copy, modify, merge, publish, distribute, sublicense, +* and/or sell copies of the Software, and to permit persons to whom the +* Software is furnished to do so, subject to the following conditions: +* +* The above copyright notice and this permission notice shall be included in +* all copies or substantial portions of the Software. +* +* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +* AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING +* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER +* DEALINGS IN THE SOFTWARE. +* +*****************************************************************************/ + +#include +#include +#include +#include "vsi_nn_types.h" +#include "vsi_nn_tensor.h" +#include "vsi_nn_graph.h" +#include "vsi_nn_log.h" +#include "vsi_nn_error.h" +#include "vsi_nn_prv.h" +#include "vsi_nn_tensor_util.h" +#include "utils/vsi_nn_util.h" +#include "kernel/vsi_nn_kernel.h" +#include "libnnext/vx_lib_nnext.h" + +__BEGIN_DECLS + +/* + * Define kernel meta. 
+ */ +#define _KERNEL_NAME CVIVANTE_NAMESPACE("cpu.conv1d_ovxlib") + +/* + * Kernel params + */ +static vx_param_description_t _conv1d_ovxlib_kernel_param_def[] = +{ + {VX_INPUT, VX_TYPE_TENSOR, VX_PARAMETER_STATE_REQUIRED}, + {VX_INPUT, VX_TYPE_TENSOR, VX_PARAMETER_STATE_REQUIRED}, + {VX_INPUT, VX_TYPE_TENSOR, VX_PARAMETER_STATE_OPTIONAL}, + {VX_OUTPUT, VX_TYPE_TENSOR, VX_PARAMETER_STATE_REQUIRED}, + {VX_INPUT, VX_TYPE_SCALAR, VX_PARAMETER_STATE_REQUIRED}, + {VX_INPUT, VX_TYPE_SCALAR, VX_PARAMETER_STATE_REQUIRED}, + {VX_INPUT, VX_TYPE_SCALAR, VX_PARAMETER_STATE_REQUIRED}, + {VX_INPUT, VX_TYPE_SCALAR, VX_PARAMETER_STATE_REQUIRED}, + {VX_INPUT, VX_TYPE_SCALAR, VX_PARAMETER_STATE_REQUIRED}, + {VX_INPUT, VX_TYPE_SCALAR, VX_PARAMETER_STATE_REQUIRED}, + {VX_INPUT, VX_TYPE_SCALAR, VX_PARAMETER_STATE_REQUIRED}, +}; +#define _CONV1D_OVXLIB_PARAM_NUM _cnt_of_array( _conv1d_ovxlib_kernel_param_def ) +#define _IO_COUNT (4) + +/* + * Kernel function + */ +DEF_KERNEL_EXECUTOR(_compute) + ( + vsi_nn_kernel_node_t node, + const vsi_nn_kernel_node_param_t * param, + size_t param_size + ) +{ + int i = 0; + vsi_status status = VSI_FAILURE; + vsi_nn_kernel_tensor_t tensors[_IO_COUNT] = { NULL }; + vsi_nn_kernel_tensor_attr_t* attr[_IO_COUNT] = { NULL }; + float* buffer[_IO_COUNT] = { NULL }; + int32_t stride = 0; + int32_t pad_front = 0; + int32_t pad_end = 0; + int32_t dilation = 0; + int32_t overflow_policy = 0; + int32_t rounding_policy = 0; + int32_t down_scale_size_rounding = 0; + + tensors[0] = (vsi_nn_kernel_tensor_t)param[0]; + tensors[1] = (vsi_nn_kernel_tensor_t)param[1]; + tensors[2] = (vsi_nn_kernel_tensor_t)param[2]; + tensors[3] = (vsi_nn_kernel_tensor_t)param[3]; + + attr[0] = vsi_nn_kernel_tensor_attr_create( tensors[0] ); + attr[1] = vsi_nn_kernel_tensor_attr_create( tensors[1] ); + attr[2] = vsi_nn_kernel_tensor_attr_create( tensors[2] ); + attr[3] = vsi_nn_kernel_tensor_attr_create( tensors[3] ); + + buffer[0] = (float*)vsi_nn_kernel_tensor_create_buffer( tensors[0], attr[0], TRUE ); + CHECK_PTR_FAIL_GOTO( buffer[0], "Create input buffer fail.", final ); + buffer[1] = (float*)vsi_nn_kernel_tensor_create_buffer( tensors[1], attr[1], TRUE ); + CHECK_PTR_FAIL_GOTO( buffer[1], "Create input buffer fail.", final ); + buffer[2] = (float*)vsi_nn_kernel_tensor_create_buffer( tensors[2], attr[2], TRUE ); + buffer[3] = (float*)vsi_nn_kernel_tensor_create_buffer( tensors[3], attr[3], TRUE ); + CHECK_PTR_FAIL_GOTO( buffer[3], "Create input buffer fail.", final ); + + status = vsi_nn_kernel_scalar_read_int32((vsi_nn_kernel_scalar_t)param[4], &stride); + CHECK_STATUS_FAIL_GOTO(status, final); + status = vsi_nn_kernel_scalar_read_int32((vsi_nn_kernel_scalar_t)param[5], &pad_front); + CHECK_STATUS_FAIL_GOTO(status, final); + status = vsi_nn_kernel_scalar_read_int32((vsi_nn_kernel_scalar_t)param[6], &pad_end); + CHECK_STATUS_FAIL_GOTO(status, final); + status = vsi_nn_kernel_scalar_read_int32((vsi_nn_kernel_scalar_t)param[7], &dilation); + CHECK_STATUS_FAIL_GOTO(status, final); + status = vsi_nn_kernel_scalar_read_int32((vsi_nn_kernel_scalar_t)param[8], &overflow_policy); + CHECK_STATUS_FAIL_GOTO(status, final); + status = vsi_nn_kernel_scalar_read_int32((vsi_nn_kernel_scalar_t)param[9], &rounding_policy); + CHECK_STATUS_FAIL_GOTO(status, final); + status = vsi_nn_kernel_scalar_read_int32((vsi_nn_kernel_scalar_t)param[10], &down_scale_size_rounding); + CHECK_STATUS_FAIL_GOTO(status, final); + + { + int32_t batch = attr[0]->shape->data[2]; + int32_t input_channel = attr[0]->shape->data[1]; + int32_t 
input_height = attr[0]->shape->data[0]; + int32_t kernel_size = attr[1]->shape->data[0]; + int32_t output_channel = attr[1]->shape->data[2]; + int32_t output_height = attr[3]->shape->data[0]; + int32_t batch_index = 0; + int32_t input_channel_index = 0; + int32_t output_channel_index = 0; + int32_t output_h_index = 0; + + for(batch_index = 0; batch_index < batch; batch_index++) + { + float* per_batch_input = buffer[0] + batch_index * input_channel * input_height; + float* per_batch_output = buffer[3] + batch_index * output_channel * output_height; + for(output_channel_index = 0; output_channel_index < output_channel; output_channel_index++) + { + float* filter = buffer[1] + output_channel_index * input_channel * kernel_size; + for(output_h_index = 0; output_h_index < output_height; output_h_index++) + { + float output_value = 0.; + float* current_value_ptr = per_batch_input + output_h_index * stride; + + for(input_channel_index = 0; input_channel_index < input_channel; input_channel_index++) + { + int k = 0; + int32_t index = 0; + for(k = 0; k < kernel_size; k++) + { + float w = *(filter + input_channel_index * kernel_size + k); + float v = *(current_value_ptr + input_channel_index * input_height + index); + + output_value += w * v; + index += dilation; + } + } + + if(buffer[2]) + { + output_value += buffer[2][output_channel_index]; + } + + *(per_batch_output + output_channel_index * output_height + output_h_index) = output_value; + } + } + } + status = vsi_nn_kernel_tensor_write_from_float( tensors[3], attr[3], + buffer[3], batch * output_channel * output_height ); + CHECK_STATUS_FAIL_GOTO( status, final ); + } + +final: + for( i = 0; i < _IO_COUNT; i ++ ) + { + if( buffer[i] ) + { + free( buffer[i] ); + } + vsi_nn_kernel_tensor_attr_release( &attr[i] ); + } + + return status; + +} /* _compute() */ + +/* + * Query kernel + */ +static vsi_status _query_kernel + ( + vsi_nn_kernel_t * kernel, + vsi_nn_tensor_t * const * const inputs, + vsi_nn_tensor_t * const * const outputs + /* Add extra params */ + ) +{ + snprintf( kernel->info.name, VX_MAX_KERNEL_NAME, "%s", _KERNEL_NAME ); + kernel->info.function = _compute; + kernel->info.parameters = _conv1d_ovxlib_kernel_param_def; + kernel->info.numParams = _cnt_of_array( _conv1d_ovxlib_kernel_param_def ); + + return VSI_SUCCESS; +} /* _query_kernel() */ + +static vsi_nn_kernel_node_t _setup + ( + vsi_nn_graph_t * graph, + vsi_nn_tensor_t ** inputs, + size_t input_num, + vsi_nn_tensor_t ** outputs, + size_t output_num, + const vsi_nn_kernel_param_t * params, + vsi_nn_kernel_t * kernel + ) +{ + vsi_status status = VSI_FAILURE; + vsi_nn_kernel_node_param_t node_params[_CONV1D_OVXLIB_PARAM_NUM]; + vsi_nn_kernel_node_t node = NULL; + int j = 0; + + int32_t stride = vsi_nn_kernel_param_get_int32( params, "stride" ); + int32_t pad_front = vsi_nn_kernel_param_get_int32( params, "pad_front" ); + int32_t pad_end = vsi_nn_kernel_param_get_int32( params, "pad_end" ); + int32_t dilation = vsi_nn_kernel_param_get_int32( params, "dilation" ); + int32_t overflow_policy = vsi_nn_kernel_param_get_int32( params, "overflow_policy" ); + int32_t rounding_policy = vsi_nn_kernel_param_get_int32( params, "rounding_policy" ); + int32_t down_scale_size_rounding = vsi_nn_kernel_param_get_int32( params, "down_scale_size_rounding" ); + + status = _query_kernel( kernel, inputs, outputs /* Add extra params */ ); + if( VSI_SUCCESS == status) + { + node = vsi_nn_kernel_create_node( graph, kernel ); + if( node ) + { + /* Set inputs and outputs */ + vsi_nn_kernel_node_pack_io( 
node_params, _CONV1D_OVXLIB_PARAM_NUM, + inputs, input_num, outputs, output_num ); + j = (int)(input_num + output_num); + node_params[j++] = vsi_nn_kernel_scalar_create(graph, I32, &stride ); + node_params[j++] = vsi_nn_kernel_scalar_create(graph, I32, &pad_front ); + node_params[j++] = vsi_nn_kernel_scalar_create(graph, I32, &pad_end ); + node_params[j++] = vsi_nn_kernel_scalar_create(graph, I32, &dilation ); + node_params[j++] = vsi_nn_kernel_scalar_create(graph, I32, &overflow_policy ); + node_params[j++] = vsi_nn_kernel_scalar_create(graph, I32, &rounding_policy ); + node_params[j++] = vsi_nn_kernel_scalar_create(graph, I32, &down_scale_size_rounding ); + /* Pass parameters to node. */ + status = vsi_nn_kernel_node_pass_param( node, node_params, _CONV1D_OVXLIB_PARAM_NUM ); + vsi_nn_kernel_scalar_release( &node_params[--j] ); + vsi_nn_kernel_scalar_release( &node_params[--j] ); + vsi_nn_kernel_scalar_release( &node_params[--j] ); + vsi_nn_kernel_scalar_release( &node_params[--j] ); + vsi_nn_kernel_scalar_release( &node_params[--j] ); + vsi_nn_kernel_scalar_release( &node_params[--j] ); + vsi_nn_kernel_scalar_release( &node_params[--j] ); + } + } + return node; +} /* _setup() */ + +__END_DECLS + +REGISTER_BACKEND_CPU( conv1d_ovxlib, _setup ) diff --git a/src/tim/vx/internal/src/kernel/cpu/depth2space_internal_cpu.c b/src/tim/vx/internal/src/kernel/cpu/depth2space_internal_cpu.c index 138c6e4..aa96ba3 100644 --- a/src/tim/vx/internal/src/kernel/cpu/depth2space_internal_cpu.c +++ b/src/tim/vx/internal/src/kernel/cpu/depth2space_internal_cpu.c @@ -35,7 +35,7 @@ #include "vsi_nn_error.h" #include "utils/vsi_nn_util.h" #include "kernel/vsi_nn_kernel.h" -#include "client/vsi_nn_vxkernel.h" +#include "libnnext/vsi_nn_vxkernel.h" __BEGIN_DECLS diff --git a/src/tim/vx/internal/src/kernel/cpu/eltwise_unary_cpu.c b/src/tim/vx/internal/src/kernel/cpu/eltwise_unary_cpu.c index 03c1711..64f9490 100644 --- a/src/tim/vx/internal/src/kernel/cpu/eltwise_unary_cpu.c +++ b/src/tim/vx/internal/src/kernel/cpu/eltwise_unary_cpu.c @@ -32,7 +32,7 @@ #include "vsi_nn_prv.h" #include "vsi_nn_error.h" #include "kernel/vsi_nn_kernel.h" -#include "client/vsi_nn_vxkernel.h" +#include "libnnext/vsi_nn_vxkernel.h" __BEGIN_DECLS @@ -46,6 +46,7 @@ typedef enum UNARY_NEG, UNARY_HSIGMOID, UNARY_MISH, + UNARY_ROUND, } unary_type_e; @@ -101,6 +102,13 @@ static float mish_eval(float data) return data; } +static float round_eval(float data) +{ + data = (float)(vsi_rtne(data)); + + return data; +} + DEF_KERNEL_EXECUTOR(_eltwise_unary_exec) ( vsi_nn_kernel_node_t node, @@ -165,6 +173,9 @@ DEF_KERNEL_EXECUTOR(_eltwise_unary_exec) case UNARY_MISH: data = mish_eval(data); break; + case UNARY_ROUND: + data = round_eval(data); + break; default: break; } @@ -298,3 +309,4 @@ REGISTER_ELTWISE_UNARY_BACKEND_CPU( elu, UNARY_ELU ) REGISTER_ELTWISE_UNARY_BACKEND_CPU( neg, UNARY_NEG ) REGISTER_ELTWISE_UNARY_BACKEND_CPU( hard_sigmoid, UNARY_HSIGMOID ) REGISTER_ELTWISE_UNARY_BACKEND_CPU( mish, UNARY_MISH ) +REGISTER_ELTWISE_UNARY_BACKEND_CPU( round, UNARY_ROUND ) \ No newline at end of file diff --git a/src/tim/vx/internal/src/kernel/cpu/erf_cpu.c b/src/tim/vx/internal/src/kernel/cpu/erf_cpu.c new file mode 100644 index 0000000..07f8e82 --- /dev/null +++ b/src/tim/vx/internal/src/kernel/cpu/erf_cpu.c @@ -0,0 +1,229 @@ +/**************************************************************************** +* +* Copyright (c) 2020 Vivante Corporation +* +* Permission is hereby granted, free of charge, to any person obtaining a +* copy of this software and 
associated documentation files (the "Software"), +* to deal in the Software without restriction, including without limitation +* the rights to use, copy, modify, merge, publish, distribute, sublicense, +* and/or sell copies of the Software, and to permit persons to whom the +* Software is furnished to do so, subject to the following conditions: +* +* The above copyright notice and this permission notice shall be included in +* all copies or substantial portions of the Software. +* +* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +* AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING +* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER +* DEALINGS IN THE SOFTWARE. +* +*****************************************************************************/ + + +#include +#include +#include +#include "vsi_nn_types.h" +#include "vsi_nn_tensor.h" +#include "vsi_nn_graph.h" +#include "vsi_nn_log.h" +#include "vsi_nn_error.h" +#include "vsi_nn_prv.h" +#include "vsi_nn_tensor_util.h" +#include "utils/vsi_nn_util.h" +#include "kernel/vsi_nn_kernel.h" +#include "libnnext/vx_lib_nnext.h" + +__BEGIN_DECLS + +/* + * Define kernel meta. + */ +#define _INPUT_NUM (1) +#define _OUTPUT_NUM (1) +#define _KERNEL_NAME CVIVANTE_NAMESPACE("cpu.erf") + + +/* + * Kernel params + */ +static vx_param_description_t _erf_kernel_param_def[] = +{ + {VX_INPUT, VX_TYPE_TENSOR, VX_PARAMETER_STATE_REQUIRED}, + {VX_OUTPUT, VX_TYPE_TENSOR, VX_PARAMETER_STATE_REQUIRED}, + // Add kererl parameters here +}; +#define _ERF_PARAM_NUM _cnt_of_array( _erf_kernel_param_def ) + + +/* + * Kernel function + */ +DEF_KERNEL_EXECUTOR(_compute) + ( + vsi_nn_kernel_node_t node, + const vsi_nn_kernel_node_param_t * param, + size_t param_size + ) +{ + vsi_status status = VSI_FAILURE; + vsi_nn_kernel_tensor_t input[_INPUT_NUM] = {NULL}; + vsi_nn_kernel_tensor_t output[_OUTPUT_NUM] = {NULL}; + float *f32_in_buffer[_INPUT_NUM] = {NULL}; + float *f32_out_buffer[_OUTPUT_NUM] = {NULL}; + vsi_nn_kernel_tensor_attr_t *in_attr[_INPUT_NUM]; + vsi_nn_kernel_tensor_attr_t *out_attr[_OUTPUT_NUM]; + size_t out_stride_size[_OUTPUT_NUM][VSI_NN_MAX_DIM_NUM] = {{1}}; + size_t out_elements[_OUTPUT_NUM] = {0}; + size_t out_bytes[_OUTPUT_NUM] = {0}; + size_t i = 0; + + /* prepare data */ + for (i = 0; i < _INPUT_NUM; i ++) + { + input[i] = (vsi_nn_kernel_tensor_t)param[i]; + in_attr[i] = vsi_nn_kernel_tensor_attr_create( input[i] ); + f32_in_buffer[i] = (float*)vsi_nn_kernel_tensor_create_buffer( input[i], in_attr[i], TRUE ); + CHECK_PTR_FAIL_GOTO( f32_in_buffer[i], "Create input0 buffer fail.", final ); + } + + for (i = 0; i < _OUTPUT_NUM; i ++) + { + output[i] = (vsi_nn_kernel_tensor_t)param[i + _INPUT_NUM]; + out_attr[i] = vsi_nn_kernel_tensor_attr_create( output[i] ); + vsi_nn_kernel_tensor_attr_get_stride( out_attr[i], out_stride_size[i] ); + out_elements[i] = vsi_nn_kernel_tensor_attr_get_size( out_attr[i] ); + out_bytes[i] = out_elements[i] * sizeof(float); + f32_out_buffer[i] = (float *)malloc( out_bytes[i] ); + CHECK_PTR_FAIL_GOTO( f32_out_buffer[i], "Create output buffer fail.", final ); + memset( f32_out_buffer[i], 0, out_bytes[i] ); + } +#define ERF_PI 3.141592653589793 + for (i = 0; i < out_elements[0]; i ++) + { + /* 2 / sqrt(pi) * (sum[(-1)^n! 
* x ^ (2n + 1)] + x) */ + float x = f32_in_buffer[0][i]; + float res = 0; + float tmp = x; + float factorial = 1; /*n!*/ + float x_pow = x; + int32_t one = 1; + int32_t n = 1; + + while (vsi_abs(tmp) > 1e-5) + { + res += tmp; + + factorial *= n; + one *= -1; + x_pow *= x * x; + tmp = one / factorial * x_pow / ( 2 * n + 1); + + n ++; + } + + + res *= 2.0f / (float)sqrt(ERF_PI); + + f32_out_buffer[0][i] = res; + } + + /* save data */ + for(i = 0; i < _OUTPUT_NUM; i++) + { + status = vsi_nn_kernel_tensor_write_from_float( output[i], out_attr[i], + f32_out_buffer[i], out_elements[i] ); + CHECK_STATUS_FAIL_GOTO( status, final ); + } + +final: + for (i = 0; i < _INPUT_NUM; i++) + { + if (f32_in_buffer[i]) + { + free(f32_in_buffer[i]); + f32_in_buffer[i] = NULL; + } + if (in_attr[i]) + { + vsi_nn_kernel_tensor_attr_release( &in_attr[i] ); + } + } + for (i = 0; i < _OUTPUT_NUM; i++) + { + if (f32_out_buffer[i]) + { + free(f32_out_buffer[i]); + f32_out_buffer[i] = NULL; + } + if (out_attr[i]) + { + vsi_nn_kernel_tensor_attr_release( &out_attr[i] ); + } + } + + return status; +} /* _compute() */ + + +/* + * Query kernel + */ +static vsi_status _query_kernel + ( + vsi_nn_kernel_t * kernel, + vsi_nn_tensor_t * const * const inputs, + vsi_nn_tensor_t * const * const outputs + /* Add extra params */ + ) +{ + vsi_status status = VSI_FAILURE; + snprintf( kernel->info.name, VX_MAX_KERNEL_NAME, "%s", _KERNEL_NAME ); + kernel->info.function = _compute; + kernel->info.parameters = _erf_kernel_param_def; + kernel->info.numParams = _cnt_of_array( _erf_kernel_param_def ); + status = VSI_SUCCESS; + + return status; +} /* _query_kernel() */ + + +static vsi_nn_kernel_node_t _setup + ( + vsi_nn_graph_t * graph, + vsi_nn_tensor_t ** inputs, + size_t input_num, + vsi_nn_tensor_t ** outputs, + size_t output_num, + const vsi_nn_kernel_param_t * params, + vsi_nn_kernel_t * kernel + ) +{ + vsi_status status = VSI_FAILURE; + vsi_nn_kernel_node_param_t node_params[_ERF_PARAM_NUM]; + vsi_nn_kernel_node_t node = NULL; + + + status = _query_kernel( kernel, inputs, outputs); + if ( VSI_SUCCESS == status) + { + node = vsi_nn_kernel_create_node( graph, kernel ); + if ( node ) + { + /* Set inputs and outputs */ + vsi_nn_kernel_node_pack_io( node_params, _ERF_PARAM_NUM, + inputs, input_num, outputs, output_num ); + /* Pass parameters to node. 
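The comment in the erf kernel above writes the Maclaurin series in a compressed form; the loop actually evaluates erf(x) = 2/sqrt(pi) * sum over n >= 0 of (-1)^n * x^(2n+1) / (n! * (2n+1)), stopping once a term falls below 1e-5. A standalone restatement of that series, for reference only:

/* Reference sketch of the same series the CPU kernel evaluates; not part of the patch. */
#include <math.h>

static float erf_series(float x)
{
    const float two_over_sqrt_pi = 1.1283791671f;  /* 2 / sqrt(pi) */
    float sum = 0.0f;
    float term = x;          /* n = 0 term: x / (0! * 1) */
    float factorial = 1.0f;
    float x_pow = x;
    float sign = 1.0f;
    int n = 1;

    while (fabsf(term) > 1e-5f)
    {
        sum += term;
        factorial *= (float)n;      /* n! */
        sign = -sign;               /* (-1)^n */
        x_pow *= x * x;             /* x^(2n+1) */
        term = sign * x_pow / (factorial * (float)(2 * n + 1));
        n++;
    }
    return sum * two_over_sqrt_pi;
}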
*/ + status = vsi_nn_kernel_node_pass_param( node, node_params, _ERF_PARAM_NUM ); + } + } + + return node; +} /* _setup() */ + +__END_DECLS + +REGISTER_BACKEND_CPU( erf, _setup ) diff --git a/src/tim/vx/internal/src/kernel/cpu/gather_cpu.c b/src/tim/vx/internal/src/kernel/cpu/gather_cpu.c index c234a51..076b6b8 100644 --- a/src/tim/vx/internal/src/kernel/cpu/gather_cpu.c +++ b/src/tim/vx/internal/src/kernel/cpu/gather_cpu.c @@ -35,7 +35,7 @@ #include "vsi_nn_error.h" #include "utils/vsi_nn_util.h" #include "kernel/vsi_nn_kernel.h" -#include "client/vsi_nn_vxkernel.h" +#include "libnnext/vsi_nn_vxkernel.h" __BEGIN_DECLS diff --git a/src/tim/vx/internal/src/kernel/cpu/gather_nd_cpu.c b/src/tim/vx/internal/src/kernel/cpu/gather_nd_cpu.c index cb22732..3d912b8 100644 --- a/src/tim/vx/internal/src/kernel/cpu/gather_nd_cpu.c +++ b/src/tim/vx/internal/src/kernel/cpu/gather_nd_cpu.c @@ -35,7 +35,7 @@ #include "vsi_nn_error.h" #include "utils/vsi_nn_util.h" #include "kernel/vsi_nn_kernel.h" -#include "client/vsi_nn_vxkernel.h" +#include "libnnext/vsi_nn_vxkernel.h" __BEGIN_DECLS diff --git a/src/tim/vx/internal/src/kernel/cpu/group_normalization_cpu.c b/src/tim/vx/internal/src/kernel/cpu/group_normalization_cpu.c new file mode 100644 index 0000000..17f45d7 --- /dev/null +++ b/src/tim/vx/internal/src/kernel/cpu/group_normalization_cpu.c @@ -0,0 +1,315 @@ +/**************************************************************************** +* +* Copyright (c) 2019 Vivante Corporation +* +* Permission is hereby granted, free of charge, to any person obtaining a +* copy of this software and associated documentation files (the "Software"), +* to deal in the Software without restriction, including without limitation +* the rights to use, copy, modify, merge, publish, distribute, sublicense, +* and/or sell copies of the Software, and to permit persons to whom the +* Software is furnished to do so, subject to the following conditions: +* +* The above copyright notice and this permission notice shall be included in +* all copies or substantial portions of the Software. +* +* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +* AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING +* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER +* DEALINGS IN THE SOFTWARE. +* +*****************************************************************************/ + + +#include +#include +#include +#include "vsi_nn_types.h" +#include "vsi_nn_tensor.h" +#include "vsi_nn_graph.h" +#include "vsi_nn_log.h" +#include "vsi_nn_prv.h" +#include "vsi_nn_tensor_util.h" +#include "vsi_nn_error.h" +#include "utils/vsi_nn_util.h" +#include "kernel/vsi_nn_kernel.h" +#include "libnnext/vsi_nn_vxkernel.h" +#include "kernel/vsi_nn_kernel_gpu_shape_optimize.h" + +__BEGIN_DECLS + +/* + * Define kernel meta. 
+ */ +#define _CPU_ARG_NUM (2) +#define _CPU_INPUT_NUM (3) +#define _CPU_OUTPUT_NUM (1) +#define _CPU_IO_NUM (_CPU_INPUT_NUM + _CPU_OUTPUT_NUM) +#define _CPU_PARAM_NUM (_CPU_ARG_NUM + _CPU_IO_NUM) +#define _KERNEL_NAME CVIVANTE_NAMESPACE("cpu.group_norm") + +DEF_KERNEL_EXECUTOR(_group_norm_exec) + ( + vsi_nn_kernel_node_t node, + const vsi_nn_kernel_node_param_t * param, + size_t param_size + ) +{ + vsi_status status = VSI_FAILURE; + vsi_nn_kernel_tensor_t tensors[_CPU_IO_NUM] = { NULL }; + float * buffer[_CPU_IO_NUM] = { NULL }; + size_t out_elements = 0; + vsi_nn_kernel_tensor_attr_t * attr[_CPU_IO_NUM] = { NULL }; + uint32_t i = 0; + int32_t spaceOrg = 0; + float eps = .0f; + + tensors[0] = (vsi_nn_kernel_tensor_t)param[0]; + tensors[1] = (vsi_nn_kernel_tensor_t)param[1]; + tensors[2] = (vsi_nn_kernel_tensor_t)param[2]; + tensors[3] = (vsi_nn_kernel_tensor_t)param[3]; + + attr[0] = vsi_nn_kernel_tensor_attr_create( tensors[0] ); + CHECK_PTR_FAIL_GOTO( attr[0], "Create tensor attr buffer fail.", final ); + attr[1] = vsi_nn_kernel_tensor_attr_create( tensors[1] ); + CHECK_PTR_FAIL_GOTO( attr[1], "Create tensor attr buffer fail.", final ); + attr[2] = vsi_nn_kernel_tensor_attr_create( tensors[2] ); + CHECK_PTR_FAIL_GOTO( attr[2], "Create tensor attr buffer fail.", final ); + attr[3] = vsi_nn_kernel_tensor_attr_create( tensors[3] ); + CHECK_PTR_FAIL_GOTO( attr[3], "Create tensor attr buffer fail.", final ); + + out_elements = vsi_nn_kernel_tensor_attr_get_size( attr[3] ); + + status = vsi_nn_kernel_scalar_read_float32((vsi_nn_kernel_scalar_t)param[4], &eps); + CHECK_STATUS_FAIL_GOTO(status, final ); + + status = vsi_nn_kernel_scalar_read_int32((vsi_nn_kernel_scalar_t)param[5], &spaceOrg); + CHECK_STATUS_FAIL_GOTO(status, final ); + + buffer[0] = (float*)vsi_nn_kernel_tensor_create_buffer( tensors[0], attr[0], TRUE ); + CHECK_PTR_FAIL_GOTO( buffer[0], "Create input0 buffer fail.", final ); + + buffer[1] = (float*)vsi_nn_kernel_tensor_create_buffer( tensors[1], attr[1], TRUE ); + CHECK_PTR_FAIL_GOTO( buffer[1], "Create input1 buffer fail.", final ); + + buffer[2] = (float*)vsi_nn_kernel_tensor_create_buffer( tensors[2], attr[2], TRUE ); + CHECK_PTR_FAIL_GOTO( buffer[2], "Create input1 buffer fail.", final ); + + buffer[3] = (float *)malloc( out_elements * sizeof(float) ); + CHECK_PTR_FAIL_GOTO( buffer[3], "Create output buffer fail.", final ); + memset( buffer[3], 0, out_elements * sizeof(float) ); + + { + uint32_t b = 0, c = 0; + uint32_t height = attr[0]->shape->data[1]; + uint32_t width = attr[0]->shape->data[0]; + uint32_t ch = attr[0]->shape->size > 2 ? attr[0]->shape->data[2] : 1; + uint32_t bh = attr[0]->shape->size > 3 ? 
attr[0]->shape->data[3] : 1; + uint32_t spatial = height * width; + + for (b = 0; b < bh; b++) + { + for (c = 0; c < ch; c++) + { + uint32_t page = c * spatial + b * (spatial * ch); + uint32_t paraIdx = c * attr[1]->shape->data[0]; + float sum = .0f; + float sumsq = .0f; + float mean = .0f; + float vari = .0f; + float data = 0; + + for (i = 0; i < spatial; i++) + { + uint32_t index = page + i; + sum += buffer[0][index]; + } + + mean = sum / spatial; + for (i = 0; i < spatial; i++) + { + uint32_t index = page + i; + data = buffer[0][index] - mean; + sumsq += data * data; + } + + vari = sumsq / spatial; + vari = (float)(1.0 / sqrtf(vari + eps)); + + for (i = 0; i < spatial; i++) + { + float normVal = 0; + uint32_t index = page + i; + uint32_t tmpIdx = paraIdx + i / spaceOrg; + float scaleVal = buffer[2][tmpIdx]; + float biasVal = buffer[1][tmpIdx]; + + data = buffer[0][index] - mean; + normVal = data * vari * scaleVal + biasVal; + buffer[3][index] = normVal; + } + } + } + } + + status = vsi_nn_kernel_tensor_write_from_float( tensors[3], attr[3], + buffer[3], out_elements ); + CHECK_STATUS_FAIL_GOTO( status, final ); + +final: + for( i = 0; i < _CPU_IO_NUM; i ++ ) + { + if ( buffer[i] ) + { + free( buffer[i] ); + } + } + for( i = 0; i < _CPU_IO_NUM; i ++ ) + { + if (attr[i]) { vsi_nn_kernel_tensor_attr_release( &attr[i] ); } + } + return status; +} /* _group_norm_exec() */ +/* + * Kernel params + */ +static vx_param_description_t _group_normalization_kernel_param_def[] = +{ + {VX_INPUT, VX_TYPE_TENSOR, VX_PARAMETER_STATE_REQUIRED}, + {VX_INPUT, VX_TYPE_TENSOR, VX_PARAMETER_STATE_REQUIRED}, + {VX_INPUT, VX_TYPE_TENSOR, VX_PARAMETER_STATE_REQUIRED}, + {VX_OUTPUT, VX_TYPE_TENSOR, VX_PARAMETER_STATE_REQUIRED}, + {VX_INPUT, VX_TYPE_SCALAR, VX_PARAMETER_STATE_REQUIRED}, + {VX_INPUT, VX_TYPE_SCALAR, VX_PARAMETER_STATE_REQUIRED}, + // Add kererl parameters here +}; +#define _GROUP_NORMALIZATION_PARAM_NUM _cnt_of_array( _group_normalization_kernel_param_def ) + +static const vx_kernel_description_t _kernel_info = +{ + KERNEL_ID_PLACEHOLDER, + _KERNEL_NAME, + _group_norm_exec, + _group_normalization_kernel_param_def, + _cnt_of_array( _group_normalization_kernel_param_def ), + vsi_nn_KernelValidator, + NULL, + NULL, + vsi_nn_KernelInitializer, + vsi_nn_KernelDeinitializer +}; + +static vsi_status _query_kernel + ( + vsi_nn_tensor_t* const* const inputs, + vsi_nn_tensor_t* const* const outputs, + vsi_nn_kernel_t* kernel + ) +{ + memmove( &kernel->info, &_kernel_info, sizeof(vx_kernel_description_t) ); + return VSI_SUCCESS; +} /* _query_kernel() */ + +static int32_t _optimize_gn_shape_cpu + ( + vsi_nn_tensor_t ** inputs, + int32_t group_size, + int32_t group_num, + int32_t* opt_shape + ) +{ + vsi_status status = VSI_SUCCESS; + int32_t group_shape[VSI_NN_MAX_DIM_NUM] = {0}; + int32_t new_rank = 0; + group_shape[0] = inputs[0]->attr.size[0]; + group_shape[1] = inputs[0]->attr.size[1]; + group_shape[2] = group_size; + + vsi_nn_kernel_optimize_element_shape(group_shape, 3, opt_shape, &new_rank ); + + if (new_rank == 2) + { + opt_shape[2] = group_num; + opt_shape[3] = inputs[0]->attr.dim_num > 3 ? 
inputs[0]->attr.size[3] : 1; + } + else + { + status = VSI_FAILURE; + } + + return status; +} + +static vsi_nn_kernel_node_t _setup + ( + vsi_nn_graph_t * graph, + vsi_nn_tensor_t ** inputs, + size_t input_num, + vsi_nn_tensor_t ** outputs, + size_t output_num, + const vsi_nn_kernel_param_t * params, + vsi_nn_kernel_t * kernel + ) +{ + vsi_status status = VSI_FAILURE; + vsi_nn_kernel_node_param_t backend_params[_CPU_PARAM_NUM] = {NULL}; + vsi_nn_kernel_node_t node = NULL; + vsi_nn_kernel_tensor_t rs_input = NULL, rs_output = NULL; + int32_t new_shape[VSI_NN_MAX_DIM_NUM] = { 1, 1, 1, 1 }; + int32_t group_num = vsi_nn_kernel_param_get_int32( params, "group_num" ); + int32_t group_size = inputs[0]->attr.size[2] / group_num; + int32_t spaceOrg = inputs[0]->attr.size[0] * inputs[0]->attr.size[1]; + + status = _optimize_gn_shape_cpu(inputs, group_size, group_num, new_shape); + if ( VSI_SUCCESS != status ) + { + goto final; + } + rs_input = vsi_nn_kernel_tensor_reshape(inputs[0]->t, new_shape, 4); + rs_output = vsi_nn_kernel_tensor_reshape(outputs[0]->t, new_shape, 4); + + status = _query_kernel( inputs, outputs, kernel ); + if ( VSI_SUCCESS == status) + { + node = vsi_nn_kernel_create_node( graph, kernel ); + if ( node ) + { + float eps = vsi_nn_kernel_param_get_float32( params, "eps" ); + uint32_t index = 0; + /* Set inputs and outputs */ + backend_params[index++] = rs_input; + backend_params[index++] = (vsi_nn_kernel_node_param_t)inputs[1]->t; + backend_params[index++] = (vsi_nn_kernel_node_param_t)inputs[2]->t; + backend_params[index++] = rs_output; + backend_params[index++] = vsi_nn_kernel_scalar_create( graph, F32, &eps ); + backend_params[index++] = vsi_nn_kernel_scalar_create( graph, I32, &spaceOrg ); + + /* Pass parameters to node. */ + status = vsi_nn_kernel_node_pass_param( node, backend_params, _CPU_PARAM_NUM ); + CHECK_STATUS( status ); + vsi_nn_kernel_scalar_release( &backend_params[4] ); + vsi_nn_kernel_scalar_release( &backend_params[5] ); + } + else + { + status = VSI_FAILURE; + } + } +final: + if (rs_input) + { + vsi_nn_kernel_tensor_release( &rs_input ); + } + if (rs_output) + { + vsi_nn_kernel_tensor_release( &rs_output ); + } + return node; +} /* _setup() */ + +__END_DECLS + +REGISTER_BACKEND_CPU( group_norm, _setup ) + diff --git a/src/tim/vx/internal/src/kernel/cpu/instance_normalization_cpu.c b/src/tim/vx/internal/src/kernel/cpu/instance_normalization_cpu.c index 6720a14..c9b665c 100644 --- a/src/tim/vx/internal/src/kernel/cpu/instance_normalization_cpu.c +++ b/src/tim/vx/internal/src/kernel/cpu/instance_normalization_cpu.c @@ -35,7 +35,7 @@ #include "vsi_nn_error.h" #include "utils/vsi_nn_util.h" #include "kernel/vsi_nn_kernel.h" -#include "client/vsi_nn_vxkernel.h" +#include "libnnext/vsi_nn_vxkernel.h" __BEGIN_DECLS diff --git a/src/tim/vx/internal/src/kernel/cpu/layer_normalization_cpu.c b/src/tim/vx/internal/src/kernel/cpu/layer_normalization_cpu.c index d6d9802..c8c82bf 100644 --- a/src/tim/vx/internal/src/kernel/cpu/layer_normalization_cpu.c +++ b/src/tim/vx/internal/src/kernel/cpu/layer_normalization_cpu.c @@ -35,7 +35,7 @@ #include "vsi_nn_error.h" #include "utils/vsi_nn_util.h" #include "kernel/vsi_nn_kernel.h" -#include "client/vsi_nn_vxkernel.h" +#include "libnnext/vsi_nn_vxkernel.h" __BEGIN_DECLS @@ -143,8 +143,8 @@ DEF_KERNEL_EXECUTOR(_layer_norm_exec) { int idx = (outer * axisSize + i) * innerSize + inner; float data = buffer[0][idx] - mean; - float scaleVal = buffer[2][idx]; - float biasVal = buffer[1][idx]; + float scaleVal = buffer[2][i]; + float biasVal 
= buffer[1][i]; float normVal = data * vari * scaleVal + biasVal; buffer[3][idx] = normVal; } diff --git a/src/tim/vx/internal/src/kernel/cpu/log_softmax_cpu.c b/src/tim/vx/internal/src/kernel/cpu/log_softmax_cpu.c index 4f56938..2ef240f 100644 --- a/src/tim/vx/internal/src/kernel/cpu/log_softmax_cpu.c +++ b/src/tim/vx/internal/src/kernel/cpu/log_softmax_cpu.c @@ -36,7 +36,7 @@ #include "utils/vsi_nn_dtype_util.h" #include "kernel/vsi_nn_kernel.h" #include "kernel/vsi_nn_kernel_eltwise.h" -#include "client/vsi_nn_vxkernel.h" +#include "libnnext/vsi_nn_vxkernel.h" __BEGIN_DECLS #define _CPU_ARG_NUM (2) diff --git a/src/tim/vx/internal/src/kernel/cpu/matrixmul_cpu.c b/src/tim/vx/internal/src/kernel/cpu/matrixmul_cpu.c index 4e8097d..c263ff7 100644 --- a/src/tim/vx/internal/src/kernel/cpu/matrixmul_cpu.c +++ b/src/tim/vx/internal/src/kernel/cpu/matrixmul_cpu.c @@ -35,7 +35,7 @@ #include "vsi_nn_error.h" #include "utils/vsi_nn_util.h" #include "kernel/vsi_nn_kernel.h" -#include "client/vsi_nn_vxkernel.h" +#include "libnnext/vsi_nn_vxkernel.h" __BEGIN_DECLS diff --git a/src/tim/vx/internal/src/kernel/cpu/maximum_cpu.c b/src/tim/vx/internal/src/kernel/cpu/maximum_cpu.c index 61bba5c..4795735 100644 --- a/src/tim/vx/internal/src/kernel/cpu/maximum_cpu.c +++ b/src/tim/vx/internal/src/kernel/cpu/maximum_cpu.c @@ -36,7 +36,7 @@ #include "utils/vsi_nn_dtype_util.h" #include "kernel/vsi_nn_kernel.h" #include "kernel/vsi_nn_kernel_eltwise.h" -#include "client/vsi_nn_vxkernel.h" +#include "libnnext/vsi_nn_vxkernel.h" __BEGIN_DECLS diff --git a/src/tim/vx/internal/src/kernel/cpu/minimum_cpu.c b/src/tim/vx/internal/src/kernel/cpu/minimum_cpu.c index 1a63797..6908a1e 100644 --- a/src/tim/vx/internal/src/kernel/cpu/minimum_cpu.c +++ b/src/tim/vx/internal/src/kernel/cpu/minimum_cpu.c @@ -32,7 +32,7 @@ #include "vsi_nn_prv.h" #include "vsi_nn_error.h" #include "kernel/vsi_nn_kernel.h" -#include "client/vsi_nn_vxkernel.h" +#include "libnnext/vsi_nn_vxkernel.h" __BEGIN_DECLS diff --git a/src/tim/vx/internal/src/kernel/cpu/moments_cpu.c b/src/tim/vx/internal/src/kernel/cpu/moments_cpu.c index f1124bf..ad46c58 100644 --- a/src/tim/vx/internal/src/kernel/cpu/moments_cpu.c +++ b/src/tim/vx/internal/src/kernel/cpu/moments_cpu.c @@ -35,7 +35,7 @@ #include "vsi_nn_error.h" #include "utils/vsi_nn_util.h" #include "kernel/vsi_nn_kernel.h" -#include "client/vsi_nn_vxkernel.h" +#include "libnnext/vsi_nn_vxkernel.h" __BEGIN_DECLS diff --git a/src/tim/vx/internal/src/kernel/cpu/nms_cpu.c b/src/tim/vx/internal/src/kernel/cpu/nms_cpu.c new file mode 100644 index 0000000..8924f7b --- /dev/null +++ b/src/tim/vx/internal/src/kernel/cpu/nms_cpu.c @@ -0,0 +1,441 @@ +/**************************************************************************** +* +* Copyright (c) 2020 Vivante Corporation +* +* Permission is hereby granted, free of charge, to any person obtaining a +* copy of this software and associated documentation files (the "Software"), +* to deal in the Software without restriction, including without limitation +* the rights to use, copy, modify, merge, publish, distribute, sublicense, +* and/or sell copies of the Software, and to permit persons to whom the +* Software is furnished to do so, subject to the following conditions: +* +* The above copyright notice and this permission notice shall be included in +* all copies or substantial portions of the Software. 
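The group and layer normalization kernels above share one step: statistics are taken over the normalized block, then each element is scaled and shifted by gamma and beta indexed along the normalized axis, which is exactly what the layer_normalization_cpu fix restores. A minimal sketch of that step for the per-element case; group norm applies the same formula but indexes gamma and beta per channel (tmpIdx = paraIdx + i / spaceOrg in the code above):

/* Illustrative only, not part of the patched sources. */
#include <math.h>
#include <stddef.h>

static void normalize_block(const float *x, float *y, size_t n,
                            const float *gamma, const float *beta, float eps)
{
    float sum = 0.0f, sq = 0.0f;
    size_t i;

    for (i = 0; i < n; i++) sum += x[i];
    const float mean = sum / (float)n;

    for (i = 0; i < n; i++) { float d = x[i] - mean; sq += d * d; }
    const float inv_std = 1.0f / sqrtf(sq / (float)n + eps);

    for (i = 0; i < n; i++)
        y[i] = (x[i] - mean) * inv_std * gamma[i] + beta[i];  /* per-axis-element scale/bias */
}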
+* +* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +* AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING +* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER +* DEALINGS IN THE SOFTWARE. +* +*****************************************************************************/ + + +#include +#include +#include +#include "vsi_nn_types.h" +#include "vsi_nn_tensor.h" +#include "vsi_nn_graph.h" +#include "vsi_nn_log.h" +#include "vsi_nn_error.h" +#include "vsi_nn_prv.h" +#include "vsi_nn_tensor_util.h" +#include "utils/vsi_nn_util.h" +#include "kernel/vsi_nn_kernel.h" +#include "libnnext/vx_lib_nnext.h" + +__BEGIN_DECLS + +/* + * Define kernel meta. + */ +#define _INPUT_NUM (2) +#define _OUTPUT_NUM (3) + #define _CPU_IO_NUM (_INPUT_NUM + _OUTPUT_NUM) +#define _KERNEL_NAME CVIVANTE_NAMESPACE("cpu.nms") + + +/* + * Kernel params + */ +static vx_param_description_t _nms_kernel_param_def[] = +{ + {VX_INPUT, VX_TYPE_TENSOR, VX_PARAMETER_STATE_REQUIRED}, + {VX_INPUT, VX_TYPE_TENSOR, VX_PARAMETER_STATE_REQUIRED}, + {VX_OUTPUT, VX_TYPE_TENSOR, VX_PARAMETER_STATE_REQUIRED}, + {VX_OUTPUT, VX_TYPE_TENSOR, VX_PARAMETER_STATE_REQUIRED}, + {VX_OUTPUT, VX_TYPE_TENSOR, VX_PARAMETER_STATE_REQUIRED}, + {VX_INPUT, VX_TYPE_SCALAR, VX_PARAMETER_STATE_REQUIRED}, + {VX_INPUT, VX_TYPE_SCALAR, VX_PARAMETER_STATE_REQUIRED}, + {VX_INPUT, VX_TYPE_SCALAR, VX_PARAMETER_STATE_REQUIRED}, + {VX_INPUT, VX_TYPE_SCALAR, VX_PARAMETER_STATE_REQUIRED}, + // Add kererl parameters here +}; +#define SCALAR_INPUT_MAX_SIZE (5) +#define SCALAR_INPUT_IOU_THRES (6) +#define SCALAR_INPUT_SCORE_THRES (7) +#define SCALAR_INPUT_SOFT_NMS_SIGMA (8) +#define _NMS_PARAM_NUM _cnt_of_array( _nms_kernel_param_def ) + +typedef struct Candidate_s +{ + int index; + float score; + int suppress_begin_index; +}Candidate; +static void _swap_element + ( + Candidate* list, + uint32_t first, + uint32_t second + ) +{ + Candidate temp; + memcpy(&temp, &list[first], sizeof(Candidate)); + memcpy(&list[first], &list[second], sizeof(Candidate)); + memcpy(&list[second], &temp, sizeof(Candidate)); +} + +static uint32_t _max_element + ( + Candidate* list, + uint32_t len + ) +{ + uint32_t i; + uint32_t max_index = 0; + float max_val = list[0].score; + for ( i = 1; i < len; i++ ) + { + float val = list[i].score; + if ( max_val < val ) + { + max_val = val; + max_index = i; + } + } + + return max_index; +} + +typedef struct box_corner_encoding_s +{ + float y1; + float x1; + float y2; + float x2; +}box_corner_encoding; + +static float _computeIntersectionOverUnion + ( + const float* boxes, + const int32_t i, + const int32_t j + ) +{ + box_corner_encoding box_i = ((box_corner_encoding *)boxes)[i]; + box_corner_encoding box_j = ((box_corner_encoding *)boxes)[j]; + const float box_i_y_min = vsi_nn_min(box_i.y1, box_i.y2); + const float box_i_y_max = vsi_nn_max(box_i.y1, box_i.y2); + const float box_i_x_min = vsi_nn_min(box_i.x1, box_i.x2); + const float box_i_x_max = vsi_nn_max(box_i.x1, box_i.x2); + const float box_j_y_min = vsi_nn_min(box_j.y1, box_j.y2); + const float box_j_y_max = vsi_nn_max(box_j.y1, box_j.y2); + const float box_j_x_min = vsi_nn_min(box_j.x1, box_j.x2); + const float box_j_x_max = vsi_nn_max(box_j.x1, box_j.x2); + + const float area_i = + (box_i_y_max - box_i_y_min) * 
(box_i_x_max - box_i_x_min); + const float area_j = + (box_j_y_max - box_j_y_min) * (box_j_x_max - box_j_x_min); + const float intersection_ymax = vsi_nn_min(box_i_y_max, box_j_y_max); + const float intersection_xmax = vsi_nn_min(box_i_x_max, box_j_x_max); + const float intersection_ymin = vsi_nn_max(box_i_y_min, box_j_y_min); + const float intersection_xmin = vsi_nn_max(box_i_x_min, box_j_x_min); + const float intersection_area = + vsi_nn_max(intersection_ymax - intersection_ymin, 0.0f) * + vsi_nn_max(intersection_xmax - intersection_xmin, 0.0f); + + if (area_i <= 0 || area_j <= 0) + { + return 0.0f; + } + + return intersection_area / (area_i + area_j - intersection_area); +} + +/* + * Kernel function + */ +DEF_KERNEL_EXECUTOR(_compute) + ( + vsi_nn_kernel_node_t node, + const vsi_nn_kernel_node_param_t * param, + size_t param_size + ) +{ + vsi_status status = VX_SUCCESS; + vsi_nn_kernel_tensor_t tensors[_INPUT_NUM] = { NULL }; + vsi_nn_kernel_tensor_t output[_OUTPUT_NUM] = {NULL}; + float * buffer[_INPUT_NUM] = { NULL }; + float *f32_out_buffer[_OUTPUT_NUM] = {NULL}; + size_t stride_size[_INPUT_NUM][VSI_NN_MAX_DIM_NUM] = {{0}}; + size_t out_elements[_OUTPUT_NUM] = {0}; + vsi_nn_kernel_tensor_attr_t * attr[_INPUT_NUM] = { NULL }; + vsi_nn_kernel_tensor_attr_t *out_attr[_OUTPUT_NUM] = {NULL}; + int32_t i = 0; + int32_t num_boxes = 0; + float* boxes = NULL; + float* scores = NULL; + float* selected_indices = NULL; + float* selected_scores = NULL; + float* num_selected_indices = NULL; + Candidate * candidate = NULL; + int32_t select_size = 0; + int32_t max_output_size = 0; + int32_t select_start = 0; + int32_t select_len = 0; + float iou_threshold = 0.f; + float score_threshold = 0.f; + float soft_nms_sigma = 0.f; + float scale = 0; + int32_t num_outputs = 0; + + status = vsi_nn_kernel_scalar_read_int32((vsi_nn_kernel_scalar_t)param[SCALAR_INPUT_MAX_SIZE], + &max_output_size); + status |= vsi_nn_kernel_scalar_read_float32((vsi_nn_kernel_scalar_t)param[SCALAR_INPUT_IOU_THRES], + &iou_threshold); + status |= vsi_nn_kernel_scalar_read_float32((vsi_nn_kernel_scalar_t)param[SCALAR_INPUT_SCORE_THRES], + &score_threshold); + status |= vsi_nn_kernel_scalar_read_float32((vsi_nn_kernel_scalar_t)param[SCALAR_INPUT_SOFT_NMS_SIGMA], + &soft_nms_sigma); + CHECK_STATUS_FAIL_GOTO(status, final ); + + for ( i = 0; i < _INPUT_NUM; i++) + { + tensors[i] = (vsi_nn_kernel_tensor_t)param[i]; + attr[i] = vsi_nn_kernel_tensor_attr_create( tensors[i] ); + + vsi_nn_kernel_tensor_attr_get_stride( attr[i], stride_size[i] ); + buffer[i] = (float*)vsi_nn_kernel_tensor_create_buffer( tensors[i], attr[i], TRUE ); + CHECK_PTR_FAIL_GOTO( buffer[i], "Create input buffer fail.", final ); + } + + for ( i = 0; i < _OUTPUT_NUM; i++) + { + output[i] = (vsi_nn_kernel_tensor_t)param[i + _INPUT_NUM]; + out_attr[i] = vsi_nn_kernel_tensor_attr_create( output[i] ); + + out_elements[i] = vsi_nn_kernel_tensor_attr_get_size( out_attr[i] ); + f32_out_buffer[i] = (float *)malloc( out_elements[i] * sizeof(float) ); + CHECK_PTR_FAIL_GOTO( f32_out_buffer[i], "Create output buffer fail.", final ); + memset( f32_out_buffer[i], 0, out_elements[i] * sizeof(float) ); + } + + num_boxes = attr[0]->shape->data[1]; + boxes = buffer[0]; + scores = buffer[1]; + selected_indices = f32_out_buffer[0]; + selected_scores = f32_out_buffer[1]; + num_selected_indices = f32_out_buffer[2]; + + candidate = (Candidate*)malloc(num_boxes * sizeof(Candidate)); + CHECK_PTR_FAIL_GOTO( candidate, "Create select buffer fail.", final ); + memset(candidate, 0, num_boxes * 
sizeof(Candidate)); + + for (i = 0; i < num_boxes; ++i) + { + if (scores[i] > score_threshold) + { + candidate[select_size].index = i; + candidate[select_size].score = scores[i]; + candidate[select_size].suppress_begin_index = 0; + select_size++; + } + } + + num_outputs = vsi_nn_min(select_size, max_output_size); + + if (num_outputs == 0) + { + num_selected_indices[0] = 0; + } + + if (soft_nms_sigma > 0.0f) + { + scale = -0.5f / soft_nms_sigma; + } + + select_len = 0; + while (select_len < num_outputs && select_start < select_size) + { + int32_t j = 0; + float original_score = 0; + vsi_bool should_hard_suppress = FALSE; + + // find max score and swap to the front. + int32_t max_index = _max_element( &candidate[select_start], select_size - select_start); + + if (max_index != select_size - select_start - 1) + { + _swap_element(&(candidate[select_start]), max_index, 0); + } + + original_score = candidate[select_start].score; + // Calculate IoU of the rest, swap to the end (disgard) if needed. + for ( j = select_len - 1; j >= candidate[select_start].suppress_begin_index; j-- ) + { + int32_t idx = (int32_t)selected_indices[j]; + float iou = _computeIntersectionOverUnion(boxes, candidate[select_start].index, idx); + + // First decide whether to perform hard suppression. + if (iou >= iou_threshold) + { + should_hard_suppress = TRUE; + break; + } + + // Suppress score if NMS sigma > 0. + if (soft_nms_sigma > 0.0) + { + candidate[select_start].score = + candidate[select_start].score * (float)exp(scale * iou * iou); + } + + if (candidate[select_start].score <= score_threshold) + break; + } + + candidate[select_start].suppress_begin_index = select_len; + if (!should_hard_suppress) + { + if (candidate[select_start].score == original_score) + { + // Suppression has not occurred, so select next_candidate. + selected_indices[select_len] = (float)candidate[select_start].index; + selected_scores[select_len] = candidate[select_start].score; + ++ select_len; + } + if ( candidate[select_start].score > score_threshold) + { + // Soft suppression might have occurred and current score is still + // greater than score_threshold; add next_candidate back onto priority + // queue. 
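// With scale set to -0.5f / soft_nms_sigma, the score update above is the
// Gaussian soft-NMS weight exp(-iou^2 / (2*sigma)), applied only when the box
// is not hard-suppressed by iou_threshold. A compact restatement of that
// rescoring rule follows; here hard suppression is modeled as returning a zero
// score, whereas the kernel above tracks it with a flag. Illustrative only.
#include <math.h>

static float soft_nms_rescore(float score, float iou,
                              float iou_threshold, float soft_nms_sigma)
{
    if (iou >= iou_threshold)
    {
        return 0.0f;                               /* hard suppression */
    }
    if (soft_nms_sigma > 0.0f)
    {
        float scale = -0.5f / soft_nms_sigma;      /* same scale as the kernel */
        score *= (float)exp(scale * iou * iou);    /* exp(-iou^2 / (2*sigma)) */
    }
    return score;
}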
+ candidate[select_start].suppress_begin_index = select_len; + } + } + + select_start ++; + } + + num_selected_indices[0] = (float)select_len; + + for ( i = select_len; i < max_output_size; i++) + { + selected_indices[i] = 0; + selected_scores[i] = 0; + } + + /* save data */ + for ( i = 0; i < _OUTPUT_NUM; i++ ) + { + status = vsi_nn_kernel_tensor_write_from_float( output[i], out_attr[i], + f32_out_buffer[i], out_elements[i] ); + CHECK_STATUS_FAIL_GOTO( status, final ); + } + +final: + vsi_nn_safe_free(candidate); + for( i = 0; i < _INPUT_NUM; i ++ ) + { + if ( buffer[i] ) + { + free( buffer[i] ); + } + vsi_nn_kernel_tensor_attr_release( &attr[i] ); + } + + for ( i = 0; i < _OUTPUT_NUM; i++ ) + { + if (f32_out_buffer[i]) + { + free(f32_out_buffer[i]); + f32_out_buffer[i] = NULL; + } + if (out_attr[i]) + { + vsi_nn_kernel_tensor_attr_release( &out_attr[i] ); + } + } + + return status; +} /* _compute() */ + + +/* + * Query kernel + */ +static vsi_status _query_kernel + ( + vsi_nn_kernel_t * kernel, + vsi_nn_tensor_t * const * const inputs, + vsi_nn_tensor_t * const * const outputs + /* Add extra params */ + ) +{ + vsi_status status = VSI_FAILURE; + snprintf( kernel->info.name, VX_MAX_KERNEL_NAME, "%s", _KERNEL_NAME ); + kernel->info.function = _compute; + kernel->info.parameters = _nms_kernel_param_def; + kernel->info.numParams = _cnt_of_array( _nms_kernel_param_def ); + status = VSI_SUCCESS; + + return status; +} /* _query_kernel() */ + + +static vsi_nn_kernel_node_t _setup + ( + vsi_nn_graph_t * graph, + vsi_nn_tensor_t ** inputs, + size_t input_num, + vsi_nn_tensor_t ** outputs, + size_t output_num, + const vsi_nn_kernel_param_t * params, + vsi_nn_kernel_t * kernel + ) +{ + vsi_status status = VSI_FAILURE; + vsi_nn_kernel_node_param_t node_params[_NMS_PARAM_NUM]; + vsi_nn_kernel_node_t node = NULL; + int32_t max_output_size = vsi_nn_kernel_param_get_int32(params, "max_output_size"); + float iou_threshold = vsi_nn_kernel_param_get_float32(params, "iou_threshold"); + float score_threshold = vsi_nn_kernel_param_get_float32(params, "score_threshold"); + float soft_nms_sigma = vsi_nn_kernel_param_get_float32(params, "soft_nms_sigma"); + + status = _query_kernel( kernel, inputs, outputs ); + if ( VSI_SUCCESS == status) + { + node = vsi_nn_kernel_create_node( graph, kernel ); + if ( node ) + { + /* Set inputs and outputs */ + vsi_nn_kernel_node_pack_io( node_params, _NMS_PARAM_NUM, + inputs, input_num, outputs, output_num ); + /* Pass parameters to node. 
*/ + node_params[SCALAR_INPUT_MAX_SIZE] = vsi_nn_kernel_scalar_create( + graph, I32, &max_output_size ); + node_params[SCALAR_INPUT_IOU_THRES] = vsi_nn_kernel_scalar_create( + graph, F32, &iou_threshold ); + node_params[SCALAR_INPUT_SCORE_THRES] = vsi_nn_kernel_scalar_create( + graph, F32, &score_threshold ); + node_params[SCALAR_INPUT_SOFT_NMS_SIGMA] = vsi_nn_kernel_scalar_create( + graph, F32, &soft_nms_sigma ); + status = vsi_nn_kernel_node_pass_param( node, node_params, _NMS_PARAM_NUM ); + + vsi_nn_kernel_scalar_release( &node_params[SCALAR_INPUT_MAX_SIZE] ); + vsi_nn_kernel_scalar_release( &node_params[SCALAR_INPUT_IOU_THRES] ); + vsi_nn_kernel_scalar_release( &node_params[SCALAR_INPUT_SCORE_THRES] ); + vsi_nn_kernel_scalar_release( &node_params[SCALAR_INPUT_SOFT_NMS_SIGMA] ); + } + } + + return node; +} /* _setup() */ + +__END_DECLS + +REGISTER_BACKEND_CPU( nms, _setup ) diff --git a/src/tim/vx/internal/src/kernel/cpu/one_hot_cpu.c b/src/tim/vx/internal/src/kernel/cpu/one_hot_cpu.c new file mode 100644 index 0000000..6a46178 --- /dev/null +++ b/src/tim/vx/internal/src/kernel/cpu/one_hot_cpu.c @@ -0,0 +1,252 @@ +/**************************************************************************** +* +* Copyright (c) 2020 Vivante Corporation +* +* Permission is hereby granted, free of charge, to any person obtaining a +* copy of this software and associated documentation files (the "Software"), +* to deal in the Software without restriction, including without limitation +* the rights to use, copy, modify, merge, publish, distribute, sublicense, +* and/or sell copies of the Software, and to permit persons to whom the +* Software is furnished to do so, subject to the following conditions: +* +* The above copyright notice and this permission notice shall be included in +* all copies or substantial portions of the Software. +* +* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +* AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING +* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER +* DEALINGS IN THE SOFTWARE. +* +*****************************************************************************/ + + +#include +#include +#include +#include "vsi_nn_types.h" +#include "vsi_nn_tensor.h" +#include "vsi_nn_graph.h" +#include "vsi_nn_log.h" +#include "vsi_nn_error.h" +#include "vsi_nn_prv.h" +#include "vsi_nn_tensor_util.h" +#include "utils/vsi_nn_util.h" +#include "kernel/vsi_nn_kernel.h" +#include "libnnext/vx_lib_nnext.h" + +__BEGIN_DECLS + +/* + * Define kernel meta. 
+ */ +#define _INPUT_NUM (1) +#define _OUTPUT_NUM (1) + #define _IO_NUM (_INPUT_NUM + _OUTPUT_NUM) +#define _KERNEL_NAME CVIVANTE_NAMESPACE("cpu.one_hot") + + +/* + * Kernel params + */ +static vx_param_description_t _one_hot_kernel_param_def[] = +{ + {VX_INPUT, VX_TYPE_TENSOR, VX_PARAMETER_STATE_REQUIRED}, + {VX_OUTPUT, VX_TYPE_TENSOR, VX_PARAMETER_STATE_REQUIRED}, + {VX_INPUT, VX_TYPE_SCALAR, VX_PARAMETER_STATE_REQUIRED}, + {VX_INPUT, VX_TYPE_SCALAR, VX_PARAMETER_STATE_REQUIRED}, + {VX_INPUT, VX_TYPE_SCALAR, VX_PARAMETER_STATE_REQUIRED}, + {VX_INPUT, VX_TYPE_SCALAR, VX_PARAMETER_STATE_REQUIRED}, +}; + +#define INPUT_SCALAR_DEPTH (2) +#define INPUT_SCALAR_ON_VALUE (3) +#define INPUT_SCALAR_OFF_VALUE (4) +#define INPUT_SCALAR_AXIS (5) +#define _ONE_HOT_PARAM_NUM _cnt_of_array( _one_hot_kernel_param_def ) + + +/* + * Kernel function + */ +DEF_KERNEL_EXECUTOR(_compute) + ( + vsi_nn_kernel_node_t node, + const vsi_nn_kernel_node_param_t * param, + size_t param_size + ) +{ + vsi_status status = VSI_FAILURE; + vsi_nn_kernel_tensor_t tensors[_IO_NUM] = { NULL }; + float * buffer[_IO_NUM] = { NULL }; + size_t out_elements = 0; + vsi_nn_kernel_tensor_attr_t * attr[_IO_NUM] = { NULL }; + int32_t i = 0; + int32_t j = 0; + int32_t k = 0; + int32_t index = 0; + int32_t depth = 0; + float on_value = 0; + float off_value = 0; + int32_t axis = 0; + int32_t prefix_dim_size = 1; + int32_t suffix_dim_size = 0; + int32_t num_elements = 0; + + tensors[0] = (vsi_nn_kernel_tensor_t)param[0]; + tensors[1] = (vsi_nn_kernel_tensor_t)param[1]; + + attr[0] = vsi_nn_kernel_tensor_attr_create( tensors[0] ); + CHECK_PTR_FAIL_GOTO( attr[0], "Create tensor attr buffer fail.", final ); + attr[1] = vsi_nn_kernel_tensor_attr_create( tensors[1] ); + CHECK_PTR_FAIL_GOTO( attr[1], "Create tensor attr buffer fail.", final ); + + status = vsi_nn_kernel_scalar_read_int32((vsi_nn_kernel_scalar_t)param[2], &depth); + status |= vsi_nn_kernel_scalar_read_float32((vsi_nn_kernel_scalar_t)param[3], &on_value); + status |= vsi_nn_kernel_scalar_read_float32((vsi_nn_kernel_scalar_t)param[4], &off_value); + status |= vsi_nn_kernel_scalar_read_int32((vsi_nn_kernel_scalar_t)param[5], &axis); + CHECK_STATUS_FAIL_GOTO(status, final ); + + num_elements = (int32_t)vsi_nn_kernel_tensor_attr_get_size( attr[0] ); + buffer[0] = (float*)vsi_nn_kernel_tensor_create_buffer( tensors[0], attr[0], TRUE ); + CHECK_PTR_FAIL_GOTO( buffer[0], "Create input buffer fail.", final ); + + out_elements = vsi_nn_kernel_tensor_attr_get_size( attr[1] ); + buffer[1] = (float *)malloc( out_elements * sizeof(float) ); + CHECK_PTR_FAIL_GOTO( buffer[1], "Create output buffer fail.", final ); + memset( buffer[1], 0, out_elements * sizeof(float) ); + + axis = axis == -1 ? (int32_t)attr[0]->shape->size : (int32_t)attr[0]->shape->size - axis; + + for (i = 0; i < axis; i++) + { + prefix_dim_size *= attr[0]->shape->data[i]; + } + + suffix_dim_size = num_elements / prefix_dim_size; + + for (i = 0; i < prefix_dim_size; i++) + { + for (j = 0; j < depth; j++) + { + for (k = 0; k < suffix_dim_size; k++) + { + int32_t value = (int32_t)buffer[0][i * suffix_dim_size + k]; + buffer[1][index ++] = value == j ? 
on_value : off_value; + } + } + } + + status = vsi_nn_kernel_tensor_write_from_float( tensors[1], attr[1], + buffer[1], out_elements ); + CHECK_STATUS_FAIL_GOTO( status, final ); +final: +#define SAFE_FREE_TENSOR_ATTR(_PTR) if ( _PTR ) { vsi_nn_kernel_tensor_attr_release( &_PTR ); _PTR = NULL; } + SAFE_FREE_TENSOR_ATTR(attr[0]); + SAFE_FREE_TENSOR_ATTR(attr[1]); +#undef SAFE_FREE_TENSOR_ATTR + for ( i = 0; i < _IO_NUM; i ++ ) + { + if ( buffer[i] ) + { + free( buffer[i] ); + buffer[i] = NULL; + } + } + + return status; +} /* _compute() */ + + +/* + * Query kernel + */ +static vsi_status _query_kernel + ( + vsi_nn_kernel_t * kernel, + vsi_nn_tensor_t * const * const inputs, + vsi_nn_tensor_t * const * const outputs + /* Add extra params */ + ) +{ + vsi_status status = VSI_FAILURE; + snprintf( kernel->info.name, VX_MAX_KERNEL_NAME, "%s", _KERNEL_NAME ); + kernel->info.function = _compute; + kernel->info.parameters = _one_hot_kernel_param_def; + kernel->info.numParams = _cnt_of_array( _one_hot_kernel_param_def ); + status = VSI_SUCCESS; + + return status; +} /* _query_kernel() */ + + +static vsi_nn_kernel_node_t _setup + ( + vsi_nn_graph_t * graph, + vsi_nn_tensor_t ** inputs, + size_t input_num, + vsi_nn_tensor_t ** outputs, + size_t output_num, + const vsi_nn_kernel_param_t * params, + vsi_nn_kernel_t * kernel + ) +{ + vsi_status status = VSI_FAILURE; + vsi_nn_kernel_node_param_t node_params[_ONE_HOT_PARAM_NUM] = {NULL}; + vsi_nn_kernel_node_t node = NULL; + int32_t depth = vsi_nn_kernel_param_get_int32( params, "depth" ); + float on_value = vsi_nn_kernel_param_get_float32( params, "on_value" ); + float off_value = vsi_nn_kernel_param_get_float32( params, "off_value" ); + int32_t axis = vsi_nn_kernel_param_get_int32( params, "axis" ); + + status = _query_kernel( kernel, inputs, outputs /* Add extra params */ ); + if ( VSI_SUCCESS == status) + { + node = vsi_nn_kernel_create_node( graph, kernel ); + if ( node ) + { + /* Set inputs and outputs */ + vsi_nn_kernel_node_pack_io( node_params, _ONE_HOT_PARAM_NUM, + inputs, input_num, outputs, output_num ); + node_params[INPUT_SCALAR_DEPTH] = vsi_nn_kernel_scalar_create( + graph, I32, &depth ); + node_params[INPUT_SCALAR_ON_VALUE] = vsi_nn_kernel_scalar_create( + graph, F32, &on_value ); + node_params[INPUT_SCALAR_OFF_VALUE] = vsi_nn_kernel_scalar_create( + graph, F32, &off_value ); + node_params[INPUT_SCALAR_AXIS] = vsi_nn_kernel_scalar_create( + graph, I32, &axis ); + /* Pass parameters to node. 
*/ + status = vsi_nn_kernel_node_pass_param( node, node_params, _ONE_HOT_PARAM_NUM ); + CHECK_STATUS_FAIL_GOTO( status, OnError ); + } + } +OnError: + if (node_params[INPUT_SCALAR_DEPTH]) + { + vsi_nn_kernel_scalar_release( &node_params[INPUT_SCALAR_DEPTH] ); + } + + if (node_params[INPUT_SCALAR_ON_VALUE]) + { + vsi_nn_kernel_scalar_release( &node_params[INPUT_SCALAR_ON_VALUE] ); + } + + if (node_params[INPUT_SCALAR_OFF_VALUE]) + { + vsi_nn_kernel_scalar_release( &node_params[INPUT_SCALAR_OFF_VALUE] ); + } + + if (node_params[INPUT_SCALAR_AXIS]) + { + vsi_nn_kernel_scalar_release( &node_params[INPUT_SCALAR_AXIS] ); + } + + return node; +} /* _setup() */ + +__END_DECLS + +REGISTER_BACKEND_CPU( one_hot, _setup ) diff --git a/src/tim/vx/internal/src/kernel/cpu/pow_cpu.c b/src/tim/vx/internal/src/kernel/cpu/pow_cpu.c index 1f7c2eb..902d40e 100644 --- a/src/tim/vx/internal/src/kernel/cpu/pow_cpu.c +++ b/src/tim/vx/internal/src/kernel/cpu/pow_cpu.c @@ -32,7 +32,7 @@ #include "vsi_nn_prv.h" #include "vsi_nn_error.h" #include "kernel/vsi_nn_kernel.h" -#include "client/vsi_nn_vxkernel.h" +#include "libnnext/vsi_nn_vxkernel.h" __BEGIN_DECLS diff --git a/src/tim/vx/internal/src/kernel/cpu/pre_process_bgra_cpu.c b/src/tim/vx/internal/src/kernel/cpu/pre_process_bgra_cpu.c index 3be8fc9..d31f2fc 100644 --- a/src/tim/vx/internal/src/kernel/cpu/pre_process_bgra_cpu.c +++ b/src/tim/vx/internal/src/kernel/cpu/pre_process_bgra_cpu.c @@ -32,7 +32,7 @@ #include "vsi_nn_prv.h" #include "vsi_nn_error.h" #include "kernel/vsi_nn_kernel.h" -#include "client/vsi_nn_vxkernel.h" +#include "libnnext/vsi_nn_vxkernel.h" __BEGIN_DECLS diff --git a/src/tim/vx/internal/src/kernel/cpu/pre_process_gray_cpu.c b/src/tim/vx/internal/src/kernel/cpu/pre_process_gray_cpu.c index 644add0..c615f68 100644 --- a/src/tim/vx/internal/src/kernel/cpu/pre_process_gray_cpu.c +++ b/src/tim/vx/internal/src/kernel/cpu/pre_process_gray_cpu.c @@ -32,7 +32,7 @@ #include "vsi_nn_prv.h" #include "vsi_nn_error.h" #include "kernel/vsi_nn_kernel.h" -#include "client/vsi_nn_vxkernel.h" +#include "libnnext/vsi_nn_vxkernel.h" __BEGIN_DECLS diff --git a/src/tim/vx/internal/src/kernel/cpu/pre_process_nv12_cpu.c b/src/tim/vx/internal/src/kernel/cpu/pre_process_nv12_cpu.c index 2417d0e..1e4d48d 100644 --- a/src/tim/vx/internal/src/kernel/cpu/pre_process_nv12_cpu.c +++ b/src/tim/vx/internal/src/kernel/cpu/pre_process_nv12_cpu.c @@ -32,7 +32,7 @@ #include "vsi_nn_prv.h" #include "vsi_nn_error.h" #include "kernel/vsi_nn_kernel.h" -#include "client/vsi_nn_vxkernel.h" +#include "libnnext/vsi_nn_vxkernel.h" __BEGIN_DECLS diff --git a/src/tim/vx/internal/src/kernel/cpu/pre_process_rgb_cpu.c b/src/tim/vx/internal/src/kernel/cpu/pre_process_rgb_cpu.c index 2be7273..972172f 100644 --- a/src/tim/vx/internal/src/kernel/cpu/pre_process_rgb_cpu.c +++ b/src/tim/vx/internal/src/kernel/cpu/pre_process_rgb_cpu.c @@ -32,7 +32,7 @@ #include "vsi_nn_prv.h" #include "vsi_nn_error.h" #include "kernel/vsi_nn_kernel.h" -#include "client/vsi_nn_vxkernel.h" +#include "libnnext/vsi_nn_vxkernel.h" __BEGIN_DECLS diff --git a/src/tim/vx/internal/src/kernel/cpu/pre_process_yuv420_cpu.c b/src/tim/vx/internal/src/kernel/cpu/pre_process_yuv420_cpu.c index 6749f29..8132778 100644 --- a/src/tim/vx/internal/src/kernel/cpu/pre_process_yuv420_cpu.c +++ b/src/tim/vx/internal/src/kernel/cpu/pre_process_yuv420_cpu.c @@ -32,7 +32,7 @@ #include "vsi_nn_prv.h" #include "vsi_nn_error.h" #include "kernel/vsi_nn_kernel.h" -#include "client/vsi_nn_vxkernel.h" +#include "libnnext/vsi_nn_vxkernel.h" 
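For reference, the expansion performed by the cpu.one_hot executor above can be read as a small standalone routine. The sketch below is illustrative only and is not part of this patch: the helper name one_hot_expand_f32 and its flat-array interface are assumptions made for the example. It mirrors the kernel's traversal, viewing the input as [prefix_dim_size x suffix_dim_size] and emitting [prefix_dim_size x depth x suffix_dim_size].

#include <stdlib.h>

/* Illustrative sketch, not part of this patch: expand integer class indices
 * into a one-hot tensor the same way the cpu.one_hot executor does.
 * Returns a malloc'd buffer of prefix_dim_size * depth * suffix_dim_size
 * floats, or NULL on allocation failure. */
static float * one_hot_expand_f32
    (
    const float * indices,   /* flattened input, prefix_dim_size * suffix_dim_size values */
    int prefix_dim_size,
    int suffix_dim_size,
    int depth,
    float on_value,
    float off_value
    )
{
    int i, j, k, out_idx = 0;
    size_t count = (size_t)prefix_dim_size * depth * suffix_dim_size;
    float * out = (float *)malloc( count * sizeof(float) );
    if ( NULL == out )
    {
        return NULL;
    }
    for ( i = 0; i < prefix_dim_size; i++ )
    {
        for ( j = 0; j < depth; j++ )
        {
            for ( k = 0; k < suffix_dim_size; k++ )
            {
                int value = (int)indices[i * suffix_dim_size + k];
                out[out_idx++] = ( value == j ) ? on_value : off_value;
            }
        }
    }
    return out;
}

For example, indices {0, 2} with prefix_dim_size 2, suffix_dim_size 1, depth 3, on_value 1 and off_value 0 produce {1, 0, 0, 0, 0, 1}.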
__BEGIN_DECLS diff --git a/src/tim/vx/internal/src/kernel/cpu/pre_process_yuv444_cpu.c b/src/tim/vx/internal/src/kernel/cpu/pre_process_yuv444_cpu.c index 6894957..a19e5ae 100644 --- a/src/tim/vx/internal/src/kernel/cpu/pre_process_yuv444_cpu.c +++ b/src/tim/vx/internal/src/kernel/cpu/pre_process_yuv444_cpu.c @@ -32,7 +32,7 @@ #include "vsi_nn_prv.h" #include "vsi_nn_error.h" #include "kernel/vsi_nn_kernel.h" -#include "client/vsi_nn_vxkernel.h" +#include "libnnext/vsi_nn_vxkernel.h" __BEGIN_DECLS diff --git a/src/tim/vx/internal/src/kernel/cpu/prelu_cpu.c b/src/tim/vx/internal/src/kernel/cpu/prelu_cpu.c index fa433dc..b7e97c2 100644 --- a/src/tim/vx/internal/src/kernel/cpu/prelu_cpu.c +++ b/src/tim/vx/internal/src/kernel/cpu/prelu_cpu.c @@ -32,7 +32,7 @@ #include "vsi_nn_prv.h" #include "vsi_nn_error.h" #include "kernel/vsi_nn_kernel.h" -#include "client/vsi_nn_vxkernel.h" +#include "libnnext/vsi_nn_vxkernel.h" __BEGIN_DECLS diff --git a/src/tim/vx/internal/src/kernel/cpu/random_multinomial_cpu.c b/src/tim/vx/internal/src/kernel/cpu/random_multinomial_cpu.c index 3b21033..15d1b51 100644 --- a/src/tim/vx/internal/src/kernel/cpu/random_multinomial_cpu.c +++ b/src/tim/vx/internal/src/kernel/cpu/random_multinomial_cpu.c @@ -38,7 +38,7 @@ #include "utils/vsi_nn_dtype_util.h" #include "kernel/vsi_nn_kernel.h" #include "kernel/vsi_nn_kernel_eltwise.h" -#include "client/vsi_nn_vxkernel.h" +#include "libnnext/vsi_nn_vxkernel.h" __BEGIN_DECLS diff --git a/src/tim/vx/internal/src/kernel/cpu/repeat_cpu.c b/src/tim/vx/internal/src/kernel/cpu/repeat_cpu.c new file mode 100644 index 0000000..ceb1684 --- /dev/null +++ b/src/tim/vx/internal/src/kernel/cpu/repeat_cpu.c @@ -0,0 +1,286 @@ +/**************************************************************************** +* +* Copyright (c) 2019 Vivante Corporation +* +* Permission is hereby granted, free of charge, to any person obtaining a +* copy of this software and associated documentation files (the "Software"), +* to deal in the Software without restriction, including without limitation +* the rights to use, copy, modify, merge, publish, distribute, sublicense, +* and/or sell copies of the Software, and to permit persons to whom the +* Software is furnished to do so, subject to the following conditions: +* +* The above copyright notice and this permission notice shall be included in +* all copies or substantial portions of the Software. +* +* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +* AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING +* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER +* DEALINGS IN THE SOFTWARE. +* +*****************************************************************************/ + + +#include +#include +#include +#include "vsi_nn_types.h" +#include "vsi_nn_tensor.h" +#include "vsi_nn_graph.h" +#include "vsi_nn_log.h" +#include "vsi_nn_prv.h" +#include "vsi_nn_tensor_util.h" +#include "vsi_nn_error.h" +#include "utils/vsi_nn_util.h" +#include "kernel/vsi_nn_kernel.h" +#include "libnnext/vsi_nn_vxkernel.h" + +__BEGIN_DECLS + +/* + * Define kernel meta. 
+ */ +#define _CPU_ARG_NUM (1) +#define _CPU_INPUT_NUM (2) +#define _CPU_OUTPUT_NUM (1) +#define _CPU_IO_NUM (_CPU_INPUT_NUM + _CPU_OUTPUT_NUM) +#define _CPU_PARAM_NUM (_CPU_ARG_NUM + _CPU_IO_NUM) +#define _KERNEL_NAME CVIVANTE_NAMESPACE("cpu.repeat") + +DEF_KERNEL_EXECUTOR(_repeat_exec) + ( + vsi_nn_kernel_node_t node, + const vsi_nn_kernel_node_param_t * param, + size_t param_size + ) +{ + vsi_status status = VSI_FAILURE; + vsi_nn_kernel_tensor_t tensors[_CPU_IO_NUM] = { NULL }; + float * buffer[_CPU_IO_NUM] = { NULL }; + size_t out_elements = 0; + vsi_nn_kernel_tensor_attr_t * attr[_CPU_IO_NUM] = { NULL }; + int32_t i = 0, j = 0, b = 0, c = 0; + int32_t axis = 0; + int32_t outerSize = 1; + int32_t outIdx = 0; + int32_t width = 0, height = 0, channel = 0, batch = 0; + int32_t spatial = 0, vol = 0; + + tensors[0] = (vsi_nn_kernel_tensor_t)param[0]; + tensors[1] = (vsi_nn_kernel_tensor_t)param[1]; + tensors[2] = (vsi_nn_kernel_tensor_t)param[2]; + + attr[0] = vsi_nn_kernel_tensor_attr_create( tensors[0] ); + CHECK_PTR_FAIL_GOTO( attr[0], "Create tensor attr buffer fail.", final ); + attr[1] = vsi_nn_kernel_tensor_attr_create( tensors[1] ); + CHECK_PTR_FAIL_GOTO( attr[1], "Create tensor attr buffer fail.", final ); + attr[2] = vsi_nn_kernel_tensor_attr_create( tensors[2] ); + CHECK_PTR_FAIL_GOTO( attr[2], "Create tensor attr buffer fail.", final ); + + out_elements = vsi_nn_kernel_tensor_attr_get_size( attr[2] ); + + status = vsi_nn_kernel_scalar_read_int32((vsi_nn_kernel_scalar_t)param[3], &axis); + CHECK_STATUS_FAIL_GOTO(status, final ); + + buffer[0] = (float*)vsi_nn_kernel_tensor_create_buffer( tensors[0], attr[0], TRUE ); + CHECK_PTR_FAIL_GOTO( buffer[0], "Create input0 buffer fail.", final ); + + buffer[1] = (float*)vsi_nn_kernel_tensor_create_buffer( tensors[1], attr[1], TRUE ); + CHECK_PTR_FAIL_GOTO( buffer[1], "Create input0 buffer fail.", final ); + + buffer[2] = (float *)malloc( out_elements * sizeof(float) ); + CHECK_PTR_FAIL_GOTO( buffer[2], "Create output buffer fail.", final ); + memset( buffer[2], 0, out_elements * sizeof(float) ); + + width = attr[0]->shape->data[0]; + height = attr[0]->shape->data[1]; + channel = attr[0]->shape->size > 2 ? attr[0]->shape->data[2] : 1; + batch = attr[0]->shape->size > 3 ? 
attr[0]->shape->data[3] : 1; + spatial = width * height; + vol = spatial * channel; + + for(i = 1; i < (int32_t)attr[0]->shape->size; i++) + { + outerSize *= attr[0]->shape->data[i]; + } + + if (axis == 0 && outerSize == 1) + { + for(i = 0; i < width; i++) + { + float data = buffer[0][i]; + int32_t len = (int32_t)buffer[1][i]; + for(j = 0; j < len; j++) + { + buffer[2][outIdx] = data; + } + } + } + else if (axis == 0) + { + for(b = 0; b < batch; b++) + { + for(c = 0; c < channel; c++) + { + for(i = 0; i < height; i++) + { + int32_t len = (int32_t)buffer[1][i]; + int32_t offset = i * width + c * spatial + b * vol; + for(j = 0; j < len; j++) + { + memcpy(buffer[2] + outIdx, buffer[0] + offset, sizeof(float) * width); + outIdx += width; + } + } + } + } + } + else if (axis == 1) + { + for(b = 0; b < batch; b++) + { + for(c = 0; c < channel; c++) + { + for(i = 0; i < height; i++) + { + int32_t offset = i * width + c * spatial + b * vol; + for(j = 0; j < width; j++) + { + int32_t len = (int32_t)buffer[1][j]; + float data = buffer[0][offset + j]; + int32_t k = 0; + for(k = 0; k < len; k++) + { + buffer[2][outIdx++] = data; + } + } + } + } + } + } + else if (axis == 2) + { + for(b = 0; b < batch; b++) + { + for(c = 0; c < channel; c++) + { + int32_t len = (int32_t)buffer[1][c]; + int32_t offset = c * spatial + b * vol; + + for(j = 0; j < len; j++) + { + memcpy(buffer[2] + outIdx, buffer[0] + offset, sizeof(float) * spatial); + outIdx += spatial; + } + } + } + } + else + { + VSILOGE("axis is not support"); + status = VSI_FAILURE; + goto final; + } + + status = vsi_nn_kernel_tensor_write_from_float( tensors[2], attr[2], + buffer[2], out_elements ); + CHECK_STATUS_FAIL_GOTO( status, final ); + +final: + for( i = 0; i < _CPU_IO_NUM; i ++ ) + { + if( buffer[i] ) + { + free( buffer[i] ); + } + } + for( i = 0; i < _CPU_IO_NUM; i ++ ) + { + if(attr[i]) { vsi_nn_kernel_tensor_attr_release( &attr[i] ); } + } + return status; +} /* _repeat_exec() */ +/* + * Kernel params + */ +static vx_param_description_t _repeat_kernel_param_def[] = +{ + {VX_INPUT, VX_TYPE_TENSOR, VX_PARAMETER_STATE_REQUIRED}, + {VX_INPUT, VX_TYPE_TENSOR, VX_PARAMETER_STATE_REQUIRED}, + {VX_OUTPUT, VX_TYPE_TENSOR, VX_PARAMETER_STATE_REQUIRED}, + {VX_INPUT, VX_TYPE_SCALAR, VX_PARAMETER_STATE_REQUIRED}, + // Add kererl parameters here +}; +#define _REPEAT_PARAM_NUM _cnt_of_array( _repeat_kernel_param_def ) + +static const vx_kernel_description_t _kernel_info = +{ + KERNEL_ID_PLACEHOLDER, + _KERNEL_NAME, + _repeat_exec, + _repeat_kernel_param_def, + _cnt_of_array( _repeat_kernel_param_def ), + vsi_nn_KernelValidator, + NULL, + NULL, + vsi_nn_KernelInitializer, + vsi_nn_KernelDeinitializer +}; + +static vsi_status _query_kernel + ( + vsi_nn_tensor_t* const* const inputs, + vsi_nn_tensor_t* const* const outputs, + vsi_nn_kernel_t* kernel + ) +{ + memmove( &kernel->info, &_kernel_info, sizeof(vx_kernel_description_t) ); + return VSI_SUCCESS; +} /* _query_kernel() */ + +static vsi_nn_kernel_node_t _setup + ( + vsi_nn_graph_t * graph, + vsi_nn_tensor_t ** inputs, + size_t input_num, + vsi_nn_tensor_t ** outputs, + size_t output_num, + const vsi_nn_kernel_param_t * params, + vsi_nn_kernel_t * kernel + ) +{ + vsi_status status = VSI_FAILURE; + vsi_nn_kernel_node_param_t backend_params[_CPU_PARAM_NUM] = {NULL}; + vsi_nn_kernel_node_t node = NULL; + int32_t axis = vsi_nn_kernel_param_get_int32( params, "axis" ); + + status = _query_kernel( inputs, outputs, kernel ); + if( VSI_SUCCESS == status) + { + node = vsi_nn_kernel_create_node( graph, kernel ); 
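As an aside, the axis == 1 branch of the repeat executor above reduces to a per-row expansion along the innermost dimension. The sketch below is illustrative only and not part of this patch; repeat_row_f32 is a hypothetical helper whose flat-array interface is assumed for the example, and the caller is expected to size the output to the sum of the repeat counts.

/* Illustrative sketch, not part of this patch: repeat each element of one row
 * according to its per-element count, as the axis == 1 branch does.
 * Returns the number of values written to 'out'. */
static int repeat_row_f32
    (
    const float * row,      /* one row of the value tensor, 'width' elements */
    const float * repeats,  /* per-element repeat counts, stored as float */
    int width,
    float * out             /* sized to the sum of the repeat counts */
    )
{
    int j, k, out_idx = 0;
    for ( j = 0; j < width; j++ )
    {
        int len = (int)repeats[j];
        for ( k = 0; k < len; k++ )
        {
            out[out_idx++] = row[j];
        }
    }
    return out_idx;
}

For example, row {1, 2, 3} with repeats {2, 0, 3} writes {1, 1, 3, 3, 3} and returns 5.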
+ if( node ) + { + /* Set inputs and outputs */ + vsi_nn_kernel_node_pack_io( backend_params, _CPU_PARAM_NUM, + inputs, _CPU_INPUT_NUM, outputs, _CPU_OUTPUT_NUM ); + backend_params[3] = vsi_nn_kernel_scalar_create( graph, I32, &axis ); + + /* Pass parameters to node. */ + status = vsi_nn_kernel_node_pass_param( node, backend_params, _CPU_PARAM_NUM ); + CHECK_STATUS( status ); + vsi_nn_kernel_scalar_release( &backend_params[3] ); + } + else + { + status = VSI_FAILURE; + } + } + return node; +} /* _setup() */ + +__END_DECLS + +REGISTER_BACKEND_CPU( repeat, _setup ) + diff --git a/src/tim/vx/internal/src/kernel/cpu/scatter_nd_cpu.c b/src/tim/vx/internal/src/kernel/cpu/scatter_nd_cpu.c index 1369867..62c7ff0 100644 --- a/src/tim/vx/internal/src/kernel/cpu/scatter_nd_cpu.c +++ b/src/tim/vx/internal/src/kernel/cpu/scatter_nd_cpu.c @@ -35,7 +35,7 @@ #include "vsi_nn_error.h" #include "utils/vsi_nn_util.h" #include "kernel/vsi_nn_kernel.h" -#include "client/vsi_nn_vxkernel.h" +#include "libnnext/vsi_nn_vxkernel.h" __BEGIN_DECLS diff --git a/src/tim/vx/internal/src/kernel/cpu/sequence_mask_cpu.c b/src/tim/vx/internal/src/kernel/cpu/sequence_mask_cpu.c new file mode 100644 index 0000000..9790537 --- /dev/null +++ b/src/tim/vx/internal/src/kernel/cpu/sequence_mask_cpu.c @@ -0,0 +1,248 @@ +/**************************************************************************** +* +* Copyright (c) 2020 Vivante Corporation +* +* Permission is hereby granted, free of charge, to any person obtaining a +* copy of this software and associated documentation files (the "Software"), +* to deal in the Software without restriction, including without limitation +* the rights to use, copy, modify, merge, publish, distribute, sublicense, +* and/or sell copies of the Software, and to permit persons to whom the +* Software is furnished to do so, subject to the following conditions: +* +* The above copyright notice and this permission notice shall be included in +* all copies or substantial portions of the Software. +* +* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +* AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING +* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER +* DEALINGS IN THE SOFTWARE. 
+* +*****************************************************************************/ + +#include +#include +#include +#include "vsi_nn_types.h" +#include "vsi_nn_tensor.h" +#include "vsi_nn_graph.h" +#include "vsi_nn_log.h" +#include "vsi_nn_prv.h" +#include "vsi_nn_error.h" +#include "vsi_nn_tensor_util.h" +#include "utils/vsi_nn_util.h" +#include "utils/vsi_nn_dtype_util.h" +#include "kernel/vsi_nn_kernel.h" +#include "kernel/vsi_nn_kernel_eltwise.h" +#include "libnnext/vsi_nn_vxkernel.h" + +__BEGIN_DECLS + +#define _CPU_ARG_NUM (1) +#define _CPU_INPUT_NUM (1) +#define _CPU_OUTPUT_NUM (1) +#define _CPU_IO_NUM (_CPU_INPUT_NUM + _CPU_OUTPUT_NUM) +#define _CPU_PARAM_NUM (_CPU_ARG_NUM + _CPU_IO_NUM) +#define _KERNEL_NAME CVIVANTE_NAMESPACE("sequence_mask_sw") + +DEF_KERNEL_EXECUTOR(_sequence_mask_exec) + ( + vsi_nn_kernel_node_t node, + const vsi_nn_kernel_node_param_t * param, + size_t param_size + ) +{ + vsi_status status = VX_SUCCESS; + vsi_nn_kernel_tensor_t tensors[_CPU_IO_NUM] = { NULL }; + float * buffer_in = NULL; + float * buffer = NULL; + size_t out_elements = 0; + vsi_nn_kernel_tensor_attr_t * attr[_CPU_IO_NUM] = { NULL }; + uint32_t i = 0; + + tensors[0] = (vsi_nn_kernel_tensor_t)param[0]; + tensors[1] = (vsi_nn_kernel_tensor_t)param[1]; + + attr[0] = vsi_nn_kernel_tensor_attr_create( tensors[0] ); + attr[1] = vsi_nn_kernel_tensor_attr_create( tensors[1] ); + + out_elements = vsi_nn_kernel_tensor_attr_get_size( attr[1] ); + + buffer_in = (float*)vsi_nn_kernel_tensor_create_buffer( tensors[0], attr[0], TRUE ); + CHECK_PTR_FAIL_GOTO( buffer_in, "Create input0 buffer fail.", final ); + + buffer = (float *)malloc( out_elements * sizeof(float) ); + CHECK_PTR_FAIL_GOTO( buffer, "Create output buffer fail.", final ); + memset( buffer, 0, out_elements * sizeof(float) ); + + { + uint32_t j = 0; + uint32_t height = attr[1]->shape->data[1]; + uint32_t width = attr[1]->shape->data[0]; + + for(j = 0; j < height; j++) + { + uint32_t idx_in = (uint32_t)buffer_in[j]; + uint32_t out_offset = j * width; + idx_in = idx_in > width ? 
width : idx_in; + for(i = 0; i < idx_in; i++) + { + buffer[out_offset + i] = 1; + } + } + } + + status = vsi_nn_kernel_tensor_write_from_float( tensors[1], attr[1], + buffer, out_elements ); + CHECK_STATUS_FAIL_GOTO( status, final ); + +final: + if (buffer_in) + { + free( buffer_in ); + } + if (buffer) + { + free( buffer ); + } + for( i = 0; i < _CPU_IO_NUM; i ++ ) + { + vsi_nn_kernel_tensor_attr_release( &attr[i] ); + } + return status; +} /* _sequence_mask_exec() */ + +static vx_param_description_t kernel_param_def[] = +{ + {VX_INPUT, VX_TYPE_TENSOR, VX_PARAMETER_STATE_REQUIRED}, + {VX_OUTPUT, VX_TYPE_TENSOR, VX_PARAMETER_STATE_REQUIRED}, + {VX_INPUT, VX_TYPE_SCALAR, VX_PARAMETER_STATE_REQUIRED}, +}; + + +static const vx_kernel_description_t _kernel_info = +{ + KERNEL_ID_PLACEHOLDER, + _KERNEL_NAME, + _sequence_mask_exec, + kernel_param_def, + _cnt_of_array( kernel_param_def ), + vsi_nn_KernelValidator, + NULL, + NULL, + vsi_nn_KernelInitializer, + vsi_nn_KernelDeinitializer +}; + +static vsi_status _query_kernel + ( + vsi_nn_tensor_t* const* const inputs, + vsi_nn_tensor_t* const* const outputs, + vsi_nn_kernel_t* kernel + ) +{ + memmove( &kernel->info, &_kernel_info, sizeof(vx_kernel_description_t) ); + return VSI_SUCCESS; +} /* _query_kernel() */ + +static int32_t _optimize_mask_shape + ( + vsi_nn_tensor_t ** inputs, + vsi_nn_tensor_t ** outputs, + int32_t max_len, + int32_t* opt_shape_in, + int32_t* opt_shape_out + ) +{ + vsi_status status = VSI_SUCCESS; + int32_t out_size = 1; + uint32_t i = 0; + opt_shape_in[0] = 1; + opt_shape_in[1] = 1; + for(i = 0; i < inputs[0]->attr.dim_num; i++) + { + opt_shape_in[0] *= inputs[0]->attr.size[i]; + } + + for(i = 0; i < outputs[0]->attr.dim_num; i++) + { + out_size *= outputs[0]->attr.size[i]; + } + + opt_shape_out[0] = max_len; + opt_shape_out[1] = out_size / max_len; + + if (out_size % max_len != 0) + { + return VSI_FAILURE; + } + + return status; +} + +static vsi_nn_kernel_node_t _setup + ( + vsi_nn_graph_t * graph, + vsi_nn_tensor_t ** inputs, + size_t input_num, + vsi_nn_tensor_t ** outputs, + size_t output_num, + const vsi_nn_kernel_param_t * params, + vsi_nn_kernel_t * kernel + ) +{ + vsi_status status = VSI_SUCCESS; + vsi_nn_kernel_node_param_t backend_params[_CPU_PARAM_NUM] = {NULL}; + vsi_nn_kernel_node_t node = NULL; + vsi_nn_kernel_tensor_t rs_input = NULL, rs_output = NULL; + int32_t new_shape[2][VSI_NN_MAX_DIM_NUM] = {{ 1, 1, 1, 1 }}; + int32_t max_len = vsi_nn_kernel_param_get_int32( params, "max_len" ); + + status = _optimize_mask_shape(inputs, outputs, max_len, new_shape[0], new_shape[1]); + if ( VSI_SUCCESS != status ) + { + goto final; + } + rs_input = vsi_nn_kernel_tensor_reshape(inputs[0]->t, new_shape[0], 2); + rs_output = vsi_nn_kernel_tensor_reshape(outputs[0]->t, new_shape[1], 2); + + status = _query_kernel( inputs, outputs, kernel ); + if ( VSI_SUCCESS == status) + { + node = vsi_nn_kernel_create_node( graph, kernel ); + if ( node ) + { + /* Set inputs and outputs */ + uint32_t index = 0; + /* Pass parameters to node. */ + backend_params[index++] = rs_input; + backend_params[index++] = rs_output; + backend_params[index++] = vsi_nn_kernel_scalar_create( graph, I32, &max_len ); + /* Pass parameters to node. 
*/ + status = vsi_nn_kernel_node_pass_param( node, backend_params, _CPU_PARAM_NUM ); + CHECK_STATUS(status); + vsi_nn_kernel_scalar_release( &backend_params[2] ); + } + else + { + status = VSI_FAILURE; + } + } +final: + if (rs_input) + { + vsi_nn_kernel_tensor_release( &rs_input ); + } + if (rs_output) + { + vsi_nn_kernel_tensor_release( &rs_output ); + } + return node; +} /* _setup() */ + +__END_DECLS + +REGISTER_BACKEND_CPU( sequence_mask, _setup ) + diff --git a/src/tim/vx/internal/src/kernel/cpu/slice_cpu.c b/src/tim/vx/internal/src/kernel/cpu/slice_cpu.c new file mode 100644 index 0000000..8307152 --- /dev/null +++ b/src/tim/vx/internal/src/kernel/cpu/slice_cpu.c @@ -0,0 +1,246 @@ +/**************************************************************************** +* +* Copyright (c) 2020 Vivante Corporation +* +* Permission is hereby granted, free of charge, to any person obtaining a +* copy of this software and associated documentation files (the "Software"), +* to deal in the Software without restriction, including without limitation +* the rights to use, copy, modify, merge, publish, distribute, sublicense, +* and/or sell copies of the Software, and to permit persons to whom the +* Software is furnished to do so, subject to the following conditions: +* +* The above copyright notice and this permission notice shall be included in +* all copies or substantial portions of the Software. +* +* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +* AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING +* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER +* DEALINGS IN THE SOFTWARE. +* +*****************************************************************************/ + + +#include +#include +#include +#include "vsi_nn_types.h" +#include "vsi_nn_tensor.h" +#include "vsi_nn_graph.h" +#include "vsi_nn_log.h" +#include "vsi_nn_error.h" +#include "vsi_nn_prv.h" +#include "vsi_nn_tensor_util.h" +#include "utils/vsi_nn_util.h" +#include "kernel/vsi_nn_kernel.h" +#include "libnnext/vx_lib_nnext.h" + +__BEGIN_DECLS + + /* + * Define kernel meta. 
+ */ +#define _INPUT_NUM (2) +#define _OUTPUT_NUM (1) +#define _KERNEL_NAME CVIVANTE_NAMESPACE("cpu.slice") + + + /* + * Kernel params + */ + static vx_param_description_t _slice_kernel_param_def[] = +{ + {VX_INPUT, VX_TYPE_TENSOR, VX_PARAMETER_STATE_REQUIRED}, + {VX_INPUT, VX_TYPE_TENSOR, VX_PARAMETER_STATE_REQUIRED}, + {VX_OUTPUT, VX_TYPE_TENSOR, VX_PARAMETER_STATE_REQUIRED}, + // Add kererl parameters here +}; +#define _SLICE_PARAM_NUM _cnt_of_array( _slice_kernel_param_def ) + + +/* +* Kernel function +*/ +DEF_KERNEL_EXECUTOR(_compute) + ( + vsi_nn_kernel_node_t node, + const vsi_nn_kernel_node_param_t * param, + size_t param_size + ) +{ + vsi_status status = VSI_FAILURE; + vsi_nn_kernel_tensor_t input[_INPUT_NUM] = {NULL}; + vsi_nn_kernel_tensor_t output[_OUTPUT_NUM] = {NULL}; + float *f32_in_buffer[_INPUT_NUM] = {NULL}; + float *f32_out_buffer[_OUTPUT_NUM] = {NULL}; + vsi_nn_kernel_tensor_attr_t *in_attr[_INPUT_NUM]; + vsi_nn_kernel_tensor_attr_t *out_attr[_OUTPUT_NUM]; + size_t out_stride_size[_OUTPUT_NUM][VSI_NN_MAX_DIM_NUM] = {{1}}; + size_t out_elements[_OUTPUT_NUM] = {0}; + size_t out_bytes[_OUTPUT_NUM] = {0}; + int32_t rank = 0; + int32_t i = 0; + int32_t in_w = 0; + int32_t in_h = 0; + int32_t in_c = 0; + int32_t in_b = 0; + int32_t start[4] = {0}; + int32_t stop[4] = {0}; + int32_t in_size[4] = {1, 1, 1, 1}; + int32_t out_size[4] = {1, 1, 1, 1}; + float *input_ptr = NULL; + float *output_ptr = NULL; + int32_t dstIdx = 0; + + /* prepare data */ + for (i = 0; i < _INPUT_NUM; i ++) + { + input[i] = (vsi_nn_kernel_tensor_t)param[i]; + in_attr[i] = vsi_nn_kernel_tensor_attr_create( input[i] ); + f32_in_buffer[i] = (float*)vsi_nn_kernel_tensor_create_buffer( input[i], in_attr[i], TRUE ); + CHECK_PTR_FAIL_GOTO( f32_in_buffer[i], "Create input0 buffer fail.", final ); + } + + for (i = 0; i < _OUTPUT_NUM; i ++) + { + output[i] = (vsi_nn_kernel_tensor_t)param[i + _INPUT_NUM]; + out_attr[i] = vsi_nn_kernel_tensor_attr_create( output[i] ); + vsi_nn_kernel_tensor_attr_get_stride( out_attr[i], out_stride_size[i] ); + out_elements[i] = vsi_nn_kernel_tensor_attr_get_size( out_attr[i] ); + out_bytes[i] = out_elements[i] * sizeof(float); + f32_out_buffer[i] = (float *)malloc( out_bytes[i] ); + CHECK_PTR_FAIL_GOTO( f32_out_buffer[i], "Create output buffer fail.", final ); + memset( f32_out_buffer[i], 0, out_bytes[i] ); + } + + rank = (int32_t)out_attr[0]->shape->size; + + for (i = 0; i < rank; i++) + { + in_size[i] = in_attr[0]->shape->data[i]; + out_size[i] = out_attr[0]->shape->data[i]; + } + + start[0] = (int32_t)f32_in_buffer[1][0]; + stop[0] = start[0] + out_attr[0]->shape->data[0]; + start[1] = rank < 2 ? 0 : (int32_t)f32_in_buffer[1][1]; + stop[1] = rank < 2 ? 1 : start[1] + out_size[1]; + start[2] = rank < 3 ? 0 : (int32_t)f32_in_buffer[1][2]; + stop[2] = rank < 3 ? 1 : start[2] + out_size[2]; + start[3] = rank < 4 ? 0 : (int32_t)f32_in_buffer[1][3]; + stop[3] = rank < 4 ? 
1 : start[3] + out_size[3]; + input_ptr = f32_in_buffer[0]; + output_ptr = f32_out_buffer[0]; + + for (in_b = start[3]; in_b < stop[3]; ++in_b) + { + for (in_c = start[2]; in_c < stop[2]; ++in_c) + { + for (in_h = start[1]; in_h < stop[1]; ++in_h) + { + for (in_w = start[0]; in_w < stop[0]; ++in_w) + { + int32_t srcIdx = ((in_b * in_size[2] + in_c) * in_size[1] + in_h) * in_size[0] + in_w; + output_ptr[dstIdx ++] = input_ptr[srcIdx]; + } + } + } + } + + /* save data */ + for(i = 0; i < _OUTPUT_NUM; i++) + { + status = vsi_nn_kernel_tensor_write_from_float( output[i], out_attr[i], + f32_out_buffer[i], out_elements[i] ); + CHECK_STATUS_FAIL_GOTO( status, final ); + } + +final: + for (i = 0; i < _INPUT_NUM; i++) + { + if (f32_in_buffer[i]) + { + free(f32_in_buffer[i]); + f32_in_buffer[i] = NULL; + } + if (in_attr[i]) + { + vsi_nn_kernel_tensor_attr_release( &in_attr[i] ); + } + } + for (i = 0; i < _OUTPUT_NUM; i++) + { + if (f32_out_buffer[i]) + { + free(f32_out_buffer[i]); + f32_out_buffer[i] = NULL; + } + if (out_attr[i]) + { + vsi_nn_kernel_tensor_attr_release( &out_attr[i] ); + } + } + + return status; +} /* _compute() */ + + +/* +* Query kernel +*/ +static vsi_status _query_kernel + ( + vsi_nn_kernel_t * kernel, + vsi_nn_tensor_t * const * const inputs, + vsi_nn_tensor_t * const * const outputs + /* Add extra params */ + ) +{ + vsi_status status = VSI_FAILURE; + snprintf( kernel->info.name, VX_MAX_KERNEL_NAME, "%s", _KERNEL_NAME ); + kernel->info.function = _compute; + kernel->info.parameters = _slice_kernel_param_def; + kernel->info.numParams = _cnt_of_array( _slice_kernel_param_def ); + status = VSI_SUCCESS; + + return status; +} /* _query_kernel() */ + + +static vsi_nn_kernel_node_t _setup + ( + vsi_nn_graph_t * graph, + vsi_nn_tensor_t ** inputs, + size_t input_num, + vsi_nn_tensor_t ** outputs, + size_t output_num, + const vsi_nn_kernel_param_t * params, + vsi_nn_kernel_t * kernel + ) +{ + vsi_status status = VSI_FAILURE; + vsi_nn_kernel_node_param_t node_params[_SLICE_PARAM_NUM]; + vsi_nn_kernel_node_t node = NULL; + + status = _query_kernel( kernel, inputs, outputs /* Add extra params */ ); + if ( VSI_SUCCESS == status) + { + node = vsi_nn_kernel_create_node( graph, kernel ); + if ( node ) + { + /* Set inputs and outputs */ + vsi_nn_kernel_node_pack_io( node_params, _SLICE_PARAM_NUM, + inputs, input_num, outputs, output_num ); + /* Pass parameters to node. 
*/ + status = vsi_nn_kernel_node_pass_param( node, node_params, _SLICE_PARAM_NUM ); + } + } + + return node; +} /* _setup() */ + +__END_DECLS + +REGISTER_BACKEND_CPU( slice, _setup ) diff --git a/src/tim/vx/internal/src/kernel/cpu/space2depth_internal_cpu.c b/src/tim/vx/internal/src/kernel/cpu/space2depth_internal_cpu.c index 4df8a52..a9170c7 100644 --- a/src/tim/vx/internal/src/kernel/cpu/space2depth_internal_cpu.c +++ b/src/tim/vx/internal/src/kernel/cpu/space2depth_internal_cpu.c @@ -35,7 +35,7 @@ #include "vsi_nn_error.h" #include "utils/vsi_nn_util.h" #include "kernel/vsi_nn_kernel.h" -#include "client/vsi_nn_vxkernel.h" +#include "libnnext/vsi_nn_vxkernel.h" __BEGIN_DECLS diff --git a/src/tim/vx/internal/src/kernel/cpu/tile_cpu.c b/src/tim/vx/internal/src/kernel/cpu/tile_cpu.c index 63c2f4c..90729c7 100644 --- a/src/tim/vx/internal/src/kernel/cpu/tile_cpu.c +++ b/src/tim/vx/internal/src/kernel/cpu/tile_cpu.c @@ -32,7 +32,7 @@ #include "vsi_nn_prv.h" #include "vsi_nn_error.h" #include "kernel/vsi_nn_kernel.h" -#include "client/vsi_nn_vxkernel.h" +#include "libnnext/vsi_nn_vxkernel.h" __BEGIN_DECLS diff --git a/src/tim/vx/internal/src/kernel/cpu/topk_cpu.c b/src/tim/vx/internal/src/kernel/cpu/topk_cpu.c new file mode 100644 index 0000000..a2062c8 --- /dev/null +++ b/src/tim/vx/internal/src/kernel/cpu/topk_cpu.c @@ -0,0 +1,297 @@ +/**************************************************************************** +* +* Copyright (c) 2020 Vivante Corporation +* +* Permission is hereby granted, free of charge, to any person obtaining a +* copy of this software and associated documentation files (the "Software"), +* to deal in the Software without restriction, including without limitation +* the rights to use, copy, modify, merge, publish, distribute, sublicense, +* and/or sell copies of the Software, and to permit persons to whom the +* Software is furnished to do so, subject to the following conditions: +* +* The above copyright notice and this permission notice shall be included in +* all copies or substantial portions of the Software. +* +* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +* AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING +* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER +* DEALINGS IN THE SOFTWARE. +* +*****************************************************************************/ + + +#include +#include +#include +#include "vsi_nn_types.h" +#include "vsi_nn_tensor.h" +#include "vsi_nn_graph.h" +#include "vsi_nn_log.h" +#include "vsi_nn_error.h" +#include "vsi_nn_prv.h" +#include "vsi_nn_tensor_util.h" +#include "utils/vsi_nn_util.h" +#include "kernel/vsi_nn_kernel.h" +#include "libnnext/vx_lib_nnext.h" + +__BEGIN_DECLS + +/* + * Define kernel meta. 
+ */ +#define _INPUT_NUM (1) +#define _OUTPUT_NUM (2) +#define _KERNEL_NAME CVIVANTE_NAMESPACE("cpu.topk") + + +/* + * Kernel params + */ +static vx_param_description_t _topk_kernel_param_def[] = +{ + {VX_INPUT, VX_TYPE_TENSOR, VX_PARAMETER_STATE_REQUIRED}, + {VX_OUTPUT, VX_TYPE_TENSOR, VX_PARAMETER_STATE_REQUIRED}, + {VX_OUTPUT, VX_TYPE_TENSOR, VX_PARAMETER_STATE_REQUIRED}, + {VX_INPUT, VX_TYPE_SCALAR, VX_PARAMETER_STATE_REQUIRED }, + // Add kererl parameters here +}; +#define _TOPK_PARAM_NUM _cnt_of_array( _topk_kernel_param_def ) + +static uint32_t _max_comp_func(void* data, int32_t left, int32_t right) +{ + float* fdata = (float*)data; + if (fdata[left] >= fdata[right]) + { + return TRUE; + } + else + { + return FALSE; + } +} + +static void _find_top_k_1d +( + float* input, + uint32_t input_len, + uint32_t k, + float* value, + uint32_t* indices +) +{ + int32_t low = 0; + int32_t high = input_len - 1; + int32_t j; + + for (j = 0; j < (int32_t)input_len; j++) + { + indices[j] = j; + } + + j = vsi_nn_partition(input, low, high, _max_comp_func, FALSE, indices); + + //part_sort + while (j != (int32_t)k) + { + if ((int32_t)k > j) + { + low = j + 1; + } + else + { + high = j; + } + j = vsi_nn_partition(input, low, high, _max_comp_func, FALSE, indices); + } + //all_sort + vsi_nn_partition(input, 0, k - 1, _max_comp_func, TRUE, indices); + + for (j = 0; j < (int32_t)k; j++) + { + value[j] = input[indices[j]]; + } +} + +/* + * Kernel function + */ +DEF_KERNEL_EXECUTOR(_compute) + ( + vsi_nn_kernel_node_t node, + const vsi_nn_kernel_node_param_t * param, + size_t param_size + ) +{ + vsi_status status = VSI_FAILURE; + vsi_nn_kernel_tensor_t input[_INPUT_NUM] = {NULL}; + vsi_nn_kernel_tensor_t output[_OUTPUT_NUM] = {NULL}; + float *f32_in_buffer[_INPUT_NUM] = {NULL}; + float *f32_out_buffer[_OUTPUT_NUM] = {NULL}; + vsi_nn_kernel_tensor_attr_t *in_attr[_INPUT_NUM]; + vsi_nn_kernel_tensor_attr_t *out_attr[_OUTPUT_NUM]; + size_t out_stride_size[_OUTPUT_NUM][VSI_NN_MAX_DIM_NUM] = {{1}}; + size_t out_elements[_OUTPUT_NUM] = {0}; + size_t out_bytes[_OUTPUT_NUM] = {0}; + uint32_t i = 0; + int32_t j = 0; + int32_t top_k = 0; + uint32_t block_num = 0; + uint32_t block_size = 0; + uint32_t * indices_ptr = NULL; + + /* prepare data */ + for (i = 0; i < _INPUT_NUM; i ++) + { + input[i] = (vsi_nn_kernel_tensor_t)param[i]; + in_attr[i] = vsi_nn_kernel_tensor_attr_create( input[i] ); + f32_in_buffer[i] = (float*)vsi_nn_kernel_tensor_create_buffer( input[i], in_attr[i], TRUE ); + CHECK_PTR_FAIL_GOTO( f32_in_buffer[i], "Create input0 buffer fail.", final ); + } + + for (i = 0; i < _OUTPUT_NUM; i ++) + { + output[i] = (vsi_nn_kernel_tensor_t)param[i + _INPUT_NUM]; + out_attr[i] = vsi_nn_kernel_tensor_attr_create( output[i] ); + vsi_nn_kernel_tensor_attr_get_stride( out_attr[i], out_stride_size[i] ); + out_elements[i] = vsi_nn_kernel_tensor_attr_get_size( out_attr[i] ); + out_bytes[i] = out_elements[i] * sizeof(float); + f32_out_buffer[i] = (float *)malloc( out_bytes[i] ); + CHECK_PTR_FAIL_GOTO( f32_out_buffer[i], "Create output buffer fail.", final ); + memset( f32_out_buffer[i], 0, out_bytes[i] ); + } + + status = vsi_nn_kernel_scalar_read_int32( param[3], &top_k ); + CHECK_STATUS_FAIL_GOTO(status, final ); + + block_num = in_attr[0]->shape->data[1]; + block_size = in_attr[0]->shape->data[0]; + indices_ptr = (uint32_t*)malloc(block_size * sizeof(uint32_t)); + CHECK_PTR_FAIL_GOTO( indices_ptr, "Create indices buffer fail.", final ); + + for(i = 0; i < block_num; i++) + { + uint32_t in_index = i * block_size; + 
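The per-block selection above relies on vsi_nn_partition to quickselect over an index array. The sketch below is illustrative only and not part of this patch: it uses a plain Lomuto partition and the conventional high = j - 1 narrowing rather than the kernel's vsi_nn_partition interface, and it assumes 0 < k < len. The idea is the same: permute indices until the pivot lands at position k, so indices[0..k-1] reference the k largest values, then order that prefix descending.

#include <stdint.h>

/* Illustrative sketch, not part of this patch: descending Lomuto partition
 * over an index array; 'data' itself is never moved. */
static int32_t partition_desc
    ( const float * data, uint32_t * idx, int32_t low, int32_t high )
{
    float pivot = data[idx[high]];
    int32_t i = low - 1;
    int32_t j;
    for ( j = low; j < high; j++ )
    {
        if ( data[idx[j]] >= pivot )
        {
            uint32_t t = idx[++i]; idx[i] = idx[j]; idx[j] = t;
        }
    }
    {
        uint32_t t = idx[i + 1]; idx[i + 1] = idx[high]; idx[high] = t;
    }
    return i + 1;
}

/* Illustrative sketch, not part of this patch: select the k largest values of
 * data[0..len-1] into value[0..k-1] (descending) and their positions into
 * indices[0..k-1]. Assumes 0 < k < len. */
static void top_k_1d_sketch
    ( const float * data, uint32_t len, uint32_t k,
      float * value, uint32_t * indices )
{
    int32_t low = 0;
    int32_t high = (int32_t)len - 1;
    int32_t j, m;
    if ( k == 0 || k >= len )
    {
        return; /* degenerate sizes are out of scope for this sketch */
    }
    for ( j = 0; j < (int32_t)len; j++ )
    {
        indices[j] = (uint32_t)j;
    }
    /* quickselect: stop once the pivot settles at position k */
    j = partition_desc( data, indices, low, high );
    while ( j != (int32_t)k )
    {
        if ( j < (int32_t)k )
        {
            low = j + 1;
        }
        else
        {
            high = j - 1;
        }
        j = partition_desc( data, indices, low, high );
    }
    /* order the selected prefix descending (insertion sort) */
    for ( j = 1; j < (int32_t)k; j++ )
    {
        uint32_t cur = indices[j];
        for ( m = j - 1; m >= 0 && data[indices[m]] < data[cur]; m-- )
        {
            indices[m + 1] = indices[m];
        }
        indices[m + 1] = cur;
    }
    for ( j = 0; j < (int32_t)k; j++ )
    {
        value[j] = data[indices[j]];
    }
}

For instance, data {0.1, 0.9, 0.5, 0.7} with k = 2 yields value {0.9, 0.7} and indices {1, 3}.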
uint32_t out_index = i * top_k; + _find_top_k_1d(&(f32_in_buffer[0][in_index]), + block_size, top_k, &(f32_out_buffer[0][out_index]), indices_ptr); + + for (j = 0; j < top_k; j++) + { + f32_out_buffer[1][out_index + j] = (float)indices_ptr[j]; + } + } + // Handle the 1D input + if (!block_num) + { + _find_top_k_1d(&(f32_in_buffer[0][0]), + block_size, top_k, &(f32_out_buffer[0][0]), indices_ptr); + for (j = 0; j < top_k; j++) + { + f32_out_buffer[1][j] = (float)indices_ptr[j]; + } + } + + /* save data */ + for(i = 0; i < _OUTPUT_NUM; i++) + { + status = vsi_nn_kernel_tensor_write_from_float( output[i], out_attr[i], + f32_out_buffer[i], out_elements[i] ); + CHECK_STATUS_FAIL_GOTO( status, final ); + } + +final: + vsi_nn_safe_free(indices_ptr); + for (i = 0; i < _INPUT_NUM; i++) + { + if (f32_in_buffer[i]) + { + free(f32_in_buffer[i]); + f32_in_buffer[i] = NULL; + } + if (in_attr[i]) + { + vsi_nn_kernel_tensor_attr_release( &in_attr[i] ); + } + } + for (i = 0; i < _OUTPUT_NUM; i++) + { + if (f32_out_buffer[i]) + { + free(f32_out_buffer[i]); + f32_out_buffer[i] = NULL; + } + if (out_attr[i]) + { + vsi_nn_kernel_tensor_attr_release( &out_attr[i] ); + } + } + + return status; +} /* _compute() */ + + +/* + * Query kernel + */ +static vsi_status _query_kernel + ( + vsi_nn_kernel_t * kernel, + vsi_nn_tensor_t * const * const inputs, + vsi_nn_tensor_t * const * const outputs + /* Add extra params */ + ) +{ + vsi_status status = VSI_FAILURE; + snprintf( kernel->info.name, VX_MAX_KERNEL_NAME, "%s", _KERNEL_NAME ); + kernel->info.function = _compute; + kernel->info.parameters = _topk_kernel_param_def; + kernel->info.numParams = _cnt_of_array( _topk_kernel_param_def ); + status = VSI_SUCCESS; + + return status; +} /* _query_kernel() */ + + +static vsi_nn_kernel_node_t _setup + ( + vsi_nn_graph_t * graph, + vsi_nn_tensor_t ** inputs, + size_t input_num, + vsi_nn_tensor_t ** outputs, + size_t output_num, + const vsi_nn_kernel_param_t * params, + vsi_nn_kernel_t * kernel + ) +{ + vsi_status status = VSI_FAILURE; + vsi_nn_kernel_node_param_t node_params[_TOPK_PARAM_NUM]; + vsi_nn_kernel_node_t node = NULL; + int32_t top_k = vsi_nn_kernel_param_get_int32(params, "top_k"); + + status = _query_kernel( kernel, inputs, outputs /* Add extra params */ ); + if ( VSI_SUCCESS == status) + { + node = vsi_nn_kernel_create_node( graph, kernel ); + if ( node ) + { + /* Set inputs and outputs */ + vsi_nn_kernel_node_pack_io( node_params, _TOPK_PARAM_NUM, + inputs, input_num, outputs, output_num ); + node_params[3] = vsi_nn_kernel_scalar_create( graph, I32, &top_k ); + /* Pass parameters to node. 
*/ + status = vsi_nn_kernel_node_pass_param( node, node_params, _TOPK_PARAM_NUM ); + + vsi_nn_kernel_scalar_release( &node_params[3] ); + } + } + + return node; +} /* _setup() */ + +__END_DECLS + +REGISTER_BACKEND_CPU( topk, _setup ) + diff --git a/src/tim/vx/internal/src/kernel/evis/batchnorm_single_evis.c b/src/tim/vx/internal/src/kernel/evis/batchnorm_single_evis.c index ec0213c..13b2a6a 100644 --- a/src/tim/vx/internal/src/kernel/evis/batchnorm_single_evis.c +++ b/src/tim/vx/internal/src/kernel/evis/batchnorm_single_evis.c @@ -44,23 +44,26 @@ typedef enum _internal_img_dim_e IMAGE_2D, } internal_img_dim_e; -#define _BATCH_NORM_KERNEL_SOURCE "batchnorm_single" +#define SOURCE0 "batchnorm_single" +#define SOURCE1 "batchnorm_single_f32" #define STR(a) #a // Add kernel hashtable here -#define BATCH_NORM_HASH_KEY(IN_DTYPE, OUT_DTYPE, BRDCST, _image_2d) \ - ( ( IN_DTYPE << 16 ) | ( OUT_DTYPE << 3) | ( BRDCST << 1) | (_image_2d) ) +#define BATCH_NORM_HASH_KEY(IN_DTYPE, GAMMA_TYPE, OUT_DTYPE, BRDCST, _image_2d) \ + ( ( IN_DTYPE << 24 ) | ( GAMMA_TYPE << 16 ) | ( OUT_DTYPE << 3) | ( BRDCST << 1) | (_image_2d) ) -#define PACK_KERNEL_MAP( IN_DTYPE, OUT_DTYPE, BRDCST) \ - { BATCH_NORM_HASH_KEY( IN_DTYPE, OUT_DTYPE, BRDCST, IMAGE), \ - CVIVANTE_NAMESPACE("evis.batch_norm_"STR(IN_DTYPE)"to"STR(OUT_DTYPE)"_brdcst"#BRDCST), \ - _BATCH_NORM_KERNEL_SOURCE} +#define PACK_KERNEL_MAP( IN_DTYPE, GAMMA_TYPE, OUT_DTYPE, BRDCST, source) \ + { BATCH_NORM_HASH_KEY( IN_DTYPE, GAMMA_TYPE, OUT_DTYPE, BRDCST, IMAGE), \ + CVIVANTE_NAMESPACE("evis.batch_norm_"STR(IN_DTYPE)"_F16_F16_" \ + STR(GAMMA_TYPE)"_F32to"STR(OUT_DTYPE)"_brdcst"#BRDCST), \ + source} -#define PACK_KERNEL_MAP_2D( IN_DTYPE, OUT_DTYPE, BRDCST) \ - { BATCH_NORM_HASH_KEY( IN_DTYPE, OUT_DTYPE, BRDCST, IMAGE_2D), \ - CVIVANTE_NAMESPACE("evis.batch_norm_"STR(IN_DTYPE)"to"STR(OUT_DTYPE)"_brdcst"#BRDCST"_2D"), \ - _BATCH_NORM_KERNEL_SOURCE} +#define PACK_KERNEL_MAP_2D( IN_DTYPE, GAMMA_TYPE, OUT_DTYPE, BRDCST, source) \ + { BATCH_NORM_HASH_KEY( IN_DTYPE, GAMMA_TYPE, OUT_DTYPE, BRDCST, IMAGE_2D), \ + CVIVANTE_NAMESPACE("evis.batch_norm_"STR(IN_DTYPE)"_F16_F16_" \ + STR(GAMMA_TYPE)"_F32to"STR(OUT_DTYPE)"_brdcst"#BRDCST"_2D"), \ + source} typedef struct { @@ -71,47 +74,89 @@ typedef struct static const _kernel_map_type _batch_norm_kernel_map[] = { - PACK_KERNEL_MAP(F16, F16, 0), - PACK_KERNEL_MAP(F16, I16, 0), - PACK_KERNEL_MAP(F16, U8, 0), - PACK_KERNEL_MAP(F16, I8, 0), - PACK_KERNEL_MAP(U8, U8, 0), - PACK_KERNEL_MAP(U8, F16, 0), - PACK_KERNEL_MAP(I8, I8, 0), - PACK_KERNEL_MAP(I8, F16, 0), - PACK_KERNEL_MAP(I16, I16, 0), - PACK_KERNEL_MAP(I16, F16, 0), - PACK_KERNEL_MAP(F16, F16, 1), - PACK_KERNEL_MAP(F16, I16, 1), - PACK_KERNEL_MAP(F16, U8, 1), - PACK_KERNEL_MAP(F16, I8, 1), - PACK_KERNEL_MAP(U8, U8, 1), - PACK_KERNEL_MAP(U8, F16, 1), - PACK_KERNEL_MAP(I8, I8, 1), - PACK_KERNEL_MAP(I8, F16, 1), - PACK_KERNEL_MAP(I16, I16, 1), - PACK_KERNEL_MAP(I16, F16, 1), + PACK_KERNEL_MAP(F16, F16, F16, 0, SOURCE0), + PACK_KERNEL_MAP(F16, F16, I16, 0, SOURCE0), + PACK_KERNEL_MAP(F16, F16, U8, 0, SOURCE0), + PACK_KERNEL_MAP(F16, F16, I8, 0, SOURCE0), + PACK_KERNEL_MAP(U8, F16, U8, 0, SOURCE0), + PACK_KERNEL_MAP(U8, F16, F16, 0, SOURCE0), + PACK_KERNEL_MAP(I8, F16, I8, 0, SOURCE0), + PACK_KERNEL_MAP(I8, F16, F16, 0, SOURCE0), + PACK_KERNEL_MAP(I16, F16, I16, 0, SOURCE0), + PACK_KERNEL_MAP(I16, F16, F16, 0, SOURCE0), + PACK_KERNEL_MAP(F16, F16, F16, 1, SOURCE0), + PACK_KERNEL_MAP(F16, F16, I16, 1, SOURCE0), + PACK_KERNEL_MAP(F16, F16, U8, 1, SOURCE0), + PACK_KERNEL_MAP(F16, 
F16, I8, 1, SOURCE0), + PACK_KERNEL_MAP(U8, F16, U8, 1, SOURCE0), + PACK_KERNEL_MAP(U8, F16, F16, 1, SOURCE0), + PACK_KERNEL_MAP(I8, F16, I8, 1, SOURCE0), + PACK_KERNEL_MAP(I8, F16, F16, 1, SOURCE0), + PACK_KERNEL_MAP(I16, F16, I16, 1, SOURCE0), + PACK_KERNEL_MAP(I16, F16, F16, 1, SOURCE0), - PACK_KERNEL_MAP_2D(F16, F16, 0), - PACK_KERNEL_MAP_2D(F16, I16, 0), - PACK_KERNEL_MAP_2D(F16, U8 , 0), - PACK_KERNEL_MAP_2D(F16, I8 , 0), - PACK_KERNEL_MAP_2D(U8, U8 , 0), - PACK_KERNEL_MAP_2D(U8, F16, 0), - PACK_KERNEL_MAP_2D(I8, I8, 0), - PACK_KERNEL_MAP_2D(I8, F16, 0), - PACK_KERNEL_MAP_2D(I16, I16, 0), - PACK_KERNEL_MAP_2D(I16, F16, 0), - PACK_KERNEL_MAP_2D(F16, F16, 1), - PACK_KERNEL_MAP_2D(F16, I16, 1), - PACK_KERNEL_MAP_2D(F16, U8 , 1), - PACK_KERNEL_MAP_2D(F16, I8 , 1), - PACK_KERNEL_MAP_2D(U8, U8 , 1), - PACK_KERNEL_MAP_2D(U8, F16, 1), - PACK_KERNEL_MAP_2D(I8, I8, 1), - PACK_KERNEL_MAP_2D(I8, F16, 1), - PACK_KERNEL_MAP_2D(I16, I16, 1), - PACK_KERNEL_MAP_2D(I16, F16, 1), + PACK_KERNEL_MAP(F16, F32, F16, 0, SOURCE1), + PACK_KERNEL_MAP(F16, F32, I16, 0, SOURCE1), + PACK_KERNEL_MAP(F16, F32, U8, 0, SOURCE1), + PACK_KERNEL_MAP(F16, F32, I8, 0, SOURCE1), + PACK_KERNEL_MAP(U8, F32, U8, 0, SOURCE1), + PACK_KERNEL_MAP(U8, F32, F16, 0, SOURCE1), + PACK_KERNEL_MAP(I8, F32, I8, 0, SOURCE1), + PACK_KERNEL_MAP(I8, F32, F16, 0, SOURCE1), + PACK_KERNEL_MAP(I16, F32, I16, 0, SOURCE1), + PACK_KERNEL_MAP(I16, F32, F16, 0, SOURCE1), + PACK_KERNEL_MAP(F16, F32, F16, 1, SOURCE1), + PACK_KERNEL_MAP(F16, F32, I16, 1, SOURCE1), + PACK_KERNEL_MAP(F16, F32, U8, 1, SOURCE1), + PACK_KERNEL_MAP(F16, F32, I8, 1, SOURCE1), + PACK_KERNEL_MAP(U8, F32, U8, 1, SOURCE1), + PACK_KERNEL_MAP(U8, F32, F16, 1, SOURCE1), + PACK_KERNEL_MAP(I8, F32, I8, 1, SOURCE1), + PACK_KERNEL_MAP(I8, F32, F16, 1, SOURCE1), + PACK_KERNEL_MAP(I16, F32, I16, 1, SOURCE1), + PACK_KERNEL_MAP(I16, F32, F16, 1, SOURCE1), + + PACK_KERNEL_MAP_2D(F16, F16, F16, 0, SOURCE0), + PACK_KERNEL_MAP_2D(F16, F16, I16, 0, SOURCE0), + PACK_KERNEL_MAP_2D(F16, F16, U8, 0, SOURCE0), + PACK_KERNEL_MAP_2D(F16, F16, I8, 0, SOURCE0), + PACK_KERNEL_MAP_2D(U8, F16, U8, 0, SOURCE0), + PACK_KERNEL_MAP_2D(U8, F16, F16, 0, SOURCE0), + PACK_KERNEL_MAP_2D(I8, F16, I8, 0, SOURCE0), + PACK_KERNEL_MAP_2D(I8, F16, F16, 0, SOURCE0), + PACK_KERNEL_MAP_2D(I16, F16, I16, 0, SOURCE0), + PACK_KERNEL_MAP_2D(I16, F16, F16, 0, SOURCE0), + PACK_KERNEL_MAP_2D(F16, F16, F16, 1, SOURCE0), + PACK_KERNEL_MAP_2D(F16, F16, I16, 1, SOURCE0), + PACK_KERNEL_MAP_2D(F16, F16, U8, 1, SOURCE0), + PACK_KERNEL_MAP_2D(F16, F16, I8, 1, SOURCE0), + PACK_KERNEL_MAP_2D(U8, F16, U8, 1, SOURCE0), + PACK_KERNEL_MAP_2D(U8, F16, F16, 1, SOURCE0), + PACK_KERNEL_MAP_2D(I8, F16, I8, 1, SOURCE0), + PACK_KERNEL_MAP_2D(I8, F16, F16, 1, SOURCE0), + PACK_KERNEL_MAP_2D(I16, F16, I16, 1, SOURCE0), + PACK_KERNEL_MAP_2D(I16, F16, F16, 1, SOURCE0), + + PACK_KERNEL_MAP_2D(F16, F32, F16, 0, SOURCE1), + PACK_KERNEL_MAP_2D(F16, F32, I16, 0, SOURCE1), + PACK_KERNEL_MAP_2D(F16, F32, U8, 0, SOURCE1), + PACK_KERNEL_MAP_2D(F16, F32, I8, 0, SOURCE1), + PACK_KERNEL_MAP_2D(U8, F32, U8, 0, SOURCE1), + PACK_KERNEL_MAP_2D(U8, F32, F16, 0, SOURCE1), + PACK_KERNEL_MAP_2D(I8, F32, I8, 0, SOURCE1), + PACK_KERNEL_MAP_2D(I8, F32, F16, 0, SOURCE1), + PACK_KERNEL_MAP_2D(I16, F32, I16, 0, SOURCE1), + PACK_KERNEL_MAP_2D(I16, F32, F16, 0, SOURCE1), + PACK_KERNEL_MAP_2D(F16, F32, F16, 1, SOURCE1), + PACK_KERNEL_MAP_2D(F16, F32, I16, 1, SOURCE1), + PACK_KERNEL_MAP_2D(F16, F32, U8, 1, SOURCE1), + PACK_KERNEL_MAP_2D(F16, F32, I8, 1, SOURCE1), + PACK_KERNEL_MAP_2D(U8, 
F32, U8, 1, SOURCE1), + PACK_KERNEL_MAP_2D(U8, F32, F16, 1, SOURCE1), + PACK_KERNEL_MAP_2D(I8, F32, I8, 1, SOURCE1), + PACK_KERNEL_MAP_2D(I8, F32, F16, 1, SOURCE1), + PACK_KERNEL_MAP_2D(I16, F32, I16, 1, SOURCE1), + PACK_KERNEL_MAP_2D(I16, F32, F16, 1, SOURCE1), }; /* @@ -329,6 +374,7 @@ static vsi_status _query_kernel { vsi_status status = VSI_FAILURE; vsi_nn_kernel_dtype_e in_dtype; + vsi_nn_kernel_dtype_e gamma_dtype; vsi_nn_kernel_dtype_e out_dtype; const _kernel_map_type * kernel_map = _batch_norm_kernel_map; size_t kernel_map_size = _cnt_of_array( _batch_norm_kernel_map ); @@ -340,6 +386,7 @@ static vsi_status _query_kernel uint32_t brdcst = 0; in_dtype = vsi_nn_kernel_map_dtype( inputs[0]->attr.dtype.vx_type ); + gamma_dtype = vsi_nn_kernel_map_dtype( inputs[3]->attr.dtype.vx_type ); out_dtype = vsi_nn_kernel_map_dtype( outputs[0]->attr.dtype.vx_type ); if (inputs[BATCHNORM_INPUT]->attr.size[0] != 1 && inputs[BATCHNORM_INPUT_BETA]->attr.size[0] == 1) @@ -347,7 +394,7 @@ static vsi_status _query_kernel brdcst = 1; } - key = BATCH_NORM_HASH_KEY(in_dtype, out_dtype, brdcst, image_2d); + key = BATCH_NORM_HASH_KEY(in_dtype, gamma_dtype, out_dtype, brdcst, image_2d); for( i = 0; i < kernel_map_size; i ++ ) { @@ -397,7 +444,6 @@ static vsi_nn_kernel_node_t _setup if ( (inputs[1]->attr.is_const && inputs[2]->attr.is_const) || (inputs[1]->attr.dtype.vx_type != VSI_NN_TYPE_FLOAT16) || (inputs[2]->attr.dtype.vx_type != VSI_NN_TYPE_FLOAT16) - || (inputs[3]->attr.dtype.vx_type != VSI_NN_TYPE_FLOAT16) || (inputs[4]->attr.dtype.vx_type != VSI_NN_TYPE_FLOAT32) ) { return NULL; diff --git a/src/tim/vx/internal/src/kernel/evis/cast_evis.c b/src/tim/vx/internal/src/kernel/evis/cast_evis.c index 2d25883..4f201e9 100644 --- a/src/tim/vx/internal/src/kernel/evis/cast_evis.c +++ b/src/tim/vx/internal/src/kernel/evis/cast_evis.c @@ -241,6 +241,7 @@ static vsi_status _query_kernel uint32_t i; in_dtype = vsi_nn_kernel_map_dtype( inputs[0]->attr.dtype.vx_type ); + in_dtype = in_dtype == BOOL8 ? I8 : in_dtype; out_dtype = vsi_nn_kernel_map_dtype( outputs[0]->attr.dtype.vx_type ); key = CAST_HASH_KEY( in_dtype, out_dtype, image_2d ); diff --git a/src/tim/vx/internal/src/kernel/evis/comparisons_evis.c b/src/tim/vx/internal/src/kernel/evis/comparisons_evis.c index fd32271..3c1ac2f 100644 --- a/src/tim/vx/internal/src/kernel/evis/comparisons_evis.c +++ b/src/tim/vx/internal/src/kernel/evis/comparisons_evis.c @@ -455,6 +455,7 @@ static vsi_status _query_kernel input0_dtype = vsi_nn_kernel_map_dtype( inputs[0]->attr.dtype.vx_type ); input1_dtype = vsi_nn_kernel_map_dtype( inputs[0]->attr.dtype.vx_type ); output_dtype = vsi_nn_kernel_map_dtype( outputs[0]->attr.dtype.vx_type ); + output_dtype = output_dtype == I8 ? 
BOOL8 : output_dtype; key = HASH_COMPARISONS_KEY( operation, input0_dtype, input1_dtype, output_dtype, image_2d ); for( i = 0; i < _cnt_of_array(_comparisons_evis_kernel_map); i ++ ) diff --git a/src/tim/vx/internal/src/kernel/evis/conv1d_ovxlib_evis.c b/src/tim/vx/internal/src/kernel/evis/conv1d_ovxlib_evis.c new file mode 100644 index 0000000..923328e --- /dev/null +++ b/src/tim/vx/internal/src/kernel/evis/conv1d_ovxlib_evis.c @@ -0,0 +1,702 @@ +/**************************************************************************** +* +* Copyright (c) 2020 Vivante Corporation +* +* Permission is hereby granted, free of charge, to any person obtaining a +* copy of this software and associated documentation files (the "Software"), +* to deal in the Software without restriction, including without limitation +* the rights to use, copy, modify, merge, publish, distribute, sublicense, +* and/or sell copies of the Software, and to permit persons to whom the +* Software is furnished to do so, subject to the following conditions: +* +* The above copyright notice and this permission notice shall be included in +* all copies or substantial portions of the Software. +* +* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +* AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING +* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER +* DEALINGS IN THE SOFTWARE. +* +*****************************************************************************/ + + +#include +#include +#include +#include "vsi_nn_types.h" +#include "vsi_nn_tensor.h" +#include "vsi_nn_graph.h" +#include "vsi_nn_log.h" +#include "vsi_nn_error.h" +#include "vsi_nn_prv.h" +#include "vsi_nn_tensor_util.h" +#include "utils/vsi_nn_util.h" +#include "kernel/vsi_nn_kernel.h" +#include "libnnext/vx_lib_nnext.h" + +__BEGIN_DECLS + +/* + * Define kernel meta. 
+ */ +typedef enum +{ + NORMAL = 0, + K3_S1, + K3_S1_D2_D4, + K1024_SMALL, + K1024_LARGE, +} _internal_kernel_e; + +#define _CONV1D_OVXLIB_KERNEL_SOURCE "conv1d_ovxlib" +#define _CONV1D_OVXLIB_KERNEL_SOURCE_K1024 "conv1d_ovxlib_k1024" + +#define STR(a) #a +// Add kernel hashtable here +#define CONV1D_OVXLIB_HASH_KEY( IN_DTYPE, W_DTYPE, B_DTYPE, OUT_DTYPE, KERNEL_TYPE ) \ + (( KERNEL_TYPE << 24 ) | ( IN_DTYPE << 18 ) | ( W_DTYPE << 12 ) | ( B_DTYPE << 6 ) | ( OUT_DTYPE )) +#define PACK_KERNEL_MAP( IN_DTYPE, W_DTYPE, B_DTYPE, OUT_DTYPE, KERNEL_TYPE, SOURCE ) \ + { CONV1D_OVXLIB_HASH_KEY( IN_DTYPE, W_DTYPE, B_DTYPE, OUT_DTYPE, KERNEL_TYPE ), \ + CVIVANTE_NAMESPACE(\ + "evis.conv1d_"STR(IN_DTYPE)STR(W_DTYPE)STR(B_DTYPE)"to"STR(OUT_DTYPE)"_"STR(KERNEL_TYPE)), \ + SOURCE } + +typedef struct +{ + uint32_t key; + char * function_name; + const char * source_name; +} _kernel_map_type; + +static const _kernel_map_type _conv1d_ovxlib_kernel_map[] = +{ + // Register kernel here + PACK_KERNEL_MAP( U8, U8, I32, U8, K3_S1, _CONV1D_OVXLIB_KERNEL_SOURCE ), + PACK_KERNEL_MAP( U8, U8, I32, U8, K3_S1_D2_D4, _CONV1D_OVXLIB_KERNEL_SOURCE ), + PACK_KERNEL_MAP( U8, U8, I32, U8, K1024_SMALL, _CONV1D_OVXLIB_KERNEL_SOURCE_K1024 ), + PACK_KERNEL_MAP( U8, U8, I32, U8, K1024_LARGE, _CONV1D_OVXLIB_KERNEL_SOURCE_K1024 ), +}; + + +/* + * Kernel params + */ +static vx_param_description_t _conv1d_ovxlib_kernel_param_def[] = +{ + {VX_INPUT, VX_TYPE_TENSOR, VX_PARAMETER_STATE_REQUIRED}, + {VX_INPUT, VX_TYPE_TENSOR, VX_PARAMETER_STATE_REQUIRED}, + {VX_INPUT, VX_TYPE_TENSOR, VX_PARAMETER_STATE_REQUIRED}, + {VX_OUTPUT, VX_TYPE_TENSOR, VX_PARAMETER_STATE_REQUIRED}, + {VX_INPUT, VX_TYPE_SCALAR, VX_PARAMETER_STATE_REQUIRED}, + {VX_INPUT, VX_TYPE_SCALAR, VX_PARAMETER_STATE_REQUIRED}, + {VX_INPUT, VX_TYPE_SCALAR, VX_PARAMETER_STATE_REQUIRED}, + {VX_INPUT, VX_TYPE_SCALAR, VX_PARAMETER_STATE_REQUIRED}, + {VX_INPUT, VX_TYPE_SCALAR, VX_PARAMETER_STATE_REQUIRED}, +}; +#define _CONV1D_OVXLIB_PARAM_NUM _cnt_of_array( _conv1d_ovxlib_kernel_param_def ) + +/* + * Kernel initializer + */ +DEF_KERNEL_INITIALIZER(_conv1d_ovxlib_initializer) + ( + vsi_nn_kernel_node_t node, + const vsi_nn_kernel_node_param_t * param, + size_t param_size + ) +{ + vsi_status status = VSI_FAILURE; + gpu_param_t gpu_param = { + 3, + {0, 0, 0}, + {0, 0, 0}, + {0, 0, 0}, + {0, 0, 0} + }; + vsi_nn_kernel_tensor_attr_t * input_attr = NULL; + vsi_nn_kernel_tensor_attr_t * weights_attr = NULL; + vsi_nn_kernel_tensor_attr_t * output_attr = NULL; + vsi_int_array_t * in_shape = NULL; + vsi_int_array_t * out_shape = NULL; + vsi_int_array_t * weight_shape = NULL; + float scaleIn = 1.0f; + float scaleOut = 1.0f; + float scaleWights = 1.0f; + int32_t input_ZP = 0; + int32_t weight_ZP = 0; + float output_ZP = 0; + int32_t stride = 1; + int32_t dilation = 0; + int32_t input_height = 0; + int32_t input_width = 0; + int32_t output_width = 0; + + input_attr = vsi_nn_kernel_tensor_attr_create( (vsi_nn_kernel_tensor_t)param[0] ); + CHECK_PTR_FAIL_GOTO( input_attr, "Create tensor attr buffer fail.", final ); + + weights_attr = vsi_nn_kernel_tensor_attr_create( (vsi_nn_kernel_tensor_t)param[1] ); + CHECK_PTR_FAIL_GOTO( weights_attr, "Create tensor attr buffer fail.", final ); + + output_attr = vsi_nn_kernel_tensor_attr_create( (vsi_nn_kernel_tensor_t)param[3] ); + CHECK_PTR_FAIL_GOTO( output_attr, "Create tensor attr buffer fail.", final ); + + vsi_nn_kernel_scalar_read_int32((vsi_nn_kernel_scalar_t)param[4], &(stride)); + vsi_nn_kernel_scalar_read_int32((vsi_nn_kernel_scalar_t)param[7], 
&(dilation)); + + in_shape = input_attr->shape; + out_shape = output_attr->shape; + weight_shape = weights_attr->shape; + + if ( VSI_NN_KERNEL_QUANT_ASYMM == input_attr->quant ) + { + input_ZP = input_attr->asymm.zero_point; + scaleIn = input_attr->asymm.scale; + } + + if ( VSI_NN_KERNEL_QUANT_ASYMM == weights_attr->quant ) + { + weight_ZP = weights_attr->asymm.zero_point; + scaleWights = weights_attr->asymm.scale; + } + + if ( VSI_NN_KERNEL_QUANT_ASYMM == output_attr->quant ) + { + output_ZP = (float)output_attr->asymm.zero_point; + scaleOut = output_attr->asymm.scale; + } + + scaleOut = (scaleIn * scaleWights) / scaleOut; + input_height = in_shape->data[1]; + input_width = in_shape->data[0]; + output_width = out_shape->data[0]; + + if ((U8 == input_attr->dtype) && (U8 == weights_attr->dtype) && (U8 == output_attr->dtype)) + { + gpu_dp_inst_t uniSumOrderUchar_2x8 = {{ + 0x11111111, // TCfg + 0x11110000, // ASelt + 0x0c080400, 0x0c080400, // ABin + 0x22222222, // BSelt + 0x00000000, 0x00000000, // BBin + 0x00000400, // AccumType, ConstantType, and PostShift + 0x00000001, 0x00000001, 0x00000001, 0x00000001, + 0x00000001, 0x00000001, 0x00000001, 0x00000001 // Constant + }, GPU_DP_TYPE_16 }; + + if ( (3 == weight_shape->data[0]) && (1 == stride) ) + { + gpu_dp_inst_t uniConv1DK3_Lo0_4x4 = {{ + 0x69696969, // TCfg + 0x44444444, // ASelt + 0x41014000, 0x43034202, // ABin + 0x55555555, // BSelt + 0x55405540, 0x55405540, // BBin + 0x00000400, // AccumType, ConstantType, and PostShift + 0x00000000, 0x00000000, 0x00000000, 0x00000000, + 0x00000000, 0x00000000, 0x00000000, 0x00000000 // Constant + }, GPU_DP_TYPE_16 }; + gpu_dp_inst_t uniConv1DK3_Lo1_4x4 = {{ + 0x69696969, // TCfg + 0x44444444, // ASelt + 0x41114010, 0x43134212, // ABin + 0x55555555, // BSelt + 0x55415541, 0x55415541, // BBin + 0x00000400, // AccumType, ConstantType, and PostShift + 0x00000000, 0x00000000, 0x00000000, 0x00000000, + 0x00000000, 0x00000000, 0x00000000, 0x00000000 // Constant + }, GPU_DP_TYPE_16 }; + gpu_dp_inst_t uniConv1DK3_Lo2_4x4 = {{ + 0x69696969, // TCfg + 0x44444444, // ASelt + 0x41214020, 0x43234222, // ABin + 0x55555555, // BSelt + 0x55425542, 0x55425542, // BBin + 0x00000400, // AccumType, ConstantType, and PostShift + 0x00000000, 0x00000000, 0x00000000, 0x00000000, + 0x00000000, 0x00000000, 0x00000000, 0x00000000 // Constant + }, GPU_DP_TYPE_16 }; + gpu_dp_inst_t uniConv1DK3_Hi0_4x4 = {{ + 0x69696969, // TCfg + 0x44444444, // ASelt + 0x45054404, 0x47074606, // ABin + 0x55555555, // BSelt + 0x55405540, 0x55405540, // BBin + 0x00000400, // AccumType, ConstantType, and PostShift + 0x00000000, 0x00000000, 0x00000000, 0x00000000, + 0x00000000, 0x00000000, 0x00000000, 0x00000000 // Constant + }, GPU_DP_TYPE_16 }; + gpu_dp_inst_t uniConv1DK3_Hi1_4x4 = {{ + 0x69696969, // TCfg + 0x44444444, // ASelt + 0x45154414, 0x47174616, // ABin + 0x55555555, // BSelt + 0x55415541, 0x55415541, // BBin + 0x00000400, // AccumType, ConstantType, and PostShift + 0x00000000, 0x00000000, 0x00000000, 0x00000000, + 0x00000000, 0x00000000, 0x00000000, 0x00000000 // Constant + }, GPU_DP_TYPE_16 }; + gpu_dp_inst_t uniConv1DK3_Hi2_4x4 = {{ + 0x69696969, // TCfg + 0x44444444, // ASelt + 0x45254424, 0x47274626, // ABin + 0x55555555, // BSelt + 0x55425542, 0x55425542, // BBin + 0x00000400, // AccumType, ConstantType, and PostShift + 0x00000000, 0x00000000, 0x00000000, 0x00000000, + 0x00000000, 0x00000000, 0x00000000, 0x00000000 // Constant + }, GPU_DP_TYPE_16 }; + gpu_dp_inst_t uniDataConvK3_2x8 = {{ + 0x00111111, // TCfg + 0x00110000, // ASelt 
+ 0x03020100, 0x00000504, // ABin + 0x00222222, // BSelt + 0x00000000, 0x00000000, // BBin + 0x00000400, // AccumType, ConstantType, and PostShift + 0x00000001, 0x00000001, 0x00000001, 0x00000001, + 0x00000001, 0x00000001, 0x00000000, 0x00000000 // Constant + }, GPU_DP_TYPE_16 }; + + uint32_t conv1dK3D2_Lo1[4] = {0x43134212, 0x45154414, 0x55415541, 0x55415541}; + uint32_t conv1dK3D2_Lo2[4] = {0x45254424, 0x47274626, 0x55425542, 0x55425542}; + uint32_t conv1dK3D2_Hi1[4] = {0x47174616, 0x49194818, 0x55415541, 0x55415541}; + uint32_t conv1dK3D2_Hi2[4] = {0x49294828, 0x4b2b4a2a, 0x55425542, 0x55425542}; + uint32_t conv1dK3D4_Lo1[4] = {0x45154414, 0x47174616, 0x55415541, 0x55415541}; + uint32_t conv1dK3D4_Lo2[4] = {0x49294828, 0x4b2b4a2a, 0x55425542, 0x55425542}; + uint32_t conv1dK3D4_Hi1[4] = {0x49194818, 0x4b1b4a1a, 0x55415541, 0x55415541}; + uint32_t conv1dK3D4_Hi2[4] = {0x4d2d4c2c, 0x4f2f4e2e, 0x55425542, 0x55425542}; + + if (2 == dilation) + { + uniConv1DK3_Lo1_4x4.data[2] = conv1dK3D2_Lo1[0]; + uniConv1DK3_Lo1_4x4.data[3] = conv1dK3D2_Lo1[1]; + uniConv1DK3_Lo1_4x4.data[5] = conv1dK3D2_Lo1[2]; + uniConv1DK3_Lo1_4x4.data[6] = conv1dK3D2_Lo1[3]; + uniConv1DK3_Lo2_4x4.data[2] = conv1dK3D2_Lo2[0]; + uniConv1DK3_Lo2_4x4.data[3] = conv1dK3D2_Lo2[1]; + uniConv1DK3_Lo2_4x4.data[5] = conv1dK3D2_Lo2[2]; + uniConv1DK3_Lo2_4x4.data[6] = conv1dK3D2_Lo2[3]; + uniConv1DK3_Hi1_4x4.data[2] = conv1dK3D2_Hi1[0]; + uniConv1DK3_Hi1_4x4.data[3] = conv1dK3D2_Hi1[1]; + uniConv1DK3_Hi1_4x4.data[5] = conv1dK3D2_Hi1[2]; + uniConv1DK3_Hi1_4x4.data[6] = conv1dK3D2_Hi1[3]; + uniConv1DK3_Hi2_4x4.data[2] = conv1dK3D2_Hi2[0]; + uniConv1DK3_Hi2_4x4.data[3] = conv1dK3D2_Hi2[1]; + uniConv1DK3_Hi2_4x4.data[5] = conv1dK3D2_Hi2[2]; + uniConv1DK3_Hi2_4x4.data[6] = conv1dK3D2_Hi2[3]; + } + else if (4 == dilation) + { + uniConv1DK3_Lo1_4x4.data[2] = conv1dK3D4_Lo1[0]; + uniConv1DK3_Lo1_4x4.data[3] = conv1dK3D4_Lo1[1]; + uniConv1DK3_Lo1_4x4.data[5] = conv1dK3D4_Lo1[2]; + uniConv1DK3_Lo1_4x4.data[6] = conv1dK3D4_Lo1[3]; + uniConv1DK3_Lo2_4x4.data[2] = conv1dK3D4_Lo2[0]; + uniConv1DK3_Lo2_4x4.data[3] = conv1dK3D4_Lo2[1]; + uniConv1DK3_Lo2_4x4.data[5] = conv1dK3D4_Lo2[2]; + uniConv1DK3_Lo2_4x4.data[6] = conv1dK3D4_Lo2[3]; + uniConv1DK3_Hi1_4x4.data[2] = conv1dK3D4_Hi1[0]; + uniConv1DK3_Hi1_4x4.data[3] = conv1dK3D4_Hi1[1]; + uniConv1DK3_Hi1_4x4.data[5] = conv1dK3D4_Hi1[2]; + uniConv1DK3_Hi1_4x4.data[6] = conv1dK3D4_Hi1[3]; + uniConv1DK3_Hi2_4x4.data[2] = conv1dK3D4_Hi2[0]; + uniConv1DK3_Hi2_4x4.data[3] = conv1dK3D4_Hi2[1]; + uniConv1DK3_Hi2_4x4.data[5] = conv1dK3D4_Hi2[2]; + uniConv1DK3_Hi2_4x4.data[6] = conv1dK3D4_Hi2[3]; + } + + + status = vsi_nn_kernel_gpu_add_param( node, + "uniConv1DK3_Lo0_4x4", &uniConv1DK3_Lo0_4x4 ); + status |= vsi_nn_kernel_gpu_add_param( node, + "uniConv1DK3_Hi0_4x4", &uniConv1DK3_Hi0_4x4 ); + status |= vsi_nn_kernel_gpu_add_param( node, + "uniConv1DK3_Lo1_4x4", &uniConv1DK3_Lo1_4x4 ); + status |= vsi_nn_kernel_gpu_add_param( node, + "uniConv1DK3_Lo2_4x4", &uniConv1DK3_Lo2_4x4 ); + status |= vsi_nn_kernel_gpu_add_param( node, + "uniConv1DK3_Hi1_4x4", &uniConv1DK3_Hi1_4x4 ); + status |= vsi_nn_kernel_gpu_add_param( node, + "uniConv1DK3_Hi2_4x4", &uniConv1DK3_Hi2_4x4 ); + status |= vsi_nn_kernel_gpu_add_param( node, + "uniDataConvK3_2x8", &uniDataConvK3_2x8 ); + status |= vsi_nn_kernel_gpu_add_param( node, + "uniSumOrderUchar_2x8", &uniSumOrderUchar_2x8 ); + status |= vsi_nn_kernel_gpu_add_param( node, "input_ZP", &input_ZP); + CHECK_STATUS_FAIL_GOTO(status, final ); + } + else if ( (1024 == 
weight_shape->data[0]) && (1 == stride) ) + { + gpu_dp_inst_t uniU8SubZp_lo_2x8= {{ + 0x99999999, // TCfg + 0x44444444, // ASelt + 0x03020100, 0x07060504, // ABin + 0xaaaaaaaa, // BSelt + 0x00000000, 0x00000000, // BBin + 0x00000600, // AccumType, ConstantType, and PostShift + 0x00010001, 0x00010001, 0x00010001, 0x00010001, + 0x00010001, 0x00010001, 0x00010001, 0x00010001 // Constant + }, GPU_DP_TYPE_16}; + gpu_dp_inst_t uniU8SubZp_hi_2x8= {{ + 0x99999999, // TCfg + 0x44444444, // ASelt + 0x0b0a0908, 0x0f0e0d0c, // ABin + 0xaaaaaaaa, // BSelt + 0x00000000, 0x00000000, // BBin + 0x00000600, // AccumType, ConstantType, and PostShift + 0x00010001, 0x00010001, 0x00010001, 0x00010001, + 0x00010001, 0x00010001, 0x00010001, 0x00010001 // Constant + }, GPU_DP_TYPE_16}; + gpu_dp_inst_t uniU8Conv1d_part0_8x2= {{ + 0x55555555, // TCfg + 0x00000000, // ASelt + 0x76543210, 0x87654321, // ABin + 0x55555555, // BSelt + 0x76543210, 0x76543210, // BBin + 0x00000400, // AccumType, ConstantType, and PostShift + 0x00000000, 0x00000000, 0x00000000, 0x00000000, + 0x00000000, 0x00000000, 0x00000000, 0x00000000 // Constant + }, GPU_DP_TYPE_16}; + gpu_dp_inst_t uniU8Conv1d_part1_8x2= {{ + 0x55555555, // TCfg + 0x00000000, // ASelt + 0x98765432, 0xa9876543, // ABin + 0x55555555, // BSelt + 0x76543210, 0x76543210, // BBin + 0x00000400, // AccumType, ConstantType, and PostShift + 0x00000000, 0x00000000, 0x00000000, 0x00000000, + 0x00000000, 0x00000000, 0x00000000, 0x00000000 // Constant + }, GPU_DP_TYPE_16}; + gpu_dp_inst_t uniU8Conv1d_part2_8x2= {{ + 0x55555555, // TCfg + 0x00000000, // ASelt + 0xba987654, 0xcba98765, // ABin + 0x55555555, // BSelt + 0x76543210, 0x76543210, // BBin + 0x00000400, // AccumType, ConstantType, and PostShift + 0x00000000, 0x00000000, 0x00000000, 0x00000000, + 0x00000000, 0x00000000, 0x00000000, 0x00000000 // Constant + }, GPU_DP_TYPE_16}; + gpu_dp_inst_t uniU8Conv1d_part3_8x2= {{ + 0x55555555, // TCfg + 0x00000000, // ASelt + 0xdcba9876, 0xedcba987, // ABin + 0x55555555, // BSelt + 0x76543210, 0x76543210, // BBin + 0x00000400, // AccumType, ConstantType, and PostShift + 0x00000000, 0x00000000, 0x00000000, 0x00000000, + 0x00000000, 0x00000000, 0x00000000, 0x00000000 // Constant + }, GPU_DP_TYPE_16}; + int32_t kernel_cnt_x16 = (weight_shape->data[0] + 15) / 16; + status = vsi_nn_kernel_gpu_add_param( node, + "kernel_cnt_x16", &kernel_cnt_x16 ); + status |= vsi_nn_kernel_gpu_add_param( node, + "uniU8SubZp_lo_2x8", &uniU8SubZp_lo_2x8 ); + status |= vsi_nn_kernel_gpu_add_param( node, + "uniU8SubZp_hi_2x8", &uniU8SubZp_hi_2x8 ); + status |= vsi_nn_kernel_gpu_add_param( node, + "uniU8Conv1d_part0_8x2", &uniU8Conv1d_part0_8x2 ); + status |= vsi_nn_kernel_gpu_add_param( node, + "uniU8Conv1d_part1_8x2", &uniU8Conv1d_part1_8x2 ); + status |= vsi_nn_kernel_gpu_add_param( node, + "uniU8Conv1d_part2_8x2", &uniU8Conv1d_part2_8x2 ); + status |= vsi_nn_kernel_gpu_add_param( node, + "uniU8Conv1d_part3_8x2", &uniU8Conv1d_part3_8x2 ); + status |= vsi_nn_kernel_gpu_add_param( node, + "uniSumOrderUchar_2x8", &uniSumOrderUchar_2x8 ); + if (input_width >= GPU_TENSOR_MAX_WIDTH) + { + status |= vsi_nn_kernel_gpu_add_param( node, "input_width", &input_width); + status |= vsi_nn_kernel_gpu_add_param( node, "output_width", &output_width); + } + CHECK_STATUS_FAIL_GOTO(status, final ); + } + + status = vsi_nn_kernel_gpu_add_param( node, "weight_ZP", &weight_ZP); + status |= vsi_nn_kernel_gpu_add_param( node, "output_ZP", &output_ZP); + status |= vsi_nn_kernel_gpu_add_param( node, "scaleOut", &scaleOut); + status |= 
vsi_nn_kernel_gpu_add_param( node, "input_height", &input_height); + CHECK_STATUS_FAIL_GOTO(status, final ); + } + + gpu_param.global_scale[0] = 8; + gpu_param.global_scale[1] = 1; + gpu_param.dim = 2; + gpu_param.global_size[0] = ( + (out_shape->data[0] + gpu_param.global_scale[0] - 1) + / gpu_param.global_scale[0]); + gpu_param.global_size[1] = ( + (out_shape->data[1] + gpu_param.global_scale[1] - 1) + / gpu_param.global_scale[1]); + status = vsi_nn_kernel_gpu_config( node, &gpu_param ); + +final: +#define SAFE_FREE_TENSOR_ATTR(_PTR) if( _PTR ) { vsi_nn_kernel_tensor_attr_release( &_PTR ); _PTR = NULL; } + SAFE_FREE_TENSOR_ATTR(input_attr); + + return status; +} /* _conv1d_ovxlib_initializer() */ + + + +/* + * Query kernel + */ +static vsi_status _query_kernel + ( + vsi_nn_kernel_t * kernel, + vsi_nn_tensor_t * const * const inputs, + vsi_nn_tensor_t * const * const outputs, + _internal_kernel_e kernel_type + ) +{ + vsi_status status = VSI_FAILURE; + vsi_nn_kernel_dtype_e in_dtype; + vsi_nn_kernel_dtype_e w_dtype; + vsi_nn_kernel_dtype_e b_dtype; + vsi_nn_kernel_dtype_e out_dtype; + const _kernel_map_type * kernel_map = _conv1d_ovxlib_kernel_map; + size_t kernel_map_size = _cnt_of_array( _conv1d_ovxlib_kernel_map ); + vx_param_description_t * param_def = _conv1d_ovxlib_kernel_param_def; + size_t param_def_size = _cnt_of_array( _conv1d_ovxlib_kernel_param_def ); + vx_kernel_initialize_f initializer = _conv1d_ovxlib_initializer; + uint32_t key; + uint32_t i; + + in_dtype = vsi_nn_kernel_map_dtype( inputs[0]->attr.dtype.vx_type ); + w_dtype = vsi_nn_kernel_map_dtype( inputs[1]->attr.dtype.vx_type ); + b_dtype = vsi_nn_kernel_map_dtype( inputs[2]->attr.dtype.vx_type ); + out_dtype = vsi_nn_kernel_map_dtype( outputs[0]->attr.dtype.vx_type ); + + key = CONV1D_OVXLIB_HASH_KEY( in_dtype, w_dtype, b_dtype, out_dtype, kernel_type ); + + for ( i = 0; i < (uint32_t)kernel_map_size; i ++ ) + { + if ( kernel_map[i].key == key ) + { + break; + } + } + if ( i < (uint32_t)kernel_map_size ) + { + snprintf( kernel->info.name, VX_MAX_KERNEL_NAME, "%s", kernel_map[i].function_name ); + kernel->info.parameters = param_def; + kernel->info.numParams = (uint32_t)param_def_size; + kernel->info.initialize = initializer; + // Register code source + vsi_nn_kernel_add_source( kernel, VSI_NN_GPU_SOURCE_FMT_CODE, 2, + "vsi_nn_kernel_header", + kernel_map[i].source_name ); + // Register binary source + vsi_nn_kernel_add_source( kernel, VSI_NN_GPU_SOURCE_FMT_EXECUTABLE, 1, + kernel_map[i].source_name ); + status = VSI_SUCCESS; + } + return status; +} /* _query_kernel() */ + +static vsi_nn_tensor_t* _create_new_bias_tensor + ( + vsi_nn_graph_t *graph, + vsi_nn_tensor_t *input, + vsi_nn_tensor_t *weight, + vsi_nn_tensor_t *bias + ) +{ + vsi_nn_tensor_t * new_bias = NULL; + vsi_nn_tensor_attr_t attr; + int32_t *new_bias_data_ptr = NULL; + uint8_t *weight_data = NULL; + int32_t *bias_data = NULL; + uint32_t i, j; + memset(&attr, 0, sizeof(vsi_nn_tensor_attr_t)); + weight_data = vsi_nn_ConvertTensorToData(graph, weight); + + if (bias == NULL) + { + memcpy(&attr, &weight->attr, sizeof(vsi_nn_tensor_attr_t)); + attr.dim_num = 2; + attr.size[0] = weight->attr.size[2]; + attr.size[1] = 1; + if (weight->attr.dtype.qnt_type == VSI_NN_QNT_TYPE_AFFINE_ASYMMETRIC) + { + attr.dtype.scale = input->attr.dtype.scale * weight->attr.dtype.scale; + attr.dtype.zero_point = 0; + attr.dtype.vx_type = VSI_NN_TYPE_INT32; + } + } + else + { + memcpy(&attr, &bias->attr, sizeof(vsi_nn_tensor_attr_t)); + if (attr.dim_num == 1) + { + attr.size[1] = 1; 
+ attr.dim_num = 2; + } + bias_data = (int32_t *)vsi_nn_ConvertTensorToData(graph, bias); + } + + new_bias_data_ptr = (int32_t *)malloc(attr.size[0] * sizeof(int32_t)); + memset((void *)new_bias_data_ptr, 0, sizeof(int32_t) * attr.size[0]); + + if (input->attr.dtype.zero_point != 0) + { + for (i = 0; i < weight->attr.size[2]; i++) + { + uint8_t *weight_ptr = weight_data + i * weight->attr.size[0] * weight->attr.size[1]; + for (j = 0; j < weight->attr.size[0] * weight->attr.size[1]; j++) + { + new_bias_data_ptr[i] += -((int32_t)weight_ptr[j] - weight->attr.dtype.zero_point) \ + * input->attr.dtype.zero_point; + } + } + } + + if (bias_data != NULL) + { + for (i = 0; i < attr.size[0]; i++) + { + new_bias_data_ptr[i] += bias_data[i]; + } + } + + new_bias = vsi_nn_CreateTensorFromData(graph, (uint8_t *)new_bias_data_ptr, &attr); + + vsi_nn_safe_free( new_bias_data_ptr ); + vsi_nn_safe_free( bias_data ); + vsi_nn_safe_free( weight_data ); + + return new_bias; +} + + +static vsi_nn_kernel_node_t _setup + ( + vsi_nn_graph_t * graph, + vsi_nn_tensor_t ** inputs, + size_t input_num, + vsi_nn_tensor_t ** outputs, + size_t output_num, + const vsi_nn_kernel_param_t * params, + vsi_nn_kernel_t * kernel + ) +{ + vsi_status status = VSI_FAILURE; + vsi_nn_kernel_node_param_t node_params[_CONV1D_OVXLIB_PARAM_NUM]; + vsi_nn_kernel_node_t node = NULL; + int32_t j = 0; + _internal_kernel_e kernel_type = NORMAL; + + int32_t stride = vsi_nn_kernel_param_get_int32( params, "stride" ); + int32_t pad_front = vsi_nn_kernel_param_get_int32( params, "pad_front" ); + int32_t pad_end = vsi_nn_kernel_param_get_int32( params, "pad_end" ); + int32_t dilation = vsi_nn_kernel_param_get_int32( params, "dilation" ); + int32_t overflow_policy = vsi_nn_kernel_param_get_int32( params, "overflow_policy" ); + vsi_nn_tensor_t *in_tensors[3] = {NULL}; + vsi_nn_tensor_t *new_bias = NULL; + + if (VX_CONVERT_POLICY_SATURATE == overflow_policy) + { + overflow_policy = 1; + } + else + { + overflow_policy = 0; + } + + if ( 1 == stride ) + { + if ( 3 == inputs[1]->attr.size[0] ) + { + if (2 == dilation || 4 == dilation) + { + kernel_type = K3_S1_D2_D4; + } + else + { + kernel_type = K3_S1; + } + } + else if ( 1024 == inputs[1]->attr.size[0] ) + { + if (inputs[0]->attr.size[0] < 65535) + { + kernel_type = K1024_SMALL; + } + else if (0 == pad_front && 0 == pad_end) + { + kernel_type = K1024_LARGE; + } + else + { + return NULL; + } + } + else + { + return NULL; + } + } + + if (1024 == inputs[1]->attr.size[0]) + { + new_bias = _create_new_bias_tensor(graph, inputs[0], inputs[1], inputs[2]); + in_tensors[0] = inputs[0]; + in_tensors[1] = inputs[1]; + in_tensors[2] = new_bias; + } + else + { + in_tensors[0] = inputs[0]; + in_tensors[1] = inputs[1]; + in_tensors[2] = inputs[2]; + } + + status = _query_kernel( kernel, inputs, outputs, kernel_type ); + if ( VSI_SUCCESS == status) + { + node = vsi_nn_kernel_create_node( graph, kernel ); + if ( node ) + { + if( pad_front != 0 || pad_end != 0) + { + // Set default border mode. 
+ vx_border_t border; + border.mode = VX_BORDER_CONSTANT; + border.constant_value.U8 = (uint8_t)(inputs[0]->attr.dtype.zero_point); + status |= vxSetNodeAttribute( (vx_node)node, VX_NODE_BORDER, &border, sizeof(border) ); + } + + /* Set inputs and outputs */ + vsi_nn_kernel_node_pack_io( node_params, _CONV1D_OVXLIB_PARAM_NUM, + in_tensors, input_num, outputs, output_num ); + j = (int32_t)(input_num + output_num); + node_params[j++] = vsi_nn_kernel_scalar_create(graph, I32, &stride ); + node_params[j++] = vsi_nn_kernel_scalar_create(graph, I32, &pad_front ); + node_params[j++] = vsi_nn_kernel_scalar_create(graph, I32, &pad_end ); + node_params[j++] = vsi_nn_kernel_scalar_create(graph, I32, &dilation ); + node_params[j++] = vsi_nn_kernel_scalar_create(graph, I32, &overflow_policy ); + /* Pass parameters to node. */ + status = vsi_nn_kernel_node_pass_param( node, node_params, _CONV1D_OVXLIB_PARAM_NUM ); + vsi_nn_kernel_scalar_release( &node_params[--j] ); + vsi_nn_kernel_scalar_release( &node_params[--j] ); + vsi_nn_kernel_scalar_release( &node_params[--j] ); + vsi_nn_kernel_scalar_release( &node_params[--j] ); + vsi_nn_kernel_scalar_release( &node_params[--j] ); + } + } + + if (new_bias) + { + vsi_nn_ReleaseTensor(&new_bias); + } + + return node; +} /* _setup() */ + +__END_DECLS + +REGISTER_BACKEND_EVIS( conv1d_ovxlib, _setup ) + diff --git a/src/tim/vx/internal/src/kernel/evis/depth2space_internal_evis.c b/src/tim/vx/internal/src/kernel/evis/depth2space_internal_evis.c index 9a57aee..7d3dc68 100644 --- a/src/tim/vx/internal/src/kernel/evis/depth2space_internal_evis.c +++ b/src/tim/vx/internal/src/kernel/evis/depth2space_internal_evis.c @@ -42,28 +42,44 @@ __BEGIN_DECLS /* * Define kernel meta. */ -#define VX_KERNEL_NAME_DEPTH2SPACE_CRD_U8TOU8 CVIVANTE_NAMESPACE("evis.depth2space_crd_U8toU8") -#define VX_KERNEL_NAME_DEPTH2SPACE_CRD_I8TOI8 CVIVANTE_NAMESPACE("evis.depth2space_crd_I8toI8") -#define VX_KERNEL_NAME_DEPTH2SPACE_CRD_I16TOI16 CVIVANTE_NAMESPACE("evis.depth2space_crd_I16toI16") -#define VX_KERNEL_NAME_DEPTH2SPACE_CRD_F16TOF16 CVIVANTE_NAMESPACE("evis.depth2space_crd_F16toF16") -#define VX_KERNEL_NAME_DEPTH2SPACE_CRD_I8TOF16 CVIVANTE_NAMESPACE("evis.depth2space_crd_I8toF16") -#define VX_KERNEL_NAME_DEPTH2SPACE_CRD_I16TOF16 CVIVANTE_NAMESPACE("evis.depth2space_crd_I16toF16") -#define VX_KERNEL_NAME_DEPTH2SPACE_CRD_F16TOI8 CVIVANTE_NAMESPACE("evis.depth2space_crd_F16toI8") -#define VX_KERNEL_NAME_DEPTH2SPACE_CRD_F16TOI16 CVIVANTE_NAMESPACE("evis.depth2space_crd_F16toI16") -#define VX_KERNEL_NAME_DEPTH2SPACE_CRD_U8TOF16 CVIVANTE_NAMESPACE("evis.depth2space_crd_U8toF16") -#define VX_KERNEL_NAME_DEPTH2SPACE_CRD_F16TOU8 CVIVANTE_NAMESPACE("evis.depth2space_crd_F16toU8") +#define VX_KERNEL_NAME_DEPTH2SPACE_CRD_U8TOU8 CVIVANTE_NAMESPACE("evis.depth2space_crd_U8toU8") +#define VX_KERNEL_NAME_DEPTH2SPACE_CRD_I8TOI8 CVIVANTE_NAMESPACE("evis.depth2space_crd_I8toI8") +#define VX_KERNEL_NAME_DEPTH2SPACE_CRD_I16TOI16 CVIVANTE_NAMESPACE("evis.depth2space_crd_I16toI16") +#define VX_KERNEL_NAME_DEPTH2SPACE_CRD_F16TOF16 CVIVANTE_NAMESPACE("evis.depth2space_crd_F16toF16") +#define VX_KERNEL_NAME_DEPTH2SPACE_CRD_I8TOF16 CVIVANTE_NAMESPACE("evis.depth2space_crd_I8toF16") +#define VX_KERNEL_NAME_DEPTH2SPACE_CRD_I16TOF16 CVIVANTE_NAMESPACE("evis.depth2space_crd_I16toF16") +#define VX_KERNEL_NAME_DEPTH2SPACE_CRD_F16TOI8 CVIVANTE_NAMESPACE("evis.depth2space_crd_F16toI8") +#define VX_KERNEL_NAME_DEPTH2SPACE_CRD_F16TOI16 CVIVANTE_NAMESPACE("evis.depth2space_crd_F16toI16") +#define 
VX_KERNEL_NAME_DEPTH2SPACE_CRD_U8TOF16 CVIVANTE_NAMESPACE("evis.depth2space_crd_U8toF16") +#define VX_KERNEL_NAME_DEPTH2SPACE_CRD_F16TOU8 CVIVANTE_NAMESPACE("evis.depth2space_crd_F16toU8") + +#define VX_KERNEL_NAME_DEPTH2SPACE_CRD_U8TOU8_BLK2 CVIVANTE_NAMESPACE("evis.depth2space_crd_U8toU8_blk2") +#define VX_KERNEL_NAME_DEPTH2SPACE_CRD_I8TOI8_BLK2 CVIVANTE_NAMESPACE("evis.depth2space_crd_I8toI8_blk2") +#define VX_KERNEL_NAME_DEPTH2SPACE_CRD_I16TOI16_BLK2 CVIVANTE_NAMESPACE("evis.depth2space_crd_I16toI16_blk2") +#define VX_KERNEL_NAME_DEPTH2SPACE_CRD_F16TOF16_BLK2 CVIVANTE_NAMESPACE("evis.depth2space_crd_F16toF16_blk2") +#define VX_KERNEL_NAME_DEPTH2SPACE_CRD_I8TOF16_BLK2 CVIVANTE_NAMESPACE("evis.depth2space_crd_I8toF16_blk2") +#define VX_KERNEL_NAME_DEPTH2SPACE_CRD_I16TOF16_BLK2 CVIVANTE_NAMESPACE("evis.depth2space_crd_I16toF16_blk2") +#define VX_KERNEL_NAME_DEPTH2SPACE_CRD_F16TOI8_BLK2 CVIVANTE_NAMESPACE("evis.depth2space_crd_F16toI8_blk2") +#define VX_KERNEL_NAME_DEPTH2SPACE_CRD_F16TOI16_BLK2 CVIVANTE_NAMESPACE("evis.depth2space_crd_F16toI16_blk2") +#define VX_KERNEL_NAME_DEPTH2SPACE_CRD_U8TOF16_BLK2 CVIVANTE_NAMESPACE("evis.depth2space_crd_U8toF16_blk2") +#define VX_KERNEL_NAME_DEPTH2SPACE_CRD_F16TOU8_BLK2 CVIVANTE_NAMESPACE("evis.depth2space_crd_F16toU8_blk2") #define KERNEL_SOURCE_1 "depth2space_crd" // Add kernel hashtable here -#define HASH_DEPTH2SPACE_CRD_KEY(_input0_type, _output_type, _quant_type) \ - ((_input0_type << 24) | (_output_type << 16) | (_quant_type << 8)) +#define HASH_DEPTH2SPACE_CRD_KEY(_input0_type, _output_type, _blk_size) \ + ((_input0_type << 24) | (_output_type << 16) | (_blk_size << 8)) #define TENSOR_DEPTH2SPACE_CRD_KERNELS(IN0_TYPE, OUT_TYPE, SOURCE) \ { HASH_DEPTH2SPACE_CRD_KEY(IN0_TYPE, OUT_TYPE, 0), \ VX_KERNEL_NAME_DEPTH2SPACE_CRD_##IN0_TYPE##TO##OUT_TYPE, \ SOURCE }, +#define TENSOR_DEPTH2SPACE_CRD_BLK2_KERNELS(IN0_TYPE, OUT_TYPE, SOURCE) \ + { HASH_DEPTH2SPACE_CRD_KEY(IN0_TYPE, OUT_TYPE, 1), \ + VX_KERNEL_NAME_DEPTH2SPACE_CRD_##IN0_TYPE##TO##OUT_TYPE##_BLK2, \ + SOURCE }, + static const struct { uint32_t key; char* function_name; @@ -80,6 +96,17 @@ static const struct { TENSOR_DEPTH2SPACE_CRD_KERNELS(F16, I16, KERNEL_SOURCE_1) TENSOR_DEPTH2SPACE_CRD_KERNELS(U8, F16, KERNEL_SOURCE_1) TENSOR_DEPTH2SPACE_CRD_KERNELS(F16, U8, KERNEL_SOURCE_1) + + TENSOR_DEPTH2SPACE_CRD_BLK2_KERNELS(U8, U8, KERNEL_SOURCE_1) + TENSOR_DEPTH2SPACE_CRD_BLK2_KERNELS(I8, I8, KERNEL_SOURCE_1) + TENSOR_DEPTH2SPACE_CRD_BLK2_KERNELS(I16, I16, KERNEL_SOURCE_1) + TENSOR_DEPTH2SPACE_CRD_BLK2_KERNELS(F16, F16, KERNEL_SOURCE_1) + TENSOR_DEPTH2SPACE_CRD_BLK2_KERNELS(I8, F16, KERNEL_SOURCE_1) + TENSOR_DEPTH2SPACE_CRD_BLK2_KERNELS(I16, F16, KERNEL_SOURCE_1) + TENSOR_DEPTH2SPACE_CRD_BLK2_KERNELS(F16, I8, KERNEL_SOURCE_1) + TENSOR_DEPTH2SPACE_CRD_BLK2_KERNELS(F16, I16, KERNEL_SOURCE_1) + TENSOR_DEPTH2SPACE_CRD_BLK2_KERNELS(U8, F16, KERNEL_SOURCE_1) + TENSOR_DEPTH2SPACE_CRD_BLK2_KERNELS(F16, U8, KERNEL_SOURCE_1) }; /* @@ -118,9 +145,10 @@ DEF_KERNEL_INITIALIZER(_depth2space_crd_initializer) int32_t output_height = 0; int32_t output_chn = 0; int32_t src0ZP = 0; - float src0Scale = 0; + float src0Scale = 1.0f; int32_t dstZP = 0; - float dstScale = 0; + float dstScale = 1.0f; + int32_t block_size = 0; uint32_t pack_key = 0; @@ -128,12 +156,15 @@ DEF_KERNEL_INITIALIZER(_depth2space_crd_initializer) CHECK_PTR_FAIL_GOTO( attr[0], "Create tensor attr buffer fail.", OnError ); attr[1] = vsi_nn_kernel_tensor_attr_create( (vsi_nn_kernel_tensor_t)param[1] ); CHECK_PTR_FAIL_GOTO( attr[1], "Create tensor 
attr buffer fail.", OnError ); + status = vsi_nn_kernel_scalar_read_int32((vsi_nn_kernel_scalar_t)param[2], &block_size); + CHECK_STATUS_FAIL_GOTO(status, OnError ); - src0ZP = attr[0]->asymm.zero_point; - src0Scale = attr[0]->asymm.scale; - dstZP = attr[1]->asymm.zero_point; - dstScale = attr[1]->asymm.scale; - if( attr[0]->quant == VSI_NN_KERNEL_QUANT_DFP ) + if ( attr[0]->quant == VSI_NN_KERNEL_QUANT_ASYMM ) + { + src0ZP = attr[0]->asymm.zero_point; + src0Scale = attr[0]->asymm.scale; + } + else if ( attr[0]->quant == VSI_NN_KERNEL_QUANT_DFP ) { if (attr[0]->dfp.fl > 0) { @@ -143,27 +174,35 @@ DEF_KERNEL_INITIALIZER(_depth2space_crd_initializer) { src0Scale = ((float) ((int64_t)1 << -attr[0]->dfp.fl)); } + src0ZP = 0; } - else if( attr[0]->quant == VSI_NN_KERNEL_QUANT_NONE ) + else if ( attr[0]->quant == VSI_NN_KERNEL_QUANT_NONE ) { src0Scale = 1; + src0ZP = 0; } - if( attr[1]->quant == VSI_NN_KERNEL_QUANT_DFP ) + if ( attr[1]->quant == VSI_NN_KERNEL_QUANT_ASYMM ) + { + dstZP = attr[1]->asymm.zero_point; + dstScale = attr[1]->asymm.scale; + } + else if ( attr[1]->quant == VSI_NN_KERNEL_QUANT_DFP ) { if (attr[1]->dfp.fl > 0) { - dstScale = (float)((int64_t)1 << attr[1]->dfp.fl); + dstScale = (1.0f / (float)((int64_t)1 << attr[1]->dfp.fl)); } else { - dstScale = (1.0f / (float)((int64_t)1 << -attr[1]->dfp.fl)); + dstScale = (float)((int64_t)1 << -attr[1]->dfp.fl); } - dstScale = 1.0f/dstScale; + dstZP = 0; } - else if( attr[1]->quant == VSI_NN_KERNEL_QUANT_NONE ) + else if ( attr[1]->quant == VSI_NN_KERNEL_QUANT_NONE ) { dstScale = 1; + dstZP = 0; } output_dims = (uint32_t)attr[1]->shape->size; @@ -179,6 +218,17 @@ DEF_KERNEL_INITIALIZER(_depth2space_crd_initializer) shaderParam.global_size[1] = output_height; shaderParam.global_size[2] = output_chn; + if (block_size == 2) + { + shaderParam.global_scale[0] = 16; + shaderParam.global_scale[1] = 1; + shaderParam.global_scale[2] = 1; + shaderParam.global_size[0] = gpu_align_p2((output_width + shaderParam.global_scale[0] - 1) + / shaderParam.global_scale[0], 4); + shaderParam.global_size[1] = output_height; + shaderParam.global_size[2] = output_chn; + } + status = vsi_nn_kernel_gpu_config( node, &shaderParam ); CHECK_STATUS_FAIL_GOTO(status, OnError); @@ -202,6 +252,43 @@ DEF_KERNEL_INITIALIZER(_depth2space_crd_initializer) 0x00000000, 0x00000000, 0x00000000, 0x00000000 // Constant }, GPU_DP_TYPE_16 }; + gpu_dp_inst_t uniU8MulAndPostShift_ExLo_2x8 = {{ + 0xdddddddd, // TCfg + 0x44444444, // ASelt + 0x19111810, 0x1b131a12, // ABin + 0x11111111, // BSelt + 0x00000000, 0x00000000, // BBin + 0x00005600, // AccumType, ConstantType, and PostShift + 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000 // Constant + }, GPU_DP_TYPE_16 }; + gpu_dp_inst_t uniU8MulAndPostShift_ExHi_2x8 = {{ + 0xdddddddd, // TCfg + 0x44444444, // ASelt + 0x1d151c14, 0x1f171e16, // ABin + 0x11111111, // BSelt + 0x00000000, 0x00000000, // BBin + 0x00002600, // AccumType, ConstantType, and PostShift + 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000 // Constant + }, GPU_DP_TYPE_16 }; + gpu_dp_inst_t uniDepth2SpaceF16Blk2_lo_2x8 = {{ + 0x11111111, // TCfg + 0x10101010, // ASelt + 0x01010000, 0x03030202, // ABin + 0x22222222, // BSelt + 0x00000000, 0x00000000, // BBin + 0x00000600, // AccumType, ConstantType, and PostShift + 0x00000001, 0x00000001, 0x00000001, 0x00000001, 0x00000001, 0x00000001, 0x00000001, 0x00000001 // Constant + }, GPU_DP_TYPE_16 }; + gpu_dp_inst_t 
uniDepth2SpaceF16Blk2_hi_2x8 = {{ + 0x11111111, // TCfg + 0x10101010, // ASelt + 0x05050404, 0x07070606, // ABin + 0x22222222, // BSelt + 0x00000000, 0x00000000, // BBin + 0x00000600, // AccumType, ConstantType, and PostShift + 0x00000001, 0x00000001, 0x00000001, 0x00000001, 0x00000001, 0x00000001, 0x00000001, 0x00000001 // Constant + }, GPU_DP_TYPE_16 }; + switch( pack_key ) { case _PACK_SELECT_KEY( U8, F16): @@ -213,14 +300,25 @@ DEF_KERNEL_INITIALIZER(_depth2space_crd_initializer) case _PACK_SELECT_KEY( U8, U8): case _PACK_SELECT_KEY( I8, I8): case _PACK_SELECT_KEY( I16, I16): + case _PACK_SELECT_KEY( F16, F16): { gpu_quantize_multiplier_16bit( (double)src0Scale / dstScale, &M0, &postShift); multAndoutZP0[0] = (uint32_t)(M0); multAndoutZP0[1] = (uint32_t)((dstZP << postShift) - src0ZP * M0); gpu_dp_inst_update_postshfit( &uniU8MulAndPostShift_0_Lo_2x8, postShift ); + gpu_dp_inst_update_postshfit( &uniU8MulAndPostShift_ExLo_2x8, postShift ); + gpu_dp_inst_update_postshfit( &uniU8MulAndPostShift_ExHi_2x8, postShift ); status = vsi_nn_kernel_gpu_add_param( node, "uniU8MulAndPostShift_0_Lo_2x8", &uniU8MulAndPostShift_0_Lo_2x8 ); + status |= vsi_nn_kernel_gpu_add_param( node, + "uniU8MulAndPostShift_ExLo_2x8", &uniU8MulAndPostShift_ExLo_2x8 ); + status |= vsi_nn_kernel_gpu_add_param( node, + "uniU8MulAndPostShift_ExHi_2x8", &uniU8MulAndPostShift_ExHi_2x8 ); + status |= vsi_nn_kernel_gpu_add_param( node, + "uniDepth2SpaceF16Blk2_lo_2x8", &uniDepth2SpaceF16Blk2_lo_2x8 ); + status |= vsi_nn_kernel_gpu_add_param( node, + "uniDepth2SpaceF16Blk2_hi_2x8", &uniDepth2SpaceF16Blk2_hi_2x8 ); status |= vsi_nn_kernel_gpu_add_param( node, "multAndoutZP0", &multAndoutZP0 ); CHECK_STATUS_FAIL_GOTO(status, OnError ); } @@ -256,7 +354,8 @@ static vsi_status _query_kernel vsi_nn_tensor_t* const* const inputs, vsi_nn_tensor_t* const* const outputs, vsi_nn_kernel_t* kernel, - const vsi_nn_kernel_param_t * params + const vsi_nn_kernel_param_t * params, + int32_t blk_flg ) { vsi_status status = VSI_FAILURE; @@ -268,16 +367,16 @@ static vsi_status _query_kernel input0_dtype = vsi_nn_kernel_map_dtype( inputs[0]->attr.dtype.vx_type ); output_dtype = vsi_nn_kernel_map_dtype( outputs[0]->attr.dtype.vx_type ); - key = HASH_DEPTH2SPACE_CRD_KEY( input0_dtype, output_dtype, 0 ); + key = HASH_DEPTH2SPACE_CRD_KEY( input0_dtype, output_dtype, blk_flg ); for( i = 0; i < _cnt_of_array(depth2space_crd_map); i ++ ) { - if( depth2space_crd_map[i].key == key ) + if ( depth2space_crd_map[i].key == key ) { break; } } - if( i < _cnt_of_array(depth2space_crd_map) ) + if ( i < _cnt_of_array(depth2space_crd_map) ) { snprintf( kernel->info.name, VX_MAX_KERNEL_NAME, "%s", depth2space_crd_map[i].function_name ); kernel->info.parameters = _depth2space_crd_kernel_param_def; @@ -310,18 +409,19 @@ static vsi_nn_kernel_node_t _setup vsi_nn_kernel_node_param_t tmp_params[_DEPTH2SPACE_CRD_PARAM_NUM] = { NULL }; vsi_nn_kernel_node_t node = NULL; int32_t block_size = vsi_nn_kernel_param_get_int32( params, "block_size" ); + int32_t blk_flg = block_size == 2 ? 
1 : 0; - if( !vsi_nn_kernel_gpu_check_shape( (int32_t*)outputs[0]->attr.size, + if ( !vsi_nn_kernel_gpu_check_shape( (int32_t*)outputs[0]->attr.size, outputs[0]->attr.dim_num ) ) { return NULL; } - status = _query_kernel( inputs, outputs, kernel, params ); - if( VSI_SUCCESS == status) + status = _query_kernel( inputs, outputs, kernel, params, blk_flg); + if ( VSI_SUCCESS == status) { node = vsi_nn_kernel_create_node( graph, kernel ); - if( node ) + if ( node ) { vsi_nn_kernel_node_pack_io( tmp_params, _DEPTH2SPACE_CRD_PARAM_NUM, inputs, 1, outputs, 1 ); tmp_params[2] = vsi_nn_kernel_scalar_create( graph, I32, &block_size ); diff --git a/src/tim/vx/internal/src/kernel/evis/depthwise_conv1d_evis.c b/src/tim/vx/internal/src/kernel/evis/depthwise_conv1d_evis.c index 2c7c4f6..32b57b4 100644 --- a/src/tim/vx/internal/src/kernel/evis/depthwise_conv1d_evis.c +++ b/src/tim/vx/internal/src/kernel/evis/depthwise_conv1d_evis.c @@ -717,12 +717,13 @@ static vsi_nn_kernel_node_t _setup int32_t pad_front = vsi_nn_kernel_param_get_int32( params, "pad_front" ); int32_t pad_end = vsi_nn_kernel_param_get_int32( params, "pad_end" ); int32_t dilation = vsi_nn_kernel_param_get_int32( params, "dilation" ); + int32_t batch = inputs[0]->attr.size[2]; _internal_kernel_size_e ks = KN; - if (!((VSI_NN_TYPE_UINT8 == inputs[0]->attr.dtype.vx_type) + if ( (!((VSI_NN_TYPE_UINT8 == inputs[0]->attr.dtype.vx_type) && (VSI_NN_TYPE_UINT8 == inputs[1]->attr.dtype.vx_type) && (NULL == inputs[2] || VSI_NN_TYPE_INT32 == inputs[2]->attr.dtype.vx_type) - && (VSI_NN_TYPE_UINT8 == outputs[0]->attr.dtype.vx_type))) + && (VSI_NN_TYPE_UINT8 == outputs[0]->attr.dtype.vx_type))) || batch > 1) { return NULL; } @@ -769,18 +770,27 @@ static vsi_nn_kernel_node_t _setup status = _query_kernel( kernel, temp_tensor, outputs, dilation, ks); - if( VSI_SUCCESS == status) + if ( VSI_SUCCESS == status) { node = vsi_nn_kernel_create_node( graph, kernel ); - if( node ) + if ( node ) { - if( pad_front != 0 && pad_end != 0) + if ( pad_front != 0 && pad_end != 0) { // Set default border mode. 
vx_border_t border; border.mode = VX_BORDER_CONSTANT; - border.constant_value.U8 = 0; - border.constant_value.U16 = 0; + if (VSI_NN_TYPE_UINT8 == inputs[0]->attr.dtype.vx_type && + VSI_NN_QNT_TYPE_AFFINE_ASYMMETRIC == inputs[0]->attr.dtype.qnt_type) + { + border.constant_value.U8 = (uint8_t)inputs[0]->attr.dtype.zero_point; + } + else + { + border.constant_value.U8 = 0; + border.constant_value.U16 = 0; + } + status |= vxSetNodeAttribute( (vx_node)node, VX_NODE_BORDER, &border, sizeof(border) ); } diff --git a/src/tim/vx/internal/src/kernel/evis/eltwise_unary_evis.c b/src/tim/vx/internal/src/kernel/evis/eltwise_unary_evis.c index e6831f7..fd07a58 100644 --- a/src/tim/vx/internal/src/kernel/evis/eltwise_unary_evis.c +++ b/src/tim/vx/internal/src/kernel/evis/eltwise_unary_evis.c @@ -48,6 +48,7 @@ typedef enum UNARY_NEG, UNARY_HSIGMOID, UNARY_MISH, + UNARY_ROUND, } unary_type_e; /* @@ -82,6 +83,7 @@ typedef enum #define NEG_OPERATION neg #define HSIGMOID_OPERATION hard_sigmoid #define MISH_OPERATION mish +#define ROUND_OPERATION round static const struct { uint32_t key; @@ -248,6 +250,30 @@ static const struct { TENSOR_UNARY_KERNELS_2D(MISH_OPERATION, UNARY_MISH, I8, I8 , KERNEL_SOURCE_2D) TENSOR_UNARY_KERNELS_2D(MISH_OPERATION, UNARY_MISH, I8, F16 , KERNEL_SOURCE_2D) TENSOR_UNARY_KERNELS_2D(MISH_OPERATION, UNARY_MISH, BF16, BF16 , KERNEL_SOURCE_2D) + + TENSOR_UNARY_KERNELS(ROUND_OPERATION, UNARY_ROUND, F16, F16 , KERNEL_SOURCE_3D) + TENSOR_UNARY_KERNELS(ROUND_OPERATION, UNARY_ROUND, F16, I16 , KERNEL_SOURCE_3D) + TENSOR_UNARY_KERNELS(ROUND_OPERATION, UNARY_ROUND, F16, U8 , KERNEL_SOURCE_3D) + TENSOR_UNARY_KERNELS(ROUND_OPERATION, UNARY_ROUND, F16, I8 , KERNEL_SOURCE_3D) + TENSOR_UNARY_KERNELS(ROUND_OPERATION, UNARY_ROUND, I16, I16 , KERNEL_SOURCE_3D) + TENSOR_UNARY_KERNELS(ROUND_OPERATION, UNARY_ROUND, I16, F16 , KERNEL_SOURCE_3D) + TENSOR_UNARY_KERNELS(ROUND_OPERATION, UNARY_ROUND, U8, U8 , KERNEL_SOURCE_3D) + TENSOR_UNARY_KERNELS(ROUND_OPERATION, UNARY_ROUND, U8, F16 , KERNEL_SOURCE_3D) + TENSOR_UNARY_KERNELS(ROUND_OPERATION, UNARY_ROUND, I8, I8 , KERNEL_SOURCE_3D) + TENSOR_UNARY_KERNELS(ROUND_OPERATION, UNARY_ROUND, I8, F16 , KERNEL_SOURCE_3D) + TENSOR_UNARY_KERNELS(ROUND_OPERATION, UNARY_ROUND, BF16, BF16 , KERNEL_SOURCE_3D) + + TENSOR_UNARY_KERNELS_2D(ROUND_OPERATION, UNARY_ROUND, F16, F16 , KERNEL_SOURCE_2D) + TENSOR_UNARY_KERNELS_2D(ROUND_OPERATION, UNARY_ROUND, F16, I16 , KERNEL_SOURCE_2D) + TENSOR_UNARY_KERNELS_2D(ROUND_OPERATION, UNARY_ROUND, F16, U8 , KERNEL_SOURCE_2D) + TENSOR_UNARY_KERNELS_2D(ROUND_OPERATION, UNARY_ROUND, F16, I8 , KERNEL_SOURCE_2D) + TENSOR_UNARY_KERNELS_2D(ROUND_OPERATION, UNARY_ROUND, I16, I16 , KERNEL_SOURCE_2D) + TENSOR_UNARY_KERNELS_2D(ROUND_OPERATION, UNARY_ROUND, I16, F16 , KERNEL_SOURCE_2D) + TENSOR_UNARY_KERNELS_2D(ROUND_OPERATION, UNARY_ROUND, U8, U8 , KERNEL_SOURCE_2D) + TENSOR_UNARY_KERNELS_2D(ROUND_OPERATION, UNARY_ROUND, U8, F16 , KERNEL_SOURCE_2D) + TENSOR_UNARY_KERNELS_2D(ROUND_OPERATION, UNARY_ROUND, I8, I8 , KERNEL_SOURCE_2D) + TENSOR_UNARY_KERNELS_2D(ROUND_OPERATION, UNARY_ROUND, I8, F16 , KERNEL_SOURCE_2D) + TENSOR_UNARY_KERNELS_2D(ROUND_OPERATION, UNARY_ROUND, BF16, BF16 , KERNEL_SOURCE_2D) }; #undef SIN_OPERATION @@ -257,6 +283,7 @@ static const struct { #undef NEG_OPERATION #undef HSIGMOID_OPERATION #undef MISH_OPERATION +#undef ROUND_OPERATION /* * Kernel params @@ -375,6 +402,7 @@ DEF_KERNEL_INITIALIZER(_eltwise_unary_initializer) case _PACK_SELECT_KEY( UNARY_NEG, BF16, BF16 ): case _PACK_SELECT_KEY( UNARY_HSIGMOID, BF16, BF16 ): case 
_PACK_SELECT_KEY( UNARY_MISH, BF16, BF16 ): + case _PACK_SELECT_KEY( UNARY_ROUND, BF16, BF16 ): { gpu_dp_inst_t uniConvBF16toF32_Part0_2x8 = {{ 0x11111111, // TCfg @@ -653,6 +681,7 @@ REGISTER_ELTWISE_UNARY_BACKEND_EVIS( elu, UNARY_ELU ) REGISTER_ELTWISE_UNARY_BACKEND_EVIS( neg, UNARY_NEG ) REGISTER_ELTWISE_UNARY_BACKEND_EVIS( hard_sigmoid, UNARY_HSIGMOID ) REGISTER_ELTWISE_UNARY_BACKEND_EVIS( mish, UNARY_MISH ) +REGISTER_ELTWISE_UNARY_BACKEND_EVIS( round, UNARY_ROUND ) __END_DECLS
diff --git a/src/tim/vx/internal/src/kernel/evis/erf_evis.c b/src/tim/vx/internal/src/kernel/evis/erf_evis.c
new file mode 100644
index 0000000..7753349
--- /dev/null
+++ b/src/tim/vx/internal/src/kernel/evis/erf_evis.c
@@ -0,0 +1,428 @@
+/****************************************************************************
+*
+* Copyright (c) 2020 Vivante Corporation
+*
+* Permission is hereby granted, free of charge, to any person obtaining a
+* copy of this software and associated documentation files (the "Software"),
+* to deal in the Software without restriction, including without limitation
+* the rights to use, copy, modify, merge, publish, distribute, sublicense,
+* and/or sell copies of the Software, and to permit persons to whom the
+* Software is furnished to do so, subject to the following conditions:
+*
+* The above copyright notice and this permission notice shall be included in
+* all copies or substantial portions of the Software.
+*
+* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+* AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
+* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
+* DEALINGS IN THE SOFTWARE.
+*
+*****************************************************************************/
+
+
+#include <stdint.h>
+#include <stdlib.h>
+#include <string.h>
+#include "vsi_nn_types.h"
+#include "vsi_nn_tensor.h"
+#include "vsi_nn_graph.h"
+#include "vsi_nn_log.h"
+#include "vsi_nn_error.h"
+#include "vsi_nn_prv.h"
+#include "vsi_nn_tensor_util.h"
+#include "utils/vsi_nn_util.h"
+#include "kernel/vsi_nn_kernel.h"
+#include "kernel/vsi_nn_kernel_gpu_shape_optimize.h"
+
+__BEGIN_DECLS
+
+/*
+ * Define kernel meta.
+ */ +#define HASH_UNARY_KEY(_input_type, _output_type, _image_2d) \ + ( (_input_type << 12) | (_output_type << 4) | (_image_2d)) + +#define KERNEL_SOURCE "erf", + +#define HASH_UNARY_SH_KERNEL_NAME( SRC_TYPE, DST_TYPE) \ + CVIVANTE_NAMESPACE("evis.erf_"#SRC_TYPE"to"#DST_TYPE) + +#define TENSOR_UNARY_KERNELS(SRC_TYPE, OUT_TYPE) \ + { HASH_UNARY_KEY(SRC_TYPE, OUT_TYPE, 0), \ + HASH_UNARY_SH_KERNEL_NAME(SRC_TYPE, OUT_TYPE), \ + KERNEL_SOURCE }, + +#define HASH_UNARY_SH_KERNEL_2D_NAME(SRC_TYPE, DST_TYPE) \ + CVIVANTE_NAMESPACE("evis.erf_"#SRC_TYPE"to"#DST_TYPE"_2D") + +#define TENSOR_UNARY_KERNELS_2D(SRC_TYPE, OUT_TYPE) \ + { HASH_UNARY_KEY(SRC_TYPE, OUT_TYPE, 1), \ + HASH_UNARY_SH_KERNEL_2D_NAME(SRC_TYPE, OUT_TYPE), \ + KERNEL_SOURCE }, + + +typedef struct +{ + uint32_t key; + char * function_name; + const char * source_name; +} _kernel_map_type; + +static const _kernel_map_type _erf_kernel_map[] = +{ + // Register kernel here + TENSOR_UNARY_KERNELS(F16, F16 ) + TENSOR_UNARY_KERNELS(F16, I16 ) + TENSOR_UNARY_KERNELS(F16, U8 ) + TENSOR_UNARY_KERNELS(F16, I8 ) + TENSOR_UNARY_KERNELS(I16, I16 ) + TENSOR_UNARY_KERNELS(I16, F16 ) + TENSOR_UNARY_KERNELS(U8, U8 ) + TENSOR_UNARY_KERNELS(U8, F16 ) + TENSOR_UNARY_KERNELS(I8, I8 ) + TENSOR_UNARY_KERNELS(I8, F16 ) + TENSOR_UNARY_KERNELS(BF16, BF16) + + TENSOR_UNARY_KERNELS_2D(F16, F16 ) + TENSOR_UNARY_KERNELS_2D(F16, I16 ) + TENSOR_UNARY_KERNELS_2D(F16, U8 ) + TENSOR_UNARY_KERNELS_2D(F16, I8 ) + TENSOR_UNARY_KERNELS_2D(I16, I16 ) + TENSOR_UNARY_KERNELS_2D(I16, F16 ) + TENSOR_UNARY_KERNELS_2D(U8, U8 ) + TENSOR_UNARY_KERNELS_2D(U8, F16 ) + TENSOR_UNARY_KERNELS_2D(I8, I8 ) + TENSOR_UNARY_KERNELS_2D(I8, F16 ) + TENSOR_UNARY_KERNELS_2D(BF16, BF16) +}; + + +/* + * Kernel params + */ +static vx_param_description_t _erf_kernel_param_def[] = +{ + {VX_INPUT, VX_TYPE_TENSOR, VX_PARAMETER_STATE_REQUIRED}, + {VX_OUTPUT, VX_TYPE_TENSOR, VX_PARAMETER_STATE_REQUIRED}, +}; +#define _ERF_PARAM_NUM _cnt_of_array( _erf_kernel_param_def ) + +/* + * Kernel initializer + */ +DEF_KERNEL_INITIALIZER(_erf_initializer) + ( + vsi_nn_kernel_node_t node, + const vsi_nn_kernel_node_param_t * param, + size_t param_size + ) +{ + vsi_status status = VSI_FAILURE; + gpu_param_t gpu_param = { + 3, + {0, 0, 0}, + {0, 0, 0}, + {0, 0, 0}, + {0, 0, 0} + }; + vsi_nn_kernel_tensor_attr_t * attr[2] = { NULL, NULL }; + vsi_int_array_t * out_shape = NULL; + float inputScale = 1.0f; + float inputTail = 0; + float outputScale = 1.0f; + float outputZP = 0; + uint32_t pack_key; + + attr[0] = vsi_nn_kernel_tensor_attr_create( (vsi_nn_kernel_tensor_t)param[0] ); + CHECK_PTR_FAIL_GOTO( attr[0], "Create tensor attr buffer fail.", final ); + attr[1] = vsi_nn_kernel_tensor_attr_create( (vsi_nn_kernel_tensor_t)param[1] ); + CHECK_PTR_FAIL_GOTO( attr[1], "Create tensor attr buffer fail.", final ); + + out_shape = attr[1]->shape; + + if ( attr[0]->quant == VSI_NN_KERNEL_QUANT_DFP ) + { + int32_t fl = attr[0]->dfp.fl; + if (fl > 0) + { + inputScale = 1.0f / (float) ((int64_t)1 << fl); + } + else + { + inputScale = (float)((int64_t)1 << -fl); + } + } + else if ( attr[0]->quant == VSI_NN_KERNEL_QUANT_ASYMM ) + { + inputScale = attr[0]->asymm.scale; + inputTail = 0 - attr[0]->asymm.zero_point * inputScale; + } + + if ( attr[1]->quant == VSI_NN_KERNEL_QUANT_DFP ) + { + int32_t fl = attr[1]->dfp.fl; + if (fl > 0) + { + outputScale = (float)((int64_t)1 << fl); + } + else + { + outputScale = (float)1.0f / (float) ((int64_t)1 << -fl); + } + } + else if ( attr[1]->quant == VSI_NN_KERNEL_QUANT_ASYMM ) + { + outputScale = 
(float)1.0f / attr[1]->asymm.scale; + outputZP = (float)attr[1]->asymm.zero_point; + } + +#define _PACK_SELECT_KEY( IN_TYPE, OUT_TYPE ) \ + ( ( IN_TYPE << 16) | ( OUT_TYPE << 8)) + + pack_key = _PACK_SELECT_KEY( attr[0]->dtype, attr[1]->dtype ); + + gpu_param.global_scale[0] = 4; + gpu_param.global_scale[1] = 1; + gpu_param.global_scale[2] = 1; + + gpu_param.global_size[0] = gpu_align_p2( + (out_shape->data[0] + gpu_param.global_scale[0] - 1) + / gpu_param.global_scale[0], 4); + gpu_param.global_size[1] = ( + (out_shape->data[1] + gpu_param.global_scale[1] - 1) + / gpu_param.global_scale[1]); + gpu_param.global_size[2] = out_shape->size > 2 ? out_shape->data[2] : 1; + + switch ( pack_key ) + { + case _PACK_SELECT_KEY( BF16, BF16 ): + { + gpu_dp_inst_t uniConvBF16toF32_Part0_2x8 = {{ + 0x11111111, // TCfg + 0x01010101, // ASelt + 0x01050004, 0x03070206, // ABin + 0x22222222, // BSelt + 0x00000000, 0x00000000, // BBin + 0x00000600, // AccumType, ConstantType, and PostShift + 0x00000001, 0x00000001, 0x00000001, 0x00000001, + 0x00000001, 0x00000001, 0x00000001, 0x00000001 // Constant + }, GPU_DP_TYPE_16 }; + gpu_dp_inst_t uniExtractOddData_2x8 = {{ + 0x11111111, // TCfg + 0x11110000, // ASelt + 0x07050301, 0x07050301, // ABin + 0x22222222, // BSelt + 0x00000000, 0x00000000, // BBin + 0x00000600, // AccumType, ConstantType, and PostShift + 0x00000001, 0x00000001, 0x00000001, 0x00000001, + 0x00000001, 0x00000001, 0x00000001, 0x00000001 // Constant + }, GPU_DP_TYPE_16 }; + + status = vsi_nn_kernel_gpu_add_param( node, + "uniConvBF16toF32_Part0_2x8", &uniConvBF16toF32_Part0_2x8 ); + status |= vsi_nn_kernel_gpu_add_param( node, + "uniExtractOddData_2x8", &uniExtractOddData_2x8 ); + CHECK_STATUS_FAIL_GOTO(status, final ); + } + break; + default: + { + gpu_dp_inst_t uniExtractHalf8_2x8 = {{ + 0x11111111, // TCfg + 0x11110000, // ASelt + 0x06040200, 0x06040200, // ABin + 0x22222222, // BSelt + 0x00000000, 0x00000000, // BBin + 0x00000100, // AccumType, ConstantType, and PostShift + 0x00003c00, 0x00003c00, 0x00003c00, 0x00003c00, + 0x00003c00, 0x00003c00, 0x00003c00, 0x00003c00 // Constant + }, GPU_DP_TYPE_16 }; + gpu_dp_inst_t uniExtractInteger_2x8 = {{ + 0x33333333, // TCfg + 0x11110000, // ASelt + 0x03020100, 0x03020100, // ABin + 0x00000000, // BSelt + 0x00000000, 0x00000000, // BBin + 0x00002400, // AccumType, ConstantType, and PostShift + 0x00000000, 0x00000000, 0x00000000, 0x00000000, + 0x00000000, 0x00000000, 0x00000000, 0x00000000 // Constant + }, GPU_DP_TYPE_16 }; + gpu_dp_inst_t uniDatatoFp32Part0_4x4 = {{ + 0x01010101, // TCfg + 0x00000000, // ASelt + 0x00010000, 0x00030002, // ABin + 0x02020202, // BSelt + 0x00000000, 0x00000000, // BBin + 0x00000100, // AccumType, ConstantType, and PostShift + 0x00003c00, 0x00000000, 0x00003c00, 0x00000000, + 0x00003c00, 0x00000000, 0x00003c00, 0x00000000 // Constant + }, GPU_DP_TYPE_16 }; + + status = vsi_nn_kernel_gpu_add_param( node, + "uniDatatoFp32Part0_4x4", &uniDatatoFp32Part0_4x4 ); + status |= vsi_nn_kernel_gpu_add_param( node, + "inputScale", &inputScale ); + status |= vsi_nn_kernel_gpu_add_param( node, + "inputTail", &inputTail ); + status |= vsi_nn_kernel_gpu_add_param( node, + "outputScale", &outputScale ); + status |= vsi_nn_kernel_gpu_add_param( node, + "outputZP", &outputZP ); + + if (attr[1]->dtype == F16) + { + status |= vsi_nn_kernel_gpu_add_param( node, + "uniExtract8Data_2x8", &uniExtractHalf8_2x8 ); + } + else + { + status |= vsi_nn_kernel_gpu_add_param( node, + "uniExtract8Data_2x8", &uniExtractInteger_2x8 ); + } + 
CHECK_STATUS_FAIL_GOTO(status, final ); + } + break; + } + +#undef _PACK_SELECT_KEY + + status = vsi_nn_kernel_gpu_config( node, &gpu_param ); + +final: +#define SAFE_FREE_TENSOR_ATTR(_PTR) if( _PTR ) { vsi_nn_kernel_tensor_attr_release( &_PTR ); _PTR = NULL; } + SAFE_FREE_TENSOR_ATTR(attr[0]); + SAFE_FREE_TENSOR_ATTR(attr[1]); +#undef SAFE_FREE_TENSOR_ATTR + + return status; +} /* _erf_initializer() */ + + +/* + * Query kernel + */ +static vsi_status _query_kernel + ( + vsi_nn_kernel_t * kernel, + vsi_nn_tensor_t * const * const inputs, + vsi_nn_tensor_t * const * const outputs, + vsi_bool image_2d + ) +{ + vsi_status status = VSI_FAILURE; + vsi_nn_kernel_dtype_e in_dtype; + vsi_nn_kernel_dtype_e out_dtype; + const _kernel_map_type * kernel_map = _erf_kernel_map; + size_t kernel_map_size = _cnt_of_array( _erf_kernel_map ); + vx_param_description_t * param_def = _erf_kernel_param_def; + vx_kernel_initialize_f initializer = _erf_initializer; + + uint32_t key; + uint32_t i; + + in_dtype = vsi_nn_kernel_map_dtype( inputs[0]->attr.dtype.vx_type ); + out_dtype = vsi_nn_kernel_map_dtype( outputs[0]->attr.dtype.vx_type ); + + key = HASH_UNARY_KEY( in_dtype, out_dtype, image_2d ); + + for ( i = 0; i < (uint32_t)kernel_map_size; i ++ ) + { + if ( kernel_map[i].key == key ) + { + break; + } + } + + if ( i < (uint32_t)kernel_map_size ) + { + snprintf( kernel->info.name, VX_MAX_KERNEL_NAME, "%s", kernel_map[i].function_name ); + kernel->info.parameters = param_def; + kernel->info.numParams = _cnt_of_array( _erf_kernel_param_def ); + kernel->info.initialize = initializer; + // Register code source + vsi_nn_kernel_add_source( kernel, VSI_NN_GPU_SOURCE_FMT_CODE, 2, + "vsi_nn_kernel_header", + kernel_map[i].source_name ); + // Register binary source + vsi_nn_kernel_add_source( kernel, VSI_NN_GPU_SOURCE_FMT_EXECUTABLE, 1, + kernel_map[i].source_name ); + status = VSI_SUCCESS; + } + + return status; +} /* _query_kernel() */ + + +static vsi_nn_kernel_node_t _setup + ( + vsi_nn_graph_t * graph, + vsi_nn_tensor_t ** inputs, + size_t input_num, + vsi_nn_tensor_t ** outputs, + size_t output_num, + const vsi_nn_kernel_param_t * params, + vsi_nn_kernel_t * kernel + ) +{ + vsi_status status = VSI_FAILURE; + vsi_nn_kernel_node_param_t node_params[_ERF_PARAM_NUM] = {NULL}; + vsi_nn_kernel_node_t node = NULL; + vsi_nn_tensor_t* rs_tensors[2] = { NULL }; + int32_t shape[VSI_NN_MAX_DIM_NUM] = { 0 }; + int32_t new_rank = 0; + vsi_bool image_2d = FALSE; + vsi_bool ret = FALSE; + + ret = vsi_nn_kernel_optimize_element_shape( + (int32_t *)inputs[0]->attr.size, inputs[0]->attr.dim_num, + shape, &new_rank ); + if ( ret ) + { + rs_tensors[0] = vsi_nn_reshape_tensor( graph, + inputs[0], (uint32_t*)shape, new_rank ); + rs_tensors[1] = vsi_nn_reshape_tensor( graph, + outputs[0], (uint32_t*)shape, new_rank ); + } + + if ( !vsi_nn_kernel_gpu_check_shape( (int32_t*)rs_tensors[0]->attr.size, + rs_tensors[0]->attr.dim_num ) ) + { + goto OnError; + } + + image_2d = (rs_tensors[0]->attr.dim_num == 2 || rs_tensors[0]->attr.size[2] == 1); + + status = _query_kernel( kernel, inputs, outputs, image_2d ); + if ( VSI_SUCCESS == status) + { + node = vsi_nn_kernel_create_node( graph, kernel ); + if ( node ) + { + /* Set inputs and outputs */ + vsi_nn_kernel_node_pack_io( node_params, _ERF_PARAM_NUM, + rs_tensors, 1, &rs_tensors[1], 1 ); + /* Pass parameters to node. 
*/ + status = vsi_nn_kernel_node_pass_param( node, node_params, _ERF_PARAM_NUM ); + } + } + +OnError: + if (rs_tensors[0]) + { + vsi_nn_ReleaseTensor( &rs_tensors[0] ); + } + + if (rs_tensors[1]) + { + vsi_nn_ReleaseTensor( &rs_tensors[1] ); + } + + return node; +} /* _setup() */ + +__END_DECLS + +REGISTER_BACKEND_EVIS( erf, _setup ) diff --git a/src/tim/vx/internal/src/kernel/evis/gather_evis.c b/src/tim/vx/internal/src/kernel/evis/gather_evis.c index 2ef5977..ae35694 100644 --- a/src/tim/vx/internal/src/kernel/evis/gather_evis.c +++ b/src/tim/vx/internal/src/kernel/evis/gather_evis.c @@ -64,39 +64,60 @@ __BEGIN_DECLS #define VX_KERNEL_NAME_GATHER_AXIS0_U8TOF16 CVIVANTE_NAMESPACE("evis.gather_U8toF16_axis0") #define VX_KERNEL_NAME_GATHER_AXIS0_F16TOU8 CVIVANTE_NAMESPACE("evis.gather_F16toU8_axis0") +#define VX_KERNEL_NAME_GATHER_ARRAY_U8TOU8 CVIVANTE_NAMESPACE("evis.gather_U8toU8_array") +#define VX_KERNEL_NAME_GATHER_ARRAY_I8TOI8 CVIVANTE_NAMESPACE("evis.gather_I8toI8_array") +#define VX_KERNEL_NAME_GATHER_ARRAY_I16TOI16 CVIVANTE_NAMESPACE("evis.gather_I16toI16_array") +#define VX_KERNEL_NAME_GATHER_ARRAY_F16TOF16 CVIVANTE_NAMESPACE("evis.gather_F16toF16_array") + +#define VX_KERNEL_NAME_GATHER_AXIS0_ARRAY_U8TOU8 CVIVANTE_NAMESPACE("evis.gather_U8toU8_axis0_array") +#define VX_KERNEL_NAME_GATHER_AXIS0_ARRAY_I8TOI8 CVIVANTE_NAMESPACE("evis.gather_I8toI8_axis0_array") +#define VX_KERNEL_NAME_GATHER_AXIS0_ARRAY_I16TOI16 CVIVANTE_NAMESPACE("evis.gather_I16toI16_axis0_array") +#define VX_KERNEL_NAME_GATHER_AXIS0_ARRAY_F16TOF16 CVIVANTE_NAMESPACE("evis.gather_F16toF16_axis0_array") + #define KERNEL_SOURCE_1 "gather" #define KERNEL_SOURCE_2 "gather_mix" +#define KERNEL_SOURCE_3 "gather_array" // Add kernel hashtable here -#define HASH_GATHER_KEY(_input0_type, _input1_type, _output_type, _is_axis0) \ - ((_input0_type << 24) | (_input1_type << 16) | (_output_type << 8) | (_is_axis0)) +#define HASH_GATHER_KEY(_input0_type, _input1_type, _output_type, _is_axis0, _is_max) \ + ((_input0_type << 24) | (_input1_type << 16) | (_output_type << 8) | (_is_axis0 << 4) | (_is_max)) #define TENSOR_GATHER_KERNELS(IN0_TYPE, IN1TYPE, OUT_TYPE, SOURCE) \ - { HASH_GATHER_KEY(IN0_TYPE, IN1TYPE, OUT_TYPE, 0), \ + { HASH_GATHER_KEY(IN0_TYPE, IN1TYPE, OUT_TYPE, 0, 0), \ VX_KERNEL_NAME_GATHER_##IN0_TYPE##TO##OUT_TYPE, \ SOURCE }, #define TENSOR_GATHER_AXIS0_KERNELS(IN0_TYPE, IN1TYPE, OUT_TYPE, SOURCE) \ - { HASH_GATHER_KEY(IN0_TYPE, IN1TYPE, OUT_TYPE, 1), \ + { HASH_GATHER_KEY(IN0_TYPE, IN1TYPE, OUT_TYPE, 1, 0), \ VX_KERNEL_NAME_GATHER_AXIS0_##IN0_TYPE##TO##OUT_TYPE, \ SOURCE }, +#define TENSOR_GATHER_ARRAY_KERNELS(IN0_TYPE, IN1TYPE, OUT_TYPE, SOURCE) \ + { HASH_GATHER_KEY(IN0_TYPE, IN1TYPE, OUT_TYPE, 0, 1), \ + VX_KERNEL_NAME_GATHER_ARRAY_##IN0_TYPE##TO##OUT_TYPE, \ + SOURCE }, + +#define TENSOR_GATHER_AXIS0_ARRAY_KERNELS(IN0_TYPE, IN1TYPE, OUT_TYPE, SOURCE) \ + { HASH_GATHER_KEY(IN0_TYPE, IN1TYPE, OUT_TYPE, 1, 1), \ + VX_KERNEL_NAME_GATHER_AXIS0_ARRAY_##IN0_TYPE##TO##OUT_TYPE, \ + SOURCE }, + static const struct { uint32_t key; char* function_name; const char* source_name; } gather_map[] = { - TENSOR_GATHER_KERNELS(U8, I32, U8, KERNEL_SOURCE_1) - TENSOR_GATHER_KERNELS(I8, I32, I8, KERNEL_SOURCE_1) - TENSOR_GATHER_KERNELS(I16, I32, I16, KERNEL_SOURCE_1) - TENSOR_GATHER_KERNELS(F16, I32, F16, KERNEL_SOURCE_1) - TENSOR_GATHER_KERNELS(I8, I32, F16, KERNEL_SOURCE_2) - TENSOR_GATHER_KERNELS(I16, I32, F16, KERNEL_SOURCE_2) - TENSOR_GATHER_KERNELS(F16, I32, I8, KERNEL_SOURCE_2) - TENSOR_GATHER_KERNELS(F16, I32, I16, 
KERNEL_SOURCE_2) - TENSOR_GATHER_KERNELS(U8, I32, F16, KERNEL_SOURCE_2) - TENSOR_GATHER_KERNELS(F16, I32, U8, KERNEL_SOURCE_2) + TENSOR_GATHER_KERNELS(U8, I32, U8, KERNEL_SOURCE_1) + TENSOR_GATHER_KERNELS(I8, I32, I8, KERNEL_SOURCE_1) + TENSOR_GATHER_KERNELS(I16, I32, I16, KERNEL_SOURCE_1) + TENSOR_GATHER_KERNELS(F16, I32, F16, KERNEL_SOURCE_1) + TENSOR_GATHER_KERNELS(I8, I32, F16, KERNEL_SOURCE_2) + TENSOR_GATHER_KERNELS(I16, I32, F16, KERNEL_SOURCE_2) + TENSOR_GATHER_KERNELS(F16, I32, I8, KERNEL_SOURCE_2) + TENSOR_GATHER_KERNELS(F16, I32, I16, KERNEL_SOURCE_2) + TENSOR_GATHER_KERNELS(U8, I32, F16, KERNEL_SOURCE_2) + TENSOR_GATHER_KERNELS(F16, I32, U8, KERNEL_SOURCE_2) TENSOR_GATHER_AXIS0_KERNELS(U8, I32, U8, KERNEL_SOURCE_1) TENSOR_GATHER_AXIS0_KERNELS(I8, I32, I8, KERNEL_SOURCE_1) TENSOR_GATHER_AXIS0_KERNELS(I16, I32, I16, KERNEL_SOURCE_1) @@ -107,6 +128,14 @@ static const struct { TENSOR_GATHER_AXIS0_KERNELS(F16, I32, I16, KERNEL_SOURCE_2) TENSOR_GATHER_AXIS0_KERNELS(U8, I32, F16, KERNEL_SOURCE_2) TENSOR_GATHER_AXIS0_KERNELS(F16, I32, U8, KERNEL_SOURCE_2) + TENSOR_GATHER_ARRAY_KERNELS(U8, I32, U8, KERNEL_SOURCE_3) + TENSOR_GATHER_ARRAY_KERNELS(I8, I32, I8, KERNEL_SOURCE_3) + TENSOR_GATHER_ARRAY_KERNELS(I16, I32, I16, KERNEL_SOURCE_3) + TENSOR_GATHER_ARRAY_KERNELS(F16, I32, F16, KERNEL_SOURCE_3) + TENSOR_GATHER_AXIS0_ARRAY_KERNELS(U8, I32, U8, KERNEL_SOURCE_3) + TENSOR_GATHER_AXIS0_ARRAY_KERNELS(I8, I32, I8, KERNEL_SOURCE_3) + TENSOR_GATHER_AXIS0_ARRAY_KERNELS(I16, I32, I16, KERNEL_SOURCE_3) + TENSOR_GATHER_AXIS0_ARRAY_KERNELS(F16, I32, F16, KERNEL_SOURCE_3) }; /* @@ -129,7 +158,8 @@ static vsi_status get_gather_tensor_reshape_size vsi_nn_tensor_t ** inputs, int32_t sizes[VSI_NN_MAX_DIM_NUM], uint32_t block_size, - uint32_t idxFlg + uint32_t idxFlg, + int32_t* arrayFlg ) { vsi_status status = VSI_FAILURE; @@ -157,12 +187,13 @@ static vsi_status get_gather_tensor_reshape_size } else { - if ((elementCnt / block_size) < VSI_NN_MAX_IMAGE_WIDTH) + sizes[0] = block_size; + sizes[1] = elementCnt / block_size; + if ((elementCnt / block_size) > VSI_NN_MAX_IMAGE_WIDTH) { - sizes[0] = block_size; - sizes[1] = elementCnt / block_size; - status = VSI_SUCCESS; + arrayFlg[0] = 1; } + status = VSI_SUCCESS; } #undef VSI_NN_MAX_IMAGE_WIDTH @@ -535,10 +566,11 @@ DEF_KERNEL_INITIALIZER(_gather_axis0_initializer) CHECK_STATUS_FAIL_GOTO(status, OnError ); } break; - case _PACK_SELECT_KEY( I16, I16): - case _PACK_SELECT_KEY( I8, I8): - case _PACK_SELECT_KEY( U8, U8): - case _PACK_SELECT_KEY( F16, F16): + case _PACK_SELECT_KEY( I16, I16): + case _PACK_SELECT_KEY( I8, I8): + case _PACK_SELECT_KEY( U8, U8): + case _PACK_SELECT_KEY( F16, F16): + case _PACK_SELECT_KEY( BF16, BF16): { status = vsi_nn_kernel_gpu_add_param( node, "uniExtraCopyDpKeepinEvis_2x8", &uniExtraCopyDpKeepinEvis_2x8 ); @@ -583,7 +615,8 @@ static vsi_status _query_kernel vsi_nn_tensor_t* const* const outputs, vsi_nn_kernel_t* kernel, const vsi_nn_kernel_param_t * params, - int32_t axis + int32_t axis, + int32_t is_array ) { vsi_status status = VSI_FAILURE; @@ -595,7 +628,16 @@ static vsi_status _query_kernel input0_dtype = vsi_nn_kernel_map_dtype( inputs[0]->attr.dtype.vx_type ); output_dtype = vsi_nn_kernel_map_dtype( outputs[0]->attr.dtype.vx_type ); - key = HASH_GATHER_KEY( input0_dtype, I32, output_dtype, axis ); + if (input0_dtype == BF16) + { + input0_dtype = F16; + } + if (output_dtype == BF16) + { + output_dtype = F16; + } + + key = HASH_GATHER_KEY( input0_dtype, I32, output_dtype, axis, is_array); for( i = 0; i < 
_cnt_of_array(gather_map); i ++ ) { @@ -640,6 +682,7 @@ static vsi_nn_kernel_node_t _setup vsi_nn_kernel_t * kernel ) { +#define VSI_NN_MAX_BLOCK_SIZE (65536) vsi_status status = VSI_FAILURE; vsi_nn_kernel_node_param_t tmp_params[_GATHER_PARAM_NUM] = { NULL }; vsi_nn_kernel_node_t node = NULL; @@ -649,21 +692,23 @@ static vsi_nn_kernel_node_t _setup int32_t axis_num = vsi_nn_kernel_param_get_int32( params, "axis_num" ); int32_t axis = vsi_nn_kernel_param_get_int32( params, "axis" ); int32_t axis0_flg = 0; + int32_t is_array = block_size > VSI_NN_MAX_BLOCK_SIZE ? 1 : 0; if (axis == 0) { - status = get_gather_tensor_reshape_size(&inputs[0], shapes[0], axis_num, 0); - status |= get_gather_tensor_reshape_size(&inputs[1], shapes[1], 1, 1); - status |= get_gather_tensor_reshape_size(&outputs[0], shapes[2], shapes[1][0], 0); + status = get_gather_tensor_reshape_size(&inputs[0], shapes[0], axis_num, 0, &is_array); + status |= get_gather_tensor_reshape_size(&inputs[1], shapes[1], 1, 1, &is_array); + status |= get_gather_tensor_reshape_size(&outputs[0], shapes[2], shapes[1][0], 0, &is_array); axis0_flg = 1; } else { - status = get_gather_tensor_reshape_size(&inputs[0], shapes[0], block_size, 0); - status |= get_gather_tensor_reshape_size(&inputs[1], shapes[1], 1, 1); - status |= get_gather_tensor_reshape_size(&outputs[0], shapes[2], block_size, 0); + status = get_gather_tensor_reshape_size(&inputs[0], shapes[0], block_size, 0, &is_array); + status |= get_gather_tensor_reshape_size(&inputs[1], shapes[1], 1, 1, &is_array); + status |= get_gather_tensor_reshape_size(&outputs[0], shapes[2], block_size, 0, &is_array); axis0_flg = 0; } +#undef VSI_NN_MAX_BLOCK_SIZE if (status != VSI_SUCCESS) { return NULL; @@ -675,7 +720,7 @@ static vsi_nn_kernel_node_t _setup return NULL; } - status = _query_kernel( inputs, outputs, kernel, params, axis0_flg); + status = _query_kernel( inputs, outputs, kernel, params, axis0_flg, is_array); if ( VSI_SUCCESS == status) { node = vsi_nn_kernel_create_node( graph, kernel ); diff --git a/src/tim/vx/internal/src/kernel/evis/gather_nd_evis.c b/src/tim/vx/internal/src/kernel/evis/gather_nd_evis.c index c38b90e..8595e5a 100644 --- a/src/tim/vx/internal/src/kernel/evis/gather_nd_evis.c +++ b/src/tim/vx/internal/src/kernel/evis/gather_nd_evis.c @@ -387,6 +387,15 @@ static vsi_status _query_kernel input0_dtype = vsi_nn_kernel_map_dtype( inputs[0]->attr.dtype.vx_type ); output_dtype = vsi_nn_kernel_map_dtype( outputs[0]->attr.dtype.vx_type ); + if (input0_dtype == BF16) + { + input0_dtype = F16; + } + if (output_dtype == BF16) + { + output_dtype = F16; + } + if(coord_dim == 1) { coord_type = _1D; diff --git a/src/tim/vx/internal/src/kernel/evis/group_normalization_evis.c b/src/tim/vx/internal/src/kernel/evis/group_normalization_evis.c new file mode 100644 index 0000000..89f0c4c --- /dev/null +++ b/src/tim/vx/internal/src/kernel/evis/group_normalization_evis.c @@ -0,0 +1,1219 @@ +/**************************************************************************** +* +* Copyright (c) 2019 Vivante Corporation +* +* Permission is hereby granted, free of charge, to any person obtaining a +* copy of this software and associated documentation files (the "Software"), +* to deal in the Software without restriction, including without limitation +* the rights to use, copy, modify, merge, publish, distribute, sublicense, +* and/or sell copies of the Software, and to permit persons to whom the +* Software is furnished to do so, subject to the following conditions: +* +* The above copyright notice and 
this permission notice shall be included in +* all copies or substantial portions of the Software. +* +* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +* AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING +* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER +* DEALINGS IN THE SOFTWARE. +* +*****************************************************************************/ + + +#include +#include +#include +#include "vsi_nn_types.h" +#include "vsi_nn_tensor.h" +#include "vsi_nn_graph.h" +#include "vsi_nn_log.h" +#include "vsi_nn_prv.h" +#include "vsi_nn_tensor_util.h" +#include "vsi_nn_error.h" +#include "utils/vsi_nn_util.h" +#include "kernel/vsi_nn_kernel.h" +#include "libnnext/vx_lib_nnext.h" +#include "kernel/vsi_nn_kernel_gpu_shape_optimize.h" + +__BEGIN_DECLS + +/* + * Define kernel meta. + */ + +typedef enum +{ + INTERNAL_KERNEL_SUM_SQR, + INTERNAL_KERNEL_MEAN_VARI, + INTERNAL_KERNEL_NORM, +} _internal_kernel_e; + +#define KERNEL_SOURCE_1 "group_normalization_i8" +#define KERNEL_SOURCE_2 "group_normalization_u8" +#define KERNEL_SOURCE_3 "group_normalization_i16" +#define KERNEL_SOURCE_4 "group_normalization_f16" +#define KERNEL_SOURCE_5 "group_normalization_u8_f16" + +#define HASH_GROUPNORM_SUM_SQR_SH_KERNEL_NAME(SRC0_TYPE) \ + CVIVANTE_NAMESPACE("evis.group_norm_sumsqr_"#SRC0_TYPE) + +#define HASH_GROUPNORM_SUM_SQR_SH_KERNEL_2D_NAME(SRC0_TYPE) \ + CVIVANTE_NAMESPACE("evis.group_norm_sumsqr_"#SRC0_TYPE"_2D") + +#define HASH_GROUPNORM_MEAN_VARI_SH_KERNEL_NAME \ + CVIVANTE_NAMESPACE("evis.group_norm_meanvari") + +#define HASH_GROUPNORM_SH_KERNEL_NAME(SRC0_TYPE, DST_TYPE) \ + CVIVANTE_NAMESPACE("evis.group_norm_"#SRC0_TYPE"to"#DST_TYPE) + +#define HASH_GROUPNORM_SH_KERNEL_2D_NAME(SRC0_TYPE, DST_TYPE) \ + CVIVANTE_NAMESPACE("evis.group_norm_"#SRC0_TYPE"to"#DST_TYPE"_2D") + +// Add kernel hashtable here +// Sum Sqr +#define HASH_GROUPNORM_SUM_SQR_KEY(_input0_type, _output_type, _reshape_flag) \ + ((_input0_type << 24) | (_output_type << 16) | (_reshape_flag << 8)) + +#define TENSOR_GROUPNORM_SUM_SQR_KERNELS(IN0_TYPE, OUT_TYPE, SOURCE) \ + { HASH_GROUPNORM_SUM_SQR_KEY(IN0_TYPE, OUT_TYPE, 0), \ + HASH_GROUPNORM_SUM_SQR_SH_KERNEL_NAME(IN0_TYPE), \ + SOURCE }, + +#define TENSOR_GROUPNORM_SUM_SQR_KERNELS_2D(IN0_TYPE, OUT_TYPE, SOURCE) \ + { HASH_GROUPNORM_SUM_SQR_KEY(IN0_TYPE, OUT_TYPE, 1), \ + HASH_GROUPNORM_SUM_SQR_SH_KERNEL_2D_NAME(IN0_TYPE), \ + SOURCE }, + +#define HASH_GROUPNORM_MEAN_VARI_KEY(_input0_type, _output_type) \ + ((_input0_type << 24) | (_output_type << 16)) + +#define TENSOR_GROUPNORM_MEAN_VARI_KERNELS(SOURCE) \ + { HASH_GROUPNORM_MEAN_VARI_KEY(F32, F32), \ + HASH_GROUPNORM_MEAN_VARI_SH_KERNEL_NAME, \ + SOURCE }, + +// normalization +#define HASH_GROUPNORM_KEY(_input0_type, _output_type, _reshape_flag) \ + ((_input0_type << 24) | (_output_type << 16) | (_reshape_flag << 8)) + +#define TENSOR_GROUPNORM_KERNELS(IN0_TYPE, OUT_TYPE, SOURCE) \ + { HASH_GROUPNORM_KEY(IN0_TYPE, OUT_TYPE, 0), \ + HASH_GROUPNORM_SH_KERNEL_NAME(IN0_TYPE, OUT_TYPE), \ + SOURCE }, + +#define TENSOR_GROUPNORM_KERNELS_2D(IN0_TYPE, OUT_TYPE, SOURCE) \ + { HASH_GROUPNORM_KEY(IN0_TYPE, OUT_TYPE, 1), \ + HASH_GROUPNORM_SH_KERNEL_2D_NAME(IN0_TYPE, OUT_TYPE), \ + SOURCE }, + +typedef struct +{ + uint32_t key; + char * 
function_name; + const char * source_name; +} _kernel_map_type; + +static const _kernel_map_type _groupnorm_sum_sqr_kernel_map[] = +{ + // Register kernel here + TENSOR_GROUPNORM_SUM_SQR_KERNELS( I8, F32, KERNEL_SOURCE_1 ) + TENSOR_GROUPNORM_SUM_SQR_KERNELS_2D( I8, F32, KERNEL_SOURCE_1 ) + TENSOR_GROUPNORM_SUM_SQR_KERNELS( U8, F32, KERNEL_SOURCE_2 ) + TENSOR_GROUPNORM_SUM_SQR_KERNELS_2D( U8, F32, KERNEL_SOURCE_2 ) + TENSOR_GROUPNORM_SUM_SQR_KERNELS( I16, F32, KERNEL_SOURCE_3 ) + TENSOR_GROUPNORM_SUM_SQR_KERNELS_2D( I16, F32, KERNEL_SOURCE_3 ) + TENSOR_GROUPNORM_SUM_SQR_KERNELS( F16, F32, KERNEL_SOURCE_4 ) + TENSOR_GROUPNORM_SUM_SQR_KERNELS_2D( F16, F32, KERNEL_SOURCE_4 ) +}; + +static const _kernel_map_type _groupnorm_mean_vari_kernel_map[] = +{ + // Register kernel here + TENSOR_GROUPNORM_MEAN_VARI_KERNELS( KERNEL_SOURCE_2 ) +}; + +static const _kernel_map_type _groupnorm_kernel_map[] = +{ + // Register kernel here + TENSOR_GROUPNORM_KERNELS( I8, I8, KERNEL_SOURCE_1 ) + TENSOR_GROUPNORM_KERNELS_2D( I8, I8, KERNEL_SOURCE_1 ) + TENSOR_GROUPNORM_KERNELS( I8, F16, KERNEL_SOURCE_1 ) + TENSOR_GROUPNORM_KERNELS_2D( I8, F16, KERNEL_SOURCE_1 ) + + TENSOR_GROUPNORM_KERNELS( U8, U8, KERNEL_SOURCE_2 ) + TENSOR_GROUPNORM_KERNELS_2D( U8, U8, KERNEL_SOURCE_2 ) + TENSOR_GROUPNORM_KERNELS( U8, F16, KERNEL_SOURCE_5 ) + TENSOR_GROUPNORM_KERNELS_2D( U8, F16, KERNEL_SOURCE_5 ) + + TENSOR_GROUPNORM_KERNELS( I16, I16, KERNEL_SOURCE_3 ) + TENSOR_GROUPNORM_KERNELS_2D( I16, I16, KERNEL_SOURCE_3 ) + TENSOR_GROUPNORM_KERNELS( I16, F16, KERNEL_SOURCE_3 ) + TENSOR_GROUPNORM_KERNELS_2D( I16, F16, KERNEL_SOURCE_3 ) + + TENSOR_GROUPNORM_KERNELS( F16, F16, KERNEL_SOURCE_4 ) + TENSOR_GROUPNORM_KERNELS_2D( F16, F16, KERNEL_SOURCE_4 ) + TENSOR_GROUPNORM_KERNELS( F16, U8, KERNEL_SOURCE_4 ) + TENSOR_GROUPNORM_KERNELS_2D( F16, U8, KERNEL_SOURCE_4 ) +}; + +/* + * Kernel params + */ +static vx_param_description_t _groupnorm_sum_sqr_kernel_param_def[] = +{ + {VX_INPUT, VX_TYPE_TENSOR, VX_PARAMETER_STATE_REQUIRED}, + {VX_OUTPUT, VX_TYPE_TENSOR, VX_PARAMETER_STATE_REQUIRED}, + {VX_INPUT, VX_TYPE_SCALAR, VX_PARAMETER_STATE_REQUIRED}, + {VX_INPUT, VX_TYPE_SCALAR, VX_PARAMETER_STATE_REQUIRED}, + // Add kererl parameters here +}; +#define _GROUPNORM_SUM_SQR_PARAM_NUM _cnt_of_array( _groupnorm_sum_sqr_kernel_param_def ) + +static vx_param_description_t _groupnorm_mean_vari_kernel_param_def[] = +{ + {VX_INPUT, VX_TYPE_TENSOR, VX_PARAMETER_STATE_REQUIRED}, + {VX_OUTPUT, VX_TYPE_TENSOR, VX_PARAMETER_STATE_REQUIRED}, + {VX_INPUT, VX_TYPE_SCALAR, VX_PARAMETER_STATE_REQUIRED}, + {VX_INPUT, VX_TYPE_SCALAR, VX_PARAMETER_STATE_REQUIRED}, + // Add kererl parameters here +}; +#define _GROUPNORM_MEAN_VARI_PARAM_NUM _cnt_of_array( _groupnorm_mean_vari_kernel_param_def ) + +static vx_param_description_t _groupnorm_kernel_param_def[] = +{ + {VX_INPUT, VX_TYPE_TENSOR, VX_PARAMETER_STATE_REQUIRED}, + {VX_INPUT, VX_TYPE_TENSOR, VX_PARAMETER_STATE_REQUIRED}, + {VX_INPUT, VX_TYPE_TENSOR, VX_PARAMETER_STATE_REQUIRED}, + {VX_INPUT, VX_TYPE_TENSOR, VX_PARAMETER_STATE_REQUIRED}, + {VX_OUTPUT, VX_TYPE_TENSOR, VX_PARAMETER_STATE_REQUIRED}, + {VX_INPUT, VX_TYPE_SCALAR, VX_PARAMETER_STATE_REQUIRED}, + {VX_INPUT, VX_TYPE_SCALAR, VX_PARAMETER_STATE_REQUIRED}, + {VX_INPUT, VX_TYPE_SCALAR, VX_PARAMETER_STATE_REQUIRED}, + {VX_INPUT, VX_TYPE_SCALAR, VX_PARAMETER_STATE_REQUIRED}, + // Add kererl parameters here +}; +#define _GROUPNORM_PARAM_NUM _cnt_of_array( _groupnorm_kernel_param_def ) + +/* + * Kernel initializer + */ 
+DEF_KERNEL_INITIALIZER(_groupnorm_sum_sqr_initializer) + ( + vsi_nn_kernel_node_t node, + const vsi_nn_kernel_node_param_t * param, + size_t param_size + ) +{ + vsi_status status = VSI_FAILURE; + gpu_param_t shaderParam = { + 3, // workdim + {0, 0, 0}, // globalWorkOffset: control the start location be processed in the image + {0, 0, 0}, // globalWorkScale: how many pixels could be processed by a single thread + {0, 0, 0}, // localWorkSize: local group size in thread + {0, 0, 0}}; // globalWorkSize: image size in thread + + vsi_nn_kernel_tensor_attr_t* attr[2] = {NULL, NULL}; + vsi_int_array_t * input_shape = NULL; + float scaleIn = 1; + int32_t input_zp = 0; + vx_uint32 iter = 0; + int32_t sumInZp = 0; + int32_t tmpZp1 = 0; + float tmpZp2 = 0; + float e2InScale = 0; + float rowSumScale = 0; + int32_t is2D = 0; + int32_t width = 0; + int32_t height = 0; + int32_t chn = 0; + float in_scale_fl = 1, inFlScale_s2 = 1; + + attr[0] = vsi_nn_kernel_tensor_attr_create( (vsi_nn_kernel_tensor_t)param[0] ); + CHECK_PTR_FAIL_GOTO( attr[0], "Create tensor attr buffer fail.", OnError ); + attr[1] = vsi_nn_kernel_tensor_attr_create( (vsi_nn_kernel_tensor_t)param[1] ); + CHECK_PTR_FAIL_GOTO( attr[1], "Create tensor attr buffer fail.", OnError ); + + status = vsi_nn_kernel_scalar_read_int32((vsi_nn_kernel_scalar_t)param[3], &is2D); + CHECK_STATUS_FAIL_GOTO(status, OnError ); + + input_shape = attr[0]->shape; + + if (attr[0]->quant == VSI_NN_KERNEL_QUANT_ASYMM) + { + input_zp = attr[0]->asymm.zero_point; + scaleIn = attr[0]->asymm.scale; + } + else if (attr[0]->quant == VSI_NN_KERNEL_QUANT_DFP) + { + if (attr[0]->dfp.fl > 0) + { + in_scale_fl = (1.0f / ((float) ((int64_t)1 << attr[0]->dfp.fl))); + } + else + { + in_scale_fl = ((float) ((int64_t)1 << -attr[0]->dfp.fl)); + } + inFlScale_s2 = in_scale_fl * in_scale_fl; + } + + width = input_shape->data[0]; + height = input_shape->data[1]; + chn = attr[1]->shape->data[1]; + if (is2D) + { + height = 1; + } + iter = height * 16; + + if (attr[0]->quant == VSI_NN_KERNEL_QUANT_ASYMM) + { + sumInZp = input_zp * iter * (-1); + tmpZp1 = (-2) * input_zp; + e2InScale = scaleIn * scaleIn; + tmpZp2 = input_zp * input_zp * e2InScale; + rowSumScale = height * 16 * tmpZp2; + } + + shaderParam.global_scale[0] = 1; + shaderParam.global_scale[1] = 1; + shaderParam.global_scale[2] = 1; + shaderParam.local_size[0] = 16; + shaderParam.local_size[1] = 1; + shaderParam.local_size[2] = 1; + + if (attr[0]->dtype == I8 || attr[0]->dtype == U8) + { + shaderParam.global_size[0] = (width + 255) / 256 * 16; + } + else if (attr[0]->dtype == I16 || attr[0]->dtype == F16) + { + shaderParam.global_size[0] = (width + 127) / 128 * 16; + } + shaderParam.global_size[1] = chn; + shaderParam.global_size[2] = 1; + + status = vsi_nn_kernel_gpu_config( node, &shaderParam ); + CHECK_STATUS_FAIL_GOTO(status, OnError); + + if (attr[0]->dtype == U8) + { + gpu_dp_inst_t uniSumU8_16x1 = {{ + 0x55555555, // TCfg + 0x00000000, // ASelt + 0x76543210, 0xfedcba98, // ABin + 0xaaaaaaaa, // BSelt + 0x00000000, 0x00000000, // BBin + 0x00002400, // AccumType, ConstantType, and PostShift + 0x00010001, 0x00010001, 0x00010001, 0x00010001, 0x00010001, 0x00010001, 0x00010001, 0x00010001 // Constant + }, GPU_DP_TYPE_16 }; + gpu_dp_inst_t uniSqrSum_16x1 = {{ + 0x55555555, // TCfg + 0x00000000, // ASelt + 0x76543210, 0xfedcba98, // ABin + 0x55555555, // BSelt + 0x76543210, 0xfedcba98, // BBin + 0x00000400, // AccumType, ConstantType, and PostShift + 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000, 
0x00000000, 0x00000000 // Constant + }, GPU_DP_TYPE_16 }; + status = vsi_nn_kernel_gpu_add_param(node, "uniSumU8_16x1", &uniSumU8_16x1); + status |= vsi_nn_kernel_gpu_add_param(node, "uniSqrSum_16x1", &uniSqrSum_16x1); + status |= vsi_nn_kernel_gpu_add_param(node, "sumInZp", &sumInZp); + status |= vsi_nn_kernel_gpu_add_param(node, "tmpZp1", &tmpZp1); + status |= vsi_nn_kernel_gpu_add_param(node, "input_scale", &scaleIn); + status |= vsi_nn_kernel_gpu_add_param(node, "e2InScale", &e2InScale); + status |= vsi_nn_kernel_gpu_add_param(node, "rowSumScale", &rowSumScale); + CHECK_STATUS_FAIL_GOTO(status, OnError ); + } + else if (attr[0]->dtype == I8) + { + gpu_dp_inst_t uniSumInt8_16x1 = {{ + 0x55555555, // TCfg + 0x00000000, // ASelt + 0x76543210, 0xfedcba98, // ABin + 0xaaaaaaaa, // BSelt + 0x00000000, 0x00000000, // BBin + 0x00002400, // AccumType, ConstantType, and PostShift + 0x00010001, 0x00010001, 0x00010001, 0x00010001, 0x00010001, 0x00010001, 0x00010001, 0x00010001 // Constant + }, GPU_DP_TYPE_16 }; + gpu_dp_inst_t uniSqrSumInt8_16x1 = {{ + 0x55555555, // TCfg + 0x00000000, // ASelt + 0x76543210, 0xfedcba98, // ABin + 0x55555555, // BSelt + 0x76543210, 0xfedcba98, // BBin + 0x00000400, // AccumType, ConstantType, and PostShift + 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000 // Constant + }, GPU_DP_TYPE_16 }; + + status = vsi_nn_kernel_gpu_add_param(node, "uniSumInt8_16x1", &uniSumInt8_16x1); + status |= vsi_nn_kernel_gpu_add_param(node, "uniSqrSumInt8_16x1", &uniSqrSumInt8_16x1); + status |= vsi_nn_kernel_gpu_add_param(node, "input_fl_scale", &in_scale_fl); + status |= vsi_nn_kernel_gpu_add_param(node, "inFlScale_s2", &inFlScale_s2); + CHECK_STATUS_FAIL_GOTO(status, OnError ); + } + else if (attr[0]->dtype == I16) + { + gpu_dp_inst_t uniInt16SumSqr_dp8x2 = {{ + 0x55555555, // TCfg + 0x00000000, // ASelt + 0x76543210, 0x76543210, // ABin + 0x5555aaaa, // BSelt + 0x00000000, 0x76543210, // BBin + 0x00000300, // AccumType, ConstantType, and PostShift + 0x00010001, 0x00010001, 0x00010001, 0x00010001, 0x00000000, 0x00000000, 0x00000000, 0x00000000 // Constant + }, GPU_DP_TYPE_16 }; + status = vsi_nn_kernel_gpu_add_param(node, "uniInt16SumSqr_dp8x2", &uniInt16SumSqr_dp8x2); + status |= vsi_nn_kernel_gpu_add_param(node, "input_fl_scale", &in_scale_fl); + status |= vsi_nn_kernel_gpu_add_param(node, "inFlScale_s2", &inFlScale_s2); + CHECK_STATUS_FAIL_GOTO(status, OnError ); + } + else if (attr[0]->dtype == F16) + { + gpu_dp_inst_t uniFp16SumSqr_dp8x2 = {{ + 0x55555555, // TCfg + 0x00000000, // ASelt + 0x76543210, 0x76543210, // ABin + 0x5555aaaa, // BSelt + 0x00000000, 0x76543210, // BBin + 0x00000100, // AccumType, ConstantType, and PostShift + 0x3c003c00, 0x3c003c00, 0x3c003c00, 0x3c003c00, 0x00000000, 0x00000000, 0x00000000, 0x00000000 // Constant + }, GPU_DP_TYPE_16 }; + status = vsi_nn_kernel_gpu_add_param(node, "uniFp16SumSqr_dp8x2", &uniFp16SumSqr_dp8x2); + CHECK_STATUS_FAIL_GOTO(status, OnError ); + } + + status = vsi_nn_kernel_gpu_add_param(node, "width", &width); + status |= vsi_nn_kernel_gpu_add_param(node, "height", &height); + CHECK_STATUS_FAIL_GOTO(status, OnError ); + +OnError: + if (attr[0]) + { + vsi_nn_kernel_tensor_attr_release( &attr[0] ); + attr[0] = NULL; + } + if (attr[1]) + { + vsi_nn_kernel_tensor_attr_release( &attr[1] ); + attr[1] = NULL; + } + + return status; +} + +DEF_KERNEL_INITIALIZER(_groupnorm_mean_vari_initializer) + ( + vsi_nn_kernel_node_t node, + const vsi_nn_kernel_node_param_t * param, + size_t param_size + 
) +{ + vsi_status status = VSI_FAILURE; + gpu_param_t shaderParam = { + 3, // workdim + {0, 0, 0}, // globalWorkOffset: control the start location be processed in the image + {0, 0, 0}, // globalWorkScale: how many pixels could be processed by a single thread + {0, 0, 0}, // localWorkSize: local group size in thread + {0, 0, 0}}; // globalWorkSize: image size in thread + + vsi_nn_kernel_tensor_attr_t* attr[1] = {NULL}; + int32_t chn = 0; + int32_t group_stride = 0; + + attr[0] = vsi_nn_kernel_tensor_attr_create( (vsi_nn_kernel_tensor_t)param[0] ); + CHECK_PTR_FAIL_GOTO( attr[0], "Create tensor attr buffer fail.", OnError ); + + chn = attr[0]->shape->data[1]; + group_stride = attr[0]->shape->data[0]; + + shaderParam.global_scale[0] = 4; + shaderParam.global_scale[1] = 1; + shaderParam.global_scale[2] = 1; + shaderParam.local_size[0] = 16; + shaderParam.local_size[1] = 1; + shaderParam.local_size[2] = 1; + shaderParam.global_size[0] = 16; + shaderParam.global_size[1] = chn; + shaderParam.global_size[2] = 1; + + status = vsi_nn_kernel_gpu_config( node, &shaderParam ); + CHECK_STATUS_FAIL_GOTO(status, OnError); + + { + gpu_dp_inst_t uniResetFp32_4x4 = {{ + 0x09090909, // TCfg + 0x00000000, // ASelt + 0x00110000, 0x00330022, // ABin + 0x0a0a0a0a, // BSelt + 0x00000000, 0x00000000, // BBin + 0x00000700, // AccumType, ConstantType, and PostShift + 0x00010001, 0x00000000, 0x00010001, 0x00000000, + 0x00010001, 0x00000000, 0x00010001, 0x00000000 // Constant + }, GPU_DP_TYPE_16 }; + status = vsi_nn_kernel_gpu_add_param(node, "uniResetFp32_4x4", &uniResetFp32_4x4); + status |= vsi_nn_kernel_gpu_add_param(node, "group_stride", &group_stride); + CHECK_STATUS_FAIL_GOTO(status, OnError ); + } + +OnError: + if (attr[0]) + { + vsi_nn_kernel_tensor_attr_release( &attr[0] ); + attr[0] = NULL; + } + + return status; +} + +DEF_KERNEL_INITIALIZER(_groupnorm_initializer) + ( + vsi_nn_kernel_node_t node, + const vsi_nn_kernel_node_param_t * param, + size_t param_size + ) +{ + vsi_status status = VSI_FAILURE; + gpu_param_t shaderParam = { + 3, // workdim + {0, 0, 0}, // globalWorkOffset: control the start location be processed in the image + {0, 0, 0}, // globalWorkScale: how many pixels could be processed by a single thread + {0, 0, 0}, // localWorkSize: local group size in thread + {0, 0, 0}}; // globalWorkSize: image size in thread + + vsi_nn_kernel_tensor_attr_t* attr[3] = {NULL, NULL}; + vsi_int_array_t * input_shape = NULL; + float scaleIn = 1.0f; + float scaleOut = 1.0f; + float reScaleOut_u8 = 1.0f; + float scale_inOut = 1.0f; + int32_t output_zp = 0; + int32_t input_zp = 0; + float in_scale_fl = 1, out_scale_fl = 1, inOut_fl_scale = 1; + int32_t height = 0, width = 0, chn = 0; + int32_t is2D = 0; + + attr[0] = vsi_nn_kernel_tensor_attr_create( (vsi_nn_kernel_tensor_t)param[0] ); + CHECK_PTR_FAIL_GOTO( attr[0], "Create tensor attr buffer fail.", OnError ); + attr[1] = vsi_nn_kernel_tensor_attr_create( (vsi_nn_kernel_tensor_t)param[3] ); + CHECK_PTR_FAIL_GOTO( attr[1], "Create tensor attr buffer fail.", OnError ); + attr[2] = vsi_nn_kernel_tensor_attr_create( (vsi_nn_kernel_tensor_t)param[4] ); + CHECK_PTR_FAIL_GOTO( attr[2], "Create tensor attr buffer fail.", OnError ); + + status = vsi_nn_kernel_scalar_read_int32((vsi_nn_kernel_scalar_t)param[6], &is2D); + CHECK_STATUS_FAIL_GOTO(status, OnError ); + + input_shape = attr[0]->shape; + + if (attr[0]->quant == VSI_NN_KERNEL_QUANT_ASYMM) + { + input_zp = attr[0]->asymm.zero_point; + scaleIn = attr[0]->asymm.scale; + } + else if (attr[0]->quant == 
VSI_NN_KERNEL_QUANT_DFP) + { + if (attr[0]->dfp.fl > 0) + { + in_scale_fl = (1.0f / ((float) ((int64_t)1 << attr[0]->dfp.fl))); + } + else + { + in_scale_fl = ((float) ((int64_t)1 << -attr[0]->dfp.fl)); + } + input_zp = 0; + } + + if (attr[2]->quant == VSI_NN_KERNEL_QUANT_ASYMM) + { + output_zp = attr[2]->asymm.zero_point; + scaleOut = attr[2]->asymm.scale; + reScaleOut_u8 = 1 / scaleOut; + } + else if (attr[2]->quant == VSI_NN_KERNEL_QUANT_DFP) + { + if (attr[2]->dfp.fl > 0) + { + out_scale_fl = (float)((int64_t)1 << attr[2]->dfp.fl); + } + else + { + out_scale_fl = (1.0f / (float)((int64_t)1 << -attr[2]->dfp.fl)); + } + output_zp = 0; + } + + if ((attr[0]->quant == VSI_NN_KERNEL_QUANT_DFP) + && (attr[2]->quant == VSI_NN_KERNEL_QUANT_DFP)) + { + inOut_fl_scale = in_scale_fl * out_scale_fl; + } + + width = input_shape->data[0]; + height = input_shape->data[1]; + chn = attr[1]->shape->data[1]; + if (is2D) + { + height = 1; + } + + shaderParam.global_scale[0] = 16; + if (attr[0]->dtype == I16 || attr[0]->dtype == F16) + { + shaderParam.global_scale[0] = 8; + } + + shaderParam.global_scale[1] = 1; + shaderParam.global_scale[2] = 1; + shaderParam.global_size[0] = gpu_align_p2((width + shaderParam.global_scale[0] - 1) + / shaderParam.global_scale[0], 4); + shaderParam.global_size[1] = (height + shaderParam.global_scale[1] - 1) + / shaderParam.global_scale[1]; + shaderParam.global_size[2] = chn; + if (is2D) + { + shaderParam.global_size[0] = gpu_align_p2((width + shaderParam.global_scale[0] - 1) + / shaderParam.global_scale[0], 4); + shaderParam.global_size[1] = (chn + shaderParam.global_scale[1] - 1) + / shaderParam.global_scale[1]; + shaderParam.global_size[2] = 1; + } + + status = vsi_nn_kernel_gpu_config( node, &shaderParam ); + CHECK_STATUS_FAIL_GOTO(status, OnError); + + { + gpu_dp_inst_t UniFP16toFP32Lo4_dp4x4 = {{ + 0x01010101, // TCfg + 0x00000000, // ASelt + 0x00010000, 0x00030002, // ABin + 0x02020202, // BSelt + 0x00000000, 0x00000000, // BBin + 0x00000100, // AccumType, ConstantType, and PostShift + 0x00003c00, 0x00000000, 0x00003c00, 0x00000000, 0x00003c00, 0x00000000, 0x00003c00, 0x00000000 // Constant + }, GPU_DP_TYPE_16 }; + gpu_dp_inst_t uniConvertInt32toUint8_2x8 = {{ + 0x33333333, // TCfg + 0x11110000, // ASelt + 0x03020100, 0x03020100, // ABin + 0x00000000, // BSelt + 0x00000000, 0x00000000, // BBin + 0x00002400, // AccumType, ConstantType, and PostShift + 0x00000000, 0x00000000, 0x00000000, 0x00000000, + 0x00000000, 0x00000000, 0x00000000, 0x00000000 // Constant + }, GPU_DP_TYPE_16 }; + gpu_dp_inst_t uniConvertEndInt16Fp32_4x4 = {{ + 0x01010101, // TCfg + 0x00000000, // ASelt + 0x00050004, 0x00070006, // ABin + 0x02020202, // BSelt + 0x00000000, 0x00000000, // BBin + 0x00000100, // AccumType, ConstantType, and PostShift + 0x00003c00, 0x00000000, 0x00003c00, 0x00000000, + 0x00003c00, 0x00000000, 0x00003c00, 0x00000000 // Constant + }, GPU_DP_TYPE_16 }; + gpu_dp_inst_t uniConvert1stUint8SubZpToFp32_4x4 = {{ + 0x05050505, // TCfg + 0x04040404, // ASelt + 0x00010000, 0x00030002, // ABin + 0x0a0a0a0a, // BSelt + 0x00000000, 0x00000000, // BBin + 0x00000400, // AccumType, ConstantType, and PostShift + 0xffff0001, 0x00000000, 0xffff0001, 0x00000000, 0xffff0001, 0x00000000, 0xffff0001, 0x00000000 // Constant + }, GPU_DP_TYPE_16 }; + gpu_dp_inst_t uniConvert2ndUint8SubZpToFp32_4x4 = {{ + 0x05050505, // TCfg + 0x04040404, // ASelt + 0x00050004, 0x00070006, // ABin + 0x0a0a0a0a, // BSelt + 0x00000000, 0x00000000, // BBin + 0x00000400, // AccumType, ConstantType, and PostShift + 
0xffff0001, 0x00000000, 0xffff0001, 0x00000000, 0xffff0001, 0x00000000, 0xffff0001, 0x00000000 // Constant + }, GPU_DP_TYPE_16 }; + gpu_dp_inst_t uniConvert3rdUint8SubZpToFp32_4x4 = {{ + 0x05050505, // TCfg + 0x04040404, // ASelt + 0x00090008, 0x000b000a, // ABin + 0x0a0a0a0a, // BSelt + 0x00000000, 0x00000000, // BBin + 0x00000400, // AccumType, ConstantType, and PostShift + 0xffff0001, 0x00000000, 0xffff0001, 0x00000000, 0xffff0001, 0x00000000, 0xffff0001, 0x00000000 // Constant + }, GPU_DP_TYPE_16 }; + gpu_dp_inst_t uniConvert4thUint8SubZpToFp32_4x4 = {{ + 0x05050505, // TCfg + 0x04040404, // ASelt + 0x000d000c, 0x000f000e, // ABin + 0x0a0a0a0a, // BSelt + 0x00000000, 0x00000000, // BBin + 0x00002400, // AccumType, ConstantType, and PostShift + 0xffff0001, 0x00000000, 0xffff0001, 0x00000000, 0xffff0001, 0x00000000, 0xffff0001, 0x00000000 // Constant + }, GPU_DP_TYPE_16 }; + gpu_dp_inst_t uniConvertInt16Fp32Fst_4x4 = {{ + 0x01010101, // TCfg + 0x00000000, // ASelt + 0x00010000, 0x00030002, // ABin + 0x02020202, // BSelt + 0x00000000, 0x00000000, // BBin + 0x00000400, // AccumType, ConstantType, and PostShift + 0x00000001, 0x00000000, 0x00000001, 0x00000000, + 0x00000001, 0x00000000, 0x00000001, 0x00000000 // Constant + }, GPU_DP_TYPE_16 }; + gpu_dp_inst_t uniConvertInt16Fp32Secd_4x4 = {{ + 0x01010101, // TCfg + 0x00000000, // ASelt + 0x00050004, 0x00070006, // ABin + 0x02020202, // BSelt + 0x00000000, 0x00000000, // BBin + 0x00000400, // AccumType, ConstantType, and PostShift + 0x00000001, 0x00000000, 0x00000001, 0x00000000, + 0x00000001, 0x00000000, 0x00000001, 0x00000000 // Constant + }, GPU_DP_TYPE_16 }; + gpu_dp_inst_t uniConvertInt32toInt16_2x8 = {{ + 0x33333333, // TCfg + 0x11110000, // ASelt + 0x03020100, 0x03020100, // ABin + 0x00000000, // BSelt + 0x00000000, 0x00000000, // BBin + 0x00002400, // AccumType, ConstantType, and PostShift + 0x00000000, 0x00000000, 0x00000000, 0x00000000, + 0x00000000, 0x00000000, 0x00000000, 0x00000000 // Constant + }, GPU_DP_TYPE_16 }; + gpu_dp_inst_t uniConvertDirUint8Fp32_4x4 = {{ + 0x01010101, // TCfg + 0x00000000, // ASelt + 0x00010000, 0x00030002, // ABin + 0x02020202, // BSelt + 0x00000000, 0x00000000, // BBin + 0x00000100, // AccumType, ConstantType, and PostShift + 0x00003c00, 0x00000000, 0x00003c00, 0x00000000, + 0x00003c00, 0x00000000, 0x00003c00, 0x00000000 // Constant + }, GPU_DP_TYPE_16 }; + gpu_dp_inst_t uniConvertEndUint8Fp32_4x4 = {{ + 0x01010101, // TCfg + 0x00000000, // ASelt + 0x00050004, 0x00070006, // ABin + 0x02020202, // BSelt + 0x00000000, 0x00000000, // BBin + 0x00000100, // AccumType, ConstantType, and PostShift + 0x00003c00, 0x00000000, 0x00003c00, 0x00000000, + 0x00003c00, 0x00000000, 0x00003c00, 0x00000000 // Constant + }, GPU_DP_TYPE_16 }; + gpu_dp_inst_t uniConvertTrdUint8Fp32_4x4 = {{ + 0x01010101, // TCfg + 0x00000000, // ASelt + 0x00090008, 0x000b000a, // ABin + 0x02020202, // BSelt + 0x00000000, 0x00000000, // BBin + 0x00000100, // AccumType, ConstantType, and PostShift + 0x00003c00, 0x00000000, 0x00003c00, 0x00000000, + 0x00003c00, 0x00000000, 0x00003c00, 0x00000000 // Constant + }, GPU_DP_TYPE_16 }; + gpu_dp_inst_t uniConvertFthUint8Fp32_4x4 = {{ + 0x01010101, // TCfg + 0x00000000, // ASelt + 0x000d000c, 0x000f000e, // ABin + 0x02020202, // BSelt + 0x00000000, 0x00000000, // BBin + 0x00000100, // AccumType, ConstantType, and PostShift + 0x00003c00, 0x00000000, 0x00003c00, 0x00000000, + 0x00003c00, 0x00000000, 0x00003c00, 0x00000000 // Constant + }, GPU_DP_TYPE_16 }; + gpu_dp_inst_t uniConvertHalfToFp16_2x8 = {{ + 
0x11111111, // TCfg + 0x11110000, // ASelt + 0x06040200, 0x06040200, // ABin + 0x22222222, // BSelt + 0x00000000, 0x00000000, // BBin + 0x00000100, // AccumType, ConstantType, and PostShift + 0x00003c00, 0x00003c00, 0x00003c00, 0x00003c00, 0x00003c00, 0x00003c00, 0x00003c00, 0x00003c00 // Constant + }, GPU_DP_TYPE_16 }; + + uint32_t pack_key = 0; +#define _PACK_SELECT_KEY( IN0_TYPE, OUT_TYPE ) \ + (IN0_TYPE | (OUT_TYPE << 8)) + + pack_key = _PACK_SELECT_KEY( attr[0]->dtype, attr[2]->dtype ); + + status = vsi_nn_kernel_gpu_add_param(node, "height", &height); + status |= vsi_nn_kernel_gpu_add_param(node, "UniFP16toFP32Lo4_dp4x4", &UniFP16toFP32Lo4_dp4x4); + CHECK_STATUS_FAIL_GOTO(status, OnError ); + + switch( pack_key ) + { + case _PACK_SELECT_KEY( I8, I8 ): + case _PACK_SELECT_KEY( I8, F16 ): + { + status = vsi_nn_kernel_gpu_add_param(node, "uniConvertInt32toUint8_2x8", + &uniConvertInt32toUint8_2x8); + status |= vsi_nn_kernel_gpu_add_param(node, "uniConvertDirInt8Fp32_4x4", + &uniConvertDirUint8Fp32_4x4); + status |= vsi_nn_kernel_gpu_add_param(node, "uniConvertEndInt8Fp32_4x4", + &uniConvertEndUint8Fp32_4x4); + status |= vsi_nn_kernel_gpu_add_param(node, "uniConvertTrdInt8Fp32_4x4", + &uniConvertTrdUint8Fp32_4x4); + status |= vsi_nn_kernel_gpu_add_param(node, "uniConvertFthInt8Fp32_4x4", + &uniConvertFthUint8Fp32_4x4); + status |= vsi_nn_kernel_gpu_add_param(node, "uniConvertHalfToFp16_2x8", + &uniConvertHalfToFp16_2x8); + status |= vsi_nn_kernel_gpu_add_param(node, "input_fl_scale", &in_scale_fl); + + status |= vsi_nn_kernel_gpu_add_param(node, "output_fl_scale", &out_scale_fl); + status |= vsi_nn_kernel_gpu_add_param(node, "inOut_fl_scale", &inOut_fl_scale); + CHECK_STATUS_FAIL_GOTO(status, OnError ); + } + break; + case _PACK_SELECT_KEY( U8, U8 ): + { + status = vsi_nn_kernel_gpu_add_param(node, "uniConvertInt32toUint8_2x8", + &uniConvertInt32toUint8_2x8); + status |= vsi_nn_kernel_gpu_add_param(node, "uniConvert1stUint8SubZpToFp32_4x4", + &uniConvert1stUint8SubZpToFp32_4x4); + status |= vsi_nn_kernel_gpu_add_param(node, "uniConvert2ndUint8SubZpToFp32_4x4", + &uniConvert2ndUint8SubZpToFp32_4x4); + status |= vsi_nn_kernel_gpu_add_param(node, "uniConvert3rdUint8SubZpToFp32_4x4", + &uniConvert3rdUint8SubZpToFp32_4x4); + status |= vsi_nn_kernel_gpu_add_param(node, "uniConvert4thUint8SubZpToFp32_4x4", + &uniConvert4thUint8SubZpToFp32_4x4); + status |= vsi_nn_kernel_gpu_add_param(node, "inputZP", &input_zp); + status |= vsi_nn_kernel_gpu_add_param(node, "input_scale", &scaleIn); + + status |= vsi_nn_kernel_gpu_add_param(node, "output_ZP", &output_zp); + status |= vsi_nn_kernel_gpu_add_param(node, "outputScale", &reScaleOut_u8); + + scale_inOut = reScaleOut_u8 * scaleIn; + status |= vsi_nn_kernel_gpu_add_param(node, "scale_inOut", &scale_inOut); + CHECK_STATUS_FAIL_GOTO(status, OnError ); + } + break; + case _PACK_SELECT_KEY( U8, F16 ): + { + status = vsi_nn_kernel_gpu_add_param(node, "uniConvert1stUint8SubZpToFp32_4x4", + &uniConvert1stUint8SubZpToFp32_4x4); + status |= vsi_nn_kernel_gpu_add_param(node, "uniConvert2ndUint8SubZpToFp32_4x4", + &uniConvert2ndUint8SubZpToFp32_4x4); + status |= vsi_nn_kernel_gpu_add_param(node, "uniConvert3rdUint8SubZpToFp32_4x4", + &uniConvert3rdUint8SubZpToFp32_4x4); + status |= vsi_nn_kernel_gpu_add_param(node, "uniConvert4thUint8SubZpToFp32_4x4", + &uniConvert4thUint8SubZpToFp32_4x4); + status |= vsi_nn_kernel_gpu_add_param(node, "uniConvertHalfToFp16_2x8", + &uniConvertHalfToFp16_2x8); + status |= vsi_nn_kernel_gpu_add_param(node, "inputZP", &input_zp); + 
status |= vsi_nn_kernel_gpu_add_param(node, "input_scale", &scaleIn); + CHECK_STATUS_FAIL_GOTO(status, OnError ); + } + break; + case _PACK_SELECT_KEY( I16, I16 ): + case _PACK_SELECT_KEY( I16, F16 ): + { + status = vsi_nn_kernel_gpu_add_param(node, "uniConvertInt16Fp32Fst_4x4", + &uniConvertInt16Fp32Fst_4x4); + status |= vsi_nn_kernel_gpu_add_param(node, "uniConvertInt16Fp32Secd_4x4", + &uniConvertInt16Fp32Secd_4x4); + status |= vsi_nn_kernel_gpu_add_param(node, "input_fl_scale", &in_scale_fl); + + status |= vsi_nn_kernel_gpu_add_param(node, "uniConvertInt32toInt16_2x8", + &uniConvertInt32toInt16_2x8); + status |= vsi_nn_kernel_gpu_add_param(node, "uniConvertHalfToFp16_2x8", + &uniConvertHalfToFp16_2x8); + status |= vsi_nn_kernel_gpu_add_param(node, "output_fl_scale", &out_scale_fl); + + status |= vsi_nn_kernel_gpu_add_param(node, "inOut_fl_scale", &inOut_fl_scale); + CHECK_STATUS_FAIL_GOTO(status, OnError ); + } + break; + case _PACK_SELECT_KEY( F16, F16 ): + { + status = vsi_nn_kernel_gpu_add_param(node, "uniConvertEndInt16Fp32_4x4", + &uniConvertEndInt16Fp32_4x4); + status |= vsi_nn_kernel_gpu_add_param(node, "uniConvertHalfToFp16_2x8", + &uniConvertHalfToFp16_2x8); + CHECK_STATUS_FAIL_GOTO(status, OnError ); + } + break; + case _PACK_SELECT_KEY( F16, U8 ): + { + status = vsi_nn_kernel_gpu_add_param(node, "uniConvertEndInt16Fp32_4x4", + &uniConvertEndInt16Fp32_4x4); + status |= vsi_nn_kernel_gpu_add_param(node, "uniConvertInt32toUint8_2x8", + &uniConvertInt32toUint8_2x8); + status |= vsi_nn_kernel_gpu_add_param(node, "output_ZP", &output_zp); + status |= vsi_nn_kernel_gpu_add_param(node, "outputScale", &reScaleOut_u8); + CHECK_STATUS_FAIL_GOTO(status, OnError ); + } + break; + default: + VSI_ASSERT( FALSE ); + return VSI_FAILURE; + } +#undef _PACK_SELECT_KEY + } + +OnError: + if (attr[0]) + { + vsi_nn_kernel_tensor_attr_release( &attr[0] ); + attr[0] = NULL; + } + if (attr[1]) + { + vsi_nn_kernel_tensor_attr_release( &attr[1] ); + attr[1] = NULL; + } + if (attr[2]) + { + vsi_nn_kernel_tensor_attr_release( &attr[2] ); + attr[2] = NULL; + } + + return status; +} + +/* + * Query kernel + */ +static vsi_status _query_kernel + ( + vsi_nn_kernel_t * kernel, + const uint32_t hashkey, + _internal_kernel_e kernel_id + /* Add extra params */ + ) +{ + vsi_status status = VSI_FAILURE; + vx_kernel_initialize_f initializer = NULL; + vx_param_description_t * param_def = NULL; + const _kernel_map_type* kernel_map; + size_t kernel_map_size = 0; + size_t param_size = 0; + uint32_t i = 0; + + switch( kernel_id ) + { + case INTERNAL_KERNEL_SUM_SQR: + initializer = _groupnorm_sum_sqr_initializer; + kernel_map = _groupnorm_sum_sqr_kernel_map; + kernel_map_size = _cnt_of_array( _groupnorm_sum_sqr_kernel_map ); + param_def = _groupnorm_sum_sqr_kernel_param_def; + param_size = _GROUPNORM_SUM_SQR_PARAM_NUM; + break; + case INTERNAL_KERNEL_MEAN_VARI: + initializer = _groupnorm_mean_vari_initializer; + kernel_map = _groupnorm_mean_vari_kernel_map; + kernel_map_size = _cnt_of_array( _groupnorm_mean_vari_kernel_map ); + param_def = _groupnorm_mean_vari_kernel_param_def; + param_size = _GROUPNORM_MEAN_VARI_PARAM_NUM; + break; + case INTERNAL_KERNEL_NORM: + initializer = _groupnorm_initializer; + kernel_map = _groupnorm_kernel_map; + kernel_map_size = _cnt_of_array( _groupnorm_kernel_map ); + param_def = _groupnorm_kernel_param_def; + param_size = _GROUPNORM_PARAM_NUM; + break; + default: + VSI_ASSERT( FALSE ); + return VSI_FAILURE; + } + + for( i = 0; i < kernel_map_size; i ++ ) + { + if ( kernel_map[i].key == 
hashkey ) + { + break; + } + } + if ( i < kernel_map_size ) + { + snprintf( kernel->info.name, VX_MAX_KERNEL_NAME, "%s", kernel_map[i].function_name ); + kernel->info.parameters = param_def; + kernel->info.numParams = (uint32_t)param_size; + kernel->info.initialize = initializer; + // Register code source + vsi_nn_kernel_add_source( kernel, VSI_NN_GPU_SOURCE_FMT_CODE, 2, + "vsi_nn_kernel_header", + kernel_map[i].source_name ); + // Register binary source + vsi_nn_kernel_add_source( kernel, VSI_NN_GPU_SOURCE_FMT_EXECUTABLE, 1, + kernel_map[i].source_name ); + status = VSI_SUCCESS; + } + + return status; +} /* _query_kernel() */ + +static int32_t _optimize_gn_shape + ( + vsi_nn_tensor_t ** inputs, + int32_t group_size, + int32_t group_num, + int32_t* opt_shape, + int32_t* is2D_flg + ) +{ + vsi_status status = VSI_SUCCESS; + int32_t group_shape[VSI_NN_MAX_DIM_NUM] = {0}; + int32_t new_rank = 0; + group_shape[0] = inputs[0]->attr.size[0]; + group_shape[1] = inputs[0]->attr.size[1]; + group_shape[2] = group_size; + + vsi_nn_kernel_optimize_element_shape( group_shape, 3, opt_shape, &new_rank ); + + if (opt_shape[1] == 1) + { + opt_shape[1] = group_num; + opt_shape[2] = 1; + opt_shape[3] = inputs[0]->attr.dim_num > 3 ? inputs[0]->attr.size[3] : 1; + is2D_flg[0] = 1; + } + else if (new_rank == 2) + { + opt_shape[2] = group_num; + opt_shape[3] = inputs[0]->attr.dim_num > 3 ? inputs[0]->attr.size[3] : 1; + } + else + { + status = VSI_FAILURE; + } + + return status; +} + +static vsi_nn_kernel_node_t _setup + ( + vsi_nn_graph_t * graph, + vsi_nn_tensor_t ** inputs, + size_t input_num, + vsi_nn_tensor_t ** outputs, + size_t output_num, + const vsi_nn_kernel_param_t * params, + vsi_nn_kernel_t * kernel + ) +{ +#define INTERNAL_KERNEL_SIZE (2) +#define SUM_SQR_INDEX (0) +#define MEAN_VARI_INDEX (1) + vsi_status status = VSI_FAILURE; + vsi_nn_kernel_node_param_t sum_sqr_node_params[_GROUPNORM_SUM_SQR_PARAM_NUM] = { NULL }; + vsi_nn_kernel_node_param_t mean_vari_node_params[_GROUPNORM_MEAN_VARI_PARAM_NUM] = { NULL }; + vsi_nn_kernel_node_param_t node_params[_GROUPNORM_PARAM_NUM] = { NULL }; + vsi_nn_kernel_node_t tmp_node = NULL, tmp_node1 = NULL; + vsi_nn_kernel_node_t node = NULL; + vsi_nn_kernel_dtype_e in0_dtype = U8; + vsi_nn_kernel_dtype_e out_dtype = U8; + vsi_nn_tensor_attr_t attr; + vsi_nn_kernel_t * ikernels[INTERNAL_KERNEL_SIZE] = { NULL }; + vsi_nn_tensor_t * tensors[INTERNAL_KERNEL_SIZE] = { NULL }; + vsi_nn_kernel_tensor_t rs_input = NULL, rs_output = NULL; + int32_t new_shape[VSI_NN_MAX_DIM_NUM] = { 1, 1, 1, 1 }; + int32_t is2D_flg = 0; + uint32_t hashkeys[INTERNAL_KERNEL_SIZE] = { 0 }; + uint32_t hashkey = 0; + int32_t i = 0; + float rSpaceOrg = 1.0f / (inputs[0]->attr.size[0] * inputs[0]->attr.size[1]); + float eps = vsi_nn_kernel_param_get_float32( params, "eps" ); + int32_t group_num = vsi_nn_kernel_param_get_int32( params, "group_num" ); + int32_t group_size = inputs[0]->attr.size[2] / group_num; + float group_ratio = 1.0f / (inputs[0]->attr.size[0] * inputs[0]->attr.size[1] * group_size); + + // Check if gpu can support the size + if ( !vsi_nn_kernel_gpu_check_shape( + (int32_t*)outputs[0]->attr.size, outputs[0]->attr.dim_num ) ) + { + return NULL; + } + + status = _optimize_gn_shape(inputs, group_size, group_num, new_shape, &is2D_flg); + if ( VSI_SUCCESS != status ) + { + goto final; + } + rs_input = vsi_nn_kernel_tensor_reshape(inputs[0]->t, new_shape, 4); + rs_output = vsi_nn_kernel_tensor_reshape(outputs[0]->t, new_shape, 4); + + for( i = 0; i < INTERNAL_KERNEL_SIZE; i ++ ) + { + 
ikernels[i] = vsi_nn_kernel_create( VSI_NN_KERNEL_TYPE_EVIS ); + // Assign unique_id + ikernels[i]->unique_id = kernel->unique_id; + } + + in0_dtype = vsi_nn_kernel_map_dtype( inputs[0]->attr.dtype.vx_type ); + out_dtype = vsi_nn_kernel_map_dtype( outputs[0]->attr.dtype.vx_type ); + + hashkeys[SUM_SQR_INDEX]= HASH_GROUPNORM_SUM_SQR_KEY( in0_dtype, F32, is2D_flg ); + hashkeys[MEAN_VARI_INDEX]= HASH_GROUPNORM_MEAN_VARI_KEY( F32, F32 ); + hashkey = HASH_GROUPNORM_KEY( in0_dtype, out_dtype, is2D_flg ); + + status = _query_kernel( ikernels[SUM_SQR_INDEX], hashkeys[SUM_SQR_INDEX], INTERNAL_KERNEL_SUM_SQR ); + if ( VSI_SUCCESS != status ) + { + goto final; + } + status = _query_kernel( ikernels[MEAN_VARI_INDEX], hashkeys[MEAN_VARI_INDEX], INTERNAL_KERNEL_MEAN_VARI ); + if ( VSI_SUCCESS != status ) + { + goto final; + } + status = _query_kernel( kernel, hashkey, INTERNAL_KERNEL_NORM ); + if ( VSI_SUCCESS != status ) + { + goto final; + } + + memset( &attr, 0, sizeof(vsi_nn_tensor_attr_t) ); + attr.dtype.vx_type = VSI_NN_TYPE_FLOAT32; + attr.is_const = FALSE; + attr.vtl = TRUE; + attr.size[0] = ((new_shape[0] + 255) / 256) * 4; + if ( inputs[0]->attr.dtype.vx_type == VSI_NN_TYPE_INT16 + || inputs[0]->attr.dtype.vx_type == VSI_NN_TYPE_FLOAT16) + { + attr.size[0] = ((new_shape[0] + 127) / 128) * 4; + } + attr.size[1] = group_num; + attr.size[2] = 1; + attr.size[3] = inputs[0]->attr.dim_num > 3 ? inputs[0]->attr.size[3] : 1; + attr.dim_num = 4; + tensors[SUM_SQR_INDEX] = vsi_nn_CreateTensor( graph, &attr ); + + attr.size[0] = 4; + tensors[MEAN_VARI_INDEX] = vsi_nn_CreateTensor( graph, &attr ); + + // Sum Sqr + tmp_node = vsi_nn_kernel_create_node( graph, ikernels[SUM_SQR_INDEX] ); + if (tmp_node) + { + uint32_t index = 0; + sum_sqr_node_params[index++] = rs_input; + sum_sqr_node_params[index++] = (vsi_nn_kernel_node_param_t)tensors[SUM_SQR_INDEX]->t; + sum_sqr_node_params[index++] = vsi_nn_kernel_scalar_create( graph, F32, &eps ); + sum_sqr_node_params[index++] = vsi_nn_kernel_scalar_create( graph, I32, &is2D_flg ); + + status = vsi_nn_kernel_node_pass_param( tmp_node, sum_sqr_node_params, + _GROUPNORM_SUM_SQR_PARAM_NUM ); + CHECK_STATUS(status); + vsi_nn_kernel_scalar_release( &sum_sqr_node_params[2] ); + vsi_nn_kernel_scalar_release( &sum_sqr_node_params[3] ); + { + // Set default border mode. + vx_border_t border; + border.mode = VX_BORDER_CONSTANT; + border.constant_value.U8 = 0; + border.constant_value.U16 = 0; + if (inputs[0]->attr.dtype.vx_type == VSI_NN_TYPE_UINT8) + { + border.constant_value.U8 = (vx_uint8)inputs[0]->attr.dtype.zero_point; + } + status = vxSetNodeAttribute( (vx_node)tmp_node, VX_NODE_BORDER, &border, sizeof(border) ); + CHECK_STATUS(status); + } + } + + // mean vari + tmp_node1 = vsi_nn_kernel_create_node( graph, ikernels[MEAN_VARI_INDEX] ); + if (tmp_node1) + { + uint32_t index = 0; + mean_vari_node_params[index++] = (vsi_nn_kernel_node_param_t)tensors[SUM_SQR_INDEX]->t; + mean_vari_node_params[index++] = (vsi_nn_kernel_node_param_t)tensors[MEAN_VARI_INDEX]->t; + mean_vari_node_params[index++] = vsi_nn_kernel_scalar_create( graph, F32, &eps ); + mean_vari_node_params[index++] = vsi_nn_kernel_scalar_create( graph, F32, &group_ratio ); + + status = vsi_nn_kernel_node_pass_param( tmp_node1, mean_vari_node_params, + _GROUPNORM_MEAN_VARI_PARAM_NUM ); + CHECK_STATUS(status); + vsi_nn_kernel_scalar_release( &mean_vari_node_params[2] ); + vsi_nn_kernel_scalar_release( &mean_vari_node_params[3] ); + { + // Set default border mode. 
+ vx_border_t border; + border.mode = VX_BORDER_CONSTANT; + border.constant_value.U8 = 0; + border.constant_value.U16 = 0; + if (inputs[0]->attr.dtype.vx_type == VSI_NN_TYPE_UINT8) + { + border.constant_value.U8 = (vx_uint8)inputs[0]->attr.dtype.zero_point; + } + status = vxSetNodeAttribute( (vx_node)tmp_node1, VX_NODE_BORDER, &border, sizeof(border) ); + CHECK_STATUS(status); + } + } + + // Nomalization + node = vsi_nn_kernel_create_node( graph, kernel ); + if (node) + { + uint32_t index = 0; + int32_t pStride = 0; + if (!is2D_flg) + { + pStride = inputs[1]->attr.size[0] / new_shape[1]; + rSpaceOrg = 1.0f / (new_shape[0] / pStride); + } + node_params[index++] = rs_input; + node_params[index++] = (vsi_nn_kernel_node_param_t)inputs[1]->t; + node_params[index++] = (vsi_nn_kernel_node_param_t)inputs[2]->t; + node_params[index++] = (vsi_nn_kernel_node_param_t)tensors[MEAN_VARI_INDEX]->t; + node_params[index++] = rs_output; + node_params[index++] = vsi_nn_kernel_scalar_create( graph, F32, &eps ); + node_params[index++] = vsi_nn_kernel_scalar_create( graph, I32, &is2D_flg ); + node_params[index++] = vsi_nn_kernel_scalar_create( graph, F32, &rSpaceOrg ); + node_params[index++] = vsi_nn_kernel_scalar_create( graph, I32, &pStride ); + + status = vsi_nn_kernel_node_pass_param( node, node_params, + _GROUPNORM_PARAM_NUM ); + CHECK_STATUS(status); + vsi_nn_kernel_scalar_release( &node_params[5] ); + vsi_nn_kernel_scalar_release( &node_params[6] ); + vsi_nn_kernel_scalar_release( &node_params[7] ); + vsi_nn_kernel_scalar_release( &node_params[8] ); + { + // Set default border mode. + vx_border_t border; + border.mode = VX_BORDER_CONSTANT; + border.constant_value.U8 = 0; + border.constant_value.U16 = 0; + if (inputs[0]->attr.dtype.vx_type == VSI_NN_TYPE_UINT8) + { + border.constant_value.U8 = (vx_uint8)inputs[0]->attr.dtype.zero_point; + } + status = vxSetNodeAttribute( (vx_node)node, VX_NODE_BORDER, &border, sizeof(border) ); + CHECK_STATUS(status); + } + } + + /* Pass parameters to node. 
*/ +final: + if (rs_input) + { + vsi_nn_kernel_tensor_release( &rs_input ); + } + if (rs_output) + { + vsi_nn_kernel_tensor_release( &rs_output ); + } + for( i = 0; i < INTERNAL_KERNEL_SIZE; i ++ ) + { + if ( ikernels[i] ) + { + vsi_nn_kernel_release( &ikernels[i] ); + } + if ( tensors[i] ) + { + vsi_nn_ReleaseTensor( &tensors[i] ); + } + } + if (tmp_node) {vsi_nn_kernel_node_release( &tmp_node );} + if (tmp_node1) {vsi_nn_kernel_node_release( &tmp_node1 );} +#undef INTERNAL_KERNEL_SIZE +#undef SUM_SQR_INDEX +#undef MEAN_VARI_INDEX + return node; +} /* _setup() */ + +__END_DECLS + +REGISTER_BACKEND_EVIS( group_norm, _setup ) + diff --git a/src/tim/vx/internal/src/kernel/evis/instance_normalization_evis.c b/src/tim/vx/internal/src/kernel/evis/instance_normalization_evis.c index b893e74..ecb7014 100644 --- a/src/tim/vx/internal/src/kernel/evis/instance_normalization_evis.c +++ b/src/tim/vx/internal/src/kernel/evis/instance_normalization_evis.c @@ -53,6 +53,10 @@ typedef enum #define KERNEL_SOURCE_2 "instance_normalization_u8" #define KERNEL_SOURCE_3 "instance_normalization_i16" #define KERNEL_SOURCE_4 "instance_normalization_f16" +#define KERNEL_SOURCE_5 "instance_normalization_u8_f16" +#define KERNEL_SOURCE_6 "instance_normalization_scale_f32" +#define KERNEL_SOURCE_7 "instance_normalization_scale_f32_f16" +#define KERNEL_SOURCE_8 "instance_normalization_scale_f32_bf16" #define HASH_INSTANCENORM_MEAN_VARI_SH_KERNEL_NAME(SRC0_TYPE) \ CVIVANTE_NAMESPACE("evis.instance_norm_meanvari_"#SRC0_TYPE) @@ -66,6 +70,12 @@ typedef enum #define HASH_INSTANCENORM_SH_KERNEL_2D_NAME(SRC0_TYPE, DST_TYPE) \ CVIVANTE_NAMESPACE("evis.instance_norm_"#SRC0_TYPE"to"#DST_TYPE"_2D") +#define HASH_INSTANCENORM_SCALE_SH_KERNEL_NAME(SRC0_TYPE, DST_TYPE) \ + CVIVANTE_NAMESPACE("evis.instance_norm_"#SRC0_TYPE"F32to"#DST_TYPE) + +#define HASH_INSTANCENORM_SCALE_SH_KERNEL_2D_NAME(SRC0_TYPE, DST_TYPE) \ + CVIVANTE_NAMESPACE("evis.instance_norm_"#SRC0_TYPE"F32to"#DST_TYPE"_2D") + // Add kernel hashtable here // mean vari #define HASH_INSTANCENORM_MEAN_VARI_KEY(_input0_type, _output_type, _reshape_flag) \ @@ -82,19 +92,29 @@ typedef enum SOURCE }, // normalization -#define HASH_INSTANCENORM_KEY(_input0_type, _output_type, _reshape_flag) \ - ((_input0_type << 24) | (_output_type << 16) | (_reshape_flag << 8)) +#define HASH_INSTANCENORM_KEY(_input0_type, _input1_type, _output_type, _reshape_flag) \ + ((_input0_type << 24) | (_input1_type << 16) | (_output_type << 8) | (_reshape_flag << 4)) #define TENSOR_INSTANCENORM_KERNELS(IN0_TYPE, OUT_TYPE, SOURCE) \ - { HASH_INSTANCENORM_KEY(IN0_TYPE, OUT_TYPE, 0), \ + { HASH_INSTANCENORM_KEY(IN0_TYPE, F16, OUT_TYPE, 0), \ HASH_INSTANCENORM_SH_KERNEL_NAME(IN0_TYPE, OUT_TYPE), \ SOURCE }, #define TENSOR_INSTANCENORM_KERNELS_2D(IN0_TYPE, OUT_TYPE, SOURCE) \ - { HASH_INSTANCENORM_KEY(IN0_TYPE, OUT_TYPE, 1), \ + { HASH_INSTANCENORM_KEY(IN0_TYPE, F16, OUT_TYPE, 1), \ HASH_INSTANCENORM_SH_KERNEL_2D_NAME(IN0_TYPE, OUT_TYPE), \ SOURCE }, +#define TENSOR_INSTANCENORM_SCALE_KERNELS(IN0_TYPE, OUT_TYPE, SOURCE) \ + { HASH_INSTANCENORM_KEY(IN0_TYPE, F32, OUT_TYPE, 0), \ + HASH_INSTANCENORM_SCALE_SH_KERNEL_NAME(IN0_TYPE, OUT_TYPE), \ + SOURCE }, + +#define TENSOR_INSTANCENORM_SCALE_KERNELS_2D(IN0_TYPE, OUT_TYPE, SOURCE) \ + { HASH_INSTANCENORM_KEY(IN0_TYPE, F32, OUT_TYPE, 1), \ + HASH_INSTANCENORM_SCALE_SH_KERNEL_2D_NAME(IN0_TYPE, OUT_TYPE), \ + SOURCE }, + typedef struct { uint32_t key; @@ -113,6 +133,8 @@ static const _kernel_map_type _instancenorm_mean_vari_kernel_map[] = 
TENSOR_INSTANCENORM_MEAN_VARI_KERNELS_2D( I16, F32, KERNEL_SOURCE_3 ) TENSOR_INSTANCENORM_MEAN_VARI_KERNELS( F16, F32, KERNEL_SOURCE_4 ) TENSOR_INSTANCENORM_MEAN_VARI_KERNELS_2D( F16, F32, KERNEL_SOURCE_4 ) + TENSOR_INSTANCENORM_MEAN_VARI_KERNELS( BF16, F32, KERNEL_SOURCE_8 ) + TENSOR_INSTANCENORM_MEAN_VARI_KERNELS_2D( BF16, F32, KERNEL_SOURCE_8 ) }; static const _kernel_map_type _instancenorm_kernel_map[] = @@ -125,8 +147,8 @@ static const _kernel_map_type _instancenorm_kernel_map[] = TENSOR_INSTANCENORM_KERNELS( U8, U8, KERNEL_SOURCE_2 ) TENSOR_INSTANCENORM_KERNELS_2D( U8, U8, KERNEL_SOURCE_2 ) - TENSOR_INSTANCENORM_KERNELS( U8, F16, KERNEL_SOURCE_2 ) - TENSOR_INSTANCENORM_KERNELS_2D( U8, F16, KERNEL_SOURCE_2 ) + TENSOR_INSTANCENORM_KERNELS( U8, F16, KERNEL_SOURCE_5 ) + TENSOR_INSTANCENORM_KERNELS_2D( U8, F16, KERNEL_SOURCE_5 ) TENSOR_INSTANCENORM_KERNELS( I16, I16, KERNEL_SOURCE_3 ) TENSOR_INSTANCENORM_KERNELS_2D( I16, I16, KERNEL_SOURCE_3 ) @@ -135,6 +157,21 @@ static const _kernel_map_type _instancenorm_kernel_map[] = TENSOR_INSTANCENORM_KERNELS( F16, F16, KERNEL_SOURCE_4 ) TENSOR_INSTANCENORM_KERNELS_2D( F16, F16, KERNEL_SOURCE_4 ) + + TENSOR_INSTANCENORM_SCALE_KERNELS( U8, U8, KERNEL_SOURCE_6 ) + TENSOR_INSTANCENORM_SCALE_KERNELS_2D( U8, U8, KERNEL_SOURCE_6 ) + + TENSOR_INSTANCENORM_SCALE_KERNELS( I8, I8, KERNEL_SOURCE_6 ) + TENSOR_INSTANCENORM_SCALE_KERNELS_2D( I8, I8, KERNEL_SOURCE_6 ) + + TENSOR_INSTANCENORM_SCALE_KERNELS( I16, I16, KERNEL_SOURCE_6 ) + TENSOR_INSTANCENORM_SCALE_KERNELS_2D( I16, I16, KERNEL_SOURCE_6 ) + + TENSOR_INSTANCENORM_SCALE_KERNELS( F16, F16, KERNEL_SOURCE_7 ) + TENSOR_INSTANCENORM_SCALE_KERNELS_2D( F16, F16, KERNEL_SOURCE_7 ) + + TENSOR_INSTANCENORM_SCALE_KERNELS( BF16, BF16, KERNEL_SOURCE_8 ) + TENSOR_INSTANCENORM_SCALE_KERNELS_2D( BF16, BF16, KERNEL_SOURCE_8 ) }; /* @@ -254,7 +291,7 @@ DEF_KERNEL_INITIALIZER(_instancenorm_mean_vari_initializer) { shaderParam.global_size[0] = (width + 255) / 256 * 16; } - else if (attr[0]->dtype == I16 || attr[0]->dtype == F16) + else if (attr[0]->dtype == I16 || attr[0]->dtype == F16 || attr[0]->dtype == BF16) { shaderParam.global_size[0] = (width + 127) / 128 * 16; } @@ -350,6 +387,32 @@ DEF_KERNEL_INITIALIZER(_instancenorm_mean_vari_initializer) status = vsi_nn_kernel_gpu_add_param(node, "uniFp16SumSqr_dp8x2", &uniFp16SumSqr_dp8x2); CHECK_STATUS_FAIL_GOTO(status, OnError ); } + else if (attr[0]->dtype == BF16) + { + gpu_dp_inst_t uniConvBF16toF32_Part0_2x8 = {{ + 0x11111111, // TCfg + 0x01010101, // ASelt + 0x01050004, 0x03070206, // ABin + 0x22222222, // BSelt + 0x00000000, 0x00000000, // BBin + 0x00000600, // AccumType, ConstantType, and PostShift + 0x00000001, 0x00000001, 0x00000001, 0x00000001, + 0x00000001, 0x00000001, 0x00000001, 0x00000001 // Constant + }, GPU_DP_TYPE_16}; + gpu_dp_inst_t uniConvBF16toF32_Part1_2x8 = {{ + 0x11111111, // TCfg + 0x01010101, // ASelt + 0x05050404, 0x07070606, // ABin + 0x22222222, // BSelt + 0x00000000, 0x00000000, // BBin + 0x00000600, // AccumType, ConstantType, and PostShift + 0x00000001, 0x00000001, 0x00000001, 0x00000001, + 0x00000001, 0x00000001, 0x00000001, 0x00000001 // Constant + }, GPU_DP_TYPE_16}; + status = vsi_nn_kernel_gpu_add_param(node, "uniConvBF16toF32_Part0_2x8", &uniConvBF16toF32_Part0_2x8); + status |= vsi_nn_kernel_gpu_add_param(node, "uniConvBF16toF32_Part1_2x8", &uniConvBF16toF32_Part1_2x8); + CHECK_STATUS_FAIL_GOTO(status, OnError ); + } status = vsi_nn_kernel_gpu_add_param(node, "width", &width); status |= vsi_nn_kernel_gpu_add_param(node, "height", 
&height); @@ -385,15 +448,13 @@ DEF_KERNEL_INITIALIZER(_instancenorm_initializer) {0, 0, 0}, // localWorkSize: local group size in thread {0, 0, 0}}; // globalWorkSize: image size in thread - vsi_nn_kernel_tensor_attr_t* attr[3] = {NULL, NULL}; + vsi_nn_kernel_tensor_attr_t* attr[4] = {NULL, NULL}; vsi_int_array_t * input_shape = NULL; float scaleIn = 1.0f; float scaleOut = 1.0f; - float reScaleOut_u8 = 1.0f; float scale_inOut = 1.0f; int32_t output_zp = 0; int32_t input_zp = 0; - float in_scale_fl = 1, out_scale_fl = 1, inOut_fl_scale = 1; float dimRatio = 0; vx_uint32 group_num = 0; vx_int32 height = 0, width = 0, chn = 0; @@ -401,10 +462,12 @@ DEF_KERNEL_INITIALIZER(_instancenorm_initializer) attr[0] = vsi_nn_kernel_tensor_attr_create( (vsi_nn_kernel_tensor_t)param[0] ); CHECK_PTR_FAIL_GOTO( attr[0], "Create tensor attr buffer fail.", OnError ); - attr[1] = vsi_nn_kernel_tensor_attr_create( (vsi_nn_kernel_tensor_t)param[3] ); + attr[1] = vsi_nn_kernel_tensor_attr_create( (vsi_nn_kernel_tensor_t)param[2] ); CHECK_PTR_FAIL_GOTO( attr[1], "Create tensor attr buffer fail.", OnError ); - attr[2] = vsi_nn_kernel_tensor_attr_create( (vsi_nn_kernel_tensor_t)param[4] ); + attr[2] = vsi_nn_kernel_tensor_attr_create( (vsi_nn_kernel_tensor_t)param[3] ); CHECK_PTR_FAIL_GOTO( attr[2], "Create tensor attr buffer fail.", OnError ); + attr[3] = vsi_nn_kernel_tensor_attr_create( (vsi_nn_kernel_tensor_t)param[4] ); + CHECK_PTR_FAIL_GOTO( attr[3], "Create tensor attr buffer fail.", OnError ); status = vsi_nn_kernel_scalar_read_int32((vsi_nn_kernel_scalar_t)param[6], &rsFlg); CHECK_STATUS_FAIL_GOTO(status, OnError ); @@ -420,43 +483,39 @@ DEF_KERNEL_INITIALIZER(_instancenorm_initializer) { if (attr[0]->dfp.fl > 0) { - in_scale_fl = (1.0f / ((float) ((int64_t)1 << attr[0]->dfp.fl))); + scaleIn = (1.0f / ((float) ((int64_t)1 << attr[0]->dfp.fl))); } else { - in_scale_fl = ((float) ((int64_t)1 << -attr[0]->dfp.fl)); + scaleIn = ((float) ((int64_t)1 << -attr[0]->dfp.fl)); } input_zp = 0; } - if (attr[2]->quant == VSI_NN_KERNEL_QUANT_ASYMM) + if (attr[3]->quant == VSI_NN_KERNEL_QUANT_ASYMM) { - output_zp = attr[2]->asymm.zero_point; - scaleOut = attr[2]->asymm.scale; - reScaleOut_u8 = 1 / scaleOut; + output_zp = attr[3]->asymm.zero_point; + scaleOut = attr[3]->asymm.scale; + scaleOut = 1 / scaleOut; } - else if (attr[2]->quant == VSI_NN_KERNEL_QUANT_DFP) + else if (attr[3]->quant == VSI_NN_KERNEL_QUANT_DFP) { - if (attr[2]->dfp.fl > 0) + if (attr[3]->dfp.fl > 0) { - out_scale_fl = (float)((int64_t)1 << attr[2]->dfp.fl); + scaleOut = (float)((int64_t)1 << attr[3]->dfp.fl); } else { - out_scale_fl = (1.0f / (float)((int64_t)1 << -attr[2]->dfp.fl)); + scaleOut = (1.0f / (float)((int64_t)1 << -attr[3]->dfp.fl)); } output_zp = 0; } - if ((attr[0]->quant == VSI_NN_KERNEL_QUANT_DFP) - && (attr[2]->quant == VSI_NN_KERNEL_QUANT_DFP)) - { - inOut_fl_scale = in_scale_fl * out_scale_fl; - } + scale_inOut = scaleIn * scaleOut; width = input_shape->data[0]; height = input_shape->data[1]; - chn = attr[1]->shape->data[1]; + chn = attr[2]->shape->data[1]; if (rsFlg) { height = height / chn; @@ -467,7 +526,7 @@ DEF_KERNEL_INITIALIZER(_instancenorm_initializer) group_num = (width + 255) / 256; shaderParam.global_scale[0] = 16; - if (attr[0]->dtype == I16 || attr[0]->dtype == F16) + if (attr[0]->dtype == I16 || attr[0]->dtype == F16 || attr[0]->dtype == BF16) { shaderParam.global_scale[0] = 8; group_num = (width + 127) / 128; @@ -630,23 +689,52 @@ DEF_KERNEL_INITIALIZER(_instancenorm_initializer) 0x00003c00, 0x00003c00, 0x00003c00, 
0x00003c00, 0x00003c00, 0x00003c00, 0x00003c00, 0x00003c00 // Constant }, GPU_DP_TYPE_16 }; - uint32_t pack_key = 0; -#define _PACK_SELECT_KEY( IN0_TYPE, OUT_TYPE ) \ - (IN0_TYPE | (OUT_TYPE << 8)) + gpu_dp_inst_t uniConvBF16toF32_Part0_2x8 = {{ + 0x11111111, // TCfg + 0x01010101, // ASelt + 0x01050004, 0x03070206, // ABin + 0x22222222, // BSelt + 0x00000000, 0x00000000, // BBin + 0x00000600, // AccumType, ConstantType, and PostShift + 0x00000001, 0x00000001, 0x00000001, 0x00000001, + 0x00000001, 0x00000001, 0x00000001, 0x00000001 // Constant + }, GPU_DP_TYPE_16}; + gpu_dp_inst_t uniConvBF16toF32_Part1_2x8 = {{ + 0x11111111, // TCfg + 0x01010101, // ASelt + 0x05050404, 0x07070606, // ABin + 0x22222222, // BSelt + 0x00000000, 0x00000000, // BBin + 0x00000600, // AccumType, ConstantType, and PostShift + 0x00000001, 0x00000001, 0x00000001, 0x00000001, + 0x00000001, 0x00000001, 0x00000001, 0x00000001 // Constant + }, GPU_DP_TYPE_16}; + gpu_dp_inst_t uniExtractOddData_2x8 = {{ + 0x11111111, // TCfg + 0x11110000, // ASelt + 0x07050301, 0x07050301, // ABin + 0x22222222, // BSelt + 0x00000000, 0x00000000, // BBin + 0x00000600, // AccumType, ConstantType, and PostShift + 0x00000001, 0x00000001, 0x00000001, 0x00000001, + 0x00000001, 0x00000001, 0x00000001, 0x00000001 // Constant + }, GPU_DP_TYPE_16}; - pack_key = _PACK_SELECT_KEY( attr[0]->dtype, attr[2]->dtype ); + uint32_t pack_key = 0; +#define _PACK_SELECT_KEY( IN0_TYPE, IN1_TYPE, OUT_TYPE ) \ + (IN0_TYPE | (IN1_TYPE << 8) | (OUT_TYPE << 16)) + + pack_key = _PACK_SELECT_KEY( attr[0]->dtype, attr[1]->dtype, attr[3]->dtype ); status = vsi_nn_kernel_gpu_add_param(node, "height", &height); status |= vsi_nn_kernel_gpu_add_param(node, "dimRatio", &dimRatio); status |= vsi_nn_kernel_gpu_add_param(node, "group_num", &group_num); - status |= vsi_nn_kernel_gpu_add_param(node, "UniFP16toFP32Lo4_dp4x4", &UniFP16toFP32Lo4_dp4x4); - status |= vsi_nn_kernel_gpu_add_param(node, "uniConvertHalfToFp16_2x8", &uniConvertHalfToFp16_2x8); CHECK_STATUS_FAIL_GOTO(status, OnError ); switch( pack_key ) { - case _PACK_SELECT_KEY( I8, I8 ): - case _PACK_SELECT_KEY( I8, F16 ): + case _PACK_SELECT_KEY( I8, F16, I8 ): + case _PACK_SELECT_KEY( I8, F16, F16 ): { status = vsi_nn_kernel_gpu_add_param(node, "uniConvertInt32toUint8_2x8", &uniConvertInt32toUint8_2x8); @@ -658,15 +746,42 @@ DEF_KERNEL_INITIALIZER(_instancenorm_initializer) &uniConvertTrdUint8Fp32_4x4); status |= vsi_nn_kernel_gpu_add_param(node, "uniConvertFthInt8Fp32_4x4", &uniConvertFthUint8Fp32_4x4); - status |= vsi_nn_kernel_gpu_add_param(node, "input_fl_scale", &in_scale_fl); + status |= vsi_nn_kernel_gpu_add_param(node, "uniConvertHalfToFp16_2x8", + &uniConvertHalfToFp16_2x8); + status |= vsi_nn_kernel_gpu_add_param(node, "UniFP16toFP32Lo4_dp4x4", + &UniFP16toFP32Lo4_dp4x4); + status |= vsi_nn_kernel_gpu_add_param(node, "input_fl_scale", &scaleIn); - status |= vsi_nn_kernel_gpu_add_param(node, "output_fl_scale", &out_scale_fl); - status |= vsi_nn_kernel_gpu_add_param(node, "inOut_fl_scale", &inOut_fl_scale); + status |= vsi_nn_kernel_gpu_add_param(node, "output_fl_scale", &scaleOut); + status |= vsi_nn_kernel_gpu_add_param(node, "inOut_fl_scale", &scale_inOut); CHECK_STATUS_FAIL_GOTO(status, OnError ); } break; - case _PACK_SELECT_KEY( U8, U8 ): - case _PACK_SELECT_KEY( U8, F16 ): + case _PACK_SELECT_KEY( U8, F16, U8 ): + { + status = vsi_nn_kernel_gpu_add_param(node, "uniConvertInt32toUint8_2x8", + &uniConvertInt32toUint8_2x8); + status |= vsi_nn_kernel_gpu_add_param(node, "uniConvert1stUint8SubZpToFp32_4x4", 
+ &uniConvert1stUint8SubZpToFp32_4x4); + status |= vsi_nn_kernel_gpu_add_param(node, "uniConvert2ndUint8SubZpToFp32_4x4", + &uniConvert2ndUint8SubZpToFp32_4x4); + status |= vsi_nn_kernel_gpu_add_param(node, "uniConvert3rdUint8SubZpToFp32_4x4", + &uniConvert3rdUint8SubZpToFp32_4x4); + status |= vsi_nn_kernel_gpu_add_param(node, "uniConvert4thUint8SubZpToFp32_4x4", + &uniConvert4thUint8SubZpToFp32_4x4); + status |= vsi_nn_kernel_gpu_add_param(node, "UniFP16toFP32Lo4_dp4x4", + &UniFP16toFP32Lo4_dp4x4); + status |= vsi_nn_kernel_gpu_add_param(node, "inputZP", &input_zp); + status |= vsi_nn_kernel_gpu_add_param(node, "input_scale", &scaleIn); + + status |= vsi_nn_kernel_gpu_add_param(node, "output_ZP", &output_zp); + status |= vsi_nn_kernel_gpu_add_param(node, "outputScale", &scaleOut); + status |= vsi_nn_kernel_gpu_add_param(node, "scale_inOut", &scale_inOut); + CHECK_STATUS_FAIL_GOTO(status, OnError ); + } + break; + case _PACK_SELECT_KEY( U8, F32, U8 ): + case _PACK_SELECT_KEY( I8, F32, I8 ): { status = vsi_nn_kernel_gpu_add_param(node, "uniConvertInt32toUint8_2x8", &uniConvertInt32toUint8_2x8); @@ -679,37 +794,85 @@ DEF_KERNEL_INITIALIZER(_instancenorm_initializer) status |= vsi_nn_kernel_gpu_add_param(node, "uniConvert4thUint8SubZpToFp32_4x4", &uniConvert4thUint8SubZpToFp32_4x4); status |= vsi_nn_kernel_gpu_add_param(node, "inputZP", &input_zp); - status |= vsi_nn_kernel_gpu_add_param(node, "input_scale", &scaleIn); status |= vsi_nn_kernel_gpu_add_param(node, "output_ZP", &output_zp); - status |= vsi_nn_kernel_gpu_add_param(node, "outputScale", &reScaleOut_u8); - - scale_inOut = reScaleOut_u8 * scaleIn; + status |= vsi_nn_kernel_gpu_add_param(node, "outputScale", &scaleOut); status |= vsi_nn_kernel_gpu_add_param(node, "scale_inOut", &scale_inOut); CHECK_STATUS_FAIL_GOTO(status, OnError ); } break; - case _PACK_SELECT_KEY( I16, I16 ): - case _PACK_SELECT_KEY( I16, F16 ): + case _PACK_SELECT_KEY( U8, F16, F16 ): + { + status = vsi_nn_kernel_gpu_add_param(node, "uniConvert1stUint8SubZpToFp32_4x4", + &uniConvert1stUint8SubZpToFp32_4x4); + status |= vsi_nn_kernel_gpu_add_param(node, "uniConvert2ndUint8SubZpToFp32_4x4", + &uniConvert2ndUint8SubZpToFp32_4x4); + status |= vsi_nn_kernel_gpu_add_param(node, "uniConvert3rdUint8SubZpToFp32_4x4", + &uniConvert3rdUint8SubZpToFp32_4x4); + status |= vsi_nn_kernel_gpu_add_param(node, "uniConvert4thUint8SubZpToFp32_4x4", + &uniConvert4thUint8SubZpToFp32_4x4); + status |= vsi_nn_kernel_gpu_add_param(node, "uniConvertHalfToFp16_2x8", + &uniConvertHalfToFp16_2x8); + status |= vsi_nn_kernel_gpu_add_param(node, "UniFP16toFP32Lo4_dp4x4", + &UniFP16toFP32Lo4_dp4x4); + status |= vsi_nn_kernel_gpu_add_param(node, "inputZP", &input_zp); + status |= vsi_nn_kernel_gpu_add_param(node, "input_scale", &scaleIn); + CHECK_STATUS_FAIL_GOTO(status, OnError ); + } + break; + case _PACK_SELECT_KEY( I16, F16, I16 ): + case _PACK_SELECT_KEY( I16, F16, F16 ): { status = vsi_nn_kernel_gpu_add_param(node, "uniConvertInt16Fp32Fst_4x4", &uniConvertInt16Fp32Fst_4x4); status |= vsi_nn_kernel_gpu_add_param(node, "uniConvertInt16Fp32Secd_4x4", &uniConvertInt16Fp32Secd_4x4); - status |= vsi_nn_kernel_gpu_add_param(node, "input_fl_scale", &in_scale_fl); + status |= vsi_nn_kernel_gpu_add_param(node, "input_fl_scale", &scaleIn); status |= vsi_nn_kernel_gpu_add_param(node, "uniConvertInt32toInt16_2x8", &uniConvertInt32toInt16_2x8); - status |= vsi_nn_kernel_gpu_add_param(node, "output_fl_scale", &out_scale_fl); - - status |= vsi_nn_kernel_gpu_add_param(node, "inOut_fl_scale", &inOut_fl_scale); 
+ status |= vsi_nn_kernel_gpu_add_param(node, "uniConvertHalfToFp16_2x8", + &uniConvertHalfToFp16_2x8); + status |= vsi_nn_kernel_gpu_add_param(node, "UniFP16toFP32Lo4_dp4x4", + &UniFP16toFP32Lo4_dp4x4); + status |= vsi_nn_kernel_gpu_add_param(node, "output_fl_scale", &scaleOut); + status |= vsi_nn_kernel_gpu_add_param(node, "inOut_fl_scale", &scale_inOut); CHECK_STATUS_FAIL_GOTO(status, OnError ); } break; - case _PACK_SELECT_KEY( F16, F16 ): + case _PACK_SELECT_KEY( I16, F32, I16 ): + { + status = vsi_nn_kernel_gpu_add_param(node, "uniConvertInt16Fp32Fst_4x4", + &uniConvertInt16Fp32Fst_4x4); + status |= vsi_nn_kernel_gpu_add_param(node, "uniConvertInt16Fp32Secd_4x4", + &uniConvertInt16Fp32Secd_4x4); + status |= vsi_nn_kernel_gpu_add_param(node, "uniConvertInt32toInt16_2x8", + &uniConvertInt32toInt16_2x8); + status |= vsi_nn_kernel_gpu_add_param(node, "output_fl_scale", &scaleOut); + status |= vsi_nn_kernel_gpu_add_param(node, "inOut_fl_scale", &scale_inOut); + CHECK_STATUS_FAIL_GOTO(status, OnError ); + } + break; + case _PACK_SELECT_KEY( F16, F16, F16 ): + case _PACK_SELECT_KEY( F16, F32, F16 ): { status = vsi_nn_kernel_gpu_add_param(node, "uniConvertEndInt16Fp32_4x4", &uniConvertEndInt16Fp32_4x4); + status |= vsi_nn_kernel_gpu_add_param(node, "uniConvertHalfToFp16_2x8", + &uniConvertHalfToFp16_2x8); + status |= vsi_nn_kernel_gpu_add_param(node, "UniFP16toFP32Lo4_dp4x4", + &UniFP16toFP32Lo4_dp4x4); + CHECK_STATUS_FAIL_GOTO(status, OnError ); + } + break; + case _PACK_SELECT_KEY( BF16, F32, BF16 ): + { + status = vsi_nn_kernel_gpu_add_param( node, + "uniConvBF16toF32_Part0_2x8", &uniConvBF16toF32_Part0_2x8 ); + status |= vsi_nn_kernel_gpu_add_param( node, + "uniConvBF16toF32_Part1_2x8", &uniConvBF16toF32_Part1_2x8 ); + status |= vsi_nn_kernel_gpu_add_param( node, + "uniExtractOddData_2x8", &uniExtractOddData_2x8 ); CHECK_STATUS_FAIL_GOTO(status, OnError ); } break; @@ -736,6 +899,11 @@ OnError: vsi_nn_kernel_tensor_attr_release( &attr[2] ); attr[2] = NULL; } + if (attr[3]) + { + vsi_nn_kernel_tensor_attr_release( &attr[3] ); + attr[3] = NULL; + } return status; } @@ -826,11 +994,13 @@ static vsi_nn_kernel_node_t _setup vsi_nn_kernel_node_t tmp_node = NULL; vsi_nn_kernel_node_t node = NULL; vsi_nn_kernel_dtype_e in0_dtype = U8; + vsi_nn_kernel_dtype_e in1_dtype = F16; vsi_nn_kernel_dtype_e out_dtype = U8; vsi_nn_tensor_attr_t attr; vsi_nn_kernel_t * ikernels[INTERNAL_KERNEL_SIZE] = { NULL }; vsi_nn_tensor_t * tensors[INTERNAL_KERNEL_SIZE] = { NULL }; vsi_nn_kernel_tensor_t rs_input = NULL, rs_output = NULL, rs_gamma = NULL, rs_beta = NULL; + int32_t shape[VSI_NN_MAX_DIM_NUM] = {0}; uint32_t hashkeys[INTERNAL_KERNEL_SIZE] = { 0 }; uint32_t hashkey = 0; int32_t i = 0; @@ -851,29 +1021,12 @@ static vsi_nn_kernel_node_t _setup ikernels[i]->unique_id = kernel->unique_id; } - memset( &attr, 0, sizeof(vsi_nn_tensor_attr_t) ); - attr.dtype.vx_type = VSI_NN_TYPE_FLOAT32; - attr.is_const = FALSE; - attr.vtl = TRUE; - - attr.size[0] = ((inputs[0]->attr.size[0] + 255) / 256) * 4; - - if ( inputs[0]->attr.dtype.vx_type == VSI_NN_TYPE_INT16 - || inputs[0]->attr.dtype.vx_type == VSI_NN_TYPE_FLOAT16) - { - attr.size[0] = ((inputs[0]->attr.size[0] + 127) / 128) * 4; - } - attr.size[1] = inputs[0]->attr.dim_num > 2 ? inputs[0]->attr.size[2] : 1; - attr.size[2] = 1; - attr.size[3] = inputs[0]->attr.dim_num > 3 ? 
inputs[0]->attr.size[3] : 1; - attr.dim_num = 4; - tensors[MEAN_VARI_INDEX] = vsi_nn_CreateTensor( graph, &attr ); - in0_dtype = vsi_nn_kernel_map_dtype( inputs[0]->attr.dtype.vx_type ); + in1_dtype = vsi_nn_kernel_map_dtype( inputs[2]->attr.dtype.vx_type ); out_dtype = vsi_nn_kernel_map_dtype( outputs[0]->attr.dtype.vx_type ); hashkeys[MEAN_VARI_INDEX]= HASH_INSTANCENORM_MEAN_VARI_KEY( in0_dtype, F32, reshape_flg ); - hashkey = HASH_INSTANCENORM_KEY( in0_dtype, out_dtype, reshape_flg ); + hashkey = HASH_INSTANCENORM_KEY( in0_dtype, in1_dtype, out_dtype, reshape_flg ); status = _query_kernel( ikernels[MEAN_VARI_INDEX], hashkeys[MEAN_VARI_INDEX], INTERNAL_KERNEL_MEAN_VARI ); if ( VSI_SUCCESS != status ) @@ -888,22 +1041,54 @@ static vsi_nn_kernel_node_t _setup if (reshape_flg) { - int32_t shape[VSI_NN_MAX_DIM_NUM] = {0}; shape[0] = inputs[0]->attr.size[0]; shape[1] = inputs[0]->attr.size[1] * inputs[0]->attr.size[2]; shape[2] = 1; shape[3] = inputs[0]->attr.dim_num > 3 ? inputs[0]->attr.size[3] : 1; rs_input = vsi_nn_kernel_tensor_reshape( inputs[0]->t, shape, 4 ); - - shape[0] = outputs[0]->attr.size[0]; - shape[1] = outputs[0]->attr.size[1] * outputs[0]->attr.size[2]; - shape[2] = 1; - shape[3] = outputs[0]->attr.dim_num > 3 ? outputs[0]->attr.size[3] : 1; rs_output = vsi_nn_kernel_tensor_reshape( outputs[0]->t, shape, 4 ); } + else if (inputs[0]->attr.size[0] * inputs[0]->attr.size[1] < GPU_TENSOR_MAX_WIDTH) + { + shape[0] = inputs[0]->attr.size[0] * inputs[0]->attr.size[1]; + shape[1] = 1; + shape[2] = inputs[0]->attr.size[2]; + shape[3] = inputs[0]->attr.dim_num > 3 ? inputs[0]->attr.size[3] : 1; + rs_input = vsi_nn_kernel_tensor_reshape( inputs[0]->t, shape, 4 ); + rs_output = vsi_nn_kernel_tensor_reshape( outputs[0]->t, shape, 4 ); + } + else if (inputs[0]->attr.size[0] < inputs[0]->attr.size[1]) + { + shape[0] = inputs[0]->attr.size[1]; + shape[1] = inputs[0]->attr.size[0]; + shape[2] = inputs[0]->attr.size[2]; + shape[3] = inputs[0]->attr.dim_num > 3 ? inputs[0]->attr.size[3] : 1; + rs_input = vsi_nn_kernel_tensor_reshape( inputs[0]->t, shape, 4 ); + rs_output = vsi_nn_kernel_tensor_reshape( outputs[0]->t, shape, 4 ); + } + else + { + shape[0] = inputs[0]->attr.size[0]; + } + + memset( &attr, 0, sizeof(vsi_nn_tensor_attr_t) ); + attr.dtype.vx_type = VSI_NN_TYPE_FLOAT32; + attr.is_const = FALSE; + attr.vtl = TRUE; + attr.size[0] = ((shape[0] + 255) / 256) * 4; + if ( inputs[0]->attr.dtype.vx_type == VSI_NN_TYPE_INT16 + || inputs[0]->attr.dtype.vx_type == VSI_NN_TYPE_FLOAT16) + { + attr.size[0] = ((shape[0] + 127) / 128) * 4; + } + attr.size[1] = inputs[0]->attr.dim_num > 2 ? inputs[0]->attr.size[2] : 1; + attr.size[2] = 1; + attr.size[3] = inputs[0]->attr.dim_num > 3 ? 
inputs[0]->attr.size[3] : 1; + attr.dim_num = 4; + tensors[MEAN_VARI_INDEX] = vsi_nn_CreateTensor( graph, &attr ); + if (inputs[1]->attr.dim_num < 2) { - int32_t shape[VSI_NN_MAX_DIM_NUM] = {0}; shape[0] = inputs[1]->attr.size[0]; shape[1] = 1; shape[2] = 1; @@ -912,7 +1097,6 @@ static vsi_nn_kernel_node_t _setup } if (inputs[2]->attr.dim_num < 2) { - int32_t shape[VSI_NN_MAX_DIM_NUM] = {0}; shape[0] = inputs[2]->attr.size[0]; shape[1] = 1; shape[2] = 1; @@ -925,7 +1109,7 @@ static vsi_nn_kernel_node_t _setup if (tmp_node) { uint32_t index = 0; - if (reshape_flg) + if (rs_input) { mean_vari_node_params[index++] = rs_input; vsi_nn_kernel_node_pack_io( &mean_vari_node_params[index], @@ -967,7 +1151,7 @@ static vsi_nn_kernel_node_t _setup if (node) { uint32_t index = 0; - if (reshape_flg) + if (rs_input) { node_params[index++] = rs_input; } @@ -992,7 +1176,7 @@ static vsi_nn_kernel_node_t _setup node_params[index++] = (vsi_nn_kernel_node_param_t)inputs[2]->t; } node_params[index++] = (vsi_nn_kernel_node_param_t)tensors[MEAN_VARI_INDEX]->t; - if (reshape_flg) + if (rs_output) { node_params[index++] = rs_output; } @@ -1034,9 +1218,12 @@ final: { vsi_nn_kernel_tensor_release( &rs_gamma ); } - if (reshape_flg) + if (rs_input) { vsi_nn_kernel_tensor_release( &rs_input ); + } + if (rs_output) + { vsi_nn_kernel_tensor_release( &rs_output ); } for( i = 0; i < INTERNAL_KERNEL_SIZE; i ++ ) diff --git a/src/tim/vx/internal/src/kernel/evis/layer_normalization_evis.c b/src/tim/vx/internal/src/kernel/evis/layer_normalization_evis.c index 238eb23..d6c4b8a 100644 --- a/src/tim/vx/internal/src/kernel/evis/layer_normalization_evis.c +++ b/src/tim/vx/internal/src/kernel/evis/layer_normalization_evis.c @@ -60,6 +60,9 @@ __BEGIN_DECLS #define KERNEL_SOURCE_5 "layer_normalization_wh_f16" #define KERNEL_SOURCE_6 "layer_normalization_i16" #define KERNEL_SOURCE_7 "layer_normalization_wh_i16" +#define KERNEL_SOURCE_8 "layer_normalization_scale_f32" +#define KERNEL_SOURCE_9 "layer_normalization_scale_f32_2d" +#define KERNEL_SOURCE_10 "layer_normalization_scale_f32_bf16" #define HASH_LAYERNORM_SH_KERNEL_NAME(SRC0_TYPE, DST_TYPE) \ @@ -68,20 +71,36 @@ __BEGIN_DECLS #define HASH_LAYERNORM_SH_KERNEL_2D_NAME(SRC0_TYPE, DST_TYPE) \ CVIVANTE_NAMESPACE("evis.layer_norm_"#SRC0_TYPE"to"#DST_TYPE"_2D") +#define HASH_LAYERNORM_SH_KERNEL_SCALE_NAME(SRC0_TYPE, DST_TYPE) \ + CVIVANTE_NAMESPACE("evis.layer_norm_"#SRC0_TYPE"F32to"#DST_TYPE) + +#define HASH_LAYERNORM_SH_KERNEL_SCALE_2D_NAME(SRC0_TYPE, DST_TYPE) \ + CVIVANTE_NAMESPACE("evis.layer_norm_"#SRC0_TYPE"F32to"#DST_TYPE"_2D") + // normalization -#define HASH_LAYERNORM_KEY(_input0_type, _output_type, _reshape_flag) \ - ((_input0_type << 24) | (_output_type << 16) | (_reshape_flag << 8)) +#define HASH_LAYERNORM_KEY(_input0_type, _input2_type, _output_type, _reshape_flag) \ + ((_input0_type << 24) | (_input2_type << 16) | (_output_type << 8) | _reshape_flag) #define TENSOR_LAYERNORM_KERNELS(IN0_TYPE, OUT_TYPE, SOURCE) \ - { HASH_LAYERNORM_KEY(IN0_TYPE, OUT_TYPE, LAYERNORM_KERNEL), \ + { HASH_LAYERNORM_KEY(IN0_TYPE, F16, OUT_TYPE, LAYERNORM_KERNEL), \ HASH_LAYERNORM_SH_KERNEL_NAME(IN0_TYPE, OUT_TYPE), \ SOURCE }, #define TENSOR_LAYERNORM_KERNELS_2D(IN0_TYPE, OUT_TYPE, SOURCE) \ - { HASH_LAYERNORM_KEY(IN0_TYPE, OUT_TYPE, LAYERNORM_2D_KERNEL), \ + { HASH_LAYERNORM_KEY(IN0_TYPE, F16, OUT_TYPE, LAYERNORM_2D_KERNEL), \ HASH_LAYERNORM_SH_KERNEL_2D_NAME(IN0_TYPE, OUT_TYPE), \ SOURCE }, +#define TENSOR_LAYERNORM_SCALE_KERNELS(IN0_TYPE, OUT_TYPE, SOURCE) \ + { HASH_LAYERNORM_KEY(IN0_TYPE, 
F32, OUT_TYPE, LAYERNORM_KERNEL), \ + HASH_LAYERNORM_SH_KERNEL_SCALE_NAME(IN0_TYPE, OUT_TYPE), \ + SOURCE }, + +#define TENSOR_LAYERNORM_SCALE_KERNELS_2D(IN0_TYPE, OUT_TYPE, SOURCE) \ + { HASH_LAYERNORM_KEY(IN0_TYPE, F32, OUT_TYPE, LAYERNORM_2D_KERNEL), \ + HASH_LAYERNORM_SH_KERNEL_SCALE_2D_NAME(IN0_TYPE, OUT_TYPE), \ + SOURCE }, + // greater than max size #define HASH_SUMSQR_SH_KERNEL_NAME(SRC0_TYPE, DST_TYPE) \ CVIVANTE_NAMESPACE("evis.layernorm_wh_sumSqr_"#SRC0_TYPE"to"#DST_TYPE) @@ -96,22 +115,22 @@ __BEGIN_DECLS CVIVANTE_NAMESPACE("evis.layernorm_wh_"#SRC0_TYPE"to"#DST_TYPE"_2D") #define TENSOR_SUMSQR_KERNELS(IN0_TYPE, OUT_TYPE, SOURCE) \ - { HASH_LAYERNORM_KEY(IN0_TYPE, OUT_TYPE, SUMSQR_KERNEL), \ + { HASH_LAYERNORM_KEY(IN0_TYPE, F16, OUT_TYPE, SUMSQR_KERNEL), \ HASH_SUMSQR_SH_KERNEL_NAME(IN0_TYPE, OUT_TYPE), \ SOURCE }, #define TENSOR_SUMSQR_KERNELS_2D(IN0_TYPE, OUT_TYPE, SOURCE) \ - { HASH_LAYERNORM_KEY(IN0_TYPE, OUT_TYPE, SUMSQR_2D_KERNEL), \ + { HASH_LAYERNORM_KEY(IN0_TYPE, F16, OUT_TYPE, SUMSQR_2D_KERNEL), \ HASH_SUMSQR_SH_KERNEL_2D_NAME(IN0_TYPE, OUT_TYPE), \ SOURCE }, #define TENSOR_LAYERNORM_WH_KERNELS(IN0_TYPE, OUT_TYPE, SOURCE) \ - { HASH_LAYERNORM_KEY(IN0_TYPE, OUT_TYPE, LAYERNORM_WH_KERNEL), \ + { HASH_LAYERNORM_KEY(IN0_TYPE, F16, OUT_TYPE, LAYERNORM_WH_KERNEL), \ HASH_LAYERNORM_WH_SH_KERNEL_NAME(IN0_TYPE, OUT_TYPE), \ SOURCE }, #define TENSOR_LAYERNORM_WH_KERNELS_2D(IN0_TYPE, OUT_TYPE, SOURCE) \ - { HASH_LAYERNORM_KEY(IN0_TYPE, OUT_TYPE, LAYERNORM_WH_2D_KERNEL), \ + { HASH_LAYERNORM_KEY(IN0_TYPE, F16, OUT_TYPE, LAYERNORM_WH_2D_KERNEL), \ HASH_LAYERNORM_WH_SH_KERNEL_2D_NAME(IN0_TYPE, OUT_TYPE), \ SOURCE }, @@ -136,6 +155,17 @@ static const _kernel_map_type _layernorm_kernel_map[] = TENSOR_LAYERNORM_KERNELS_2D( F16, U8, KERNEL_SOURCE_2 ) TENSOR_LAYERNORM_KERNELS( I16, I16, KERNEL_SOURCE_6 ) TENSOR_LAYERNORM_KERNELS_2D( I16, I16, KERNEL_SOURCE_6 ) + + TENSOR_LAYERNORM_SCALE_KERNELS( U8, U8, KERNEL_SOURCE_8 ) + TENSOR_LAYERNORM_SCALE_KERNELS_2D( U8, U8, KERNEL_SOURCE_9 ) + TENSOR_LAYERNORM_SCALE_KERNELS( I8, I8, KERNEL_SOURCE_8 ) + TENSOR_LAYERNORM_SCALE_KERNELS_2D( I8, I8, KERNEL_SOURCE_9 ) + TENSOR_LAYERNORM_SCALE_KERNELS( I16, I16, KERNEL_SOURCE_8 ) + TENSOR_LAYERNORM_SCALE_KERNELS_2D( I16, I16, KERNEL_SOURCE_9 ) + TENSOR_LAYERNORM_SCALE_KERNELS( F16, F16, KERNEL_SOURCE_8 ) + TENSOR_LAYERNORM_SCALE_KERNELS_2D( F16, F16, KERNEL_SOURCE_9 ) + TENSOR_LAYERNORM_SCALE_KERNELS( BF16, BF16, KERNEL_SOURCE_10 ) + TENSOR_LAYERNORM_SCALE_KERNELS_2D( BF16, BF16, KERNEL_SOURCE_10 ) }; static const _kernel_map_type _sumsqr_kernel_map[] = @@ -295,8 +325,7 @@ DEF_KERNEL_INITIALIZER(_layernorm_initializer) shaderParam.global_scale[1] = 1; shaderParam.global_scale[2] = 1; shaderParam.global_size[0] = 1; - shaderParam.global_size[1] = gpu_align_p2((height + shaderParam.global_scale[1] - 1) - / shaderParam.global_scale[1], 4); + shaderParam.global_size[1] = height; shaderParam.global_size[2] = chn; status = vsi_nn_kernel_gpu_config( node, &shaderParam ); @@ -424,6 +453,37 @@ DEF_KERNEL_INITIALIZER(_layernorm_initializer) 0x00010001, 0x00010001, 0x00010001, 0x00010001, 0x00000000, 0x00000000, 0x00000000, 0x00000000 // Constant }, GPU_DP_TYPE_16 }; + gpu_dp_inst_t uniConvBF16toF32_Part0_2x8 = {{ + 0x11111111, // TCfg + 0x01010101, // ASelt + 0x01050004, 0x03070206, // ABin + 0x22222222, // BSelt + 0x00000000, 0x00000000, // BBin + 0x00000600, // AccumType, ConstantType, and PostShift + 0x00000001, 0x00000001, 0x00000001, 0x00000001, + 0x00000001, 0x00000001, 0x00000001, 0x00000001 // Constant + 
}, GPU_DP_TYPE_16}; + gpu_dp_inst_t uniConvBF16toF32_Part1_2x8 = {{ + 0x11111111, // TCfg + 0x01010101, // ASelt + 0x05050404, 0x07070606, // ABin + 0x22222222, // BSelt + 0x00000000, 0x00000000, // BBin + 0x00000600, // AccumType, ConstantType, and PostShift + 0x00000001, 0x00000001, 0x00000001, 0x00000001, + 0x00000001, 0x00000001, 0x00000001, 0x00000001 // Constant + }, GPU_DP_TYPE_16}; + gpu_dp_inst_t uniExtractOddData_2x8 = {{ + 0x11111111, // TCfg + 0x11110000, // ASelt + 0x07050301, 0x07050301, // ABin + 0x22222222, // BSelt + 0x00000000, 0x00000000, // BBin + 0x00000600, // AccumType, ConstantType, and PostShift + 0x00000001, 0x00000001, 0x00000001, 0x00000001, + 0x00000001, 0x00000001, 0x00000001, 0x00000001 // Constant + }, GPU_DP_TYPE_16}; + uint32_t pack_key = 0; #define _PACK_SELECT_KEY( IN0_TYPE, IN1_TYPE, OUT_TYPE ) \ (IN0_TYPE | (IN1_TYPE << 16) | (OUT_TYPE << 8)) @@ -432,9 +492,6 @@ DEF_KERNEL_INITIALIZER(_layernorm_initializer) status = vsi_nn_kernel_gpu_add_param(node, "width", &width); status |= vsi_nn_kernel_gpu_add_param(node, "dimRatio", &dimRatio); - status |= vsi_nn_kernel_gpu_add_param(node, "UniFP16toFP32Lo4_dp4x4", &UniFP16toFP32Lo4_dp4x4); - status |= vsi_nn_kernel_gpu_add_param(node, "uniConvertSecFp16Fp32_4x4", &uniConvertSecFp16Fp32_4x4); - status |= vsi_nn_kernel_gpu_add_param(node, "e2InScale", &e2InScale); CHECK_STATUS_FAIL_GOTO(status, OnError ); switch( pack_key ) @@ -453,6 +510,11 @@ DEF_KERNEL_INITIALIZER(_layernorm_initializer) &uniConvert3rdUint8SubZpToFp32_4x4); status |= vsi_nn_kernel_gpu_add_param(node, "uniConvert4thUint8SubZpToFp32_4x4", &uniConvert4thUint8SubZpToFp32_4x4); + status |= vsi_nn_kernel_gpu_add_param(node, "uniConvertSecFp16Fp32_4x4", + &uniConvertSecFp16Fp32_4x4); + status |= vsi_nn_kernel_gpu_add_param(node, "UniFP16toFP32Lo4_dp4x4", + &UniFP16toFP32Lo4_dp4x4); + status |= vsi_nn_kernel_gpu_add_param(node, "e2InScale", &e2InScale); status |= vsi_nn_kernel_gpu_add_param(node, "inputZP", &input_zp); status |= vsi_nn_kernel_gpu_add_param(node, "input_scale", &scaleIn); status |= vsi_nn_kernel_gpu_add_param(node, "sumInZp", &sumInZp); @@ -481,6 +543,11 @@ DEF_KERNEL_INITIALIZER(_layernorm_initializer) &uniConvert3rdUint8SubZpToFp32_4x4); status |= vsi_nn_kernel_gpu_add_param(node, "uniConvert4thUint8SubZpToFp32_4x4", &uniConvert4thUint8SubZpToFp32_4x4); + status |= vsi_nn_kernel_gpu_add_param(node, "uniConvertSecFp16Fp32_4x4", + &uniConvertSecFp16Fp32_4x4); + status |= vsi_nn_kernel_gpu_add_param(node, "UniFP16toFP32Lo4_dp4x4", + &UniFP16toFP32Lo4_dp4x4); + status |= vsi_nn_kernel_gpu_add_param(node, "e2InScale", &e2InScale); status |= vsi_nn_kernel_gpu_add_param(node, "inputZP", &input_zp); status |= vsi_nn_kernel_gpu_add_param(node, "input_scale", &scaleIn); status |= vsi_nn_kernel_gpu_add_param(node, "sumInZp", &sumInZp); @@ -501,7 +568,11 @@ DEF_KERNEL_INITIALIZER(_layernorm_initializer) &uniConvert2ndUint8SubZpToFp32_4x4); status |= vsi_nn_kernel_gpu_add_param(node, "uniConvertInt32toUint8_2x8", &uniConvertInt32toUint8_2x8); - + status |= vsi_nn_kernel_gpu_add_param(node, "uniConvertSecFp16Fp32_4x4", + &uniConvertSecFp16Fp32_4x4); + status |= vsi_nn_kernel_gpu_add_param(node, "UniFP16toFP32Lo4_dp4x4", + &UniFP16toFP32Lo4_dp4x4); + status |= vsi_nn_kernel_gpu_add_param(node, "e2InScale", &e2InScale); status |= vsi_nn_kernel_gpu_add_param(node, "inputZP", &input_zp); status |= vsi_nn_kernel_gpu_add_param(node, "input_scale", &scaleIn); status |= vsi_nn_kernel_gpu_add_param(node, "output_zp", &output_zp); @@ -510,6 +581,70 @@ 
DEF_KERNEL_INITIALIZER(_layernorm_initializer) CHECK_STATUS_FAIL_GOTO(status, OnError ); } break; + case _PACK_SELECT_KEY( U8, F32, U8 ): + case _PACK_SELECT_KEY( F16, F32, F16 ): + { + status = vsi_nn_kernel_gpu_add_param(node, "uniFp16SumSqr_dp8x2", + &uniFp16SumSqr_dp8x2); + status |= vsi_nn_kernel_gpu_add_param(node, "uniExtractHalf4_dp4x4", + &uniExtractHalf4_dp4x4); + status |= vsi_nn_kernel_gpu_add_param(node, "uniConvertInt32toUint8_2x8", + &uniConvertInt32toUint8_2x8); + status |= vsi_nn_kernel_gpu_add_param(node, "uniSumU8_16x1", &uniSumU8_16x1); + status |= vsi_nn_kernel_gpu_add_param(node, "uniSqrSum_16x1", &uniSqrSum_16x1); + status |= vsi_nn_kernel_gpu_add_param(node, "uniConvert1stUint8SubZpToFp32_4x4", + &uniConvert1stUint8SubZpToFp32_4x4); + status |= vsi_nn_kernel_gpu_add_param(node, "uniConvert2ndUint8SubZpToFp32_4x4", + &uniConvert2ndUint8SubZpToFp32_4x4); + status |= vsi_nn_kernel_gpu_add_param(node, "uniConvert3rdUint8SubZpToFp32_4x4", + &uniConvert3rdUint8SubZpToFp32_4x4); + status |= vsi_nn_kernel_gpu_add_param(node, "uniConvert4thUint8SubZpToFp32_4x4", + &uniConvert4thUint8SubZpToFp32_4x4); + status |= vsi_nn_kernel_gpu_add_param(node, "UniFP16toFP32Lo4_dp4x4", + &UniFP16toFP32Lo4_dp4x4); + status |= vsi_nn_kernel_gpu_add_param(node, "e2InScale", &e2InScale); + status |= vsi_nn_kernel_gpu_add_param(node, "inputZP", &input_zp); + status |= vsi_nn_kernel_gpu_add_param(node, "input_scale", &scaleIn); + status |= vsi_nn_kernel_gpu_add_param(node, "sumInZp", &sumInZp); + status |= vsi_nn_kernel_gpu_add_param(node, "tmpZp1", &tmpZp1); + status |= vsi_nn_kernel_gpu_add_param(node, "tmpZp2", &tmpZp2); + status |= vsi_nn_kernel_gpu_add_param(node, "output_zp", &output_zp); + status |= vsi_nn_kernel_gpu_add_param(node, "outputScale", &scaleOut); + CHECK_STATUS_FAIL_GOTO(status, OnError ); + } + break; + case _PACK_SELECT_KEY( I16, F32, I16 ): + { + status = vsi_nn_kernel_gpu_add_param(node, "uniInt16SumSqr_dp8x2", + &uniInt16SumSqr_dp8x2); + status |= vsi_nn_kernel_gpu_add_param(node, "uniConvert1stUint8SubZpToFp32_4x4", + &uniConvert1stUint8SubZpToFp32_4x4); + status |= vsi_nn_kernel_gpu_add_param(node, "uniConvert2ndUint8SubZpToFp32_4x4", + &uniConvert2ndUint8SubZpToFp32_4x4); + status |= vsi_nn_kernel_gpu_add_param(node, "uniConvertInt32toUint8_2x8", + &uniConvertInt32toUint8_2x8); + status |= vsi_nn_kernel_gpu_add_param(node, "UniFP16toFP32Lo4_dp4x4", + &UniFP16toFP32Lo4_dp4x4); + status |= vsi_nn_kernel_gpu_add_param(node, "e2InScale", &e2InScale); + status |= vsi_nn_kernel_gpu_add_param(node, "inputZP", &input_zp); + status |= vsi_nn_kernel_gpu_add_param(node, "input_scale", &scaleIn); + status |= vsi_nn_kernel_gpu_add_param(node, "output_zp", &output_zp); + status |= vsi_nn_kernel_gpu_add_param(node, "outputScale", &scaleOut); + status |= vsi_nn_kernel_gpu_add_param(node, "dimRatio_scale", &dimRatio_scale); + CHECK_STATUS_FAIL_GOTO(status, OnError ); + } + break; + case _PACK_SELECT_KEY( BF16, F32, BF16 ): + { + status = vsi_nn_kernel_gpu_add_param( node, + "uniConvBF16toF32_Part0_2x8", &uniConvBF16toF32_Part0_2x8 ); + status |= vsi_nn_kernel_gpu_add_param( node, + "uniConvBF16toF32_Part1_2x8", &uniConvBF16toF32_Part1_2x8 ); + status |= vsi_nn_kernel_gpu_add_param( node, + "uniExtractOddData_2x8", &uniExtractOddData_2x8 ); + CHECK_STATUS_FAIL_GOTO(status, OnError ); + } + break; default: VSI_ASSERT( FALSE ); return VSI_FAILURE; @@ -949,6 +1084,7 @@ static vsi_status _query_kernel { vsi_status status = VSI_FAILURE; vsi_nn_kernel_dtype_e input0_dtype = U8; + 
vsi_nn_kernel_dtype_e input2_dtype = F16; vsi_nn_kernel_dtype_e output_dtype = U8; uint32_t key = 0; int i = 0; @@ -960,9 +1096,10 @@ static vsi_status _query_kernel } input0_dtype = vsi_nn_kernel_map_dtype( inputs[0]->attr.dtype.vx_type ); + input2_dtype = vsi_nn_kernel_map_dtype( inputs[2]->attr.dtype.vx_type ); output_dtype = vsi_nn_kernel_map_dtype( outputs[0]->attr.dtype.vx_type ); - key = HASH_LAYERNORM_KEY( input0_dtype, output_dtype, kernel_type ); + key = HASH_LAYERNORM_KEY( input0_dtype, input2_dtype, output_dtype, kernel_type ); for( i = 0; i < _cnt_of_array(_layernorm_kernel_map); i ++ ) { @@ -1000,14 +1137,16 @@ static vsi_status _query_kernel_wh { vsi_status status = VSI_FAILURE; vsi_nn_kernel_dtype_e input0_dtype = U8; + vsi_nn_kernel_dtype_e input2_dtype = F16; vsi_nn_kernel_dtype_e output_dtype = U8; uint32_t key = 0; int i = 0; input0_dtype = vsi_nn_kernel_map_dtype( inputs[0]->attr.dtype.vx_type ); + input2_dtype = vsi_nn_kernel_map_dtype( inputs[2]->attr.dtype.vx_type ); output_dtype = vsi_nn_kernel_map_dtype( outputs[0]->attr.dtype.vx_type ); - key = HASH_LAYERNORM_KEY( input0_dtype, F32, is2D_sumsqr ); + key = HASH_LAYERNORM_KEY( input0_dtype, input2_dtype, F32, is2D_sumsqr ); for( i = 0; i < _cnt_of_array(_sumsqr_kernel_map); i ++ ) { @@ -1031,7 +1170,7 @@ static vsi_status _query_kernel_wh } - key = HASH_LAYERNORM_KEY( input0_dtype, output_dtype, is2D_wh ); + key = HASH_LAYERNORM_KEY( input0_dtype, input2_dtype, output_dtype, is2D_wh ); for( i = 0; i < _cnt_of_array(_sumsqr_kernel_map); i ++ ) { @@ -1256,17 +1395,25 @@ static vsi_nn_kernel_node_t _setup vsi_nn_kernel_node_param_t node_params[_LAYERNORM_PARAM_NUM] = { NULL }; vsi_nn_kernel_node_t node = NULL; vsi_nn_kernel_tensor_t rs_input = NULL, rs_output = NULL, rs_gamma = NULL, rs_beta = NULL; - int32_t rs_flg = vsi_nn_kernel_param_get_int32( params, "reshape_flg" ); - int32_t wh_flg = vsi_nn_kernel_param_get_int32( params, "wh_flg" ); - int32_t optFlg = rs_flg || (outputs[0]->attr.dim_num < 3); float eps = vsi_nn_kernel_param_get_float32( params, "eps" ); + uint32_t *input_size = inputs[0]->attr.size; + uint32_t dims_num = inputs[0]->attr.dim_num; + int32_t rs_flg = 0; + int32_t optFlg = 0; - if (wh_flg) + if (input_size[0] >= GPU_TENSOR_MAX_WIDTH) { node = _setup_wh(graph, inputs, input_num, outputs, output_num, params, kernel); goto final; } + if ((input_size[1] * input_size[2] < GPU_TENSOR_MAX_WIDTH) + && dims_num > 2) + { + rs_flg = 1; + } + optFlg = rs_flg || (outputs[0]->attr.dim_num < 3); + status = _query_kernel( inputs, outputs, kernel, optFlg); if (VSI_SUCCESS != status) { diff --git a/src/tim/vx/internal/src/kernel/evis/maximum_evis.c b/src/tim/vx/internal/src/kernel/evis/maximum_evis.c index 5b896c6..68dc6e8 100644 --- a/src/tim/vx/internal/src/kernel/evis/maximum_evis.c +++ b/src/tim/vx/internal/src/kernel/evis/maximum_evis.c @@ -241,7 +241,7 @@ DEF_KERNEL_INITIALIZER(_maximum_initializer) pack_key = _PACK_SELECT_KEY( attr[0]->dtype, attr[1]->dtype, attr[2]->dtype ); - if ( (attr[2]->dtype == F16 || attr[2]->dtype == I16) + if ( (attr[2]->dtype == F16 || attr[2]->dtype == I16 || attr[2]->dtype == BF16) || ((attr[2]->dtype == I8 || attr[2]->dtype == U8) && (attr[0]->dtype == F16 && attr[1]->dtype == F16)) ) { gpu_param.global_scale[0] = 8; diff --git a/src/tim/vx/internal/src/kernel/evis/minimum_evis.c b/src/tim/vx/internal/src/kernel/evis/minimum_evis.c index 356b93f..02d7523 100644 --- a/src/tim/vx/internal/src/kernel/evis/minimum_evis.c +++ b/src/tim/vx/internal/src/kernel/evis/minimum_evis.c @@ 
-241,7 +241,7 @@ DEF_KERNEL_INITIALIZER(_minimum_initializer) pack_key = _PACK_SELECT_KEY( attr[0]->dtype, attr[1]->dtype, attr[2]->dtype ); - if ( (attr[2]->dtype == F16 || attr[2]->dtype == I16) + if ( (attr[2]->dtype == F16 || attr[2]->dtype == I16 || attr[2]->dtype == BF16) || ((attr[2]->dtype == I8 || attr[2]->dtype == U8) && (attr[0]->dtype == F16 && attr[1]->dtype == F16)) ) { gpu_param.global_scale[0] = 8; diff --git a/src/tim/vx/internal/src/kernel/evis/one_hot_evis.c b/src/tim/vx/internal/src/kernel/evis/one_hot_evis.c new file mode 100644 index 0000000..f1798c2 --- /dev/null +++ b/src/tim/vx/internal/src/kernel/evis/one_hot_evis.c @@ -0,0 +1,460 @@ +/**************************************************************************** +* +* Copyright (c) 2020 Vivante Corporation +* +* Permission is hereby granted, free of charge, to any person obtaining a +* copy of this software and associated documentation files (the "Software"), +* to deal in the Software without restriction, including without limitation +* the rights to use, copy, modify, merge, publish, distribute, sublicense, +* and/or sell copies of the Software, and to permit persons to whom the +* Software is furnished to do so, subject to the following conditions: +* +* The above copyright notice and this permission notice shall be included in +* all copies or substantial portions of the Software. +* +* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +* AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING +* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER +* DEALINGS IN THE SOFTWARE. 
+* +*****************************************************************************/ + + +#include +#include +#include +#include "vsi_nn_types.h" +#include "vsi_nn_tensor.h" +#include "vsi_nn_graph.h" +#include "vsi_nn_log.h" +#include "vsi_nn_error.h" +#include "vsi_nn_prv.h" +#include "vsi_nn_tensor_util.h" +#include "utils/vsi_nn_util.h" +#include "kernel/vsi_nn_kernel.h" +#include "utils/vsi_nn_dtype_util.h" + +__BEGIN_DECLS + +#define _ONE_HOT_KERNEL_SOURCE "one_hot" + +// Add kernel hashtable here +#define HASH_ONE_HOT_SH_KERNEL_NAME(SRC_TYPE, DST_TYPE) \ + CVIVANTE_NAMESPACE("evis.one_hot_"#SRC_TYPE"to"#DST_TYPE) + +#define ONE_HOT_HASH_KEY( IN_DTYPE, OUT_DTYPE, IMG_2D ) \ + (( IN_DTYPE << 9 ) | ( OUT_DTYPE << 1) | (IMG_2D)) + +#define PACK_ONE_HOT_KERNEL_3D( IN_DTYPE, OUT_DTYPE ) \ +{ ONE_HOT_HASH_KEY( IN_DTYPE, OUT_DTYPE, 0 ), \ + CVIVANTE_NAMESPACE("evis.one_hot_"#IN_DTYPE"to"#OUT_DTYPE), \ + _ONE_HOT_KERNEL_SOURCE } + +#define PACK_ONE_HOT_KERNEL_2D( IN_DTYPE, OUT_DTYPE ) \ +{ ONE_HOT_HASH_KEY( IN_DTYPE, OUT_DTYPE, 1 ), \ + CVIVANTE_NAMESPACE("evis.one_hot_"#IN_DTYPE"to"#OUT_DTYPE"_2D"), \ + _ONE_HOT_KERNEL_SOURCE } + +typedef struct +{ + uint32_t key; + char * function_name; + const char * source_name; +} _kernel_map_type; + +static const _kernel_map_type _one_hot_kernel_map[] = +{ + // Register kernel here + PACK_ONE_HOT_KERNEL_3D( U8, U8 ), + PACK_ONE_HOT_KERNEL_3D( U8, F16 ), + PACK_ONE_HOT_KERNEL_3D( I8, I8 ), + PACK_ONE_HOT_KERNEL_3D( I8, F16 ), + PACK_ONE_HOT_KERNEL_3D( I16, I16 ), + PACK_ONE_HOT_KERNEL_3D( I16, F16 ), + PACK_ONE_HOT_KERNEL_3D( F16, F16 ), + PACK_ONE_HOT_KERNEL_3D( F16, I16 ), + PACK_ONE_HOT_KERNEL_3D( F16, U8 ), + PACK_ONE_HOT_KERNEL_3D( F16, I8 ), + + PACK_ONE_HOT_KERNEL_2D( U8, U8 ), + PACK_ONE_HOT_KERNEL_2D( U8, F16 ), + PACK_ONE_HOT_KERNEL_2D( I8, I8 ), + PACK_ONE_HOT_KERNEL_2D( I8, F16 ), + PACK_ONE_HOT_KERNEL_2D( I16, I16 ), + PACK_ONE_HOT_KERNEL_2D( I16, F16 ), + PACK_ONE_HOT_KERNEL_2D( F16, F16 ), + PACK_ONE_HOT_KERNEL_2D( F16, I16 ), + PACK_ONE_HOT_KERNEL_2D( F16, U8 ), + PACK_ONE_HOT_KERNEL_2D( F16, I8 ), +}; + + +/* + * Kernel params + */ +static vx_param_description_t _one_hot_kernel_param_def[] = +{ + {VX_INPUT, VX_TYPE_TENSOR, VX_PARAMETER_STATE_REQUIRED}, + {VX_OUTPUT, VX_TYPE_TENSOR, VX_PARAMETER_STATE_REQUIRED}, + {VX_INPUT, VX_TYPE_SCALAR, VX_PARAMETER_STATE_REQUIRED}, + {VX_INPUT, VX_TYPE_SCALAR, VX_PARAMETER_STATE_REQUIRED}, + {VX_INPUT, VX_TYPE_SCALAR, VX_PARAMETER_STATE_REQUIRED}, +}; +#define SCALAR_INPUT_SUFFIX_SIZE (2) +#define SCALAR_INPUT_ON_VALUE (3) +#define SCALAR_INPUT_OFF_VALUE (4) +#define _ONE_HOT_PARAM_NUM _cnt_of_array( _one_hot_kernel_param_def ) + +/* + * Kernel initializer + */ +DEF_KERNEL_INITIALIZER(_one_hot_initializer) + ( + vsi_nn_kernel_node_t node, + const vsi_nn_kernel_node_param_t * param, + size_t param_size + ) +{ + gpu_param_t gpu_param = { + 2, // workdim + {0, 0, 0}, // globalWorkOffset: control the start location be processed in the image + {0, 0, 0}, // globalWorkScale: how many pixels could be processed by a single thread + {0, 0, 0}, // localWorkSize: local group size in thread + {0, 0, 0} // globalWorkSize: image size in thread + }; + + vsi_status status = VSI_FAILURE; + vsi_nn_kernel_tensor_attr_t * attr[2] = { NULL }; + vsi_int_array_t * in_shape = NULL; + int32_t suffix_size = 0; + int32_t depth = 0; + int32_t input_zp = 0; + float scaleIn = 1.0f; + int32_t srcFixPointPos = 0; + vsi_nn_kernel_dtype_e input_dtype = F16; + + attr[0] = vsi_nn_kernel_tensor_attr_create( 
(vsi_nn_kernel_tensor_t)param[0] ); + CHECK_PTR_FAIL_GOTO( attr[0], "Create tensor attr buffer fail.", final ); + attr[1] = vsi_nn_kernel_tensor_attr_create( (vsi_nn_kernel_tensor_t)param[1] ); + CHECK_PTR_FAIL_GOTO( attr[1], "Create tensor attr buffer fail.", final ); + + vsi_nn_kernel_scalar_read_int32((vsi_nn_kernel_scalar_t)param[SCALAR_INPUT_SUFFIX_SIZE], &(suffix_size)); + + in_shape = attr[0]->shape; + depth = attr[1]->shape->data[1]; + input_dtype = attr[0]->dtype; + + if (VSI_NN_KERNEL_QUANT_DFP == attr[0]->quant) + { + srcFixPointPos = attr[0]->dfp.fl; + } + else if (VSI_NN_KERNEL_QUANT_ASYMM == attr[0]->quant) + { + input_zp = attr[0]->asymm.zero_point; + scaleIn = attr[0]->asymm.scale; + } + + if (suffix_size == 1) + { + gpu_param.global_scale[0] = 4; + gpu_param.global_scale[1] = 1; + + depth = attr[1]->shape->data[0]; + } + else + { + gpu_param.global_scale[0] = 1; + gpu_param.global_scale[1] = 1; + } + + gpu_param.global_size[0] = gpu_align_p2( + (in_shape->data[0] + gpu_param.global_scale[0] - 1) + / gpu_param.global_scale[0], 4); + gpu_param.global_size[1] = in_shape->data[1]; + + switch (input_dtype) + { + case I16: + case I8: + case F16: + { + gpu_dp_inst_t uniDataConvert_0_4x4 = {{ + 0x01010101, // TCfg + 0x00000000, // ASelt + 0x00010000, 0x00030002, // ABin + 0x02020202, // BSelt + 0x00000000, 0x00000000, // BBin + 0x00000600, // AccumType, ConstantType, and PostShift + 0x00000001, 0x00000000, 0x00000001, 0x00000000, + 0x00000001, 0x00000000, 0x00000001, 0x00000000 // Constant + }, GPU_DP_TYPE_16}; + gpu_dp_inst_t uniDataConvert_1_4x4 = {{ + 0x01010101, // TCfg + 0x00000000, // ASelt + 0x00050004, 0x00070006, // ABin + 0x02020202, // BSelt + 0x00000000, 0x00000000, // BBin + 0x00000600, // AccumType, ConstantType, and PostShift + 0x00000001, 0x00000000, 0x00000001, 0x00000000, + 0x00000001, 0x00000000, 0x00000001, 0x00000000 // Constant + }, GPU_DP_TYPE_16}; + gpu_dp_inst_t uniExtractInteger_2x8 = {{ + 0x33333333, // TCfg + 0x11110000, // ASelt + 0x03020100, 0x03020100, // ABin + 0x00000000, // BSelt + 0x00000000, 0x00000000, // BBin + 0x00002400, // AccumType, ConstantType, and PostShift + 0x00000000, 0x00000000, 0x00000000, 0x00000000, + 0x00000000, 0x00000000, 0x00000000, 0x00000000 // Constant + }, GPU_DP_TYPE_16 }; + + gpu_dp_inst_update_postshfit( &uniDataConvert_0_4x4, srcFixPointPos ); + gpu_dp_inst_update_postshfit( &uniDataConvert_1_4x4, srcFixPointPos ); + + status = vsi_nn_kernel_gpu_add_param( node, + "uniDataConvert_0_4x4", &uniDataConvert_0_4x4 ); + status |= vsi_nn_kernel_gpu_add_param( node, + "uniDataConvert_1_4x4", &uniDataConvert_1_4x4 ); + status |= vsi_nn_kernel_gpu_add_param( node, + "uniExtract8Data_2x8", &uniExtractInteger_2x8 ); + status |= vsi_nn_kernel_gpu_add_param( node, + "depth", &depth ); + CHECK_STATUS_FAIL_GOTO(status, final ); + } + break; + case U8: + { + gpu_dp_inst_t uniDataConvert_0_4x4 = {{ + 0x01010101, // TCfg + 0x00000000, // ASelt + 0x00010000, 0x00030002, // ABin + 0x02020202, // BSelt + 0x00000000, 0x00000000, // BBin + 0x00000600, // AccumType, ConstantType, and PostShift + 0x00000001, 0x00000000, 0x00000001, 0x00000000, + 0x00000001, 0x00000000, 0x00000001, 0x00000000 // Constant + }, GPU_DP_TYPE_16}; + gpu_dp_inst_t uniDataConvert_1_4x4 = {{ + 0x01010101, // TCfg + 0x00000000, // ASelt + 0x00050004, 0x00070006, // ABin + 0x02020202, // BSelt + 0x00000000, 0x00000000, // BBin + 0x00000600, // AccumType, ConstantType, and PostShift + 0x00000001, 0x00000000, 0x00000001, 0x00000000, + 0x00000001, 0x00000000, 0x00000001, 
0x00000000 // Constant + }, GPU_DP_TYPE_16}; + gpu_dp_inst_t uniExtractInteger_2x8 = {{ + 0x33333333, // TCfg + 0x11110000, // ASelt + 0x03020100, 0x03020100, // ABin + 0x00000000, // BSelt + 0x00000000, 0x00000000, // BBin + 0x00002400, // AccumType, ConstantType, and PostShift + 0x00000000, 0x00000000, 0x00000000, 0x00000000, + 0x00000000, 0x00000000, 0x00000000, 0x00000000 // Constant + }, GPU_DP_TYPE_16 }; + float input_tail = 0 - (float)input_zp * scaleIn; + + status = vsi_nn_kernel_gpu_add_param( node, + "uniDataConvert_0_4x4", &uniDataConvert_0_4x4 ); + status |= vsi_nn_kernel_gpu_add_param( node, + "uniDataConvert_1_4x4", &uniDataConvert_1_4x4 ); + status |= vsi_nn_kernel_gpu_add_param( node, + "uniExtract8Data_2x8", &uniExtractInteger_2x8 ); + status |= vsi_nn_kernel_gpu_add_param( node, + "input_scale", &scaleIn ); + status |= vsi_nn_kernel_gpu_add_param( node, + "input_tail", &input_tail ); + status |= vsi_nn_kernel_gpu_add_param( node, + "depth", &depth ); + CHECK_STATUS_FAIL_GOTO(status, final ); + } + default: + break; + } + + status = vsi_nn_kernel_gpu_config( node, &gpu_param ); +final: +#define SAFE_FREE_TENSOR_ATTR(_PTR) if ( _PTR ) { vsi_nn_kernel_tensor_attr_release( &_PTR ); _PTR = NULL; } + SAFE_FREE_TENSOR_ATTR(attr[0]); + SAFE_FREE_TENSOR_ATTR(attr[1]); +#undef SAFE_FREE_TENSOR_ATTR + + return status; +} /* _one_hot_initializer() */ + + +/* + * Query kernel + */ +static vsi_status _query_kernel + ( + vsi_nn_kernel_t * kernel, + vsi_nn_tensor_t * const * const inputs, + vsi_nn_tensor_t * const * const outputs, + vsi_bool image_2d + ) +{ + vsi_status status = VSI_FAILURE; + vsi_nn_kernel_dtype_e in_dtype; + vsi_nn_kernel_dtype_e out_dtype; + const _kernel_map_type * kernel_map = _one_hot_kernel_map; + size_t kernel_map_size = _cnt_of_array( _one_hot_kernel_map ); + vx_param_description_t * param_def = _one_hot_kernel_param_def; + vx_kernel_initialize_f initializer = _one_hot_initializer; + + uint32_t key; + uint32_t i; + + in_dtype = vsi_nn_kernel_map_dtype( inputs[0]->attr.dtype.vx_type ); + out_dtype = vsi_nn_kernel_map_dtype( outputs[0]->attr.dtype.vx_type ); + + key = ONE_HOT_HASH_KEY( in_dtype, out_dtype, image_2d ); + + for ( i = 0; i < (uint32_t)kernel_map_size; i ++ ) + { + if ( kernel_map[i].key == key ) + { + break; + } + } + if ( i < (uint32_t)kernel_map_size ) + { + snprintf( kernel->info.name, VX_MAX_KERNEL_NAME, "%s", kernel_map[i].function_name ); + kernel->info.parameters = param_def; + kernel->info.numParams = _cnt_of_array( _one_hot_kernel_param_def ); + kernel->info.initialize = initializer; + // Register code source + vsi_nn_kernel_add_source( kernel, VSI_NN_GPU_SOURCE_FMT_CODE, 2, + "vsi_nn_kernel_header", + kernel_map[i].source_name ); + // Register binary source + vsi_nn_kernel_add_source( kernel, VSI_NN_GPU_SOURCE_FMT_EXECUTABLE, 1, + kernel_map[i].source_name ); + status = VSI_SUCCESS; + } + + return status; +} /* _query_kernel() */ + + +static vsi_nn_kernel_node_t _setup + ( + vsi_nn_graph_t * graph, + vsi_nn_tensor_t ** inputs, + size_t input_num, + vsi_nn_tensor_t ** outputs, + size_t output_num, + const vsi_nn_kernel_param_t * params, + vsi_nn_kernel_t * kernel + ) +{ + vsi_status status = VSI_FAILURE; + vsi_nn_kernel_node_param_t node_params[_ONE_HOT_PARAM_NUM] = {NULL}; + vsi_nn_kernel_node_t node = NULL; + vsi_nn_tensor_t* rs_tensors[2] = { NULL }; + int32_t shape[2][VSI_NN_MAX_DIM_NUM] = {{ 0 }}; + int32_t i = 0; + vsi_bool image_2d = FALSE; + int32_t num_elements = vsi_nn_vxGetTensorElementNum(&inputs[0]->attr); + int32_t 
prefix_dim_size = 1; + int32_t suffix_dim_size = 0; + int32_t depth = vsi_nn_kernel_param_get_int32( params, "depth" ); + uint32_t data_u32[2] = {0}; + float on_value = vsi_nn_kernel_param_get_float32( params, "on_value" ); + float off_value = vsi_nn_kernel_param_get_float32( params, "off_value" ); + int32_t axis = vsi_nn_kernel_param_get_int32( params, "axis" ); + + vsi_nn_Float32ToDtype(on_value, (uint8_t*)&data_u32[0], &outputs[0]->attr.dtype); + vsi_nn_Float32ToDtype(off_value, (uint8_t*)&data_u32[1], &outputs[0]->attr.dtype); + + axis = axis == -1 ? (int32_t)inputs[0]->attr.dim_num : (int32_t)inputs[0]->attr.dim_num - axis; + for (i = 0; i < axis; i++) + { + prefix_dim_size *= inputs[0]->attr.size[i]; + } + + suffix_dim_size = num_elements / prefix_dim_size; + + if (suffix_dim_size == 1) + { + shape[0][0] = prefix_dim_size; + shape[0][1] = 1; + shape[1][0] = depth; + shape[1][1] = prefix_dim_size; + shape[1][2] = 1; + } + else + { + shape[0][0] = suffix_dim_size; + shape[0][1] = prefix_dim_size; + shape[1][0] = suffix_dim_size; + shape[1][1] = depth; + shape[1][2] = prefix_dim_size; + } + + rs_tensors[0] = vsi_nn_reshape_tensor( graph, + inputs[0], (uint32_t*)shape[0], 2 ); + rs_tensors[1] = vsi_nn_reshape_tensor( graph, + outputs[0], (uint32_t*)shape[1], 3 ); + + if ( !vsi_nn_kernel_gpu_check_shape( (int32_t*)rs_tensors[1]->attr.size, + rs_tensors[1]->attr.dim_num ) ) + { + return NULL; + } + + image_2d = suffix_dim_size == 1; + + status = _query_kernel( kernel, inputs, outputs, image_2d ); + if ( VSI_SUCCESS == status) + { + node = vsi_nn_kernel_create_node( graph, kernel ); + if ( node ) + { + /* Set inputs and outputs */ + vsi_nn_kernel_node_pack_io( node_params, _ONE_HOT_PARAM_NUM, + &rs_tensors[0], input_num, &rs_tensors[1], output_num ); + node_params[SCALAR_INPUT_SUFFIX_SIZE] = vsi_nn_kernel_scalar_create( + graph, I32, &suffix_dim_size ); + node_params[SCALAR_INPUT_ON_VALUE] = vsi_nn_kernel_scalar_create( + graph, U32, &data_u32[0] ); + node_params[SCALAR_INPUT_OFF_VALUE] = vsi_nn_kernel_scalar_create( + graph, U32, &data_u32[1] ); + + /* Pass parameters to node. 
*/ + status = vsi_nn_kernel_node_pass_param( node, node_params, _ONE_HOT_PARAM_NUM ); + CHECK_STATUS_FAIL_GOTO( status, final ); + } + } +final: + if (rs_tensors[0]) + { + vsi_nn_ReleaseTensor( &rs_tensors[0] ); + } + + if (rs_tensors[1]) + { + vsi_nn_ReleaseTensor( &rs_tensors[1] ); + } + + for (i = SCALAR_INPUT_SUFFIX_SIZE; i < _ONE_HOT_PARAM_NUM; i++) + { + if (node_params[i]) + { + vsi_nn_kernel_scalar_release( &node_params[i] ); + } + } + + return node; +} /* _setup() */ + +__END_DECLS + +REGISTER_BACKEND_EVIS( one_hot, _setup ) + diff --git a/src/tim/vx/internal/src/kernel/evis/pre_process_rgb_evis.c b/src/tim/vx/internal/src/kernel/evis/pre_process_rgb_evis.c index 09f55a6..a7a6cb1 100644 --- a/src/tim/vx/internal/src/kernel/evis/pre_process_rgb_evis.c +++ b/src/tim/vx/internal/src/kernel/evis/pre_process_rgb_evis.c @@ -202,8 +202,28 @@ DEF_KERNEL_INITIALIZER(_pre_process_rgb_initializer) }, GPU_DP_TYPE_16 }; gpu_dp_inst_t uniExtractRtoF32_part1_4x4 = {{ 0x01010101, // TCfg - 0x01010100, // ASelt - 0x0000000c, 0x00060003, // ABin + 0x01010000, // ASelt + 0x000f000c, 0x00050002, // ABin + 0x02020202, // BSelt + 0x00000000, 0x00000000, // BBin + 0x00000600, // AccumType, ConstantType, and PostShift + 0x00000001, 0x00000000, 0x00000001, 0x00000000, + 0x00000001, 0x00000000, 0x00000001, 0x00000000 // Constant + }, GPU_DP_TYPE_16 }; + gpu_dp_inst_t uniExtractRtoF32_part2_4x4 = {{ + 0x01010101, // TCfg + 0x01000000, // ASelt + 0x000b0008, 0x0001000e, // ABin + 0x02020202, // BSelt + 0x00000000, 0x00000000, // BBin + 0x00000600, // AccumType, ConstantType, and PostShift + 0x00000001, 0x00000000, 0x00000001, 0x00000000, + 0x00000001, 0x00000000, 0x00000001, 0x00000000 // Constant + }, GPU_DP_TYPE_16 }; + gpu_dp_inst_t uniExtractRtoF32_part3_4x4 = {{ + 0x01010101, // TCfg + 0x01010101, // ASelt + 0x00070004, 0x000d000a, // ABin 0x02020202, // BSelt 0x00000000, 0x00000000, // BBin 0x00000600, // AccumType, ConstantType, and PostShift @@ -223,7 +243,27 @@ DEF_KERNEL_INITIALIZER(_pre_process_rgb_initializer) gpu_dp_inst_t uniExtractGtoF32_part1_4x4 = {{ 0x01010101, // TCfg 0x01010100, // ASelt - 0x0001000d, 0x00070004, // ABin + 0x0000000d, 0x00060003, // ABin + 0x02020202, // BSelt + 0x00000000, 0x00000000, // BBin + 0x00000600, // AccumType, ConstantType, and PostShift + 0x00000001, 0x00000000, 0x00000001, 0x00000000, + 0x00000001, 0x00000000, 0x00000001, 0x00000000 // Constant + }, GPU_DP_TYPE_16 }; + gpu_dp_inst_t uniExtractGtoF32_part2_4x4 = {{ + 0x01010101, // TCfg + 0x01000000, // ASelt + 0x000c0009, 0x0002000f, // ABin + 0x02020202, // BSelt + 0x00000000, 0x00000000, // BBin + 0x00000600, // AccumType, ConstantType, and PostShift + 0x00000001, 0x00000000, 0x00000001, 0x00000000, + 0x00000001, 0x00000000, 0x00000001, 0x00000000 // Constant + }, GPU_DP_TYPE_16 }; + gpu_dp_inst_t uniExtractGtoF32_part3_4x4 = {{ + 0x01010101, // TCfg + 0x01010101, // ASelt + 0x00080005, 0x000e000b, // ABin 0x02020202, // BSelt 0x00000000, 0x00000000, // BBin 0x00000600, // AccumType, ConstantType, and PostShift @@ -243,7 +283,27 @@ DEF_KERNEL_INITIALIZER(_pre_process_rgb_initializer) gpu_dp_inst_t uniExtractBtoF32_part1_4x4 = {{ 0x01010101, // TCfg 0x01010100, // ASelt - 0x0002000e, 0x00080005, // ABin + 0x0001000e, 0x00070004, // ABin + 0x02020202, // BSelt + 0x00000000, 0x00000000, // BBin + 0x00000600, // AccumType, ConstantType, and PostShift + 0x00000001, 0x00000000, 0x00000001, 0x00000000, + 0x00000001, 0x00000000, 0x00000001, 0x00000000 // Constant + }, GPU_DP_TYPE_16 }; + gpu_dp_inst_t 
uniExtractBtoF32_part2_4x4 = {{ + 0x01010101, // TCfg + 0x01010000, // ASelt + 0x000d000a, 0x00030000, // ABin + 0x02020202, // BSelt + 0x00000000, 0x00000000, // BBin + 0x00000600, // AccumType, ConstantType, and PostShift + 0x00000001, 0x00000000, 0x00000001, 0x00000000, + 0x00000001, 0x00000000, 0x00000001, 0x00000000 // Constant + }, GPU_DP_TYPE_16 }; + gpu_dp_inst_t uniExtractBtoF32_part3_4x4 = {{ + 0x01010101, // TCfg + 0x01010101, // ASelt + 0x00090006, 0x000f000c, // ABin 0x02020202, // BSelt 0x00000000, 0x00000000, // BBin 0x00000600, // AccumType, ConstantType, and PostShift @@ -358,7 +418,14 @@ DEF_KERNEL_INITIALIZER(_pre_process_rgb_initializer) case _PACK_SELECT_KEY( 1, 0, 0): // copy case _PACK_SELECT_KEY( 1, 2, 0): // copy reorder { - shaderParam.global_scale[0] = 8; + if (attr[0]->dtype == I8 || attr[0]->dtype == U8) + { + shaderParam.global_scale[0] = 16; + } + else + { + shaderParam.global_scale[0] = 8; + } shaderParam.global_scale[1] = 1; shaderParam.global_scale[2] = 1; shaderParam.global_size[0] = gpu_align_p2((width + shaderParam.global_scale[0] - 1) @@ -366,7 +433,7 @@ DEF_KERNEL_INITIALIZER(_pre_process_rgb_initializer) shaderParam.global_size[1] = height; shaderParam.global_size[2] = 1; - if(attr[0]->dtype == F16) + if (attr[0]->dtype == F16) { status = vsi_nn_kernel_gpu_add_param(node, "uniExtract8Data_2x8", &uniExtractHalf8_2x8); } @@ -376,10 +443,16 @@ DEF_KERNEL_INITIALIZER(_pre_process_rgb_initializer) } status |= vsi_nn_kernel_gpu_add_param(node, "uniExtractRtoF32_part0_4x4", &uniExtractRtoF32_part0_4x4); status |= vsi_nn_kernel_gpu_add_param(node, "uniExtractRtoF32_part1_4x4", &uniExtractRtoF32_part1_4x4); + status |= vsi_nn_kernel_gpu_add_param(node, "uniExtractRtoF32_part2_4x4", &uniExtractRtoF32_part2_4x4); + status |= vsi_nn_kernel_gpu_add_param(node, "uniExtractRtoF32_part3_4x4", &uniExtractRtoF32_part3_4x4); status |= vsi_nn_kernel_gpu_add_param(node, "uniExtractGtoF32_part0_4x4", &uniExtractGtoF32_part0_4x4); status |= vsi_nn_kernel_gpu_add_param(node, "uniExtractGtoF32_part1_4x4", &uniExtractGtoF32_part1_4x4); + status |= vsi_nn_kernel_gpu_add_param(node, "uniExtractGtoF32_part2_4x4", &uniExtractGtoF32_part2_4x4); + status |= vsi_nn_kernel_gpu_add_param(node, "uniExtractGtoF32_part3_4x4", &uniExtractGtoF32_part3_4x4); status |= vsi_nn_kernel_gpu_add_param(node, "uniExtractBtoF32_part0_4x4", &uniExtractBtoF32_part0_4x4); status |= vsi_nn_kernel_gpu_add_param(node, "uniExtractBtoF32_part1_4x4", &uniExtractBtoF32_part1_4x4); + status |= vsi_nn_kernel_gpu_add_param(node, "uniExtractBtoF32_part2_4x4", &uniExtractBtoF32_part2_4x4); + status |= vsi_nn_kernel_gpu_add_param(node, "uniExtractBtoF32_part3_4x4", &uniExtractBtoF32_part3_4x4); status |= vsi_nn_kernel_gpu_add_param(node, "r_order", &reorder); status |= vsi_nn_kernel_gpu_add_param(node, "b_order", &order1); CHECK_STATUS_FAIL_GOTO(status, OnError); diff --git a/src/tim/vx/internal/src/kernel/evis/pre_process_yuv420_evis.c b/src/tim/vx/internal/src/kernel/evis/pre_process_yuv420_evis.c index 2d32371..7ab900b 100644 --- a/src/tim/vx/internal/src/kernel/evis/pre_process_yuv420_evis.c +++ b/src/tim/vx/internal/src/kernel/evis/pre_process_yuv420_evis.c @@ -43,6 +43,7 @@ __BEGIN_DECLS #define VX_KERNEL_NAME_PRE_PROCESS_YUV420_SCALE_U8TOU8 CVIVANTE_NAMESPACE("evis.pre_process_yuv420_scale_U8toU8") #define VX_KERNEL_NAME_PRE_PROCESS_YUV420_SCALE_U8TOI8 CVIVANTE_NAMESPACE("evis.pre_process_yuv420_scale_U8toI8") #define VX_KERNEL_NAME_PRE_PROCESS_YUV420_COPY_U8TOU8 
CVIVANTE_NAMESPACE("evis.pre_process_yuv420_copy_U8toU8") +#define VX_KERNEL_NAME_PRE_PROCESS_YUV420_COPY_U8TOF16 CVIVANTE_NAMESPACE("evis.pre_process_yuv420_copy_U8toF16") #define KERNEL_SOURCE_1 "pre_process_yuv420_scale_u8", #define KERNEL_SOURCE_2 "pre_process_yuv420_copy_u8", @@ -77,6 +78,7 @@ static const struct { TENSOR_PRE_PROCESS_YUV420_KERNELS(U8, U8, SCALE, KERNEL_SOURCE_1) TENSOR_PRE_PROCESS_YUV420_KERNELS(U8, I8, SCALE, KERNEL_SOURCE_5) TENSOR_PRE_PROCESS_YUV420_KERNELS(U8, U8, COPY, KERNEL_SOURCE_2) + TENSOR_PRE_PROCESS_YUV420_KERNELS(U8, F16, COPY, KERNEL_SOURCE_2) }; static vx_param_description_t vxPreProcessYuv420Kernel_param_def[] = @@ -155,10 +157,6 @@ DEF_KERNEL_INITIALIZER(_pre_process_yuv420_copy_initializer) } shaderParam.global_scale[0] = 16; - if (attr[0]->dtype == I16 || attr[0]->dtype == F16) - { - shaderParam.global_scale[0] = 8; - } shaderParam.global_scale[1] = 1; shaderParam.global_scale[2] = 1; shaderParam.global_size[0] = gpu_align_p2((width + shaderParam.global_scale[0] - 1) @@ -418,6 +416,7 @@ DEF_KERNEL_INITIALIZER(_pre_process_yuv420_copy_initializer) switch( attr[0]->dtype ) { case U8: + case F16: { // R status |= vsi_nn_kernel_gpu_add_param(node, "uniCalculateTmpR1st_4x4", &uniCalculateTmpR1st_4x4); @@ -866,7 +865,7 @@ static vsi_status _query_kernel input0_dtype = vsi_nn_kernel_map_dtype( inputs[0]->attr.dtype.vx_type ); output_dtype = vsi_nn_kernel_map_dtype( outputs[0]->attr.dtype.vx_type ); - if (enable_copy && output_dtype == U8) + if (enable_copy && (output_dtype == U8 || output_dtype == F16)) { convert_type = COPY; } @@ -890,7 +889,7 @@ static vsi_status _query_kernel kernel->info.parameters = vxPreProcessYuv420Kernel_param_def; kernel->info.numParams = _cnt_of_array( vxPreProcessYuv420Kernel_param_def ); - if (enable_copy && output_dtype == U8) + if (enable_copy && (output_dtype == U8 || output_dtype == F16)) { kernel->info.initialize = _pre_process_yuv420_copy_initializer; } diff --git a/src/tim/vx/internal/src/kernel/evis/pre_process_yuv444_evis.c b/src/tim/vx/internal/src/kernel/evis/pre_process_yuv444_evis.c index 7d51d43..262aa5d 100644 --- a/src/tim/vx/internal/src/kernel/evis/pre_process_yuv444_evis.c +++ b/src/tim/vx/internal/src/kernel/evis/pre_process_yuv444_evis.c @@ -43,6 +43,7 @@ __BEGIN_DECLS #define VX_KERNEL_NAME_PRE_PROCESS_YUV444_SCALE_U8TOU8 CVIVANTE_NAMESPACE("evis.pre_process_yuv444_scale_U8toU8") #define VX_KERNEL_NAME_PRE_PROCESS_YUV444_SCALE_U8TOI8 CVIVANTE_NAMESPACE("evis.pre_process_yuv444_scale_U8toI8") #define VX_KERNEL_NAME_PRE_PROCESS_YUV444_COPY_U8TOU8 CVIVANTE_NAMESPACE("evis.pre_process_yuv444_copy_U8toU8") +#define VX_KERNEL_NAME_PRE_PROCESS_YUV444_COPY_U8TOF16 CVIVANTE_NAMESPACE("evis.pre_process_yuv444_copy_U8toF16") #define KERNEL_SOURCE_1 "pre_process_yuv444_scale", #define KERNEL_SOURCE_3 "pre_process_yuv444_scale_fp16", @@ -75,6 +76,7 @@ static const struct { TENSOR_PRE_PROCESS_YUV444_KERNELS(U8, U8, SCALE, KERNEL_SOURCE_1) TENSOR_PRE_PROCESS_YUV444_KERNELS(U8, I8, SCALE, KERNEL_SOURCE_1) TENSOR_PRE_PROCESS_YUV444_KERNELS(U8, U8, COPY, KERNEL_SOURCE_4) + TENSOR_PRE_PROCESS_YUV444_KERNELS(U8, F16, COPY, KERNEL_SOURCE_4) }; static vx_param_description_t vxPreProcessYuv444Kernel_param_def[] = @@ -145,10 +147,6 @@ DEF_KERNEL_INITIALIZER(_pre_process_yuv444_copy_initializer) } shaderParam.global_scale[0] = 16; - if (attr[0]->dtype == I16 || attr[0]->dtype == F16) - { - shaderParam.global_scale[0] = 8; - } shaderParam.global_scale[1] = 1; shaderParam.global_scale[2] = 1; shaderParam.global_size[0] = 
gpu_align_p2((width + shaderParam.global_scale[0] - 1) @@ -400,6 +398,7 @@ DEF_KERNEL_INITIALIZER(_pre_process_yuv444_copy_initializer) switch( attr[0]->dtype ) { case U8: + case F16: { // R status |= vsi_nn_kernel_gpu_add_param(node, "uniCalculateTmpR1st_4x4", &uniCalculateTmpR1st_4x4); @@ -841,7 +840,7 @@ static vsi_status _query_kernel input0_dtype = vsi_nn_kernel_map_dtype( inputs[0]->attr.dtype.vx_type ); output_dtype = vsi_nn_kernel_map_dtype( outputs[0]->attr.dtype.vx_type ); - if (enable_copy && output_dtype == U8) + if (enable_copy && (output_dtype == U8 || output_dtype == F16)) { convert_type = COPY; } @@ -865,7 +864,7 @@ static vsi_status _query_kernel kernel->info.parameters = vxPreProcessYuv444Kernel_param_def; kernel->info.numParams = _cnt_of_array( vxPreProcessYuv444Kernel_param_def ); - if (enable_copy && output_dtype == U8) + if (enable_copy && (output_dtype == U8 || output_dtype == F16)) { kernel->info.initialize = _pre_process_yuv444_copy_initializer; } diff --git a/src/tim/vx/internal/src/kernel/evis/repeat_evis.c b/src/tim/vx/internal/src/kernel/evis/repeat_evis.c new file mode 100644 index 0000000..421caca --- /dev/null +++ b/src/tim/vx/internal/src/kernel/evis/repeat_evis.c @@ -0,0 +1,609 @@ +/**************************************************************************** +* +* Copyright (c) 2019 Vivante Corporation +* +* Permission is hereby granted, free of charge, to any person obtaining a +* copy of this software and associated documentation files (the "Software"), +* to deal in the Software without restriction, including without limitation +* the rights to use, copy, modify, merge, publish, distribute, sublicense, +* and/or sell copies of the Software, and to permit persons to whom the +* Software is furnished to do so, subject to the following conditions: +* +* The above copyright notice and this permission notice shall be included in +* all copies or substantial portions of the Software. +* +* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +* AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING +* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER +* DEALINGS IN THE SOFTWARE. +* +*****************************************************************************/ + + +#include +#include +#include +#include "vsi_nn_types.h" +#include "vsi_nn_tensor.h" +#include "vsi_nn_graph.h" +#include "vsi_nn_log.h" +#include "vsi_nn_prv.h" +#include "vsi_nn_tensor_util.h" +#include "vsi_nn_error.h" +#include "utils/vsi_nn_util.h" +#include "kernel/vsi_nn_kernel.h" +#include "libnnext/vx_lib_nnext.h" +#include "kernel/vsi_nn_kernel_gpu_shape_optimize.h" + +__BEGIN_DECLS + +/* + * Define kernel meta. 
+ */ + +#define KERNEL_SOURCE_1 "repeat" +#define KERNEL_SOURCE_2 "repeat_axis1" + +#define HASH_PREPROCESS_STARTID_SH_KERNEL_NAME \ + CVIVANTE_NAMESPACE("evis.preprocess_start_idx") + +#define HASH_REPEAT_SH_KERNEL_1D_NAME(SRC0_TYPE) \ + CVIVANTE_NAMESPACE("evis.repeat_"#SRC0_TYPE"_1D") + +#define HASH_REPEAT_SH_KERNEL_NAME(SRC0_TYPE, AXIS) \ + CVIVANTE_NAMESPACE("evis.repeat_"#SRC0_TYPE"_axis"#AXIS) + +// Add kernel hashtable here +#define HASH_PREPROCESS_KEY(_input0_type, _output_type) \ + ((_input0_type << 24) | (_output_type << 16)) + +#define HASH_REPEAT_KEY(_input0_type, _output_type, _is1d, _axis) \ + ((_input0_type << 24) | (_output_type << 16) | (_is1d << 8) | _axis) + +#define TENSOR_PREPROCESS_STARTID_KERNELS(IN0_TYPE, OUT_TYPE, SOURCE) \ + { HASH_PREPROCESS_KEY(IN0_TYPE, OUT_TYPE), \ + HASH_PREPROCESS_STARTID_SH_KERNEL_NAME, \ + SOURCE }, + +#define TENSOR_REPEAT_KERNELS(IN0_TYPE, OUT_TYPE, AXIS, SOURCE) \ + { HASH_REPEAT_KEY(IN0_TYPE, OUT_TYPE, 0, AXIS), \ + HASH_REPEAT_SH_KERNEL_NAME(IN0_TYPE, AXIS), \ + SOURCE }, + +#define TENSOR_REPEAT_1D_KERNELS(IN0_TYPE, OUT_TYPE, SOURCE) \ + { HASH_REPEAT_KEY(IN0_TYPE, OUT_TYPE, 1, 0), \ + HASH_REPEAT_SH_KERNEL_1D_NAME(IN0_TYPE), \ + SOURCE }, + +typedef struct +{ + uint32_t key; + char * function_name; + const char * source_name; +} _kernel_map_type; + +static const _kernel_map_type _preprocess_kernel_map[] = +{ + // Register kernel here + TENSOR_PREPROCESS_STARTID_KERNELS( I32, I32, KERNEL_SOURCE_1 ) +}; + +static const _kernel_map_type _repeat_kernel_map[] = +{ + // Register kernel here + TENSOR_REPEAT_KERNELS( U8, U8, 0, KERNEL_SOURCE_1 ) + TENSOR_REPEAT_KERNELS( U8, U8, 1, KERNEL_SOURCE_2 ) + TENSOR_REPEAT_KERNELS( U8, U8, 2, KERNEL_SOURCE_1 ) + TENSOR_REPEAT_KERNELS( I16, I16, 0, KERNEL_SOURCE_1 ) + TENSOR_REPEAT_KERNELS( I16, I16, 1, KERNEL_SOURCE_2 ) + TENSOR_REPEAT_KERNELS( I16, I16, 2, KERNEL_SOURCE_1 ) + + TENSOR_REPEAT_1D_KERNELS( U8, U8, KERNEL_SOURCE_1 ) + TENSOR_REPEAT_1D_KERNELS( I16, I16, KERNEL_SOURCE_1 ) +}; + +/* + * Kernel params + */ +static vx_param_description_t _preprocess_kernel_param_def[] = +{ + {VX_INPUT, VX_TYPE_TENSOR, VX_PARAMETER_STATE_REQUIRED}, + {VX_OUTPUT, VX_TYPE_TENSOR, VX_PARAMETER_STATE_REQUIRED}, + // Add kererl parameters here +}; +#define _REPEAT_PREPROCESS_PARAM_NUM _cnt_of_array( _preprocess_kernel_param_def ) + +static vx_param_description_t _repeat_kernel_param_def[] = +{ + {VX_INPUT, VX_TYPE_TENSOR, VX_PARAMETER_STATE_REQUIRED}, + {VX_INPUT, VX_TYPE_TENSOR, VX_PARAMETER_STATE_REQUIRED}, + {VX_INPUT, VX_TYPE_TENSOR, VX_PARAMETER_STATE_REQUIRED}, + {VX_OUTPUT, VX_TYPE_TENSOR, VX_PARAMETER_STATE_REQUIRED}, + {VX_INPUT, VX_TYPE_SCALAR, VX_PARAMETER_STATE_REQUIRED}, + // Add kererl parameters here +}; +#define _REPEAT_PARAM_NUM _cnt_of_array( _repeat_kernel_param_def ) + +/* + * Kernel initializer + */ +DEF_KERNEL_INITIALIZER(_preprocess_initializer) + ( + vsi_nn_kernel_node_t node, + const vsi_nn_kernel_node_param_t * param, + size_t param_size + ) +{ + vsi_status status = VSI_FAILURE; + gpu_param_t shaderParam = { + 3, // workdim + {0, 0, 0}, // globalWorkOffset: control the start location be processed in the image + {0, 0, 0}, // globalWorkScale: how many pixels could be processed by a single thread + {0, 0, 0}, // localWorkSize: local group size in thread + {0, 0, 0}}; // globalWorkSize: image size in thread + + vsi_nn_kernel_tensor_attr_t* attr[1] = {NULL}; + int32_t width = 0; + + attr[0] = vsi_nn_kernel_tensor_attr_create( (vsi_nn_kernel_tensor_t)param[0] ); + CHECK_PTR_FAIL_GOTO( 
attr[0], "Create tensor attr buffer fail.", OnError ); + + width = attr[0]->shape->data[0]; + + shaderParam.global_scale[0] = 16; + shaderParam.global_scale[1] = 1; + shaderParam.global_scale[2] = 1; + shaderParam.local_size[0] = 32; + shaderParam.local_size[1] = 1; + shaderParam.local_size[2] = 1; + shaderParam.global_size[0] = 32; + shaderParam.global_size[1] = 1; + shaderParam.global_size[2] = 1; + + status = vsi_nn_kernel_gpu_config( node, &shaderParam ); + CHECK_STATUS_FAIL_GOTO(status, OnError); + + { + gpu_dp_inst_t uniIntegralHorAcc_4x4 = {{ + 0xff3f0f03, // TCfg + 0x00000000, // ASelt + 0x00100000, 0x32100210, // ABin + 0x00000000, // BSelt + 0x00000000, 0x00000000, // BBin + 0x00002600, // AccumType, ConstantType, and PostShift + 0x00000000, 0x00000000, 0x00000000, 0x00000000, + 0x00000000, 0x00000000, 0x00000000, 0x00000000 // Constant + }, GPU_DP_TYPE_16 }; + status = vsi_nn_kernel_gpu_add_param(node, "uniIntegralHorAcc_4x4", &uniIntegralHorAcc_4x4); + status |= vsi_nn_kernel_gpu_add_param(node, "width", &width); + CHECK_STATUS_FAIL_GOTO(status, OnError ); + } + +OnError: + if (attr[0]) + { + vsi_nn_kernel_tensor_attr_release( &attr[0] ); + attr[0] = NULL; + } + + return status; +} + +DEF_KERNEL_INITIALIZER(_repeat_initializer) + ( + vsi_nn_kernel_node_t node, + const vsi_nn_kernel_node_param_t * param, + size_t param_size + ) +{ + vsi_status status = VSI_FAILURE; + gpu_param_t shaderParam = { + 3, // workdim + {0, 0, 0}, // globalWorkOffset: control the start location be processed in the image + {0, 0, 0}, // globalWorkScale: how many pixels could be processed by a single thread + {0, 0, 0}, // localWorkSize: local group size in thread + {0, 0, 0}}; // globalWorkSize: image size in thread + + vsi_nn_kernel_tensor_attr_t* attr[1] = {NULL}; + vsi_int_array_t * input_shape = NULL; + int32_t height = 0, width = 0, chn = 0; + int32_t is1d = 0; + int32_t axis = 0; + + attr[0] = vsi_nn_kernel_tensor_attr_create( (vsi_nn_kernel_tensor_t)param[0] ); + CHECK_PTR_FAIL_GOTO( attr[0], "Create tensor attr buffer fail.", OnError ); + + status = vsi_nn_kernel_scalar_read_int32((vsi_nn_kernel_scalar_t)param[4], &axis); + CHECK_STATUS_FAIL_GOTO(status, OnError ); + + input_shape = attr[0]->shape; + width = input_shape->data[0]; + height = input_shape->data[1]; + if (height == 1 && input_shape->size == 2) + { + is1d = 1; + } + chn = input_shape->size > 2 ? 
input_shape->data[2] : 1; + + if ((axis == 0 && is1d == 0) || axis == 2) + { + shaderParam.global_scale[0] = 16; + if (attr[0]->dtype == I16 || attr[0]->dtype == F16) + { + shaderParam.global_scale[0] = 8; + } + + shaderParam.global_scale[1] = 1; + shaderParam.global_scale[2] = 1; + } + else if (is1d) + { + shaderParam.global_scale[0] = 1; + shaderParam.global_scale[1] = 1; + shaderParam.global_scale[2] = 1; + } + else if (axis == 1) + { + shaderParam.global_scale[0] = 1; + shaderParam.global_scale[1] = 8; + shaderParam.global_scale[2] = 1; + } + shaderParam.global_size[0] = gpu_align_p2((width + shaderParam.global_scale[0] - 1) + / shaderParam.global_scale[0], 4); + shaderParam.global_size[1] = (height + shaderParam.global_scale[1] - 1) + / shaderParam.global_scale[1]; + shaderParam.global_size[2] = chn; + + status = vsi_nn_kernel_gpu_config( node, &shaderParam ); + CHECK_STATUS_FAIL_GOTO(status, OnError); + + { + gpu_dp_inst_t uniExtract1to8Short_2x8 = {{ + 0x11111111, // TCfg + 0x00000000, // ASelt + 0x00000000, 0x00000000, // ABin + 0x22222222, // BSelt + 0x00000000, 0x00000000, // BBin + 0x00000600, // AccumType, ConstantType, and PostShift + 0x00000001, 0x00000001, 0x00000001, 0x00000001, + 0x00000001, 0x00000001, 0x00000001, 0x00000001 // Constant + }, GPU_DP_TYPE_16 }; + + status = vsi_nn_kernel_gpu_add_param(node, "uniExtract1to8Short_2x8", &uniExtract1to8Short_2x8); + CHECK_STATUS_FAIL_GOTO(status, OnError ); + } + +OnError: + if (attr[0]) + { + vsi_nn_kernel_tensor_attr_release( &attr[0] ); + attr[0] = NULL; + } + + return status; +} + +/* + * Query kernel + */ +static vsi_status _query_kernel + ( + vsi_nn_tensor_t* const* const inputs, + vsi_nn_tensor_t* const* const outputs, + vsi_nn_kernel_t* kernel_preprocess, + vsi_nn_kernel_t* kernel, + int32_t axis + ) +{ + vsi_status status = VSI_FAILURE; + vsi_nn_kernel_dtype_e input0_dtype = U8; + vsi_nn_kernel_dtype_e input1_dtype = I32; + vsi_nn_kernel_dtype_e output_dtype = U8; + uint32_t key = 0; + int32_t is1d = inputs[0]->attr.dim_num == 1 ? 
1 : 0; + int i = 0; + + input0_dtype = vsi_nn_kernel_map_dtype( inputs[0]->attr.dtype.vx_type ); + input1_dtype = vsi_nn_kernel_map_dtype( inputs[1]->attr.dtype.vx_type ); + output_dtype = vsi_nn_kernel_map_dtype( outputs[0]->attr.dtype.vx_type ); + + if (input0_dtype == F16) + { + input0_dtype = I16; + } + if (output_dtype == F16) + { + output_dtype = I16; + } + + if (input0_dtype == I8) + { + input0_dtype = U8; + } + if (output_dtype == I8) + { + output_dtype = U8; + } + + key = HASH_PREPROCESS_KEY( input1_dtype, I32 ); + + for( i = 0; i < _cnt_of_array(_preprocess_kernel_map); i ++ ) + { + if ( _preprocess_kernel_map[i].key == key ) + { + break; + } + } + if ( i < _cnt_of_array(_preprocess_kernel_map) ) + { + snprintf( kernel_preprocess->info.name, VX_MAX_KERNEL_NAME, "%s", _preprocess_kernel_map[i].function_name ); + kernel_preprocess->info.parameters = _preprocess_kernel_param_def; + kernel_preprocess->info.numParams = _REPEAT_PREPROCESS_PARAM_NUM; + kernel_preprocess->info.initialize = _preprocess_initializer; + + vsi_nn_kernel_add_source( kernel_preprocess, VSI_NN_GPU_SOURCE_FMT_CODE, 2, + "vsi_nn_kernel_header", + _preprocess_kernel_map[i].source_name ); + vsi_nn_kernel_add_source( kernel_preprocess, VSI_NN_GPU_SOURCE_FMT_EXECUTABLE, 1, + _preprocess_kernel_map[i].source_name ); + } + + + key = HASH_REPEAT_KEY( input0_dtype, output_dtype, is1d, axis ); + + for( i = 0; i < _cnt_of_array(_repeat_kernel_map); i ++ ) + { + if ( _repeat_kernel_map[i].key == key ) + { + break; + } + } + if ( i < _cnt_of_array(_repeat_kernel_map) ) + { + snprintf( kernel->info.name, VX_MAX_KERNEL_NAME, "%s", _repeat_kernel_map[i].function_name ); + kernel->info.parameters = _repeat_kernel_param_def; + kernel->info.numParams = _REPEAT_PARAM_NUM; + kernel->info.initialize = _repeat_initializer; + + vsi_nn_kernel_add_source( kernel, VSI_NN_GPU_SOURCE_FMT_CODE, 2, + "vsi_nn_kernel_header", + _repeat_kernel_map[i].source_name ); + vsi_nn_kernel_add_source( kernel, VSI_NN_GPU_SOURCE_FMT_EXECUTABLE, 1, + _repeat_kernel_map[i].source_name ); + status = VSI_SUCCESS; + } + return status; +} /* _query_kernel() */ + +static int32_t _optimize_repeat_shape + ( + vsi_nn_tensor_t ** inputs, + vsi_nn_tensor_t ** outputs, + int32_t* axis, + int32_t* opt_shape_in, + int32_t* opt_shape_out, + int32_t* new_rank + ) +{ + vsi_status status = VSI_SUCCESS; + + if (inputs[0]->attr.dim_num == 1) + { + opt_shape_in[0] = inputs[0]->attr.size[0]; + opt_shape_in[1] = 1; + opt_shape_out[0] = outputs[0]->attr.size[0]; + opt_shape_out[1] = 1; + new_rank[0] = 2; + new_rank[1] = 2; + } + else if (axis[0] == 3) + { + vsi_nn_kernel_optimize_element_shape( (int32_t*)inputs[0]->attr.size, 3, opt_shape_in, new_rank ); + if (opt_shape_in[1] == 1) + { + opt_shape_in[1] = inputs[0]->attr.size[3]; + opt_shape_out[0] = opt_shape_in[0]; + opt_shape_out[1] = outputs[0]->attr.size[3]; + axis[0] = 0; + new_rank[0] = 2; + new_rank[1] = 2; + } + else if (new_rank[0] == 2) + { + opt_shape_in[2] = inputs[0]->attr.size[3]; + opt_shape_out[0] = opt_shape_in[0]; + opt_shape_out[1] = opt_shape_in[1]; + opt_shape_out[2] = outputs[0]->attr.size[3]; + axis[0] = 2; + new_rank[0] = 3; + new_rank[1] = 3; + } + else + { + status = VSI_FAILURE; + } + } + + return status; +} + +static vsi_nn_kernel_node_t _setup + ( + vsi_nn_graph_t * graph, + vsi_nn_tensor_t ** inputs, + size_t input_num, + vsi_nn_tensor_t ** outputs, + size_t output_num, + const vsi_nn_kernel_param_t * params, + vsi_nn_kernel_t * kernel + ) +{ + vsi_status status = VSI_FAILURE; + 
vsi_nn_kernel_node_param_t preprocess_node_params[_REPEAT_PREPROCESS_PARAM_NUM] = { NULL }; + vsi_nn_kernel_node_param_t node_params[_REPEAT_PARAM_NUM] = { NULL }; + vsi_nn_kernel_node_t tmp_node = NULL; + vsi_nn_kernel_node_t node = NULL; + vsi_nn_tensor_attr_t attr; + vsi_nn_kernel_t * kernel_preprocess = NULL; + vsi_nn_tensor_t * tensor_preprocess = NULL; + vsi_nn_kernel_tensor_t rs_input = NULL, rs_input1 = NULL, rs_output = NULL; + int32_t new_shape[2][VSI_NN_MAX_DIM_NUM] = {{ 1, 1, 1, 1 }, { 1, 1, 1, 1 }}; + int32_t new_rank[2] = {0, 0}; + int32_t axis = vsi_nn_kernel_param_get_int32( params, "axis" ); + + // Check if gpu can support the size + if ( !vsi_nn_kernel_gpu_check_shape( + (int32_t*)outputs[0]->attr.size, outputs[0]->attr.dim_num ) ) + { + return NULL; + } + + if (axis > 2 || outputs[0]->attr.dim_num == 1) + { + status = _optimize_repeat_shape(inputs, outputs, &axis, new_shape[0], new_shape[1], new_rank); + if ( VSI_SUCCESS != status ) + { + goto final; + } + rs_input = vsi_nn_kernel_tensor_reshape(inputs[0]->t, new_shape[0], new_rank[0]); + rs_output = vsi_nn_kernel_tensor_reshape(outputs[0]->t, new_shape[1], new_rank[1]); + } + + if (inputs[1]->attr.dim_num == 1) + { + new_shape[0][0] = inputs[1]->attr.size[0]; + new_shape[0][1] = 1; + rs_input1 = vsi_nn_kernel_tensor_reshape(inputs[1]->t, new_shape[0], 2); + } + + kernel_preprocess = vsi_nn_kernel_create( VSI_NN_KERNEL_TYPE_EVIS ); + // Assign unique_id + kernel_preprocess->unique_id = kernel->unique_id; + + status = _query_kernel( inputs, outputs, kernel_preprocess, kernel, axis ); + if ( VSI_SUCCESS != status ) + { + goto final; + } + + memset( &attr, 0, sizeof(vsi_nn_tensor_attr_t) ); + attr.dtype.vx_type = VSI_NN_TYPE_INT32; + attr.is_const = FALSE; + attr.vtl = TRUE; + attr.size[0] = inputs[1]->attr.size[0]; + attr.size[1] = 1; + attr.dim_num = 2; + tensor_preprocess = vsi_nn_CreateTensor( graph, &attr ); + + // preprocess + tmp_node = vsi_nn_kernel_create_node( graph, kernel_preprocess ); + if (tmp_node) + { + uint32_t index = 0; + if (rs_input1) + { + preprocess_node_params[index++] = rs_input1; + } + else + { + preprocess_node_params[index++] = (vsi_nn_kernel_node_param_t)inputs[1]->t; + } + preprocess_node_params[index++] = (vsi_nn_kernel_node_param_t)tensor_preprocess->t; + + status = vsi_nn_kernel_node_pass_param( tmp_node, preprocess_node_params, + _REPEAT_PREPROCESS_PARAM_NUM ); + CHECK_STATUS(status); + { + // Set default border mode. 
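+            // Out-of-bounds reads use a constant border; for asymmetric U8
+            // input the constant is the tensor's zero point (a dequantized 0),
+            // while other integer types fall back to a constant of 0.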
+ vx_border_t border; + border.mode = VX_BORDER_CONSTANT; + border.constant_value.U8 = 0; + border.constant_value.U16 = 0; + border.constant_value.S32 = 0; + if (inputs[0]->attr.dtype.vx_type == VSI_NN_TYPE_UINT8) + { + border.constant_value.U8 = (vx_uint8)inputs[0]->attr.dtype.zero_point; + } + status = vxSetNodeAttribute( (vx_node)tmp_node, VX_NODE_BORDER, &border, sizeof(border) ); + CHECK_STATUS(status); + } + } + + // repeat + node = vsi_nn_kernel_create_node( graph, kernel ); + if (node) + { + uint32_t index = 0; + if (rs_input) + { + node_params[index++] = rs_input; + } + else + { + node_params[index++] = (vsi_nn_kernel_node_param_t)inputs[0]->t; + } + if (rs_input1) + { + node_params[index++] = rs_input1; + } + else + { + node_params[index++] = (vsi_nn_kernel_node_param_t)inputs[1]->t; + } + node_params[index++] = (vsi_nn_kernel_node_param_t)tensor_preprocess->t; + if (rs_output) + { + node_params[index++] = rs_output; + } + else + { + node_params[index++] = (vsi_nn_kernel_node_param_t)outputs[0]->t; + } + node_params[index++] = vsi_nn_kernel_scalar_create( graph, I32, &axis ); + + status = vsi_nn_kernel_node_pass_param( node, node_params, + _REPEAT_PARAM_NUM ); + CHECK_STATUS(status); + vsi_nn_kernel_scalar_release( &node_params[4] ); + { + // Set default border mode. + vx_border_t border; + border.mode = VX_BORDER_REPLICATE; + status = vxSetNodeAttribute( (vx_node)node, VX_NODE_BORDER, &border, sizeof(border) ); + CHECK_STATUS(status); + } + } + + /* Pass parameters to node. */ +final: + if (rs_input) + { + vsi_nn_kernel_tensor_release( &rs_input ); + } + if (rs_input1) + { + vsi_nn_kernel_tensor_release( &rs_input1 ); + } + if (rs_output) + { + vsi_nn_kernel_tensor_release( &rs_output ); + } + if ( kernel_preprocess ) + { + vsi_nn_kernel_release( &kernel_preprocess ); + } + if ( tensor_preprocess ) + { + vsi_nn_ReleaseTensor( &tensor_preprocess ); + } + if (tmp_node) {vsi_nn_kernel_node_release( &tmp_node );} + return node; +} /* _setup() */ + +__END_DECLS + +REGISTER_BACKEND_EVIS( repeat, _setup ) + diff --git a/src/tim/vx/internal/src/kernel/evis/resize_bilinear_evis.c b/src/tim/vx/internal/src/kernel/evis/resize_bilinear_evis.c index af3e06f..194fb3b 100644 --- a/src/tim/vx/internal/src/kernel/evis/resize_bilinear_evis.c +++ b/src/tim/vx/internal/src/kernel/evis/resize_bilinear_evis.c @@ -49,11 +49,13 @@ typedef enum UP, UP_OPT, UP_2X_HALF, + UP_3X_HALF, + UP_4X_HALF, } _internal_scale_e; #define _RESIZE_BILINEAR_KERNEL_SOURCE(_input_type) "resize_bilinear_"#_input_type #define _RESIZE_BILINEAR_KERNEL_SOURCE_OPT(_input_type) "resize_bilinear_"#_input_type"_opt" -#define _RESIZE_BILINEAR_KERNEL_SOURCE_UP_2X(_input_type) "resize_bilinear_"#_input_type"_UP_2X" +#define _RESIZE_BILINEAR_KERNEL_SOURCE_UP_HPC(_input_type) "resize_bilinear_"#_input_type"_half_pixel_centers" #define STR(a) #a // Add kernel hashtable here @@ -77,8 +79,21 @@ typedef enum #define PACK_KERNEL_MAP_UP_2X_HALF( IN_DTYPE, OUT_DTYPE ) \ { RESIZE_BILINEAR_HASH_KEY( IN_DTYPE, OUT_DTYPE, UP_2X_HALF ), \ - CVIVANTE_NAMESPACE("evis.resize_bilinear_"STR(IN_DTYPE)"to"STR(OUT_DTYPE)"_UP_2X_half"), \ - _RESIZE_BILINEAR_KERNEL_SOURCE_UP_2X(IN_DTYPE) } + CVIVANTE_NAMESPACE("evis.resize_bilinear_"STR(IN_DTYPE)"to"STR(OUT_DTYPE) \ + "_SAME_2x_upsample_half_pixel_centers"), \ + _RESIZE_BILINEAR_KERNEL_SOURCE_UP_HPC(IN_DTYPE) } + +#define PACK_KERNEL_MAP_UP_4X_HALF( IN_DTYPE, OUT_DTYPE ) \ + { RESIZE_BILINEAR_HASH_KEY( IN_DTYPE, OUT_DTYPE, UP_4X_HALF ), \ + 
CVIVANTE_NAMESPACE("evis.resize_bilinear_"STR(IN_DTYPE)"to"STR(OUT_DTYPE) \ + "_SAME_4x_upsample_half_pixel_centers"), \ + _RESIZE_BILINEAR_KERNEL_SOURCE_UP_HPC(IN_DTYPE) } + +#define PACK_KERNEL_MAP_UP_3X_HALF( IN_DTYPE, OUT_DTYPE ) \ + { RESIZE_BILINEAR_HASH_KEY( IN_DTYPE, OUT_DTYPE, UP_3X_HALF ), \ + CVIVANTE_NAMESPACE("evis.resize_bilinear_"STR(IN_DTYPE)"to"STR(OUT_DTYPE) \ + "_SAME_3x_upsample_half_pixel_centers"), \ + _RESIZE_BILINEAR_KERNEL_SOURCE_UP_HPC(IN_DTYPE) } typedef struct { @@ -103,6 +118,8 @@ static const _kernel_map_type _resize_bilinear_kernel_map[] = PACK_KERNEL_MAP_UP(BF16, BF16), PACK_KERNEL_MAP_UP_OPT(U8, U8), PACK_KERNEL_MAP_UP_2X_HALF(U8, U8), + PACK_KERNEL_MAP_UP_3X_HALF(U8, U8), + PACK_KERNEL_MAP_UP_4X_HALF(U8, U8), }; @@ -203,8 +220,10 @@ DEF_KERNEL_INITIALIZER(_resize_bilinear_initializer) uint32_t out_height; float half_pixel_value = 0.0f; vsi_bool is_use_scale_kernel = (vsi_bool)(_RESIZE_BILINEAR_PARAM_NUM == param_size); - vsi_bool is_use_2x_up_half_kernel = FALSE; - + vsi_bool is_half_pixel_centers = FALSE; + vsi_bool is_2x_up_kernel = FALSE; + vsi_bool is_3x_up_kernel = FALSE; + vsi_bool is_4x_up_kernel = FALSE; input_attr = vsi_nn_kernel_tensor_attr_create( (vsi_nn_kernel_tensor_t)param[0] ); CHECK_PTR_FAIL_GOTO( input_attr, "Create tensor attr buffer fail.", final ); @@ -254,11 +273,13 @@ DEF_KERNEL_INITIALIZER(_resize_bilinear_initializer) half_pixel_value = 0.0f; } - if ((U8 == input_dtype) && (_is_same_quant(input_attr, output_attr))) + is_half_pixel_centers = (!align_corners) && (half_pixel_centers); + + if ((U8 == input_dtype) && (_is_same_quant(input_attr, output_attr)) && is_half_pixel_centers) { - is_use_2x_up_half_kernel = (!align_corners) && (half_pixel_centers); - is_use_2x_up_half_kernel = is_use_2x_up_half_kernel && \ - (2 * in_width == out_width) && (2 * in_height == out_height); + is_2x_up_kernel = (2 * in_width == out_width) && (2 * in_height == out_height); + is_3x_up_kernel = (3 * in_width == out_width) && (3 * in_height == out_height); + is_4x_up_kernel = (4 * in_width == out_width) && (4 * in_height == out_height); } if (U8 == input_dtype && VSI_NN_KERNEL_QUANT_ASYMM == input_attr->quant ) @@ -309,11 +330,17 @@ DEF_KERNEL_INITIALIZER(_resize_bilinear_initializer) outputZP = 0; } - if (is_use_2x_up_half_kernel) + if (is_2x_up_kernel || is_4x_up_kernel) { - gpu_param.global_scale[0] = 8; + gpu_param.global_scale[0] = 16; gpu_param.global_scale[1] = 1; } + else if (is_3x_up_kernel) + { + gpu_param.global_scale[0] = 15; + gpu_param.global_scale[1] = 6; + gpu_param.global_scale[2] = 1; + } else { gpu_param.global_scale[0] = 4; @@ -321,28 +348,134 @@ DEF_KERNEL_INITIALIZER(_resize_bilinear_initializer) gpu_param.global_scale[2] = 1; } - if (is_use_2x_up_half_kernel) + if (is_2x_up_kernel) { - gpu_dp_inst_t uniResize2xUp_4x8 = {{ + gpu_dp_inst_t uniResize2xUp_0_4x8 = {{ 0x55555555, 0x55555555, // TCfg 0x4418c020, 0x29444194, 0xc8629c86, 0x83a4c839, 0xad0a4a4c, // BinSelect - 0x00000400, // AccumType, ConstantType, and PostShift + 0x00000704, // AccumType, ConstantType, and PostShift 0x09030301, 0x03090103, 0x09030301, 0x03090103, 0x09030301, 0x03090103, 0x09030301, 0x03090103 // Constant }, GPU_DP_TYPE_16}; - gpu_dp_inst_t uniResize2xUpRound_2x8 = {{ - 0x55555555, // TCfg - 0x44444444, // ASelt - 0x03020100, 0x07060504, // ABin - 0xaaaaaaaa, // BSelt - 0x00000000, 0x00000000, // BBin + gpu_dp_inst_t uniResize2xUp_1_4x8 = {{ + 0x55555555, 0x55555555, // TCfg + 0x4c5ad0a4, 0x6b54c5b5, 0xd8e6bd8e, 0x07c5d07b, 0xce128c5d, // BinSelect 
0x00000704, // AccumType, ConstantType, and PostShift - 0x00010001, 0x00010001, 0x00010001, 0x00010001, - 0x00010001, 0x00010001, 0x00010001, 0x00010001 // Constant + 0x09030301, 0x03090103, 0x09030301, 0x03090103, + 0x09030301, 0x03090103, 0x09030301, 0x03090103 // Constant }, GPU_DP_TYPE_16}; - status = vsi_nn_kernel_gpu_add_param( node, "uniResize2xUp_4x8", &uniResize2xUp_4x8); - status |= vsi_nn_kernel_gpu_add_param( node, "uniResize2xUpRound_2x8", &uniResize2xUpRound_2x8); + status = vsi_nn_kernel_gpu_add_param( node, "uniResize2xUp_0_4x8", &uniResize2xUp_0_4x8); + status |= vsi_nn_kernel_gpu_add_param( node, "uniResize2xUp_1_4x8", &uniResize2xUp_1_4x8); + status |= vsi_nn_kernel_gpu_add_param( node, "out_height", &out_height); + CHECK_STATUS_FAIL_GOTO(status, final ); + } + else if (is_3x_up_kernel) + { + gpu_dp_inst_t uniResize3xUp_l00_2x8 = {{ + 0x15515515, // TCfg + 0x00000000, // ASelt + 0x21210110, 0x03323202, // ABin + 0x2aa2aa2a, // BSelt + 0x00000000, 0x00000000, // BBin + 0x00000610, // AccumType, ConstantType, and PostShift + 0xaaaa5555, 0x0000ffff, 0x5555aaaa, 0xaaaa5555, + 0x0000ffff, 0x5555aaaa, 0xaaaa5555, 0x0000ffff // Constant + }, GPU_DP_TYPE_16}; + gpu_dp_inst_t uniResize3xUp_l01_2x8 = {{ + 0x05155155, // TCfg + 0x00000000, // ASelt + 0x54044343, 0x00650554, // ABin + 0x0a2aa2aa, // BSelt + 0x00000000, 0x00000000, // BBin + 0x00000610, // AccumType, ConstantType, and PostShift + 0x5555aaaa, 0xaaaa5555, 0x0000ffff, 0x5555aaaa, + 0xaaaa5555, 0x0000ffff, 0x5555aaaa, 0x00000000 // Constant + }, GPU_DP_TYPE_16}; + gpu_dp_inst_t uniResize3xUp_l10_4x4 = {{ + 0x55551155, // TCfg + 0x50501050, // ASelt + 0x01011010, 0x21212121, // ABin + 0xaaaa22aa, // BSelt + 0x00000000, 0x00000000, // BBin + 0x0000060f, // AccumType, ConstantType, and PostShift + 0x38e41c72, 0x1c720e39, 0x00005556, 0x00002aab, + 0x1c7238e4, 0x0e391c72, 0x38e41c72, 0x1c720e39 // Constant + }, GPU_DP_TYPE_16}; + gpu_dp_inst_t uniResize3xUp_l11_4x4 = {{ + 0x11555511, // TCfg + 0x10505010, // ASelt + 0x32320202, 0x03033232, // ABin + 0x22aaaa22, // BSelt + 0x00000000, 0x00000000, // BBin + 0x0000060f, // AccumType, ConstantType, and PostShift + 0x00005556, 0x00002aab, 0x1c7238e4, 0x0e391c72, + 0x38e41c72, 0x1c720e39, 0x00005556, 0x00002aab // Constant + }, GPU_DP_TYPE_16}; + gpu_dp_inst_t uniResize3xUp_l12_4x4 = {{ + 0x55115555, // TCfg + 0x50105050, // ASelt + 0x43434343, 0x54540404, // ABin + 0xaa22aaaa, // BSelt + 0x00000000, 0x00000000, // BBin + 0x0000060f, // AccumType, ConstantType, and PostShift + 0x1c7238e4, 0x0e391c72, 0x38e41c72, 0x1c720e39, + 0x00005556, 0x00002aab, 0x1c7238e4, 0x0e391c72 // Constant + }, GPU_DP_TYPE_16}; + gpu_dp_inst_t uniResize3xUp_l13_4x4 = {{ + 0x00551155, // TCfg + 0x00501050, // ASelt + 0x05055454, 0x00006565, // ABin + 0x00aa22aa, // BSelt + 0x00000000, 0x00000000, // BBin + 0x0000060f, // AccumType, ConstantType, and PostShift + 0x38e41c72, 0x1c720e39, 0x00005556, 0x00002aab, + 0x1c7238e4, 0x0e391c72, 0x00000000, 0x00000000 // Constant + }, GPU_DP_TYPE_16}; + + status = vsi_nn_kernel_gpu_add_param( node, "uniResize3xUp_l00_2x8", &uniResize3xUp_l00_2x8); + status |= vsi_nn_kernel_gpu_add_param( node, "uniResize3xUp_l01_2x8", &uniResize3xUp_l01_2x8); + status |= vsi_nn_kernel_gpu_add_param( node, "uniResize3xUp_l10_4x4", &uniResize3xUp_l10_4x4); + status |= vsi_nn_kernel_gpu_add_param( node, "uniResize3xUp_l11_4x4", &uniResize3xUp_l11_4x4); + status |= vsi_nn_kernel_gpu_add_param( node, "uniResize3xUp_l12_4x4", &uniResize3xUp_l12_4x4); + status |= 
vsi_nn_kernel_gpu_add_param( node, "uniResize3xUp_l13_4x4", &uniResize3xUp_l13_4x4); + CHECK_STATUS_FAIL_GOTO(status, final ); + } + else if (is_4x_up_kernel) + { + gpu_dp_inst_t uniResize4xUp_l00_4x8 = {{ + 0x55555555, 0x55555555, // TCfg + 0x0208c020, 0x1944418c, 0x44419444, 0x62944419, 0x9c8629c8, // BinSelect + 0x00000406, // AccumType, ConstantType, and PostShift + 0x190f0f09, 0x23051503, 0x05230315, 0x0f19090f, + 0x190f0f09, 0x23051503, 0x05230315, 0x0f19090f // Constant + }, GPU_DP_TYPE_16}; + gpu_dp_inst_t uniResize4xUp_l01_4x8 = {{ + 0x55555555, 0x55555555, // TCfg + 0x8629c862, 0x3a4c839c, 0x4c83a4c8, 0xa4a4c83a, 0xad0a4ad0, // BinSelect + 0x00000406, // AccumType, ConstantType, and PostShift + 0x190f0f09, 0x23051503, 0x05230315, 0x0f19090f, + 0x190f0f09, 0x23051503, 0x05230315, 0x0f19090f // Constant + }, GPU_DP_TYPE_16}; + gpu_dp_inst_t uniResize4xUp_l10_4x8 = {{ + 0x55555555, 0x55555555, // TCfg + 0x0208c020, 0x1944418c, 0x44419444, 0x62944419, 0x9c8629c8, // BinSelect + 0x00000406, // AccumType, ConstantType, and PostShift + 0x23150503, 0x31070701, 0x07310107, 0x15230305, + 0x23150503, 0x31070701, 0x07310107, 0x15230305 // Constant + }, GPU_DP_TYPE_16}; + gpu_dp_inst_t uniResize4xUp_l11_4x8 = {{ + 0x55555555, 0x55555555, // TCfg + 0x8629c862, 0x3a4c839c, 0x4c83a4c8, 0xa4a4c83a, 0xad0a4ad0, // BinSelect + 0x00000406, // AccumType, ConstantType, and PostShift + 0x23150503, 0x31070701, 0x07310107, 0x15230305, + 0x23150503, 0x31070701, 0x07310107, 0x15230305 // Constant + }, GPU_DP_TYPE_16}; + + status = vsi_nn_kernel_gpu_add_param( node, "uniResize4xUp_l00_4x8", &uniResize4xUp_l00_4x8); + status |= vsi_nn_kernel_gpu_add_param( node, "uniResize4xUp_l01_4x8", &uniResize4xUp_l01_4x8); + status |= vsi_nn_kernel_gpu_add_param( node, "uniResize4xUp_l10_4x8", &uniResize4xUp_l10_4x8); + status |= vsi_nn_kernel_gpu_add_param( node, "uniResize4xUp_l11_4x8", &uniResize4xUp_l11_4x8); status |= vsi_nn_kernel_gpu_add_param( node, "out_height", &out_height); CHECK_STATUS_FAIL_GOTO(status, final ); } @@ -832,13 +965,13 @@ DEF_KERNEL_INITIALIZER(_resize_bilinear_initializer) goto final; } - if (!is_use_2x_up_half_kernel) + if (!is_2x_up_kernel && !is_3x_up_kernel && !is_4x_up_kernel) { status = vsi_nn_kernel_gpu_add_param( node, "half_pixel_value", &half_pixel_value); CHECK_STATUS_FAIL_GOTO(status, final ); } - if (is_use_2x_up_half_kernel) + if (is_2x_up_kernel || is_4x_up_kernel) { gpu_param.global_size[0] = gpu_align_p2((out_width + \ gpu_param.global_scale[0] - 1) / gpu_param.global_scale[0], 4); @@ -860,8 +993,6 @@ final: return status; } /* _resize_bilinear_initializer() */ - - /* * Query kernel */ @@ -872,7 +1003,8 @@ static vsi_status _query_kernel vsi_nn_tensor_t * const * const outputs, vsi_bool is_same_type, vsi_bool is_evis2, - vsi_bool is_2x_up_half, + int32_t align_corners, + int32_t half_pixel_centers, vsi_bool *is_run_opt_kernel ) { @@ -886,17 +1018,35 @@ static vsi_status _query_kernel vx_kernel_initialize_f initializer = _resize_bilinear_initializer; uint32_t key; uint32_t i; + vsi_bool is_2x_upsample =(2 * inputs[0]->attr.size[0] == outputs[0]->attr.size[0]) \ + && (2 * inputs[0]->attr.size[1] == outputs[0]->attr.size[1]); + vsi_bool is_3x_upsample =(3 * inputs[0]->attr.size[0] == outputs[0]->attr.size[0]) \ + && (3 * inputs[0]->attr.size[1] == outputs[0]->attr.size[1]); + vsi_bool is_4x_upsample =(4 * inputs[0]->attr.size[0] == outputs[0]->attr.size[0]) \ + && (4 * inputs[0]->attr.size[1] == outputs[0]->attr.size[1]); _internal_scale_e scale_flag = UP; in_dtype = 
vsi_nn_kernel_map_dtype( inputs[0]->attr.dtype.vx_type ); out_dtype = vsi_nn_kernel_map_dtype( outputs[0]->attr.dtype.vx_type ); + is_2x_upsample &= (in_dtype == U8); + is_3x_upsample &= (in_dtype == U8); + is_4x_upsample &= (in_dtype == U8); + if (outputs[0]->attr.size[0] > inputs[0]->attr.size[0]) { - if (is_2x_up_half) + if (is_same_type && (!align_corners) && (half_pixel_centers) && is_2x_upsample) { scale_flag = UP_2X_HALF; } + else if (is_same_type && (!align_corners) && (half_pixel_centers) && is_3x_upsample) + { + scale_flag = UP_3X_HALF; + } + else if (is_same_type && (!align_corners) && (half_pixel_centers) && is_4x_upsample) + { + scale_flag = UP_4X_HALF; + } else if (is_same_type && is_evis2) { scale_flag = UP_OPT; @@ -920,19 +1070,6 @@ static vsi_status _query_kernel } } - if ((UP_2X_HALF == scale_flag) && (i >= kernel_map_size) && is_same_type && is_evis2) - { - scale_flag = UP_OPT; - key = RESIZE_BILINEAR_HASH_KEY( in_dtype, out_dtype, scale_flag ); - for( i = 0; i < (uint32_t)kernel_map_size; i ++ ) - { - if( kernel_map[i].key == key ) - { - break; - } - } - } - if ((UP_OPT == scale_flag) && (i >= kernel_map_size)) { scale_flag = UP; @@ -1109,9 +1246,6 @@ OnError: return scale; } - - - static vsi_nn_kernel_node_t _setup ( vsi_nn_graph_t * graph, @@ -1131,14 +1265,10 @@ static vsi_nn_kernel_node_t _setup vsi_bool is_same_type = vsi_nn_is_same_type(inputs[0], outputs[0]); vsi_bool is_evis2 = (vsi_bool)(graph->ctx->config.evis.ver == VSI_NN_HW_EVIS_2); vsi_bool is_run_opt_kernel = FALSE; - vsi_bool is_2x_up_half = FALSE; vsi_nn_tensor_t* scale = NULL; - is_2x_up_half = is_same_type && (!align_corners) && (half_pixel_centers); - is_2x_up_half = is_2x_up_half && (2 * inputs[0]->attr.size[0] == outputs[0]->attr.size[0]) \ - && (2 * inputs[0]->attr.size[1] == outputs[0]->attr.size[1]); status = _query_kernel( kernel, inputs, outputs, is_same_type, is_evis2, - is_2x_up_half, &is_run_opt_kernel); + align_corners, half_pixel_centers, &is_run_opt_kernel); if( VSI_SUCCESS == status) { node = vsi_nn_kernel_create_node( graph, kernel ); diff --git a/src/tim/vx/internal/src/kernel/evis/sequence_mask_evis.c b/src/tim/vx/internal/src/kernel/evis/sequence_mask_evis.c new file mode 100644 index 0000000..07c7266 --- /dev/null +++ b/src/tim/vx/internal/src/kernel/evis/sequence_mask_evis.c @@ -0,0 +1,393 @@ +/**************************************************************************** +* +* Copyright (c) 2020 Vivante Corporation +* +* Permission is hereby granted, free of charge, to any person obtaining a +* copy of this software and associated documentation files (the "Software"), +* to deal in the Software without restriction, including without limitation +* the rights to use, copy, modify, merge, publish, distribute, sublicense, +* and/or sell copies of the Software, and to permit persons to whom the +* Software is furnished to do so, subject to the following conditions: +* +* The above copyright notice and this permission notice shall be included in +* all copies or substantial portions of the Software. +* +* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +* AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING +* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER +* DEALINGS IN THE SOFTWARE. 
+* +*****************************************************************************/ + +#include +#include +#include +#include "vsi_nn_types.h" +#include "vsi_nn_tensor.h" +#include "vsi_nn_graph.h" +#include "vsi_nn_log.h" +#include "vsi_nn_prv.h" +#include "vsi_nn_tensor_util.h" +#include "vsi_nn_error.h" +#include "utils/vsi_nn_util.h" +#include "kernel/vsi_nn_kernel.h" +#include "kernel/vsi_nn_kernel_eltwise.h" +#include "kernel/vsi_nn_kernel_gpu_shape_optimize.h" + +__BEGIN_DECLS + +#define KERNEL_NAME_SEQUENCE_MASK_U8TOU8 CVIVANTE_NAMESPACE("evis.sequence_mask_U8toU8") +#define KERNEL_NAME_SEQUENCE_MASK_U8TOU8_2D CVIVANTE_NAMESPACE("evis.sequence_mask_U8toU8_2D") +#define KERNEL_NAME_SEQUENCE_MASK_I8TOI8 CVIVANTE_NAMESPACE("evis.sequence_mask_I8toI8") +#define KERNEL_NAME_SEQUENCE_MASK_I8TOI8_2D CVIVANTE_NAMESPACE("evis.sequence_mask_I8toI8_2D") +#define KERNEL_NAME_SEQUENCE_MASK_I8TOU8 CVIVANTE_NAMESPACE("evis.sequence_mask_I8toU8") +#define KERNEL_NAME_SEQUENCE_MASK_I8TOU8_2D CVIVANTE_NAMESPACE("evis.sequence_mask_I8toU8_2D") +#define KERNEL_NAME_SEQUENCE_MASK_I16TOI16 CVIVANTE_NAMESPACE("evis.sequence_mask_I16toI16") +#define KERNEL_NAME_SEQUENCE_MASK_I16TOI16_2D CVIVANTE_NAMESPACE("evis.sequence_mask_I16toI16_2D") +#define KERNEL_NAME_SEQUENCE_MASK_I16TOU8 CVIVANTE_NAMESPACE("evis.sequence_mask_I16toU8") +#define KERNEL_NAME_SEQUENCE_MASK_I16TOU8_2D CVIVANTE_NAMESPACE("evis.sequence_mask_I16toU8_2D") +#define KERNEL_NAME_SEQUENCE_MASK_F16TOF16 CVIVANTE_NAMESPACE("evis.sequence_mask_F16toF16") +#define KERNEL_NAME_SEQUENCE_MASK_F16TOF16_2D CVIVANTE_NAMESPACE("evis.sequence_mask_F16toF16_2D") +#define KERNEL_NAME_SEQUENCE_MASK_F16TOU8 CVIVANTE_NAMESPACE("evis.sequence_mask_F16toU8") +#define KERNEL_NAME_SEQUENCE_MASK_F16TOU8_2D CVIVANTE_NAMESPACE("evis.sequence_mask_F16toU8_2D") + +#define KERNEL_SOURCE_1 "sequence_mask" + +#define HASH_SEQUENCE_MASK_KEY(_input0_type, _output_type, _is2D) \ + ((_input0_type << 24) | (_output_type << 16) | (_is2D)) + +#define TENSOR_SEQUENCE_MASK_KERNELS(IN0_TYPE, OUT_TYPE, SOURCE) \ + { HASH_SEQUENCE_MASK_KEY(IN0_TYPE, OUT_TYPE, 0), \ + KERNEL_NAME_SEQUENCE_MASK_##IN0_TYPE##TO##OUT_TYPE, \ + SOURCE }, + +#define TENSOR_SEQUENCE_MASK_2D_KERNELS(IN0_TYPE, OUT_TYPE, SOURCE) \ + { HASH_SEQUENCE_MASK_KEY(IN0_TYPE, OUT_TYPE, 1), \ + KERNEL_NAME_SEQUENCE_MASK_##IN0_TYPE##TO##OUT_TYPE##_2D, \ + SOURCE }, + +static const struct { + uint32_t key; + char* function_name; + const char* source_name; + } kernel_map[] = +{ + TENSOR_SEQUENCE_MASK_KERNELS(U8, U8, KERNEL_SOURCE_1) + TENSOR_SEQUENCE_MASK_2D_KERNELS(U8, U8, KERNEL_SOURCE_1) + TENSOR_SEQUENCE_MASK_KERNELS(I8, I8, KERNEL_SOURCE_1) + TENSOR_SEQUENCE_MASK_2D_KERNELS(I8, I8, KERNEL_SOURCE_1) + TENSOR_SEQUENCE_MASK_KERNELS(I16, I16, KERNEL_SOURCE_1) + TENSOR_SEQUENCE_MASK_2D_KERNELS(I16, I16, KERNEL_SOURCE_1) + TENSOR_SEQUENCE_MASK_KERNELS(F16, F16, KERNEL_SOURCE_1) + TENSOR_SEQUENCE_MASK_2D_KERNELS(F16, F16, KERNEL_SOURCE_1) + TENSOR_SEQUENCE_MASK_KERNELS(I16, U8, KERNEL_SOURCE_1) + TENSOR_SEQUENCE_MASK_2D_KERNELS(I16, U8, KERNEL_SOURCE_1) + TENSOR_SEQUENCE_MASK_KERNELS(I8, U8, KERNEL_SOURCE_1) + TENSOR_SEQUENCE_MASK_2D_KERNELS(I8, U8, KERNEL_SOURCE_1) + TENSOR_SEQUENCE_MASK_KERNELS(F16, U8, KERNEL_SOURCE_1) + TENSOR_SEQUENCE_MASK_2D_KERNELS(F16, U8, KERNEL_SOURCE_1) +}; + +static vx_param_description_t kernel_param_def[] = +{ + {VX_INPUT, VX_TYPE_TENSOR, VX_PARAMETER_STATE_REQUIRED}, + {VX_OUTPUT, VX_TYPE_TENSOR, VX_PARAMETER_STATE_REQUIRED}, + {VX_INPUT, VX_TYPE_SCALAR, VX_PARAMETER_STATE_REQUIRED}, +}; 
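+// Parameter order must match the node setup below: reshaped lengths tensor,
+// reshaped output mask tensor, then the max_len scalar.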
+#define _EVIS_PARAM_NUM _cnt_of_array(kernel_param_def) + +DEF_KERNEL_INITIALIZER(_sequence_mask_initializer) + ( + vsi_nn_kernel_node_t node, + const vsi_nn_kernel_node_param_t * param, + size_t param_size + ) +{ + vsi_status status = VSI_FAILURE; + gpu_param_t gpu_param = { + 3, + {0, 0, 0}, + {0, 0, 0}, + {0, 0, 0}, + {0, 0, 0} + }; + + vsi_nn_kernel_tensor_attr_t * attr[2] = { NULL, NULL }; + vsi_int_array_t * out_shape = NULL; + float scaleIn = 1.0f; + float scaleOut = 1.0f; + float outputVal1 = 1.0f; + int32_t output_zp = 0; + int32_t input_zp = 0; + + attr[0] = vsi_nn_kernel_tensor_attr_create( (vsi_nn_kernel_tensor_t)param[0] ); + CHECK_PTR_FAIL_GOTO( attr[0], "Create tensor attr buffer fail.", final ); + attr[1] = vsi_nn_kernel_tensor_attr_create( (vsi_nn_kernel_tensor_t)param[1] ); + CHECK_PTR_FAIL_GOTO( attr[1], "Create tensor attr buffer fail.", final ); + + out_shape = attr[1]->shape; + + if (attr[0]->quant == VSI_NN_KERNEL_QUANT_ASYMM) + { + input_zp = attr[0]->asymm.zero_point; + scaleIn = attr[0]->asymm.scale; + } + else if (attr[0]->quant == VSI_NN_KERNEL_QUANT_DFP) + { + if (attr[0]->dfp.fl > 0) + { + scaleIn = (1.0f / ((float) ((int64_t)1 << attr[0]->dfp.fl))); + } + else + { + scaleIn = ((float) ((int64_t)1 << -attr[0]->dfp.fl)); + } + input_zp = 0; + } + + if (attr[1]->quant == VSI_NN_KERNEL_QUANT_ASYMM) + { + output_zp = attr[1]->asymm.zero_point; + scaleOut = 1.0f / attr[1]->asymm.scale; + } + else if (attr[1]->quant == VSI_NN_KERNEL_QUANT_DFP) + { + if (attr[1]->dfp.fl > 0) + { + scaleOut = (float)((int64_t)1 << attr[1]->dfp.fl); + } + else + { + scaleOut = (1.0f / (float)((int64_t)1 << -attr[1]->dfp.fl)); + } + output_zp = 0; + } + + outputVal1 = scaleOut + (float)output_zp; + + gpu_param.global_scale[0] = 4; + gpu_param.global_scale[1] = 1; + gpu_param.global_scale[2] = 1; + + gpu_param.global_size[0] = gpu_align_p2( + (out_shape->data[0] + gpu_param.global_scale[0] - 1) + / gpu_param.global_scale[0], 4); + gpu_param.global_size[1] = ( + (out_shape->data[1] + gpu_param.global_scale[1] - 1) + / gpu_param.global_scale[1]); + gpu_param.global_size[2] = out_shape->size > 2 ? 
out_shape->data[2] : 1; + + { + gpu_dp_inst_t uniConvert1stUint8SubZpToFp32_4x4 = {{ + 0x05050505, // TCfg + 0x04040404, // ASelt + 0x00010000, 0x00030002, // ABin + 0x0a0a0a0a, // BSelt + 0x00000000, 0x00000000, // BBin + 0x00000400, // AccumType, ConstantType, and PostShift + 0xffff0001, 0x00000000, 0xffff0001, 0x00000000, 0xffff0001, 0x00000000, 0xffff0001, 0x00000000 // Constant + }, GPU_DP_TYPE_16 }; + gpu_dp_inst_t uniConvertInt32toUint8_2x8 = {{ + 0x33333333, // TCfg + 0x11110000, // ASelt + 0x03020100, 0x03020100, // ABin + 0x00000000, // BSelt + 0x00000000, 0x00000000, // BBin + 0x00002400, // AccumType, ConstantType, and PostShift + 0x00000000, 0x00000000, 0x00000000, 0x00000000, + 0x00000000, 0x00000000, 0x00000000, 0x00000000 // Constant + }, GPU_DP_TYPE_16 }; + gpu_dp_inst_t UniFP16toFP32Lo4_dp4x4 = {{ + 0x01010101, // TCfg + 0x00000000, // ASelt + 0x00010000, 0x00030002, // ABin + 0x02020202, // BSelt + 0x00000000, 0x00000000, // BBin + 0x00000100, // AccumType, ConstantType, and PostShift + 0x00003c00, 0x00000000, 0x00003c00, 0x00000000, 0x00003c00, 0x00000000, 0x00003c00, 0x00000000 // Constant + }, GPU_DP_TYPE_16 }; + + status = vsi_nn_kernel_gpu_add_param( node, + "uniConvert1stUint8SubZpToFp32_4x4", &uniConvert1stUint8SubZpToFp32_4x4 ); + status |= vsi_nn_kernel_gpu_add_param( node, + "uniConvertInt32toUint8_2x8", &uniConvertInt32toUint8_2x8 ); + status |= vsi_nn_kernel_gpu_add_param( node, + "UniFP16toFP32Lo4_dp4x4", &UniFP16toFP32Lo4_dp4x4 ); + status |= vsi_nn_kernel_gpu_add_param(node, "inputZP", &input_zp); + status |= vsi_nn_kernel_gpu_add_param(node, "input_scale", &scaleIn); + status |= vsi_nn_kernel_gpu_add_param(node, "output_ZP", &output_zp); + status |= vsi_nn_kernel_gpu_add_param(node, "outputVal1", &outputVal1); + CHECK_STATUS_FAIL_GOTO(status, final ); + } + + status = vsi_nn_kernel_gpu_config( node, &gpu_param ); + +final: + if (attr[0]) + { + vsi_nn_kernel_tensor_attr_release( &attr[0] ); + attr[0] = NULL; + } + if (attr[1]) + { + vsi_nn_kernel_tensor_attr_release( &attr[1] ); + attr[1] = NULL; + } + + return status; +} /* _sequence_mask_initializer() */ + +static vsi_status _query_kernel + ( + vsi_nn_tensor_t* const* const inputs, + vsi_nn_tensor_t* const* const outputs, + vsi_nn_kernel_t* kernel, + int32_t is2Dflg + ) +{ + vsi_nn_kernel_dtype_e input0_dtype = I32; + vsi_nn_kernel_dtype_e output_dtype = U8; + vsi_status status = VSI_FAILURE; + uint32_t key = 0; + int i = 0; + + input0_dtype = vsi_nn_kernel_map_dtype( inputs[0]->attr.dtype.vx_type ); + output_dtype = vsi_nn_kernel_map_dtype( outputs[0]->attr.dtype.vx_type ); + if (output_dtype == BOOL8) + { + output_dtype= U8; + } + + key = HASH_SEQUENCE_MASK_KEY( input0_dtype, output_dtype, is2Dflg); + + for ( i = 0; i < _cnt_of_array(kernel_map); i ++ ) + { + if ( kernel_map[i].key == key ) + { + break; + } + } + if ( i < _cnt_of_array(kernel_map) ) + { + snprintf( kernel->info.name, VX_MAX_KERNEL_NAME, "%s", kernel_map[i].function_name ); + kernel->info.parameters = kernel_param_def; + kernel->info.numParams = _cnt_of_array( kernel_param_def ); + kernel->info.initialize = _sequence_mask_initializer; + vsi_nn_kernel_add_source( kernel, VSI_NN_GPU_SOURCE_FMT_CODE, 2, + "vsi_nn_kernel_header", + kernel_map[i].source_name ); + vsi_nn_kernel_add_source( kernel, VSI_NN_GPU_SOURCE_FMT_EXECUTABLE, 1, + kernel_map[i].source_name ); + status = VSI_SUCCESS; + } + return status; +} /* _query_kernel() */ + +static int32_t _optimize_mask_shape + ( + vsi_nn_tensor_t ** inputs, + vsi_nn_tensor_t ** outputs, + int32_t 
max_len, + int32_t* opt_shape_in, + int32_t* opt_shape_out, + int32_t* is2Dflg + ) +{ + vsi_status status = VSI_SUCCESS; + int32_t in_shape[VSI_NN_MAX_DIM_NUM] = {0}; + int32_t new_rank = 0; + uint32_t i = 0; + + for(i = 0; i < inputs[0]->attr.dim_num; i++) + { + in_shape[i] = inputs[0]->attr.size[i]; + } + + vsi_nn_kernel_optimize_element_shape( in_shape, inputs[0]->attr.dim_num, opt_shape_in, &new_rank ); + if (new_rank > 2) + { + return VSI_FAILURE; + } + + opt_shape_out[0] = max_len; + for(i = 0; i < (uint32_t)new_rank; i++) + { + opt_shape_out[i + 1] = opt_shape_in[i]; + } + if (opt_shape_out[2] == 1) + { + is2Dflg[0] = 1; + } + + return status; +} + +static vsi_nn_kernel_node_t _setup + ( + vsi_nn_graph_t * graph, + vsi_nn_tensor_t ** inputs, + size_t input_num, + vsi_nn_tensor_t ** outputs, + size_t output_num, + const vsi_nn_kernel_param_t * params, + vsi_nn_kernel_t * kernel + ) +{ + vsi_status status = VSI_FAILURE; + vsi_nn_kernel_node_param_t tmp_params[_EVIS_PARAM_NUM] = {NULL}; + vsi_nn_kernel_node_t node = NULL; + vsi_nn_kernel_tensor_t rs_input = NULL, rs_output = NULL; + int32_t new_shape[2][VSI_NN_MAX_DIM_NUM] = {{ 1, 1, 1, 1 }, { 1, 1, 1, 1 }}; + int32_t max_len = vsi_nn_kernel_param_get_int32( params, "max_len" ); + int32_t is2Dflg = 0; + + if ( !vsi_nn_kernel_gpu_check_shape( (int32_t*)outputs[0]->attr.size, + outputs[0]->attr.dim_num ) ) + { + return NULL; + } + + status = _optimize_mask_shape(inputs, outputs, max_len, new_shape[0], new_shape[1], &is2Dflg); + if ( VSI_SUCCESS != status ) + { + goto final; + } + rs_input = vsi_nn_kernel_tensor_reshape(inputs[0]->t, new_shape[0], 2); + rs_output = vsi_nn_kernel_tensor_reshape(outputs[0]->t, new_shape[1], 4); + + status = _query_kernel( inputs, outputs, kernel, is2Dflg ); + if ( VSI_SUCCESS == status ) + { + node = vsi_nn_kernel_create_node( graph, kernel ); + if ( node ) + { + uint32_t index = 0; + /* Pass parameters to node. */ + tmp_params[index++] = rs_input; + tmp_params[index++] = rs_output; + tmp_params[index++] = vsi_nn_kernel_scalar_create( graph, I32, &max_len ); + status = vsi_nn_kernel_node_pass_param( node, tmp_params, _EVIS_PARAM_NUM ); + CHECK_STATUS(status); + vsi_nn_kernel_scalar_release( &tmp_params[2] ); + } + } + +final: + if (rs_input) + { + vsi_nn_kernel_tensor_release( &rs_input ); + } + if (rs_output) + { + vsi_nn_kernel_tensor_release( &rs_output ); + } + return node; +} /* _setup() */ + +__END_DECLS + +REGISTER_BACKEND_EVIS( sequence_mask, _setup ) + diff --git a/src/tim/vx/internal/src/kernel/evis/slice_evis.c b/src/tim/vx/internal/src/kernel/evis/slice_evis.c new file mode 100644 index 0000000..35b8b99 --- /dev/null +++ b/src/tim/vx/internal/src/kernel/evis/slice_evis.c @@ -0,0 +1,451 @@ +/**************************************************************************** +* +* Copyright (c) 2020 Vivante Corporation +* +* Permission is hereby granted, free of charge, to any person obtaining a +* copy of this software and associated documentation files (the "Software"), +* to deal in the Software without restriction, including without limitation +* the rights to use, copy, modify, merge, publish, distribute, sublicense, +* and/or sell copies of the Software, and to permit persons to whom the +* Software is furnished to do so, subject to the following conditions: +* +* The above copyright notice and this permission notice shall be included in +* all copies or substantial portions of the Software. 
+* +* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +* AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING +* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER +* DEALINGS IN THE SOFTWARE. +* +*****************************************************************************/ + + +#include +#include +#include +#include "vsi_nn_types.h" +#include "vsi_nn_tensor.h" +#include "vsi_nn_graph.h" +#include "vsi_nn_log.h" +#include "vsi_nn_error.h" +#include "vsi_nn_prv.h" +#include "vsi_nn_tensor_util.h" +#include "utils/vsi_nn_util.h" +#include "kernel/vsi_nn_kernel.h" +#include "utils/vsi_nn_dtype_util.h" +#include "kernel/vsi_nn_kernel_gpu_shape_optimize.h" + +__BEGIN_DECLS + + +#define _SLICE_KERNEL_SOURCE "slice" +#define _SLICE_KERNEL_NAME CVIVANTE_NAMESPACE("evis.slice") + + // Add kernel hashtable here +#define SLICE_SH_KERNEL_NAME(IN0_DTYPE, IN1_DTYPE, OUT_DTYPE) \ + CVIVANTE_NAMESPACE("evis.slice_"#IN0_DTYPE"_"#IN1_DTYPE"to"#OUT_DTYPE) + + // Add kernel hashtable here +#define SLICE_HASH_KEY( IN0_DTYPE, IN1_DTYPE, OUT_DTYPE , _IMAGE_2D, _SAMEFL) \ + (( IN1_DTYPE << 18 ) | ( IN0_DTYPE << 10 ) | ( OUT_DTYPE << 2 ) | (_IMAGE_2D << 1) | (_SAMEFL)) + +#define PACK_KERNEL_MAP( IN0_DTYPE, IN1_DTYPE, OUT_DTYPE, SOURCE ) \ +{ SLICE_HASH_KEY( IN0_DTYPE, IN1_DTYPE, OUT_DTYPE, 0, 0 ), \ + SLICE_SH_KERNEL_NAME( IN0_DTYPE, IN1_DTYPE, OUT_DTYPE ), SOURCE } + +#define SLICE_SH_KERNEL_2D_NAME(IN0_DTYPE, IN1_DTYPE, OUT_DTYPE) \ + CVIVANTE_NAMESPACE("evis.slice_"#IN0_DTYPE"_"#IN1_DTYPE"to"#OUT_DTYPE"_2D") + +#define PACK_KERNEL_MAP_2D( IN0_DTYPE, IN1_DTYPE, OUT_DTYPE, SOURCE ) \ +{ SLICE_HASH_KEY( IN0_DTYPE, IN1_DTYPE, OUT_DTYPE, 1, 0 ), \ + SLICE_SH_KERNEL_2D_NAME( IN0_DTYPE, IN1_DTYPE, OUT_DTYPE ), SOURCE } + +#define SLICE_SH_KERNEL_SAMEFL_NAME(IN0_DTYPE, IN1_DTYPE, OUT_DTYPE) \ + CVIVANTE_NAMESPACE("evis.slice_"#IN0_DTYPE"_"#IN1_DTYPE"to"#OUT_DTYPE"_SAMEFL") + +#define PACK_KERNEL_MAP_SAMEFL( IN0_DTYPE, IN1_DTYPE, OUT_DTYPE, SOURCE ) \ +{ SLICE_HASH_KEY( IN0_DTYPE, IN1_DTYPE, OUT_DTYPE, 0, 1 ), \ + SLICE_SH_KERNEL_SAMEFL_NAME( IN0_DTYPE, IN1_DTYPE, OUT_DTYPE ), SOURCE } + +#define SLICE_SH_KERNEL_SAMEFL_2D_NAME(IN0_DTYPE, IN1_DTYPE, OUT_DTYPE) \ + CVIVANTE_NAMESPACE("evis.slice_"#IN0_DTYPE"_"#IN1_DTYPE"to"#OUT_DTYPE"_SAMEFL_2D") + +#define PACK_KERNEL_MAP_SAMEFL_2D( IN0_DTYPE, IN1_DTYPE, OUT_DTYPE, SOURCE ) \ +{ SLICE_HASH_KEY( IN0_DTYPE, IN1_DTYPE, OUT_DTYPE, 1, 1 ), \ + SLICE_SH_KERNEL_SAMEFL_2D_NAME( IN0_DTYPE, IN1_DTYPE, OUT_DTYPE ), SOURCE } + + typedef struct +{ + uint32_t key; + char * function_name; + const char * source_name; +} _kernel_map_type; + +static const _kernel_map_type _slice_kernel_map[] = +{ + // Register kernel here + PACK_KERNEL_MAP( F16, I32, F16, _SLICE_KERNEL_SOURCE ), + PACK_KERNEL_MAP( I16, I32, I16, _SLICE_KERNEL_SOURCE ), + PACK_KERNEL_MAP( U8, I32, U8, _SLICE_KERNEL_SOURCE ), + PACK_KERNEL_MAP( I8, I32, I8, _SLICE_KERNEL_SOURCE ), + + PACK_KERNEL_MAP_2D( F16, I32, F16, _SLICE_KERNEL_SOURCE ), + PACK_KERNEL_MAP_2D( I16, I32, I16, _SLICE_KERNEL_SOURCE ), + PACK_KERNEL_MAP_2D( U8, I32, U8, _SLICE_KERNEL_SOURCE ), + PACK_KERNEL_MAP_2D( I8, I32, I8, _SLICE_KERNEL_SOURCE ), + + PACK_KERNEL_MAP_SAMEFL( I16, I32, I16, _SLICE_KERNEL_SOURCE ), + PACK_KERNEL_MAP_SAMEFL( U8, I32, U8, 
_SLICE_KERNEL_SOURCE ), + + PACK_KERNEL_MAP_SAMEFL_2D( I16, I32, I16, _SLICE_KERNEL_SOURCE ), + PACK_KERNEL_MAP_SAMEFL_2D( U8, I32, U8, _SLICE_KERNEL_SOURCE ), +}; + +#define _INPUT_NUM (2) +#define _OUTPUT_NUM (1) +#define _IO_NUM (_INPUT_NUM + _OUTPUT_NUM) +/* +* Kernel params +*/ +static vx_param_description_t _slice_kernel_param_def[] = +{ + {VX_INPUT, VX_TYPE_TENSOR, VX_PARAMETER_STATE_REQUIRED}, + {VX_INPUT, VX_TYPE_TENSOR, VX_PARAMETER_STATE_REQUIRED}, + {VX_OUTPUT, VX_TYPE_TENSOR, VX_PARAMETER_STATE_REQUIRED}, + {VX_INPUT, VX_TYPE_SCALAR, VX_PARAMETER_STATE_REQUIRED}, +}; +#define _SLICE_PARAM_NUM _cnt_of_array( _slice_kernel_param_def ) +#define SCALAR_SAMLEFL_VALUE (3) +/* +* Kernel initializer +*/ +DEF_KERNEL_INITIALIZER(_slice_initializer) + ( + vsi_nn_kernel_node_t node, + const vsi_nn_kernel_node_param_t * param, + size_t param_size + ) +{ +#define _PACK_SLICE_KEY( IN0_TYPE, OUT_TYPE, SAMLEFL) \ + (IN0_TYPE | (OUT_TYPE << 8) | (SAMLEFL << 16)) + vsi_status status = VSI_FAILURE; + gpu_param_t gpu_param = { + 3, + {0, 0, 0}, + {0, 0, 0}, + {0, 0, 0}, + {0, 0, 0} + }; + vsi_nn_kernel_tensor_attr_t * output_attr = NULL; + vsi_nn_kernel_tensor_attr_t * input_attr = NULL; + vsi_int_array_t * out_shape = NULL; + vsi_nn_kernel_dtype_e input_dtype = F16; + vsi_nn_kernel_dtype_e output_dtype = F16; + float scaleIn = 1.0f; + float scaleOut = 1.0f; + int32_t output_ZP = 0; + int32_t input_ZP = 0; + int32_t srcFixPointPos = 0; + int32_t dstFixPointPos = 0; + int32_t is_samefl = 0; + uint32_t pack_key = 0; + + input_attr = vsi_nn_kernel_tensor_attr_create( (vsi_nn_kernel_tensor_t)param[0] ); + CHECK_PTR_FAIL_GOTO( input_attr, "Create tensor attr buffer fail.", final ); + output_attr = vsi_nn_kernel_tensor_attr_create( (vsi_nn_kernel_tensor_t)param[2] ); + CHECK_PTR_FAIL_GOTO( output_attr, "Create tensor attr buffer fail.", final ); + + status = vsi_nn_kernel_scalar_read_int32((vsi_nn_kernel_scalar_t)param[SCALAR_SAMLEFL_VALUE], &is_samefl); + CHECK_STATUS_FAIL_GOTO(status, final ); + + out_shape = output_attr->shape; + input_dtype = input_attr->dtype; + output_dtype = output_attr->dtype; + + pack_key = _PACK_SLICE_KEY( input_dtype, output_dtype, is_samefl); + + if (VSI_NN_KERNEL_QUANT_DFP == input_attr->quant) + { + srcFixPointPos = input_attr->dfp.fl; + if (srcFixPointPos > 0) + { + scaleIn = (1.0f / ((float) ((int64_t)1 << srcFixPointPos))); + } + else + { + scaleIn = ((float) ((int64_t)1 << -srcFixPointPos)); + } + } + else if (VSI_NN_KERNEL_QUANT_ASYMM == input_attr->quant) + { + input_ZP = input_attr->asymm.zero_point; + scaleIn = input_attr->asymm.scale; + } + + if (VSI_NN_KERNEL_QUANT_DFP == output_attr->quant) + { + dstFixPointPos = output_attr->dfp.fl; + if (dstFixPointPos > 0) + { + scaleOut = (1.0f / ((float) ((int64_t)1 << dstFixPointPos))); + } + else + { + scaleOut = ((float) ((int64_t)1 << -dstFixPointPos)); + } + } + else if (VSI_NN_KERNEL_QUANT_ASYMM == output_attr->quant) + { + output_ZP = output_attr->asymm.zero_point; + scaleOut = output_attr->asymm.scale; + } + + if ((F16 == input_dtype) + || (I16 == input_dtype) + || (BF16 == input_dtype) + ) + { + gpu_param.global_scale[0] = 8; + gpu_param.global_scale[1] = 1; + gpu_param.global_scale[2] = 1; + } + else + { + gpu_param.global_scale[0] = 16; + gpu_param.global_scale[1] = 1; + gpu_param.global_scale[2] = 1; + } + + gpu_param.dim = out_shape->size < 3 ? 
2 : 3; + gpu_param.global_size[0] = gpu_align_p2( + (out_shape->data[0] + gpu_param.global_scale[0] - 1) + / gpu_param.global_scale[0], 4); + gpu_param.global_size[1] = ( + (out_shape->data[1] + gpu_param.global_scale[1] - 1) + / gpu_param.global_scale[1]); + gpu_param.global_size[2] = out_shape->size > 2 ? out_shape->data[2] : 1; + + switch (pack_key) + { + case _PACK_SLICE_KEY(I16, I16, 0): + case _PACK_SLICE_KEY(U8, U8, 0): + case _PACK_SLICE_KEY(I8, I8, 0): + case _PACK_SLICE_KEY(I16, F16, 0): + case _PACK_SLICE_KEY(U8, F16, 0): + case _PACK_SLICE_KEY(I8, F16, 0): + case _PACK_SLICE_KEY(F16, I16, 0): + case _PACK_SLICE_KEY(F16, U8, 0): + case _PACK_SLICE_KEY(F16, I8, 0): + { + float uint8Scale = scaleIn / scaleOut; + uint16_t M0 = 0; + int32_t postShift = 0; + uint32_t multAndoutZP[2] = {0}; + gpu_dp_inst_t uniU8MulAndPostShift_Lo_2x8 = {{ + 0xdddddddd, // TCfg + 0x44444444, // ASelt + 0x13121110, 0x17161514, // ABin + 0x11111111, // BSelt + 0x00000000, 0x00000000, // BBin + 0x00002600, // AccumType, ConstantType, and PostShift + 0x00000000, 0x00000000, 0x00000000, 0x00000000, + 0x00000000, 0x00000000, 0x00000000, 0x00000000 // Constant + }, GPU_DP_TYPE_16}; + gpu_dp_inst_t uniU8MulAndPostShift_Hi_2x8 = {{ + 0xdddddddd, // TCfg + 0x44444444, // ASelt + 0x1b1a1918, 0x1f1e1d1c, // ABin + 0x11111111, // BSelt + 0x00000000, 0x00000000, // BBin + 0x00002600, // AccumType, ConstantType, and PostShift + 0x00000000, 0x00000000, 0x00000000, 0x00000000, + 0x00000000, 0x00000000, 0x00000000, 0x00000000 // Constant + }, GPU_DP_TYPE_16}; + + gpu_quantize_multiplier_16bit(uint8Scale, &M0, &postShift); + multAndoutZP[0] = (uint32_t)(M0); + multAndoutZP[1] = (uint32_t)((output_ZP << postShift) - input_ZP * M0); + + uniU8MulAndPostShift_Lo_2x8.data[7] |= (postShift & 0x1F); + uniU8MulAndPostShift_Hi_2x8.data[7] |= (postShift & 0x1F); + + status = vsi_nn_kernel_gpu_add_param( node, "uniU8MulAndPostShift_Lo_2x8", &uniU8MulAndPostShift_Lo_2x8); + status |= vsi_nn_kernel_gpu_add_param( node, "uniU8MulAndPostShift_Hi_2x8", &uniU8MulAndPostShift_Hi_2x8); + status |= vsi_nn_kernel_gpu_add_param( node, "multAndoutZP", multAndoutZP); + } + break; + default: + break; + } + CHECK_STATUS_FAIL_GOTO(status, final ); + +#undef _PACK_SLICE_KEY + status = vsi_nn_kernel_gpu_config( node, &gpu_param ); + +final: +#define SAFE_FREE_TENSOR_ATTR(_PTR) if( _PTR ) { vsi_nn_kernel_tensor_attr_release( &_PTR ); _PTR = NULL; } + SAFE_FREE_TENSOR_ATTR(output_attr); + SAFE_FREE_TENSOR_ATTR(input_attr); + return status; +} /* _slice_initializer() */ + +static vsi_bool _is_same_quant + ( + vsi_nn_tensor_t ** inputs, + vsi_nn_tensor_t ** outputs + ) +{ + vsi_nn_dtype_t *src_dtype = NULL,*dst_dtype = NULL; + + src_dtype = &inputs[0]->attr.dtype; + dst_dtype = &outputs[0]->attr.dtype; + + if (vsi_nn_DtypeCompare(src_dtype, dst_dtype) == FALSE) + { + return FALSE; + } + + return TRUE; +} /* _is_same_quant */ + +/* +* Query kernel +*/ +static vsi_status _query_kernel + ( + vsi_nn_kernel_t * kernel, + vsi_nn_tensor_t * const* const inputs, + vsi_nn_tensor_t * const* const outputs, + vsi_bool image_2d, + vsi_bool is_same_quant + ) +{ + vsi_status status = VSI_FAILURE; + vsi_nn_kernel_dtype_e in0_dtype; + vsi_nn_kernel_dtype_e in1_dtype; + vsi_nn_kernel_dtype_e out_dtype; + const _kernel_map_type * kernel_map = _slice_kernel_map; + size_t kernel_map_size = _cnt_of_array( _slice_kernel_map ); + vx_param_description_t * param_def = _slice_kernel_param_def; + size_t param_def_size = _cnt_of_array( _slice_kernel_param_def ); + 
vx_kernel_initialize_f initializer = _slice_initializer; + + uint32_t key; + uint32_t i; + + in0_dtype = vsi_nn_kernel_map_dtype( inputs[0]->attr.dtype.vx_type ); + in1_dtype = vsi_nn_kernel_map_dtype( inputs[1]->attr.dtype.vx_type ); + out_dtype = vsi_nn_kernel_map_dtype( outputs[0]->attr.dtype.vx_type ); + + if (is_same_quant && (F16 == in0_dtype || BF16 == in0_dtype) ) + { + in0_dtype = I16; + out_dtype = I16; + } + else if (is_same_quant && (I8 == in0_dtype || BOOL8 == in0_dtype) ) + { + in0_dtype = U8; + out_dtype = U8; + } + + key = SLICE_HASH_KEY( in0_dtype, in1_dtype, out_dtype, image_2d, is_same_quant ); + + for ( i = 0; i < (uint32_t)kernel_map_size; i ++ ) + { + if ( kernel_map[i].key == key ) + { + break; + } + } + if ( i < kernel_map_size ) + { + snprintf( kernel->info.name, VX_MAX_KERNEL_NAME, "%s", kernel_map[i].function_name ); + kernel->info.parameters = param_def; + kernel->info.numParams = (uint32_t)param_def_size; + kernel->info.initialize = initializer; + // Register code source + vsi_nn_kernel_add_source( kernel, VSI_NN_GPU_SOURCE_FMT_CODE, 2, + "vsi_nn_kernel_header", + kernel_map[i].source_name ); + // Register binary source + vsi_nn_kernel_add_source( kernel, VSI_NN_GPU_SOURCE_FMT_EXECUTABLE, 1, + kernel_map[i].source_name ); + status = VSI_SUCCESS; + } + + return status; +} /* _query_kernel() */ + + +static vsi_nn_kernel_node_t _setup + ( + vsi_nn_graph_t * graph, + vsi_nn_tensor_t ** inputs, + size_t input_num, + vsi_nn_tensor_t ** outputs, + size_t output_num, + const vsi_nn_kernel_param_t * params, + vsi_nn_kernel_t * kernel + ) +{ + vsi_status status = VSI_FAILURE; + vsi_nn_kernel_node_param_t node_params[_SLICE_PARAM_NUM]; + vsi_nn_kernel_node_t node = NULL; + vsi_bool image_2d = FALSE; + uint32_t rank[_IO_NUM] = {0}; + int32_t shapes[_IO_NUM][VSI_NN_MAX_DIM_NUM] = {{ 1 }}; + vsi_nn_tensor_t* reshape_tensors[_IO_NUM] = { NULL }; + int32_t i = 0; + int32_t input_batch = inputs[0]->attr.dim_num > 3 ? inputs[0]->attr.size[3] : 1; + int32_t output_batch = outputs[0]->attr.dim_num > 3 ? outputs[0]->attr.size[3] : 1; + vsi_bool is_same_quant = FALSE; + + vsi_nn_kernel_optimize_1d_tensor_shape( (const int32_t*)inputs[0]->attr.size, inputs[0]->attr.dim_num, + shapes[0], &rank[0]); + vsi_nn_kernel_optimize_1d_tensor_shape( (const int32_t*)inputs[1]->attr.size, inputs[1]->attr.dim_num, + shapes[1], &rank[1]); + vsi_nn_kernel_optimize_1d_tensor_shape( (const int32_t*)outputs[0]->attr.size, outputs[0]->attr.dim_num, + shapes[2], &rank[2]); + + for (i = 0; i < _INPUT_NUM; i++) + { + reshape_tensors[i] = vsi_nn_reshape_tensor( graph, + inputs[i], (uint32_t*)shapes[i], rank[i] ); + } + reshape_tensors[_INPUT_NUM] = vsi_nn_reshape_tensor( graph, + outputs[0], (uint32_t*)shapes[_INPUT_NUM], rank[_INPUT_NUM] ); + + if ( !vsi_nn_kernel_gpu_check_shape( (int32_t*)reshape_tensors[0]->attr.size, + reshape_tensors[0]->attr.dim_num ) || input_batch != output_batch ) + { + return NULL; + } + + image_2d = (rank[0] < 3 || shapes[0][2] == 1); + is_same_quant = _is_same_quant(inputs, outputs); + + status = _query_kernel( kernel, inputs, outputs , image_2d, is_same_quant ); + if( VSI_SUCCESS == status) + { + node = vsi_nn_kernel_create_node( graph, kernel ); + if( node ) + { + /* Set inputs and outputs */ + vsi_nn_kernel_node_pack_io( node_params, _SLICE_PARAM_NUM, + reshape_tensors, input_num, &reshape_tensors[_INPUT_NUM], output_num ); + node_params[SCALAR_SAMLEFL_VALUE] = vsi_nn_kernel_scalar_create( graph, I32, &is_same_quant ); + /* Pass parameters to node. 
*/ + status = vsi_nn_kernel_node_pass_param( node, node_params, _SLICE_PARAM_NUM ); + vsi_nn_kernel_scalar_release( &node_params[SCALAR_SAMLEFL_VALUE] ); + } + } + + return node; +} /* _setup() */ + +__END_DECLS + +REGISTER_BACKEND_EVIS( slice, _setup ) diff --git a/src/tim/vx/internal/src/kernel/evis/tile_evis.c b/src/tim/vx/internal/src/kernel/evis/tile_evis.c index a076329..5f3465b 100644 --- a/src/tim/vx/internal/src/kernel/evis/tile_evis.c +++ b/src/tim/vx/internal/src/kernel/evis/tile_evis.c @@ -41,51 +41,55 @@ __BEGIN_DECLS /* * Define kernel meta. */ -#define HASH_TILE_KEY(_input_type, _output_type, _image_2d, _remainder) \ - ((_input_type << 18) | (_output_type << 4) | (_image_2d << 3) | (_remainder)) +#define HASH_TILE_KEY(_input_type, _output_type, _image_2d, _is_size1, _remainder) \ + ((_input_type << 19) | (_output_type << 5) | (_image_2d << 4) | (_is_size1 << 3) | (_remainder)) #define KERNEL_SOURCE "tile", #define KERNEL_SOURCE1 "tile_mix", #define STR(a) #a + #define TENSOR_TILE_KEY_DIM0_IS1_2D(SRC_TYPE, OUT_TYPE) \ + { HASH_TILE_KEY(SRC_TYPE, OUT_TYPE, 1, 1, 1), \ + CVIVANTE_NAMESPACE("evis.tile_1toN_"#SRC_TYPE"to"#OUT_TYPE"_2D"), \ + KERNEL_SOURCE }, + #define HASH_TILE_SH_KERNEL_NAME(SRC_TYPE, DST_TYPE, REMAINDER) \ CVIVANTE_NAMESPACE("evis.tile_remain"STR(REMAINDER)"_"#SRC_TYPE"to"#DST_TYPE) -#define TENSOR_TILE_KERNELS(SRC_TYPE, OUT_TYPE, REMAINDER) \ - { HASH_TILE_KEY(SRC_TYPE, OUT_TYPE, 0, REMAINDER), \ +#define TENSOR_TILE_KERNELS(SRC_TYPE, OUT_TYPE, ISSIZE1, REMAINDER) \ + { HASH_TILE_KEY(SRC_TYPE, OUT_TYPE, 0, ISSIZE1, REMAINDER), \ HASH_TILE_SH_KERNEL_NAME(SRC_TYPE, OUT_TYPE, REMAINDER), \ KERNEL_SOURCE1 }, #define HASH_TILE_SH_KERNEL_2D_NAME(SRC_TYPE, DST_TYPE, REMAINDER) \ CVIVANTE_NAMESPACE("evis.tile_remain"STR(REMAINDER)"_"#SRC_TYPE"to"#DST_TYPE"_2D") -#define TENSOR_TILE_KERNELS_2D(SRC_TYPE, OUT_TYPE, REMAINDER) \ - { HASH_TILE_KEY(SRC_TYPE, OUT_TYPE, 1, REMAINDER), \ +#define TENSOR_TILE_KERNELS_2D(SRC_TYPE, OUT_TYPE, ISSIZE1, REMAINDER) \ + { HASH_TILE_KEY(SRC_TYPE, OUT_TYPE, 1, ISSIZE1, REMAINDER), \ HASH_TILE_SH_KERNEL_2D_NAME(SRC_TYPE, OUT_TYPE, REMAINDER), \ KERNEL_SOURCE1 }, #define TENSOR_TILE_8BITS_KERNELS(SRC_TYPE, OUT_TYPE, REMAINDER) \ - { HASH_TILE_KEY(SRC_TYPE, OUT_TYPE, 0, REMAINDER), \ + { HASH_TILE_KEY(SRC_TYPE, OUT_TYPE, 0, 0, REMAINDER), \ HASH_TILE_SH_KERNEL_NAME(U8, U8, REMAINDER), \ KERNEL_SOURCE }, #define TENSOR_TILE_16BITS_KERNELS(SRC_TYPE, OUT_TYPE, REMAINDER) \ - { HASH_TILE_KEY(SRC_TYPE, OUT_TYPE, 0, REMAINDER), \ + { HASH_TILE_KEY(SRC_TYPE, OUT_TYPE, 0, 0, REMAINDER), \ HASH_TILE_SH_KERNEL_NAME(I16, I16, REMAINDER), \ KERNEL_SOURCE }, #define TENSOR_TILE_8BITS_2D_KERNELS(SRC_TYPE, OUT_TYPE, REMAINDER) \ - { HASH_TILE_KEY(SRC_TYPE, OUT_TYPE, 1, REMAINDER), \ + { HASH_TILE_KEY(SRC_TYPE, OUT_TYPE, 1, 0, REMAINDER), \ HASH_TILE_SH_KERNEL_2D_NAME(U8, U8, REMAINDER), \ KERNEL_SOURCE }, #define TENSOR_TILE_16BITS_2D_KERNELS(SRC_TYPE, OUT_TYPE, REMAINDER) \ - { HASH_TILE_KEY(SRC_TYPE, OUT_TYPE, 1, REMAINDER), \ + { HASH_TILE_KEY(SRC_TYPE, OUT_TYPE, 1, 0, REMAINDER), \ HASH_TILE_SH_KERNEL_2D_NAME(I16, I16, REMAINDER), \ KERNEL_SOURCE }, - static const struct { uint32_t key; char* function_name; @@ -176,23 +180,44 @@ static const struct { TENSOR_TILE_16BITS_2D_KERNELS( BF16, BF16, 6) TENSOR_TILE_16BITS_2D_KERNELS( BF16, BF16, 7) - TENSOR_TILE_KERNELS( U8, F16, 0) - TENSOR_TILE_KERNELS( U8, F16, 1) - TENSOR_TILE_KERNELS( U8, F16, 2) - TENSOR_TILE_KERNELS( U8, F16, 3) - TENSOR_TILE_KERNELS( U8, F16, 4) - TENSOR_TILE_KERNELS( U8, F16, 5) - 
TENSOR_TILE_KERNELS( U8, F16, 6) - TENSOR_TILE_KERNELS( U8, F16, 7) + TENSOR_TILE_KERNELS( U8, F16, 0, 0) + TENSOR_TILE_KERNELS( U8, F16, 0, 1) + TENSOR_TILE_KERNELS( U8, F16, 0, 2) + TENSOR_TILE_KERNELS( U8, F16, 0, 3) + TENSOR_TILE_KERNELS( U8, F16, 0, 4) + TENSOR_TILE_KERNELS( U8, F16, 0, 5) + TENSOR_TILE_KERNELS( U8, F16, 0, 6) + TENSOR_TILE_KERNELS( U8, F16, 0, 7) - TENSOR_TILE_KERNELS_2D( U8, F16, 0) - TENSOR_TILE_KERNELS_2D( U8, F16, 1) - TENSOR_TILE_KERNELS_2D( U8, F16, 2) - TENSOR_TILE_KERNELS_2D( U8, F16, 3) - TENSOR_TILE_KERNELS_2D( U8, F16, 4) - TENSOR_TILE_KERNELS_2D( U8, F16, 5) - TENSOR_TILE_KERNELS_2D( U8, F16, 6) - TENSOR_TILE_KERNELS_2D( U8, F16, 7) + TENSOR_TILE_KERNELS( U8, F16, 1, 0) + TENSOR_TILE_KERNELS( U8, F16, 1, 1) + TENSOR_TILE_KERNELS( U8, F16, 1, 2) + TENSOR_TILE_KERNELS( U8, F16, 1, 3) + TENSOR_TILE_KERNELS( U8, F16, 1, 4) + TENSOR_TILE_KERNELS( U8, F16, 1, 5) + TENSOR_TILE_KERNELS( U8, F16, 1, 6) + TENSOR_TILE_KERNELS( U8, F16, 1, 7) + + TENSOR_TILE_KERNELS_2D( U8, F16, 0, 0) + TENSOR_TILE_KERNELS_2D( U8, F16, 0, 1) + TENSOR_TILE_KERNELS_2D( U8, F16, 0, 2) + TENSOR_TILE_KERNELS_2D( U8, F16, 0, 3) + TENSOR_TILE_KERNELS_2D( U8, F16, 0, 4) + TENSOR_TILE_KERNELS_2D( U8, F16, 0, 5) + TENSOR_TILE_KERNELS_2D( U8, F16, 0, 6) + TENSOR_TILE_KERNELS_2D( U8, F16, 0, 7) + + TENSOR_TILE_KERNELS_2D( U8, F16, 1, 0) + TENSOR_TILE_KERNELS_2D( U8, F16, 1, 1) + TENSOR_TILE_KERNELS_2D( U8, F16, 1, 2) + TENSOR_TILE_KERNELS_2D( U8, F16, 1, 3) + TENSOR_TILE_KERNELS_2D( U8, F16, 1, 4) + TENSOR_TILE_KERNELS_2D( U8, F16, 1, 5) + TENSOR_TILE_KERNELS_2D( U8, F16, 1, 6) + TENSOR_TILE_KERNELS_2D( U8, F16, 1, 7) + + TENSOR_TILE_KEY_DIM0_IS1_2D(U8, U8) + TENSOR_TILE_KEY_DIM0_IS1_2D(I16, I16) }; /* @@ -383,11 +408,24 @@ static vsi_status _query_kernel vsi_nn_kernel_dtype_e output_dtype; vsi_status status = VSI_FAILURE; uint32_t key; - int i; + int32_t i = 0; + int32_t dim0_size1 = inputs[0]->attr.size[0] == 1 ? 1 : 0; input_dtype = vsi_nn_kernel_map_dtype( inputs[0]->attr.dtype.vx_type ); output_dtype = vsi_nn_kernel_map_dtype( outputs[0]->attr.dtype.vx_type ); - key = HASH_TILE_KEY( input_dtype, output_dtype, image_2d, remainder); + + if (input_dtype == output_dtype && image_2d == TRUE && dim0_size1) + { + input_dtype = input_dtype == I8 ? U8 : input_dtype; + input_dtype = input_dtype == F16 ? I16 : input_dtype; + input_dtype = input_dtype == BF16 ? I16 : input_dtype; + output_dtype = input_dtype; + key = HASH_TILE_KEY(input_dtype, output_dtype, 1, 1, 1); + } + else + { + key = HASH_TILE_KEY( input_dtype, output_dtype, image_2d, dim0_size1, remainder); + } for( i = 0; i < _cnt_of_array(_tile_evis_kernel_map); i ++ ) { diff --git a/src/tim/vx/internal/src/kernel/vsi_nn_kernel_param.c b/src/tim/vx/internal/src/kernel/vsi_nn_kernel_param.c index 12adee6..20b4589 100644 --- a/src/tim/vx/internal/src/kernel/vsi_nn_kernel_param.c +++ b/src/tim/vx/internal/src/kernel/vsi_nn_kernel_param.c @@ -42,6 +42,7 @@ typedef enum _PARAM_I64, _PARAM_F32, _PARAM_BUFFER, + _PARAM_CONST_BUFFER, _PARAM_STR, } _param_dtype_e; @@ -54,6 +55,7 @@ typedef struct int64_t int64; float float32; void* buffer; + const void* const_buffer; const char* str; } value; size_t size; @@ -164,6 +166,45 @@ void* vsi_nn_kernel_param_get_buffer return p->value.buffer; } /* vsi_nn_kernel_param_get_buffer() */ +vsi_bool vsi_nn_kernel_param_add_const_buffer + ( + vsi_nn_kernel_param_t * params, + const char * key, + const void * value, + size_t size + ) +{ + _param_type* p; + CHECK_PARAM_NULL( params, FALSE, "Params is null ptr." 
); + CHECK_PARAM_NULL( key, FALSE, "Param key is null ptr." ); + p = malloc( sizeof(_param_type) ); + CHECK_PARAM_NULL( p, FALSE, "Out of memory, add param fail." ); + p->type = _PARAM_CONST_BUFFER; + p->value.const_buffer = value; + p->size = size; + vsi_nn_hashmap_add( params, key, p ); + return TRUE; +} /* vsi_nn_kernel_param_add_const_buffer() */ + +const void* vsi_nn_kernel_param_get_const_buffer + ( const vsi_nn_kernel_param_t * params, const char * key, size_t * size) +{ + _param_type* p; + CHECK_PARAM_NULL( params, FALSE, "Params is null ptr." ); + CHECK_PARAM_NULL( key, FALSE, "Param key is null ptr." ); + p = vsi_nn_hashmap_get( params, key ); + CHECK_PARAM_NULL( p, 0, "Key %s not in params.", key ); + if( p->type != _PARAM_CONST_BUFFER ) + { + VSILOGW("Key %s is not \"const buffer\"", key ); + } + if( size != NULL ) + { + *size = p->size; + } + return p->value.const_buffer; +} /* vsi_nn_kernel_param_get_const_buffer() */ + vsi_nn_kernel_param_t* vsi_nn_kernel_param_create() { return (vsi_nn_kernel_param_t*)vsi_nn_hashmap_create(); diff --git a/src/tim/vx/internal/src/kernel/vsi_nn_kernel_selector.c b/src/tim/vx/internal/src/kernel/vsi_nn_kernel_selector.c index 3b446b8..fd4d2e7 100644 --- a/src/tim/vx/internal/src/kernel/vsi_nn_kernel_selector.c +++ b/src/tim/vx/internal/src/kernel/vsi_nn_kernel_selector.c @@ -57,12 +57,27 @@ KERNEL_SELECTOR( depthwise_conv1d ) vsi_nn_kernel_selector_t * selector ) { + int32_t dilation = vsi_nn_kernel_param_get_int32( params, "dilation" ); + int32_t kernel = inputs[1]->attr.size[0]; + int32_t real_kernel = 0; + int32_t stride = vsi_nn_kernel_param_get_int32( params, "stride" ); vsi_nn_kernel_pirority_t pirority[] = { { VSI_NN_KERNEL_TYPE_VX, 0 }, { VSI_NN_KERNEL_TYPE_EVIS, 3 }, { VSI_NN_KERNEL_TYPE_CL, 2 }, { VSI_NN_KERNEL_TYPE_CPU, 1 }, }; + dilation = dilation == 0 ? 0 : dilation - 1; + real_kernel = (kernel - 1) * dilation + kernel; + + if (real_kernel < 16 && stride < 3) + { + pirority[0].fps = 3; + pirority[1].fps = 2; + pirority[2].fps = 1; + pirority[3].fps = 0; + } + return vsi_nn_kernel_pirority_set( selector, pirority, _cnt_of_array(pirority) ); } /* depthwise_conv1d */ @@ -111,5 +126,6 @@ REGISTER_VX_FIRST_KERNEL_SELECTOR(mish) REGISTER_VX_FIRST_KERNEL_SELECTOR(hard_sigmoid) REGISTER_VX_FIRST_KERNEL_SELECTOR(clip) REGISTER_VX_FIRST_KERNEL_SELECTOR(relu_keras) +REGISTER_VX_FIRST_KERNEL_SELECTOR(erf) __END_DECLS diff --git a/src/tim/vx/internal/src/kernel/vx/erf_vx.c b/src/tim/vx/internal/src/kernel/vx/erf_vx.c new file mode 100644 index 0000000..8daf0be --- /dev/null +++ b/src/tim/vx/internal/src/kernel/vx/erf_vx.c @@ -0,0 +1,216 @@ +/**************************************************************************** +* +* Copyright (c) 2020 Vivante Corporation +* +* Permission is hereby granted, free of charge, to any person obtaining a +* copy of this software and associated documentation files (the "Software"), +* to deal in the Software without restriction, including without limitation +* the rights to use, copy, modify, merge, publish, distribute, sublicense, +* and/or sell copies of the Software, and to permit persons to whom the +* Software is furnished to do so, subject to the following conditions: +* +* The above copyright notice and this permission notice shall be included in +* all copies or substantial portions of the Software. 
+* +* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +* AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING +* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER +* DEALINGS IN THE SOFTWARE. +* +*****************************************************************************/ + +#include "vsi_nn_types.h" +#include "vsi_nn_tensor.h" +#include "vsi_nn_node.h" +#include "vsi_nn_log.h" +#include "vsi_nn_prv.h" +#include +#include "utils/vsi_nn_dtype_util_prv.h" +#include "vsi_nn_tensor_util.h" +#include "kernel/vsi_nn_kernel.h" + +typedef struct _sort_lut_s +{ + float index; + float val; +} sort_lut; + +static float erf_eval(float x) +{ + float res = 0; + float tmp = x; + float factorial = 1; /*n!*/ + float x_pow = x; + int32_t one = 1; + int32_t n = 1; + + while (vsi_abs(tmp) > 1e-5) + { + res += tmp; + + factorial *= n; + one *= -1; + x_pow *= x * x; + tmp = one / factorial * x_pow / ( 2 * n + 1); + + n ++; + } +#define MUL2_RSQRTPI (1.1283791670955126f) + + res *= MUL2_RSQRTPI; + + return res; +} + +#ifdef VX_USER_LOOKUP_TABLE_SUPPORT +static int32_t _lut_comparator(const void *pa, const void *pb) +{ + sort_lut a = *(sort_lut *)pa; + sort_lut b = *(sort_lut *)pb; + float diff = a.index - b.index; + if ( diff > 0 ) + { + return 1; + } + else if ( diff < 0 ) + { + return -1; + } + + return 0; +} + +static void _set_table_lookup(float func(float), float *index, float *value) +{ +#define VSI_NN_MAX_LUT_SIZE (1024) +#define FLT16_MAX (57344) +#define FLT16_MIN (-57344) + uint32_t i = 0; + sort_lut *lut = (sort_lut *)calloc(VSI_NN_MAX_LUT_SIZE, sizeof(sort_lut)); + + for ( i = 0; i < VSI_NN_MAX_LUT_SIZE; i++) + { + int16_t val = (int16_t)(i << 6); + lut[i].index = fp16_to_fp32(val); + lut[i].val = func(lut[i].index); + } + + for (i = 0x0; i < 0x10; i++) + { + lut[i].index = 0; + lut[i].val = func(lut[i].index); + } + + for (i = 0x1F0; i < 0x200; i++) + { + lut[i].index = FLT16_MAX; + lut[i].val = func(lut[i].index); + } + + for (i = 0x3F0; i < 0x400; i++) + { + lut[i].index = FLT16_MIN; + lut[i].val = func(lut[i].index); + } + + qsort(lut, VSI_NN_MAX_LUT_SIZE, sizeof(sort_lut), _lut_comparator); + + for ( i = 0; i < VSI_NN_MAX_LUT_SIZE; i++) + { + index[i] = lut[i].index; + value[i] = lut[i].val; + } + + vsi_nn_safe_free(lut); + +#undef VSI_NN_MAX_LUT_SIZE +#undef FLT16_MIN +#undef FLT16_MAX +} +#endif + +static vsi_nn_kernel_node_t _setup + ( + vsi_nn_graph_t * graph, + vsi_nn_tensor_t ** inputs, + size_t input_num, + vsi_nn_tensor_t ** outputs, + size_t output_num, + const vsi_nn_kernel_param_t * params, + vsi_nn_kernel_t * kernel, + float func(float) + ) +{ +#ifdef VX_USER_LOOKUP_TABLE_SUPPORT + vx_lut lut1 = NULL; + vx_lut lut2 = NULL; + vx_node node = NULL; + float index[1024] = {0}; + float value[1024] = {0}; + + if ( inputs[0]->attr.dtype.vx_type == VSI_NN_TYPE_INT32 || + outputs[0]->attr.dtype.vx_type == VSI_NN_TYPE_INT32 ) + { + return NULL; + } + + _set_table_lookup(func, index, value); + + lut1 = vxCreateLUT( graph->ctx->c, VX_TYPE_FLOAT32, 1024); + lut2 = vxCreateLUT( graph->ctx->c, VX_TYPE_FLOAT32, 1024); + if( NULL == lut1 || NULL == lut2 ) + { + VSILOGE("create lut object fail."); + goto OnError; + } + + vxCopyLUT(lut1, (void*)&index, VX_WRITE_ONLY, VX_MEMORY_TYPE_HOST); + vxCopyLUT(lut2, 
(void*)&value, VX_WRITE_ONLY, VX_MEMORY_TYPE_HOST); + + node = vxTensorTableLookupLayer( graph->g, inputs[0]->t, lut1, lut2, outputs[0]->t); + if( NULL == node ) + { + VSILOGE("Call vxTensorTableLookupLayer fail."); + goto OnError; + } + +OnError: + if (lut1) + { + vxReleaseLUT(&lut1); + lut1 = NULL; + } + if (lut2) + { + vxReleaseLUT(&lut2); + lut2 = NULL; + } + return (vsi_nn_kernel_node_t)node; +#else + return NULL; +#endif +} /* _setup() */ + +#define REGISTER_CLIP_OPENVX_KERNEL(KERNEL_NAME, UNARY_FUNC) \ + static vsi_nn_kernel_node_t _##KERNEL_NAME##_setup \ + ( \ + vsi_nn_graph_t * graph, \ + vsi_nn_tensor_t ** inputs, \ + size_t input_num, \ + vsi_nn_tensor_t ** outputs, \ + size_t output_num, \ + const vsi_nn_kernel_param_t * params, \ + vsi_nn_kernel_t * kernel \ + ) \ + { \ + return _setup(graph, inputs, input_num, outputs, output_num, \ + params, kernel, UNARY_FUNC); \ + } \ + REGISTER_BACKEND_OPENVX( KERNEL_NAME, _##KERNEL_NAME##_setup ) + +REGISTER_CLIP_OPENVX_KERNEL( erf, erf_eval ) + +#undef REGISTER_CLIP_OPENVX_KERNEL diff --git a/src/tim/vx/internal/src/libnnext/ops/cl/eltwise_ops_helper.cl b/src/tim/vx/internal/src/libnnext/ops/cl/eltwise_ops_helper.cl index cd255bb..9a322a5 100644 --- a/src/tim/vx/internal/src/libnnext/ops/cl/eltwise_ops_helper.cl +++ b/src/tim/vx/internal/src/libnnext/ops/cl/eltwise_ops_helper.cl @@ -1,6 +1,62 @@ #pragma OPENCL EXTENSION CL_VIV_asm : enable #pragma OPENCL EXTENSION cl_viv_vx_extension : enable +typedef struct Image +{ + __global uchar *ptr; + int stride_x; + int stride_y; +} Image; + +inline uchar* get_image_ptr_from_coord(Image img, int2 coord) +{ + return img.ptr + coord.x * img.stride_x + coord.y * img.stride_y; +} + +inline Image create_image_from_image2d(image2d_t input, int stride_x) +{ + int8 desc; + _viv_asm(COPY, desc, input, sizeof(desc)); + + Image img = + { + .ptr = (uchar*)desc.s0, + .stride_x = stride_x, + .stride_y = desc.s1 + }; + + return img; +} + +typedef struct Tensor +{ + __global uchar *ptr; + int stride_x; + int stride_y; + int stride_z; +} Tensor; + +inline uchar* create_tensor_ptr_from_coord(Tensor t, int4 coord) +{ + return t.ptr + coord.x * t.stride_x + coord.y * t.stride_y + coord.z * t.stride_z; +} + +inline Tensor create_tensor_from_image2d_array(image2d_array_t input, int stride_x) +{ + int8 desc; + _viv_asm(COPY, desc, input, sizeof(desc)); + + Tensor t = + { + .ptr = (uchar*)desc.s0, + .stride_x = stride_x, + .stride_y = desc.s1, + .stride_z = desc.s4 + }; + + return t; +} + #define readImage2DArray(Dest, Image, Coord) \ do { \ int8 desc; \ diff --git a/src/tim/vx/internal/src/libnnext/ops/cl/eltwise_unary.cl b/src/tim/vx/internal/src/libnnext/ops/cl/eltwise_unary.cl index 68febfb..5b23144 100644 --- a/src/tim/vx/internal/src/libnnext/ops/cl/eltwise_unary.cl +++ b/src/tim/vx/internal/src/libnnext/ops/cl/eltwise_unary.cl @@ -64,6 +64,11 @@ float4 eltwise_unary_mish(float4 x, float alpha) return x; } +float4 eltwise_unary_round(float4 x, float alpha) +{ + return convert_float4(convert_int4_rte(x)); +} + #define ELTWISE_UNARY_F32(func_name) \ __kernel void func_name##_F32toF32 \ ( \ @@ -91,6 +96,7 @@ ELTWISE_UNARY_F32(elu) ELTWISE_UNARY_F32(neg) ELTWISE_UNARY_F32(mish) ELTWISE_UNARY_F32(hard_sigmoid) +ELTWISE_UNARY_F32(round) #define ELTWISE_UNARY_F32_2D(func_name) \ __kernel void func_name##_F32toF32_2D \ @@ -119,6 +125,7 @@ ELTWISE_UNARY_F32_2D(elu) ELTWISE_UNARY_F32_2D(neg) ELTWISE_UNARY_F32_2D(mish) ELTWISE_UNARY_F32_2D(hard_sigmoid) +ELTWISE_UNARY_F32_2D(round) #define ELTWISE_UNARY_U8(func_name) \ 
__kernel void func_name##_U8toU8 \ @@ -149,6 +156,7 @@ ELTWISE_UNARY_U8(elu) ELTWISE_UNARY_U8(neg) ELTWISE_UNARY_U8(mish) ELTWISE_UNARY_U8(hard_sigmoid) +ELTWISE_UNARY_U8(round) #define ELTWISE_UNARY_U8_2D(func_name) \ __kernel void func_name##_U8toU8_2D \ @@ -179,7 +187,7 @@ ELTWISE_UNARY_U8_2D(elu) ELTWISE_UNARY_U8_2D(neg) ELTWISE_UNARY_U8_2D(mish) ELTWISE_UNARY_U8_2D(hard_sigmoid) - +ELTWISE_UNARY_U8_2D(round) __kernel void neg_I32toI32 ( diff --git a/src/tim/vx/internal/src/libnnext/ops/cl/erf.cl b/src/tim/vx/internal/src/libnnext/ops/cl/erf.cl new file mode 100644 index 0000000..9f38f95 --- /dev/null +++ b/src/tim/vx/internal/src/libnnext/ops/cl/erf.cl @@ -0,0 +1,113 @@ +#define MUL2_RSQRTPI (1.1283791670955126f) +float eltwise_unary_erf(float x) +{ + float res = 0; + float tmp = x; + float factorial = 1; + float x_pow = x; + float one = 1.0f; + float n = 1; + + while (fabs(tmp) > 1e-5) + { + res += tmp; + + factorial *= n; + one *= -1; + x_pow *= x * x; + tmp = one / factorial * x_pow / ( 2 * n + 1); + + n += 1.0f; + } + return res * MUL2_RSQRTPI; +} + +#define ELTWISE_UNARY_F32(func_name) \ +__kernel void func_name##_F32toF32 \ + ( \ + __read_only image2d_array_t input, \ + __write_only image2d_array_t output, \ + float inputScale, \ + float inputTail, \ + float outputScale, \ + float outputZP \ + ) \ +{ \ + int4 coord = (int4)(get_global_id(0), get_global_id(1), get_global_id(2), 0); \ + \ + float4 src = read_imagef(input, coord); \ + \ + float4 dst = 0; \ + dst.x = eltwise_unary_##func_name(src.x); \ + \ + write_imagef(output, coord, dst); \ +} +ELTWISE_UNARY_F32(erf) + +#define ELTWISE_UNARY_F32_2D(func_name) \ +__kernel void func_name##_F32toF32_2D \ + ( \ + __read_only image2d_t input, \ + __write_only image2d_t output, \ + float inputScale, \ + float inputTail, \ + float outputScale, \ + float outputZP \ + ) \ +{ \ + int2 coord = (int2)(get_global_id(0), get_global_id(1)); \ + \ + float4 src = read_imagef(input, coord); \ + \ + float4 dst = 0; \ + dst.x = eltwise_unary_##func_name(src.x); \ + \ + write_imagef(output, coord, dst); \ +} +ELTWISE_UNARY_F32_2D(erf) + +#define ELTWISE_UNARY_U8(func_name) \ +__kernel void func_name##_U8toU8 \ + ( \ + __read_only image2d_array_t input, \ + __write_only image2d_array_t output, \ + float inputScale, \ + float inputTail, \ + float outputScale, \ + float outputZP \ + ) \ +{ \ + int4 coord = (int4)(get_global_id(0), get_global_id(1), get_global_id(2), 0); \ + \ + uint4 src = read_imageui(input, coord); \ + float4 data = convert_float4(src) * inputScale - inputTail; \ + \ + data.x = eltwise_unary_##func_name(data.x); \ + uint4 dst = convert_uint4(data * outputScale + outputZP); \ + \ + write_imageui(output, coord, dst); \ +} +ELTWISE_UNARY_U8(erf) + +#define ELTWISE_UNARY_U8_2D(func_name) \ +__kernel void func_name##_U8toU8_2D \ + ( \ + __read_only image2d_t input, \ + __write_only image2d_t output, \ + float inputScale, \ + float inputTail, \ + float outputScale, \ + float outputZP \ + ) \ +{ \ + int2 coord = (int2)(get_global_id(0), get_global_id(1)); \ + \ + uint4 src = read_imageui(input, coord); \ + float4 data = convert_float4(src) * inputScale - inputTail; \ + \ + data.x = eltwise_unary_##func_name(data.x); \ + uint4 dst = convert_uint4(data * outputScale + outputZP); \ + \ + write_imageui(output, coord, dst); \ +} +ELTWISE_UNARY_U8_2D(erf) diff --git a/src/tim/vx/internal/src/libnnext/ops/cl/floordiv.cl b/src/tim/vx/internal/src/libnnext/ops/cl/floordiv.cl index 2164ea2..581694a 100644 --- 
a/src/tim/vx/internal/src/libnnext/ops/cl/floordiv.cl +++ b/src/tim/vx/internal/src/libnnext/ops/cl/floordiv.cl @@ -50,6 +50,44 @@ __kernel void floordiv_I32I32toI32_2D( write_imagei(output, coord, dst); } +__kernel void floordiv_I32I32toU8( + __read_only image2d_array_t input, + __read_only image2d_array_t input1, + __write_only image2d_array_t output, + float input0Scale, + float input0Tail, + float input1Scale, + float input1Tail, + float outputScale, + float outputTail ) +{ + int4 coord = (int4)(get_global_id(0), get_global_id(1), get_global_id(2), 0); + int4 src0; + int4 src1; + readImage2DArray(src0, input, coord); + readImage2DArray(src1, input1, coord); + uint4 dst = convert_uint4(floor(convert_float4(src0) / convert_float4(src1)) * outputScale + outputTail); + write_imageui(output, coord, dst); +} + +__kernel void floordiv_I32I32toU8_2D( + __read_only image2d_t input, + __read_only image2d_t input1, + __write_only image2d_t output, + float input0Scale, + float input0Tail, + float input1Scale, + float input1Tail, + float outputScale, + float outputTail ) +{ + int2 coord = (int2)(get_global_id(0), get_global_id(1)); + int4 src0 = read_imagei(input, coord); + int4 src1 = read_imagei(input1, coord); + uint4 dst = convert_uint4(floor(convert_float4(src0) / convert_float4(src1)) * outputScale + outputTail); + write_imageui(output, coord, dst); +} + __kernel void floordiv_U8U8toU8( __read_only image2d_array_t input, __read_only image2d_array_t input1, @@ -94,3 +132,49 @@ __kernel void floordiv_U8U8toU8_2D( uint4 dst = convert_uint4(out); write_imageui(output, coord, dst); } + +__kernel void floordiv_U8I32toU8( + __read_only image2d_array_t input, + __read_only image2d_array_t input1, + __write_only image2d_array_t output, + float input0Scale, + float input0Tail, + float input1Scale, + float input1Tail, + float outputScale, + float outputTail ) +{ + int4 coord = (int4)(get_global_id(0), get_global_id(1), get_global_id(2), 0); + uint4 src0; + int4 src1; + float4 in0, in1, out; + readImage2DArray(src0, input, coord); + readImage2DArray(src1, input1, coord); + in0 = convert_float4(src0) * input0Scale + input0Tail; + in1 = convert_float4(src1); + out = floor(in0 / in1) * outputScale + outputTail; + uint4 dst = convert_uint4(out); + write_imageui(output, coord, dst); +} + +__kernel void floordiv_U8I32toU8_2D( + __read_only image2d_t input, + __read_only image2d_t input1, + __write_only image2d_t output, + float input0Scale, + float input0Tail, + float input1Scale, + float input1Tail, + float outputScale, + float outputTail ) +{ + int2 coord = (int2)(get_global_id(0), get_global_id(1)); + uint4 src0 = read_imageui(input, coord); + int4 src1 = read_imagei(input1, coord); + float4 in0, in1, out; + in0 = convert_float4(src0) * input0Scale + input0Tail; + in1 = convert_float4(src1); + out = floor(in0 / in1) * outputScale + outputTail; + uint4 dst = convert_uint4(out); + write_imageui(output, coord, dst); +} diff --git a/src/tim/vx/internal/src/libnnext/ops/cl/group_normalization_f32.cl b/src/tim/vx/internal/src/libnnext/ops/cl/group_normalization_f32.cl new file mode 100644 index 0000000..cfb6014 --- /dev/null +++ b/src/tim/vx/internal/src/libnnext/ops/cl/group_normalization_f32.cl @@ -0,0 +1,248 @@ +__kernel void group_norm_sumsqr_F32( + __read_only image2d_array_t input, + __write_only image2d_t output, + float eps, + int is2d, + float input_zp, + float input_scale, + int width, + int height + ) +{ + int gidx = get_global_id(0); + int gidz = get_global_id(1); + int lidx = get_local_id(0); + + 
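+    /* Partial reduction: each work-item accumulates the sum and
+     * sum-of-squares of one x column of its input slice; the partials are
+     * staged in the 16-entry local arrays below, and work-item 0 folds
+     * them as four float4 dot products before writing sum and sqr to
+     * x = group_id(0) * 4 and group_id(0) * 4 + 1 of the output row. */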
int4 coord = (int4)(gidx, 0, gidz, 0); + float4 data; + float sum = 0, sqr = 0; + + __local float lcl_sum[16]; + __local float lcl_sqr[16]; + + if(gidx < width) + { + for(coord.y = 0; coord.y < height;) + { + data = read_imagef(input, coord); + coord.y++; + sum += data.x; + sqr += data.x * data.x; + } + } + lcl_sum[lidx] = sum; + lcl_sqr[lidx] = sqr; + barrier(CLK_LOCAL_MEM_FENCE); + + int4 coord_out = (int4)(get_group_id(0) << 2, gidz, 0, 0); + if(lidx == 0) + { + float4 one = (float4)(1, 1, 1, 1); + __local float4* tmp_sum = (__local float4*)lcl_sum; + __local float4* tmp_sqr = (__local float4*)lcl_sqr; + + sum = 0; sqr = 0; + for(int i = 0; i < 4; i++) + { + sum += dot(tmp_sum[i], one); + sqr += dot(tmp_sqr[i], one); + } + + float4 dst = (float4)(0); + dst.x = sum; + write_imagef(output, coord_out.xy, dst); + coord_out.x++; + dst.x = sqr; + write_imagef(output, coord_out.xy, dst); + } +} + +__kernel void group_norm_sumsqr_F32_2D( + __read_only image2d_t input, + __write_only image2d_t output, + float eps, + int is2d, + float input_zp, + float input_scale, + int width, + int height + ) +{ + int gidx = get_global_id(0); + int gidz = get_global_id(1); + int lidx = get_local_id(0); + + int2 coord = (int2)(gidx, gidz); + float4 data; + float sum = 0, sqr = 0; + + __local float lcl_sum[16]; + __local float lcl_sqr[16]; + + if(gidx < width) + { + data = read_imagef(input, coord); + sum = data.x; + sqr = data.x * data.x; + } + lcl_sum[lidx] = sum; + lcl_sqr[lidx] = sqr; + barrier(CLK_LOCAL_MEM_FENCE); + + int4 coord_out = (int4)(get_group_id(0) << 2, gidz, 0, 0); + if(lidx == 0) + { + float4 one = (float4)(1, 1, 1, 1); + __local float4* tmp_sum = (__local float4*)lcl_sum; + __local float4* tmp_sqr = (__local float4*)lcl_sqr; + + sum = 0; sqr = 0; + for(int i = 0; i < 4; i++) + { + sum += dot(tmp_sum[i], one); + sqr += dot(tmp_sqr[i], one); + } + + float4 dst = (float4)(0); + dst.x = sum; + write_imagef(output, coord_out.xy, dst); + coord_out.x++; + dst.x = sqr; + write_imagef(output, coord_out.xy, dst); + } +} + +__kernel void group_norm_meanvari( + __read_only image2d_t input, + __write_only image2d_t output, + float eps, + float group_ratio, + int group_stride + ) +{ + int gidx = get_global_id(0); + int lidx = get_local_id(0); + int2 coord = (int2)(gidx, get_global_id(1)); + + float2 sum_sqr = (float2)(0); + float4 mean_vari = (float4)(0); + + __local float2 lcl_data[16]; + __local float2 lcl_sum[4]; + + for(; coord.x < group_stride;) + { + mean_vari.x += read_imagef(input, coord).x; + coord.x++; + mean_vari.y += read_imagef(input, coord).x; + coord.x+=63; + } + lcl_data[lidx] = mean_vari.xy; + barrier(CLK_LOCAL_MEM_FENCE); + if(lidx < 4) + { + float2 tmpSum = (float2)(0); + for(int i = lidx; i < 16; i+=4) + { + tmpSum += lcl_data[i]; + } + lcl_sum[lidx] = tmpSum; + } + barrier(CLK_LOCAL_MEM_FENCE); + if(lidx == 0) + { + for(int i = 0; i < 4; i++) + { + sum_sqr += lcl_sum[i]; + } + mean_vari.xy = sum_sqr * group_ratio; + mean_vari.s1 = mean_vari.s1 - mean_vari.s0 * mean_vari.s0 + eps; + mean_vari.s1 = rsqrt(mean_vari.s1); + + coord.x = 0; + write_imagef(output, coord, mean_vari); + coord.x++; + float4 data; + data.x = mean_vari.y; + write_imagef(output, coord, data); + } +} + +__kernel void group_norm_F32toF32( + __read_only image2d_array_t input, + __read_only image2d_t bias, + __read_only image2d_t scale, + __read_only image2d_t meanVari, + __write_only image2d_array_t output, + float eps, + int is2d, + float input_zp, + float input_scale, + float output_zp, + float output_scale, + float 
rSpaceOrg, + int width, + int height, + int pStride + ) +{ + int gidy = get_global_id(1); + int gidz = get_global_id(2); + int4 coord = (int4)(get_global_id(0), gidy, gidz, 0); + int4 coord_para = (int4)((convert_int(get_global_id(0) * rSpaceOrg) + gidy * pStride), gidz, 0, 1); + + float4 gamma = read_imagef(scale, coord_para.xy); + float4 beta = read_imagef(bias, coord_para.xy); + float4 mean_vari = read_imagef(meanVari, coord_para.zy); + mean_vari.y = read_imagef(meanVari, coord_para.wy).x; + float4 data = read_imagef(input, coord); + + float scale_vari, bias_val; + + scale_vari = gamma.s0 * mean_vari.s1; + bias_val = (beta.s0 - scale_vari * mean_vari.s0); + + float4 dst; + + dst.x = data.x * scale_vari + bias_val; + write_imagef(output, coord, dst); +} + +__kernel void group_norm_F32toF32_2D( + __read_only image2d_t input, + __read_only image2d_t bias, + __read_only image2d_t scale, + __read_only image2d_t meanVari, + __write_only image2d_t output, + float eps, + int is2d, + float input_zp, + float input_scale, + float output_zp, + float output_scale, + float rSpaceOrg, + int width, + int height, + int pStride + ) +{ + int gidz = get_global_id(1); + int2 coord = (int2)(get_global_id(0), gidz); + int4 coord_para = (int4)(convert_int(get_global_id(0) * rSpaceOrg), gidz, 0, 1); + + float4 gamma = read_imagef(scale, coord_para.xy); + float4 beta = read_imagef(bias, coord_para.xy); + float4 mean_vari = read_imagef(meanVari, coord_para.zy); + mean_vari.y = read_imagef(meanVari, coord_para.wy).x; + float4 data = read_imagef(input, coord); + + float scale_vari, bias_val; + + scale_vari = gamma.s0 * mean_vari.s1; + bias_val = beta.s0 - scale_vari * mean_vari.s0; + + float4 dst; + + dst.x = data.x * scale_vari + bias_val; + write_imagef(output, coord, dst); +} diff --git a/src/tim/vx/internal/src/libnnext/ops/cl/group_normalization_i32.cl b/src/tim/vx/internal/src/libnnext/ops/cl/group_normalization_i32.cl new file mode 100644 index 0000000..72690c7 --- /dev/null +++ b/src/tim/vx/internal/src/libnnext/ops/cl/group_normalization_i32.cl @@ -0,0 +1,278 @@ +__kernel void group_norm_sumsqr_I32( + __read_only image2d_array_t input, + __write_only image2d_t output, + float eps, + int is2d, + float input_zp, + float input_scale, + int width, + int height + ) +{ + int gidx = get_global_id(0); + int gidz = get_global_id(1); + int lidx = get_local_id(0); + + int4 coord = (int4)(gidx, 0, gidz, 0); + float4 data; + float sum = 0, sqr = 0; + float tmpSum = 0; + float e2InScale = input_scale * input_scale; + + __local float lcl_sum[16]; + __local float lcl_sqr[16]; + + if(gidx < width) + { + for(coord.y = 0; coord.y < height;) + { + data = convert_float4(read_imagei(input, coord)); + coord.y++; + tmpSum += data.x; + sqr += (data.x * data.x * e2InScale); + } + sum = tmpSum * input_scale; + } + lcl_sum[lidx] = sum; + lcl_sqr[lidx] = sqr; + barrier(CLK_LOCAL_MEM_FENCE); + + int4 coord_out = (int4)(get_group_id(0) << 2, gidz, 0, 0); + if(lidx == 0) + { + float4 one = (float4)(1, 1, 1, 1); + __local float4* tmp_sum = (__local float4*)lcl_sum; + __local float4* tmp_sqr = (__local float4*)lcl_sqr; + + sum = 0; sqr = 0; + for(int i = 0; i < 4; i++) + { + sum += dot(tmp_sum[i], one); + sqr += dot(tmp_sqr[i], one); + } + + float4 dst = (float4)(0); + dst.x = sum; + write_imagef(output, coord_out.xy, dst); + coord_out.x++; + dst.x = sqr; + write_imagef(output, coord_out.xy, dst); + } +} + +__kernel void group_norm_sumsqr_I32_2D( + __read_only image2d_t input, + __write_only image2d_t output, + float eps, + int is2d, + 
float input_zp, + float input_scale, + int width, + int height + ) +{ + int gidx = get_global_id(0); + int gidz = get_global_id(1); + int lidx = get_local_id(0); + + int2 coord = (int2)(gidx, gidz); + float4 data; + float sum = 0, sqr = 0; + + __local float lcl_sum[16]; + __local float lcl_sqr[16]; + + if(gidx < width) + { + data = convert_float4(read_imagei(input, coord)); + sum = data.x * input_scale; + sqr = sum * sum; + } + lcl_sum[lidx] = sum; + lcl_sqr[lidx] = sqr; + barrier(CLK_LOCAL_MEM_FENCE); + + int4 coord_out = (int4)(get_group_id(0) << 2, gidz, 0, 0); + if(lidx == 0) + { + float4 one = (float4)(1, 1, 1, 1); + __local float4* tmp_sum = (__local float4*)lcl_sum; + __local float4* tmp_sqr = (__local float4*)lcl_sqr; + + sum = 0; sqr = 0; + for(int i = 0; i < 4; i++) + { + sum += dot(tmp_sum[i], one); + sqr += dot(tmp_sqr[i], one); + } + + float4 dst = (float4)(0); + dst.x = sum; + write_imagef(output, coord_out.xy, dst); + coord_out.x++; + dst.x = sqr; + write_imagef(output, coord_out.xy, dst); + } +} + +__kernel void group_norm_I32toI32( + __read_only image2d_array_t input, + __read_only image2d_t bias, + __read_only image2d_t scale, + __read_only image2d_t meanVari, + __write_only image2d_array_t output, + float eps, + int is2d, + float input_zp, + float input_scale, + float output_zp, + float output_scale, + float rSpaceOrg, + int width, + int height, + int pStride + ) +{ + int gidy = get_global_id(1); + int gidz = get_global_id(2); + int4 coord = (int4)(get_global_id(0), gidy, gidz, 0); + int4 coord_para = (int4)((convert_int(get_global_id(0) * rSpaceOrg) + gidy * pStride), gidz, 0, 1); + + float4 gamma = read_imagef(scale, coord_para.xy); + float4 beta = read_imagef(bias, coord_para.xy); + float4 mean_vari = read_imagef(meanVari, coord_para.zy); + mean_vari.y = read_imagef(meanVari, coord_para.wy).x; + float4 data = convert_float4(read_imagei(input, coord)); + + float scale_vari, bias_val; + + scale_vari = gamma.s0 * mean_vari.s1; + float alpha = input_scale * output_scale * scale_vari; + bias_val = (beta.s0 - scale_vari * mean_vari.s0) * output_scale; + + int4 dst; + float4 norm; + norm.x = data.x * alpha + bias_val; + dst = convert_int4_rte(norm); + write_imagei(output, coord, dst); +} + +__kernel void group_norm_I32toI32_2D( + __read_only image2d_t input, + __read_only image2d_t bias, + __read_only image2d_t scale, + __read_only image2d_t meanVari, + __write_only image2d_t output, + float eps, + int is2d, + float input_zp, + float input_scale, + float output_zp, + float output_scale, + float rSpaceOrg, + int width, + int height, + int pStride + ) +{ + int gidz = get_global_id(1); + int2 coord = (int2)(get_global_id(0), gidz); + int4 coord_para = (int4)(convert_int(get_global_id(0) * rSpaceOrg), gidz, 0, 1); + + float4 gamma = read_imagef(scale, coord_para.xy); + float4 beta = read_imagef(bias, coord_para.xy); + float4 mean_vari = read_imagef(meanVari, coord_para.zy); + mean_vari.y = read_imagef(meanVari, coord_para.wy).x; + float4 data = convert_float4(read_imagei(input, coord)); + + float scale_vari, bias_val; + + scale_vari = gamma.s0 * mean_vari.s1; + float alpha = input_scale * output_scale * scale_vari; + bias_val = (beta.s0 - scale_vari * mean_vari.s0) * output_scale; + + int4 dst; + float4 norm; + norm.x = data.x * alpha + bias_val; + dst = convert_int4_rte(norm); + write_imagei(output, coord, dst); +} + +__kernel void group_norm_I32toF32( + __read_only image2d_array_t input, + __read_only image2d_t bias, + __read_only image2d_t scale, + __read_only image2d_t 
meanVari, + __write_only image2d_array_t output, + float eps, + int is2d, + float input_zp, + float input_scale, + float output_zp, + float output_scale, + float rSpaceOrg, + int width, + int height, + int pStride + ) +{ + int gidy = get_global_id(1); + int gidz = get_global_id(2); + int4 coord = (int4)(get_global_id(0), gidy, gidz, 0); + int4 coord_para = (int4)((convert_int(get_global_id(0) * rSpaceOrg) + gidy * pStride), gidz, 0, 1); + + float4 gamma = read_imagef(scale, coord_para.xy); + float4 beta = read_imagef(bias, coord_para.xy); + float4 mean_vari = read_imagef(meanVari, coord_para.zy); + mean_vari.y = read_imagef(meanVari, coord_para.wy).x; + float4 data = convert_float4(read_imagei(input, coord)); + + float scale_vari, bias_val; + + scale_vari = gamma.s0 * mean_vari.s1; + float alpha = input_scale * scale_vari; + bias_val = (beta.s0 - scale_vari * mean_vari.s0); + + float4 norm; + norm.x = data.x * alpha + bias_val; + write_imagef(output, coord, norm); +} + +__kernel void group_norm_I32toF32_2D( + __read_only image2d_t input, + __read_only image2d_t bias, + __read_only image2d_t scale, + __read_only image2d_t meanVari, + __write_only image2d_t output, + float eps, + int is2d, + float input_zp, + float input_scale, + float output_zp, + float output_scale, + float rSpaceOrg, + int width, + int height, + int pStride + ) +{ + int gidz = get_global_id(1); + int2 coord = (int2)(get_global_id(0), gidz); + int4 coord_para = (int4)(convert_int(get_global_id(0) * rSpaceOrg), gidz, 0, 1); + + float4 gamma = read_imagef(scale, coord_para.xy); + float4 beta = read_imagef(bias, coord_para.xy); + float4 mean_vari = read_imagef(meanVari, coord_para.zy); + mean_vari.y = read_imagef(meanVari, coord_para.wy).x; + float4 data = convert_float4(read_imagei(input, coord)); + + float scale_vari, bias_val; + + scale_vari = gamma.s0 * mean_vari.s1; + float alpha = input_scale * scale_vari; + bias_val = beta.s0 - scale_vari * mean_vari.s0; + + float4 norm; + norm.x = data.x * alpha + bias_val; + write_imagef(output, coord, norm); +} diff --git a/src/tim/vx/internal/src/libnnext/ops/cl/group_normalization_u8.cl b/src/tim/vx/internal/src/libnnext/ops/cl/group_normalization_u8.cl new file mode 100644 index 0000000..a7ccd60 --- /dev/null +++ b/src/tim/vx/internal/src/libnnext/ops/cl/group_normalization_u8.cl @@ -0,0 +1,287 @@ +__kernel void group_norm_sumsqr_U8( + __read_only image2d_array_t input, + __write_only image2d_t output, + float eps, + int is2d, + float input_zp, + float input_scale, + int width, + int height + ) +{ + int gidx = get_global_id(0); + int gidz = get_global_id(1); + int lidx = get_local_id(0); + + int4 coord = (int4)(gidx, 0, gidz, 0); + float4 data; + float sum = 0, sqr = 0; + float tmpSum = 0, tmpSqr = 0; + float e2InScale = input_scale * input_scale; + + __local float lcl_sum[16]; + __local float lcl_sqr[16]; + + if(gidx < width) + { + for(coord.y = 0; coord.y < height;) + { + data = convert_float4(read_imageui(input, coord)); + coord.y++; + tmpSum += data.x; + tmpSqr += data.x * data.x; + } + sqr = (tmpSqr - 2 * input_zp * tmpSum + height * input_zp * input_zp) * e2InScale; + sum = (tmpSum - height * input_zp) * input_scale; + } + lcl_sum[lidx] = sum; + lcl_sqr[lidx] = sqr; + barrier(CLK_LOCAL_MEM_FENCE); + + int4 coord_out = (int4)(get_group_id(0) << 2, gidz, 0, 0); + if(lidx == 0) + { + float4 one = (float4)(1, 1, 1, 1); + __local float4* tmp_sum = (__local float4*)lcl_sum; + __local float4* tmp_sqr = (__local float4*)lcl_sqr; + + sum = 0; sqr = 0; + for(int i = 0; i < 4; i++) + 
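+        /* Fold the 16 staged partials, viewed as four float4 vectors, into
+         * scalar sum / sum-of-squares; the U8 zero-point and input scale
+         * have already been applied before the values were staged. */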
{ + sum += dot(tmp_sum[i], one); + sqr += dot(tmp_sqr[i], one); + } + + float4 dst = (float4)(0); + dst.x = sum; + write_imagef(output, coord_out.xy, dst); + coord_out.x++; + dst.x = sqr; + write_imagef(output, coord_out.xy, dst); + } +} + +__kernel void group_norm_sumsqr_U8_2D( + __read_only image2d_t input, + __write_only image2d_t output, + float eps, + int is2d, + float input_zp, + float input_scale, + int width, + int height + ) +{ + int gidx = get_global_id(0); + int gidz = get_global_id(1); + int lidx = get_local_id(0); + + int2 coord = (int2)(gidx, gidz); + float4 data; + float sum = 0, sqr = 0; + + __local float lcl_sum[16]; + __local float lcl_sqr[16]; + + if(gidx < width) + { + data = convert_float4(read_imageui(input, coord)); + sum = (data.x - input_zp) * input_scale; + sqr = sum * sum; + } + lcl_sum[lidx] = sum; + lcl_sqr[lidx] = sqr; + barrier(CLK_LOCAL_MEM_FENCE); + + int4 coord_out = (int4)(get_group_id(0) << 2, gidz, 0, 0); + if(lidx == 0) + { + float4 one = (float4)(1, 1, 1, 1); + __local float4* tmp_sum = (__local float4*)lcl_sum; + __local float4* tmp_sqr = (__local float4*)lcl_sqr; + + sum = 0; sqr = 0; + for(int i = 0; i < 4; i++) + { + sum += dot(tmp_sum[i], one); + sqr += dot(tmp_sqr[i], one); + } + + float4 dst = (float4)(0); + dst.x = sum; + write_imagef(output, coord_out.xy, dst); + coord_out.x++; + dst.x = sqr; + write_imagef(output, coord_out.xy, dst); + } +} + +__kernel void group_norm_U8toU8( + __read_only image2d_array_t input, + __read_only image2d_t bias, + __read_only image2d_t scale, + __read_only image2d_t meanVari, + __write_only image2d_array_t output, + float eps, + int is2d, + float input_zp, + float input_scale, + float output_zp, + float output_scale, + float rSpaceOrg, + int width, + int height, + int pStride + ) +{ + int gidy = get_global_id(1); + int gidz = get_global_id(2); + int4 coord = (int4)(get_global_id(0), gidy, gidz, 0); + int4 coord_para = (int4)((convert_int(get_global_id(0) * rSpaceOrg) + gidy * pStride), gidz, 0, 1); + + float4 gamma = read_imagef(scale, coord_para.xy); + float4 beta = read_imagef(bias, coord_para.xy); + float4 mean_vari = read_imagef(meanVari, coord_para.zy); + mean_vari.y = read_imagef(meanVari, coord_para.wy).x; + float4 data = convert_float4(read_imageui(input, coord)); + + float scale_vari, bias_val; + float scale_inOut = input_scale * output_scale; + + scale_vari = gamma.s0 * mean_vari.s1; + float alpha = scale_inOut * scale_vari; + bias_val = (beta.s0 - scale_vari * mean_vari.s0) * output_scale + output_zp; + + uint4 dst; + data.x -= input_zp; + float4 norm; + norm.x = data.x * alpha + bias_val; + dst = convert_uint4_rte(norm); + write_imageui(output, coord, dst); +} + +__kernel void group_norm_U8toU8_2D( + __read_only image2d_t input, + __read_only image2d_t bias, + __read_only image2d_t scale, + __read_only image2d_t meanVari, + __write_only image2d_t output, + float eps, + int is2d, + float input_zp, + float input_scale, + float output_zp, + float output_scale, + float rSpaceOrg, + int width, + int height, + int pStride + ) +{ + int gidz = get_global_id(1); + int2 coord = (int2)(get_global_id(0), gidz); + int4 coord_para = (int4)(convert_int(get_global_id(0) * rSpaceOrg), gidz, 0, 1); + + float4 gamma = read_imagef(scale, coord_para.xy); + float4 beta = read_imagef(bias, coord_para.xy); + float4 mean_vari = read_imagef(meanVari, coord_para.zy); + mean_vari.y = read_imagef(meanVari, coord_para.wy).x; + float4 data = convert_float4(read_imageui(input, coord)); + + float scale_vari, bias_val; + float 
scale_inOut = input_scale * output_scale; + + scale_vari = gamma.s0 * mean_vari.s1; + float alpha = scale_inOut * scale_vari; + bias_val = (beta.s0 - scale_vari * mean_vari.s0) * output_scale + output_zp; + + uint4 dst; + data.x -= input_zp; + float4 norm; + norm.x = data.x * alpha + bias_val; + dst = convert_uint4_rte(norm); + write_imageui(output, coord, dst); +} + +__kernel void group_norm_U8toF32( + __read_only image2d_array_t input, + __read_only image2d_t bias, + __read_only image2d_t scale, + __read_only image2d_t meanVari, + __write_only image2d_array_t output, + float eps, + int is2d, + float input_zp, + float input_scale, + float output_zp, + float output_scale, + float rSpaceOrg, + int width, + int height, + int pStride + ) +{ + int gidy = get_global_id(1); + int gidz = get_global_id(2); + int4 coord = (int4)(get_global_id(0), gidy, gidz, 0); + int4 coord_para = (int4)((convert_int(get_global_id(0) * rSpaceOrg) + gidy * pStride), gidz, 0, 1); + + float4 gamma = read_imagef(scale, coord_para.xy); + float4 beta = read_imagef(bias, coord_para.xy); + float4 mean_vari = read_imagef(meanVari, coord_para.zy); + mean_vari.y = read_imagef(meanVari, coord_para.wy).x; + float4 data = convert_float4(read_imageui(input, coord)); + + float scale_vari, bias_val; + float scale_inOut = input_scale * output_scale; + + scale_vari = gamma.s0 * mean_vari.s1; + float alpha = scale_inOut * scale_vari; + bias_val = (beta.s0 - scale_vari * mean_vari.s0) * output_scale + output_zp; + + data.x -= input_zp; + float4 norm; + norm.x = data.x * alpha + bias_val; + write_imagef(output, coord, norm); +} + +__kernel void group_norm_U8toF32_2D( + __read_only image2d_t input, + __read_only image2d_t bias, + __read_only image2d_t scale, + __read_only image2d_t meanVari, + __write_only image2d_t output, + float eps, + int is2d, + float input_zp, + float input_scale, + float output_zp, + float output_scale, + float rSpaceOrg, + int width, + int height, + int pStride + ) +{ + int gidz = get_global_id(1); + int2 coord = (int2)(get_global_id(0), gidz); + int4 coord_para = (int4)(convert_int(get_global_id(0) * rSpaceOrg), gidz, 0, 1); + + float4 gamma = read_imagef(scale, coord_para.xy); + float4 beta = read_imagef(bias, coord_para.xy); + float4 mean_vari = read_imagef(meanVari, coord_para.zy); + mean_vari.y = read_imagef(meanVari, coord_para.wy).x; + float4 data = convert_float4(read_imageui(input, coord)); + + float scale_vari, bias_val; + float scale_inOut = input_scale * output_scale; + + scale_vari = gamma.s0 * mean_vari.s1; + float alpha = scale_inOut * scale_vari; + bias_val = (beta.s0 - scale_vari * mean_vari.s0) * output_scale + output_zp; + + data.x -= input_zp; + float4 norm; + norm.x = data.x * alpha + bias_val; + write_imagef(output, coord, norm); +} diff --git a/src/tim/vx/internal/src/libnnext/ops/cl/matrixmul.cl b/src/tim/vx/internal/src/libnnext/ops/cl/matrixmul.cl index 70a81da..9efcd9e 100644 --- a/src/tim/vx/internal/src/libnnext/ops/cl/matrixmul.cl +++ b/src/tim/vx/internal/src/libnnext/ops/cl/matrixmul.cl @@ -143,7 +143,7 @@ __kernel void gemm_transb_F32F32toF32_3D( coord_a.x = get_global_id(0); coord_a.z = get_global_id(2); - write_imagef(output, coord_b, sum); + write_imagef(output, coord_a, sum); } __kernel void gemm_transb_F32I8toF32_2D( @@ -219,5 +219,5 @@ __kernel void gemm_transb_F32I8toF32_3D( coord_a.x = get_global_id(0); coord_a.z = get_global_id(2); - write_imagef(output, coord_b, sum); + write_imagef(output, coord_a, sum); } diff --git a/src/tim/vx/internal/src/libnnext/ops/cl/one_hot.cl 
b/src/tim/vx/internal/src/libnnext/ops/cl/one_hot.cl new file mode 100644 index 0000000..d186c41 --- /dev/null +++ b/src/tim/vx/internal/src/libnnext/ops/cl/one_hot.cl @@ -0,0 +1,130 @@ +__kernel void one_hot_F32toF32 + ( + __read_only image2d_t input, + __write_only image2d_array_t output, + int depth, + float on_value, + float off_value, + float inputScale, + float inputTail + ) +{ + int4 coord = (int4)(get_global_id(0), get_global_id(1), 0, 0); + + float4 val = read_imagef(input, coord.xy); + + do + { + float4 dst; + dst.x = convert_int(val.x) == coord.z ? on_value : off_value; + + write_imagef(output, coord.xzyw, dst.xxxx); + + coord.z ++; + } while (coord.z < depth); +} + +__kernel void one_hot_I32toI32 + ( + __read_only image2d_t input, + __write_only image2d_array_t output, + int depth, + int on_value, + int off_value, + float inputScale, + float inputTail + ) +{ + int4 coord = (int4)(get_global_id(0), get_global_id(1), 0, 0); + + int4 val = read_imagei(input, coord.xy); + + do + { + int4 dst; + dst.x = val.x == coord.z ? on_value : off_value; + + write_imagei(output, coord.xzyw, dst.xxxx); + + coord.z ++; + } while (coord.z < depth); +} + +__kernel void one_hot_I32toU8 + ( + __read_only image2d_t input, + __write_only image2d_array_t output, + int depth, + uint on_value, + uint off_value, + float inputScale, + float inputTail + ) +{ + int4 coord = (int4)(get_global_id(0), get_global_id(1), 0, 0); + + int4 val = read_imagei(input, coord.xy); + do + { + uint4 dst; + dst.x = val.x == coord.z ? on_value : off_value; + + write_imageui(output, coord.xzyw, dst.xxxx); + + coord.z ++; + } while (coord.z < depth); +} + +__kernel void one_hot_I32toF32 + ( + __read_only image2d_t input, + __write_only image2d_array_t output, + int depth, + float on_value, + float off_value, + float inputScale, + float inputTail + ) +{ + int4 coord = (int4)(get_global_id(0), get_global_id(1), 0, 0); + + int4 val = read_imagei(input, coord.xy); + + do + { + float4 dst; + dst.x = val.x == coord.z ? on_value : off_value; + + write_imagef(output, coord.xzyw, dst.xxxx); + + coord.z ++; + } while (coord.z < depth); +} + +__kernel void one_hot_U8toU8 + ( + __read_only image2d_t input, + __write_only image2d_array_t output, + int depth, + uint on_value, + uint off_value, + float inputScale, + float inputTail + ) +{ + int4 coord = (int4)(get_global_id(0), get_global_id(1), 0, 0); + + uint4 src = read_imageui(input, coord.xy); + + int val = convert_int(convert_float(src.x) * inputScale - inputTail); + + do + { + uint4 dst; + dst.x = val == coord.z ? 
on_value : off_value; + + write_imageui(output, coord.xzyw, dst.xxxx); + + coord.z ++; + } while (coord.z < depth); +} diff --git a/src/tim/vx/internal/src/libnnext/ops/cl/repeat.cl b/src/tim/vx/internal/src/libnnext/ops/cl/repeat.cl new file mode 100644 index 0000000..2492a9c --- /dev/null +++ b/src/tim/vx/internal/src/libnnext/ops/cl/repeat.cl @@ -0,0 +1,176 @@ +__kernel void repeat_I32_axis0( + __read_only image2d_array_t input0, + __read_only image2d_t input1, + __write_only image2d_array_t output, + int width, int height, int channel, int axis) +{ + int4 coord = (int4)(get_global_id(0), 0, get_global_id(2), 0); + int4 coord_out = coord; + + for(coord.y = 0; coord.y < height;) + { + int4 data = read_imagei(input0, coord); + int4 len = read_imagei(input1, coord.yw); + coord.y++; + for(int i = 0; i < len.x; i++) + { + write_imagei(output, coord_out, data); + coord_out.y++; + } + } +} + +__kernel void repeat_I32_axis1( + __read_only image2d_array_t input0, + __read_only image2d_t input1, + __write_only image2d_array_t output, + int width, int height, int channel, int axis) +{ + int4 coord = (int4)(0, get_global_id(1), get_global_id(2), 0); + int4 coord_out = coord; + + for(coord.x = 0; coord.x < width;) + { + int4 data = read_imagei(input0, coord); + int4 len = read_imagei(input1, coord.xw); + coord.x++; + for(int i = 0; i < len.x; i++) + { + write_imagei(output, coord_out, data); + coord_out.x++; + } + } +} + +__kernel void repeat_I32_axis2( + __read_only image2d_array_t input0, + __read_only image2d_t input1, + __write_only image2d_array_t output, + int width, int height, int channel, int axis) +{ + int4 coord = (int4)(get_global_id(0), get_global_id(1), 0, 0); + int4 coord_out = coord; + + for(coord.z = 0; coord.z < channel;) + { + int4 data = read_imagei(input0, coord); + int4 len = read_imagei(input1, coord.zw); + coord.z++; + for(int i = 0; i < len.x; i++) + { + write_imagei(output, coord_out, data); + coord_out.z++; + } + } +} + +__kernel void repeat_I32_1D( + __read_only image2d_t input0, + __read_only image2d_t input1, + __write_only image2d_t output, + int width, int height, int channel, int axis) +{ + int2 coord = (int2)(0, 0); + int2 coord_out = coord; + + for(coord.x = 0; coord.x < width;) + { + int4 data = read_imagei(input0, coord); + int4 len = read_imagei(input1, coord.xy); + coord.x++; + for(int i = 0; i < len.x; i++) + { + write_imagei(output, coord_out, data); + coord_out.x++; + } + } +} + +__kernel void repeat_F32_axis0( + __read_only image2d_array_t input0, + __read_only image2d_t input1, + __write_only image2d_array_t output, + int width, int height, int channel, int axis) +{ + int4 coord = (int4)(get_global_id(0), 0, get_global_id(2), 0); + int4 coord_out = coord; + + for(coord.y = 0; coord.y < height;) + { + float4 data = read_imagef(input0, coord); + int4 len = read_imagei(input1, coord.yw); + coord.y++; + for(int i = 0; i < len.x; i++) + { + write_imagef(output, coord_out, data); + coord_out.y++; + } + } +} + +__kernel void repeat_F32_axis1( + __read_only image2d_array_t input0, + __read_only image2d_t input1, + __write_only image2d_array_t output, + int width, int height, int channel, int axis) +{ + int4 coord = (int4)(0, get_global_id(1), get_global_id(2), 0); + int4 coord_out = coord; + + for(coord.x = 0; coord.x < width;) + { + float4 data = read_imagef(input0, coord); + int4 len = read_imagei(input1, coord.xw); + coord.x++; + for(int i = 0; i < len.x; i++) + { + write_imagef(output, coord_out, data); + coord_out.x++; + } + } +} + +__kernel void 
repeat_F32_axis2( + __read_only image2d_array_t input0, + __read_only image2d_t input1, + __write_only image2d_array_t output, + int width, int height, int channel, int axis) +{ + int4 coord = (int4)(get_global_id(0), get_global_id(1), 0, 0); + int4 coord_out = coord; + + for(coord.z = 0; coord.z < channel;) + { + float4 data = read_imagef(input0, coord); + int4 len = read_imagei(input1, coord.zw); + coord.z++; + for(int i = 0; i < len.x; i++) + { + write_imagef(output, coord_out, data); + coord_out.z++; + } + } +} + +__kernel void repeat_F32_1D( + __read_only image2d_t input0, + __read_only image2d_t input1, + __write_only image2d_t output, + int width, int height, int channel, int axis) +{ + int2 coord = (int2)(0, 0); + int2 coord_out = coord; + + for(coord.x = 0; coord.x < width;) + { + float4 data = read_imagef(input0, coord); + int4 len = read_imagei(input1, coord.xy); + coord.x++; + for(int i = 0; i < len.x; i++) + { + write_imagef(output, coord_out, data); + coord_out.x++; + } + } +} + diff --git a/src/tim/vx/internal/src/libnnext/ops/cl/sequence_mask.cl b/src/tim/vx/internal/src/libnnext/ops/cl/sequence_mask.cl new file mode 100644 index 0000000..4813eb7 --- /dev/null +++ b/src/tim/vx/internal/src/libnnext/ops/cl/sequence_mask.cl @@ -0,0 +1,72 @@ + +__kernel void sequence_mask_I32toU8( + image2d_t input, image2d_array_t output, int maxLen, + float input_scale, float input_zpScale, float outputVal1, int output_ZP) +{ + int gidx = get_global_id(0); + int4 coord = (int4)(gidx, get_global_id(1), get_global_id(2), 0); + int4 index = read_imagei(input, coord.yz); + uint4 data; + data.x = gidx < index.x ? convert_uint_rte(outputVal1) : (uint)(output_ZP); + write_imageui(output, coord, data); +} + +__kernel void sequence_mask_I32toU8_2D( + image2d_t input, image2d_t output, int maxLen, + float input_scale, float input_zpScale, float outputVal1, int output_ZP) +{ + int gidx = get_global_id(0); + int2 coord = (int2)(gidx, get_global_id(1)); + int4 index = read_imagei(input, coord.yy); + uint4 data; + data.x = gidx < index.x ? convert_uint_rte(outputVal1) : (uint)(output_ZP); + write_imageui(output, coord, data); +} + +__kernel void sequence_mask_I32toI32( + image2d_t input, image2d_array_t output, int maxLen, + float input_scale, float input_zpScale, float outputVal1, int output_ZP) +{ + int gidx = get_global_id(0); + int4 coord = (int4)(gidx, get_global_id(1), get_global_id(2), 0); + int4 index = read_imagei(input, coord.yz); + int4 data; + data = gidx < index.x ? (int4)(1) : (int4)(0); + write_imagei(output, coord, data); +} + +__kernel void sequence_mask_I32toI32_2D( + image2d_t input, image2d_t output, int maxLen, + float input_scale, float input_zpScale, float outputVal1, int output_ZP) +{ + int gidx = get_global_id(0); + int2 coord = (int2)(gidx, get_global_id(1)); + int4 index = read_imagei(input, coord.yy); + int4 data; + data = gidx < index.x ? (int4)(1) : (int4)(0); + write_imagei(output, coord, data); +} + +__kernel void sequence_mask_I32toF32( + image2d_t input, image2d_array_t output, int maxLen, + float input_scale, float input_zpScale, float outputVal1, int output_ZP) +{ + int gidx = get_global_id(0); + int4 coord = (int4)(gidx, get_global_id(1), get_global_id(2), 0); + int4 index = read_imagei(input, coord.yz); + float4 data; + data = gidx < index.x ? 
(float4)(1.0f) : (float4)(0.0f); + write_imagef(output, coord, data); +} + +__kernel void sequence_mask_I32toF32_2D( + image2d_t input, image2d_t output, int maxLen, + float input_scale, float input_zpScale, float outputVal1, int output_ZP) +{ + int gidx = get_global_id(0); + int2 coord = (int2)(gidx, get_global_id(1)); + int4 index = read_imagei(input, coord.yy); + float4 data; + data = gidx < index.x ? (float4)(1.0f) : (float4)(0.0f); + write_imagef(output, coord, data); +} \ No newline at end of file diff --git a/src/tim/vx/internal/src/libnnext/ops/cl/slice.cl b/src/tim/vx/internal/src/libnnext/ops/cl/slice.cl new file mode 100644 index 0000000..764aca2 --- /dev/null +++ b/src/tim/vx/internal/src/libnnext/ops/cl/slice.cl @@ -0,0 +1,144 @@ +__kernel void slice_F32_I32toF32 + ( + __read_only image2d_array_t input0, + __read_only image2d_t input1, + __write_only image2d_array_t output, + float inputScale, + float inputTail, + float outputScale, + float outputZP + ) +{ + int4 coord = (int4)(get_global_id(0), get_global_id(1), get_global_id(2), 0); + int4 coord_in; + Image begin_img = create_image_from_image2d(input1, 4); + uchar* begin_ptr = begin_img.ptr; + int4 begin = ((int4 *)begin_ptr)[0]; + + coord_in = coord + begin; + float4 src = read_imagef(input0, coord_in); + + write_imagef(output, coord, src); +} + +__kernel void slice_F32_I32toF32_2D + ( + __read_only image2d_t input0, + __read_only image2d_t input1, + __write_only image2d_t output, + float inputScale, + float inputTail, + float outputScale, + float outputZP + ) +{ + int2 coord = (int2)(get_global_id(0), get_global_id(1)); + int2 coord_in; + Image begin_img = create_image_from_image2d(input1, 4); + uchar* begin_ptr = begin_img.ptr; + int2 begin = ((int2 *)begin_ptr)[0]; + + coord_in = coord + begin; + float4 src = read_imagef(input0, coord_in); + + write_imagef(output, coord, src); +} + +__kernel void slice_U8_I32toU8 + ( + __read_only image2d_array_t input0, + __read_only image2d_t input1, + __write_only image2d_array_t output, + float inputScale, + float inputTail, + float outputScale, + float outputZP + ) +{ + int4 coord = (int4)(get_global_id(0), get_global_id(1), get_global_id(2), 0); + int4 coord_in; + Image begin_img = create_image_from_image2d(input1, 4); + uchar* begin_ptr = begin_img.ptr; + int4 begin = ((int4 *)begin_ptr)[0]; + + coord_in = coord + begin; + uint4 src = read_imageui(input0, coord_in); + + float4 data = convert_float4(src) * inputScale - inputTail; + uint4 dst = convert_uint4(data * outputScale + outputZP); + + write_imageui(output, coord, dst); +} + +__kernel void slice_U8_I32toU8_2D + ( + __read_only image2d_t input0, + __read_only image2d_t input1, + __write_only image2d_t output, + float inputScale, + float inputTail, + float outputScale, + float outputZP + ) +{ + int2 coord = (int2)(get_global_id(0), get_global_id(1)); + int2 coord_in; + Image begin_img = create_image_from_image2d(input1, 4); + uchar* begin_ptr = begin_img.ptr; + int2 begin = ((int2 *)begin_ptr)[0]; + + coord_in = coord + begin; + uint4 src = read_imageui(input0, coord_in); + + float4 data = convert_float4(src) * inputScale - inputTail; + uint4 dst = convert_uint4(data * outputScale + outputZP); + + write_imageui(output, coord, dst); +} + +__kernel void slice_I32_I32toI32 + ( + __read_only image2d_array_t input0, + __read_only image2d_t input1, + __write_only image2d_array_t output, + float inputScale, + float inputTail, + float outputScale, + float outputZP + ) +{ + int4 coord = (int4)(get_global_id(0), get_global_id(1), 
get_global_id(2), 0); + int4 coord_in; + Image begin_img = create_image_from_image2d(input1, 4); + uchar* begin_ptr = begin_img.ptr; + int4 begin = ((int4 *)begin_ptr)[0]; + + coord_in = coord + begin; + int4 src = read_imagei(input0, coord_in); + + write_imagei(output, coord, src); +} + +__kernel void slice_I32_I32toI32_2D + ( + __read_only image2d_t input0, + __read_only image2d_t input1, + __write_only image2d_t output, + float inputScale, + float inputTail, + float outputScale, + float outputZP + ) +{ + int2 coord = (int2)(get_global_id(0), get_global_id(1)); + int2 coord_in; + Image begin_img = create_image_from_image2d(input1, 4); + uchar* begin_ptr = begin_img.ptr; + int2 begin = ((int2 *)begin_ptr)[0]; + + coord_in = coord + begin; + int4 src = read_imagei(input0, coord_in); + + write_imagei(output, coord, src); +} + diff --git a/src/tim/vx/internal/src/libnnext/ops/kernel/vsi_nn_kernel_axis_aligned_bbox_transform.c b/src/tim/vx/internal/src/libnnext/ops/kernel/vsi_nn_kernel_axis_aligned_bbox_transform.c deleted file mode 100644 index 2755dc8..0000000 --- a/src/tim/vx/internal/src/libnnext/ops/kernel/vsi_nn_kernel_axis_aligned_bbox_transform.c +++ /dev/null @@ -1,275 +0,0 @@ -/**************************************************************************** -* -* Copyright (c) 2020 Vivante Corporation -* -* Permission is hereby granted, free of charge, to any person obtaining a -* copy of this software and associated documentation files (the "Software"), -* to deal in the Software without restriction, including without limitation -* the rights to use, copy, modify, merge, publish, distribute, sublicense, -* and/or sell copies of the Software, and to permit persons to whom the -* Software is furnished to do so, subject to the following conditions: -* -* The above copyright notice and this permission notice shall be included in -* all copies or substantial portions of the Software. -* -* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR -* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, -* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE -* AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER -* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING -* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER -* DEALINGS IN THE SOFTWARE. 
-* -*****************************************************************************/ -#include -#include -#include -#include -#include -#include - -#include "vsi_nn_platform.h" - -#include "vsi_nn_prv.h" -#include "vsi_nn_log.h" -#include "vsi_nn_test.h" -#include "vsi_nn_tensor_util.h" -#include "utils/vsi_nn_util.h" -#include "utils/vsi_nn_dtype_util.h" -#include "utils/vsi_nn_math.h" -#include "client/vsi_nn_vxkernel.h" -#include "libnnext/vx_lib_nnext.h" - -#define _VX_KERNEL_VAR (vx_kernel_AXIS_ALIGNED_BBOX_TRANSFORM) -#define _VX_KERNEL_ID (VX_KERNEL_ENUM_AXIS_ALIGNED_BBOX_TRANSFORM) -#define _VX_KERNEL_NAME (VX_KERNEL_NAME_AXIS_ALIGNED_BBOX_TRANSFORM) -#define _VX_KERNEL_FUNC_KERNEL (vxAxis_aligned_bbox_transformKernel) - -typedef struct -{ - float x1, y1, x2, y2; -}BoxEncodingCorner; -typedef struct -{ - float w, h, x, y; -}BoxEncodingCenter; - -void toBoxEncodingCorner - ( - BoxEncodingCenter* ctr, - BoxEncodingCorner* cnr - ) -{ - cnr->x1 = ctr->x - ctr->w / 2; - cnr->y1 = ctr->y - ctr->h / 2; - cnr->x2 = ctr->x + ctr->w / 2; - cnr->y2 = ctr->y + ctr->h / 2; -} - -void toBoxEncodingCenter - ( - BoxEncodingCorner* cnr, - BoxEncodingCenter* ctr - ) -{ - ctr->w = cnr->x2 - cnr->x1; - ctr->h = cnr->y2 - cnr->y1; - ctr->x = (cnr->x1 + cnr->x2) / 2; - ctr->y = (cnr->y1 + cnr->y2) / 2; -} - -static vsi_status VX_CALLBACK vxAxis_aligned_bbox_transformKernel - ( - vx_node node, - const vx_reference* paramObj, - uint32_t paramNum - ) -{ -#define ARG_NUM (0) -#define TENSOR_NUM_INPUT (4) -#define TENSOR_NUM_OUTPUT (1) -#define TENSOR_NUM (TENSOR_NUM_INPUT+TENSOR_NUM_OUTPUT) - - vsi_status status = VSI_FAILURE; - vx_context context = NULL; - vx_tensor input[TENSOR_NUM_INPUT] = {0}; - vx_tensor output[TENSOR_NUM_OUTPUT] = {0}; - float *f32_in_buffer[TENSOR_NUM_INPUT] = {0}; - int32_t* int32_in_buffer[TENSOR_NUM_INPUT] = {0}; - float *f32_out_buffer[TENSOR_NUM_OUTPUT] = {0}; - vsi_nn_tensor_attr_t in_attr[TENSOR_NUM_INPUT]; - vsi_nn_tensor_attr_t out_attr[TENSOR_NUM_OUTPUT]; - uint32_t in_elements[TENSOR_NUM_INPUT] = {0}; - uint32_t out_elements[TENSOR_NUM_OUTPUT]= {0}; - - int32_t i; - for(i = 0; i < TENSOR_NUM_INPUT; i++) - { - memset(&in_attr[i], 0x0, sizeof(vsi_nn_tensor_attr_t)); - } - for(i = 0; i < TENSOR_NUM_OUTPUT; i++) - { - memset(&out_attr[i], 0x0, sizeof(vsi_nn_tensor_attr_t)); - } - /* prepare data */ - context = vxGetContext((vx_reference)node); - - for(i = 0; i < TENSOR_NUM_INPUT; i ++) - { - input[i] = (vx_tensor)paramObj[i]; - status = vsi_nn_vxGetTensorAttr(input[i], &in_attr[i]); - TEST_CHECK_STATUS(status, final); - in_elements[i] = vsi_nn_vxGetTensorElementNum(&in_attr[i]); - if (i == 2) - { - int32_in_buffer[i] = (int32_t *)vsi_nn_vxCopyTensorToData(context, - input[i], &in_attr[i]); - } - else - { - f32_in_buffer[i] = (float *)malloc(in_elements[i] * sizeof(float)); - status = vsi_nn_vxConvertTensorToFloat32Data( - context, input[i], &in_attr[i], f32_in_buffer[i], - in_elements[i] * sizeof(float)); - TEST_CHECK_STATUS(status, final); - } - } - for(i = 0; i < TENSOR_NUM_OUTPUT; i ++) - { - output[i] = (vx_tensor)paramObj[i + TENSOR_NUM_INPUT]; - status = vsi_nn_vxGetTensorAttr(output[i], &out_attr[i]); - TEST_CHECK_STATUS(status, final); - out_elements[i] = vsi_nn_vxGetTensorElementNum(&out_attr[i]); - f32_out_buffer[i]= (float *)malloc(out_elements[i] * sizeof(float)); - memset(f32_out_buffer[i], 0, out_elements[i] * sizeof(float)); - } - - /* TODO: Add CPU kernel implement */ - { - const uint32_t roiLength = 4; - const uint32_t imageLength = 2; - - uint32_t numClasses = 
in_attr[1].size[0] / roiLength; - uint32_t numRois = in_attr[0].size[1]; - uint32_t j; - uint32_t roiIndex; - for(roiIndex = 0; roiIndex < numRois; roiIndex++) - { - uint32_t batchIndex = int32_in_buffer[2][roiIndex]; - float imageHeight = f32_in_buffer[3][batchIndex * imageLength]; - float imageWidth = f32_in_buffer[3][batchIndex * imageLength + 1]; - BoxEncodingCorner roi_cnr; - BoxEncodingCenter roiBefore; - roi_cnr.x1 = f32_in_buffer[0][roiIndex * roiLength]; - roi_cnr.y1 = f32_in_buffer[0][roiIndex * roiLength + 1]; - roi_cnr.x2 = f32_in_buffer[0][roiIndex * roiLength + 2]; - roi_cnr.y2 = f32_in_buffer[0][roiIndex * roiLength + 3]; - toBoxEncodingCenter(&roi_cnr, &roiBefore); - for (j = 0; j < numClasses; j++) - { - BoxEncodingCenter roi_ctr; - BoxEncodingCorner roiAfter; - BoxEncodingCorner cliped; - uint32_t index = (roiIndex * numClasses + j) * roiLength; - roi_ctr.w = (float)(exp(f32_in_buffer[1][index + 2]) * roiBefore.w); - roi_ctr.h = (float)(exp(f32_in_buffer[1][index + 3]) * roiBefore.h); - roi_ctr.x = roiBefore.x + f32_in_buffer[1][index] * roiBefore.w; - roi_ctr.y = roiBefore.y + f32_in_buffer[1][index + 1] * roiBefore.h; - toBoxEncodingCorner(&roi_ctr, &roiAfter); - cliped.x1 = vsi_nn_min(vsi_nn_max(roiAfter.x1, 0.0f), imageWidth); - cliped.y1 = vsi_nn_min(vsi_nn_max(roiAfter.y1, 0.0f), imageHeight); - cliped.x2 = vsi_nn_min(vsi_nn_max(roiAfter.x2, 0.0f), imageWidth); - cliped.y2 = vsi_nn_min(vsi_nn_max(roiAfter.y2, 0.0f), imageHeight); - f32_out_buffer[0][index] = cliped.x1; - f32_out_buffer[0][index + 1] = cliped.y1; - f32_out_buffer[0][index + 2] = cliped.x2; - f32_out_buffer[0][index + 3] = cliped.y2; - } - } - } - - /* save data */ - for(i = 0; i < TENSOR_NUM_OUTPUT; i++) - { - status = vsi_nn_vxConvertFloat32DataToTensor( - context, output[i], &out_attr[i], f32_out_buffer[i], - out_elements[i] * sizeof(float)); - TEST_CHECK_STATUS(status, final); - } - -final: - for (i = 0; i < TENSOR_NUM_INPUT; i++) - { - if (f32_in_buffer[i]) free(f32_in_buffer[i]); - if (int32_in_buffer[i]) free(int32_in_buffer[i]); - } - for(i = 0; i < TENSOR_NUM_OUTPUT; i++) - { - if (f32_out_buffer[i]) free(f32_out_buffer[i]); - } - return status; -} /* _VX_KERNEL_FUNC_KERNEL() */ - -static vx_param_description_t vxAxis_aligned_bbox_transformKernelParam[] = -{ - {VX_INPUT, VX_TYPE_TENSOR, VX_PARAMETER_STATE_REQUIRED}, - {VX_INPUT, VX_TYPE_TENSOR, VX_PARAMETER_STATE_REQUIRED}, - {VX_INPUT, VX_TYPE_TENSOR, VX_PARAMETER_STATE_REQUIRED}, - {VX_INPUT, VX_TYPE_TENSOR, VX_PARAMETER_STATE_REQUIRED}, - {VX_OUTPUT, VX_TYPE_TENSOR, VX_PARAMETER_STATE_REQUIRED}, -}; - -vx_status VX_CALLBACK vxAxis_aligned_bbox_transformInitializer - ( - vx_node nodObj, - const vx_reference *paramObj, - vx_uint32 paraNum - ) -{ - vx_status status = VX_SUCCESS; - /*TODO: Add initial code for VX program*/ - - return status; -} - - -#ifdef __cplusplus -extern "C" { -#endif -vx_kernel_description_t vxAxis_aligned_bbox_transform_CPU = -{ - _VX_KERNEL_ID, - _VX_KERNEL_NAME, - _VX_KERNEL_FUNC_KERNEL, - vxAxis_aligned_bbox_transformKernelParam, - _cnt_of_array( vxAxis_aligned_bbox_transformKernelParam ), - vsi_nn_KernelValidator, - NULL, - NULL, - vsi_nn_KernelInitializer, - vsi_nn_KernelDeinitializer -}; - -vx_kernel_description_t vxAxis_aligned_bbox_transform_VX = -{ - _VX_KERNEL_ID, - _VX_KERNEL_NAME, - NULL, - vxAxis_aligned_bbox_transformKernelParam, - _cnt_of_array( vxAxis_aligned_bbox_transformKernelParam ), - vsi_nn_KernelValidator, - NULL, - NULL, - vxAxis_aligned_bbox_transformInitializer, - vsi_nn_KernelDeinitializer -}; 
- -vx_kernel_description_t * vx_kernel_AXIS_ALIGNED_BBOX_TRANSFORM_list[] = -{ - &vxAxis_aligned_bbox_transform_CPU, - &vxAxis_aligned_bbox_transform_VX, - NULL -}; -#ifdef __cplusplus -} -#endif diff --git a/src/tim/vx/internal/src/libnnext/ops/kernel/vsi_nn_kernel_box_with_nms_limit.c b/src/tim/vx/internal/src/libnnext/ops/kernel/vsi_nn_kernel_box_with_nms_limit.c index 8114caf..f4b6949 100644 --- a/src/tim/vx/internal/src/libnnext/ops/kernel/vsi_nn_kernel_box_with_nms_limit.c +++ b/src/tim/vx/internal/src/libnnext/ops/kernel/vsi_nn_kernel_box_with_nms_limit.c @@ -38,7 +38,7 @@ #include "utils/vsi_nn_dtype_util.h" #include "utils/vsi_nn_math.h" #include "utils/vsi_nn_link_list.h" -#include "client/vsi_nn_vxkernel.h" +#include "libnnext/vsi_nn_vxkernel.h" #include "libnnext/vx_lib_nnext.h" #define _VX_KERNEL_VAR (vx_kernel_BOX_WITH_NMS_LIMIT) diff --git a/src/tim/vx/internal/src/libnnext/ops/kernel/vsi_nn_kernel_extra_ending.c b/src/tim/vx/internal/src/libnnext/ops/kernel/vsi_nn_kernel_extra_ending.c index 34de7f7..f14c2f6 100644 --- a/src/tim/vx/internal/src/libnnext/ops/kernel/vsi_nn_kernel_extra_ending.c +++ b/src/tim/vx/internal/src/libnnext/ops/kernel/vsi_nn_kernel_extra_ending.c @@ -35,7 +35,7 @@ #include "utils/vsi_nn_util.h" #include "utils/vsi_nn_dtype_util.h" #include "utils/vsi_nn_math.h" -#include "client/vsi_nn_vxkernel.h" +#include "libnnext/vsi_nn_vxkernel.h" #include "libnnext/vx_lib_nnext.h" #define _VX_KERNEL_VAR (vx_kernel_EXTRA_ENDING) diff --git a/src/tim/vx/internal/src/libnnext/ops/kernel/vsi_nn_kernel_generate_proposals.c b/src/tim/vx/internal/src/libnnext/ops/kernel/vsi_nn_kernel_generate_proposals.c deleted file mode 100644 index 0c2b948..0000000 --- a/src/tim/vx/internal/src/libnnext/ops/kernel/vsi_nn_kernel_generate_proposals.c +++ /dev/null @@ -1,483 +0,0 @@ -/**************************************************************************** -* -* Copyright (c) 2020 Vivante Corporation -* -* Permission is hereby granted, free of charge, to any person obtaining a -* copy of this software and associated documentation files (the "Software"), -* to deal in the Software without restriction, including without limitation -* the rights to use, copy, modify, merge, publish, distribute, sublicense, -* and/or sell copies of the Software, and to permit persons to whom the -* Software is furnished to do so, subject to the following conditions: -* -* The above copyright notice and this permission notice shall be included in -* all copies or substantial portions of the Software. -* -* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR -* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, -* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE -* AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER -* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING -* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER -* DEALINGS IN THE SOFTWARE. 
-* -*****************************************************************************/ -#include -#include -#include -#include -#include -#include - -#include "vsi_nn_platform.h" - -#include "vsi_nn_prv.h" -#include "vsi_nn_log.h" -#include "vsi_nn_test.h" -#include "vsi_nn_tensor_util.h" -#include "utils/vsi_nn_util.h" -#include "utils/vsi_nn_dtype_util.h" -#include "utils/vsi_nn_math.h" -#include "client/vsi_nn_vxkernel.h" -#include "libnnext/vx_lib_nnext.h" - -#define _VX_KERNEL_VAR (vx_kernel_GENERATE_PROPOSALS) -#define _VX_KERNEL_ID (VX_KERNEL_ENUM_GENERATE_PROPOSALS) -#define _VX_KERNEL_NAME (VX_KERNEL_NAME_GENERATE_PROPOSALS) -#define _VX_KERNEL_FUNC_KERNEL (vxGenerate_proposalsKernel) - -typedef struct -{ - float x1, y1, x2, y2; -}BoxEncodingCorner; -typedef struct -{ - float w, h, x, y; -}BoxEncodingCenter; - -// toBoxEncodingCorner is implemented in vsi_nn_kernel_box_with_nms_limit.c -void toBoxEncodingCorner - ( - BoxEncodingCenter* ctr, - BoxEncodingCorner* cnr - ); - -// toBoxEncodingCenter is implemented in vsi_nn_kernel_box_with_nms_limit.c -void toBoxEncodingCenter - ( - BoxEncodingCorner* cnr, - BoxEncodingCenter* ctr - ); - -// iota is implemented in vsi_nn_kernel_detection_postprocess.c -static void _iota - ( - int32_t * data, - uint32_t len, - int32_t value - ) -{ - uint32_t i; - for (i = 0; i < len; i++) - { - data [i] = value; - value++; - } -} - -// swap_element is implemented in vsi_nn_kernel_box_with_nms_limit.c -void swap_element - ( - uint32_t* list, - uint32_t first, - uint32_t second - ); - -// max_element is implemented in vsi_nn_kernel_box_with_nms_limit.c -uint32_t max_element - ( - float* data, - uint32_t* index_list, - uint32_t len - ); - -// getIoUAxisAligned is implemented in vsi_nn_kernel_box_with_nms_limit.c -float getIoUAxisAligned - ( - const float* roi1, - const float* roi2 - ); - -// sort_element_by_score is implemented in vsi_nn_kernel_box_with_nms_limit.c -void sort_element_by_score - ( - float* data, - uint32_t* index_list, - uint32_t len - ); - -void filterBoxes - ( - const float* roiBase, - const float* imageInfoBase, - float minSize, - uint32_t* select, - uint32_t* len - ) -{ - const uint32_t kRoiDim = 4; - uint32_t i = 0; - uint32_t j; - for(j = 0; j < *len; j++) - { - const float* roiInfo = roiBase + select[j] * kRoiDim; - float roiWidth, roiHeight, xRoiCenter, yRoiCenter; - roiWidth = roiInfo[2] - roiInfo[0]; - roiHeight = roiInfo[3] - roiInfo[1]; - xRoiCenter = roiInfo[0] + roiWidth / 2.0f; - yRoiCenter = roiInfo[1] + roiHeight / 2.0f; - if(roiWidth > minSize && roiHeight > minSize && xRoiCenter < imageInfoBase[1] - && yRoiCenter < imageInfoBase[0]) - { - select[i] = select[j]; - i++; - } - } - *len = i; -} - -static vsi_status VX_CALLBACK vxGenerate_proposalsKernel - ( - vx_node node, - const vx_reference* paramObj, - uint32_t paramNum - ) -{ -#define ARG_NUM (6) -#define TENSOR_NUM_INPUT (4) -#define TENSOR_NUM_OUTPUT (3) -#define TENSOR_NUM (TENSOR_NUM_INPUT+TENSOR_NUM_OUTPUT) - - vsi_status status = VSI_FAILURE; - vx_context context = NULL; - vx_tensor input[TENSOR_NUM_INPUT] = {0}; - vx_tensor output[TENSOR_NUM_OUTPUT] = {0}; - float *f32_in_buffer[TENSOR_NUM_INPUT] = {0}; - float *f32_out_buffer[TENSOR_NUM_OUTPUT] = {0}; - int32_t* int32_out_buffer[TENSOR_NUM_OUTPUT] = {0}; - vsi_nn_tensor_attr_t in_attr[TENSOR_NUM_INPUT]; - vsi_nn_tensor_attr_t out_attr[TENSOR_NUM_OUTPUT]; - uint32_t in_elements[TENSOR_NUM_INPUT] = {0}; - uint32_t out_elements[TENSOR_NUM_OUTPUT]= {0}; - - float heightStride; - float widthStride; - int32_t preNmsTopN; 
- int32_t postNmsTopN; - float iouThreshold; - float minSize; - - uint32_t i = 0; - for(i = 0; i < TENSOR_NUM_INPUT; i++) - { - memset(&in_attr[i], 0x0, sizeof(vsi_nn_tensor_attr_t)); - } - for(i = 0; i < TENSOR_NUM_OUTPUT; i++) - { - memset(&out_attr[i], 0x0, sizeof(vsi_nn_tensor_attr_t)); - } - /* prepare data */ - context = vxGetContext((vx_reference)node); - - for(i = 0; i < TENSOR_NUM_INPUT; i ++) - { - input[i] = (vx_tensor)paramObj[i]; - status = vsi_nn_vxGetTensorAttr(input[i], &in_attr[i]); - TEST_CHECK_STATUS(status, final); - in_elements[i] = vsi_nn_vxGetTensorElementNum(&in_attr[i]); - f32_in_buffer[i] = (float *)malloc(in_elements[i] * sizeof(float)); - status = vsi_nn_vxConvertTensorToFloat32Data( - context, input[i], &in_attr[i], f32_in_buffer[i], - in_elements[i] * sizeof(float)); - TEST_CHECK_STATUS(status, final); - } - for(i = 0; i < TENSOR_NUM_OUTPUT; i ++) - { - output[i] = (vx_tensor)paramObj[i + TENSOR_NUM_INPUT]; - status = vsi_nn_vxGetTensorAttr(output[i], &out_attr[i]); - TEST_CHECK_STATUS(status, final); - out_elements[i] = vsi_nn_vxGetTensorElementNum(&out_attr[i]); - if(i < 2) - { - f32_out_buffer[i] = (float *)malloc(out_elements[i] * sizeof(float)); - memset(f32_out_buffer[i], 0, out_elements[i] * sizeof(float)); - } - else - { - int32_out_buffer[i] = (int32_t *)malloc(out_elements[i] * sizeof(int32_t)); - memset(int32_out_buffer[i], 0, out_elements[i] * sizeof(int32_t)); - } - } - vxCopyScalar((vx_scalar)paramObj[TENSOR_NUM], &(heightStride), - VX_READ_ONLY, VX_MEMORY_TYPE_HOST); - vxCopyScalar((vx_scalar)paramObj[TENSOR_NUM + 1], &(widthStride), - VX_READ_ONLY, VX_MEMORY_TYPE_HOST); - vxCopyScalar((vx_scalar)paramObj[TENSOR_NUM + 2], &(preNmsTopN), - VX_READ_ONLY, VX_MEMORY_TYPE_HOST); - vxCopyScalar((vx_scalar)paramObj[TENSOR_NUM + 3], &(postNmsTopN), - VX_READ_ONLY, VX_MEMORY_TYPE_HOST); - vxCopyScalar((vx_scalar)paramObj[TENSOR_NUM + 4], &(iouThreshold), - VX_READ_ONLY, VX_MEMORY_TYPE_HOST); - vxCopyScalar((vx_scalar)paramObj[TENSOR_NUM + 5], &(minSize), - VX_READ_ONLY, VX_MEMORY_TYPE_HOST); - - /* TODO: Add CPU kernel implement */ - { - uint32_t h, w, a, b, j; - const uint32_t kRoiDim = 4; - uint32_t numBatches = in_attr[0].size[3]; - uint32_t height = in_attr[0].size[2]; - uint32_t width = in_attr[0].size[1]; - uint32_t numAnchors = in_attr[0].size[0]; - uint32_t imageInfoLength = in_attr[3].size[0]; - - uint32_t batchSize = height * width * numAnchors; - uint32_t roiBufferSize = batchSize * kRoiDim; - - float * roiBuffer = (float*)malloc(roiBufferSize * sizeof(float)); - float * roiTransformedBuffer = (float*)malloc(roiBufferSize * sizeof(float)); - uint32_t* select = (uint32_t*)malloc(batchSize * sizeof(uint32_t)); - uint32_t index = 0; - uint32_t scores_index = 0; - uint32_t bboxDeltas_index = 0; - uint32_t imageInfo_index = 0; - uint32_t scores_out_index = 0; - uint32_t roi_out_index = 0; - - // Compute the roi region for each anchor. 
-        for(h = 0; h < height; h++)
-        {
-            float hShift = h * heightStride;
-            for(w = 0; w < width; w++)
-            {
-                float wShift = w * widthStride;
-                uint32_t anchor_index = 0;
-                for(a = 0; a < numAnchors; a++)
-                {
-                    roiBuffer[index] = f32_in_buffer[2][anchor_index] + wShift;
-                    roiBuffer[index + 1] = f32_in_buffer[2][anchor_index + 1] + hShift;
-                    roiBuffer[index + 2] = f32_in_buffer[2][anchor_index + 2] + wShift;
-                    roiBuffer[index + 3] = f32_in_buffer[2][anchor_index + 3] + hShift;
-
-                    index += kRoiDim;
-                    anchor_index += kRoiDim;
-                }
-            }
-        }
-
-        for(b = 0; b < numBatches; b++)
-        {
-            const uint32_t roiLength = 4;
-
-            uint32_t numRois = batchSize;
-            uint32_t roiIndex;
-            uint32_t select_len;
-            int32_t numDetections = 0;
-            for(roiIndex = 0; roiIndex < numRois; roiIndex++)
-            {
-                float imageHeight = f32_in_buffer[3][imageInfo_index];
-                float imageWidth = f32_in_buffer[3][imageInfo_index + 1];
-                BoxEncodingCorner roi_cnr;
-                BoxEncodingCenter roiBefore;
-                roi_cnr.x1 = roiBuffer[roiIndex * roiLength];
-                roi_cnr.y1 = roiBuffer[roiIndex * roiLength + 1];
-                roi_cnr.x2 = roiBuffer[roiIndex * roiLength + 2];
-                roi_cnr.y2 = roiBuffer[roiIndex * roiLength + 3];
-                toBoxEncodingCenter(&roi_cnr, &roiBefore);
-                {
-                    BoxEncodingCenter roi_ctr;
-                    BoxEncodingCorner roiAfter;
-                    BoxEncodingCorner cliped;
-                    uint32_t idx = bboxDeltas_index + roiIndex * roiLength;
-                    roi_ctr.w = (float)(exp(f32_in_buffer[1][idx + 2]) * roiBefore.w);
-                    roi_ctr.h = (float)(exp(f32_in_buffer[1][idx + 3]) * roiBefore.h);
-                    roi_ctr.x = roiBefore.x + f32_in_buffer[1][idx] * roiBefore.w;
-                    roi_ctr.y = roiBefore.y + f32_in_buffer[1][idx + 1] * roiBefore.h;
-                    toBoxEncodingCorner(&roi_ctr, &roiAfter);
-                    cliped.x1 = vsi_nn_min(vsi_nn_max(roiAfter.x1, 0.0f), imageWidth);
-                    cliped.y1 = vsi_nn_min(vsi_nn_max(roiAfter.y1, 0.0f), imageHeight);
-                    cliped.x2 = vsi_nn_min(vsi_nn_max(roiAfter.x2, 0.0f), imageWidth);
-                    cliped.y2 = vsi_nn_min(vsi_nn_max(roiAfter.y2, 0.0f), imageHeight);
-                    roiTransformedBuffer[idx] = cliped.x1;
-                    roiTransformedBuffer[idx + 1] = cliped.y1;
-                    roiTransformedBuffer[idx + 2] = cliped.x2;
-                    roiTransformedBuffer[idx + 3] = cliped.y2;
-                }
-            }
-
-            // Find the top preNmsTopN scores.
-            _iota((int32_t*)select, batchSize, 0);
-            select_len = batchSize;
-            if(preNmsTopN > 0 && preNmsTopN < (int32_t)batchSize)
-            {
-                sort_element_by_score(&(f32_in_buffer[0][scores_index]),
-                    select, batchSize);
-                select_len = preNmsTopN;
-            }
-
-            // Filter boxes, discard regions with height or width < minSize.
-            filterBoxes(roiTransformedBuffer, &(f32_in_buffer[3][0]),
-                minSize, select, &select_len);
-
-            // Apply hard NMS.
-            if(postNmsTopN < 0)
-            {
-                postNmsTopN = select_len;
-            }
-
-            for(j = 0; (j < select_len && numDetections < postNmsTopN); j++)
-            {
-                // find max score and swap to the front.
-                int32_t max_index = max_element(&(f32_in_buffer[0][scores_index]),
-                    &(select[j]), select_len - j) + j;
-                swap_element(select, max_index, j);
-
-                // Calculate IoU of the rest, swap to the end (discard) if needed.
- for(i = j + 1; i < select_len; i++) - { - int32_t roiBase0 = select[i] * kRoiDim; - int32_t roiBase1 = select[j] * kRoiDim; - float iou = getIoUAxisAligned(&(roiTransformedBuffer[roiBase0]), - &(roiTransformedBuffer[roiBase1])); - - if(iou >= iouThreshold) - { - swap_element(select, i, select_len - 1); - i--; - select_len--; - } - } - numDetections++; - } - - for(i = 0; i < select_len; i++) - { - memcpy(&(f32_out_buffer[1][roi_out_index]), - &(roiTransformedBuffer[select[i] * kRoiDim]), kRoiDim * sizeof(float)); - f32_out_buffer[0][scores_out_index] = - f32_in_buffer[0][scores_index + select[i]]; - int32_out_buffer[2][scores_out_index] = b; - scores_out_index++; - roi_out_index += kRoiDim; - } - - scores_index += batchSize; - bboxDeltas_index += roiBufferSize; - imageInfo_index += imageInfoLength; - } - - vsi_nn_safe_free(roiBuffer); - vsi_nn_safe_free(roiTransformedBuffer); - vsi_nn_safe_free(select); - } - - /* save data */ - for(i = 0; i < TENSOR_NUM_OUTPUT; i++) - { - if(i < 2) - { - status = vsi_nn_vxConvertFloat32DataToTensor( - context, output[i], &out_attr[i], f32_out_buffer[i], - out_elements[i] * sizeof(float)); - TEST_CHECK_STATUS(status, final); - } - else - { - vsi_nn_vxCopyDataToTensor(context, output[i], &out_attr[i], - (uint8_t *)int32_out_buffer[i]); - } - } - -final: - for(i = 0; i < TENSOR_NUM_INPUT; i++) - { - if(f32_in_buffer[i]) free(f32_in_buffer[i]); - } - for(i = 0; i < TENSOR_NUM_OUTPUT; i++) - { - if(f32_out_buffer[i]) free(f32_out_buffer[i]); - if(int32_out_buffer[i]) free(int32_out_buffer[i]); - } - return status; -} /* _VX_KERNEL_FUNC_KERNEL() */ - -static vx_param_description_t vxGenerate_proposalsKernelParam[] = -{ - {VX_INPUT, VX_TYPE_TENSOR, VX_PARAMETER_STATE_REQUIRED}, - {VX_INPUT, VX_TYPE_TENSOR, VX_PARAMETER_STATE_REQUIRED}, - {VX_INPUT, VX_TYPE_TENSOR, VX_PARAMETER_STATE_REQUIRED}, - {VX_INPUT, VX_TYPE_TENSOR, VX_PARAMETER_STATE_REQUIRED}, - {VX_OUTPUT, VX_TYPE_TENSOR, VX_PARAMETER_STATE_REQUIRED}, - {VX_OUTPUT, VX_TYPE_TENSOR, VX_PARAMETER_STATE_REQUIRED}, - {VX_OUTPUT, VX_TYPE_TENSOR, VX_PARAMETER_STATE_REQUIRED}, - {VX_INPUT, VX_TYPE_SCALAR, VX_PARAMETER_STATE_REQUIRED}, - {VX_INPUT, VX_TYPE_SCALAR, VX_PARAMETER_STATE_REQUIRED}, - {VX_INPUT, VX_TYPE_SCALAR, VX_PARAMETER_STATE_REQUIRED}, - {VX_INPUT, VX_TYPE_SCALAR, VX_PARAMETER_STATE_REQUIRED}, - {VX_INPUT, VX_TYPE_SCALAR, VX_PARAMETER_STATE_REQUIRED}, - {VX_INPUT, VX_TYPE_SCALAR, VX_PARAMETER_STATE_REQUIRED}, -}; - -vx_status VX_CALLBACK vxGenerate_proposalsInitializer - ( - vx_node nodObj, - const vx_reference *paramObj, - vx_uint32 paraNum - ) -{ - vx_status status = VX_SUCCESS; - /*TODO: Add initial code for VX program*/ - - return status; -} - - -#ifdef __cplusplus -extern "C" { -#endif -vx_kernel_description_t vxGenerate_proposals_CPU = -{ - _VX_KERNEL_ID, - _VX_KERNEL_NAME, - _VX_KERNEL_FUNC_KERNEL, - vxGenerate_proposalsKernelParam, - _cnt_of_array( vxGenerate_proposalsKernelParam ), - vsi_nn_KernelValidator, - NULL, - NULL, - vsi_nn_KernelInitializer, - vsi_nn_KernelDeinitializer -}; - -vx_kernel_description_t vxGenerate_proposals_VX = -{ - _VX_KERNEL_ID, - _VX_KERNEL_NAME, - NULL, - vxGenerate_proposalsKernelParam, - _cnt_of_array( vxGenerate_proposalsKernelParam ), - vsi_nn_KernelValidator, - NULL, - NULL, - vxGenerate_proposalsInitializer, - vsi_nn_KernelDeinitializer -}; - -vx_kernel_description_t * vx_kernel_GENERATE_PROPOSALS_list[] = -{ - &vxGenerate_proposals_CPU, - &vxGenerate_proposals_VX, - NULL -}; -#ifdef __cplusplus -} -#endif diff --git 
a/src/tim/vx/internal/src/libnnext/ops/kernel/vsi_nn_kernel_heatmap_max_keypoint.c b/src/tim/vx/internal/src/libnnext/ops/kernel/vsi_nn_kernel_heatmap_max_keypoint.c index fa9537a..e464197 100644 --- a/src/tim/vx/internal/src/libnnext/ops/kernel/vsi_nn_kernel_heatmap_max_keypoint.c +++ b/src/tim/vx/internal/src/libnnext/ops/kernel/vsi_nn_kernel_heatmap_max_keypoint.c @@ -37,7 +37,7 @@ #include "utils/vsi_nn_util.h" #include "utils/vsi_nn_dtype_util.h" #include "utils/vsi_nn_math.h" -#include "client/vsi_nn_vxkernel.h" +#include "libnnext/vsi_nn_vxkernel.h" #include "libnnext/vx_lib_nnext.h" #define _VX_KERNEL_VAR (vx_kernel_HEATMAP_MAX_KEYPOINT) diff --git a/src/tim/vx/internal/src/libnnext/ops/kernel/vsi_nn_kernel_imageprocess.c b/src/tim/vx/internal/src/libnnext/ops/kernel/vsi_nn_kernel_imageprocess.c index cc99b85..a63cb15 100644 --- a/src/tim/vx/internal/src/libnnext/ops/kernel/vsi_nn_kernel_imageprocess.c +++ b/src/tim/vx/internal/src/libnnext/ops/kernel/vsi_nn_kernel_imageprocess.c @@ -35,7 +35,7 @@ #include "utils/vsi_nn_util.h" #include "utils/vsi_nn_dtype_util.h" #include "utils/vsi_nn_math.h" -#include "client/vsi_nn_vxkernel.h" +#include "libnnext/vsi_nn_vxkernel.h" #include "libnnext/vx_lib_nnext.h" #define _VX_KERNEL_VAR (vx_kernel_IMAGEPROCESS) diff --git a/src/tim/vx/internal/src/libnnext/ops/kernel/vsi_nn_kernel_signalframe.c b/src/tim/vx/internal/src/libnnext/ops/kernel/vsi_nn_kernel_signalframe.c index a473b6e..0cb39a1 100644 --- a/src/tim/vx/internal/src/libnnext/ops/kernel/vsi_nn_kernel_signalframe.c +++ b/src/tim/vx/internal/src/libnnext/ops/kernel/vsi_nn_kernel_signalframe.c @@ -36,7 +36,7 @@ #include "utils/vsi_nn_util.h" #include "utils/vsi_nn_dtype_util.h" #include "utils/vsi_nn_math.h" -#include "client/vsi_nn_vxkernel.h" +#include "libnnext/vsi_nn_vxkernel.h" #include "libnnext/vx_lib_nnext.h" #define INPUT_FP16 0 diff --git a/src/tim/vx/internal/src/libnnext/ops/kernel/vsi_nn_kernel_spatial_transformer.c b/src/tim/vx/internal/src/libnnext/ops/kernel/vsi_nn_kernel_spatial_transformer.c index e302139..ffa26dd 100644 --- a/src/tim/vx/internal/src/libnnext/ops/kernel/vsi_nn_kernel_spatial_transformer.c +++ b/src/tim/vx/internal/src/libnnext/ops/kernel/vsi_nn_kernel_spatial_transformer.c @@ -35,7 +35,7 @@ #include "utils/vsi_nn_util.h" #include "utils/vsi_nn_dtype_util.h" #include "utils/vsi_nn_math.h" -#include "client/vsi_nn_vxkernel.h" +#include "libnnext/vsi_nn_vxkernel.h" #include "libnnext/vx_lib_nnext.h" #define _VX_KERNEL_VAR (vx_kernel_SPATIAL_TRANSFORMER) diff --git a/src/tim/vx/internal/src/libnnext/ops/kernel/vsi_nn_kernel_sync_host.c b/src/tim/vx/internal/src/libnnext/ops/kernel/vsi_nn_kernel_sync_host.c index 9378674..0b4805d 100644 --- a/src/tim/vx/internal/src/libnnext/ops/kernel/vsi_nn_kernel_sync_host.c +++ b/src/tim/vx/internal/src/libnnext/ops/kernel/vsi_nn_kernel_sync_host.c @@ -29,7 +29,7 @@ #include "vsi_nn_pub.h" #include "utils/vsi_nn_util.h" -#include "client/vsi_nn_vxkernel.h" +#include "libnnext/vsi_nn_vxkernel.h" #include "libnnext/vx_lib_nnext.h" #define _VX_KERNEL_VAR_CPU (vx_client_kernel_cpu_SYNC_HOST) diff --git a/src/tim/vx/internal/src/libnnext/ops/kernel/vsi_nn_kernel_tensorstackconcat.c b/src/tim/vx/internal/src/libnnext/ops/kernel/vsi_nn_kernel_tensorstackconcat.c index 5d5f1ea..9d2c936 100644 --- a/src/tim/vx/internal/src/libnnext/ops/kernel/vsi_nn_kernel_tensorstackconcat.c +++ b/src/tim/vx/internal/src/libnnext/ops/kernel/vsi_nn_kernel_tensorstackconcat.c @@ -36,7 +36,7 @@ #include "utils/vsi_nn_util.h" #include 
"utils/vsi_nn_dtype_util.h" #include "utils/vsi_nn_math.h" -#include "client/vsi_nn_vxkernel.h" +#include "libnnext/vsi_nn_vxkernel.h" #include "libnnext/vx_lib_nnext.h" void tensorStackConcatFunc diff --git a/src/tim/vx/internal/src/libnnext/ops/kernel/vsi_nn_kernel_topk.c b/src/tim/vx/internal/src/libnnext/ops/kernel/vsi_nn_kernel_topk.c deleted file mode 100644 index 2fdf3bd..0000000 --- a/src/tim/vx/internal/src/libnnext/ops/kernel/vsi_nn_kernel_topk.c +++ /dev/null @@ -1,266 +0,0 @@ -/**************************************************************************** -* -* Copyright (c) 2020 Vivante Corporation -* -* Permission is hereby granted, free of charge, to any person obtaining a -* copy of this software and associated documentation files (the "Software"), -* to deal in the Software without restriction, including without limitation -* the rights to use, copy, modify, merge, publish, distribute, sublicense, -* and/or sell copies of the Software, and to permit persons to whom the -* Software is furnished to do so, subject to the following conditions: -* -* The above copyright notice and this permission notice shall be included in -* all copies or substantial portions of the Software. -* -* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR -* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, -* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE -* AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER -* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING -* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER -* DEALINGS IN THE SOFTWARE. -* -*****************************************************************************/ -#include -#include -#include -#include - -#include "vsi_nn_platform.h" - -#include "vsi_nn_prv.h" -#include "vsi_nn_log.h" -#include "vsi_nn_test.h" -#include "vsi_nn_tensor_util.h" -#include "utils/vsi_nn_util.h" -#include "utils/vsi_nn_dtype_util.h" -#include "utils/vsi_nn_math.h" -#include "client/vsi_nn_vxkernel.h" -#include "libnnext/vx_lib_nnext.h" - -#define _VX_KERNEL_VAR (vx_kernel_TOPK) -#define _VX_KERNEL_ID (VX_KERNEL_ENUM_TOPK) -#define _VX_KERNEL_NAME (VX_KERNEL_NAME_TOPK) -#define _VX_KERNEL_FUNC_KERNEL (vxTopkKernel) - -static uint32_t max_comp_func(void* data, int32_t left, int32_t right) -{ - float* fdata = (float*)data; - if (fdata[left] >= fdata[right]) - { - return TRUE; - } - else - { - return FALSE; - } -} - -static void find_top_k_1d -( - float* input, - uint32_t input_len, - uint32_t k, - float* value, - uint32_t* indices -) -{ - int32_t low = 0; - int32_t high = input_len - 1; - int32_t j; - - for (j = 0; j < (int32_t)input_len; j++) - { - indices[j] = j; - } - - j = vsi_nn_partition(input, low, high, max_comp_func, FALSE, indices); - - //part_sort - while (j != (int32_t)k) - { - if ((int32_t)k > j) - { - low = j + 1; - } - else - { - high = j; - } - j = vsi_nn_partition(input, low, high, max_comp_func, FALSE, indices); - } - //all_sort - vsi_nn_partition(input, 0, k - 1, max_comp_func, TRUE, indices); - - for (j = 0; j < (int32_t)k; j++) - { - value[j] = input[indices[j]]; - } -} - -static vsi_status VX_CALLBACK vxTopkKernel - ( - vx_node node, - const vx_reference* paramObj, - uint32_t paramNum - ) -{ -#define ARG_NUM (1) -#define TENSOR_NUM_INPUT (1) -#define TENSOR_NUM_OUTPUT (2) -#define TENSOR_NUM (TENSOR_NUM_INPUT+TENSOR_NUM_OUTPUT) - - vsi_status status = VSI_FAILURE; - vx_context context = NULL; - vx_tensor 
input[TENSOR_NUM_INPUT] = {0}; - vx_tensor output[TENSOR_NUM_OUTPUT] = {0}; - float *f32_in_buffer[TENSOR_NUM_INPUT] = {0}; - float *f32_out_buffer = NULL; - uint32_t *u32_out_buffer = NULL; - vsi_nn_tensor_attr_t in_attr[TENSOR_NUM_INPUT]; - vsi_nn_tensor_attr_t out_attr[TENSOR_NUM_OUTPUT]; - uint32_t in_elements[TENSOR_NUM_INPUT] = {0}; - uint32_t out_elements[TENSOR_NUM_OUTPUT]= {0}; - - int32_t top_k; - - uint32_t i = 0; - for(i = 0; i < TENSOR_NUM_INPUT; i++) - { - memset(&in_attr[i], 0x0, sizeof(vsi_nn_tensor_attr_t)); - } - for(i = 0; i < TENSOR_NUM_OUTPUT; i++) - { - memset(&out_attr[i], 0x0, sizeof(vsi_nn_tensor_attr_t)); - } - /* prepare data */ - context = vxGetContext((vx_reference)node); - - for(i = 0; i < TENSOR_NUM_INPUT; i ++) - { - input[i] = (vx_tensor)paramObj[i]; - status = vsi_nn_vxGetTensorAttr(input[i], &in_attr[i]); - TEST_CHECK_STATUS(status, final); - in_elements[i] = vsi_nn_vxGetTensorElementNum(&in_attr[i]); - f32_in_buffer[i] = (float *)malloc(in_elements[i] * sizeof(float)); - status = vsi_nn_vxConvertTensorToFloat32Data( - context, input[i], &in_attr[i], f32_in_buffer[i], - in_elements[i] * sizeof(float)); - TEST_CHECK_STATUS(status, final); - } - for(i = 0; i < TENSOR_NUM_OUTPUT; i ++) - { - output[i] = (vx_tensor)paramObj[i + TENSOR_NUM_INPUT]; - status = vsi_nn_vxGetTensorAttr(output[i], &out_attr[i]); - TEST_CHECK_STATUS(status, final); - out_elements[i] = vsi_nn_vxGetTensorElementNum(&out_attr[i]); - } - f32_out_buffer = (float *)malloc(out_elements[0] * sizeof(float)); - u32_out_buffer = (uint32_t *)malloc(out_elements[1] * sizeof(uint32_t)); - vxCopyScalar((vx_scalar)paramObj[TENSOR_NUM], &(top_k), - VX_READ_ONLY, VX_MEMORY_TYPE_HOST); - - /* TODO: Add CPU kernel implement */ - { - uint32_t block_num = in_attr[0].size[1]; - uint32_t block_size = in_attr[0].size[0]; - uint32_t * indices = (uint32_t*)malloc(block_size * sizeof(uint32_t)); - - for(i = 0; i < block_num; i++) - { - uint32_t in_index = i * block_size; - uint32_t out_index = i * top_k; - find_top_k_1d(&(f32_in_buffer[0][in_index]), - block_size, top_k, &(f32_out_buffer[out_index]), indices); - memcpy(&(u32_out_buffer[out_index]), - indices, top_k * sizeof(uint32_t)); - } - // Handle the 1D input - if (!block_num) { - find_top_k_1d(&(f32_in_buffer[0][0]), - block_size, top_k, &(f32_out_buffer[0]), indices); - memcpy(&(u32_out_buffer[0]), - indices, top_k * sizeof(uint32_t)); - } - if (indices) free(indices); - } - - /* save data */ - status = vsi_nn_vxConvertFloat32DataToTensor( - context, output[0], &out_attr[0], f32_out_buffer, - out_elements[0] * sizeof(float)); - TEST_CHECK_STATUS(status, final); - vsi_nn_vxCopyDataToTensor(context, output[1], &out_attr[1], (uint8_t *)u32_out_buffer); - -final: - for (i = 0; i < TENSOR_NUM_INPUT; i++) - { - if (f32_in_buffer[i]) free(f32_in_buffer[i]); - } - if (f32_out_buffer) free(f32_out_buffer); - if (u32_out_buffer) free(u32_out_buffer); - return status; -} /* _VX_KERNEL_FUNC_KERNEL() */ - -static vx_param_description_t vxTopkKernelParam[] = -{ - {VX_INPUT, VX_TYPE_TENSOR, VX_PARAMETER_STATE_REQUIRED}, - {VX_OUTPUT, VX_TYPE_TENSOR, VX_PARAMETER_STATE_REQUIRED}, - {VX_OUTPUT, VX_TYPE_TENSOR, VX_PARAMETER_STATE_REQUIRED}, - {VX_INPUT, VX_TYPE_SCALAR, VX_PARAMETER_STATE_REQUIRED}, -}; - -vx_status VX_CALLBACK vxTopkInitializer - ( - vx_node nodObj, - const vx_reference *paramObj, - vx_uint32 paraNum - ) -{ - vx_status status = VX_SUCCESS; - /*TODO: Add initial code for VX program*/ - - return status; -} - - -#ifdef __cplusplus -extern "C" { -#endif 
-vx_kernel_description_t vxTopk_CPU = -{ - _VX_KERNEL_ID, - _VX_KERNEL_NAME, - _VX_KERNEL_FUNC_KERNEL, - vxTopkKernelParam, - _cnt_of_array( vxTopkKernelParam ), - vsi_nn_KernelValidator, - NULL, - NULL, - vsi_nn_KernelInitializer, - vsi_nn_KernelDeinitializer -}; - -vx_kernel_description_t vxTopk_VX = -{ - _VX_KERNEL_ID, - _VX_KERNEL_NAME, - NULL, - vxTopkKernelParam, - _cnt_of_array( vxTopkKernelParam ), - vsi_nn_KernelValidator, - NULL, - NULL, - vxTopkInitializer, - vsi_nn_KernelDeinitializer -}; - -vx_kernel_description_t * vx_kernel_TOPK_list[] = -{ - &vxTopk_CPU, - &vxTopk_VX, - NULL -}; -#ifdef __cplusplus -} -#endif diff --git a/src/tim/vx/internal/src/libnnext/ops/vx/batchnorm_single.vx b/src/tim/vx/internal/src/libnnext/ops/vx/batchnorm_single.vx index 9ac4945..292e86a 100644 --- a/src/tim/vx/internal/src/libnnext/ops/vx/batchnorm_single.vx +++ b/src/tim/vx/internal/src/libnnext/ops/vx/batchnorm_single.vx @@ -9,7 +9,7 @@ _viv_uniform float output_scale; _viv_uniform float output_zp; #define BATCH_NORM_SH_IMPL(name0, name1, src_type, read_type, conv_type, dst_type, save_type) \ -__kernel void batch_norm_##name0##to##name1##_brdcst1( \ +__kernel void batch_norm_##name0##_F16_F16_F16_F32to##name1##_brdcst1( \ __read_only image2d_array_t input, \ __read_only image2d_array_t Mean, \ __read_only image2d_array_t Variance, \ @@ -73,7 +73,7 @@ BATCH_NORM_SH_IMPL(I8, I8, vxc_char16, vxc_char16, int4, vxc_char16, vxc_c BATCH_NORM_SH_IMPL(I8, F16, vxc_char16, vxc_char16, half4, vxc_half8, vxc_ushort8) #define BATCH_NORM_SH_IMPL_2D(name0, name1, src_type, read_type, conv_type, dst_type, save_type) \ -__kernel void batch_norm_##name0##to##name1##_brdcst1_2D( \ +__kernel void batch_norm_##name0##_F16_F16_F16_F32to##name1##_brdcst1_2D( \ __read_only image2d_array_t input, \ __read_only image2d_t Mean, \ __read_only image2d_t Variance, \ @@ -138,7 +138,7 @@ BATCH_NORM_SH_IMPL_2D(I8, F16, vxc_char16, vxc_char16, half4, vxc_half8, vx #define BATCH_NORM_SH_IMPL_AXIS1(name0, name1, src_type, read_type, conv_type, dst_type, save_type) \ -__kernel void batch_norm_##name0##to##name1##_brdcst0( \ +__kernel void batch_norm_##name0##_F16_F16_F16_F32to##name1##_brdcst0( \ __read_only image2d_array_t input, \ __read_only image2d_array_t Mean, \ __read_only image2d_array_t Variance, \ @@ -205,7 +205,7 @@ BATCH_NORM_SH_IMPL_AXIS1(I8, I8, vxc_char16, vxc_char16, int4, vxc_char16, BATCH_NORM_SH_IMPL_AXIS1(I8, F16, vxc_char16, vxc_char16, half4, vxc_half8, vxc_ushort8) #define BATCH_NORM_SH_IMPL_AXIS1_2D(name0, name1, src_type, read_type, conv_type, dst_type, save_type) \ -__kernel void batch_norm_##name0##to##name1##_brdcst0_2D( \ +__kernel void batch_norm_##name0##_F16_F16_F16_F32to##name1##_brdcst0_2D( \ __read_only image2d_array_t input, \ __read_only image2d_t Mean, \ __read_only image2d_t Variance, \ diff --git a/src/tim/vx/internal/src/libnnext/ops/vx/batchnorm_single_f32.vx b/src/tim/vx/internal/src/libnnext/ops/vx/batchnorm_single_f32.vx new file mode 100644 index 0000000..e419457 --- /dev/null +++ b/src/tim/vx/internal/src/libnnext/ops/vx/batchnorm_single_f32.vx @@ -0,0 +1,267 @@ +#include "cl_viv_vx_ext.h" + +_viv_uniform VXC_512Bits uniDatatoF32_0_4x4; +_viv_uniform VXC_512Bits uniDatatoF32_1_4x4; +_viv_uniform VXC_512Bits uniExtract8Data_2x8; +_viv_uniform float input_scale; +_viv_uniform float input_tail; +_viv_uniform float output_scale; +_viv_uniform float output_zp; + +#define BATCH_NORM_SH_IMPL(name0, name1, src_type, read_type, conv_type, dst_type, save_type) \ +__kernel void 
batch_norm_##name0##_F16_F16_F32_F32to##name1##_brdcst1( \ + __read_only image2d_array_t input, \ + __read_only image2d_array_t Mean, \ + __read_only image2d_array_t Variance, \ + __read_only image2d_array_t Gamma, \ + __read_only image2d_array_t Beta, \ + __write_only image2d_array_t output, \ + float eps \ + ) \ +{ \ + int4 coord = (int4)(get_global_id(0), get_global_id(1), get_global_id(2), 0); \ + read_type vec; \ + src_type src; \ + VXC_ReadImage2DArray(vec, input, coord, 0, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); \ + _viv_asm(COPY, src, vec, 16); \ + vxc_ushort8 _mean, _var; \ + vxc_half8 mean, var; \ + VXC_ReadImage2DArray(_mean, Mean, coord, 0, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); \ + _viv_asm(COPY, mean, _mean, 16); \ + VXC_ReadImage2DArray(_var, Variance, coord, 0, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); \ + _viv_asm(COPY, var, _var, 16); \ + float4 gamma0 = read_imagef(Gamma, coord); \ + coord.x += 4; \ + float4 gamma1 = read_imagef(Gamma, coord); \ + coord.x -= 4; \ + float4 beta = read_imagef(Beta, coord); \ + \ + float4 src0, src1, m, v; \ + VXC_DP4x4(src0, src, src, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 1), uniDatatoF32_0_4x4); \ + VXC_DP4x4(m, mean, mean, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 1), uniDatatoF32_0_4x4); \ + VXC_DP4x4(v, var, var, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 1), uniDatatoF32_0_4x4); \ + gamma0 = gamma0 * rsqrt(v + eps); \ + src0 = src0 * input_scale + input_tail; \ + src0 = (src0 - m) * gamma0 + beta.xxxx; \ + src0 = src0 * output_scale + output_zp; \ + VXC_DP4x4(src1, src, src, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 1), uniDatatoF32_1_4x4); \ + VXC_DP4x4(m, mean, mean, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 1), uniDatatoF32_1_4x4); \ + VXC_DP4x4(v, var, var, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 1), uniDatatoF32_1_4x4); \ + gamma1 = gamma1 * rsqrt(v + eps); \ + src1 = src1 * input_scale + input_tail; \ + src1 = (src1 - m) * gamma1 + beta.xxxx; \ + src1 = src1 * output_scale + output_zp; \ + \ + conv_type dst0, dst1; \ + _viv_asm(CONV_RTE, dst0, src0); \ + _viv_asm(CONV_RTE, dst1, src1); \ + dst_type tmp; \ + save_type dst; \ + VXC_DP2x8(tmp, dst0, dst1, VXC_MODIFIER(0, 7, 0, VXC_RM_ToNearestEven, 1), uniExtract8Data_2x8); \ + _viv_asm(COPY, dst, tmp, 16); \ + VXC_WriteImage2DArray(output, coord, dst, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); \ +} +BATCH_NORM_SH_IMPL(F16, F16, vxc_half8, vxc_ushort8, half4, vxc_half8, vxc_ushort8) +BATCH_NORM_SH_IMPL(F16, I16, vxc_half8, vxc_ushort8, int4, vxc_short8, vxc_short8) +BATCH_NORM_SH_IMPL(F16, U8, vxc_half8, vxc_ushort8, int4, vxc_uchar16, vxc_uchar16) +BATCH_NORM_SH_IMPL(F16, I8, vxc_half8, vxc_ushort8, int4, vxc_char16, vxc_char16) +BATCH_NORM_SH_IMPL(I16, I16, vxc_short8, vxc_short8, int4, vxc_short8, vxc_short8) +BATCH_NORM_SH_IMPL(I16, F16, vxc_short8, vxc_short8, half4, vxc_half8, vxc_ushort8) +BATCH_NORM_SH_IMPL(U8, U8, vxc_uchar16, vxc_uchar16, int4, vxc_uchar16, vxc_uchar16) +BATCH_NORM_SH_IMPL(U8, F16, vxc_uchar16, vxc_uchar16, half4, vxc_half8, vxc_ushort8) +BATCH_NORM_SH_IMPL(I8, I8, vxc_char16, vxc_char16, int4, vxc_char16, vxc_char16) +BATCH_NORM_SH_IMPL(I8, F16, vxc_char16, vxc_char16, half4, vxc_half8, vxc_ushort8) + +#define BATCH_NORM_SH_IMPL_2D(name0, name1, src_type, read_type, conv_type, dst_type, save_type) \ +__kernel void batch_norm_##name0##_F16_F16_F32_F32to##name1##_brdcst1_2D( \ + __read_only image2d_array_t input, \ + __read_only image2d_t Mean, \ + __read_only image2d_t Variance, \ + __read_only image2d_t Gamma, \ + __read_only image2d_t 
Beta, \ + __write_only image2d_array_t output, \ + float eps \ + ) \ +{ \ + int4 coord = (int4)(get_global_id(0), get_global_id(1), get_global_id(2), 0); \ + read_type vec; \ + src_type src; \ + VXC_ReadImage(vec, input, coord.xy, 0, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); \ + _viv_asm(COPY, src, vec, 16); \ + coord.z = coord.x + 4; \ + vxc_ushort8 _mean, _var; \ + vxc_half8 mean, var; \ + VXC_ReadImage(_mean, Mean, coord.xy, 0, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); \ + _viv_asm(COPY, mean, _mean, 16); \ + VXC_ReadImage(_var, Variance, coord.xy, 0, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); \ + _viv_asm(COPY, var, _var, 16); \ + float4 gamma0 = read_imagef(Gamma, coord.xy); \ + float4 gamma1 = read_imagef(Gamma, coord.zy); \ + float4 beta = read_imagef(Beta, coord.xy); \ + \ + float4 src0, src1, m, v; \ + VXC_DP4x4(src0, src, src, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 1), uniDatatoF32_0_4x4); \ + VXC_DP4x4(m, mean, mean, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 1), uniDatatoF32_0_4x4); \ + VXC_DP4x4(v, var, var, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 1), uniDatatoF32_0_4x4); \ + gamma0 = gamma0 * rsqrt(v + eps); \ + src0 = src0 * input_scale + input_tail; \ + src0 = (src0 - m) * gamma0 + beta.xxxx; \ + src0 = src0 * output_scale + output_zp; \ + VXC_DP4x4(src1, src, src, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 1), uniDatatoF32_1_4x4); \ + VXC_DP4x4(m, mean, mean, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 1), uniDatatoF32_1_4x4); \ + VXC_DP4x4(v, var, var, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 1), uniDatatoF32_1_4x4); \ + gamma1 = gamma1 * rsqrt(v + eps); \ + src1 = src1 * input_scale + input_tail; \ + src1 = (src1 - m) * gamma1 + beta.xxxx; \ + src1 = src1 * output_scale + output_zp; \ + \ + conv_type dst0, dst1; \ + _viv_asm(CONV_RTE, dst0, src0); \ + _viv_asm(CONV_RTE, dst1, src1); \ + dst_type tmp; \ + save_type dst; \ + VXC_DP2x8(tmp, dst0, dst1, VXC_MODIFIER(0, 7, 0, VXC_RM_ToNearestEven, 1), uniExtract8Data_2x8); \ + _viv_asm(COPY, dst, tmp, 16); \ + VXC_WriteImage(output, coord.xy, dst, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); \ +} +BATCH_NORM_SH_IMPL_2D(F16, F16, vxc_half8, vxc_ushort8, half4, vxc_half8, vxc_ushort8) +BATCH_NORM_SH_IMPL_2D(F16, I16, vxc_half8, vxc_ushort8, int4, vxc_short8, vxc_short8) +BATCH_NORM_SH_IMPL_2D(F16, U8, vxc_half8, vxc_ushort8, int4, vxc_uchar16, vxc_uchar16) +BATCH_NORM_SH_IMPL_2D(F16, I8, vxc_half8, vxc_ushort8, int4, vxc_char16, vxc_char16) +BATCH_NORM_SH_IMPL_2D(I16, I16, vxc_short8, vxc_short8, int4, vxc_short8, vxc_short8) +BATCH_NORM_SH_IMPL_2D(I16, F16, vxc_short8, vxc_short8, half4, vxc_half8, vxc_ushort8) +BATCH_NORM_SH_IMPL_2D(U8, U8, vxc_uchar16, vxc_uchar16, int4, vxc_uchar16, vxc_uchar16) +BATCH_NORM_SH_IMPL_2D(U8, F16, vxc_uchar16, vxc_uchar16, half4, vxc_half8, vxc_ushort8) +BATCH_NORM_SH_IMPL_2D(I8, I8, vxc_char16, vxc_char16, int4, vxc_char16, vxc_char16) +BATCH_NORM_SH_IMPL_2D(I8, F16, vxc_char16, vxc_char16, half4, vxc_half8, vxc_ushort8) + + +#define BATCH_NORM_SH_IMPL_AXIS1(name0, name1, src_type, read_type, conv_type, dst_type, save_type) \ +__kernel void batch_norm_##name0##_F16_F16_F32_F32to##name1##_brdcst0( \ + __read_only image2d_array_t input, \ + __read_only image2d_array_t Mean, \ + __read_only image2d_array_t Variance, \ + __read_only image2d_array_t Gamma, \ + __read_only image2d_array_t Beta, \ + __write_only image2d_array_t output, \ + float eps \ + ) \ +{ \ + int4 coord = (int4)(get_global_id(0), get_global_id(1), get_global_id(2), 0); \ + read_type vec; \ + src_type src; \ + 
VXC_ReadImage2DArray(vec, input, coord, 0, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); \ + _viv_asm(COPY, src, vec, 16); \ + vxc_ushort8 _mean, _var; \ + vxc_half8 mean, var; \ + VXC_ReadImage2DArray(_mean, Mean, coord, 0, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); \ + _viv_asm(COPY, mean, _mean, 16); \ + VXC_ReadImage2DArray(_var, Variance, coord, 0, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); \ + _viv_asm(COPY, var, _var, 16); \ + float4 gamma0 = read_imagef(Gamma, coord); \ + float4 beta0 = read_imagef(Beta, coord); \ + coord.x += 4; \ + float4 gamma1 = read_imagef(Gamma, coord); \ + float4 beta1 = read_imagef(Beta, coord); \ + coord.x -= 4; \ + \ + float4 src0, src1, m, v; \ + VXC_DP4x4(src0, src, src, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 1), uniDatatoF32_0_4x4); \ + VXC_DP4x4(m, mean, mean, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 1), uniDatatoF32_0_4x4); \ + VXC_DP4x4(v, var, var, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 1), uniDatatoF32_0_4x4); \ + gamma0 = gamma0 * rsqrt(v + eps); \ + src0 = src0 * input_scale + input_tail; \ + src0 = (src0 - m) * gamma0 + beta0; \ + src0 = src0 * output_scale + output_zp; \ + VXC_DP4x4(src1, src, src, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 1), uniDatatoF32_1_4x4); \ + VXC_DP4x4(m, mean, mean, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 1), uniDatatoF32_1_4x4); \ + VXC_DP4x4(v, var, var, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 1), uniDatatoF32_1_4x4); \ + gamma1 = gamma1 * rsqrt(v + eps); \ + src1 = src1 * input_scale + input_tail; \ + src1 = (src1 - m) * gamma1 + beta1; \ + src1 = src1 * output_scale + output_zp; \ + \ + conv_type dst0, dst1; \ + _viv_asm(CONV_RTE, dst0, src0); \ + _viv_asm(CONV_RTE, dst1, src1); \ + dst_type tmp; \ + save_type dst; \ + VXC_DP2x8(tmp, dst0, dst1, VXC_MODIFIER(0, 7, 0, VXC_RM_ToNearestEven, 1), uniExtract8Data_2x8); \ + _viv_asm(COPY, dst, tmp, 16); \ + VXC_WriteImage2DArray(output, coord, dst, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); \ +} +BATCH_NORM_SH_IMPL_AXIS1(F16, F16, vxc_half8, vxc_ushort8, half4, vxc_half8, vxc_ushort8) +BATCH_NORM_SH_IMPL_AXIS1(F16, I16, vxc_half8, vxc_ushort8, int4, vxc_short8, vxc_short8) +BATCH_NORM_SH_IMPL_AXIS1(F16, U8, vxc_half8, vxc_ushort8, int4, vxc_uchar16, vxc_uchar16) +BATCH_NORM_SH_IMPL_AXIS1(F16, I8, vxc_half8, vxc_ushort8, int4, vxc_char16, vxc_char16) +BATCH_NORM_SH_IMPL_AXIS1(I16, I16, vxc_short8, vxc_short8, int4, vxc_short8, vxc_short8) +BATCH_NORM_SH_IMPL_AXIS1(I16, F16, vxc_short8, vxc_short8, half4, vxc_half8, vxc_ushort8) +BATCH_NORM_SH_IMPL_AXIS1(U8, U8, vxc_uchar16, vxc_uchar16, int4, vxc_uchar16, vxc_uchar16) +BATCH_NORM_SH_IMPL_AXIS1(U8, F16, vxc_uchar16, vxc_uchar16, half4, vxc_half8, vxc_ushort8) +BATCH_NORM_SH_IMPL_AXIS1(I8, I8, vxc_char16, vxc_char16, int4, vxc_char16, vxc_char16) +BATCH_NORM_SH_IMPL_AXIS1(I8, F16, vxc_char16, vxc_char16, half4, vxc_half8, vxc_ushort8) + +#define BATCH_NORM_SH_IMPL_AXIS1_2D(name0, name1, src_type, read_type, conv_type, dst_type, save_type) \ +__kernel void batch_norm_##name0##_F16_F16_F32_F32to##name1##_brdcst0_2D( \ + __read_only image2d_array_t input, \ + __read_only image2d_t Mean, \ + __read_only image2d_t Variance, \ + __read_only image2d_t Gamma, \ + __read_only image2d_t Beta, \ + __write_only image2d_array_t output, \ + float eps \ + ) \ +{ \ + int4 coord = (int4)(get_global_id(0), get_global_id(1), get_global_id(0), 0); \ + read_type vec; \ + src_type src; \ + VXC_ReadImage(vec, input, coord.xy, 0, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); \ + _viv_asm(COPY, src, vec, 16); \ + coord.z += 4; \ + 
vxc_ushort8 _mean, _var; \ + vxc_half8 mean, var; \ + VXC_ReadImage(_mean, Mean, coord.xy, 0, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); \ + _viv_asm(COPY, mean, _mean, 16); \ + VXC_ReadImage(_var, Variance, coord.xy, 0, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); \ + _viv_asm(COPY, var, _var, 16); \ + float4 gamma0 = read_imagef(Gamma, coord.xy); \ + float4 gamma1 = read_imagef(Gamma, coord.zy); \ + float4 beta0 = read_imagef(Beta, coord.xy); \ + float4 beta1 = read_imagef(Beta, coord.zy); \ + \ + float4 src0, src1, m, v; \ + VXC_DP4x4(src0, src, src, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 1), uniDatatoF32_0_4x4); \ + VXC_DP4x4(m, mean, mean, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 1), uniDatatoF32_0_4x4); \ + VXC_DP4x4(v, var, var, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 1), uniDatatoF32_0_4x4); \ + gamma0 = gamma0 * rsqrt(v + eps); \ + src0 = src0 * input_scale + input_tail; \ + src0 = (src0 - m) * gamma0 + beta0; \ + src0 = src0 * output_scale + output_zp; \ + VXC_DP4x4(src1, src, src, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 1), uniDatatoF32_1_4x4); \ + VXC_DP4x4(m, mean, mean, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 1), uniDatatoF32_1_4x4); \ + VXC_DP4x4(v, var, var, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 1), uniDatatoF32_1_4x4); \ + gamma1 = gamma1 * rsqrt(v + eps); \ + src1 = src1 * input_scale + input_tail; \ + src1 = (src1 - m) * gamma1 + beta1; \ + src1 = src1 * output_scale + output_zp; \ + \ + conv_type dst0, dst1; \ + _viv_asm(CONV_RTE, dst0, src0); \ + _viv_asm(CONV_RTE, dst1, src1); \ + dst_type tmp; \ + save_type dst; \ + VXC_DP2x8(tmp, dst0, dst1, VXC_MODIFIER(0, 7, 0, VXC_RM_ToNearestEven, 1), uniExtract8Data_2x8); \ + _viv_asm(COPY, dst, tmp, 16); \ + VXC_WriteImage(output, coord.xy, dst, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); \ +} +BATCH_NORM_SH_IMPL_AXIS1_2D(F16, F16, vxc_half8, vxc_ushort8, half4, vxc_half8, vxc_ushort8) +BATCH_NORM_SH_IMPL_AXIS1_2D(F16, I16, vxc_half8, vxc_ushort8, int4, vxc_short8, vxc_short8) +BATCH_NORM_SH_IMPL_AXIS1_2D(F16, U8, vxc_half8, vxc_ushort8, int4, vxc_uchar16, vxc_uchar16) +BATCH_NORM_SH_IMPL_AXIS1_2D(F16, I8, vxc_half8, vxc_ushort8, int4, vxc_char16, vxc_char16) +BATCH_NORM_SH_IMPL_AXIS1_2D(I16, I16, vxc_short8, vxc_short8, int4, vxc_short8, vxc_short8) +BATCH_NORM_SH_IMPL_AXIS1_2D(I16, F16, vxc_short8, vxc_short8, half4, vxc_half8, vxc_ushort8) +BATCH_NORM_SH_IMPL_AXIS1_2D(U8, U8, vxc_uchar16, vxc_uchar16, int4, vxc_uchar16, vxc_uchar16) +BATCH_NORM_SH_IMPL_AXIS1_2D(U8, F16, vxc_uchar16, vxc_uchar16, half4, vxc_half8, vxc_ushort8) +BATCH_NORM_SH_IMPL_AXIS1_2D(I8, I8, vxc_char16, vxc_char16, int4, vxc_char16, vxc_char16) +BATCH_NORM_SH_IMPL_AXIS1_2D(I8, F16, vxc_char16, vxc_char16, half4, vxc_half8, vxc_ushort8) + diff --git a/src/tim/vx/internal/src/libnnext/ops/vx/conv1d_ovxlib.vx b/src/tim/vx/internal/src/libnnext/ops/vx/conv1d_ovxlib.vx new file mode 100644 index 0000000..6c67421 --- /dev/null +++ b/src/tim/vx/internal/src/libnnext/ops/vx/conv1d_ovxlib.vx @@ -0,0 +1,151 @@ +#include "cl_viv_vx_ext.h" + +_viv_uniform VXC_512Bits uniConv1DK3_Lo0_4x4; +_viv_uniform VXC_512Bits uniConv1DK3_Lo1_4x4; +_viv_uniform VXC_512Bits uniConv1DK3_Lo2_4x4; +_viv_uniform VXC_512Bits uniConv1DK3_Hi0_4x4; +_viv_uniform VXC_512Bits uniConv1DK3_Hi1_4x4; +_viv_uniform VXC_512Bits uniConv1DK3_Hi2_4x4; +_viv_uniform VXC_512Bits uniDataConvK3_2x8; +_viv_uniform VXC_512Bits uniSumOrderUchar_2x8; + +_viv_uniform int input_ZP; +_viv_uniform int weight_ZP; +_viv_uniform float output_ZP; +_viv_uniform float scaleOut; +_viv_uniform int 
input_height; + +__kernel void conv1d_U8U8I32toU8_K3_S1( + __read_only image2d_array_t input, + __read_only image2d_array_t weight, + __read_only image2d_t bias, + __write_only image2d_array_t output, + int stride, + int pad_front, + int pad_end, + int dilation, + int overflow_policy) +{ + int4 coord = (int4)(get_global_id(0), get_global_id(1), 0, 0); + int4 coord_w = (int4)(0, 0, get_global_id(1), 0); + float4 sum0, sum1, dst; + vxc_short8 weight_val_s =(short)input_ZP; + vxc_uchar16 input_val = 0, weight_val = 0; + int temp = 0, i; + + temp = read_imagei(bias, coord.yz).x; + sum0 = convert_float(temp); + sum1 = sum0; + weight_val_s.s5 = (short)weight_ZP; + + for (i = 0; i < input_height; i++) + { + VXC_ReadImage2DArray(weight_val, weight, coord_w, VXC_5BITOFFSET_XY(0, 0), \ + VXC_MODIFIER(0, 2, 0, VXC_RM_TowardZero, 0)); + VXC_ReadImage(input_val, input, coord.xz, VXC_5BITOFFSET_XY(0, 0), \ + VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); + VXC_DP2x8(weight_val_s, weight_val, weight_val_s, \ + VXC_MODIFIER(0, 5, 0, VXC_RM_TowardZero, 0), uniDataConvK3_2x8); + + VXC_DP4x4(dst, input_val, weight_val_s, \ + VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniConv1DK3_Lo0_4x4); + sum0 += dst; + VXC_DP4x4(dst, input_val, weight_val_s, \ + VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniConv1DK3_Hi0_4x4); + sum1 += dst; + coord.x += dilation; + VXC_ReadImage(input_val, input, coord.xz, VXC_5BITOFFSET_XY(0, 0), \ + VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); + VXC_DP4x4(dst, input_val, weight_val_s, \ + VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniConv1DK3_Lo1_4x4); + sum0 += dst; + VXC_DP4x4(dst, input_val, weight_val_s, \ + VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniConv1DK3_Hi1_4x4); + sum1 += dst; + coord.x += dilation; + VXC_ReadImage(input_val, input, coord.xz, VXC_5BITOFFSET_XY(0, 0), \ + VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); + VXC_DP4x4(dst, input_val, weight_val_s, \ + VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniConv1DK3_Lo2_4x4); + sum0 += dst; + VXC_DP4x4(dst, input_val, weight_val_s, \ + VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniConv1DK3_Hi2_4x4); + sum1 += dst; + coord_w.y++; + coord.z++; + coord.x = get_global_id(0); + } + + sum0 = sum0 * scaleOut + output_ZP; + sum1 = sum1 * scaleOut + output_ZP; + uchar4 result0, result1; + _viv_asm(CONV_SAT_RTE, result0, sum0); + _viv_asm(CONV_SAT_RTE, result1, sum1); + vxc_uchar8 result; + VXC_DP2x8(result, result0, result1, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0), uniSumOrderUchar_2x8); + VXC_WriteImage(output, coord.xy, result, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); +} + +__kernel void conv1d_U8U8I32toU8_K3_S1_D2_D4( + __read_only image2d_array_t input, + __read_only image2d_array_t weight, + __read_only image2d_t bias, + __write_only image2d_array_t output, + int stride, + int pad_front, + int pad_end, + int dilation, + int overflow_policy) +{ + int4 coord = (int4)(get_global_id(0), get_global_id(1), 0, 0); + int4 coord_w = (int4)(0, 0, get_global_id(1), 0); + float4 sum0, sum1, dst; + vxc_short8 weight_val_s =(short)input_ZP; + vxc_uchar16 input_val = 0, weight_val = 0; + int temp = 0, i; + + temp = read_imagei(bias, coord.yz).x; + sum0 = convert_float(temp); + sum1 = sum0; + weight_val_s.s5 = (short)weight_ZP; + + for (i = 0; i < input_height; i++) + { + VXC_ReadImage2DArray(weight_val, weight, coord_w, VXC_5BITOFFSET_XY(0, 0), \ + VXC_MODIFIER(0, 2, 0, VXC_RM_TowardZero, 0)); + VXC_ReadImage(input_val, input, coord.xz, VXC_5BITOFFSET_XY(0, 0), \ + VXC_MODIFIER(0, 15, 0, VXC_RM_TowardZero, 0)); + 
VXC_DP2x8(weight_val_s, weight_val, weight_val_s, \ + VXC_MODIFIER(0, 5, 0, VXC_RM_TowardZero, 0), uniDataConvK3_2x8); + + VXC_DP4x4(dst, input_val, weight_val_s, \ + VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniConv1DK3_Lo0_4x4); + sum0 += dst; + VXC_DP4x4(dst, input_val, weight_val_s, \ + VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniConv1DK3_Hi0_4x4); + sum1 += dst; + VXC_DP4x4(dst, input_val, weight_val_s, \ + VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniConv1DK3_Lo1_4x4); + sum0 += dst; + VXC_DP4x4(dst, input_val, weight_val_s, \ + VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniConv1DK3_Hi1_4x4); + sum1 += dst; + VXC_DP4x4(dst, input_val, weight_val_s, \ + VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniConv1DK3_Lo2_4x4); + sum0 += dst; + VXC_DP4x4(dst, input_val, weight_val_s, \ + VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniConv1DK3_Hi2_4x4); + sum1 += dst; + coord_w.y++; + coord.z++; + } + + sum0 = sum0 * scaleOut + output_ZP; + sum1 = sum1 * scaleOut + output_ZP; + uchar4 result0, result1; + _viv_asm(CONV_SAT_RTE, result0, sum0); + _viv_asm(CONV_SAT_RTE, result1, sum1); + vxc_uchar8 result; + VXC_DP2x8(result, result0, result1, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0), uniSumOrderUchar_2x8); + VXC_WriteImage(output, coord.xy, result, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); +} diff --git a/src/tim/vx/internal/src/libnnext/ops/vx/conv1d_ovxlib_k1024.vx b/src/tim/vx/internal/src/libnnext/ops/vx/conv1d_ovxlib_k1024.vx new file mode 100644 index 0000000..f6ac4ce --- /dev/null +++ b/src/tim/vx/internal/src/libnnext/ops/vx/conv1d_ovxlib_k1024.vx @@ -0,0 +1,167 @@ +#include "cl_viv_vx_ext.h" + +_viv_uniform VXC_512Bits uniU8SubZp_lo_2x8; +_viv_uniform VXC_512Bits uniU8SubZp_hi_2x8; +_viv_uniform VXC_512Bits uniU8Conv1d_part0_8x2; +_viv_uniform VXC_512Bits uniU8Conv1d_part1_8x2; +_viv_uniform VXC_512Bits uniU8Conv1d_part2_8x2; +_viv_uniform VXC_512Bits uniU8Conv1d_part3_8x2; +_viv_uniform VXC_512Bits uniSumOrderUchar_2x8; + +_viv_uniform int kernel_cnt_x16; +_viv_uniform int weight_ZP; +_viv_uniform float output_ZP; +_viv_uniform float scaleOut; +_viv_uniform int input_height; +_viv_uniform int input_width; +_viv_uniform int output_width; + +__kernel void conv1d_U8U8I32toU8_K1024_SMALL( + __read_only image2d_array_t input, + __read_only image2d_array_t weight, + __read_only image2d_t bias, + __write_only image2d_array_t output, + int stride, + int pad_front, + int pad_end, + int dilation, + int overflow_policy) +{ + int start_x = get_global_id(0) - pad_front; + int4 coord = (int4)(start_x, get_global_id(1), 0, get_global_id(0)); + int4 coord_w = (int4)(0, 0, get_global_id(1), 0); + float4 sum0, sum1, dst; + vxc_short8 coef; + vxc_short8 w_zp = (short)weight_ZP; + vxc_uchar16 input_val = 0, weight_val = 0; + int temp = 0, i, j; + + temp = read_imagei(bias, coord.yz).x; + sum0 = convert_float(temp); + sum1 = sum0; + + for (i = 0; i < input_height; i++) + { + for (j = 0; j < kernel_cnt_x16; j++) + { + VXC_ReadImage2DArray(weight_val, weight, coord_w, VXC_5BITOFFSET_XY(0, 0), \ + VXC_MODIFIER(0, 15, 0, VXC_RM_TowardZero, 0)); + VXC_ReadImage(input_val, input, coord.xz, VXC_5BITOFFSET_XY(0, 0), \ + VXC_MODIFIER(0, 15, 0, VXC_RM_TowardZero, 0)); + VXC_DP2x8(coef, weight_val, w_zp, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0), uniU8SubZp_lo_2x8); + VXC_DP8x2(dst, input_val, coef, VXC_MODIFIER(0, 1, 0, VXC_RM_TowardZero, 0), uniU8Conv1d_part0_8x2); + VXC_DP8x2(dst, input_val, coef, VXC_MODIFIER(2, 3, 0, VXC_RM_TowardZero, 0), uniU8Conv1d_part1_8x2); + sum0 += dst; + 
VXC_DP8x2(dst, input_val, coef, VXC_MODIFIER(0, 1, 0, VXC_RM_TowardZero, 0), uniU8Conv1d_part2_8x2); + VXC_DP8x2(dst, input_val, coef, VXC_MODIFIER(2, 3, 0, VXC_RM_TowardZero, 0), uniU8Conv1d_part3_8x2); + sum1 += dst; + VXC_ReadImage(input_val, input, coord.xz, VXC_5BITOFFSET_XY(8, 0), \ + VXC_MODIFIER(0, 15, 0, VXC_RM_TowardZero, 0)); + VXC_DP2x8(coef, weight_val, w_zp, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0), uniU8SubZp_hi_2x8); + VXC_DP8x2(dst, input_val, coef, VXC_MODIFIER(0, 1, 0, VXC_RM_TowardZero, 0), uniU8Conv1d_part0_8x2); + VXC_DP8x2(dst, input_val, coef, VXC_MODIFIER(2, 3, 0, VXC_RM_TowardZero, 0), uniU8Conv1d_part1_8x2); + sum0 += dst; + VXC_DP8x2(dst, input_val, coef, VXC_MODIFIER(0, 1, 0, VXC_RM_TowardZero, 0), uniU8Conv1d_part2_8x2); + VXC_DP8x2(dst, input_val, coef, VXC_MODIFIER(2, 3, 0, VXC_RM_TowardZero, 0), uniU8Conv1d_part3_8x2); + sum1 += dst; + coord_w.x += 16; + coord.x += 16; + } + coord_w.x = 0; + coord_w.y++; + coord.z++; + coord.x = start_x; + } + + sum0 = sum0 * scaleOut + output_ZP; + sum1 = sum1 * scaleOut + output_ZP; + uchar4 result0, result1; + _viv_asm(CONV_SAT_RTE, result0, sum0); + _viv_asm(CONV_SAT_RTE, result1, sum1); + vxc_uchar8 result; + VXC_DP2x8(result, result0, result1, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0), uniSumOrderUchar_2x8); + VXC_WriteImage(output, coord.wy, result, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); +} + +inline uchar* get_image2D_array_ptr(image2d_array_t input) +{ + int8 desc; + _viv_asm(COPY, desc, input, sizeof(desc)); + uchar *src_ptr = (uchar*)desc.s0; + return src_ptr; +} + +__kernel void conv1d_U8U8I32toU8_K1024_LARGE( + __read_only image2d_array_t input, + __read_only image2d_array_t weight, + __read_only image2d_t bias, + __write_only image2d_array_t output, + int stride, + int pad_front, + int pad_end, + int dilation, + int overflow_policy) +{ + int start_x = get_global_id(0); + int w_left = output_width - start_x; + int out_x = w_left < 8 ? 
get_global_id(0) - w_left : get_global_id(0); + int4 coord = (int4)(start_x, get_global_id(1), 0, out_x); + int4 coord_w = (int4)(0, 0, get_global_id(1), 0); + float4 sum0, sum1, dst; + vxc_short8 coef; + vxc_short8 w_zp = (short)weight_ZP; + vxc_uchar16 input_val = 0, weight_val = 0; + int temp = 0, i, j; + uchar *src_ptr_base = (uchar *)get_image2D_array_ptr(input); + uchar *src_ptr; + uchar *dst_ptr = (uchar *)get_image2D_array_ptr(output); + + temp = read_imagei(bias, coord.yz).x; + sum0 = convert_float(temp); + sum1 = sum0; + + for (i = 0; i < input_height; i++) + { + src_ptr = src_ptr_base + (coord.x + coord.z * input_width); + for (j = 0; j < kernel_cnt_x16; j++) + { + VXC_ReadImage2DArray(weight_val, weight, coord_w, VXC_5BITOFFSET_XY(0, 0), \ + VXC_MODIFIER(0, 15, 0, VXC_RM_TowardZero, 0)); + VXC_Vload16(input_val, src_ptr, 0); + VXC_DP2x8(coef, weight_val, w_zp, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0), uniU8SubZp_lo_2x8); + VXC_DP8x2(dst, input_val, coef, VXC_MODIFIER(0, 1, 0, VXC_RM_TowardZero, 0), uniU8Conv1d_part0_8x2); + VXC_DP8x2(dst, input_val, coef, VXC_MODIFIER(2, 3, 0, VXC_RM_TowardZero, 0), uniU8Conv1d_part1_8x2); + sum0 += dst; + VXC_DP8x2(dst, input_val, coef, VXC_MODIFIER(0, 1, 0, VXC_RM_TowardZero, 0), uniU8Conv1d_part2_8x2); + VXC_DP8x2(dst, input_val, coef, VXC_MODIFIER(2, 3, 0, VXC_RM_TowardZero, 0), uniU8Conv1d_part3_8x2); + sum1 += dst; + src_ptr += 8; + VXC_Vload16(input_val, src_ptr, 0); + VXC_DP2x8(coef, weight_val, w_zp, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0), uniU8SubZp_hi_2x8); + VXC_DP8x2(dst, input_val, coef, VXC_MODIFIER(0, 1, 0, VXC_RM_TowardZero, 0), uniU8Conv1d_part0_8x2); + VXC_DP8x2(dst, input_val, coef, VXC_MODIFIER(2, 3, 0, VXC_RM_TowardZero, 0), uniU8Conv1d_part1_8x2); + sum0 += dst; + VXC_DP8x2(dst, input_val, coef, VXC_MODIFIER(0, 1, 0, VXC_RM_TowardZero, 0), uniU8Conv1d_part2_8x2); + VXC_DP8x2(dst, input_val, coef, VXC_MODIFIER(2, 3, 0, VXC_RM_TowardZero, 0), uniU8Conv1d_part3_8x2); + sum1 += dst; + coord_w.x += 16; + coord.x += 16; + src_ptr += 8; + } + coord_w.x = 0; + coord_w.y++; + coord.z++; + coord.x = start_x; + } + + sum0 = sum0 * scaleOut + output_ZP; + sum1 = sum1 * scaleOut + output_ZP; + uchar4 result0, result1; + _viv_asm(CONV_SAT_RTE, result0, sum0); + _viv_asm(CONV_SAT_RTE, result1, sum1); + vxc_uchar8 result; + VXC_DP2x8(result, result0, result1, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0), uniSumOrderUchar_2x8); + dst_ptr = dst_ptr + (coord.w + coord.y * output_width); + VXC_Vstore8(dst_ptr, 0, result); +} + diff --git a/src/tim/vx/internal/src/libnnext/ops/vx/depth2space_crd.vx b/src/tim/vx/internal/src/libnnext/ops/vx/depth2space_crd.vx index 0d4ac70..a5612b4 100644 --- a/src/tim/vx/internal/src/libnnext/ops/vx/depth2space_crd.vx +++ b/src/tim/vx/internal/src/libnnext/ops/vx/depth2space_crd.vx @@ -2,21 +2,24 @@ _viv_uniform int2 multAndoutZP0;//[0:15] multiplier, [31:63] output zp _viv_uniform VXC_512Bits uniU8MulAndPostShift_0_Lo_2x8; +_viv_uniform VXC_512Bits uniU8MulAndPostShift_ExLo_2x8; +_viv_uniform VXC_512Bits uniU8MulAndPostShift_ExHi_2x8; +_viv_uniform VXC_512Bits uniDepth2SpaceF16Blk2_lo_2x8; +_viv_uniform VXC_512Bits uniDepth2SpaceF16Blk2_hi_2x8; + #define DEPTH2SPACE_CRD_QINT_TO_QINT(src0_type_name, src1_type_name, read_type, write_type) \ __kernel void depth2space_crd_##src0_type_name##to##src1_type_name( \ - image2d_array_t input, \ - image2d_array_t output, \ - int block_size \ - ) \ + image2d_array_t input, image2d_array_t output, int block_size) \ { \ int gidx = get_global_id(0); \ int gidy = 
get_global_id(1); \ int gidz = get_global_id(2); \ int4 coord_out = (int4)(gidx, gidy, gidz, 0); \ int block_e2 = block_size * block_size; \ - int inx = gidx / block_size; \ - int iny = gidy / block_size; \ + ushort blk = (ushort)block_size; \ + int inx = (int)((ushort)gidx / blk); \ + int iny = (int)((ushort)gidy / blk); \ int inz = (gidx % block_size) + (gidy % block_size) * block_size + gidz * block_e2; \ int4 coord_in = (int4)(inx, iny, inz, 0); \ read_type src; \ @@ -33,18 +36,16 @@ DEPTH2SPACE_CRD_QINT_TO_QINT(I8, I8, vxc_char16, vxc_char16) DEPTH2SPACE_CRD_QINT_TO_QINT(I16, I16, vxc_short8, vxc_short8) __kernel void depth2space_crd_F16toF16( - image2d_array_t input, - image2d_array_t output, - int block_size - ) + image2d_array_t input, image2d_array_t output, int block_size) { int gidx = get_global_id(0); int gidy = get_global_id(1); int gidz = get_global_id(2); int4 coord_out = (int4)(gidx, gidy, gidz, 0); int block_e2 = block_size * block_size; - int inx = gidx / block_size; - int iny = gidy / block_size; + ushort blk = (ushort)block_size; + int inx = (int)((ushort)gidx / blk); + int iny = (int)((ushort)gidy / blk); int inz = (gidx % block_size) + (gidy % block_size) * block_size + gidz * block_e2; int4 coord_in = (int4)(inx, iny, inz, 0); vxc_short8 data; @@ -54,18 +55,16 @@ __kernel void depth2space_crd_F16toF16( #define DEPTH2SPACE_CRD_QINT_TO_F16(src0_type_name, read_type) \ __kernel void depth2space_crd_##src0_type_name##toF16( \ - image2d_array_t input, \ - image2d_array_t output, \ - int block_size \ - ) \ + image2d_array_t input, image2d_array_t output, int block_size) \ { \ int gidx = get_global_id(0); \ int gidy = get_global_id(1); \ int gidz = get_global_id(2); \ int4 coord_out = (int4)(gidx, gidy, gidz, 0); \ int block_e2 = block_size * block_size; \ - int inx = gidx / block_size; \ - int iny = gidy / block_size; \ + ushort blk = (ushort)block_size; \ + int inx = (int)((ushort)gidx / blk); \ + int iny = (int)((ushort)gidy / blk); \ int inz = (gidx % block_size) + (gidy % block_size) * block_size + gidz * block_e2; \ int4 coord_in = (int4)(inx, iny, inz, 0); \ read_type src; \ @@ -85,18 +84,16 @@ DEPTH2SPACE_CRD_QINT_TO_F16(I16, vxc_short8) #define DEPTH2SPACE_CRD_F16_TO_QINT(src1_type_name, write_type) \ __kernel void depth2space_crd_F16to##src1_type_name( \ - image2d_array_t input, \ - image2d_array_t output, \ - int block_size \ - ) \ + image2d_array_t input, image2d_array_t output, int block_size) \ { \ int gidx = get_global_id(0); \ int gidy = get_global_id(1); \ int gidz = get_global_id(2); \ int4 coord_out = (int4)(gidx, gidy, gidz, 0); \ int block_e2 = block_size * block_size; \ - int inx = gidx / block_size; \ - int iny = gidy / block_size; \ + ushort blk = (ushort)block_size; \ + int inx = (int)((ushort)gidx / blk); \ + int iny = (int)((ushort)gidy / blk); \ int inz = (gidx % block_size) + (gidy % block_size) * block_size + gidz * block_e2; \ int4 coord_in = (int4)(inx, iny, inz, 0); \ vxc_short8 src; \ @@ -112,4 +109,199 @@ __kernel void depth2space_crd_F16to##src1_type_name( \ } DEPTH2SPACE_CRD_F16_TO_QINT(U8, vxc_uchar16) DEPTH2SPACE_CRD_F16_TO_QINT(I8, vxc_char16) -DEPTH2SPACE_CRD_F16_TO_QINT(I16, vxc_short8) \ No newline at end of file +DEPTH2SPACE_CRD_F16_TO_QINT(I16, vxc_short8) + +#define DEPTH2SPACE_CRD_QINT_TO_QINT_BLK2(src0_type_name, src1_type_name, read_type, write_type) \ +__kernel void depth2space_crd_##src0_type_name##to##src1_type_name##_blk2( \ + image2d_array_t input, image2d_array_t output, int block_size) \ +{ \ + int gidy = 
get_global_id(1); \ + int gidz = get_global_id(2); \ + int4 coord_out = (int4)(get_global_id(0), gidy, gidz, gidz); \ + int4 coord_in = coord_out >> 1; \ + coord_in.z = (gidy & 1) * 2 + gidz * 4; \ + coord_in.w = coord_in.z + 1; \ + read_type src; \ + VXC_ReadImage2DArray(src, input, coord_in.xyzz, \ + VXC_5BITOFFSET_XY(0, 0), VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); \ + VXC_ReadImage2DArray(src, input, coord_in.xyww, \ + VXC_5BITOFFSET_XY(0, 0), VXC_MODIFIER(8, 15, 0, VXC_RM_TowardZero, 0)); \ + \ + write_type dst; \ + vxc_ushort8 ms0; \ + _viv_asm(COPY, ms0, multAndoutZP0, 16); \ + VXC_DP2x8(dst, src, ms0, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 1), uniU8MulAndPostShift_ExLo_2x8); \ + VXC_DP2x8(dst, src, ms0, VXC_MODIFIER(8, 15, 0, VXC_RM_TowardZero, 1), uniU8MulAndPostShift_ExHi_2x8); \ + VXC_WriteImage2DArray(output, coord_out, dst, VXC_MODIFIER(0, 15, 0, VXC_RM_TowardZero, 0)); \ +} +DEPTH2SPACE_CRD_QINT_TO_QINT_BLK2(U8, U8, vxc_uchar16, vxc_uchar16) +DEPTH2SPACE_CRD_QINT_TO_QINT_BLK2(I8, I8, vxc_char16, vxc_char16) + +__kernel void depth2space_crd_F16toF16_blk2( + image2d_array_t input, image2d_array_t output, int block_size) +{ + int gidy = get_global_id(1); + int gidz = get_global_id(2); + int4 coord_out = (int4)(get_global_id(0), gidy, gidz, gidz); + int4 coord_in = coord_out >> 1; + coord_in.z = (gidy & 1) * 2 + gidz * 4; + coord_in.w = coord_in.z + 1; + vxc_short8 data0, data1, dst0, dst1; + VXC_ReadImage2DArray(data0, input, coord_in.xyzz, + VXC_5BITOFFSET_XY(0, 0), VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); + VXC_ReadImage2DArray(data1, input, coord_in.xyww, + VXC_5BITOFFSET_XY(0, 0), VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); + VXC_DP2x8(dst0, data0, data1, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0), uniDepth2SpaceF16Blk2_lo_2x8); + VXC_DP2x8(dst1, data0, data1, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0), uniDepth2SpaceF16Blk2_hi_2x8); + + VXC_WriteImage2DArray(output, coord_out, dst0, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); + coord_out.x += 8; + VXC_WriteImage2DArray(output, coord_out, dst1, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); +} + +__kernel void depth2space_crd_I16toI16_blk2( + image2d_array_t input, image2d_array_t output, int block_size) +{ + int gidy = get_global_id(1); + int gidz = get_global_id(2); + int4 coord_out = (int4)(get_global_id(0), gidy, gidz, gidz); + int4 coord_in = coord_out >> 1; + coord_in.z = (gidy & 1) * 2 + gidz * 4; + coord_in.w = coord_in.z + 1; + vxc_short8 src0, src1, data0, data1, dst0, dst1; + VXC_ReadImage2DArray(src0, input, coord_in.xyzz, + VXC_5BITOFFSET_XY(0, 0), VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); + VXC_ReadImage2DArray(src1, input, coord_in.xyww, + VXC_5BITOFFSET_XY(0, 0), VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); + VXC_DP2x8(data0, src0, src1, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0), uniDepth2SpaceF16Blk2_lo_2x8); + VXC_DP2x8(data1, src0, src1, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0), uniDepth2SpaceF16Blk2_hi_2x8); + + vxc_ushort8 ms0; + _viv_asm(COPY, ms0, multAndoutZP0, 16); + VXC_DP2x8(dst0, data0, ms0, VXC_MODIFIER(0, 7, 0,VXC_RM_TowardZero, 1), uniU8MulAndPostShift_0_Lo_2x8); + VXC_DP2x8(dst1, data1, ms0, VXC_MODIFIER(0, 7, 0,VXC_RM_TowardZero, 1), uniU8MulAndPostShift_0_Lo_2x8); + + VXC_WriteImage2DArray(output, coord_out, dst0, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); + coord_out.x += 8; + VXC_WriteImage2DArray(output, coord_out, dst1, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); +} + +#define DEPTH2SPACE_CRD_QINT_TO_F16_BLK2(src0_type_name, read_type) \ +__kernel void 
depth2space_crd_##src0_type_name##toF16_blk2( \ + image2d_array_t input, image2d_array_t output, int block_size) \ +{ \ + int gidy = get_global_id(1); \ + int gidz = get_global_id(2); \ + int4 coord_out = (int4)(get_global_id(0), gidy, gidz, gidz); \ + int4 coord_in = coord_out >> 1; \ + coord_in.z = (gidy & 1) * 2 + gidz * 4; \ + coord_in.w = coord_in.z + 1; \ + read_type src; \ + VXC_ReadImage2DArray(src, input, coord_in.xyzz, \ + VXC_5BITOFFSET_XY(0, 0), VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); \ + VXC_ReadImage2DArray(src, input, coord_in.xyww, \ + VXC_5BITOFFSET_XY(0, 0), VXC_MODIFIER(8, 15, 0, VXC_RM_TowardZero, 0)); \ + \ + vxc_half8 tmpDst0, tmpDst1; \ + vxc_short8 dst0, dst1; \ + vxc_ushort8 ms0; \ + _viv_asm(COPY, ms0, multAndoutZP0, 16); \ + VXC_DP2x8(tmpDst0, src, ms0, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 1), uniU8MulAndPostShift_ExLo_2x8); \ + VXC_DP2x8(tmpDst1, src, ms0, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 1), uniU8MulAndPostShift_ExHi_2x8); \ + _viv_asm(COPY, dst0, tmpDst0, 16); \ + _viv_asm(COPY, dst1, tmpDst1, 16); \ + VXC_WriteImage2DArray(output, coord_out, dst0, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); \ + coord_out.x+=8; \ + VXC_WriteImage2DArray(output, coord_out, dst1, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); \ +} +DEPTH2SPACE_CRD_QINT_TO_F16_BLK2(U8, vxc_uchar16) +DEPTH2SPACE_CRD_QINT_TO_F16_BLK2(I8, vxc_char16) + +__kernel void depth2space_crd_I16toF16_blk2( + image2d_array_t input, image2d_array_t output, int block_size) +{ + int gidy = get_global_id(1); + int gidz = get_global_id(2); + int4 coord_out = (int4)(get_global_id(0), gidy, gidz, gidz); + int4 coord_in = coord_out >> 1; + coord_in.z = (gidy & 1) * 2 + gidz * 4; + coord_in.w = coord_in.z + 1; + vxc_short8 src0, src1, data0, data1, dst0, dst1; + vxc_half8 tmpDst0, tmpDst1; + VXC_ReadImage2DArray(src0, input, coord_in.xyzz, + VXC_5BITOFFSET_XY(0, 0), VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); + VXC_ReadImage2DArray(src1, input, coord_in.xyww, + VXC_5BITOFFSET_XY(0, 0), VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); + VXC_DP2x8(data0, src0, src1, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0), uniDepth2SpaceF16Blk2_lo_2x8); + VXC_DP2x8(data1, src0, src1, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0), uniDepth2SpaceF16Blk2_hi_2x8); + + vxc_ushort8 ms0; + _viv_asm(COPY, ms0, multAndoutZP0, 16); + VXC_DP2x8(tmpDst0, data0, ms0, VXC_MODIFIER(0, 7, 0,VXC_RM_TowardZero, 1), uniU8MulAndPostShift_0_Lo_2x8); + VXC_DP2x8(tmpDst1, data1, ms0, VXC_MODIFIER(0, 7, 0,VXC_RM_TowardZero, 1), uniU8MulAndPostShift_0_Lo_2x8); + _viv_asm(COPY, dst0, tmpDst0, 16); + _viv_asm(COPY, dst1, tmpDst1, 16); + VXC_WriteImage2DArray(output, coord_out, dst0, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); + coord_out.x += 8; + VXC_WriteImage2DArray(output, coord_out, dst1, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); +} + +#define DEPTH2SPACE_CRD_F16_TO_QINT_BLK2(src1_type_name, write_type) \ +__kernel void depth2space_crd_F16to##src1_type_name##_blk2( \ + image2d_array_t input, image2d_array_t output, int block_size) \ +{ \ + int gidy = get_global_id(1); \ + int gidz = get_global_id(2); \ + int4 coord_out = (int4)(get_global_id(0), gidy, gidz, gidz); \ + int4 coord_in = coord_out >> 1; \ + coord_in.z = (gidy & 1) * 2 + gidz * 4; \ + coord_in.w = coord_in.z + 1; \ + vxc_short8 src0, src1, data0, data1; \ + vxc_half8 tmpDst0, tmpDst1; \ + VXC_ReadImage2DArray(src0, input, coord_in.xyzz, \ + VXC_5BITOFFSET_XY(0, 0), VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); \ + VXC_ReadImage2DArray(src1, input, coord_in.xyww, \ + 
VXC_5BITOFFSET_XY(0, 0), VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); \ + VXC_DP2x8(data0, src0, src1, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0), uniDepth2SpaceF16Blk2_lo_2x8); \ + VXC_DP2x8(data1, src0, src1, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0), uniDepth2SpaceF16Blk2_hi_2x8); \ + \ + write_type dst; \ + vxc_ushort8 ms0; \ + _viv_asm(COPY, ms0, multAndoutZP0, 16); \ + _viv_asm(COPY, tmpDst0, data0, 16); \ + _viv_asm(COPY, tmpDst1, data1, 16); \ + VXC_DP2x8(dst, tmpDst0, ms0, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 1), uniU8MulAndPostShift_0_Lo_2x8); \ + VXC_DP2x8(dst, tmpDst1, ms0, VXC_MODIFIER(8, 15, 0, VXC_RM_TowardZero, 1), uniU8MulAndPostShift_0_Lo_2x8); \ + VXC_WriteImage2DArray(output, coord_out, dst, VXC_MODIFIER(0, 15, 0, VXC_RM_TowardZero, 0)); \ +} +DEPTH2SPACE_CRD_F16_TO_QINT_BLK2(U8, vxc_uchar16) +DEPTH2SPACE_CRD_F16_TO_QINT_BLK2(I8, vxc_char16) + +__kernel void depth2space_crd_F16toI16_blk2( + image2d_array_t input, image2d_array_t output, int block_size) +{ + int gidy = get_global_id(1); + int gidz = get_global_id(2); + int4 coord_out = (int4)(get_global_id(0), gidy, gidz, gidz); + int4 coord_in = coord_out >> 1; + coord_in.z = (gidy & 1) * 2 + gidz * 4; + coord_in.w = coord_in.z + 1; + vxc_short8 src0, src1, data0, data1, dst0, dst1; + vxc_half8 tmpDst0, tmpDst1; + VXC_ReadImage2DArray(src0, input, coord_in.xyzz, + VXC_5BITOFFSET_XY(0, 0), VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); + VXC_ReadImage2DArray(src1, input, coord_in.xyww, + VXC_5BITOFFSET_XY(0, 0), VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); + vxc_ushort8 ms0; + _viv_asm(COPY, ms0, multAndoutZP0, 16); + VXC_DP2x8(data0, src0, src1, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0), uniDepth2SpaceF16Blk2_lo_2x8); + VXC_DP2x8(data1, src0, src1, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0), uniDepth2SpaceF16Blk2_hi_2x8); + _viv_asm(COPY, tmpDst0, data0, 16); + _viv_asm(COPY, tmpDst1, data1, 16); + VXC_DP2x8(dst0, tmpDst0, ms0, VXC_MODIFIER(0, 7, 0,VXC_RM_TowardZero, 1), uniU8MulAndPostShift_0_Lo_2x8); + VXC_DP2x8(dst1, tmpDst1, ms0, VXC_MODIFIER(0, 7, 0,VXC_RM_TowardZero, 1), uniU8MulAndPostShift_0_Lo_2x8); + + VXC_WriteImage2DArray(output, coord_out, dst0, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); + coord_out.x += 8; + VXC_WriteImage2DArray(output, coord_out, dst1, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); +} \ No newline at end of file diff --git a/src/tim/vx/internal/src/libnnext/ops/vx/eltwise_unary_2d.vx b/src/tim/vx/internal/src/libnnext/ops/vx/eltwise_unary_2d.vx index 8b03b5c..a8c4583 100644 --- a/src/tim/vx/internal/src/libnnext/ops/vx/eltwise_unary_2d.vx +++ b/src/tim/vx/internal/src/libnnext/ops/vx/eltwise_unary_2d.vx @@ -67,6 +67,11 @@ float4 eltwise_unary_mish(float4 x) return x; } +float4 eltwise_unary_round(float4 x) +{ + return convert_float4(convert_int4_rte(x)); +} + _viv_uniform float inputScale; _viv_uniform float inputTail; _viv_uniform float outputScale; @@ -187,7 +192,17 @@ ELTSISE_UNARY_2D(hard_sigmoid, U8, U8, vxc_uchar8, vxc_uchar8, int4, vxc_ucha ELTSISE_UNARY_2D(hard_sigmoid, U8, F16, vxc_uchar8, vxc_uchar8, half4, vxc_half8, vxc_short8) ELTSISE_UNARY_2D(hard_sigmoid, I16, I16, vxc_short8, vxc_short8, int4, vxc_short8, vxc_short8) ELTSISE_UNARY_2D(hard_sigmoid, I16, F16, vxc_short8, vxc_short8, half4, vxc_half8, vxc_short8) - +//ROUND +ELTSISE_UNARY_2D(round, F16, F16, vxc_short8, vxc_half8, half4, vxc_half8, vxc_short8) +ELTSISE_UNARY_2D(round, F16, I8, vxc_short8, vxc_half8, int4, vxc_char8, vxc_char8) +ELTSISE_UNARY_2D(round, F16, U8, vxc_short8, vxc_half8, int4, 
vxc_uchar8, vxc_uchar8) +ELTSISE_UNARY_2D(round, F16, I16, vxc_short8, vxc_half8, int4, vxc_short8, vxc_short8) +ELTSISE_UNARY_2D(round, I8, I8, vxc_char8, vxc_char8, int4, vxc_char8, vxc_char8) +ELTSISE_UNARY_2D(round, I8, F16, vxc_char8, vxc_char8, half4, vxc_half8, vxc_short8) +ELTSISE_UNARY_2D(round, U8, U8, vxc_uchar8, vxc_uchar8, int4, vxc_uchar8, vxc_uchar8) +ELTSISE_UNARY_2D(round, U8, F16, vxc_uchar8, vxc_uchar8, half4, vxc_half8, vxc_short8) +ELTSISE_UNARY_2D(round, I16, I16, vxc_short8, vxc_short8, int4, vxc_short8, vxc_short8) +ELTSISE_UNARY_2D(round, I16, F16, vxc_short8, vxc_short8, half4, vxc_half8, vxc_short8) _viv_uniform VXC_512Bits uniConvBF16toF32_Part0_2x8; _viv_uniform VXC_512Bits uniConvBF16toF32_Part1_2x8; @@ -235,3 +250,5 @@ ELTSISE_UNARY_BF16_2D(neg) ELTSISE_UNARY_BF16_2D(mish) //HARD_SIGMOID ELTSISE_UNARY_BF16_2D(hard_sigmoid) +//ROUND +ELTSISE_UNARY_BF16_2D(round) diff --git a/src/tim/vx/internal/src/libnnext/ops/vx/eltwise_unary_3d.vx b/src/tim/vx/internal/src/libnnext/ops/vx/eltwise_unary_3d.vx index f452849..393e4a0 100644 --- a/src/tim/vx/internal/src/libnnext/ops/vx/eltwise_unary_3d.vx +++ b/src/tim/vx/internal/src/libnnext/ops/vx/eltwise_unary_3d.vx @@ -67,6 +67,11 @@ float4 eltwise_unary_mish(float4 x) return x; } +float4 eltwise_unary_round(float4 x) +{ + return convert_float4(convert_int4_rte(x)); +} + _viv_uniform float inputScale; _viv_uniform float inputTail; _viv_uniform float outputScale; @@ -187,6 +192,17 @@ ELTSISE_UNARY_3D(hard_sigmoid, U8, U8, vxc_uchar8, vxc_uchar8, int4, vxc_ucha ELTSISE_UNARY_3D(hard_sigmoid, U8, F16, vxc_uchar8, vxc_uchar8, half4, vxc_half8, vxc_short8) ELTSISE_UNARY_3D(hard_sigmoid, I16, I16, vxc_short8, vxc_short8, int4, vxc_short8, vxc_short8) ELTSISE_UNARY_3D(hard_sigmoid, I16, F16, vxc_short8, vxc_short8, half4, vxc_half8, vxc_short8) +//ROUND +ELTSISE_UNARY_3D(round, F16, F16, vxc_short8, vxc_half8, half4, vxc_half8, vxc_short8) +ELTSISE_UNARY_3D(round, F16, I8, vxc_short8, vxc_half8, int4, vxc_char8, vxc_char8) +ELTSISE_UNARY_3D(round, F16, U8, vxc_short8, vxc_half8, int4, vxc_uchar8, vxc_uchar8) +ELTSISE_UNARY_3D(round, F16, I16, vxc_short8, vxc_half8, int4, vxc_short8, vxc_short8) +ELTSISE_UNARY_3D(round, I8, I8, vxc_char8, vxc_char8, int4, vxc_char8, vxc_char8) +ELTSISE_UNARY_3D(round, I8, F16, vxc_char8, vxc_char8, half4, vxc_half8, vxc_short8) +ELTSISE_UNARY_3D(round, U8, U8, vxc_uchar8, vxc_uchar8, int4, vxc_uchar8, vxc_uchar8) +ELTSISE_UNARY_3D(round, U8, F16, vxc_uchar8, vxc_uchar8, half4, vxc_half8, vxc_short8) +ELTSISE_UNARY_3D(round, I16, I16, vxc_short8, vxc_short8, int4, vxc_short8, vxc_short8) +ELTSISE_UNARY_3D(round, I16, F16, vxc_short8, vxc_short8, half4, vxc_half8, vxc_short8) _viv_uniform VXC_512Bits uniConvBF16toF32_Part0_2x8; _viv_uniform VXC_512Bits uniConvBF16toF32_Part1_2x8; @@ -232,4 +248,6 @@ ELTSISE_UNARY_BF16(neg) //MISH ELTSISE_UNARY_BF16(mish) //HARD_SIGMOID -ELTSISE_UNARY_BF16(hard_sigmoid) \ No newline at end of file +ELTSISE_UNARY_BF16(hard_sigmoid) +//ROUND +ELTSISE_UNARY_BF16(round) \ No newline at end of file diff --git a/src/tim/vx/internal/src/libnnext/ops/vx/erf.vx b/src/tim/vx/internal/src/libnnext/ops/vx/erf.vx new file mode 100644 index 0000000..9247044 --- /dev/null +++ b/src/tim/vx/internal/src/libnnext/ops/vx/erf.vx @@ -0,0 +1,174 @@ +#include "cl_viv_vx_ext.h" + +#define MUL2_RSQRTPI (1.1283791670955126f) +float eltwise_unary_erf(float x) +{ + float res = 0; + float tmp = x; + float factorial = 1; + float x_pow = x; + float one = 1.0f; + float n = 1; + + while (fabs(tmp) 
> 1e-5) + { + res += tmp; + + factorial *= n; + one *= -1; + x_pow *= x * x; + tmp = one / factorial * x_pow / ( 2 * n + 1); + + n += 1.0f; + } + return res * MUL2_RSQRTPI; +} + +_viv_uniform float inputScale; +_viv_uniform float inputTail; +_viv_uniform float outputScale; +_viv_uniform float outputZP; +_viv_uniform VXC_512Bits uniExtract8Data_2x8; +_viv_uniform VXC_512Bits uniDatatoFp32Part0_4x4; + +#define ELTSISE_UNARY_2D(func_name, src_type_name, dst_type_name, src_type, \ + src_copy_type, convert_type, dst_type, dst_copy_type) \ + __kernel void func_name##_##src_type_name##to##dst_type_name##_2D( \ + __read_only image2d_array_t input, \ + __write_only image2d_array_t output \ + ) \ +{ \ + int2 coord = (int2)(get_global_id(0), get_global_id(1)); \ + src_type src0; \ + src_copy_type src1; \ + VXC_ReadImage(src0, input, coord, 0, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); \ + _viv_asm(COPY, src1, src0, 16); \ + \ + float4 vecA; \ + VXC_DP4x4(vecA, src1, src1, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniDatatoFp32Part0_4x4); \ + vecA = vecA * inputScale + inputTail; \ + vecA.x = eltwise_unary_##func_name(vecA.x); \ + vecA.y = eltwise_unary_##func_name(vecA.y); \ + vecA.z = eltwise_unary_##func_name(vecA.z); \ + vecA.w = eltwise_unary_##func_name(vecA.w); \ + vecA = vecA * outputScale + outputZP; \ + \ + convert_type dst0; \ + _viv_asm(CONV_RTE, dst0, vecA); \ + dst_type dst2; \ + VXC_DP2x8(dst2, dst0, dst0, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 1), uniExtract8Data_2x8); \ + dst_copy_type dst; \ + _viv_asm(COPY, dst, dst2, 16); \ + VXC_WriteImage(output, coord, dst, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0)); \ +} +//ERF +ELTSISE_UNARY_2D(erf, F16, F16, vxc_short8, vxc_half8, half4, vxc_half8, vxc_short8) +ELTSISE_UNARY_2D(erf, F16, I8, vxc_short8, vxc_half8, int4, vxc_char8, vxc_char8) +ELTSISE_UNARY_2D(erf, F16, U8, vxc_short8, vxc_half8, int4, vxc_uchar8, vxc_uchar8) +ELTSISE_UNARY_2D(erf, F16, I16, vxc_short8, vxc_half8, int4, vxc_short8, vxc_short8) +ELTSISE_UNARY_2D(erf, I8, I8, vxc_char8, vxc_char8, int4, vxc_char8, vxc_char8) +ELTSISE_UNARY_2D(erf, I8, F16, vxc_char8, vxc_char8, half4, vxc_half8, vxc_short8) +ELTSISE_UNARY_2D(erf, U8, U8, vxc_uchar8, vxc_uchar8, int4, vxc_uchar8, vxc_uchar8) +ELTSISE_UNARY_2D(erf, U8, F16, vxc_uchar8, vxc_uchar8, half4, vxc_half8, vxc_short8) +ELTSISE_UNARY_2D(erf, I16, I16, vxc_short8, vxc_short8, int4, vxc_short8, vxc_short8) +ELTSISE_UNARY_2D(erf, I16, F16, vxc_short8, vxc_short8, half4, vxc_half8, vxc_short8) + + +_viv_uniform VXC_512Bits uniConvBF16toF32_Part0_2x8; +_viv_uniform VXC_512Bits uniExtractOddData_2x8; + +#define ELTSISE_UNARY_BF16_2D(func_name) \ + __kernel void func_name##_BF16toBF16_2D( \ + __read_only image2d_array_t input, \ + __write_only image2d_array_t output \ + ) \ +{ \ + int2 coord = (int2)(get_global_id(0), get_global_id(1)); \ + vxc_ushort8 src0, src1, dst; \ + VXC_ReadImage(src0, input, coord, 0, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); \ + \ + float4 vecA; \ + vxc_short8 zero = (vxc_short8)(0, 0, 0, 0, 0, 0, 0, 0); \ + VXC_DP2x8(src1, src0, zero, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0), uniConvBF16toF32_Part0_2x8); \ + _viv_asm(COPY, vecA, src1, 16); \ + vecA.x = eltwise_unary_##func_name(vecA.x); \ + vecA.y = eltwise_unary_##func_name(vecA.y); \ + vecA.z = eltwise_unary_##func_name(vecA.z); \ + vecA.w = eltwise_unary_##func_name(vecA.w); \ + \ + _viv_asm(COPY, src0, vecA, 16); \ + \ + VXC_DP2x8(dst, src0, src0, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0), uniExtractOddData_2x8); \ + 
VXC_WriteImage(output, coord, dst, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0)); \ +} +//EXP +ELTSISE_UNARY_BF16_2D(erf) + +#define ELTSISE_UNARY_3D(func_name, src_type_name, dst_type_name, src_type, \ + src_copy_type, convert_type, dst_type, dst_copy_type) \ +__kernel void func_name##_##src_type_name##to##dst_type_name( \ +__read_only image2d_array_t input, \ +__write_only image2d_array_t output \ +) \ +{ \ + int4 coord = (int4)(get_global_id(0), get_global_id(1), get_global_id(2), 0); \ + src_type src0; \ + src_copy_type src1; \ + VXC_ReadImage2DArray(src0, input, coord, 0, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); \ + _viv_asm(COPY, src1, src0, 16); \ + \ + float4 vecA; \ + VXC_DP4x4(vecA, src1, src1, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniDatatoFp32Part0_4x4); \ + vecA = vecA * inputScale + inputTail; \ + vecA.x = eltwise_unary_##func_name(vecA.x); \ + vecA.y = eltwise_unary_##func_name(vecA.y); \ + vecA.z = eltwise_unary_##func_name(vecA.z); \ + vecA.w = eltwise_unary_##func_name(vecA.w); \ + vecA = vecA * outputScale + outputZP; \ + \ + convert_type dst0; \ + _viv_asm(CONV_RTE, dst0, vecA); \ + dst_type dst2; \ + VXC_DP2x8(dst2, dst0, dst0, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 1), uniExtract8Data_2x8); \ + dst_copy_type dst; \ + _viv_asm(COPY, dst, dst2, 16); \ + VXC_WriteImage2DArray(output, coord, dst, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0)); \ +} +//ERF +ELTSISE_UNARY_3D(erf, F16, F16, vxc_short8, vxc_half8, half4, vxc_half8, vxc_short8) +ELTSISE_UNARY_3D(erf, F16, I8, vxc_short8, vxc_half8, int4, vxc_char8, vxc_char8) +ELTSISE_UNARY_3D(erf, F16, U8, vxc_short8, vxc_half8, int4, vxc_uchar8, vxc_uchar8) +ELTSISE_UNARY_3D(erf, F16, I16, vxc_short8, vxc_half8, int4, vxc_short8, vxc_short8) +ELTSISE_UNARY_3D(erf, I8, I8, vxc_char8, vxc_char8, int4, vxc_char8, vxc_char8) +ELTSISE_UNARY_3D(erf, I8, F16, vxc_char8, vxc_char8, half4, vxc_half8, vxc_short8) +ELTSISE_UNARY_3D(erf, U8, U8, vxc_uchar8, vxc_uchar8, int4, vxc_uchar8, vxc_uchar8) +ELTSISE_UNARY_3D(erf, U8, F16, vxc_uchar8, vxc_uchar8, half4, vxc_half8, vxc_short8) +ELTSISE_UNARY_3D(erf, I16, I16, vxc_short8, vxc_short8, int4, vxc_short8, vxc_short8) +ELTSISE_UNARY_3D(erf, I16, F16, vxc_short8, vxc_short8, half4, vxc_half8, vxc_short8) + +#define ELTSISE_UNARY_BF16_3D(func_name) \ + __kernel void func_name##_BF16toBF16( \ + __read_only image2d_array_t input, \ + __write_only image2d_array_t output \ + ) \ +{ \ + int4 coord = (int4)(get_global_id(0), get_global_id(1), get_global_id(2), 0); \ + vxc_ushort8 src0, src1, dst; \ + VXC_ReadImage2DArray(src0, input, coord, 0, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); \ + \ + float4 vecA; \ + vxc_short8 zero = (vxc_short8)(0, 0, 0, 0, 0, 0, 0, 0); \ + VXC_DP2x8(src1, src0, zero, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0), uniConvBF16toF32_Part0_2x8); \ + _viv_asm(COPY, vecA, src1, 16); \ + vecA.x = eltwise_unary_##func_name(vecA.x); \ + vecA.y = eltwise_unary_##func_name(vecA.y); \ + vecA.z = eltwise_unary_##func_name(vecA.z); \ + vecA.w = eltwise_unary_##func_name(vecA.w); \ + \ + _viv_asm(COPY, src0, vecA, 16); \ + \ + VXC_DP2x8(dst, src0, src0, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0), uniExtractOddData_2x8); \ + VXC_WriteImage2DArray(output, coord, dst, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0)); \ +} +//ERF +ELTSISE_UNARY_BF16_3D(erf) \ No newline at end of file diff --git a/src/tim/vx/internal/src/libnnext/ops/vx/gather_array.vx b/src/tim/vx/internal/src/libnnext/ops/vx/gather_array.vx new file mode 100644 index 0000000..9ed2876 --- /dev/null +++ 
b/src/tim/vx/internal/src/libnnext/ops/vx/gather_array.vx @@ -0,0 +1,157 @@ +#include "cl_viv_vx_ext.h" + +_viv_uniform int indices_num; +_viv_uniform VXC_512Bits uniExtraCopyDpKeepinEvis_2x8; + +__kernel void gather_I8toI8_array( + __read_only image2d_t input0, + __read_only image2d_t input1, + __write_only image2d_t output, + int block_size, + int block_num, + int axis_num + ) +{ + int gidx = get_global_id(0); // block_size + int gidy = get_global_id(1); // indices_num + int gidz = get_global_id(2); // block_num + + int4 coord_in = (int4)(gidy, 0, gidx, 0); + int4 indice = read_imagei(input1, coord_in.xy); + coord_in.w = gidz * axis_num + indice.x; + + Image img1 = create_image_from_image2d(input0, 1); + Image img2 = create_image_from_image2d(output, 1); + uchar* input_ptr = get_image_ptr_from_coord(img1, coord_in.zw); + __global vxc_char16* data_ptr = (__global vxc_char16*)input_ptr; + vxc_char16 src = data_ptr[0]; + int2 coord = (int2)(gidx, gidz * indices_num + gidy); + uchar* output_ptr = get_image_ptr_from_coord(img2, coord); + __global vxc_char16* dst_ptr = (__global vxc_char16*)output_ptr; + dst_ptr[0] = src; +} + +__kernel void gather_U8toU8_array( + __read_only image2d_t input0, + __read_only image2d_t input1, + __write_only image2d_t output, + int block_size, + int block_num, + int axis_num + ) +{ + int gidx = get_global_id(0); // block_size + int gidy = get_global_id(1); // indices_num + int gidz = get_global_id(2); // block_num + + int4 coord_in = (int4)(gidy, 0, gidx, 0); + int4 indice = read_imagei(input1, coord_in.xy); + coord_in.w = gidz * axis_num + indice.x; + + Image img1 = create_image_from_image2d(input0, 1); + Image img2 = create_image_from_image2d(output, 1); + uchar* input_ptr = get_image_ptr_from_coord(img1, coord_in.zw); + __global vxc_uchar16* data_ptr = (__global vxc_uchar16*)input_ptr; + vxc_uchar16 src = data_ptr[0]; + int2 coord = (int2)(gidx, gidz * indices_num + gidy); + uchar* output_ptr = get_image_ptr_from_coord(img2, coord); + __global vxc_uchar16* dst_ptr = (__global vxc_uchar16*)output_ptr; + dst_ptr[0] = src; +} + +__kernel void gather_I16toI16_array( + __read_only image2d_t input0, + __read_only image2d_t input1, + __write_only image2d_t output, + int block_size, + int block_num, + int axis_num + ) +{ + int gidx = get_global_id(0); // block_size + int gidy = get_global_id(1); // indices_num + int gidz = get_global_id(2); // block_num + + int4 coord_in = (int4)(gidy, 0, gidx, 0); + + + int4 indice = read_imagei(input1, coord_in.xy); + coord_in.w = gidz * axis_num + indice.x; + + Image img1 = create_image_from_image2d(input0, 2); + Image img2 = create_image_from_image2d(output, 2); + uchar* input_ptr = get_image_ptr_from_coord(img1, coord_in.zw); + __global vxc_short8* data_ptr = (__global vxc_short8*)input_ptr; + vxc_short8 src = data_ptr[0]; + int2 coord = (int2)(gidx, gidz * indices_num + gidy); + uchar* output_ptr = get_image_ptr_from_coord(img2, coord); + __global vxc_short8* dst_ptr = (__global vxc_short8*)output_ptr; + dst_ptr[0] = src; +} + +__kernel void gather_F16toF16_array( + __read_only image2d_t input0, + __read_only image2d_t input1, + __write_only image2d_t output, + int block_size, + int block_num, + int axis_num + ) +{ + int gidx = get_global_id(0); // block_size + int gidy = get_global_id(1); // indices_num + int gidz = get_global_id(2); // block_num + + int4 coord_in = (int4)(gidy, 0, gidx, 0); + + int4 indice = read_imagei(input1, coord_in.xy); + coord_in.w = gidz * axis_num + indice.x; + + Image img1 = 
create_image_from_image2d(input0, 2); + Image img2 = create_image_from_image2d(output, 2); + uchar* input_ptr = get_image_ptr_from_coord(img1, coord_in.zw); + __global vxc_short8* data_ptr = (__global vxc_short8*)input_ptr; + vxc_short8 src = data_ptr[0]; + int2 coord = (int2)(gidx, gidz * indices_num + gidy); + uchar* output_ptr = get_image_ptr_from_coord(img2, coord); + __global vxc_short8* dst_ptr = (__global vxc_short8*)output_ptr; + dst_ptr[0] = src; +} + +#define GATHER_AXIS0_ARRAY(src0_type_name, read_type, data_type, write_type) \ +__kernel void gather_##src0_type_name##to##src0_type_name##_axis0_array( \ + __read_only image2d_t input0, \ + __read_only image2d_t input1, \ + __write_only image2d_t output, \ + int block_size, \ + int block_num, \ + int axis_num \ + ) \ +{ \ + Image img0 = create_image_from_image2d(input0, 1); \ + Image img1 = create_image_from_image2d(input1, 4); \ + Image img2 = create_image_from_image2d(output, 1); \ + int4 coord = (int4)(get_global_id(0), get_global_id(1), 0, 0); \ + uchar* index_ptr = get_image_ptr_from_coord(img1, coord.xz); \ + __global int* index = (__global int*)index_ptr; \ + int4 indices = vload4(0, index); \ + \ + read_type src, dst; \ + \ + uchar* input_ptr = get_image_ptr_from_coord(img0, coord.zy); \ + uchar* output_ptr = get_image_ptr_from_coord(img2, coord.xy); \ + __global data_type* data_ptr = (__global data_type*)input_ptr; \ + __global write_type* out_ptr = (__global write_type*)output_ptr; \ + src.s0 = data_ptr[indices.x]; \ + src.s1 = data_ptr[indices.y]; \ + src.s2 = data_ptr[indices.z]; \ + src.s3 = data_ptr[indices.w]; \ + \ + VXC_DP2x8(dst, src, src, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0), \ + uniExtraCopyDpKeepinEvis_2x8); \ + out_ptr[0] = dst.s0123; \ +} +GATHER_AXIS0_ARRAY(U8, vxc_uchar16, uchar, vxc_uchar4) +GATHER_AXIS0_ARRAY(I8, vxc_char16, char, vxc_char4) +GATHER_AXIS0_ARRAY(I16, vxc_short8, short, vxc_short4) +GATHER_AXIS0_ARRAY(F16, vxc_short8, short, vxc_short4) \ No newline at end of file diff --git a/src/tim/vx/internal/src/libnnext/ops/vx/gather_nd.vx b/src/tim/vx/internal/src/libnnext/ops/vx/gather_nd.vx index a526d21..f6aa7c7 100644 --- a/src/tim/vx/internal/src/libnnext/ops/vx/gather_nd.vx +++ b/src/tim/vx/internal/src/libnnext/ops/vx/gather_nd.vx @@ -12,7 +12,10 @@ __kernel void gather_nd_I8toI8_1D( int gidy = get_global_id(1); // indices_num int4 coord = (int4)(0, gidy, gidx, 0); - int4 indice = read_imagei(input1, coord.xy); + Image img = create_image_from_image2d(input1, 4); + uchar* indice_ptr = get_image_ptr_from_coord(img, coord.xy); + int4 indice = ((int4 *)indice_ptr)[0]; + coord.w = indice.x; vxc_char16 src; @@ -33,7 +36,10 @@ __kernel void gather_nd_U8toU8_1D( int gidy = get_global_id(1); // indices_num int4 coord = (int4)(0, gidy, gidx, 0); - int4 indice = read_imagei(input1, coord.xy); + Image img = create_image_from_image2d(input1, 4); + uchar* indice_ptr = get_image_ptr_from_coord(img, coord.xy); + int4 indice = ((int4 *)indice_ptr)[0]; + coord.w = indice.x; vxc_uchar16 src; @@ -53,7 +59,10 @@ __kernel void gather_nd_I16toI16_1D( int gidy = get_global_id(1); // indices_num int4 coord = (int4)(0, gidy, gidx, 0); - int4 indice = read_imagei(input1, coord.xy); + Image img = create_image_from_image2d(input1, 4); + uchar* indice_ptr = get_image_ptr_from_coord(img, coord.xy); + int4 indice = ((int4 *)indice_ptr)[0]; + coord.w = indice.x; vxc_short8 src; @@ -73,7 +82,10 @@ __kernel void gather_nd_F16toF16_1D( int gidy = get_global_id(1); // indices_num int4 coord = (int4)(0, gidy, gidx, 0); - 
int4 indice = read_imagei(input1, coord.xy); + Image img = create_image_from_image2d(input1, 4); + uchar* indice_ptr = get_image_ptr_from_coord(img, coord.xy); + int4 indice = ((int4 *)indice_ptr)[0]; + coord.w = indice.x; vxc_short8 src; diff --git a/src/tim/vx/internal/src/libnnext/ops/vx/gather_nd_2d.vx b/src/tim/vx/internal/src/libnnext/ops/vx/gather_nd_2d.vx index 6b3d90a..74c1a22 100644 --- a/src/tim/vx/internal/src/libnnext/ops/vx/gather_nd_2d.vx +++ b/src/tim/vx/internal/src/libnnext/ops/vx/gather_nd_2d.vx @@ -12,7 +12,10 @@ __kernel void gather_nd_I8toI8_2D( int gidy = get_global_id(1); // indices_num int4 coord = (int4)(0, gidy, gidx, 0); - int4 indice = read_imagei(input1, coord.xy); + Image img = create_image_from_image2d(input1, 4); + uchar* indice_ptr = get_image_ptr_from_coord(img, coord.xy); + int4 indice = ((int4 *)indice_ptr)[0]; + indice.x = indice.x * block_size + gidx; vxc_char16 src; @@ -33,7 +36,10 @@ __kernel void gather_nd_U8toU8_2D( int gidy = get_global_id(1); // indices_num int4 coord = (int4)(0, gidy, gidx, 0); - int4 indice = read_imagei(input1, coord.xy); + Image img = create_image_from_image2d(input1, 4); + uchar* indice_ptr = get_image_ptr_from_coord(img, coord.xy); + int4 indice = ((int4 *)indice_ptr)[0]; + indice.x = indice.x * block_size + gidx; vxc_uchar16 src; @@ -53,7 +59,10 @@ __kernel void gather_nd_I16toI16_2D( int gidy = get_global_id(1); // indices_num int4 coord = (int4)(0, gidy, gidx, 0); - int4 indice = read_imagei(input1, coord.xy); + Image img = create_image_from_image2d(input1, 4); + uchar* indice_ptr = get_image_ptr_from_coord(img, coord.xy); + int4 indice = ((int4 *)indice_ptr)[0]; + indice.x = indice.x * block_size + gidx; vxc_short8 src; @@ -73,7 +82,10 @@ __kernel void gather_nd_F16toF16_2D( int gidy = get_global_id(1); // indices_num int4 coord = (int4)(0, gidy, gidx, 0); - int4 indice = read_imagei(input1, coord.xy); + Image img = create_image_from_image2d(input1, 4); + uchar* indice_ptr = get_image_ptr_from_coord(img, coord.xy); + int4 indice = ((int4 *)indice_ptr)[0]; + indice.x = indice.x * block_size + gidx; vxc_short8 src; diff --git a/src/tim/vx/internal/src/libnnext/ops/vx/gather_nd_2d_mix.vx b/src/tim/vx/internal/src/libnnext/ops/vx/gather_nd_2d_mix.vx index 6b0be59..e45482c 100644 --- a/src/tim/vx/internal/src/libnnext/ops/vx/gather_nd_2d_mix.vx +++ b/src/tim/vx/internal/src/libnnext/ops/vx/gather_nd_2d_mix.vx @@ -26,7 +26,10 @@ __kernel void gather_nd_##src0_type_name##toF16_2D( \ int gidy = get_global_id(1); \ \ int4 coord = (int4)(0, gidy, gidx, 0); \ - int4 indice = read_imagei(input1, coord.xy); \ + Image img = create_image_from_image2d(input1, 4); \ + uchar* indice_ptr = get_image_ptr_from_coord(img, coord.xy); \ + int4 indice = ((int4 *)indice_ptr)[0]; \ + \ indice.x = indice.x * block_size + gidx; \ \ read_type src; \ @@ -57,7 +60,10 @@ __kernel void gather_nd_F16to##src1_type_name##_2D( \ int gidy = get_global_id(1); \ \ int4 coord = (int4)(0, gidy, gidx, 0); \ - int4 indice = read_imagei(input1, coord.xy); \ + Image img = create_image_from_image2d(input1, 4); \ + uchar* indice_ptr = get_image_ptr_from_coord(img, coord.xy); \ + int4 indice = ((int4 *)indice_ptr)[0]; \ + \ indice.x = indice.x * block_size + gidx; \ \ vxc_short8 src; \ diff --git a/src/tim/vx/internal/src/libnnext/ops/vx/gather_nd_3d.vx b/src/tim/vx/internal/src/libnnext/ops/vx/gather_nd_3d.vx index 2aa9d4c..566aaa5 100644 --- a/src/tim/vx/internal/src/libnnext/ops/vx/gather_nd_3d.vx +++ b/src/tim/vx/internal/src/libnnext/ops/vx/gather_nd_3d.vx @@ 
-12,7 +12,10 @@ __kernel void gather_nd_I8toI8_3D( int gidy = get_global_id(1); // indices_num int4 coord = (int4)(0, gidy, gidx, 0); - int4 indice = read_imagei(input1, coord.xy); + Image img = create_image_from_image2d(input1, 4); + uchar* indice_ptr = get_image_ptr_from_coord(img, coord.xy); + int4 indice = ((int4 *)indice_ptr)[0]; + indice.x = indice.x * block_size + gidx; indice.w = 0; @@ -34,7 +37,11 @@ __kernel void gather_nd_U8toU8_3D( int gidy = get_global_id(1); // indices_num int4 coord = (int4)(0, gidy, gidx, 0); - int4 indice = read_imagei(input1, coord.xy); + + Image img = create_image_from_image2d(input1, 4); + uchar* indice_ptr = get_image_ptr_from_coord(img, coord.xy); + int4 indice = ((int4 *)indice_ptr)[0]; + indice.x = indice.x * block_size + gidx; indice.w = 0; @@ -55,7 +62,10 @@ __kernel void gather_nd_I16toI16_3D( int gidy = get_global_id(1); // indices_num int4 coord = (int4)(0, gidy, gidx, 0); - int4 indice = read_imagei(input1, coord.xy); + Image img = create_image_from_image2d(input1, 4); + uchar* indice_ptr = get_image_ptr_from_coord(img, coord.xy); + int4 indice = ((int4 *)indice_ptr)[0]; + indice.x = indice.x * block_size + gidx; indice.w = 0; @@ -76,7 +86,10 @@ __kernel void gather_nd_F16toF16_3D( int gidy = get_global_id(1); // indices_num int4 coord = (int4)(0, gidy, gidx, 0); - int4 indice = read_imagei(input1, coord.xy); + Image img = create_image_from_image2d(input1, 4); + uchar* indice_ptr = get_image_ptr_from_coord(img, coord.xy); + int4 indice = ((int4 *)indice_ptr)[0]; + indice.x = indice.x * block_size + gidx; indice.w = 0; @@ -84,3 +97,4 @@ __kernel void gather_nd_F16toF16_3D( VXC_ReadImage2DArray(src, input0, indice, 0, VXC_MODIFIER(0, 0, 0, VXC_RM_TowardZero, 0)); VXC_WriteImage(output, coord.zy, src, VXC_MODIFIER(0, 0, 0, VXC_RM_TowardZero, 0)); } + diff --git a/src/tim/vx/internal/src/libnnext/ops/vx/gather_nd_3d_mix.vx b/src/tim/vx/internal/src/libnnext/ops/vx/gather_nd_3d_mix.vx index 3d92bef..e9ca9ec 100644 --- a/src/tim/vx/internal/src/libnnext/ops/vx/gather_nd_3d_mix.vx +++ b/src/tim/vx/internal/src/libnnext/ops/vx/gather_nd_3d_mix.vx @@ -23,7 +23,10 @@ __kernel void gather_nd_##src0_type_name##toF16_3D( \ int gidy = get_global_id(1); \ \ int4 coord = (int4)(0, gidy, gidx, 0); \ - int4 indice = read_imagei(input1, coord.xy); \ + Image img = create_image_from_image2d(input1, 4); \ + uchar* indice_ptr = get_image_ptr_from_coord(img, coord.xy); \ + int4 indice = ((int4 *)indice_ptr)[0]; \ + \ indice.x = indice.x * block_size + gidx; \ indice.w = 0; \ \ @@ -55,7 +58,10 @@ __kernel void gather_nd_F16to##src1_type_name##_3D( \ int gidy = get_global_id(1); \ \ int4 coord = (int4)(0, gidy, gidx, 0); \ - int4 indice = read_imagei(input1, coord.xy); \ + Image img = create_image_from_image2d(input1, 4); \ + uchar* indice_ptr = get_image_ptr_from_coord(img, coord.xy); \ + int4 indice = ((int4 *)indice_ptr)[0]; \ + \ indice.x = indice.x * block_size + gidx; \ indice.w = 0; \ \ diff --git a/src/tim/vx/internal/src/libnnext/ops/vx/gather_nd_mix.vx b/src/tim/vx/internal/src/libnnext/ops/vx/gather_nd_mix.vx index 770498b..8288ab0 100644 --- a/src/tim/vx/internal/src/libnnext/ops/vx/gather_nd_mix.vx +++ b/src/tim/vx/internal/src/libnnext/ops/vx/gather_nd_mix.vx @@ -26,7 +26,10 @@ __kernel void gather_nd_##src0_type_name##toF16_1D( \ int gidy = get_global_id(1); \ \ int4 coord = (int4)(0, gidy, gidx, 0); \ - int4 indice = read_imagei(input1, coord.xy); \ + Image img = create_image_from_image2d(input1, 4); \ + uchar* indice_ptr = get_image_ptr_from_coord(img, 
coord.xy); \ + int4 indice = ((int4 *)indice_ptr)[0]; \ + \ coord.w = indice.x; \ \ read_type src; \ @@ -57,7 +60,10 @@ __kernel void gather_nd_F16to##src1_type_name##_1D( \ int gidy = get_global_id(1); \ \ int4 coord = (int4)(0, gidy, gidx, 0); \ - int4 indice = read_imagei(input1, coord.xy); \ + Image img = create_image_from_image2d(input1, 4); \ + uchar* indice_ptr = get_image_ptr_from_coord(img, coord.xy); \ + int4 indice = ((int4 *)indice_ptr)[0]; \ + \ coord.w = indice.x; \ \ vxc_short8 src; \ diff --git a/src/tim/vx/internal/src/libnnext/ops/vx/group_normalization_f16.vx b/src/tim/vx/internal/src/libnnext/ops/vx/group_normalization_f16.vx new file mode 100644 index 0000000..161383d --- /dev/null +++ b/src/tim/vx/internal/src/libnnext/ops/vx/group_normalization_f16.vx @@ -0,0 +1,306 @@ +#include "cl_viv_vx_ext.h" + +_viv_uniform int width; +_viv_uniform int height; +_viv_uniform VXC_512Bits UniFP16toFP32Lo4_dp4x4; +_viv_uniform VXC_512Bits uniConvertHalfToFp16_2x8; +_viv_uniform VXC_512Bits uniConvertInt32toUint8_2x8; + +_viv_uniform VXC_512Bits uniFp16SumSqr_dp8x2; +_viv_uniform VXC_512Bits uniConvertEndInt16Fp32_4x4; + +_viv_uniform float outputScale; +_viv_uniform int output_ZP; + +__kernel __attribute__((reqd_work_group_size(16, 1, 1))) void group_norm_sumsqr_F16( + image2d_array_t input, image2d_array_t output, float eps, int is2D) +{ + int gidx = get_global_id(0) << 3; + int lidx = get_local_id(0); + int gidz = get_global_id(1); + int4 coord = (int4)(gidx, 0, gidz, 0); + vxc_short8 src0; + vxc_half8 in_h; + vxc_float4 sumsqr; + vxc_float4 tmpSumSqr = (vxc_float4)(0); + + __local float lcl_sum[16]; + __local float lcl_sqr[16]; + + int8 input_desc; + _viv_asm(COPY, input_desc, input, sizeof(input_desc)); + int baseAddr_a = (int)get_global_id(1) * input_desc.s4 + input_desc.s0; + _viv_asm(MOV, coord.z, baseAddr_a); + + if(gidx < width) + { + for(coord.y = 0; coord.y < height;) + { + VXC_OP4(img_load_3d, src0, input, coord, VXC_5BITOFFSET_XY(0, 0), \ + VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); + coord.y++; + _viv_asm(COPY, in_h, src0, 16); + VXC_DP8x2(sumsqr, in_h, in_h, VXC_MODIFIER(0, 1, 0, VXC_RM_TowardZero, 0),\ + uniFp16SumSqr_dp8x2); + tmpSumSqr += sumsqr; + } + } + + lcl_sum[lidx] = tmpSumSqr.x; + lcl_sqr[lidx] = tmpSumSqr.y; + barrier(CLK_LOCAL_MEM_FENCE); + + int4 coord_out = (int4)(get_group_id(0) << 2, gidz, 0, 0); + if(lidx == 0) + { + float4 one = (float4)(1, 1, 1, 1); + __local float4* tmp_sum = (__local float4*)lcl_sum; + __local float4* tmp_sqr = (__local float4*)lcl_sqr; + + float sum = 0; + float sqr = 0; + for(int i = 0; i < 4; i++) + { + //sum += lcl_sum[i]; + //sqr += lcl_sqr[i]; + sum += dot(tmp_sum[i], one); + sqr += dot(tmp_sqr[i], one); + } + + float4 data = (float4)(sum, sqr, 0, 0); + write_imagef(output, coord_out, data); + } +} + +__kernel __attribute__((reqd_work_group_size(16, 1, 1))) void group_norm_sumsqr_F16_2D( + image2d_array_t input, image2d_array_t output, float eps, int is2D) +{ + int gidx = get_global_id(0) << 3; + int lidx = get_local_id(0); + + int2 coord = (int2)(gidx, get_global_id(1)); + vxc_short8 src0; + vxc_half8 in_h; + vxc_float4 sumsqr = (vxc_float4)(0); + + __local float lcl_sum[16]; + __local float lcl_sqr[16]; + + if(gidx < width) + { + VXC_ReadImage(src0, input, coord, VXC_5BITOFFSET_XY(0, 0), + VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); + _viv_asm(COPY, in_h, src0, 16); + VXC_DP8x2(sumsqr, in_h, in_h, VXC_MODIFIER(0, 1, 0, VXC_RM_TowardZero, 0),\ + uniFp16SumSqr_dp8x2); + } + + lcl_sum[lidx] = sumsqr.x; + lcl_sqr[lidx] = 
sumsqr.y; + barrier(CLK_LOCAL_MEM_FENCE); + + int4 coord_out = (int4)(get_group_id(0) << 2, get_global_id(1), 0, 0); + if(lidx == 0) + { + float4 one = (float4)(1, 1, 1, 1); + __local float4* tmp_sum = (__local float4*)lcl_sum; + __local float4* tmp_sqr = (__local float4*)lcl_sqr; + + float sum = 0; + float sqr = 0; + for(int i = 0; i < 4; i++) + { + //sum += lcl_sum[i]; + //sqr += lcl_sqr[i]; + sum += dot(tmp_sum[i], one); + sqr += dot(tmp_sqr[i], one); + } + + float4 data = (float4)(sum, sqr, 0, 0); + write_imagef(output, coord_out, data); + } +} + +__kernel __attribute__((reqd_work_group_size(16, 1, 1))) void group_norm_F16toF16( + image2d_array_t input, image2d_t bias, image2d_t scale, image2d_t meanVari, image2d_array_t output, + float eps, int is2D, float rSpaceOrg, int pStride) +{ + int gidy = get_global_id(1); + int gidz = get_global_id(2); + int4 coord = (int4)(get_global_id(0), gidy, gidz, 0); + int4 coord_para = (int4)((convert_int(get_global_id(0) * rSpaceOrg) + gidy * pStride), gidz, 0, 0); + vxc_short8 src0; + vxc_short8 src1; + vxc_half8 scale_h, in_h; + float scale_vari, bias_val; + vxc_float4 bias_f, scale_f; + + vxc_float4 mean_vari = read_imagef(meanVari, coord_para.zy); + VXC_ReadImage(src1, scale, coord_para.xy, VXC_5BITOFFSET_XY(0, 0),\ + VXC_MODIFIER(0, 0, 0, VXC_RM_TowardZero, 0)); + bias_f = read_imagef(bias, coord_para.xy); + VXC_ReadImage2DArray(src0, input, coord, VXC_5BITOFFSET_XY(0, 0), VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); + + _viv_asm(COPY, scale_h, src1, 16); + VXC_DP4x4(scale_f, scale_h, scale_h, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), UniFP16toFP32Lo4_dp4x4); + + scale_vari = scale_f.s0 * mean_vari.s1; + vxc_float4 tmpData0, tmpData1; + vxc_short8 outval; + half4 tmpVal0, tmpVal1; + bias_val = (bias_f.s0 - scale_vari * mean_vari.s0); + vxc_half8 dst; + + _viv_asm(COPY, in_h, src0, 16); + VXC_DP4x4(tmpData0, in_h, in_h, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0),\ + UniFP16toFP32Lo4_dp4x4); + VXC_DP4x4(tmpData1, in_h, in_h, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0),\ + uniConvertEndInt16Fp32_4x4); + + vxc_float4 norm; + norm = scale_vari * tmpData0 + bias_val; + _viv_asm(CONV, tmpVal0, norm); + norm = scale_vari * tmpData1 + bias_val; + _viv_asm(CONV, tmpVal1, norm); + VXC_DP2x8(dst, tmpVal0, tmpVal1, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0),\ + uniConvertHalfToFp16_2x8); + _viv_asm(COPY, outval, dst, 16); + VXC_WriteImage2DArray(output, coord, outval, VXC_MODIFIER(0, 7, 0,VXC_RM_TowardZero, 0)); +} + +__kernel __attribute__((reqd_work_group_size(16, 1, 1))) void group_norm_F16toF16_2D( + image2d_array_t input, image2d_t bias, image2d_t scale, image2d_t meanVari, image2d_array_t output, + float eps, int is2D, float rSpaceOrg, int pStride) +{ + int gidz = get_global_id(1); + int2 coord = (int2)(get_global_id(0), gidz); + int4 coord_para = (int4)(convert_int(get_global_id(0) * rSpaceOrg), gidz, 0, 0); + vxc_short8 src0; + vxc_short8 src1; + vxc_half8 scale_h, in_h; + float scale_vari, bias_val; + vxc_float4 bias_f, scale_f; + + vxc_float4 mean_vari = read_imagef(meanVari, coord_para.zy); + VXC_ReadImage(src1, scale, coord_para.xy, VXC_5BITOFFSET_XY(0, 0),\ + VXC_MODIFIER(0, 0, 0, VXC_RM_TowardZero, 0)); + bias_f = read_imagef(bias, coord_para.xy); + VXC_ReadImage(src0, input, coord.xy, VXC_5BITOFFSET_XY(0, 0),\ + VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); + + _viv_asm(COPY, scale_h, src1, 16); + VXC_DP4x4(scale_f, scale_h, scale_h, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), UniFP16toFP32Lo4_dp4x4); + + scale_vari = scale_f.s0 * 
mean_vari.s1; + vxc_float4 tmpData0, tmpData1; + vxc_short8 outval; + half4 tmpVal0, tmpVal1; + bias_val = (bias_f.s0 - scale_vari * mean_vari.s0); + vxc_half8 dst; + + _viv_asm(COPY, in_h, src0, 16); + VXC_DP4x4(tmpData0, in_h, in_h, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0),\ + UniFP16toFP32Lo4_dp4x4); + VXC_DP4x4(tmpData1, in_h, in_h, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0),\ + uniConvertEndInt16Fp32_4x4); + vxc_float4 norm; + norm = scale_vari * tmpData0 + bias_val; + _viv_asm(CONV, tmpVal0, norm); + norm = scale_vari * tmpData1 + bias_val; + _viv_asm(CONV, tmpVal1, norm); + VXC_DP2x8(dst, tmpVal0, tmpVal1, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0),\ + uniConvertHalfToFp16_2x8); + _viv_asm(COPY, outval, dst, 16); + VXC_WriteImage(output, coord.xy, outval, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); +} + +__kernel __attribute__((reqd_work_group_size(16, 1, 1))) void group_norm_F16toU8( + image2d_array_t input, image2d_t bias, image2d_t scale, image2d_t meanVari, image2d_array_t output, + float eps, int is2D, float rSpaceOrg, int pStride) +{ + int gidy = get_global_id(1); + int gidz = get_global_id(2); + int4 coord = (int4)(get_global_id(0), gidy, gidz, 0); + int4 coord_para = (int4)((convert_int(get_global_id(0) * rSpaceOrg) + gidy * pStride), gidz, 0, 0); + vxc_short8 src0; + vxc_short8 src1; + vxc_half8 scale_h, in_h; + float scale_vari, bias_val; + vxc_float4 bias_f, scale_f; + + vxc_float4 mean_vari = read_imagef(meanVari, coord_para.zy); + VXC_ReadImage(src1, scale, coord_para.xy, VXC_5BITOFFSET_XY(0, 0),\ + VXC_MODIFIER(0, 0, 0, VXC_RM_TowardZero, 0)); + bias_f = read_imagef(bias, coord_para.xy); + VXC_ReadImage2DArray(src0, input, coord, VXC_5BITOFFSET_XY(0, 0), VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); + + _viv_asm(COPY, scale_h, src1, 16); + VXC_DP4x4(scale_f, scale_h, scale_h, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), UniFP16toFP32Lo4_dp4x4); + + scale_vari = scale_f.s0 * mean_vari.s1; + vxc_float4 tmpData0, tmpData1; + vxc_uchar16 outval; + vxc_int4 tmpVal0, tmpVal1; + float alpha = outputScale * scale_vari; + bias_val = (bias_f.s0 - scale_vari * mean_vari.s0) * outputScale + output_ZP; + + _viv_asm(COPY, in_h, src0, 16); + VXC_DP4x4(tmpData0, in_h, in_h, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0),\ + UniFP16toFP32Lo4_dp4x4); + VXC_DP4x4(tmpData1, in_h, in_h, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0),\ + uniConvertEndInt16Fp32_4x4); + + vxc_float4 norm; + norm = alpha * tmpData0 + bias_val; + tmpVal0 = convert_int4_rte(norm); + norm = alpha * tmpData1 + bias_val; + tmpVal1 = convert_int4_rte(norm); + VXC_DP2x8(outval, tmpVal0, tmpVal1, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 1),\ + uniConvertInt32toUint8_2x8); + VXC_WriteImage2DArray(output, coord, outval, VXC_MODIFIER(0, 7, 0,VXC_RM_TowardZero, 0)); +} + +__kernel __attribute__((reqd_work_group_size(16, 1, 1))) void group_norm_F16toU8_2D( + image2d_array_t input, image2d_t bias, image2d_t scale, image2d_t meanVari, image2d_array_t output, + float eps, int is2D, float rSpaceOrg, int pStride) +{ + int gidz = get_global_id(1); + int2 coord = (int2)(get_global_id(0), gidz); + int4 coord_para = (int4)(convert_int(get_global_id(0) * rSpaceOrg), gidz, 0, 0); + vxc_short8 src0; + vxc_short8 src1; + vxc_half8 scale_h, in_h; + float scale_vari, bias_val; + vxc_float4 bias_f, scale_f; + + vxc_float4 mean_vari = read_imagef(meanVari, coord_para.zy); + VXC_ReadImage(src1, scale, coord_para.xy, VXC_5BITOFFSET_XY(0, 0),\ + VXC_MODIFIER(0, 0, 0, VXC_RM_TowardZero, 0)); + bias_f = read_imagef(bias, coord_para.xy); + 
VXC_ReadImage(src0, input, coord.xy, VXC_5BITOFFSET_XY(0, 0),\ + VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); + + _viv_asm(COPY, scale_h, src1, 16); + VXC_DP4x4(scale_f, scale_h, scale_h, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), UniFP16toFP32Lo4_dp4x4); + + scale_vari = scale_f.s0 * mean_vari.s1; + vxc_float4 tmpData0, tmpData1; + vxc_uchar16 outval; + vxc_int4 tmpVal0, tmpVal1; + float alpha = outputScale * scale_vari; + bias_val = (bias_f.s0 - scale_vari * mean_vari.s0) * outputScale + output_ZP; + + _viv_asm(COPY, in_h, src0, 16); + VXC_DP4x4(tmpData0, in_h, in_h, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0),\ + UniFP16toFP32Lo4_dp4x4); + VXC_DP4x4(tmpData1, in_h, in_h, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0),\ + uniConvertEndInt16Fp32_4x4); + vxc_float4 norm; + norm = alpha * tmpData0 + bias_val; + tmpVal0 = convert_int4_rte(norm); + norm = alpha * tmpData1 + bias_val; + tmpVal1 = convert_int4_rte(norm); + VXC_DP2x8(outval, tmpVal0, tmpVal1, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 1),\ + uniConvertInt32toUint8_2x8); + VXC_WriteImage(output, coord.xy, outval, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); +} \ No newline at end of file diff --git a/src/tim/vx/internal/src/libnnext/ops/vx/group_normalization_i16.vx b/src/tim/vx/internal/src/libnnext/ops/vx/group_normalization_i16.vx new file mode 100644 index 0000000..1282e00 --- /dev/null +++ b/src/tim/vx/internal/src/libnnext/ops/vx/group_normalization_i16.vx @@ -0,0 +1,339 @@ +#include "cl_viv_vx_ext.h" + +_viv_uniform int width; +_viv_uniform int height; +_viv_uniform VXC_512Bits UniFP16toFP32Lo4_dp4x4; +_viv_uniform VXC_512Bits uniConvertHalfToFp16_2x8; + +_viv_uniform float inFlScale_s2; +_viv_uniform float input_fl_scale; +_viv_uniform float inOut_fl_scale; +_viv_uniform float output_fl_scale; + +_viv_uniform VXC_512Bits uniInt16SumSqr_dp8x2; +_viv_uniform VXC_512Bits uniConvertInt16Fp32Fst_4x4; +_viv_uniform VXC_512Bits uniConvertInt16Fp32Secd_4x4; +_viv_uniform VXC_512Bits uniConvertInt32toInt16_2x8; + +__kernel __attribute__((reqd_work_group_size(16, 1, 1))) void group_norm_sumsqr_I16( + image2d_array_t input, + image2d_array_t output, + float eps, + int is2D) +{ + int gidx = get_global_id(0) << 3; + int lidx = get_local_id(0); + int gidz = get_global_id(1); + int4 coord = (int4)(gidx, 0, gidz, 0); + vxc_short8 src0; + float sum = 0, sqr = 0; + vxc_float4 sumsqr = (vxc_float4)(0); + vxc_float4 tmpSumSqr = (vxc_float4)(0); + + __local float lcl_sum[16]; + __local float lcl_sqr[16]; + + int8 input_desc; + _viv_asm(COPY, input_desc, input, sizeof(input_desc)); + int baseAddr_a = (int)get_global_id(1) * input_desc.s4 + input_desc.s0; + _viv_asm(MOV, coord.z, baseAddr_a); + if(gidx < width) + { + for(coord.y = 0; coord.y < height;) + { + VXC_OP4(img_load_3d, src0, input, coord, VXC_5BITOFFSET_XY(0, 0), \ + VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); + coord.y++; + VXC_DP8x2(sumsqr, src0, src0, VXC_MODIFIER(0, 1, 0, VXC_RM_TowardZero, 0),\ + uniInt16SumSqr_dp8x2); + //tmpSumSqr += sumsqr; + tmpSumSqr.x += sumsqr.x; + sqr += (sumsqr.y * inFlScale_s2); + } + sum = tmpSumSqr.x * input_fl_scale; + //sqr = tmpSumSqr.y * inFlScale_s2; + } + + lcl_sum[lidx] = sum; + lcl_sqr[lidx] = sqr; + barrier(CLK_LOCAL_MEM_FENCE); + + int4 coord_out = (int4)(get_group_id(0) << 2, gidz, 0, 0); + if(lidx == 0) + { + float4 one = (float4)(1, 1, 1, 1); + __local float4* tmp_sum = (__local float4*)lcl_sum; + __local float4* tmp_sqr = (__local float4*)lcl_sqr; + + sum = 0; sqr = 0; + for(int i = 0; i < 4; i++) + { + //sum += lcl_sum[i]; + //sqr += 
lcl_sqr[i]; + sum += dot(tmp_sum[i], one); + sqr += dot(tmp_sqr[i], one); + } + + float4 data = (float4)(sum, sqr, 0, 0); + write_imagef(output, coord_out, data); + } +} + +__kernel __attribute__((reqd_work_group_size(16, 1, 1))) void group_norm_sumsqr_I16_2D( + image2d_array_t input, + image2d_array_t output, + float eps, + int is2D) +{ + int gidx = get_global_id(0) << 3; + int lidx = get_local_id(0); + int gidz = get_global_id(1); + + int2 coord = (int2)(gidx, gidz); + vxc_short8 src0; + float sum = 0, sqr = 0; + vxc_float4 sumsqr = (vxc_float4)(0); + + __local float lcl_sum[16]; + __local float lcl_sqr[16]; + + if(gidx < width) + { + VXC_ReadImage(src0, input, coord.xy, VXC_5BITOFFSET_XY(0, 0), + VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); + VXC_DP8x2(sumsqr, src0, src0, VXC_MODIFIER(0, 1, 0, VXC_RM_TowardZero, 0),\ + uniInt16SumSqr_dp8x2); + sqr = sumsqr.y * inFlScale_s2; + sum = sumsqr.x * input_fl_scale; + //sqr = tmpSumSqr.y * inFlScale_s2; + } + + lcl_sum[lidx] = sum; + lcl_sqr[lidx] = sqr; + barrier(CLK_LOCAL_MEM_FENCE); + + int4 coord_out = (int4)(get_group_id(0) << 2, gidz, 0, 0); + if(lidx == 0) + { + float4 one = (float4)(1, 1, 1, 1); + __local float4* tmp_sum = (__local float4*)lcl_sum; + __local float4* tmp_sqr = (__local float4*)lcl_sqr; + + sum = 0; sqr = 0; + for(int i = 0; i < 4; i++) + { + //sum += lcl_sum[i]; + //sqr += lcl_sqr[i]; + sum += dot(tmp_sum[i], one); + sqr += dot(tmp_sqr[i], one); + } + + float4 data = (float4)(sum, sqr, 0, 0); + write_imagef(output, coord_out, data); + } +} + +__kernel __attribute__((reqd_work_group_size(16, 1, 1))) void group_norm_I16toF16( + image2d_array_t input, + image2d_t bias, + image2d_t scale, + image2d_t meanVari, + image2d_array_t output, + float eps, + int is2D, + float rSpaceOrg, int pStride) +{ + int gidy = get_global_id(1); + int gidz = get_global_id(2); + int4 coord = (int4)(get_global_id(0), gidy, gidz, 0); + int4 coord_para = (int4)((convert_int(get_global_id(0) * rSpaceOrg) + gidy * pStride), gidz, 0, 0); + vxc_short8 src0; + vxc_short8 src1; + vxc_half8 scale_h; + float scale_vari, bias_val; + vxc_float4 bias_f, scale_f; + + vxc_float4 mean_vari = read_imagef(meanVari, coord_para.zy); + VXC_ReadImage(src1, scale, coord_para.xy, VXC_5BITOFFSET_XY(0, 0),\ + VXC_MODIFIER(0, 0, 0, VXC_RM_TowardZero, 0)); + bias_f = read_imagef(bias, coord_para.xy); + VXC_ReadImage2DArray(src0, input, coord, VXC_5BITOFFSET_XY(0, 0), VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); + + _viv_asm(COPY, scale_h, src1, 16); + VXC_DP4x4(scale_f, scale_h, scale_h, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0),\ + UniFP16toFP32Lo4_dp4x4); + + scale_vari = scale_f.s0 * mean_vari.s1; + vxc_float4 tmpData0, tmpData1; + vxc_short8 outval; + half4 tmpVal0, tmpVal1; + float alpha = input_fl_scale * scale_vari; + bias_val = (bias_f.s0 - scale_vari * mean_vari.s0); + vxc_half8 dst; + + VXC_DP4x4(tmpData0, src0, src0, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0),\ + uniConvertInt16Fp32Fst_4x4); + VXC_DP4x4(tmpData1, src0, src0, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0),\ + uniConvertInt16Fp32Secd_4x4); + + vxc_float4 norm; + norm = alpha * tmpData0 + bias_val; + _viv_asm(CONV, tmpVal0, norm); + norm = alpha * tmpData1 + bias_val; + _viv_asm(CONV, tmpVal1, norm); + VXC_DP2x8(dst, tmpVal0, tmpVal1, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0),\ + uniConvertHalfToFp16_2x8); + _viv_asm(COPY, outval, dst, 16); + VXC_WriteImage2DArray(output, coord, outval, VXC_MODIFIER(0, 7, 0,VXC_RM_TowardZero, 0)); +} + +__kernel __attribute__((reqd_work_group_size(16, 1, 1))) 
void group_norm_I16toF16_2D( + image2d_array_t input, + image2d_t bias, + image2d_t scale, + image2d_t meanVari, + image2d_array_t output, + float eps, + int is2D, + float rSpaceOrg, int pStride) +{ + int gidz = get_global_id(1); + int2 coord = (int2)(get_global_id(0), gidz); + int4 coord_para = (int4)(convert_int(get_global_id(0) * rSpaceOrg), gidz, 0, 0); + vxc_short8 src0; + vxc_short8 src1; + vxc_half8 scale_h; + float scale_vari, bias_val; + vxc_float4 bias_f, scale_f; + + vxc_float4 mean_vari = read_imagef(meanVari, coord_para.zy); + VXC_ReadImage(src1, scale, coord_para.xy, VXC_5BITOFFSET_XY(0, 0),\ + VXC_MODIFIER(0, 0, 0, VXC_RM_TowardZero, 0)); + bias_f = read_imagef(bias, coord_para.xy); + VXC_ReadImage(src0, input, coord.xy, VXC_5BITOFFSET_XY(0, 0),\ + VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); + + _viv_asm(COPY, scale_h, src1, 16); + VXC_DP4x4(scale_f, scale_h, scale_h, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0),\ + UniFP16toFP32Lo4_dp4x4); + + scale_vari = scale_f.s0 * mean_vari.s1; + vxc_float4 tmpData0, tmpData1; + vxc_short8 outval; + half4 tmpVal0, tmpVal1; + float alpha = input_fl_scale * scale_vari; + bias_val = (bias_f.s0 - scale_vari * mean_vari.s0); + vxc_half8 dst; + + VXC_DP4x4(tmpData0, src0, src0, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0),\ + uniConvertInt16Fp32Fst_4x4); + VXC_DP4x4(tmpData1, src0, src0, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0),\ + uniConvertInt16Fp32Secd_4x4); + vxc_float4 norm; + norm = alpha * tmpData0 + bias_val; + _viv_asm(CONV, tmpVal0, norm); + norm = alpha * tmpData1 + bias_val; + _viv_asm(CONV, tmpVal1, norm); + VXC_DP2x8(dst, tmpVal0, tmpVal1, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0),\ + uniConvertHalfToFp16_2x8); + _viv_asm(COPY, outval, dst, 16); + VXC_WriteImage(output, coord.xy, outval, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); +} + +__kernel __attribute__((reqd_work_group_size(16, 1, 1))) void group_norm_I16toI16( + image2d_array_t input, + image2d_t bias, + image2d_t scale, + image2d_t meanVari, + image2d_array_t output, + float eps, + int is2D, + float rSpaceOrg, int pStride) +{ + int gidy = get_global_id(1); + int gidz = get_global_id(2); + int4 coord = (int4)(get_global_id(0), gidy, gidz, 0); + int4 coord_para = (int4)((convert_int(get_global_id(0) * rSpaceOrg) + gidy * pStride), gidz, 0, 0); + vxc_short8 src0, src2; + vxc_short8 src1; + vxc_half8 scale_h; + float scale_vari, bias_val; + vxc_float4 bias_f, scale_f; + + vxc_float4 mean_vari = read_imagef(meanVari, coord_para.zy); + VXC_ReadImage(src1, scale, coord_para.xy, VXC_5BITOFFSET_XY(0, 0),\ + VXC_MODIFIER(0, 0, 0, VXC_RM_TowardZero, 0)); + bias_f = read_imagef(bias, coord_para.xy); + VXC_ReadImage2DArray(src0, input, coord, VXC_5BITOFFSET_XY(0, 0), VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); + + _viv_asm(COPY, scale_h, src1, 16); + VXC_DP4x4(scale_f, scale_h, scale_h, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), UniFP16toFP32Lo4_dp4x4); + + scale_vari = scale_f.s0 * mean_vari.s1; + vxc_int4 tmpVal0, tmpVal1; + vxc_float4 tmpData0, tmpData1; + float alpha = inOut_fl_scale * scale_vari; + bias_val = (bias_f.s0 - scale_vari * mean_vari.s0) * output_fl_scale; + + VXC_DP4x4(tmpData0, src0, src0, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0),\ + uniConvertInt16Fp32Fst_4x4); + VXC_DP4x4(tmpData1, src0, src0, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0),\ + uniConvertInt16Fp32Secd_4x4); + vxc_float4 norm; + norm = tmpData0 * alpha + bias_val; + tmpVal0 = convert_int4_rte(norm); + norm = tmpData1 * alpha + bias_val; + tmpVal1 = convert_int4_rte(norm); + 
VXC_DP2x8(src2, tmpVal0, tmpVal1, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 1),\ + uniConvertInt32toInt16_2x8); + VXC_WriteImage2DArray(output, coord, src2, VXC_MODIFIER(0, 7, 0,VXC_RM_TowardZero, 0)); +} + +__kernel __attribute__((reqd_work_group_size(16, 1, 1))) void group_norm_I16toI16_2D( + image2d_array_t input, + image2d_t bias, + image2d_t scale, + image2d_t meanVari, + image2d_array_t output, + float eps, + int is2D, + float rSpaceOrg, int pStride) +{ + int gidz = get_global_id(1); + int2 coord = (int2)(get_global_id(0), gidz); + int4 coord_para = (int4)(convert_int(get_global_id(0) * rSpaceOrg), gidz, 0, 0); + vxc_short8 src0, src2; + vxc_short8 src1; + vxc_half8 scale_h; + float scale_vari, bias_val; + vxc_float4 bias_f, scale_f; + + vxc_float4 mean_vari = read_imagef(meanVari, coord_para.zy); + VXC_ReadImage(src1, scale, coord_para.xy, VXC_5BITOFFSET_XY(0, 0),\ + VXC_MODIFIER(0, 0, 0, VXC_RM_TowardZero, 0)); + bias_f = read_imagef(bias, coord_para.xy); + VXC_ReadImage(src0, input, coord.xy, VXC_5BITOFFSET_XY(0, 0),\ + VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); + + _viv_asm(COPY, scale_h, src1, 16); + VXC_DP4x4(scale_f, scale_h, scale_h, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), UniFP16toFP32Lo4_dp4x4); + + scale_vari = scale_f.s0 * mean_vari.s1; + vxc_int4 tmpVal0, tmpVal1; + vxc_float4 tmpData0, tmpData1; + float alpha = inOut_fl_scale * scale_vari; + bias_val = (bias_f.s0 - scale_vari * mean_vari.s0) * output_fl_scale; + + VXC_DP4x4(tmpData0, src0, src0, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0),\ + uniConvertInt16Fp32Fst_4x4); + VXC_DP4x4(tmpData1, src0, src0, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0),\ + uniConvertInt16Fp32Secd_4x4); + vxc_float4 norm; + norm = tmpData0 * alpha + bias_val; + tmpVal0 = convert_int4_rte(norm); + norm = tmpData1 * alpha + bias_val; + tmpVal1 = convert_int4_rte(norm); + VXC_DP2x8(src2, tmpVal0, tmpVal1, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 1),\ + uniConvertInt32toInt16_2x8); + VXC_WriteImage(output, coord.xy, src2, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); +} \ No newline at end of file diff --git a/src/tim/vx/internal/src/libnnext/ops/vx/group_normalization_i8.vx b/src/tim/vx/internal/src/libnnext/ops/vx/group_normalization_i8.vx new file mode 100644 index 0000000..6a407a3 --- /dev/null +++ b/src/tim/vx/internal/src/libnnext/ops/vx/group_normalization_i8.vx @@ -0,0 +1,317 @@ +#include "cl_viv_vx_ext.h" + +_viv_uniform int width; +_viv_uniform int height; +_viv_uniform VXC_512Bits UniFP16toFP32Lo4_dp4x4; +_viv_uniform VXC_512Bits uniConvertHalfToFp16_2x8; +_viv_uniform VXC_512Bits uniConvertInt32toUint8_2x8; + +_viv_uniform VXC_512Bits uniSumInt8_16x1; +_viv_uniform VXC_512Bits uniSqrSumInt8_16x1; +_viv_uniform float inFlScale_s2; +_viv_uniform float input_fl_scale; + +_viv_uniform VXC_512Bits uniConvertDirInt8Fp32_4x4; +_viv_uniform VXC_512Bits uniConvertEndInt8Fp32_4x4; +_viv_uniform VXC_512Bits uniConvertTrdInt8Fp32_4x4; +_viv_uniform VXC_512Bits uniConvertFthInt8Fp32_4x4; + +_viv_uniform float inOut_fl_scale; +_viv_uniform float output_fl_scale; + +__kernel __attribute__((reqd_work_group_size(16, 1, 1))) void group_norm_sumsqr_I8( + image2d_array_t input, image2d_array_t output, float eps, int is2D) +{ + int gidx = get_global_id(0) << 4; + int lidx = get_local_id(0); + int gidz = get_global_id(1); + int4 coord = (int4)(gidx, 0, gidz, 0); + vxc_char16 src0; + float sum = 0, sqr = 0; + int tmpSum = 0, tmpSqr = 0, tmpSum1, tmpSqr1; + + __local float lcl_sum[16]; + __local float lcl_sqr[16]; + + int8 input_desc; + _viv_asm(COPY, 
input_desc, input, sizeof(input_desc)); + int baseAddr_a = (int)get_global_id(1) * input_desc.s4 + input_desc.s0; + _viv_asm(MOV, coord.z, baseAddr_a); + + if(gidx < width) + { + for(coord.y = 0; coord.y < height;) + { + VXC_OP4(img_load_3d, src0, input, coord, VXC_5BITOFFSET_XY(0, 0), \ + VXC_MODIFIER(0, 15, 0, VXC_RM_TowardZero, 0)); + coord.y++; + VXC_DP16x1(tmpSum1, src0, src0, VXC_MODIFIER(0, 0, 0, VXC_RM_TowardZero, 0), uniSumInt8_16x1); + tmpSum += (tmpSum1); + VXC_DP16x1(tmpSqr1, src0, src0, VXC_MODIFIER(0, 0, 0, VXC_RM_TowardZero, 0), uniSqrSumInt8_16x1); + tmpSqr += (tmpSqr1); + } + sqr = tmpSqr * inFlScale_s2; + sum = tmpSum * input_fl_scale; + } + lcl_sum[lidx] = sum; + lcl_sqr[lidx] = sqr; + barrier(CLK_LOCAL_MEM_FENCE); + + int4 coord_out = (int4)(get_group_id(0) << 2, gidz, 0, 0); + if(lidx == 0) + { + float4 one = (float4)(1, 1, 1, 1); + __local float4* tmp_sum = (__local float4*)lcl_sum; + __local float4* tmp_sqr = (__local float4*)lcl_sqr; + + sum = 0; sqr = 0; + for(int i = 0; i < 4; i++) + { + sum += dot(tmp_sum[i], one); + sqr += dot(tmp_sqr[i], one); + } + + float4 data = (float4)(sum, sqr, 0, 0); + write_imagef(output, coord_out, data); + } +} + +__kernel __attribute__((reqd_work_group_size(16, 1, 1))) void group_norm_sumsqr_I8_2D( + image2d_array_t input, image2d_array_t output, float eps, int is2D) +{ + int gidx = get_global_id(0) << 4; + int lidx = get_local_id(0); + int gidz = get_global_id(1); + + int2 coord = (int2)(gidx, gidz); + vxc_char16 src0; + float sum = 0, sqr = 0; + int tmpSum1, tmpSqr1; + + __local float lcl_sum[16]; + __local float lcl_sqr[16]; + + if(gidx < width) + { + VXC_ReadImage(src0, input, coord.xy, VXC_5BITOFFSET_XY(0, 0), + VXC_MODIFIER(0, 15, 0, VXC_RM_TowardZero, 0)); + VXC_DP16x1(tmpSum1, src0, src0, VXC_MODIFIER(0, 0, 0, VXC_RM_TowardZero, 0), uniSumInt8_16x1); + VXC_DP16x1(tmpSqr1, src0, src0, VXC_MODIFIER(0, 0, 0, VXC_RM_TowardZero, 0), uniSqrSumInt8_16x1); + sqr = tmpSqr1 * inFlScale_s2; + sum = tmpSum1 * input_fl_scale; + } + + lcl_sum[lidx] = sum; + lcl_sqr[lidx] = sqr; + barrier(CLK_LOCAL_MEM_FENCE); + + int4 coord_out = (int4)(get_group_id(0) << 2, gidz, 0, 0); + if(lidx == 0) + { + float4 one = (float4)(1, 1, 1, 1); + __local float4* tmp_sum = (__local float4*)lcl_sum; + __local float4* tmp_sqr = (__local float4*)lcl_sqr; + + sum = 0; sqr = 0; + for(int i = 0; i < 4; i++) + { + sum += dot(tmp_sum[i], one); + sqr += dot(tmp_sqr[i], one); + } + + float4 data = (float4)(sum, sqr, 0, 0); + write_imagef(output, coord_out, data); + } +} + +__kernel __attribute__((reqd_work_group_size(16, 1, 1))) void group_norm_I8toF16( + image2d_array_t input, image2d_t bias, image2d_t scale, image2d_t meanVari, + image2d_array_t output, float eps, int is2D, float rSpaceOrg, int pStride) +{ + int gidy = get_global_id(1); + int gidz = get_global_id(2); + int4 coord = (int4)(get_global_id(0), gidy, gidz, 0); + int4 coord_para = (int4)((convert_int(get_global_id(0) * rSpaceOrg) + gidy * pStride), gidz, 0, 0); + vxc_char16 src0; + vxc_short8 src1, outval; + vxc_half8 scale_h, dst; + float scale_vari, bias_val; + vxc_float4 bias_f, scale_f; + + vxc_float4 mean_vari = read_imagef(meanVari, coord_para.zy); + VXC_ReadImage(src1, scale, coord_para.xy, VXC_5BITOFFSET_XY(0, 0),\ + VXC_MODIFIER(0, 0, 0, VXC_RM_TowardZero, 0)); + bias_f = read_imagef(bias, coord_para.xy); + VXC_ReadImage2DArray(src0, input, coord, VXC_5BITOFFSET_XY(0, 0), VXC_MODIFIER(0, 15, 0, VXC_RM_TowardZero, 0)); + + _viv_asm(COPY, scale_h, src1, 16); + VXC_DP4x4(scale_f, scale_h, scale_h, 
VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), UniFP16toFP32Lo4_dp4x4); + + scale_vari = scale_f.s0 * mean_vari.s1; + vxc_float4 tmpData0, tmpData1, tmpData2, tmpData3, norm; + half4 tmpVal0, tmpVal1; + float alpha = input_fl_scale * scale_vari; + bias_val = (bias_f.s0 - scale_vari * mean_vari.s0); + + VXC_DP4x4(tmpData0, src0, src0, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniConvertDirInt8Fp32_4x4); + VXC_DP4x4(tmpData1, src0, src0, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniConvertEndInt8Fp32_4x4); + VXC_DP4x4(tmpData2, src0, src0, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniConvertTrdInt8Fp32_4x4); + VXC_DP4x4(tmpData3, src0, src0, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniConvertFthInt8Fp32_4x4); + + norm = alpha * tmpData0 + bias_val; + _viv_asm(CONV, tmpVal0, norm); + norm = alpha * tmpData1 + bias_val; + _viv_asm(CONV, tmpVal1, norm); + VXC_DP2x8(dst, tmpVal0, tmpVal1, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0), uniConvertHalfToFp16_2x8); + _viv_asm(COPY, outval, dst, 16); + VXC_WriteImage2DArray(output, coord, outval, VXC_MODIFIER(0, 7, 0,VXC_RM_TowardZero, 0)); + coord.x += 8; + norm = alpha * tmpData2 + bias_val; + _viv_asm(CONV, tmpVal0, norm); + norm = alpha * tmpData3 + bias_val; + _viv_asm(CONV, tmpVal1, norm); + VXC_DP2x8(dst, tmpVal0, tmpVal1, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0), uniConvertHalfToFp16_2x8); + _viv_asm(COPY, outval, dst, 16); + VXC_WriteImage2DArray(output, coord, outval, VXC_MODIFIER(0, 7, 0,VXC_RM_TowardZero, 0)); +} + +__kernel __attribute__((reqd_work_group_size(16, 1, 1))) void group_norm_I8toF16_2D( + image2d_array_t input, image2d_t bias, image2d_t scale, image2d_t meanVari, + image2d_array_t output, float eps, int is2D, float rSpaceOrg, int pStride) +{ + int gidz = get_global_id(1); + int2 coord = (int2)(get_global_id(0), gidz); + int4 coord_para = (int4)(convert_int(get_global_id(0) * rSpaceOrg), gidz, 0, 0); + vxc_char16 src0; + vxc_short8 src1, outval; + vxc_half8 scale_h, dst; + float scale_vari, bias_val; + vxc_float4 bias_f, scale_f; + + vxc_float4 mean_vari = read_imagef(meanVari, coord_para.zy); + VXC_ReadImage(src1, scale, coord_para.xy, VXC_5BITOFFSET_XY(0, 0),\ + VXC_MODIFIER(0, 0, 0, VXC_RM_TowardZero, 0)); + bias_f = read_imagef(bias, coord_para.xy); + VXC_ReadImage(src0, input, coord.xy, VXC_5BITOFFSET_XY(0, 0),\ + VXC_MODIFIER(0, 15, 0, VXC_RM_TowardZero, 0)); + + _viv_asm(COPY, scale_h, src1, 16); + VXC_DP4x4(scale_f, scale_h, scale_h, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), UniFP16toFP32Lo4_dp4x4); + + scale_vari = scale_f.s0 * mean_vari.s1; + vxc_float4 tmpData0, tmpData1, tmpData2, tmpData3, norm; + half4 tmpVal0, tmpVal1; + float alpha = input_fl_scale * scale_vari; + bias_val = (bias_f.s0 - scale_vari * mean_vari.s0); + + VXC_DP4x4(tmpData0, src0, src0, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniConvertDirInt8Fp32_4x4); + VXC_DP4x4(tmpData1, src0, src0, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniConvertEndInt8Fp32_4x4); + VXC_DP4x4(tmpData2, src0, src0, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniConvertTrdInt8Fp32_4x4); + VXC_DP4x4(tmpData3, src0, src0, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniConvertFthInt8Fp32_4x4); + norm = alpha * tmpData0 + bias_val; + _viv_asm(CONV, tmpVal0, norm); + norm = alpha * tmpData1 + bias_val; + _viv_asm(CONV, tmpVal1, norm); + VXC_DP2x8(dst, tmpVal0, tmpVal1, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0), uniConvertHalfToFp16_2x8); + _viv_asm(COPY, outval, dst, 16); + VXC_WriteImage(output, coord.xy, outval, VXC_MODIFIER(0, 7, 0, 
VXC_RM_TowardZero, 0)); + coord.x += 8; + norm = alpha * tmpData2 + bias_val; + _viv_asm(CONV, tmpVal0, norm); + norm = alpha * tmpData3 + bias_val; + _viv_asm(CONV, tmpVal1, norm); + VXC_DP2x8(dst, tmpVal0, tmpVal1, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0), uniConvertHalfToFp16_2x8); + _viv_asm(COPY, outval, dst, 16); + VXC_WriteImage(output, coord.xy, outval, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); +} + +__kernel __attribute__((reqd_work_group_size(16, 1, 1))) void group_norm_I8toI8( + image2d_array_t input, image2d_t bias, image2d_t scale, image2d_t meanVari, + image2d_array_t output, float eps, int is2D, float rSpaceOrg, int pStride) +{ + int gidy = get_global_id(1); + int gidz = get_global_id(2); + int4 coord = (int4)(get_global_id(0), gidy, gidz, 0); + int4 coord_para = (int4)((convert_int(get_global_id(0) * rSpaceOrg) + gidy * pStride), gidz, 0, 0); + vxc_char16 src0, src2; + vxc_short8 src1; + vxc_half8 scale_h; + float scale_vari, bias_val; + vxc_float4 bias_f, scale_f; + + vxc_float4 mean_vari = read_imagef(meanVari, coord_para.zy); + VXC_ReadImage(src1, scale, coord_para.xy, VXC_5BITOFFSET_XY(0, 0),\ + VXC_MODIFIER(0, 0, 0, VXC_RM_TowardZero, 0)); + bias_f = read_imagef(bias, coord_para.xy); + VXC_ReadImage2DArray(src0, input, coord, VXC_5BITOFFSET_XY(0, 0), VXC_MODIFIER(0, 15, 0, VXC_RM_TowardZero, 0)); + + _viv_asm(COPY, scale_h, src1, 16); + VXC_DP4x4(scale_f, scale_h, scale_h, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), UniFP16toFP32Lo4_dp4x4); + + scale_vari = scale_f.s0 * mean_vari.s1; + vxc_int4 tmpVal0, tmpVal1; + vxc_float4 tmpData0, tmpData1, tmpData2, tmpData3, norm; + float alpha = inOut_fl_scale * scale_vari; + bias_val = (bias_f.s0 - scale_vari * mean_vari.s0) * output_fl_scale; + + VXC_DP4x4(tmpData0, src0, src0, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniConvertDirInt8Fp32_4x4); + VXC_DP4x4(tmpData1, src0, src0, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniConvertEndInt8Fp32_4x4); + VXC_DP4x4(tmpData2, src0, src0, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniConvertTrdInt8Fp32_4x4); + VXC_DP4x4(tmpData3, src0, src0, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniConvertFthInt8Fp32_4x4); + norm = tmpData0 * alpha + bias_val; + tmpVal0 = convert_int4_rte(norm); + norm = tmpData1 * alpha + bias_val; + tmpVal1 = convert_int4_rte(norm); + VXC_DP2x8(src2, tmpVal0, tmpVal1, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 1), uniConvertInt32toUint8_2x8); + norm = tmpData2 * alpha + bias_val; + tmpVal0 = convert_int4_rte(norm); + norm = tmpData3 * alpha + bias_val; + tmpVal1 = convert_int4_rte(norm); + VXC_DP2x8(src2, tmpVal0, tmpVal1, VXC_MODIFIER(8, 15, 0, VXC_RM_TowardZero, 1), uniConvertInt32toUint8_2x8); + VXC_WriteImage2DArray(output, coord, src2, VXC_MODIFIER(0, 15, 0,VXC_RM_TowardZero, 0)); +} + +__kernel __attribute__((reqd_work_group_size(16, 1, 1))) void group_norm_I8toI8_2D( + image2d_array_t input, image2d_t bias, image2d_t scale, image2d_t meanVari, + image2d_array_t output, float eps, int is2D, float rSpaceOrg, int pStride) +{ + int gidz = get_global_id(1); + int2 coord = (int2)(get_global_id(0), gidz); + int4 coord_para = (int4)(convert_int(get_global_id(0) * rSpaceOrg), gidz, 0, 0); + vxc_char16 src0, src2; + vxc_short8 src1; + vxc_half8 scale_h; + float scale_vari, bias_val; + vxc_float4 bias_f, scale_f; + + vxc_float4 mean_vari = read_imagef(meanVari, coord_para.zy); + VXC_ReadImage(src1, scale, coord_para.xy, VXC_5BITOFFSET_XY(0, 0),\ + VXC_MODIFIER(0, 0, 0, VXC_RM_TowardZero, 0)); + bias_f = read_imagef(bias, coord_para.xy); + 
VXC_ReadImage(src0, input, coord.xy, VXC_5BITOFFSET_XY(0, 0), VXC_MODIFIER(0, 15, 0, VXC_RM_TowardZero, 0)); + + _viv_asm(COPY, scale_h, src1, 16); + VXC_DP4x4(scale_f, scale_h, scale_h, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), UniFP16toFP32Lo4_dp4x4); + + scale_vari = scale_f.s0 * mean_vari.s1; + vxc_int4 tmpVal0, tmpVal1; + vxc_float4 tmpData0, tmpData1, tmpData2, tmpData3, norm; + float alpha = inOut_fl_scale * scale_vari; + bias_val = (bias_f.s0 - scale_vari * mean_vari.s0) * output_fl_scale; + + VXC_DP4x4(tmpData0, src0, src0, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniConvertDirInt8Fp32_4x4); + VXC_DP4x4(tmpData1, src0, src0, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniConvertEndInt8Fp32_4x4); + VXC_DP4x4(tmpData2, src0, src0, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniConvertTrdInt8Fp32_4x4); + VXC_DP4x4(tmpData3, src0, src0, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniConvertFthInt8Fp32_4x4); + norm = tmpData0 * alpha + bias_val; + tmpVal0 = convert_int4_rte(norm); + norm = tmpData1 * alpha + bias_val; + tmpVal1 = convert_int4_rte(norm); + VXC_DP2x8(src2, tmpVal0, tmpVal1, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 1), uniConvertInt32toUint8_2x8); + norm = tmpData2 * alpha + bias_val; + tmpVal0 = convert_int4_rte(norm); + norm = tmpData3 * alpha + bias_val; + tmpVal1 = convert_int4_rte(norm); + VXC_DP2x8(src2, tmpVal0, tmpVal1, VXC_MODIFIER(8, 15, 0, VXC_RM_TowardZero, 1), uniConvertInt32toUint8_2x8); + VXC_WriteImage(output, coord.xy, src2, VXC_MODIFIER(0, 15, 0, VXC_RM_TowardZero, 0)); +} diff --git a/src/tim/vx/internal/src/libnnext/ops/vx/group_normalization_u8.vx b/src/tim/vx/internal/src/libnnext/ops/vx/group_normalization_u8.vx new file mode 100644 index 0000000..af20584 --- /dev/null +++ b/src/tim/vx/internal/src/libnnext/ops/vx/group_normalization_u8.vx @@ -0,0 +1,261 @@ +#include "cl_viv_vx_ext.h" + +_viv_uniform int width; +_viv_uniform int height; +_viv_uniform VXC_512Bits UniFP16toFP32Lo4_dp4x4; +_viv_uniform VXC_512Bits uniConvertInt32toUint8_2x8; + +_viv_uniform VXC_512Bits uniConvert1stUint8SubZpToFp32_4x4; +_viv_uniform VXC_512Bits uniConvert2ndUint8SubZpToFp32_4x4; +_viv_uniform VXC_512Bits uniConvert3rdUint8SubZpToFp32_4x4; +_viv_uniform VXC_512Bits uniConvert4thUint8SubZpToFp32_4x4; +_viv_uniform VXC_512Bits uniSumU8_16x1; +_viv_uniform VXC_512Bits uniSqrSum_16x1; +_viv_uniform float input_scale; +_viv_uniform int inputZP; +_viv_uniform int sumInZp; +_viv_uniform int tmpZp1; +_viv_uniform float e2InScale; +_viv_uniform float rowSumScale; +_viv_uniform float scale_inOut; +_viv_uniform float outputScale; +_viv_uniform int output_ZP; + +_viv_uniform VXC_512Bits uniResetFp32_4x4; +_viv_uniform int group_stride; + +__kernel __attribute__((reqd_work_group_size(16, 1, 1))) void group_norm_sumsqr_U8( + image2d_array_t input, image2d_array_t output, float eps, int is2D) +{ + int gidx = get_global_id(0) << 4; + int lidx = get_local_id(0); + int gidz = get_global_id(1); + int4 coord = (int4)(gidx, 0, gidz, 0); + vxc_uchar16 src0; + float sum = 0, sqr = 0; + int tmpSum = 0, tmpSqr = 0, tmpSum1 = 0, tmpSqr1 = 0; + + __local float lcl_sum[16]; + __local float lcl_sqr[16]; + int8 input_desc; + _viv_asm(COPY, input_desc, input, sizeof(input_desc)); + int baseAddr_a = (int)get_global_id(1) * input_desc.s4 + input_desc.s0; + _viv_asm(MOV, coord.z, baseAddr_a); + if(gidx < width) + { + for(coord.y = 0; coord.y < height;) + { + VXC_OP4(img_load_3d, src0, input, coord, VXC_5BITOFFSET_XY(0, 0), \ + VXC_MODIFIER(0, 15, 0, VXC_RM_TowardZero, 0)); + coord.y++; + 
VXC_DP16x1(tmpSum1, src0, src0, VXC_MODIFIER(0, 0, 0, VXC_RM_TowardZero, 0), uniSumU8_16x1); + tmpSum += (tmpSum1); + VXC_DP16x1(tmpSqr1, src0, src0, VXC_MODIFIER(0, 0, 0, VXC_RM_TowardZero, 0), uniSqrSum_16x1); + tmpSqr += (tmpSqr1 + tmpZp1 * tmpSum1); + } + sqr += (tmpSqr * e2InScale + rowSumScale); + sum = (tmpSum + sumInZp) * input_scale; + } + lcl_sum[lidx] = sum; + lcl_sqr[lidx] = sqr; + barrier(CLK_LOCAL_MEM_FENCE); + + int4 coord_out = (int4)(get_group_id(0) << 2, gidz, 0, 0); + if(lidx == 0) + { + float4 one = (float4)(1, 1, 1, 1); + __local float4* tmp_sum = (__local float4*)lcl_sum; + __local float4* tmp_sqr = (__local float4*)lcl_sqr; + sum = 0; sqr = 0; + for(int i = 0; i < 4; i++) + { + sum += dot(tmp_sum[i], one); + sqr += dot(tmp_sqr[i], one); + } + float4 data = (float4)(sum, sqr, 0, 0); + write_imagef(output, coord_out, data); + } +} + +__kernel __attribute__((reqd_work_group_size(16, 1, 1))) void group_norm_sumsqr_U8_2D( + image2d_array_t input, image2d_array_t output, float eps, int is2D) +{ + int gidx = get_global_id(0) << 4; + int lidx = get_local_id(0); + + int2 coord = (int2)(gidx, get_global_id(1)); + vxc_uchar16 src0; + float sum = 0, sqr = 0; + int tmpSqr, tmpSum1, tmpSqr1; + + __local float lcl_sum[16]; + __local float lcl_sqr[16]; + if(gidx < width) + { + VXC_ReadImage(src0, input, coord, VXC_5BITOFFSET_XY(0, 0), + VXC_MODIFIER(0, 15, 0, VXC_RM_TowardZero, 0)); + VXC_DP16x1(tmpSum1, src0, src0, VXC_MODIFIER(0, 0, 0, VXC_RM_TowardZero, 0), uniSumU8_16x1); + VXC_DP16x1(tmpSqr1, src0, src0, VXC_MODIFIER(0, 0, 0, VXC_RM_TowardZero, 0), uniSqrSum_16x1); + tmpSqr = tmpSqr1 + tmpZp1 * tmpSum1; + sqr = (tmpSqr * e2InScale + rowSumScale); + sum = (tmpSum1 + sumInZp) * input_scale; + } + lcl_sum[lidx] = sum; + lcl_sqr[lidx] = sqr; + barrier(CLK_LOCAL_MEM_FENCE); + + int4 coord_out = (int4)(get_group_id(0) << 2, get_global_id(1), 0, 0); + if(lidx == 0) + { + float4 one = (float4)(1, 1, 1, 1); + __local float4* tmp_sum = (__local float4*)lcl_sum; + __local float4* tmp_sqr = (__local float4*)lcl_sqr; + sum = 0; sqr = 0; + for(int i = 0; i < 4; i++) + { + sum += dot(tmp_sum[i], one); + sqr += dot(tmp_sqr[i], one); + } + float4 data = (float4)(sum, sqr, 0, 0); + write_imagef(output, coord_out, data); + } +} + +__kernel __attribute__((reqd_work_group_size(16, 1, 1))) void group_norm_meanvari( + image2d_t input, image2d_t output, float eps, float group_ratio) +{ + int gidx = get_global_id(0); + int lidx = get_local_id(0); + + int2 coord = (int2)(gidx, get_global_id(1)); + vxc_uchar16 src0; + float2 sum_sqr = (float2)(0); + vxc_float4 mean_vari; + VXC_DP4x4(mean_vari, src0, src0, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniResetFp32_4x4); + + __local float2 lcl_data[16]; + __local float2 lcl_sum[4]; + + for(; coord.x < group_stride; coord.x += 64) + { + mean_vari += read_imagef(input, coord); + } + lcl_data[lidx] = mean_vari.xy; + barrier(CLK_LOCAL_MEM_FENCE); + if(lidx < 4) + { + float2 tmpSum = (float2)(0); + for(int i = lidx; i < 16; i+=4) + { + tmpSum += lcl_data[i]; + } + lcl_sum[lidx] = tmpSum; + } + barrier(CLK_LOCAL_MEM_FENCE); + if(lidx == 0) + { + for(int i = 0; i < 4; i++) + { + sum_sqr += lcl_sum[i]; + } + mean_vari.xy = sum_sqr * group_ratio; + mean_vari.s1 = mean_vari.s1 - mean_vari.s0 * mean_vari.s0 + eps; + mean_vari.s1 = rsqrt(mean_vari.s1); + + coord.x = 0; + write_imagef(output, coord, mean_vari); + } +} + +__kernel __attribute__((reqd_work_group_size(16, 1, 1))) void group_norm_U8toU8( + image2d_array_t input, image2d_t bias, image2d_t scale, image2d_t 
meanVari, + image2d_array_t output, float eps, int is2D, float rSpaceOrg, int pStride) +{ + int gidy = get_global_id(1); + int gidz = get_global_id(2); + int4 coord = (int4)(get_global_id(0), gidy, gidz, 0); + int4 coord_para = (int4)((convert_int(get_global_id(0) * rSpaceOrg) + gidy * pStride), gidz, 0, 0); + vxc_uchar16 src0, src2; + vxc_short8 src1; + vxc_half8 scale_h; + float scale_vari, bias_val; + vxc_float4 bias_f, scale_f; + + vxc_float4 mean_vari = read_imagef(meanVari, coord_para.zy); + VXC_ReadImage(src1, scale, coord_para.xy, VXC_5BITOFFSET_XY(0, 0),\ + VXC_MODIFIER(0, 0, 0, VXC_RM_TowardZero, 0)); + bias_f = read_imagef(bias, coord_para.xy); + VXC_ReadImage2DArray(src0, input, coord, VXC_5BITOFFSET_XY(0, 0), VXC_MODIFIER(0, 15, 0, VXC_RM_TowardZero, 0)); + + _viv_asm(COPY, scale_h, src1, 16); + VXC_DP4x4(scale_f, scale_h, scale_h, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), UniFP16toFP32Lo4_dp4x4); + + scale_vari = scale_f.s0 * mean_vari.s1; + short zp = inputZP; + vxc_int4 tmpVal0, tmpVal1; + vxc_float4 tmpData0, tmpData1, tmpData2, tmpData3, norm; + float alpha = scale_inOut * scale_vari; + bias_val = (bias_f.s0 - scale_vari * mean_vari.s0) * outputScale + output_ZP; + + VXC_DP4x4(tmpData0, src0, zp, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniConvert1stUint8SubZpToFp32_4x4); + VXC_DP4x4(tmpData1, src0, zp, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniConvert2ndUint8SubZpToFp32_4x4); + VXC_DP4x4(tmpData2, src0, zp, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniConvert3rdUint8SubZpToFp32_4x4); + VXC_DP4x4(tmpData3, src0, zp, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniConvert4thUint8SubZpToFp32_4x4); + norm = tmpData0 * alpha + bias_val; + tmpVal0 = convert_int4_rte(norm); + norm = tmpData1 * alpha + bias_val; + tmpVal1 = convert_int4_rte(norm); + VXC_DP2x8(src2, tmpVal0, tmpVal1, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 1), uniConvertInt32toUint8_2x8); + norm = tmpData2 * alpha + bias_val; + tmpVal0 = convert_int4_rte(norm); + norm = tmpData3 * alpha + bias_val; + tmpVal1 = convert_int4_rte(norm); + VXC_DP2x8(src2, tmpVal0, tmpVal1, VXC_MODIFIER(8, 15, 0, VXC_RM_TowardZero, 1), uniConvertInt32toUint8_2x8); + VXC_WriteImage2DArray(output, coord, src2, VXC_MODIFIER(0, 15, 0,VXC_RM_TowardZero, 0)); +} + +__kernel __attribute__((reqd_work_group_size(16, 1, 1))) void group_norm_U8toU8_2D( + image2d_array_t input, image2d_t bias, image2d_t scale, image2d_t meanVari, + image2d_array_t output, float eps, int is2D, float rSpaceOrg, int pStride) +{ + int gidz = get_global_id(1); + int2 coord = (int2)(get_global_id(0), gidz); + int4 coord_para = (int4)(convert_int(get_global_id(0) * rSpaceOrg), gidz, 0, 0); + vxc_uchar16 src0, src2; + vxc_short8 src1; + vxc_half8 scale_h; + float scale_vari, bias_val; + vxc_float4 bias_f, scale_f; + + vxc_float4 mean_vari = read_imagef(meanVari, coord_para.zy); + VXC_ReadImage(src1, scale, coord_para.xy, VXC_5BITOFFSET_XY(0, 0),\ + VXC_MODIFIER(0, 0, 0, VXC_RM_TowardZero, 0)); + bias_f = read_imagef(bias, coord_para.xy); + VXC_ReadImage(src0, input, coord.xy, VXC_5BITOFFSET_XY(0, 0), VXC_MODIFIER(0, 15, 0, VXC_RM_TowardZero, 0)); + + _viv_asm(COPY, scale_h, src1, 16); + VXC_DP4x4(scale_f, scale_h, scale_h, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), UniFP16toFP32Lo4_dp4x4); + + scale_vari = scale_f.s0 * mean_vari.s1; + short zp = inputZP; + vxc_int4 tmpVal0, tmpVal1; + vxc_float4 tmpData0, tmpData1, tmpData2, tmpData3, norm; + float alpha = scale_inOut * scale_vari; + bias_val = (bias_f.s0 - scale_vari * mean_vari.s0) * outputScale + 
output_ZP; + + VXC_DP4x4(tmpData0, src0, zp, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniConvert1stUint8SubZpToFp32_4x4); + VXC_DP4x4(tmpData1, src0, zp, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniConvert2ndUint8SubZpToFp32_4x4); + VXC_DP4x4(tmpData2, src0, zp, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniConvert3rdUint8SubZpToFp32_4x4); + VXC_DP4x4(tmpData3, src0, zp, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniConvert4thUint8SubZpToFp32_4x4); + norm = tmpData0 * alpha + bias_val; + tmpVal0 = convert_int4_rte(norm); + norm = tmpData1 * alpha + bias_val; + tmpVal1 = convert_int4_rte(norm); + VXC_DP2x8(src2, tmpVal0, tmpVal1, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 1), uniConvertInt32toUint8_2x8); + norm = tmpData2 * alpha + bias_val; + tmpVal0 = convert_int4_rte(norm); + norm = tmpData3 * alpha + bias_val; + tmpVal1 = convert_int4_rte(norm); + VXC_DP2x8(src2, tmpVal0, tmpVal1, VXC_MODIFIER(8, 15, 0, VXC_RM_TowardZero, 1), uniConvertInt32toUint8_2x8); + VXC_WriteImage(output, coord.xy, src2, VXC_MODIFIER(0, 15, 0, VXC_RM_TowardZero, 0)); +} \ No newline at end of file diff --git a/src/tim/vx/internal/src/libnnext/ops/vx/group_normalization_u8_f16.vx b/src/tim/vx/internal/src/libnnext/ops/vx/group_normalization_u8_f16.vx new file mode 100644 index 0000000..3c1b892 --- /dev/null +++ b/src/tim/vx/internal/src/libnnext/ops/vx/group_normalization_u8_f16.vx @@ -0,0 +1,114 @@ +#include "cl_viv_vx_ext.h" + +_viv_uniform int width; +_viv_uniform int height; +_viv_uniform VXC_512Bits UniFP16toFP32Lo4_dp4x4; +_viv_uniform VXC_512Bits uniConvertHalfToFp16_2x8; + +_viv_uniform VXC_512Bits uniConvert1stUint8SubZpToFp32_4x4; +_viv_uniform VXC_512Bits uniConvert2ndUint8SubZpToFp32_4x4; +_viv_uniform VXC_512Bits uniConvert3rdUint8SubZpToFp32_4x4; +_viv_uniform VXC_512Bits uniConvert4thUint8SubZpToFp32_4x4; +_viv_uniform float input_scale; +_viv_uniform int inputZP; + +__kernel __attribute__((reqd_work_group_size(16, 1, 1))) void group_norm_U8toF16( + image2d_array_t input, image2d_t bias, image2d_t scale, image2d_t meanVari, + image2d_array_t output, float eps, int is2D, float rSpaceOrg, int pStride) +{ + int gidy = get_global_id(1); + int gidz = get_global_id(2); + int4 coord = (int4)(get_global_id(0), gidy, gidz, 0); + int4 coord_para = (int4)((convert_int(get_global_id(0) * rSpaceOrg) + gidy * pStride), gidz, 0, 0); + vxc_uchar16 src0; + vxc_short8 src1, outval; + vxc_half8 scale_h, dst; + float scale_vari, bias_val; + vxc_float4 bias_f, scale_f; + + vxc_float4 mean_vari = read_imagef(meanVari, coord_para.zy); + VXC_ReadImage(src1, scale, coord_para.xy, VXC_5BITOFFSET_XY(0, 0),\ + VXC_MODIFIER(0, 0, 0, VXC_RM_TowardZero, 0)); + bias_f = read_imagef(bias, coord_para.xy); + VXC_ReadImage2DArray(src0, input, coord, VXC_5BITOFFSET_XY(0, 0), VXC_MODIFIER(0, 15, 0, VXC_RM_TowardZero, 0)); + + _viv_asm(COPY, scale_h, src1, 16); + VXC_DP4x4(scale_f, scale_h, scale_h, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), UniFP16toFP32Lo4_dp4x4); + + scale_vari = scale_f.s0 * mean_vari.s1; + short zp = inputZP; + vxc_float4 tmpData0, tmpData1, tmpData2, tmpData3, norm; + half4 tmpVal0, tmpVal1; + float alpha = input_scale * scale_vari; + bias_val = (bias_f.s0 - scale_vari * mean_vari.s0); + + VXC_DP4x4(tmpData0, src0, zp, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniConvert1stUint8SubZpToFp32_4x4); + VXC_DP4x4(tmpData1, src0, zp, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniConvert2ndUint8SubZpToFp32_4x4); + VXC_DP4x4(tmpData2, src0, zp, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), 
uniConvert3rdUint8SubZpToFp32_4x4); + VXC_DP4x4(tmpData3, src0, zp, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniConvert4thUint8SubZpToFp32_4x4); + norm = alpha * tmpData0 + bias_val; + _viv_asm(CONV, tmpVal0, norm); + norm = alpha * tmpData1 + bias_val; + _viv_asm(CONV, tmpVal1, norm); + VXC_DP2x8(dst, tmpVal0, tmpVal1, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0), uniConvertHalfToFp16_2x8); + _viv_asm(COPY, outval, dst, 16); + VXC_WriteImage2DArray(output, coord, outval, VXC_MODIFIER(0, 7, 0,VXC_RM_TowardZero, 0)); + coord.x += 8; + norm = alpha * tmpData2 + bias_val; + _viv_asm(CONV, tmpVal0, norm); + norm = alpha * tmpData3 + bias_val; + _viv_asm(CONV, tmpVal1, norm); + VXC_DP2x8(dst, tmpVal0, tmpVal1, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0), uniConvertHalfToFp16_2x8); + _viv_asm(COPY, outval, dst, 16); + VXC_WriteImage2DArray(output, coord, outval, VXC_MODIFIER(0, 7, 0,VXC_RM_TowardZero, 0)); +} + +__kernel __attribute__((reqd_work_group_size(16, 1, 1))) void group_norm_U8toF16_2D( + image2d_array_t input, image2d_t bias, image2d_t scale, image2d_t meanVari, + image2d_array_t output, float eps, int is2D, float rSpaceOrg, int pStride) +{ + int gidz = get_global_id(1); + int2 coord = (int2)(get_global_id(0), gidz); + int4 coord_para = (int4)(convert_int(get_global_id(0) * rSpaceOrg), gidz, 0, 0); + vxc_uchar16 src0; + vxc_short8 src1, outval; + vxc_half8 scale_h, dst; + float scale_vari, bias_val; + vxc_float4 bias_f, scale_f; + + vxc_float4 mean_vari = read_imagef(meanVari, coord_para.zy); + VXC_ReadImage(src1, scale, coord_para.xy, VXC_5BITOFFSET_XY(0, 0),\ + VXC_MODIFIER(0, 0, 0, VXC_RM_TowardZero, 0)); + bias_f = read_imagef(bias, coord_para.xy); + VXC_ReadImage(src0, input, coord.xy, VXC_5BITOFFSET_XY(0, 0), VXC_MODIFIER(0, 15, 0, VXC_RM_TowardZero, 0)); + + _viv_asm(COPY, scale_h, src1, 16); + VXC_DP4x4(scale_f, scale_h, scale_h, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), UniFP16toFP32Lo4_dp4x4); + + scale_vari = scale_f.s0 * mean_vari.s1; + short zp = inputZP; + vxc_float4 tmpData0, tmpData1, tmpData2, tmpData3, norm; + half4 tmpVal0, tmpVal1; + float alpha = input_scale * scale_vari; + bias_val = (bias_f.s0 - scale_vari * mean_vari.s0); + + VXC_DP4x4(tmpData0, src0, zp, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniConvert1stUint8SubZpToFp32_4x4); + VXC_DP4x4(tmpData1, src0, zp, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniConvert2ndUint8SubZpToFp32_4x4); + VXC_DP4x4(tmpData2, src0, zp, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniConvert3rdUint8SubZpToFp32_4x4); + VXC_DP4x4(tmpData3, src0, zp, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniConvert4thUint8SubZpToFp32_4x4); + norm = alpha * tmpData0 + bias_val; + _viv_asm(CONV, tmpVal0, norm); + norm = alpha * tmpData1 + bias_val; + _viv_asm(CONV, tmpVal1, norm); + VXC_DP2x8(dst, tmpVal0, tmpVal1, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0), uniConvertHalfToFp16_2x8); + _viv_asm(COPY, outval, dst, 16); + VXC_WriteImage(output, coord.xy, outval, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); + coord.x += 8; + norm = alpha * tmpData2 + bias_val; + _viv_asm(CONV, tmpVal0, norm); + norm = alpha * tmpData3 + bias_val; + _viv_asm(CONV, tmpVal1, norm); + VXC_DP2x8(dst, tmpVal0, tmpVal1, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0), uniConvertHalfToFp16_2x8); + _viv_asm(COPY, outval, dst, 16); + VXC_WriteImage(output, coord.xy, outval, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); +} diff --git a/src/tim/vx/internal/src/libnnext/ops/vx/instance_normalization_f16.vx 
b/src/tim/vx/internal/src/libnnext/ops/vx/instance_normalization_f16.vx index c942079..ed18f67 100644 --- a/src/tim/vx/internal/src/libnnext/ops/vx/instance_normalization_f16.vx +++ b/src/tim/vx/internal/src/libnnext/ops/vx/instance_normalization_f16.vx @@ -11,10 +11,7 @@ _viv_uniform VXC_512Bits uniFp16SumSqr_dp8x2; _viv_uniform VXC_512Bits uniConvertEndInt16Fp32_4x4; __kernel __attribute__((reqd_work_group_size(16, 1, 1))) void instance_norm_meanvari_F16( - image2d_array_t input, - image2d_array_t output, - float eps, - int rsFlg) + image2d_array_t input, image2d_array_t output, float eps, int rsFlg) { int gidx = get_global_id(0) << 3; int lidx = get_local_id(0); @@ -28,12 +25,17 @@ __kernel __attribute__((reqd_work_group_size(16, 1, 1))) void instance_norm_mean __local float lcl_sum[16]; __local float lcl_sqr[16]; + int8 input_desc; + _viv_asm(COPY, input_desc, input, sizeof(input_desc)); + int baseAddr_a = (int)get_global_id(1) * input_desc.s4 + input_desc.s0; + _viv_asm(MOV, coord.z, baseAddr_a); + if(gidx < width) { for(coord.y = 0; coord.y < height;) { - VXC_ReadImage2DArray(src0, input, coord, VXC_5BITOFFSET_XY(0, 0), - VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); + VXC_OP4(img_load_3d, src0, input, coord, VXC_5BITOFFSET_XY(0, 0), \ + VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); coord.y++; _viv_asm(COPY, in_h, src0, 16); VXC_DP8x2(sumsqr, in_h, in_h, VXC_MODIFIER(0, 1, 0, VXC_RM_TowardZero, 0),\ @@ -69,10 +71,7 @@ __kernel __attribute__((reqd_work_group_size(16, 1, 1))) void instance_norm_mean } __kernel __attribute__((reqd_work_group_size(16, 1, 1))) void instance_norm_meanvari_F16_2D( - image2d_array_t input, - image2d_array_t output, - float eps, - int rsFlg) + image2d_array_t input, image2d_array_t output, float eps, int rsFlg) { int gidx = get_global_id(0) << 3; int lidx = get_local_id(0); @@ -130,13 +129,8 @@ __kernel __attribute__((reqd_work_group_size(16, 1, 1))) void instance_norm_mean } __kernel __attribute__((reqd_work_group_size(16, 1, 1))) void instance_norm_F16toF16( - image2d_array_t input, - image2d_array_t bias, - image2d_array_t scale, - image2d_array_t meanVari, - image2d_array_t output, - float eps, - int rsFlg) + image2d_array_t input, image2d_array_t bias, image2d_array_t scale, image2d_t meanVari, + image2d_array_t output, float eps, int rsFlg) { int gidz = get_global_id(1); int4 coord = (int4)(get_global_id(0), 0, gidz, 0); @@ -153,12 +147,10 @@ __kernel __attribute__((reqd_work_group_size(16, 1, 1))) void instance_norm_F16t VXC_DP4x4(scale_f, scale_h, scale_h, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), UniFP16toFP32Lo4_dp4x4); bias_f = read_imagef(bias, coord_para); - coord_para.x = 0; - coord_para.y = gidz; for(int i = 0; i < group_num; i++) { - mean_vari += read_imagef(meanVari, coord_para); - coord_para.x += 4; + mean_vari += read_imagef(meanVari, coord_para.yx); + coord_para.y += 4; } mean_vari *= dimRatio; mean_vari.s1 = mean_vari.s1 - mean_vari.s0 * mean_vari.s0 + eps; @@ -171,10 +163,19 @@ __kernel __attribute__((reqd_work_group_size(16, 1, 1))) void instance_norm_F16t bias_val = (bias_f.s0 - scale_vari * mean_vari.s0); vxc_half8 dst; + int8 input_desc, output_desc; + _viv_asm(COPY, input_desc, input, sizeof(input_desc)); + int baseAddr_a = (int)get_global_id(1) * input_desc.s4 + input_desc.s0; + _viv_asm(MOV, coord.z, baseAddr_a); + + _viv_asm(COPY, output_desc, output, sizeof(output_desc)); + int baseAddr = (int)get_global_id(1) * output_desc.s4 + output_desc.s0; + _viv_asm(MOV, coord.w, baseAddr); + for(coord.y = 0; coord.y < height; 
coord.y++) { - VXC_ReadImage2DArray(src0, input, coord, VXC_5BITOFFSET_XY(0, 0),\ - VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); + VXC_OP4(img_load_3d, src0, input, coord.xyzz, VXC_5BITOFFSET_XY(0, 0), \ + VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); _viv_asm(COPY, in_h, src0, 16); VXC_DP4x4(tmpData0, in_h, in_h, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0),\ @@ -190,18 +191,14 @@ __kernel __attribute__((reqd_work_group_size(16, 1, 1))) void instance_norm_F16t VXC_DP2x8(dst, tmpVal0, tmpVal1, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0),\ uniConvertHalfToFp16_2x8); _viv_asm(COPY, outval, dst, 16); - VXC_WriteImage2DArray(output, coord, outval, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); + VXC_OP4_NoDest(img_store_3d, output, coord.xyww, outval, \ + VXC_MODIFIER(0, 7, 0,VXC_RM_TowardZero, 0)); } } __kernel __attribute__((reqd_work_group_size(16, 1, 1))) void instance_norm_F16toF16_2D( - image2d_array_t input, - image2d_array_t bias, - image2d_array_t scale, - image2d_array_t meanVari, - image2d_array_t output, - float eps, - int rsFlg) + image2d_array_t input, image2d_array_t bias, image2d_array_t scale, image2d_t meanVari, + image2d_array_t output, float eps, int rsFlg) { int gidz = get_global_id(1); int gidy = gidz * height; @@ -220,12 +217,10 @@ __kernel __attribute__((reqd_work_group_size(16, 1, 1))) void instance_norm_F16t VXC_DP4x4(scale_f, scale_h, scale_h, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), UniFP16toFP32Lo4_dp4x4); bias_f = read_imagef(bias, coord_para); - coord_para.x = 0; - coord_para.y = gidz; for(int i = 0; i < group_num; i++) { - mean_vari += read_imagef(meanVari, coord_para); - coord_para.x += 4; + mean_vari += read_imagef(meanVari, coord_para.yx); + coord_para.y += 4; } mean_vari *= dimRatio; mean_vari.s1 = mean_vari.s1 - mean_vari.s0 * mean_vari.s0 + eps; diff --git a/src/tim/vx/internal/src/libnnext/ops/vx/instance_normalization_i16.vx b/src/tim/vx/internal/src/libnnext/ops/vx/instance_normalization_i16.vx index cedc0a2..523bb38 100644 --- a/src/tim/vx/internal/src/libnnext/ops/vx/instance_normalization_i16.vx +++ b/src/tim/vx/internal/src/libnnext/ops/vx/instance_normalization_i16.vx @@ -35,12 +35,16 @@ __kernel __attribute__((reqd_work_group_size(16, 1, 1))) void instance_norm_mean __local float lcl_sum[16]; __local float lcl_sqr[16]; + int8 input_desc; + _viv_asm(COPY, input_desc, input, sizeof(input_desc)); + int baseAddr_a = (int)get_global_id(1) * input_desc.s4 + input_desc.s0; + _viv_asm(MOV, coord.z, baseAddr_a); if(gidx < width) { for(coord.y = 0; coord.y < height;) { - VXC_ReadImage2DArray(src0, input, coord, VXC_5BITOFFSET_XY(0, 0), - VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); + VXC_OP4(img_load_3d, src0, input, coord, VXC_5BITOFFSET_XY(0, 0), \ + VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); coord.y++; VXC_DP8x2(sumsqr, src0, src0, VXC_MODIFIER(0, 1, 0, VXC_RM_TowardZero, 0),\ uniInt16SumSqr_dp8x2); @@ -144,7 +148,7 @@ __kernel __attribute__((reqd_work_group_size(16, 1, 1))) void instance_norm_I16t image2d_array_t input, image2d_array_t bias, image2d_array_t scale, - image2d_array_t meanVari, + image2d_t meanVari, image2d_array_t output, float eps, int rsFlg) @@ -166,12 +170,10 @@ __kernel __attribute__((reqd_work_group_size(16, 1, 1))) void instance_norm_I16t bias_f = read_imagef(bias, coord_para); - coord_para.x = 0; - coord_para.y = gidz; for(int i = 0; i < group_num; i++) { - mean_vari += read_imagef(meanVari, coord_para); - coord_para.x += 4; + mean_vari += read_imagef(meanVari, coord_para.yx); + coord_para.y += 4; } mean_vari *= dimRatio; 
mean_vari.s1 = mean_vari.s1 - mean_vari.s0 * mean_vari.s0 + eps; @@ -185,11 +187,19 @@ __kernel __attribute__((reqd_work_group_size(16, 1, 1))) void instance_norm_I16t bias_val = (bias_f.s0 - scale_vari * mean_vari.s0); vxc_half8 dst; + int8 input_desc, output_desc; + _viv_asm(COPY, input_desc, input, sizeof(input_desc)); + int baseAddr_a = (int)get_global_id(1) * input_desc.s4 + input_desc.s0; + _viv_asm(MOV, coord.z, baseAddr_a); + + _viv_asm(COPY, output_desc, output, sizeof(output_desc)); + int baseAddr = (int)get_global_id(1) * output_desc.s4 + output_desc.s0; + _viv_asm(MOV, coord.w, baseAddr); + for(coord.y = 0; coord.y < height; coord.y++) { - VXC_ReadImage2DArray(src0, input, coord, VXC_5BITOFFSET_XY(0, 0),\ - VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); - + VXC_OP4(img_load_3d, src0, input, coord.xyzz, VXC_5BITOFFSET_XY(0, 0), \ + VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); VXC_DP4x4(tmpData0, src0, src0, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0),\ uniConvertInt16Fp32Fst_4x4); VXC_DP4x4(tmpData1, src0, src0, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0),\ @@ -203,7 +213,8 @@ __kernel __attribute__((reqd_work_group_size(16, 1, 1))) void instance_norm_I16t VXC_DP2x8(dst, tmpVal0, tmpVal1, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0),\ uniConvertHalfToFp16_2x8); _viv_asm(COPY, outval, dst, 16); - VXC_WriteImage2DArray(output, coord, outval, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); + VXC_OP4_NoDest(img_store_3d, output, coord.xyww, outval, \ + VXC_MODIFIER(0, 7, 0,VXC_RM_TowardZero, 0)); } } @@ -211,7 +222,7 @@ __kernel __attribute__((reqd_work_group_size(16, 1, 1))) void instance_norm_I16t image2d_array_t input, image2d_array_t bias, image2d_array_t scale, - image2d_array_t meanVari, + image2d_t meanVari, image2d_array_t output, float eps, int rsFlg) @@ -235,12 +246,10 @@ __kernel __attribute__((reqd_work_group_size(16, 1, 1))) void instance_norm_I16t bias_f = read_imagef(bias, coord_para); - coord_para.x = 0; - coord_para.y = gidz; for(int i = 0; i < group_num; i++) { - mean_vari += read_imagef(meanVari, coord_para); - coord_para.x += 4; + mean_vari += read_imagef(meanVari, coord_para.yx); + coord_para.y += 4; } mean_vari *= dimRatio; mean_vari.s1 = mean_vari.s1 - mean_vari.s0 * mean_vari.s0 + eps; @@ -279,7 +288,7 @@ __kernel __attribute__((reqd_work_group_size(16, 1, 1))) void instance_norm_I16t image2d_array_t input, image2d_array_t bias, image2d_array_t scale, - image2d_array_t meanVari, + image2d_t meanVari, image2d_array_t output, float eps, int rsFlg) @@ -299,12 +308,10 @@ __kernel __attribute__((reqd_work_group_size(16, 1, 1))) void instance_norm_I16t VXC_DP4x4(scale_f, scale_h, scale_h, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), UniFP16toFP32Lo4_dp4x4); bias_f = read_imagef(bias, coord_para); - coord_para.x = 0; - coord_para.y = gidz; for(int i = 0; i < group_num; i++) { - mean_vari += read_imagef(meanVari, coord_para); - coord_para.x += 4; + mean_vari += read_imagef(meanVari, coord_para.yx); + coord_para.y += 4; } mean_vari *= dimRatio; mean_vari.s1 = mean_vari.s1 - mean_vari.s0 * mean_vari.s0 + eps; @@ -316,10 +323,18 @@ __kernel __attribute__((reqd_work_group_size(16, 1, 1))) void instance_norm_I16t float alpha = inOut_fl_scale * scale_vari; bias_val = (bias_f.s0 - scale_vari * mean_vari.s0) * output_fl_scale; + int8 input_desc, output_desc; + _viv_asm(COPY, input_desc, input, sizeof(input_desc)); + int baseAddr_a = (int)get_global_id(1) * input_desc.s4 + input_desc.s0; + _viv_asm(MOV, coord.z, baseAddr_a); + + _viv_asm(COPY, output_desc, output, 
sizeof(output_desc)); + int baseAddr = (int)get_global_id(1) * output_desc.s4 + output_desc.s0; + _viv_asm(MOV, coord.w, baseAddr); for(coord.y = 0; coord.y < height; coord.y++) { - VXC_ReadImage2DArray(src0, input, coord, VXC_5BITOFFSET_XY(0, 0),\ - VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); + VXC_OP4(img_load_3d, src0, input, coord.xyzz, VXC_5BITOFFSET_XY(0, 0), \ + VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); VXC_DP4x4(tmpData0, src0, src0, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0),\ uniConvertInt16Fp32Fst_4x4); VXC_DP4x4(tmpData1, src0, src0, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0),\ @@ -331,7 +346,8 @@ __kernel __attribute__((reqd_work_group_size(16, 1, 1))) void instance_norm_I16t tmpVal1 = convert_int4_rte(norm); VXC_DP2x8(src2, tmpVal0, tmpVal1, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 1),\ uniConvertInt32toInt16_2x8); - VXC_WriteImage2DArray(output, coord, src2, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); + VXC_OP4_NoDest(img_store_3d, output, coord.xyww, src2, \ + VXC_MODIFIER(0, 7, 0,VXC_RM_TowardZero, 0)); } } @@ -339,7 +355,7 @@ __kernel __attribute__((reqd_work_group_size(16, 1, 1))) void instance_norm_I16t image2d_array_t input, image2d_array_t bias, image2d_array_t scale, - image2d_array_t meanVari, + image2d_t meanVari, image2d_array_t output, float eps, int rsFlg) @@ -361,12 +377,10 @@ __kernel __attribute__((reqd_work_group_size(16, 1, 1))) void instance_norm_I16t VXC_DP4x4(scale_f, scale_h, scale_h, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), UniFP16toFP32Lo4_dp4x4); bias_f = read_imagef(bias, coord_para); - coord_para.x = 0; - coord_para.y = gidz; for(int i = 0; i < group_num; i++) { - mean_vari += read_imagef(meanVari, coord_para); - coord_para.x += 4; + mean_vari += read_imagef(meanVari, coord_para.yx); + coord_para.y += 4; } mean_vari *= dimRatio; mean_vari.s1 = mean_vari.s1 - mean_vari.s0 * mean_vari.s0 + eps; diff --git a/src/tim/vx/internal/src/libnnext/ops/vx/instance_normalization_i8.vx b/src/tim/vx/internal/src/libnnext/ops/vx/instance_normalization_i8.vx index 489da14..dc19b5e 100644 --- a/src/tim/vx/internal/src/libnnext/ops/vx/instance_normalization_i8.vx +++ b/src/tim/vx/internal/src/libnnext/ops/vx/instance_normalization_i8.vx @@ -22,10 +22,7 @@ _viv_uniform float inOut_fl_scale; _viv_uniform float output_fl_scale; __kernel __attribute__((reqd_work_group_size(16, 1, 1))) void instance_norm_meanvari_I8( - image2d_array_t input, - image2d_array_t output, - float eps, - int rsFlg) + image2d_array_t input, image2d_array_t output, float eps, int rsFlg) { int gidx = get_global_id(0) << 4; int lidx = get_local_id(0); @@ -33,18 +30,22 @@ __kernel __attribute__((reqd_work_group_size(16, 1, 1))) void instance_norm_mean int4 coord = (int4)(gidx, 0, gidz, 0); vxc_char16 src0; float sum = 0, sqr = 0; - int tmpSum = 0, tmpSqr = 0; - int tmpSum1, tmpSqr1; + int tmpSum = 0, tmpSqr = 0, tmpSum1, tmpSqr1; __local float lcl_sum[16]; __local float lcl_sqr[16]; + int8 input_desc; + _viv_asm(COPY, input_desc, input, sizeof(input_desc)); + int baseAddr_a = (int)get_global_id(1) * input_desc.s4 + input_desc.s0; + _viv_asm(MOV, coord.z, baseAddr_a); + if(gidx < width) { for(coord.y = 0; coord.y < height;) { - VXC_ReadImage2DArray(src0, input, coord, VXC_5BITOFFSET_XY(0, 0), - VXC_MODIFIER(0, 15, 0, VXC_RM_TowardZero, 0)); + VXC_OP4(img_load_3d, src0, input, coord, VXC_5BITOFFSET_XY(0, 0), \ + VXC_MODIFIER(0, 15, 0, VXC_RM_TowardZero, 0)); coord.y++; VXC_DP16x1(tmpSum1, src0, src0, VXC_MODIFIER(0, 0, 0, VXC_RM_TowardZero, 0), uniSumInt8_16x1); tmpSum += (tmpSum1); @@ 
-54,7 +55,6 @@ __kernel __attribute__((reqd_work_group_size(16, 1, 1))) void instance_norm_mean sqr = tmpSqr * inFlScale_s2; sum = tmpSum * input_fl_scale; } - lcl_sum[lidx] = sum; lcl_sqr[lidx] = sqr; barrier(CLK_LOCAL_MEM_FENCE); @@ -69,8 +69,6 @@ __kernel __attribute__((reqd_work_group_size(16, 1, 1))) void instance_norm_mean sum = 0; sqr = 0; for(int i = 0; i < 4; i++) { - //sum += lcl_sum[i]; - //sqr += lcl_sqr[i]; sum += dot(tmp_sum[i], one); sqr += dot(tmp_sqr[i], one); } @@ -81,10 +79,7 @@ __kernel __attribute__((reqd_work_group_size(16, 1, 1))) void instance_norm_mean } __kernel __attribute__((reqd_work_group_size(16, 1, 1))) void instance_norm_meanvari_I8_2D( - image2d_array_t input, - image2d_array_t output, - float eps, - int rsFlg) + image2d_array_t input, image2d_array_t output, float eps, int rsFlg) { int gidx = get_global_id(0) << 4; int lidx = get_local_id(0); @@ -94,8 +89,7 @@ __kernel __attribute__((reqd_work_group_size(16, 1, 1))) void instance_norm_mean int2 coord = (int2)(gidx, gidy); vxc_char16 src0; float sum = 0, sqr = 0; - int tmpSum = 0, tmpSqr = 0; - int tmpSum1, tmpSqr1; + int tmpSum = 0, tmpSqr = 0, tmpSum1, tmpSqr1; __local float lcl_sum[16]; __local float lcl_sqr[16]; @@ -103,7 +97,6 @@ __kernel __attribute__((reqd_work_group_size(16, 1, 1))) void instance_norm_mean int endH = gidy + height; if(gidx < width) { - tmpSqr = 0; for(; coord.y < endH;) { VXC_ReadImage(src0, input, coord, VXC_5BITOFFSET_XY(0, 0), @@ -132,8 +125,6 @@ __kernel __attribute__((reqd_work_group_size(16, 1, 1))) void instance_norm_mean sum = 0; sqr = 0; for(int i = 0; i < 4; i++) { - //sum += lcl_sum[i]; - //sqr += lcl_sqr[i]; sum += dot(tmp_sum[i], one); sqr += dot(tmp_sqr[i], one); } @@ -144,94 +135,81 @@ __kernel __attribute__((reqd_work_group_size(16, 1, 1))) void instance_norm_mean } __kernel __attribute__((reqd_work_group_size(16, 1, 1))) void instance_norm_I8toF16( - image2d_array_t input, - image2d_array_t bias, - image2d_array_t scale, - image2d_array_t meanVari, - image2d_array_t output, - float eps, - int rsFlg) + image2d_array_t input, image2d_array_t bias, image2d_array_t scale, image2d_t meanVari, + image2d_array_t output, float eps, int rsFlg) { int gidz = get_global_id(1); int4 coord = (int4)(get_global_id(0), 0, gidz, 0); int4 coord_para = (int4)(gidz, 0, 0, 0); vxc_char16 src0; - vxc_short8 src1; - vxc_half8 scale_h; + vxc_short8 src1, outval; + vxc_half8 scale_h, dst; float scale_vari, bias_val; vxc_float4 bias_f, scale_f, mean_vari = (vxc_float4)(0); VXC_ReadImage(src1, scale, coord_para.xy, VXC_5BITOFFSET_XY(0, 0),\ VXC_MODIFIER(0, 0, 0, VXC_RM_TowardZero, 0)); _viv_asm(COPY, scale_h, src1, 16); - VXC_DP4x4(scale_f, scale_h, scale_h, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0),\ - UniFP16toFP32Lo4_dp4x4); - + VXC_DP4x4(scale_f, scale_h, scale_h, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), UniFP16toFP32Lo4_dp4x4); bias_f = read_imagef(bias, coord_para); - coord_para.x = 0; - coord_para.y = gidz; for(int i = 0; i < group_num; i++) { - mean_vari += read_imagef(meanVari, coord_para); - coord_para.x += 4; + mean_vari += read_imagef(meanVari, coord_para.yx); + coord_para.y += 4; } mean_vari *= dimRatio; mean_vari.s1 = mean_vari.s1 - mean_vari.s0 * mean_vari.s0 + eps; mean_vari.s1 = rsqrt(mean_vari.s1); scale_vari = scale_f.s0 * mean_vari.s1; - vxc_float4 tmpData0, tmpData1, tmpData2, tmpData3; - vxc_short8 outval; + vxc_float4 tmpData0, tmpData1, tmpData2, tmpData3, norm; half4 tmpVal0, tmpVal1; float alpha = input_fl_scale * scale_vari; bias_val = (bias_f.s0 - scale_vari 
* mean_vari.s0); - vxc_half8 dst; + + coord_para = coord; + int8 input_desc, output_desc; + _viv_asm(COPY, input_desc, input, sizeof(input_desc)); + int baseAddr_a = (int)get_global_id(1) * input_desc.s4 + input_desc.s0; + _viv_asm(MOV, coord.z, baseAddr_a); + + _viv_asm(COPY, output_desc, output, sizeof(output_desc)); + int baseAddr = (int)get_global_id(1) * output_desc.s4 + output_desc.s0; + _viv_asm(MOV, coord_para.z, baseAddr); for(coord.y = 0; coord.y < height;) { - coord_para = coord; - VXC_ReadImage2DArray(src0, input, coord, VXC_5BITOFFSET_XY(0, 0),\ - VXC_MODIFIER(0, 15, 0, VXC_RM_TowardZero, 0)); + VXC_OP4(img_load_3d, src0, input, coord, VXC_5BITOFFSET_XY(0, 0), \ + VXC_MODIFIER(0, 15, 0, VXC_RM_TowardZero, 0)); + coord_para.xy = coord.xy; coord.y++; - VXC_DP4x4(tmpData0, src0, src0, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0),\ - uniConvertDirInt8Fp32_4x4); - VXC_DP4x4(tmpData1, src0, src0, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0),\ - uniConvertEndInt8Fp32_4x4); - VXC_DP4x4(tmpData2, src0, src0, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0),\ - uniConvertTrdInt8Fp32_4x4); - VXC_DP4x4(tmpData3, src0, src0, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0),\ - uniConvertFthInt8Fp32_4x4); + VXC_DP4x4(tmpData0, src0, src0, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniConvertDirInt8Fp32_4x4); + VXC_DP4x4(tmpData1, src0, src0, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniConvertEndInt8Fp32_4x4); + VXC_DP4x4(tmpData2, src0, src0, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniConvertTrdInt8Fp32_4x4); + VXC_DP4x4(tmpData3, src0, src0, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniConvertFthInt8Fp32_4x4); - vxc_float4 norm; norm = alpha * tmpData0 + bias_val; _viv_asm(CONV, tmpVal0, norm); norm = alpha * tmpData1 + bias_val; _viv_asm(CONV, tmpVal1, norm); - VXC_DP2x8(dst, tmpVal0, tmpVal1, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0),\ - uniConvertHalfToFp16_2x8); + VXC_DP2x8(dst, tmpVal0, tmpVal1, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0), uniConvertHalfToFp16_2x8); _viv_asm(COPY, outval, dst, 16); - VXC_WriteImage2DArray(output, coord_para, outval, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); + VXC_OP4_NoDest(img_store_3d, output, coord_para, outval, VXC_MODIFIER(0, 7, 0,VXC_RM_TowardZero, 0)); coord_para.x += 8; norm = alpha * tmpData2 + bias_val; _viv_asm(CONV, tmpVal0, norm); norm = alpha * tmpData3 + bias_val; _viv_asm(CONV, tmpVal1, norm); - VXC_DP2x8(dst, tmpVal0, tmpVal1, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0),\ - uniConvertHalfToFp16_2x8); + VXC_DP2x8(dst, tmpVal0, tmpVal1, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0), uniConvertHalfToFp16_2x8); _viv_asm(COPY, outval, dst, 16); - VXC_WriteImage2DArray(output, coord_para, outval, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); + VXC_OP4_NoDest(img_store_3d, output, coord_para, outval, VXC_MODIFIER(0, 7, 0,VXC_RM_TowardZero, 0)); } } __kernel __attribute__((reqd_work_group_size(16, 1, 1))) void instance_norm_I8toF16_2D( - image2d_array_t input, - image2d_array_t bias, - image2d_array_t scale, - image2d_array_t meanVari, - image2d_array_t output, - float eps, - int rsFlg) + image2d_array_t input, image2d_array_t bias, image2d_array_t scale, image2d_t meanVari, + image2d_array_t output, float eps, int rsFlg) { int gidz = get_global_id(1); int gidy = gidz * height; @@ -239,59 +217,48 @@ __kernel __attribute__((reqd_work_group_size(16, 1, 1))) void instance_norm_I8to int4 coord_para = (int4)(gidz, 0, 0, 0); int endH = gidy + height; vxc_char16 src0; - vxc_short8 src1; - vxc_half8 scale_h; + vxc_short8 src1, outval; + 
vxc_half8 scale_h, dst; float scale_vari, bias_val; vxc_float4 bias_f, scale_f, mean_vari = (vxc_float4)(0); VXC_ReadImage(src1, scale, coord_para.xy, VXC_5BITOFFSET_XY(0, 0),\ VXC_MODIFIER(0, 0, 0, VXC_RM_TowardZero, 0)); _viv_asm(COPY, scale_h, src1, 16); - VXC_DP4x4(scale_f, scale_h, scale_h, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0),\ - UniFP16toFP32Lo4_dp4x4); + VXC_DP4x4(scale_f, scale_h, scale_h, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), UniFP16toFP32Lo4_dp4x4); bias_f = read_imagef(bias, coord_para); - coord_para.x = 0; - coord_para.y = gidz; for(int i = 0; i < group_num; i++) { - mean_vari += read_imagef(meanVari, coord_para); - coord_para.x += 4; + mean_vari += read_imagef(meanVari, coord_para.yx); + coord_para.y += 4; } mean_vari *= dimRatio; mean_vari.s1 = mean_vari.s1 - mean_vari.s0 * mean_vari.s0 + eps; mean_vari.s1 = rsqrt(mean_vari.s1); scale_vari = scale_f.s0 * mean_vari.s1; - vxc_float4 tmpData0, tmpData1, tmpData2, tmpData3; - vxc_short8 outval; + vxc_float4 tmpData0, tmpData1, tmpData2, tmpData3, norm; half4 tmpVal0, tmpVal1; float alpha = input_fl_scale * scale_vari; bias_val = (bias_f.s0 - scale_vari * mean_vari.s0); - vxc_half8 dst; for(; coord.y < endH;) { - coord_para = coord; VXC_ReadImage(src0, input, coord.xy, VXC_5BITOFFSET_XY(0, 0),\ VXC_MODIFIER(0, 15, 0, VXC_RM_TowardZero, 0)); + coord_para = coord; coord.y++; - VXC_DP4x4(tmpData0, src0, src0, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0),\ - uniConvertDirInt8Fp32_4x4); - VXC_DP4x4(tmpData1, src0, src0, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0),\ - uniConvertEndInt8Fp32_4x4); - VXC_DP4x4(tmpData2, src0, src0, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0),\ - uniConvertTrdInt8Fp32_4x4); - VXC_DP4x4(tmpData3, src0, src0, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0),\ - uniConvertFthInt8Fp32_4x4); - vxc_float4 norm; + VXC_DP4x4(tmpData0, src0, src0, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniConvertDirInt8Fp32_4x4); + VXC_DP4x4(tmpData1, src0, src0, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniConvertEndInt8Fp32_4x4); + VXC_DP4x4(tmpData2, src0, src0, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniConvertTrdInt8Fp32_4x4); + VXC_DP4x4(tmpData3, src0, src0, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniConvertFthInt8Fp32_4x4); norm = alpha * tmpData0 + bias_val; _viv_asm(CONV, tmpVal0, norm); norm = alpha * tmpData1 + bias_val; _viv_asm(CONV, tmpVal1, norm); - VXC_DP2x8(dst, tmpVal0, tmpVal1, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0),\ - uniConvertHalfToFp16_2x8); + VXC_DP2x8(dst, tmpVal0, tmpVal1, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0), uniConvertHalfToFp16_2x8); _viv_asm(COPY, outval, dst, 16); VXC_WriteImage(output, coord_para.xy, outval, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); coord_para.x += 8; @@ -299,21 +266,15 @@ __kernel __attribute__((reqd_work_group_size(16, 1, 1))) void instance_norm_I8to _viv_asm(CONV, tmpVal0, norm); norm = alpha * tmpData3 + bias_val; _viv_asm(CONV, tmpVal1, norm); - VXC_DP2x8(dst, tmpVal0, tmpVal1, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0),\ - uniConvertHalfToFp16_2x8); + VXC_DP2x8(dst, tmpVal0, tmpVal1, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0), uniConvertHalfToFp16_2x8); _viv_asm(COPY, outval, dst, 16); VXC_WriteImage(output, coord_para.xy, outval, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); } } __kernel __attribute__((reqd_work_group_size(16, 1, 1))) void instance_norm_I8toI8( - image2d_array_t input, - image2d_array_t bias, - image2d_array_t scale, - image2d_array_t meanVari, - image2d_array_t output, - float eps, - int rsFlg) + image2d_array_t 
input, image2d_array_t bias, image2d_array_t scale, image2d_t meanVari, + image2d_array_t output, float eps, int rsFlg) { int gidz = get_global_id(1); int4 coord = (int4)(get_global_id(0), 0, gidz, 0); @@ -330,12 +291,10 @@ __kernel __attribute__((reqd_work_group_size(16, 1, 1))) void instance_norm_I8to VXC_DP4x4(scale_f, scale_h, scale_h, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), UniFP16toFP32Lo4_dp4x4); bias_f = read_imagef(bias, coord_para); - coord_para.x = 0; - coord_para.y = gidz; for(int i = 0; i < group_num; i++) { - mean_vari += read_imagef(meanVari, coord_para); - coord_para.x += 4; + mean_vari += read_imagef(meanVari, coord_para.yx); + coord_para.y += 4; } mean_vari *= dimRatio; mean_vari.s1 = mean_vari.s1 - mean_vari.s0 * mean_vari.s0 + eps; @@ -343,47 +302,44 @@ __kernel __attribute__((reqd_work_group_size(16, 1, 1))) void instance_norm_I8to scale_vari = scale_f.s0 * mean_vari.s1; vxc_int4 tmpVal0, tmpVal1; - vxc_float4 tmpData0, tmpData1, tmpData2, tmpData3; + vxc_float4 tmpData0, tmpData1, tmpData2, tmpData3, norm; float alpha = inOut_fl_scale * scale_vari; bias_val = (bias_f.s0 - scale_vari * mean_vari.s0) * output_fl_scale; + int8 input_desc, output_desc; + _viv_asm(COPY, input_desc, input, sizeof(input_desc)); + int baseAddr_a = (int)get_global_id(1) * input_desc.s4 + input_desc.s0; + _viv_asm(MOV, coord.z, baseAddr_a); + + _viv_asm(COPY, output_desc, output, sizeof(output_desc)); + int baseAddr = (int)get_global_id(1) * output_desc.s4 + output_desc.s0; + _viv_asm(MOV, coord.w, baseAddr); + for(coord.y = 0; coord.y < height; coord.y++) { - VXC_ReadImage2DArray(src0, input, coord, VXC_5BITOFFSET_XY(0, 0),\ - VXC_MODIFIER(0, 15, 0, VXC_RM_TowardZero, 0)); - VXC_DP4x4(tmpData0, src0, src0, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0),\ - uniConvertDirInt8Fp32_4x4); - VXC_DP4x4(tmpData1, src0, src0, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0),\ - uniConvertEndInt8Fp32_4x4); - VXC_DP4x4(tmpData2, src0, src0, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0),\ - uniConvertTrdInt8Fp32_4x4); - VXC_DP4x4(tmpData3, src0, src0, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0),\ - uniConvertFthInt8Fp32_4x4); - vxc_float4 norm; + VXC_OP4(img_load_3d, src0, input, coord.xyzz, VXC_5BITOFFSET_XY(0, 0), \ + VXC_MODIFIER(0, 15, 0, VXC_RM_TowardZero, 0)); + VXC_DP4x4(tmpData0, src0, src0, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniConvertDirInt8Fp32_4x4); + VXC_DP4x4(tmpData1, src0, src0, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniConvertEndInt8Fp32_4x4); + VXC_DP4x4(tmpData2, src0, src0, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniConvertTrdInt8Fp32_4x4); + VXC_DP4x4(tmpData3, src0, src0, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniConvertFthInt8Fp32_4x4); norm = tmpData0 * alpha + bias_val; tmpVal0 = convert_int4_rte(norm); norm = tmpData1 * alpha + bias_val; tmpVal1 = convert_int4_rte(norm); - VXC_DP2x8(src2, tmpVal0, tmpVal1, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 1),\ - uniConvertInt32toUint8_2x8); + VXC_DP2x8(src2, tmpVal0, tmpVal1, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 1), uniConvertInt32toUint8_2x8); norm = tmpData2 * alpha + bias_val; tmpVal0 = convert_int4_rte(norm); norm = tmpData3 * alpha + bias_val; tmpVal1 = convert_int4_rte(norm); - VXC_DP2x8(src2, tmpVal0, tmpVal1, VXC_MODIFIER(8, 15, 0, VXC_RM_TowardZero, 1),\ - uniConvertInt32toUint8_2x8); - VXC_WriteImage2DArray(output, coord, src2, VXC_MODIFIER(0, 15, 0, VXC_RM_TowardZero, 0)); + VXC_DP2x8(src2, tmpVal0, tmpVal1, VXC_MODIFIER(8, 15, 0, VXC_RM_TowardZero, 1), uniConvertInt32toUint8_2x8); + 
VXC_OP4_NoDest(img_store_3d, output, coord.xyww, src2, VXC_MODIFIER(0, 15, 0,VXC_RM_TowardZero, 0)); } } __kernel __attribute__((reqd_work_group_size(16, 1, 1))) void instance_norm_I8toI8_2D( - image2d_array_t input, - image2d_array_t bias, - image2d_array_t scale, - image2d_array_t meanVari, - image2d_array_t output, - float eps, - int rsFlg) + image2d_array_t input, image2d_array_t bias, image2d_array_t scale, image2d_t meanVari, + image2d_array_t output, float eps, int rsFlg) { int gidz = get_global_id(1); int gidy = gidz * height; @@ -402,12 +358,10 @@ __kernel __attribute__((reqd_work_group_size(16, 1, 1))) void instance_norm_I8to VXC_DP4x4(scale_f, scale_h, scale_h, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), UniFP16toFP32Lo4_dp4x4); bias_f = read_imagef(bias, coord_para); - coord_para.x = 0; - coord_para.y = gidz; for(int i = 0; i < group_num; i++) { - mean_vari += read_imagef(meanVari, coord_para); - coord_para.x += 4; + mean_vari += read_imagef(meanVari, coord_para.yx); + coord_para.y += 4; } mean_vari *= dimRatio; mean_vari.s1 = mean_vari.s1 - mean_vari.s0 * mean_vari.s0 + eps; @@ -415,35 +369,27 @@ __kernel __attribute__((reqd_work_group_size(16, 1, 1))) void instance_norm_I8to scale_vari = scale_f.s0 * mean_vari.s1; vxc_int4 tmpVal0, tmpVal1; - vxc_float4 tmpData0, tmpData1, tmpData2, tmpData3; + vxc_float4 tmpData0, tmpData1, tmpData2, tmpData3, norm; float alpha = inOut_fl_scale * scale_vari; bias_val = (bias_f.s0 - scale_vari * mean_vari.s0) * output_fl_scale; for(; coord.y < endH; coord.y++) { - VXC_ReadImage(src0, input, coord, VXC_5BITOFFSET_XY(0, 0),\ - VXC_MODIFIER(0, 15, 0, VXC_RM_TowardZero, 0)); - VXC_DP4x4(tmpData0, src0, src0, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0),\ - uniConvertDirInt8Fp32_4x4); - VXC_DP4x4(tmpData1, src0, src0, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0),\ - uniConvertEndInt8Fp32_4x4); - VXC_DP4x4(tmpData2, src0, src0, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0),\ - uniConvertTrdInt8Fp32_4x4); - VXC_DP4x4(tmpData3, src0, src0, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0),\ - uniConvertFthInt8Fp32_4x4); - vxc_float4 norm; + VXC_ReadImage(src0, input, coord, VXC_5BITOFFSET_XY(0, 0), VXC_MODIFIER(0, 15, 0, VXC_RM_TowardZero, 0)); + VXC_DP4x4(tmpData0, src0, src0, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniConvertDirInt8Fp32_4x4); + VXC_DP4x4(tmpData1, src0, src0, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniConvertEndInt8Fp32_4x4); + VXC_DP4x4(tmpData2, src0, src0, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniConvertTrdInt8Fp32_4x4); + VXC_DP4x4(tmpData3, src0, src0, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniConvertFthInt8Fp32_4x4); norm = tmpData0 * alpha + bias_val; tmpVal0 = convert_int4_rte(norm); norm = tmpData1 * alpha + bias_val; tmpVal1 = convert_int4_rte(norm); - VXC_DP2x8(src2, tmpVal0, tmpVal1, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 1),\ - uniConvertInt32toUint8_2x8); + VXC_DP2x8(src2, tmpVal0, tmpVal1, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 1), uniConvertInt32toUint8_2x8); norm = tmpData2 * alpha + bias_val; tmpVal0 = convert_int4_rte(norm); norm = tmpData3 * alpha + bias_val; tmpVal1 = convert_int4_rte(norm); - VXC_DP2x8(src2, tmpVal0, tmpVal1, VXC_MODIFIER(8, 15, 0, VXC_RM_TowardZero, 1),\ - uniConvertInt32toUint8_2x8); + VXC_DP2x8(src2, tmpVal0, tmpVal1, VXC_MODIFIER(8, 15, 0, VXC_RM_TowardZero, 1), uniConvertInt32toUint8_2x8); VXC_WriteImage(output, coord, src2, VXC_MODIFIER(0, 15, 0, VXC_RM_TowardZero, 0)); } } diff --git a/src/tim/vx/internal/src/libnnext/ops/vx/instance_normalization_scale_f32.vx 
b/src/tim/vx/internal/src/libnnext/ops/vx/instance_normalization_scale_f32.vx new file mode 100644 index 0000000..845945c --- /dev/null +++ b/src/tim/vx/internal/src/libnnext/ops/vx/instance_normalization_scale_f32.vx @@ -0,0 +1,285 @@ +#include "cl_viv_vx_ext.h" + +_viv_uniform int height; +_viv_uniform float dimRatio; +_viv_uniform int group_num; +_viv_uniform VXC_512Bits uniConvertInt32toUint8_2x8; + +_viv_uniform VXC_512Bits uniConvert1stUint8SubZpToFp32_4x4; +_viv_uniform VXC_512Bits uniConvert2ndUint8SubZpToFp32_4x4; +_viv_uniform VXC_512Bits uniConvert3rdUint8SubZpToFp32_4x4; +_viv_uniform VXC_512Bits uniConvert4thUint8SubZpToFp32_4x4; +_viv_uniform int inputZP; +_viv_uniform float scale_inOut; +_viv_uniform float outputScale; +_viv_uniform int output_ZP; +_viv_uniform float inOut_fl_scale; +_viv_uniform float output_fl_scale; +_viv_uniform VXC_512Bits uniConvertInt16Fp32Fst_4x4; +_viv_uniform VXC_512Bits uniConvertInt16Fp32Secd_4x4; +_viv_uniform VXC_512Bits uniConvertInt32toInt16_2x8; + +#define INSTANCENORM_8BITS_F32(src1_type_name, read_type) \ +__kernel void instance_norm_##src1_type_name##F32to##src1_type_name( \ + image2d_array_t input, image2d_t bias, image2d_t scale, image2d_t meanVari, \ + image2d_array_t output, float eps, int rsFlg) \ +{ \ + int gidz = get_global_id(1); \ + int4 coord = (int4)(get_global_id(0), 0, gidz, 0); \ + int2 coord_para = (int2)(gidz, 0); \ + read_type src0, src2; \ + float scale_vari, bias_val; \ + vxc_float4 mean_vari = (vxc_float4)(0); \ + \ + Image img1 = create_image_from_image2d(bias, 4); \ + Image img2 = create_image_from_image2d(scale, 4); \ + Image img3 = create_image_from_image2d(meanVari, 4); \ + __global float* bias_ptr = (__global float*)img1.ptr; \ + __global float* scal_ptr = (__global float*)img2.ptr; \ + __global uchar* sumVari_ptr = (__global uchar*)get_image_ptr_from_coord(img3, coord_para.yx); \ + __global float4* vari_ptr = (__global float4*)sumVari_ptr; \ + \ + float bval = bias_ptr[gidz]; \ + float sval = scal_ptr[gidz]; \ + \ + for(int i = 0; i < group_num; i++) \ + { \ + mean_vari += vari_ptr[i]; \ + } \ + mean_vari *= dimRatio; \ + mean_vari.s1 = mean_vari.s1 - mean_vari.s0 * mean_vari.s0 + eps; \ + mean_vari.s1 = rsqrt(mean_vari.s1); \ + \ + scale_vari = sval * mean_vari.s1; \ + short zp = inputZP; \ + vxc_int4 tmpVal0, tmpVal1; \ + vxc_float4 tmpData0, tmpData1, tmpData2, tmpData3, norm; \ + float alpha = scale_inOut * scale_vari; \ + bias_val = (bval - scale_vari * mean_vari.s0) * outputScale + output_ZP; \ + \ + int8 input_desc, output_desc; \ + _viv_asm(COPY, input_desc, input, sizeof(input_desc)); \ + int baseAddr_a = (int)get_global_id(1) * input_desc.s4 + input_desc.s0; \ + _viv_asm(MOV, coord.z, baseAddr_a); \ + _viv_asm(COPY, output_desc, output, sizeof(output_desc)); \ + int baseAddr = (int)get_global_id(1) * output_desc.s4 + output_desc.s0; \ + _viv_asm(MOV, coord.w, baseAddr); \ + \ + for(coord.y = 0; coord.y < height; coord.y++) \ + { \ + VXC_OP4(img_load_3d, src0, input, coord.xyzz, VXC_5BITOFFSET_XY(0, 0), \ + VXC_MODIFIER(0, 15, 0, VXC_RM_TowardZero, 0)); \ + VXC_DP4x4(tmpData0, src0, zp, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), \ + uniConvert1stUint8SubZpToFp32_4x4); \ + VXC_DP4x4(tmpData1, src0, zp, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), \ + uniConvert2ndUint8SubZpToFp32_4x4); \ + VXC_DP4x4(tmpData2, src0, zp, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), \ + uniConvert3rdUint8SubZpToFp32_4x4); \ + VXC_DP4x4(tmpData3, src0, zp, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), \ + 
uniConvert4thUint8SubZpToFp32_4x4); \ + norm = tmpData0 * alpha + bias_val; \ + tmpVal0 = convert_int4_rte(norm); \ + norm = tmpData1 * alpha + bias_val; \ + tmpVal1 = convert_int4_rte(norm); \ + VXC_DP2x8(src2, tmpVal0, tmpVal1, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 1), uniConvertInt32toUint8_2x8); \ + norm = tmpData2 * alpha + bias_val; \ + tmpVal0 = convert_int4_rte(norm); \ + norm = tmpData3 * alpha + bias_val; \ + tmpVal1 = convert_int4_rte(norm); \ + VXC_DP2x8(src2, tmpVal0, tmpVal1, VXC_MODIFIER(8, 15, 0, VXC_RM_TowardZero, 1), uniConvertInt32toUint8_2x8); \ + VXC_OP4_NoDest(img_store_3d, output, coord.xyww, src2, VXC_MODIFIER(0, 15, 0,VXC_RM_TowardZero, 0)); \ + } \ +} +INSTANCENORM_8BITS_F32(U8, vxc_uchar16) +INSTANCENORM_8BITS_F32(I8, vxc_char16) + +#define INSTANCENORM_8BITS_F32_2D(src1_type_name, read_type) \ +__kernel void instance_norm_##src1_type_name##F32to##src1_type_name##_2D( \ + image2d_array_t input, image2d_t bias, image2d_t scale, image2d_t meanVari, \ + image2d_array_t output, float eps, int rsFlg) \ +{ \ + int gidz = get_global_id(1); \ + int gidy = gidz * height; \ + int2 coord = (int2)(get_global_id(0), gidy); \ + int2 coord_para = (int2)(gidz, 0); \ + int endH = gidy + height; \ + read_type src0, src2; \ + float scale_vari, bias_val; \ + vxc_float4 mean_vari = (vxc_float4)(0); \ + \ + Image img1 = create_image_from_image2d(bias, 4); \ + Image img2 = create_image_from_image2d(scale, 4); \ + Image img3 = create_image_from_image2d(meanVari, 4); \ + __global float* bias_ptr = (__global float*)img1.ptr; \ + __global float* scal_ptr = (__global float*)img2.ptr; \ + __global uchar* sumVari_ptr = (__global uchar*)get_image_ptr_from_coord(img3, coord_para.yx); \ + __global float4* vari_ptr = (__global float4*)sumVari_ptr; \ + \ + float bval = bias_ptr[gidz]; \ + float sval = scal_ptr[gidz]; \ + \ + for(int i = 0; i < group_num; i++) \ + { \ + mean_vari += vari_ptr[i]; \ + } \ + \ + mean_vari *= dimRatio; \ + mean_vari.s1 = mean_vari.s1 - mean_vari.s0 * mean_vari.s0 + eps; \ + mean_vari.s1 = rsqrt(mean_vari.s1); \ + \ + scale_vari = sval * mean_vari.s1; \ + short zp = inputZP; \ + vxc_int4 tmpVal0, tmpVal1; \ + vxc_float4 tmpData0, tmpData1, tmpData2, tmpData3, norm; \ + float alpha = scale_inOut * scale_vari; \ + bias_val = (bval - scale_vari * mean_vari.s0) * outputScale + output_ZP; \ + \ + for(; coord.y < endH; coord.y++) \ + { \ + VXC_ReadImage(src0, input, coord, VXC_5BITOFFSET_XY(0, 0), VXC_MODIFIER(0, 15, 0, VXC_RM_TowardZero, 0)); \ + VXC_DP4x4(tmpData0, src0, zp, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), \ + uniConvert1stUint8SubZpToFp32_4x4); \ + VXC_DP4x4(tmpData1, src0, zp, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), \ + uniConvert2ndUint8SubZpToFp32_4x4); \ + VXC_DP4x4(tmpData2, src0, zp, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), \ + uniConvert3rdUint8SubZpToFp32_4x4); \ + VXC_DP4x4(tmpData3, src0, zp, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), \ + uniConvert4thUint8SubZpToFp32_4x4); \ + norm = tmpData0 * alpha + bias_val; \ + tmpVal0 = convert_int4_rte(norm); \ + norm = tmpData1 * alpha + bias_val; \ + tmpVal1 = convert_int4_rte(norm); \ + VXC_DP2x8(src2, tmpVal0, tmpVal1, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 1), uniConvertInt32toUint8_2x8); \ + norm = tmpData2 * alpha + bias_val; \ + tmpVal0 = convert_int4_rte(norm); \ + norm = tmpData3 * alpha + bias_val; \ + tmpVal1 = convert_int4_rte(norm); \ + VXC_DP2x8(src2, tmpVal0, tmpVal1, VXC_MODIFIER(8, 15, 0, VXC_RM_TowardZero, 1), uniConvertInt32toUint8_2x8); \ + VXC_WriteImage(output, coord, 
src2, VXC_MODIFIER(0, 15, 0, VXC_RM_TowardZero, 0)); \ + } \ +} +INSTANCENORM_8BITS_F32_2D(U8, vxc_uchar16) +INSTANCENORM_8BITS_F32_2D(I8, vxc_char16) + +__kernel __attribute__((reqd_work_group_size(16, 1, 1))) void instance_norm_I16F32toI16( + image2d_array_t input, image2d_t bias, image2d_t scale, image2d_t meanVari, + image2d_array_t output, float eps, int rsFlg) +{ + int gidz = get_global_id(1); + int4 coord = (int4)(get_global_id(0), 0, gidz, 0); + int2 coord_para = (int2)(gidz, 0); + vxc_short8 src0, src2; + float scale_vari, bias_val; + vxc_float4 mean_vari = (vxc_float4)(0); + + Image img1 = create_image_from_image2d(bias, 4); + Image img2 = create_image_from_image2d(scale, 4); + Image img3 = create_image_from_image2d(meanVari, 4); + __global float* bias_ptr = (__global float*)img1.ptr; + __global float* scal_ptr = (__global float*)img2.ptr; + __global uchar* sumVari_ptr = (__global uchar*)get_image_ptr_from_coord(img3, coord_para.yx); + __global float4* vari_ptr = (__global float4*)sumVari_ptr; + + float bval = bias_ptr[gidz]; + float sval = scal_ptr[gidz]; + + for(int i = 0; i < group_num; i++) + { + mean_vari += vari_ptr[i]; + } + + mean_vari *= dimRatio; + mean_vari.s1 = mean_vari.s1 - mean_vari.s0 * mean_vari.s0 + eps; + mean_vari.s1 = rsqrt(mean_vari.s1); + + scale_vari = sval * mean_vari.s1; + vxc_int4 tmpVal0, tmpVal1; + vxc_float4 tmpData0, tmpData1; + float alpha = inOut_fl_scale * scale_vari; + bias_val = (bval - scale_vari * mean_vari.s0) * output_fl_scale; + + int8 input_desc, output_desc; + _viv_asm(COPY, input_desc, input, sizeof(input_desc)); + int baseAddr_a = (int)get_global_id(1) * input_desc.s4 + input_desc.s0; + _viv_asm(MOV, coord.z, baseAddr_a); + + _viv_asm(COPY, output_desc, output, sizeof(output_desc)); + int baseAddr = (int)get_global_id(1) * output_desc.s4 + output_desc.s0; + _viv_asm(MOV, coord.w, baseAddr); + for(coord.y = 0; coord.y < height; coord.y++) + { + VXC_OP4(img_load_3d, src0, input, coord.xyzz, VXC_5BITOFFSET_XY(0, 0), \ + VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); + VXC_DP4x4(tmpData0, src0, src0, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0),\ + uniConvertInt16Fp32Fst_4x4); + VXC_DP4x4(tmpData1, src0, src0, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0),\ + uniConvertInt16Fp32Secd_4x4); + vxc_float4 norm; + norm = tmpData0 * alpha + bias_val; + tmpVal0 = convert_int4_rte(norm); + norm = tmpData1 * alpha + bias_val; + tmpVal1 = convert_int4_rte(norm); + VXC_DP2x8(src2, tmpVal0, tmpVal1, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 1),\ + uniConvertInt32toInt16_2x8); + VXC_OP4_NoDest(img_store_3d, output, coord.xyww, src2, \ + VXC_MODIFIER(0, 7, 0,VXC_RM_TowardZero, 0)); + } +} + +__kernel __attribute__((reqd_work_group_size(16, 1, 1))) void instance_norm_I16F32toI16_2D( + image2d_t input, image2d_t bias, image2d_t scale, + image2d_t meanVari, image2d_array_t output, float eps, int rsFlg) +{ + int gidz = get_global_id(1); + int gidy = gidz * height; + int2 coord = (int2)(get_global_id(0), gidy); + int2 coord_para = (int2)(gidz, 0); + int endH = gidy + height; + vxc_short8 src0, src2; + float scale_vari, bias_val; + vxc_float4 mean_vari = (vxc_float4)(0); + + Image img1 = create_image_from_image2d(bias, 4); + Image img2 = create_image_from_image2d(scale, 4); + Image img3 = create_image_from_image2d(meanVari, 4); + __global float* bias_ptr = (__global float*)img1.ptr; + __global float* scal_ptr = (__global float*)img2.ptr; + __global uchar* sumVari_ptr = (__global uchar*)get_image_ptr_from_coord(img3, coord_para.yx); + __global float4* vari_ptr = 
(__global float4*)sumVari_ptr; + + float bval = bias_ptr[gidz]; + float sval = scal_ptr[gidz]; + + for(int i = 0; i < group_num; i++) + { + mean_vari += vari_ptr[i]; + } + + mean_vari *= dimRatio; + mean_vari.s1 = mean_vari.s1 - mean_vari.s0 * mean_vari.s0 + eps; + mean_vari.s1 = rsqrt(mean_vari.s1); + + scale_vari = sval * mean_vari.s1; + vxc_int4 tmpVal0, tmpVal1; + vxc_float4 tmpData0, tmpData1; + float alpha = inOut_fl_scale * scale_vari; + bias_val = (bval - scale_vari * mean_vari.s0) * output_fl_scale; + + for(; coord.y < endH; coord.y++) + { + VXC_ReadImage(src0, input, coord, VXC_5BITOFFSET_XY(0, 0),\ + VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); + VXC_DP4x4(tmpData0, src0, src0, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0),\ + uniConvertInt16Fp32Fst_4x4); + VXC_DP4x4(tmpData1, src0, src0, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0),\ + uniConvertInt16Fp32Secd_4x4); + vxc_float4 norm; + norm = tmpData0 * alpha + bias_val; + tmpVal0 = convert_int4_rte(norm); + norm = tmpData1 * alpha + bias_val; + tmpVal1 = convert_int4_rte(norm); + VXC_DP2x8(src2, tmpVal0, tmpVal1, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 1),\ + uniConvertInt32toInt16_2x8); + VXC_WriteImage(output, coord, src2, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); + } +} \ No newline at end of file diff --git a/src/tim/vx/internal/src/libnnext/ops/vx/instance_normalization_scale_f32_bf16.vx b/src/tim/vx/internal/src/libnnext/ops/vx/instance_normalization_scale_f32_bf16.vx new file mode 100644 index 0000000..771b319 --- /dev/null +++ b/src/tim/vx/internal/src/libnnext/ops/vx/instance_normalization_scale_f32_bf16.vx @@ -0,0 +1,253 @@ +#include "cl_viv_vx_ext.h" + +_viv_uniform int width; +_viv_uniform int height; +_viv_uniform float dimRatio; +_viv_uniform int group_num; +_viv_uniform VXC_512Bits uniConvBF16toF32_Part0_2x8; +_viv_uniform VXC_512Bits uniConvBF16toF32_Part1_2x8; +_viv_uniform VXC_512Bits uniExtractOddData_2x8; + +constant vxc_short8 zero = (vxc_short8)(0, 0, 0, 0, 0, 0, 0, 0); +constant float4 one = (float4)(1.0, 1.0, 1.0, 1.0); + +__kernel __attribute__((reqd_work_group_size(16, 1, 1))) void instance_norm_meanvari_BF16( + image2d_array_t input, image2d_array_t output, float eps, int rsFlg) +{ + int gidx = get_global_id(0) << 3; + int lidx = get_local_id(0); + int gidz = get_global_id(1); + int4 coord = (int4)(gidx, 0, gidz, 0); + vxc_short8 src0, src1, src2; + float4 srcA, srcB; + vxc_float sum = 0, sqr = 0; + + __local float lcl_sum[16]; + __local float lcl_sqr[16]; + + int8 input_desc; + _viv_asm(COPY, input_desc, input, sizeof(input_desc)); + int baseAddr_a = (int)get_global_id(1) * input_desc.s4 + input_desc.s0; + _viv_asm(MOV, coord.z, baseAddr_a); + + if(gidx < width) + { + for(coord.y = 0; coord.y < height;) + { + VXC_OP4(img_load_3d, src0, input, coord, VXC_5BITOFFSET_XY(0, 0), \ + VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); + coord.y++; + VXC_DP2x8(src1, src0, zero, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0), + uniConvBF16toF32_Part0_2x8); + VXC_DP2x8(src2, src0, zero, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0), + uniConvBF16toF32_Part1_2x8); + _viv_asm(COPY, srcA, src1, 16); + _viv_asm(COPY, srcB, src2, 16); + sum += dot(srcA, one) + dot(srcB, one); + sqr += dot(srcA * srcA, one) + dot(srcB * srcB, one); + } + } + + lcl_sum[lidx] = sum; + lcl_sqr[lidx] = sqr; + barrier(CLK_LOCAL_MEM_FENCE); + + int4 coord_out = (int4)(get_group_id(0) << 2, gidz, 0, 0); + if(lidx == 0) + { + __local float4* tmp_sum = (__local float4*)lcl_sum; + __local float4* tmp_sqr = (__local float4*)lcl_sqr; + + sum = 0; + 
sqr = 0; + for(int i = 0; i < 4; i++) + { + sum += dot(tmp_sum[i], one); + sqr += dot(tmp_sqr[i], one); + } + + float4 data = (float4)(sum, sqr, 0, 0); + write_imagef(output, coord_out, data); + } +} + +__kernel __attribute__((reqd_work_group_size(16, 1, 1))) void instance_norm_meanvari_BF16_2D( + image2d_array_t input, image2d_array_t output, float eps, int rsFlg) +{ + int gidx = get_global_id(0) << 3; + int lidx = get_local_id(0); + int gidz = get_global_id(1); + int gidy = gidz * height; + + int2 coord = (int2)(gidx, gidy); + vxc_short8 src0, src1, src2; + float4 srcA, srcB; + vxc_float sum = 0, sqr = 0; + + __local float lcl_sum[16]; + __local float lcl_sqr[16]; + + int endH = gidy + height; + if(gidx < width) + { + for(; coord.y < endH;) + { + VXC_ReadImage(src0, input, coord, VXC_5BITOFFSET_XY(0, 0), + VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); + coord.y++; + VXC_DP2x8(src1, src0, zero, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0), + uniConvBF16toF32_Part0_2x8); + VXC_DP2x8(src2, src0, zero, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0), + uniConvBF16toF32_Part1_2x8); + _viv_asm(COPY, srcA, src1, 16); + _viv_asm(COPY, srcB, src2, 16); + sum += dot(srcA, one) + dot(srcB, one); + sqr += dot(srcA * srcA, one) + dot(srcB * srcB, one); + } + } + + lcl_sum[lidx] = sum; + lcl_sqr[lidx] = sqr; + barrier(CLK_LOCAL_MEM_FENCE); + + int4 coord_out = (int4)(get_group_id(0) << 2, gidz, 0, 0); + if(lidx == 0) + { + __local float4* tmp_sum = (__local float4*)lcl_sum; + __local float4* tmp_sqr = (__local float4*)lcl_sqr; + + sum = 0; + sqr = 0; + for(int i = 0; i < 4; i++) + { + sum += dot(tmp_sum[i], one); + sqr += dot(tmp_sqr[i], one); + } + + float4 data = (float4)(sum, sqr, 0, 0); + write_imagef(output, coord_out, data); + } +} + +__kernel __attribute__((reqd_work_group_size(16, 1, 1))) void instance_norm_BF16F32toBF16( + image2d_array_t input, image2d_t bias, image2d_t scale, image2d_t meanVari, + image2d_array_t output, float eps, int rsFlg) +{ + int gidz = get_global_id(1); + int4 coord = (int4)(get_global_id(0), 0, gidz, 0); + vxc_short8 src0, src1, src2; + float scale_vari, bias_val; + vxc_float4 mean_vari = (vxc_float4)(0); + + Image img1 = create_image_from_image2d(bias, 4); + Image img2 = create_image_from_image2d(scale, 4); + Image img3 = create_image_from_image2d(meanVari, 4); + __global float* bias_ptr = (__global float*)img1.ptr; + __global float* scal_ptr = (__global float*)img2.ptr; + __global uchar* sumVari_ptr = (__global uchar*)get_image_ptr_from_coord(img3, coord.wz); + __global float4* vari_ptr = (__global float4*)sumVari_ptr; + + float bval = bias_ptr[gidz]; + float sval = scal_ptr[gidz]; + + for(int i = 0; i < group_num; i++) + { + mean_vari += vari_ptr[i]; + } + + mean_vari *= dimRatio; + mean_vari.s1 = mean_vari.s1 - mean_vari.s0 * mean_vari.s0 + eps; + mean_vari.s1 = rsqrt(mean_vari.s1); + + scale_vari = sval * mean_vari.s1; + vxc_float4 tmpData0, tmpData1; + bias_val = (bval - scale_vari * mean_vari.s0); + + int8 input_desc, output_desc; + _viv_asm(COPY, input_desc, input, sizeof(input_desc)); + int baseAddr_a = (int)get_global_id(1) * input_desc.s4 + input_desc.s0; + _viv_asm(MOV, coord.z, baseAddr_a); + + _viv_asm(COPY, output_desc, output, sizeof(output_desc)); + int baseAddr = (int)get_global_id(1) * output_desc.s4 + output_desc.s0; + _viv_asm(MOV, coord.w, baseAddr); + + for(coord.y = 0; coord.y < height; coord.y++) + { + VXC_OP4(img_load_3d, src0, input, coord.xyzz, VXC_5BITOFFSET_XY(0, 0), \ + VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); + VXC_DP2x8(src1, src0, 
zero, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0), + uniConvBF16toF32_Part0_2x8); + VXC_DP2x8(src2, src0, zero, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0), + uniConvBF16toF32_Part1_2x8); + _viv_asm(COPY, tmpData0, src1, 16); + _viv_asm(COPY, tmpData1, src2, 16); + + vxc_float4 norm; + norm = scale_vari * tmpData0 + bias_val; + _viv_asm(COPY, src0, norm, 16); + norm = scale_vari * tmpData1 + bias_val; + _viv_asm(COPY, src1, norm, 16); + VXC_DP2x8(src2, src0, src1, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0), uniExtractOddData_2x8); + VXC_OP4_NoDest(img_store_3d, output, coord.xyww, src2, \ + VXC_MODIFIER(0, 7, 0,VXC_RM_TowardZero, 0)); + } +} + +__kernel __attribute__((reqd_work_group_size(16, 1, 1))) void instance_norm_BF16F32toBF16_2D( + image2d_array_t input, image2d_t bias, image2d_t scale, image2d_t meanVari, + image2d_array_t output, float eps, int rsFlg) +{ + int gidz = get_global_id(1); + int gidy = gidz * height; + int2 coord = (int2)(get_global_id(0), gidy); + int2 coord_para = (int2)(gidz, 0); + int endH = gidy + height; + vxc_short8 src0, src1, src2; + float scale_vari, bias_val; + vxc_float4 mean_vari = (vxc_float4)(0); + + Image img1 = create_image_from_image2d(bias, 4); + Image img2 = create_image_from_image2d(scale, 4); + Image img3 = create_image_from_image2d(meanVari, 4); + __global float* bias_ptr = (__global float*)img1.ptr; + __global float* scal_ptr = (__global float*)img2.ptr; + __global uchar* sumVari_ptr = (__global uchar*)get_image_ptr_from_coord(img3, coord_para.yx); + __global float4* vari_ptr = (__global float4*)sumVari_ptr; + + float bval = bias_ptr[gidz]; + float sval = scal_ptr[gidz]; + + for(int i = 0; i < group_num; i++) + { + mean_vari += vari_ptr[i]; + } + + mean_vari *= dimRatio; + mean_vari.s1 = mean_vari.s1 - mean_vari.s0 * mean_vari.s0 + eps; + mean_vari.s1 = rsqrt(mean_vari.s1); + + scale_vari = sval * mean_vari.s1; + vxc_float4 tmpData0, tmpData1; + bias_val = (bval - scale_vari * mean_vari.s0); + + for(; coord.y < endH; coord.y++) + { + VXC_ReadImage(src0, input, coord.xy, VXC_5BITOFFSET_XY(0, 0),\ + VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); + VXC_DP2x8(src1, src0, zero, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0), + uniConvBF16toF32_Part0_2x8); + VXC_DP2x8(src2, src0, zero, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0), + uniConvBF16toF32_Part1_2x8); + _viv_asm(COPY, tmpData0, src1, 16); + _viv_asm(COPY, tmpData1, src2, 16); + + vxc_float4 norm; + norm = scale_vari * tmpData0 + bias_val; + _viv_asm(COPY, src0, norm, 16); + norm = scale_vari * tmpData1 + bias_val; + _viv_asm(COPY, src1, norm, 16); + VXC_DP2x8(src2, src0, src1, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0), uniExtractOddData_2x8); + VXC_WriteImage(output, coord.xy, src2, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); + } +} \ No newline at end of file diff --git a/src/tim/vx/internal/src/libnnext/ops/vx/instance_normalization_scale_f32_f16.vx b/src/tim/vx/internal/src/libnnext/ops/vx/instance_normalization_scale_f32_f16.vx new file mode 100644 index 0000000..81e5ec5 --- /dev/null +++ b/src/tim/vx/internal/src/libnnext/ops/vx/instance_normalization_scale_f32_f16.vx @@ -0,0 +1,143 @@ +#include "cl_viv_vx_ext.h" + +_viv_uniform int height; +_viv_uniform float dimRatio; +_viv_uniform int group_num; +_viv_uniform VXC_512Bits UniFP16toFP32Lo4_dp4x4; +_viv_uniform VXC_512Bits uniConvertHalfToFp16_2x8; + +_viv_uniform VXC_512Bits uniConvertEndInt16Fp32_4x4; + +__kernel __attribute__((reqd_work_group_size(16, 1, 1))) void instance_norm_F16F32toF16( + image2d_array_t input, image2d_t bias, 
image2d_t scale, image2d_t meanVari, + image2d_array_t output, float eps, int rsFlg) +{ + int gidz = get_global_id(1); + int4 coord = (int4)(get_global_id(0), 0, gidz, 0); + vxc_short8 src0; + vxc_half8 in_h; + float scale_vari, bias_val; + vxc_float4 mean_vari = (vxc_float4)(0); + + Image img1 = create_image_from_image2d(bias, 4); + Image img2 = create_image_from_image2d(scale, 4); + Image img3 = create_image_from_image2d(meanVari, 4); + __global float* bias_ptr = (__global float*)img1.ptr; + __global float* scal_ptr = (__global float*)img2.ptr; + __global uchar* sumVari_ptr = (__global uchar*)get_image_ptr_from_coord(img3, coord.wz); + __global float4* vari_ptr = (__global float4*)sumVari_ptr; + + float bval = bias_ptr[gidz]; + float sval = scal_ptr[gidz]; + + for(int i = 0; i < group_num; i++) + { + mean_vari += vari_ptr[i]; + } + + mean_vari *= dimRatio; + mean_vari.s1 = mean_vari.s1 - mean_vari.s0 * mean_vari.s0 + eps; + mean_vari.s1 = rsqrt(mean_vari.s1); + + scale_vari = sval * mean_vari.s1; + vxc_float4 tmpData0, tmpData1; + vxc_short8 outval; + half4 tmpVal0, tmpVal1; + bias_val = (bval - scale_vari * mean_vari.s0); + vxc_half8 dst; + + int8 input_desc, output_desc; + _viv_asm(COPY, input_desc, input, sizeof(input_desc)); + int baseAddr_a = (int)get_global_id(1) * input_desc.s4 + input_desc.s0; + _viv_asm(MOV, coord.z, baseAddr_a); + + _viv_asm(COPY, output_desc, output, sizeof(output_desc)); + int baseAddr = (int)get_global_id(1) * output_desc.s4 + output_desc.s0; + _viv_asm(MOV, coord.w, baseAddr); + + for(coord.y = 0; coord.y < height; coord.y++) + { + VXC_OP4(img_load_3d, src0, input, coord.xyzz, VXC_5BITOFFSET_XY(0, 0), \ + VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); + _viv_asm(COPY, in_h, src0, 16); + + VXC_DP4x4(tmpData0, in_h, in_h, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0),\ + UniFP16toFP32Lo4_dp4x4); + VXC_DP4x4(tmpData1, in_h, in_h, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0),\ + uniConvertEndInt16Fp32_4x4); + + vxc_float4 norm; + norm = scale_vari * tmpData0 + bias_val; + _viv_asm(CONV, tmpVal0, norm); + norm = scale_vari * tmpData1 + bias_val; + _viv_asm(CONV, tmpVal1, norm); + VXC_DP2x8(dst, tmpVal0, tmpVal1, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0),\ + uniConvertHalfToFp16_2x8); + _viv_asm(COPY, outval, dst, 16); + VXC_OP4_NoDest(img_store_3d, output, coord.xyww, outval, \ + VXC_MODIFIER(0, 7, 0,VXC_RM_TowardZero, 0)); + } +} + +__kernel __attribute__((reqd_work_group_size(16, 1, 1))) void instance_norm_F16F32toF16_2D( + image2d_array_t input, image2d_t bias, image2d_t scale, image2d_t meanVari, + image2d_array_t output, float eps, int rsFlg) +{ + int gidz = get_global_id(1); + int gidy = gidz * height; + int2 coord = (int2)(get_global_id(0), gidy); + int2 coord_para = (int2)(gidz, 0); + int endH = gidy + height; + vxc_short8 src0; + vxc_half8 in_h; + float scale_vari, bias_val; + vxc_float4 mean_vari = (vxc_float4)(0); + + Image img1 = create_image_from_image2d(bias, 4); + Image img2 = create_image_from_image2d(scale, 4); + Image img3 = create_image_from_image2d(meanVari, 4); + __global float* bias_ptr = (__global float*)img1.ptr; + __global float* scal_ptr = (__global float*)img2.ptr; + __global uchar* sumVari_ptr = (__global uchar*)get_image_ptr_from_coord(img3, coord_para.yx); + __global float4* vari_ptr = (__global float4*)sumVari_ptr; + + float bval = bias_ptr[gidz]; + float sval = scal_ptr[gidz]; + + for(int i = 0; i < group_num; i++) + { + mean_vari += vari_ptr[i]; + } + + mean_vari *= dimRatio; + mean_vari.s1 = mean_vari.s1 - mean_vari.s0 * 
mean_vari.s0 + eps; + mean_vari.s1 = rsqrt(mean_vari.s1); + + scale_vari = sval * mean_vari.s1; + vxc_float4 tmpData0, tmpData1; + vxc_short8 outval; + half4 tmpVal0, tmpVal1; + bias_val = (bval - scale_vari * mean_vari.s0); + vxc_half8 dst; + + for(; coord.y < endH; coord.y++) + { + VXC_ReadImage(src0, input, coord.xy, VXC_5BITOFFSET_XY(0, 0),\ + VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); + _viv_asm(COPY, in_h, src0, 16); + + VXC_DP4x4(tmpData0, in_h, in_h, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0),\ + UniFP16toFP32Lo4_dp4x4); + VXC_DP4x4(tmpData1, in_h, in_h, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0),\ + uniConvertEndInt16Fp32_4x4); + vxc_float4 norm; + norm = scale_vari * tmpData0 + bias_val; + _viv_asm(CONV, tmpVal0, norm); + norm = scale_vari * tmpData1 + bias_val; + _viv_asm(CONV, tmpVal1, norm); + VXC_DP2x8(dst, tmpVal0, tmpVal1, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0),\ + uniConvertHalfToFp16_2x8); + _viv_asm(COPY, outval, dst, 16); + VXC_WriteImage(output, coord.xy, outval, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); + } +} \ No newline at end of file diff --git a/src/tim/vx/internal/src/libnnext/ops/vx/instance_normalization_u8.vx b/src/tim/vx/internal/src/libnnext/ops/vx/instance_normalization_u8.vx index 68f8f8a..4becc2b 100644 --- a/src/tim/vx/internal/src/libnnext/ops/vx/instance_normalization_u8.vx +++ b/src/tim/vx/internal/src/libnnext/ops/vx/instance_normalization_u8.vx @@ -5,7 +5,6 @@ _viv_uniform int height; _viv_uniform float dimRatio; _viv_uniform int group_num; _viv_uniform VXC_512Bits UniFP16toFP32Lo4_dp4x4; -_viv_uniform VXC_512Bits uniConvertHalfToFp16_2x8; _viv_uniform VXC_512Bits uniConvertInt32toUint8_2x8; _viv_uniform VXC_512Bits uniConvert1stUint8SubZpToFp32_4x4; @@ -25,9 +24,7 @@ _viv_uniform float outputScale; _viv_uniform int output_ZP; __kernel __attribute__((reqd_work_group_size(16, 1, 1))) void instance_norm_meanvari_U8( - image2d_array_t input, - image2d_array_t output, - float eps, int rsFlg) + image2d_array_t input, image2d_array_t output, float eps, int rsFlg) { int gidx = get_global_id(0) << 4; int lidx = get_local_id(0); @@ -35,17 +32,20 @@ __kernel __attribute__((reqd_work_group_size(16, 1, 1))) void instance_norm_mean int4 coord = (int4)(gidx, 0, gidz, 0); vxc_uchar16 src0; float sum = 0, sqr = 0; - int tmpSum = 0, tmpSqr = 0, tmpSum1, tmpSqr1; + int tmpSum = 0, tmpSqr = 0, tmpSum1 = 0, tmpSqr1 = 0; __local float lcl_sum[16]; __local float lcl_sqr[16]; - + int8 input_desc; + _viv_asm(COPY, input_desc, input, sizeof(input_desc)); + int baseAddr_a = (int)get_global_id(1) * input_desc.s4 + input_desc.s0; + _viv_asm(MOV, coord.z, baseAddr_a); if(gidx < width) { for(coord.y = 0; coord.y < height;) { - VXC_ReadImage2DArray(src0, input, coord, VXC_5BITOFFSET_XY(0, 0), - VXC_MODIFIER(0, 15, 0, VXC_RM_TowardZero, 0)); + VXC_OP4(img_load_3d, src0, input, coord, VXC_5BITOFFSET_XY(0, 0), \ + VXC_MODIFIER(0, 15, 0, VXC_RM_TowardZero, 0)); coord.y++; VXC_DP16x1(tmpSum1, src0, src0, VXC_MODIFIER(0, 0, 0, VXC_RM_TowardZero, 0), uniSumU8_16x1); tmpSum += (tmpSum1); @@ -55,7 +55,6 @@ __kernel __attribute__((reqd_work_group_size(16, 1, 1))) void instance_norm_mean sqr += (tmpSqr * e2InScale + rowSumScale); sum = (tmpSum + sumInZp) * input_scale; } - lcl_sum[lidx] = sum; lcl_sqr[lidx] = sqr; barrier(CLK_LOCAL_MEM_FENCE); @@ -66,23 +65,19 @@ __kernel __attribute__((reqd_work_group_size(16, 1, 1))) void instance_norm_mean float4 one = (float4)(1, 1, 1, 1); __local float4* tmp_sum = (__local float4*)lcl_sum; __local float4* tmp_sqr = (__local 
float4*)lcl_sqr; - sum = 0; sqr = 0; for(int i = 0; i < 4; i++) { sum += dot(tmp_sum[i], one); sqr += dot(tmp_sqr[i], one); } - float4 data = (float4)(sum, sqr, 0, 0); write_imagef(output, coord_out, data); } } __kernel __attribute__((reqd_work_group_size(16, 1, 1))) void instance_norm_meanvari_U8_2D( - image2d_array_t input, - image2d_array_t output, - float eps, int rsFlg) + image2d_array_t input, image2d_array_t output, float eps, int rsFlg) { int gidx = get_global_id(0) << 4; int lidx = get_local_id(0); @@ -93,17 +88,16 @@ __kernel __attribute__((reqd_work_group_size(16, 1, 1))) void instance_norm_mean vxc_uchar16 src0; float sum = 0, sqr = 0; int tmpSum = 0, tmpSqr = 0, tmpSum1, tmpSqr1; + int endH = gidy + height; __local float lcl_sum[16]; __local float lcl_sqr[16]; - - int endH = gidy + height; if(gidx < width) { for(; coord.y < endH;) { VXC_ReadImage(src0, input, coord, VXC_5BITOFFSET_XY(0, 0), - VXC_MODIFIER(0, 15, 0, VXC_RM_TowardZero, 0)); + VXC_MODIFIER(0, 15, 0, VXC_RM_TowardZero, 0)); coord.y++; VXC_DP16x1(tmpSum1, src0, src0, VXC_MODIFIER(0, 0, 0, VXC_RM_TowardZero, 0), uniSumU8_16x1); tmpSum += (tmpSum1); @@ -113,7 +107,6 @@ __kernel __attribute__((reqd_work_group_size(16, 1, 1))) void instance_norm_mean sqr += (tmpSqr * e2InScale + rowSumScale); sum = (tmpSum + sumInZp) * input_scale; } - lcl_sum[lidx] = sum; lcl_sqr[lidx] = sqr; barrier(CLK_LOCAL_MEM_FENCE); @@ -124,192 +117,20 @@ __kernel __attribute__((reqd_work_group_size(16, 1, 1))) void instance_norm_mean float4 one = (float4)(1, 1, 1, 1); __local float4* tmp_sum = (__local float4*)lcl_sum; __local float4* tmp_sqr = (__local float4*)lcl_sqr; - sum = 0; sqr = 0; for(int i = 0; i < 4; i++) { sum += dot(tmp_sum[i], one); sqr += dot(tmp_sqr[i], one); } - float4 data = (float4)(sum, sqr, 0, 0); write_imagef(output, coord_out, data); } } -__kernel __attribute__((reqd_work_group_size(16, 1, 1))) void instance_norm_U8toF16( - image2d_array_t input, - image2d_array_t bias, - image2d_array_t scale, - image2d_array_t meanVari, - image2d_array_t output, - float eps, - int rsFlg) -{ - int gidz = get_global_id(1); - int4 coord = (int4)(get_global_id(0), 0, gidz, 0); - int4 coord_para = (int4)(gidz, 0, 0, 0); - vxc_uchar16 src0; - vxc_short8 src1; - vxc_half8 scale_h; - float scale_vari, bias_val; - vxc_float4 bias_f, scale_f, mean_vari = (vxc_float4)(0); - - VXC_ReadImage(src1, scale, coord_para.xy, VXC_5BITOFFSET_XY(0, 0),\ - VXC_MODIFIER(0, 0, 0, VXC_RM_TowardZero, 0)); - _viv_asm(COPY, scale_h, src1, 16); - VXC_DP4x4(scale_f, scale_h, scale_h, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0),\ - UniFP16toFP32Lo4_dp4x4); - - bias_f = read_imagef(bias, coord_para); - - coord_para.x = 0; - coord_para.y = gidz; - for(int i = 0; i < group_num; i++) - { - mean_vari += read_imagef(meanVari, coord_para); - coord_para.x += 4; - } - mean_vari *= dimRatio; - mean_vari.s1 = mean_vari.s1 - mean_vari.s0 * mean_vari.s0 + eps; - mean_vari.s1 = rsqrt(mean_vari.s1); - - scale_vari = scale_f.s0 * mean_vari.s1; - short zp = inputZP; - vxc_float4 tmpData0, tmpData1, tmpData2, tmpData3; - vxc_short8 outval; - half4 tmpVal0, tmpVal1; - float alpha = input_scale * scale_vari; - bias_val = (bias_f.s0 - scale_vari * mean_vari.s0); - vxc_half8 dst; - - for(coord.y = 0; coord.y < height;) - { - coord_para = coord; - VXC_ReadImage2DArray(src0, input, coord, VXC_5BITOFFSET_XY(0, 0),\ - VXC_MODIFIER(0, 15, 0, VXC_RM_TowardZero, 0)); - coord.y++; - - VXC_DP4x4(tmpData0, src0, zp, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0),\ - uniConvert1stUint8SubZpToFp32_4x4); 
- VXC_DP4x4(tmpData1, src0, zp, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0),\ - uniConvert2ndUint8SubZpToFp32_4x4); - VXC_DP4x4(tmpData2, src0, zp, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0),\ - uniConvert3rdUint8SubZpToFp32_4x4); - VXC_DP4x4(tmpData3, src0, zp, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0),\ - uniConvert4thUint8SubZpToFp32_4x4); - vxc_float4 norm; - norm = alpha * tmpData0 + bias_val; - _viv_asm(CONV, tmpVal0, norm); - norm = alpha * tmpData1 + bias_val; - _viv_asm(CONV, tmpVal1, norm); - VXC_DP2x8(dst, tmpVal0, tmpVal1, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0),\ - uniConvertHalfToFp16_2x8); - _viv_asm(COPY, outval, dst, 16); - VXC_WriteImage2DArray(output, coord_para, outval, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); - coord_para.x += 8; - norm = alpha * tmpData2 + bias_val; - _viv_asm(CONV, tmpVal0, norm); - norm = alpha * tmpData3 + bias_val; - _viv_asm(CONV, tmpVal1, norm); - VXC_DP2x8(dst, tmpVal0, tmpVal1, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0),\ - uniConvertHalfToFp16_2x8); - _viv_asm(COPY, outval, dst, 16); - VXC_WriteImage2DArray(output, coord_para, outval, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); - } -} - -__kernel __attribute__((reqd_work_group_size(16, 1, 1))) void instance_norm_U8toF16_2D( - image2d_array_t input, - image2d_array_t bias, - image2d_array_t scale, - image2d_array_t meanVari, - image2d_array_t output, - float eps, - int rsFlg) -{ - int gidz = get_global_id(1); - int gidy = gidz * height; - int4 coord = (int4)(get_global_id(0), gidy, 0, 0); - int4 coord_para = (int4)(gidz, 0, 0, 0); - int endH = gidy + height; - vxc_uchar16 src0; - vxc_short8 src1; - vxc_half8 scale_h; - float scale_vari, bias_val; - vxc_float4 bias_f, scale_f, mean_vari = (vxc_float4)(0); - - VXC_ReadImage(src1, scale, coord_para.xy, VXC_5BITOFFSET_XY(0, 0),\ - VXC_MODIFIER(0, 0, 0, VXC_RM_TowardZero, 0)); - _viv_asm(COPY, scale_h, src1, 16); - VXC_DP4x4(scale_f, scale_h, scale_h, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0),\ - UniFP16toFP32Lo4_dp4x4); - - bias_f = read_imagef(bias, coord_para); - - coord_para.x = 0; - coord_para.y = gidz; - for(int i = 0; i < group_num; i++) - { - mean_vari += read_imagef(meanVari, coord_para); - coord_para.x += 4; - } - mean_vari *= dimRatio; - mean_vari.s1 = mean_vari.s1 - mean_vari.s0 * mean_vari.s0 + eps; - mean_vari.s1 = rsqrt(mean_vari.s1); - - scale_vari = scale_f.s0 * mean_vari.s1; - short zp = inputZP; - vxc_float4 tmpData0, tmpData1, tmpData2, tmpData3; - vxc_short8 outval; - half4 tmpVal0, tmpVal1; - float alpha = input_scale * scale_vari; - bias_val = (bias_f.s0 - scale_vari * mean_vari.s0); - vxc_half8 dst; - - for(; coord.y < endH;) - { - coord_para = coord; - VXC_ReadImage(src0, input, coord.xy, VXC_5BITOFFSET_XY(0, 0),\ - VXC_MODIFIER(0, 15, 0, VXC_RM_TowardZero, 0)); - coord.y++; - VXC_DP4x4(tmpData0, src0, zp, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0),\ - uniConvert1stUint8SubZpToFp32_4x4); - VXC_DP4x4(tmpData1, src0, zp, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0),\ - uniConvert2ndUint8SubZpToFp32_4x4); - VXC_DP4x4(tmpData2, src0, zp, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0),\ - uniConvert3rdUint8SubZpToFp32_4x4); - VXC_DP4x4(tmpData3, src0, zp, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0),\ - uniConvert4thUint8SubZpToFp32_4x4); - vxc_float4 norm; - norm = alpha * tmpData0 + bias_val; - _viv_asm(CONV, tmpVal0, norm); - norm = alpha * tmpData1 + bias_val; - _viv_asm(CONV, tmpVal1, norm); - VXC_DP2x8(dst, tmpVal0, tmpVal1, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0),\ - uniConvertHalfToFp16_2x8); - 
_viv_asm(COPY, outval, dst, 16); - VXC_WriteImage(output, coord_para.xy, outval, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); - coord_para.x += 8; - norm = alpha * tmpData2 + bias_val; - _viv_asm(CONV, tmpVal0, norm); - norm = alpha * tmpData3 + bias_val; - _viv_asm(CONV, tmpVal1, norm); - VXC_DP2x8(dst, tmpVal0, tmpVal1, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0),\ - uniConvertHalfToFp16_2x8); - _viv_asm(COPY, outval, dst, 16); - VXC_WriteImage(output, coord_para.xy, outval, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); - } -} - __kernel __attribute__((reqd_work_group_size(16, 1, 1))) void instance_norm_U8toU8( - image2d_array_t input, - image2d_array_t bias, - image2d_array_t scale, - image2d_array_t meanVari, - image2d_array_t output, - float eps, - int rsFlg) + image2d_array_t input, image2d_array_t bias, image2d_array_t scale, image2d_t meanVari, + image2d_array_t output, float eps, int rsFlg) { int gidz = get_global_id(1); int4 coord = (int4)(get_global_id(0), 0, gidz, 0); @@ -326,12 +147,10 @@ __kernel __attribute__((reqd_work_group_size(16, 1, 1))) void instance_norm_U8to VXC_DP4x4(scale_f, scale_h, scale_h, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), UniFP16toFP32Lo4_dp4x4); bias_f = read_imagef(bias, coord_para); - coord_para.x = 0; - coord_para.y = gidz; for(int i = 0; i < group_num; i++) { - mean_vari += read_imagef(meanVari, coord_para); - coord_para.x += 4; + mean_vari += read_imagef(meanVari, coord_para.yx); + coord_para.y += 4; } mean_vari *= dimRatio; mean_vari.s1 = mean_vari.s1 - mean_vari.s0 * mean_vari.s0 + eps; @@ -340,47 +159,43 @@ __kernel __attribute__((reqd_work_group_size(16, 1, 1))) void instance_norm_U8to scale_vari = scale_f.s0 * mean_vari.s1; short zp = inputZP; vxc_int4 tmpVal0, tmpVal1; - vxc_float4 tmpData0, tmpData1, tmpData2, tmpData3; + vxc_float4 tmpData0, tmpData1, tmpData2, tmpData3, norm; float alpha = scale_inOut * scale_vari; bias_val = (bias_f.s0 - scale_vari * mean_vari.s0) * outputScale + output_ZP; - for(coord.y = 0; coord.y < height;coord.y++) + int8 input_desc, output_desc; + _viv_asm(COPY, input_desc, input, sizeof(input_desc)); + int baseAddr_a = (int)get_global_id(1) * input_desc.s4 + input_desc.s0; + _viv_asm(MOV, coord.z, baseAddr_a); + _viv_asm(COPY, output_desc, output, sizeof(output_desc)); + int baseAddr = (int)get_global_id(1) * output_desc.s4 + output_desc.s0; + _viv_asm(MOV, coord.w, baseAddr); + + for(coord.y = 0; coord.y < height; coord.y++) { - VXC_ReadImage2DArray(src0, input, coord, VXC_5BITOFFSET_XY(0, 0),\ - VXC_MODIFIER(0, 15, 0, VXC_RM_TowardZero, 0)); - VXC_DP4x4(tmpData0, src0, zp, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0),\ - uniConvert1stUint8SubZpToFp32_4x4); - VXC_DP4x4(tmpData1, src0, zp, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0),\ - uniConvert2ndUint8SubZpToFp32_4x4); - VXC_DP4x4(tmpData2, src0, zp, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0),\ - uniConvert3rdUint8SubZpToFp32_4x4); - VXC_DP4x4(tmpData3, src0, zp, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0),\ - uniConvert4thUint8SubZpToFp32_4x4); - vxc_float4 norm; + VXC_OP4(img_load_3d, src0, input, coord.xyzz, VXC_5BITOFFSET_XY(0, 0), \ + VXC_MODIFIER(0, 15, 0, VXC_RM_TowardZero, 0)); + VXC_DP4x4(tmpData0, src0, zp, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniConvert1stUint8SubZpToFp32_4x4); + VXC_DP4x4(tmpData1, src0, zp, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniConvert2ndUint8SubZpToFp32_4x4); + VXC_DP4x4(tmpData2, src0, zp, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniConvert3rdUint8SubZpToFp32_4x4); + VXC_DP4x4(tmpData3, src0, zp, 
VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniConvert4thUint8SubZpToFp32_4x4); norm = tmpData0 * alpha + bias_val; tmpVal0 = convert_int4_rte(norm); norm = tmpData1 * alpha + bias_val; tmpVal1 = convert_int4_rte(norm); - VXC_DP2x8(src2, tmpVal0, tmpVal1, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 1),\ - uniConvertInt32toUint8_2x8); + VXC_DP2x8(src2, tmpVal0, tmpVal1, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 1), uniConvertInt32toUint8_2x8); norm = tmpData2 * alpha + bias_val; tmpVal0 = convert_int4_rte(norm); norm = tmpData3 * alpha + bias_val; tmpVal1 = convert_int4_rte(norm); - VXC_DP2x8(src2, tmpVal0, tmpVal1, VXC_MODIFIER(8, 15, 0, VXC_RM_TowardZero, 1),\ - uniConvertInt32toUint8_2x8); - VXC_WriteImage2DArray(output, coord, src2, VXC_MODIFIER(0, 15, 0, VXC_RM_TowardZero, 0)); + VXC_DP2x8(src2, tmpVal0, tmpVal1, VXC_MODIFIER(8, 15, 0, VXC_RM_TowardZero, 1), uniConvertInt32toUint8_2x8); + VXC_OP4_NoDest(img_store_3d, output, coord.xyww, src2, VXC_MODIFIER(0, 15, 0,VXC_RM_TowardZero, 0)); } } __kernel __attribute__((reqd_work_group_size(16, 1, 1))) void instance_norm_U8toU8_2D( - image2d_array_t input, - image2d_array_t bias, - image2d_array_t scale, - image2d_array_t meanVari, - image2d_array_t output, - float eps, - int rsFlg) + image2d_array_t input, image2d_array_t bias, image2d_array_t scale, image2d_t meanVari, + image2d_array_t output, float eps, int rsFlg) { int gidz = get_global_id(1); int gidy = gidz * height; @@ -399,12 +214,10 @@ __kernel __attribute__((reqd_work_group_size(16, 1, 1))) void instance_norm_U8to VXC_DP4x4(scale_f, scale_h, scale_h, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), UniFP16toFP32Lo4_dp4x4); bias_f = read_imagef(bias, coord_para); - coord_para.x = 0; - coord_para.y = gidz; for(int i = 0; i < group_num; i++) { - mean_vari += read_imagef(meanVari, coord_para); - coord_para.x += 4; + mean_vari += read_imagef(meanVari, coord_para.yx); + coord_para.y += 4; } mean_vari *= dimRatio; mean_vari.s1 = mean_vari.s1 - mean_vari.s0 * mean_vari.s0 + eps; @@ -413,35 +226,27 @@ __kernel __attribute__((reqd_work_group_size(16, 1, 1))) void instance_norm_U8to scale_vari = scale_f.s0 * mean_vari.s1; short zp = inputZP; vxc_int4 tmpVal0, tmpVal1; - vxc_float4 tmpData0, tmpData1, tmpData2, tmpData3; + vxc_float4 tmpData0, tmpData1, tmpData2, tmpData3, norm; float alpha = scale_inOut * scale_vari; bias_val = (bias_f.s0 - scale_vari * mean_vari.s0) * outputScale + output_ZP; for(; coord.y < endH; coord.y++) { - VXC_ReadImage(src0, input, coord, VXC_5BITOFFSET_XY(0, 0),\ - VXC_MODIFIER(0, 15, 0, VXC_RM_TowardZero, 0)); - VXC_DP4x4(tmpData0, src0, zp, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0),\ - uniConvert1stUint8SubZpToFp32_4x4); - VXC_DP4x4(tmpData1, src0, zp, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0),\ - uniConvert2ndUint8SubZpToFp32_4x4); - VXC_DP4x4(tmpData2, src0, zp, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0),\ - uniConvert3rdUint8SubZpToFp32_4x4); - VXC_DP4x4(tmpData3, src0, zp, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0),\ - uniConvert4thUint8SubZpToFp32_4x4); - vxc_float4 norm; + VXC_ReadImage(src0, input, coord, VXC_5BITOFFSET_XY(0, 0), VXC_MODIFIER(0, 15, 0, VXC_RM_TowardZero, 0)); + VXC_DP4x4(tmpData0, src0, zp, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniConvert1stUint8SubZpToFp32_4x4); + VXC_DP4x4(tmpData1, src0, zp, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniConvert2ndUint8SubZpToFp32_4x4); + VXC_DP4x4(tmpData2, src0, zp, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniConvert3rdUint8SubZpToFp32_4x4); + VXC_DP4x4(tmpData3, src0, zp, VXC_MODIFIER(0, 
3, 0, VXC_RM_TowardZero, 0), uniConvert4thUint8SubZpToFp32_4x4); norm = tmpData0 * alpha + bias_val; tmpVal0 = convert_int4_rte(norm); norm = tmpData1 * alpha + bias_val; tmpVal1 = convert_int4_rte(norm); - VXC_DP2x8(src2, tmpVal0, tmpVal1, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 1),\ - uniConvertInt32toUint8_2x8); + VXC_DP2x8(src2, tmpVal0, tmpVal1, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 1), uniConvertInt32toUint8_2x8); norm = tmpData2 * alpha + bias_val; tmpVal0 = convert_int4_rte(norm); norm = tmpData3 * alpha + bias_val; tmpVal1 = convert_int4_rte(norm); - VXC_DP2x8(src2, tmpVal0, tmpVal1, VXC_MODIFIER(8, 15, 0, VXC_RM_TowardZero, 1),\ - uniConvertInt32toUint8_2x8); + VXC_DP2x8(src2, tmpVal0, tmpVal1, VXC_MODIFIER(8, 15, 0, VXC_RM_TowardZero, 1), uniConvertInt32toUint8_2x8); VXC_WriteImage(output, coord, src2, VXC_MODIFIER(0, 15, 0, VXC_RM_TowardZero, 0)); } } \ No newline at end of file diff --git a/src/tim/vx/internal/src/libnnext/ops/vx/instance_normalization_u8_f16.vx b/src/tim/vx/internal/src/libnnext/ops/vx/instance_normalization_u8_f16.vx new file mode 100644 index 0000000..9602d13 --- /dev/null +++ b/src/tim/vx/internal/src/libnnext/ops/vx/instance_normalization_u8_f16.vx @@ -0,0 +1,147 @@ +#include "cl_viv_vx_ext.h" + +_viv_uniform int width; +_viv_uniform int height; +_viv_uniform float dimRatio; +_viv_uniform int group_num; +_viv_uniform VXC_512Bits UniFP16toFP32Lo4_dp4x4; +_viv_uniform VXC_512Bits uniConvertHalfToFp16_2x8; + +_viv_uniform VXC_512Bits uniConvert1stUint8SubZpToFp32_4x4; +_viv_uniform VXC_512Bits uniConvert2ndUint8SubZpToFp32_4x4; +_viv_uniform VXC_512Bits uniConvert3rdUint8SubZpToFp32_4x4; +_viv_uniform VXC_512Bits uniConvert4thUint8SubZpToFp32_4x4; +_viv_uniform float input_scale; +_viv_uniform int inputZP; + +__kernel __attribute__((reqd_work_group_size(16, 1, 1))) void instance_norm_U8toF16( + image2d_array_t input, image2d_array_t bias, image2d_array_t scale, image2d_t meanVari, + image2d_array_t output, float eps, int rsFlg) +{ + int gidz = get_global_id(1); + int4 coord = (int4)(get_global_id(0), 0, gidz, 0); + int4 coord_para = (int4)(gidz, 0, 0, 0); + vxc_uchar16 src0; + vxc_short8 src1, outval; + vxc_half8 scale_h, dst; + float scale_vari, bias_val; + vxc_float4 bias_f, scale_f, mean_vari = (vxc_float4)(0); + + VXC_ReadImage(src1, scale, coord_para.xy, VXC_5BITOFFSET_XY(0, 0),\ + VXC_MODIFIER(0, 0, 0, VXC_RM_TowardZero, 0)); + _viv_asm(COPY, scale_h, src1, 16); + VXC_DP4x4(scale_f, scale_h, scale_h, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), UniFP16toFP32Lo4_dp4x4); + bias_f = read_imagef(bias, coord_para); + for(int i = 0; i < group_num; i++) + { + mean_vari += read_imagef(meanVari, coord_para.yx); + coord_para.y += 4; + } + mean_vari *= dimRatio; + mean_vari.s1 = mean_vari.s1 - mean_vari.s0 * mean_vari.s0 + eps; + mean_vari.s1 = rsqrt(mean_vari.s1); + scale_vari = scale_f.s0 * mean_vari.s1; + short zp = inputZP; + vxc_float4 tmpData0, tmpData1, tmpData2, tmpData3, norm; + half4 tmpVal0, tmpVal1; + float alpha = input_scale * scale_vari; + bias_val = (bias_f.s0 - scale_vari * mean_vari.s0); + + coord_para = coord; + int8 input_desc, output_desc; + _viv_asm(COPY, input_desc, input, sizeof(input_desc)); + int baseAddr_a = (int)get_global_id(1) * input_desc.s4 + input_desc.s0; + _viv_asm(MOV, coord.z, baseAddr_a); + + _viv_asm(COPY, output_desc, output, sizeof(output_desc)); + int baseAddr = (int)get_global_id(1) * output_desc.s4 + output_desc.s0; + _viv_asm(MOV, coord_para.z, baseAddr); + for(coord.y = 0; coord.y < height;) + { + 
VXC_OP4(img_load_3d, src0, input, coord, VXC_5BITOFFSET_XY(0, 0), \ + VXC_MODIFIER(0, 15, 0, VXC_RM_TowardZero, 0)); + coord_para.xy = coord.xy; + coord.y++; + VXC_DP4x4(tmpData0, src0, zp, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniConvert1stUint8SubZpToFp32_4x4); + VXC_DP4x4(tmpData1, src0, zp, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniConvert2ndUint8SubZpToFp32_4x4); + VXC_DP4x4(tmpData2, src0, zp, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniConvert3rdUint8SubZpToFp32_4x4); + VXC_DP4x4(tmpData3, src0, zp, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniConvert4thUint8SubZpToFp32_4x4); + norm = alpha * tmpData0 + bias_val; + _viv_asm(CONV, tmpVal0, norm); + norm = alpha * tmpData1 + bias_val; + _viv_asm(CONV, tmpVal1, norm); + VXC_DP2x8(dst, tmpVal0, tmpVal1, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0), uniConvertHalfToFp16_2x8); + _viv_asm(COPY, outval, dst, 16); + VXC_OP4_NoDest(img_store_3d, output, coord_para, outval, VXC_MODIFIER(0, 7, 0,VXC_RM_TowardZero, 0)); + coord_para.x += 8; + norm = alpha * tmpData2 + bias_val; + _viv_asm(CONV, tmpVal0, norm); + norm = alpha * tmpData3 + bias_val; + _viv_asm(CONV, tmpVal1, norm); + VXC_DP2x8(dst, tmpVal0, tmpVal1, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0), uniConvertHalfToFp16_2x8); + _viv_asm(COPY, outval, dst, 16); + VXC_OP4_NoDest(img_store_3d, output, coord_para, outval, VXC_MODIFIER(0, 7, 0,VXC_RM_TowardZero, 0)); + } +} + +__kernel __attribute__((reqd_work_group_size(16, 1, 1))) void instance_norm_U8toF16_2D( + image2d_array_t input, image2d_array_t bias, image2d_array_t scale, image2d_t meanVari, + image2d_array_t output, float eps, int rsFlg) +{ + int gidz = get_global_id(1); + int gidy = gidz * height; + int4 coord = (int4)(get_global_id(0), gidy, 0, 0); + int4 coord_para = (int4)(gidz, 0, 0, 0); + int endH = gidy + height; + vxc_uchar16 src0; + vxc_short8 src1, outval; + vxc_half8 scale_h, dst; + float scale_vari, bias_val; + vxc_float4 bias_f, scale_f, mean_vari = (vxc_float4)(0); + + VXC_ReadImage(src1, scale, coord_para.xy, VXC_5BITOFFSET_XY(0, 0),\ + VXC_MODIFIER(0, 0, 0, VXC_RM_TowardZero, 0)); + _viv_asm(COPY, scale_h, src1, 16); + VXC_DP4x4(scale_f, scale_h, scale_h, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), UniFP16toFP32Lo4_dp4x4); + bias_f = read_imagef(bias, coord_para); + for(int i = 0; i < group_num; i++) + { + mean_vari += read_imagef(meanVari, coord_para.yx); + coord_para.y += 4; + } + mean_vari *= dimRatio; + mean_vari.s1 = mean_vari.s1 - mean_vari.s0 * mean_vari.s0 + eps; + mean_vari.s1 = rsqrt(mean_vari.s1); + + scale_vari = scale_f.s0 * mean_vari.s1; + short zp = inputZP; + vxc_float4 tmpData0, tmpData1, tmpData2, tmpData3, norm; + half4 tmpVal0, tmpVal1; + float alpha = input_scale * scale_vari; + bias_val = (bias_f.s0 - scale_vari * mean_vari.s0); + for(; coord.y < endH;) + { + VXC_ReadImage(src0, input, coord.xy, VXC_5BITOFFSET_XY(0, 0), VXC_MODIFIER(0, 15, 0, VXC_RM_TowardZero, 0)); + coord_para = coord; + coord.y++; + VXC_DP4x4(tmpData0, src0, zp, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniConvert1stUint8SubZpToFp32_4x4); + VXC_DP4x4(tmpData1, src0, zp, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniConvert2ndUint8SubZpToFp32_4x4); + VXC_DP4x4(tmpData2, src0, zp, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniConvert3rdUint8SubZpToFp32_4x4); + VXC_DP4x4(tmpData3, src0, zp, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniConvert4thUint8SubZpToFp32_4x4); + norm = alpha * tmpData0 + bias_val; + _viv_asm(CONV, tmpVal0, norm); + norm = alpha * tmpData1 + bias_val; + _viv_asm(CONV, 
tmpVal1, norm); + VXC_DP2x8(dst, tmpVal0, tmpVal1, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0), uniConvertHalfToFp16_2x8); + _viv_asm(COPY, outval, dst, 16); + VXC_WriteImage(output, coord_para.xy, outval, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); + coord_para.x += 8; + norm = alpha * tmpData2 + bias_val; + _viv_asm(CONV, tmpVal0, norm); + norm = alpha * tmpData3 + bias_val; + _viv_asm(CONV, tmpVal1, norm); + VXC_DP2x8(dst, tmpVal0, tmpVal1, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0), uniConvertHalfToFp16_2x8); + _viv_asm(COPY, outval, dst, 16); + VXC_WriteImage(output, coord_para.xy, outval, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); + } +} diff --git a/src/tim/vx/internal/src/libnnext/ops/vx/layer_normalization_scale_f32.vx b/src/tim/vx/internal/src/libnnext/ops/vx/layer_normalization_scale_f32.vx new file mode 100644 index 0000000..e39ef71 --- /dev/null +++ b/src/tim/vx/internal/src/libnnext/ops/vx/layer_normalization_scale_f32.vx @@ -0,0 +1,275 @@ +#include "cl_viv_vx_ext.h" + +/**************************layernorm float16***********************************/ +_viv_uniform int width; +_viv_uniform float dimRatio; +_viv_uniform VXC_512Bits uniFp16SumSqr_dp8x2; +_viv_uniform VXC_512Bits UniFP16toFP32Lo4_dp4x4; +_viv_uniform VXC_512Bits uniExtractHalf4_dp4x4; + +__kernel void layer_norm_F16F32toF16( + image2d_array_t input, image2d_t bias, image2d_t scale, + image2d_array_t output, float eps) +{ + int4 coord = (int4)(0, get_global_id(1), get_global_id(2), 0); + int4 coord_out = coord; + + int8 input_desc, output_desc; + _viv_asm(COPY, input_desc, input, sizeof(input_desc)); + int baseAddr_a = (int)get_global_id(2) * input_desc.s4 + input_desc.s0; + _viv_asm(MOV, coord.z, baseAddr_a); + + vxc_short8 src0; + vxc_float sum = 0, sqr = 0; + VXC_OP4(img_load_3d, src0, input, coord.xyzz, VXC_5BITOFFSET_XY(0, 0), \ + VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); + Image img1 = create_image_from_image2d(bias, 4); + Image img2 = create_image_from_image2d(scale, 4); + + _viv_asm(COPY, output_desc, output, sizeof(output_desc)); + int baseAddr = (int)get_global_id(2) * output_desc.s4 + output_desc.s0; + _viv_asm(MOV, coord_out.w, baseAddr); + + for(coord.x = 8; coord.x < (width+8); coord.x += 8) + { + vxc_half8 val0_h; + _viv_asm(COPY, val0_h, src0, 16); + VXC_OP4(img_load_3d, src0, input, coord.xyzz, VXC_5BITOFFSET_XY(0, 0), \ + VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); + vxc_float4 sumsqr; + VXC_DP8x2(sumsqr, val0_h, val0_h, VXC_MODIFIER(0, 1, 0, VXC_RM_TowardZero, 0),\ + uniFp16SumSqr_dp8x2); + sum += sumsqr.x; + sqr += sumsqr.y; + } + vxc_float mean; + mean = sum * dimRatio; + vxc_float vari; + vari = sqr*dimRatio - mean*mean; + vari += eps; + vari = rsqrt(vari); + vxc_float4 bias_f, scale_f, in_f; + __global float* bias_ptr = (__global float*)get_image_ptr_from_coord(img1, coord.ww); + __global float* scale_ptr = (__global float*)get_image_ptr_from_coord(img2, coord.ww); + for(coord.x = 0; coord.x < width; coord.x += 4) + { + VXC_OP4(img_load_3d, src0, input, coord.xyzz, VXC_5BITOFFSET_XY(0, 0), \ + VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0)); + bias_f = vload4(0, bias_ptr + coord.x); + scale_f = vload4(0, scale_ptr + coord.x); + vxc_half8 in_h; + _viv_asm(COPY, in_h, src0, 16); + VXC_DP4x4(in_f, in_h, in_h, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0),\ + UniFP16toFP32Lo4_dp4x4); + vxc_float4 sub, norm; + sub = in_f - mean; + norm = scale_f * vari * sub + bias_f; + half4 norm_h; + _viv_asm(CONV, norm_h, norm); + vxc_half8 dst; + VXC_DP4x4(dst, norm_h, norm_h, VXC_MODIFIER(0, 3, 0, 
VXC_RM_TowardZero, 0),\ + uniExtractHalf4_dp4x4); + vxc_short8 dstval; + _viv_asm(COPY, dstval, dst, 16); + coord_out.x = coord.x; + VXC_OP4_NoDest(img_store_3d, output, coord_out.xyww, dstval, \ + VXC_MODIFIER(0, 3, 0,VXC_RM_TowardZero, 0)); + } +} +/*****************************layernorm uint8 to uint8****************************/ +_viv_uniform VXC_512Bits uniConvert1stUint8SubZpToFp32_4x4; +_viv_uniform VXC_512Bits uniConvert2ndUint8SubZpToFp32_4x4; +_viv_uniform VXC_512Bits uniConvert3rdUint8SubZpToFp32_4x4; +_viv_uniform VXC_512Bits uniConvert4thUint8SubZpToFp32_4x4; +_viv_uniform VXC_512Bits uniSumU8_16x1; +_viv_uniform VXC_512Bits uniSqrSum_16x1; +_viv_uniform float input_scale; +_viv_uniform int inputZP; +_viv_uniform float outputScale; +_viv_uniform float output_zp; +_viv_uniform int sumInZp; +_viv_uniform int tmpZp1; +_viv_uniform int tmpZp2; +_viv_uniform float e2InScale; +_viv_uniform VXC_512Bits uniConvertInt32toUint8_2x8; + +_viv_uniform VXC_512Bits uniInt16SumSqr_dp8x2; +_viv_uniform float dimRatio_scale; + +__kernel void layer_norm_U8F32toU8( + image2d_array_t input, image2d_t bias, image2d_t scale, + image2d_array_t output, float eps) +{ + int4 coord = (int4)(0, get_global_id(1), get_global_id(2), 0); + int4 coord_out = coord; + + vxc_uchar16 src0, src2; + float sum = 0, sqr = 0; + vxc_float4 bias_f0, bias_f1, bias_f2, bias_f3, scale_f0, scale_f1, scale_f2, scale_f3; + int tmpSum = 0, tmpSqr = 0; + vxc_int4 tmpSum1; + vxc_int4 tmpSqr1; + short zp = inputZP; + + int8 input_desc, output_desc; + _viv_asm(COPY, input_desc, input, sizeof(input_desc)); + int baseAddr_a = (int)get_global_id(2) * input_desc.s4 + input_desc.s0; + _viv_asm(MOV, coord.z, baseAddr_a); + + _viv_asm(COPY, output_desc, output, sizeof(output_desc)); + int baseAddr = (int)get_global_id(2) * output_desc.s4 + output_desc.s0; + _viv_asm(MOV, coord_out.w, baseAddr); + + for(coord.x = 0; coord.x < width; coord.x += 16) + { + VXC_OP4(img_load_3d, src0, input, coord.xyzz, VXC_5BITOFFSET_XY(0, 0), \ + VXC_MODIFIER(0, 15, 0, VXC_RM_TowardZero, 0)); + VXC_DP16x1(tmpSum1, src0, src0, VXC_MODIFIER(0, 0, 0, VXC_RM_TowardZero, 0), uniSumU8_16x1); + tmpSum += (tmpSum1.x); + VXC_DP16x1(tmpSqr1, src0, src0, VXC_MODIFIER(0, 0, 0, VXC_RM_TowardZero, 0), uniSqrSum_16x1); + tmpSqr += (tmpSqr1.x + tmpZp1 * tmpSum1.x); + } + sum = (tmpSum + sumInZp) * input_scale; + sqr = (tmpSqr + tmpZp2) * e2InScale; + + float mean, vari; + mean = sum * dimRatio; + vari = sqr*dimRatio - mean*mean; + vari += eps; + vari = rsqrt(vari); + vxc_int4 tmpVal0, tmpVal1; + vxc_float4 tmpData0, tmpData1, tmpData2, tmpData3; + + Image img1 = create_image_from_image2d(bias, 4); + Image img2 = create_image_from_image2d(scale, 4); + __global float* bias_ptr = (__global float*)get_image_ptr_from_coord(img1, coord.ww); + __global float* scale_ptr = (__global float*)get_image_ptr_from_coord(img2, coord.ww); + for(coord.x = 0; coord.x < width; coord.x += 16) + { + VXC_OP4(img_load_3d, src0, input, coord.xyzz, VXC_5BITOFFSET_XY(0, 0), \ + VXC_MODIFIER(0, 15, 0, VXC_RM_TowardZero, 0)); + bias_f0 = vload4(0, bias_ptr); + bias_f1 = vload4(1, bias_ptr); + bias_f2 = vload4(2, bias_ptr); + bias_f3 = vload4(3, bias_ptr); + scale_f0 = vload4(0, scale_ptr); + scale_f1 = vload4(1, scale_ptr); + scale_f2 = vload4(2, scale_ptr); + scale_f3 = vload4(3, scale_ptr); + bias_ptr += 16; + scale_ptr += 16; + + VXC_DP4x4(tmpData0, src0, zp, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0),\ + uniConvert1stUint8SubZpToFp32_4x4); + VXC_DP4x4(tmpData1, src0, zp, VXC_MODIFIER(0, 3, 0, 
VXC_RM_TowardZero, 0),\ + uniConvert2ndUint8SubZpToFp32_4x4); + VXC_DP4x4(tmpData2, src0, zp, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0),\ + uniConvert3rdUint8SubZpToFp32_4x4); + VXC_DP4x4(tmpData3, src0, zp, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0),\ + uniConvert4thUint8SubZpToFp32_4x4); + tmpData0 *= input_scale; + tmpData1 *= input_scale; + tmpData2 *= input_scale; + tmpData3 *= input_scale; + + vxc_float4 norm; + tmpData0 -= mean; + norm = scale_f0 * vari * tmpData0 + bias_f0; + tmpVal0 = convert_int4_rte(norm * outputScale + output_zp); + + tmpData1 -= mean; + norm = scale_f1 * vari * tmpData1 + bias_f1; + tmpVal1 = convert_int4_rte(norm * outputScale + output_zp); + VXC_DP2x8(src2, tmpVal0, tmpVal1, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 1),\ + uniConvertInt32toUint8_2x8); + + tmpData2 -= mean; + norm = scale_f2 * vari * tmpData2 + bias_f2; + tmpVal0 = convert_int4_rte(norm * outputScale + output_zp); + + tmpData3 -= mean; + norm = scale_f3 * vari * tmpData3 + bias_f3; + tmpVal1 = convert_int4_rte(norm * outputScale + output_zp); + VXC_DP2x8(src2, tmpVal0, tmpVal1, VXC_MODIFIER(8, 15, 0, VXC_RM_TowardZero, 1),\ + uniConvertInt32toUint8_2x8); + coord_out.x = coord.x; + VXC_OP4_NoDest(img_store_3d, output, coord_out.xyww, src2, \ + VXC_MODIFIER(0, 15, 0,VXC_RM_TowardZero, 0)); + } +} + +__kernel void layer_norm_I16F32toI16( + image2d_array_t input, image2d_t bias, image2d_t scale, + image2d_array_t output, float eps) +{ + int4 coord = (int4)(0, get_global_id(1), get_global_id(2), 0); + + int8 input_desc, output_desc; + _viv_asm(COPY, input_desc, input, sizeof(input_desc)); + int baseAddr_a = (int)get_global_id(2) * input_desc.s4 + input_desc.s0; + _viv_asm(MOV, coord.z, baseAddr_a); + + _viv_asm(COPY, output_desc, output, sizeof(output_desc)); + int baseAddr = (int)get_global_id(2) * output_desc.s4 + output_desc.s0; + _viv_asm(MOV, coord.w, baseAddr); + + vxc_short8 src0, dst; + vxc_float sum = 0, sqr = 0; + for(; coord.x < width;) + { + VXC_OP4(img_load_3d, src0, input, coord.xyzz, VXC_5BITOFFSET_XY(0, 0), \ + VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); + coord.x += 8; + vxc_float4 sumsqr; + VXC_DP8x2(sumsqr, src0, src0, VXC_MODIFIER(0, 1, 0, VXC_RM_TowardZero, 0),\ + uniInt16SumSqr_dp8x2); + sum += sumsqr.x; + sqr = sqr + sumsqr.y * e2InScale; + } + vxc_float mean; + mean = sum * dimRatio_scale; + vxc_float vari; + vari = sqr*dimRatio - mean*mean; + vari += eps; + vari = rsqrt(vari); + + short zp = inputZP; + vxc_float4 tmpData0, tmpData1; + vxc_float4 bias_f0, bias_f1, scale_f0, scale_f1; + vxc_int4 tmpVal0, tmpVal1; + + int2 coord_bias = (int2)(0, 0); + Image img1 = create_image_from_image2d(bias, 4); + Image img2 = create_image_from_image2d(scale, 4); + __global float* bias_ptr = (__global float*)get_image_ptr_from_coord(img1, coord_bias); + __global float* scale_ptr = (__global float*)get_image_ptr_from_coord(img2, coord_bias); + for(coord.x = 0; coord.x < width; coord.x += 8) + { + VXC_OP4(img_load_3d, src0, input, coord.xyzz, VXC_5BITOFFSET_XY(0, 0), \ + VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); + bias_f0 = vload4(0, bias_ptr); + bias_f1 = vload4(1, bias_ptr); + scale_f0 = vload4(0, scale_ptr); + scale_f1 = vload4(1, scale_ptr); + bias_ptr += 8; + scale_ptr += 8; + + VXC_DP4x4(tmpData0, src0, zp, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0),\ + uniConvert1stUint8SubZpToFp32_4x4); + VXC_DP4x4(tmpData1, src0, zp, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0),\ + uniConvert2ndUint8SubZpToFp32_4x4); + + vxc_float4 sub, norm; + sub = tmpData0 * input_scale - mean; + norm = 
scale_f0 * vari * sub + bias_f0; + tmpVal0 = convert_int4_rte(norm * outputScale + output_zp); + sub = tmpData1 * input_scale - mean; + norm = scale_f1 * vari * sub + bias_f1; + tmpVal1 = convert_int4_rte(norm * outputScale + output_zp); + + VXC_DP2x8(dst, tmpVal0, tmpVal1, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 1),\ + uniConvertInt32toUint8_2x8); + VXC_OP4_NoDest(img_store_3d, output, coord.xyww, dst, \ + VXC_MODIFIER(0, 7, 0,VXC_RM_TowardZero, 0)); + } +} \ No newline at end of file diff --git a/src/tim/vx/internal/src/libnnext/ops/vx/layer_normalization_scale_f32_2d.vx b/src/tim/vx/internal/src/libnnext/ops/vx/layer_normalization_scale_f32_2d.vx new file mode 100644 index 0000000..8010726 --- /dev/null +++ b/src/tim/vx/internal/src/libnnext/ops/vx/layer_normalization_scale_f32_2d.vx @@ -0,0 +1,237 @@ +#include "cl_viv_vx_ext.h" + +/**************************layernorm float16***********************************/ +_viv_uniform int width; +_viv_uniform float dimRatio; +_viv_uniform VXC_512Bits uniFp16SumSqr_dp8x2; +_viv_uniform VXC_512Bits UniFP16toFP32Lo4_dp4x4; +_viv_uniform VXC_512Bits uniExtractHalf4_dp4x4; + +__kernel void layer_norm_F16F32toF16_2D( + image2d_t input, image2d_t bias, image2d_t scale, + image2d_t output, float eps) +{ + int4 coord = (int4)(0, get_global_id(1), 0, 0); + vxc_short8 src0, src1; + vxc_float sum = 0, sqr = 0; + VXC_ReadImage(src0, input, coord.xy, VXC_5BITOFFSET_XY(0, 0),\ + VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); + + Image img1 = create_image_from_image2d(bias, 4); + Image img2 = create_image_from_image2d(scale, 4); + + for(coord.x = 8; coord.x < (width+8); coord.x += 8) + { + vxc_half8 val0_h; + _viv_asm(COPY, val0_h, src0, 16); + VXC_ReadImage(src0, input, coord.xy, VXC_5BITOFFSET_XY(0, 0),\ + VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); + vxc_float4 sumsqr; + VXC_DP8x2(sumsqr, val0_h, val0_h, VXC_MODIFIER(0, 1, 0, VXC_RM_TowardZero, 0),\ + uniFp16SumSqr_dp8x2); + sum += sumsqr.x; + sqr += sumsqr.y; + } + vxc_float mean; + mean = sum * dimRatio; + vxc_float vari; + vari = sqr*dimRatio - mean*mean; + vari += eps; + vari = rsqrt(vari); + vxc_float4 bias_f, scale_f, in_f; + __global float* bias_ptr = (__global float*)get_image_ptr_from_coord(img1, coord.zw); + __global float* scale_ptr = (__global float*)get_image_ptr_from_coord(img2, coord.zw); + for(coord.x = 0; coord.x < width; coord.x += 4) + { + VXC_ReadImage(src0, input, coord.xy, VXC_5BITOFFSET_XY(0, 0),\ + VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0)); + bias_f = vload4(0, bias_ptr + coord.x); + scale_f = vload4(0, scale_ptr + coord.x); + + vxc_half8 in_h; + _viv_asm(COPY, in_h, src0, 16); + VXC_DP4x4(in_f, in_h, in_h, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0),\ + UniFP16toFP32Lo4_dp4x4); + vxc_float4 sub, norm; + sub = in_f - mean; + norm = scale_f * vari * sub + bias_f; + half4 norm_h; + _viv_asm(CONV, norm_h, norm); + vxc_half8 dst; + VXC_DP4x4(dst, norm_h, norm_h, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0),\ + uniExtractHalf4_dp4x4); + vxc_short8 dstval; + _viv_asm(COPY, dstval, dst, 16); + VXC_WriteImage(output, coord.xy, dstval, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0)); + } +} +/*****************************layernorm uint8 to uint8****************************/ +_viv_uniform VXC_512Bits uniConvert1stUint8SubZpToFp32_4x4; +_viv_uniform VXC_512Bits uniConvert2ndUint8SubZpToFp32_4x4; +_viv_uniform VXC_512Bits uniConvert3rdUint8SubZpToFp32_4x4; +_viv_uniform VXC_512Bits uniConvert4thUint8SubZpToFp32_4x4; +_viv_uniform VXC_512Bits uniSumU8_16x1; +_viv_uniform VXC_512Bits 
uniSqrSum_16x1; +_viv_uniform float input_scale; +_viv_uniform int inputZP; +_viv_uniform float outputScale; +_viv_uniform float output_zp; +_viv_uniform int sumInZp; +_viv_uniform int tmpZp1; +_viv_uniform int tmpZp2; +_viv_uniform float e2InScale; +_viv_uniform VXC_512Bits uniConvertInt32toUint8_2x8; + +_viv_uniform VXC_512Bits uniInt16SumSqr_dp8x2; +_viv_uniform float dimRatio_scale; + +__kernel void layer_norm_U8F32toU8_2D( + image2d_t input, image2d_t bias, image2d_t scale, + image2d_t output, float eps) +{ + int4 coord = (int4)(0, get_global_id(1), 0, 0); + vxc_uchar16 src0, src2; + float sum = 0, sqr = 0; + vxc_float4 bias_f0, bias_f1, bias_f2, bias_f3, scale_f0, scale_f1, scale_f2, scale_f3; + int tmpSum = 0, tmpSqr = 0; + vxc_int4 tmpSum1; + vxc_int4 tmpSqr1; + short zp = inputZP; + + for(coord.x = 0; coord.x < width; coord.x += 16) + { + VXC_ReadImage(src0, input, coord.xy, VXC_5BITOFFSET_XY(0, 0),\ + VXC_MODIFIER(0, 15, 0, VXC_RM_TowardZero, 0)); + VXC_DP16x1(tmpSum1, src0, src0, VXC_MODIFIER(0, 0, 0, VXC_RM_TowardZero, 0), uniSumU8_16x1); + tmpSum += (tmpSum1.x); + VXC_DP16x1(tmpSqr1, src0, src0, VXC_MODIFIER(0, 0, 0, VXC_RM_TowardZero, 0), uniSqrSum_16x1); + tmpSqr += (tmpSqr1.x + tmpZp1 * tmpSum1.x); + } + sum = (tmpSum + sumInZp) * input_scale; + sqr = (tmpSqr + tmpZp2) * e2InScale; + + float mean, vari; + mean = sum * dimRatio; + vari = sqr*dimRatio - mean*mean; + vari += eps; + vari = rsqrt(vari); + vxc_int4 tmpVal0, tmpVal1; + vxc_float4 tmpData0, tmpData1, tmpData2, tmpData3; + + Image img1 = create_image_from_image2d(bias, 4); + Image img2 = create_image_from_image2d(scale, 4); + __global float* bias_ptr = (__global float*)get_image_ptr_from_coord(img1, coord.zw); + __global float* scale_ptr = (__global float*)get_image_ptr_from_coord(img2, coord.zw); + for(coord.x = 0; coord.x < width; coord.x += 16) + { + VXC_ReadImage(src0, input, coord.xy, VXC_5BITOFFSET_XY(0, 0),\ + VXC_MODIFIER(0, 15, 0, VXC_RM_TowardZero, 0)); + bias_f0 = vload4(0, bias_ptr); + bias_f1 = vload4(1, bias_ptr); + bias_f2 = vload4(2, bias_ptr); + bias_f3 = vload4(3, bias_ptr); + scale_f0 = vload4(0, scale_ptr); + scale_f1 = vload4(1, scale_ptr); + scale_f2 = vload4(2, scale_ptr); + scale_f3 = vload4(3, scale_ptr); + bias_ptr += 16; + scale_ptr += 16; + + VXC_DP4x4(tmpData0, src0, zp, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0),\ + uniConvert1stUint8SubZpToFp32_4x4); + VXC_DP4x4(tmpData1, src0, zp, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0),\ + uniConvert2ndUint8SubZpToFp32_4x4); + VXC_DP4x4(tmpData2, src0, zp, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0),\ + uniConvert3rdUint8SubZpToFp32_4x4); + VXC_DP4x4(tmpData3, src0, zp, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0),\ + uniConvert4thUint8SubZpToFp32_4x4); + tmpData0 = tmpData0 * input_scale - mean; + tmpData1 = tmpData1 * input_scale - mean; + tmpData2 = tmpData2 * input_scale - mean; + tmpData3 = tmpData3 * input_scale - mean; + + vxc_float4 norm; + norm = scale_f0 * vari * tmpData0 + bias_f0; + tmpVal0 = convert_int4_rte(norm * outputScale + output_zp); + + norm = scale_f1 * vari * tmpData1 + bias_f1; + tmpVal1 = convert_int4_rte(norm * outputScale + output_zp); + VXC_DP2x8(src2, tmpVal0, tmpVal1, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 1),\ + uniConvertInt32toUint8_2x8); + + norm = scale_f2 * vari * tmpData2 + bias_f2; + tmpVal0 = convert_int4_rte(norm * outputScale + output_zp); + + norm = scale_f3 * vari * tmpData3 + bias_f3; + tmpVal1 = convert_int4_rte(norm * outputScale + output_zp); + VXC_DP2x8(src2, tmpVal0, tmpVal1, VXC_MODIFIER(8, 15, 
0, VXC_RM_TowardZero, 1),\ + uniConvertInt32toUint8_2x8); + VXC_WriteImage(output, coord.xy, src2, VXC_MODIFIER(0, 15, 0, VXC_RM_TowardZero, 0)); + } +} + +__kernel void layer_norm_I16F32toI16_2D( + image2d_t input, image2d_t bias, image2d_t scale, + image2d_t output, float eps) +{ + int4 coord = (int4)(0, get_global_id(1), 0, 0); + + vxc_short8 src0, src1, dst; + vxc_float sum = 0, sqr = 0; + for(; coord.x < width;) + { + VXC_ReadImage(src0, input, coord.xy, VXC_5BITOFFSET_XY(0, 0),\ + VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); + coord.x += 8; + vxc_float4 sumsqr; + VXC_DP8x2(sumsqr, src0, src0, VXC_MODIFIER(0, 1, 0, VXC_RM_TowardZero, 0),\ + uniInt16SumSqr_dp8x2); + sum += sumsqr.x; + sqr = sqr + sumsqr.y * e2InScale; + } + vxc_float mean, vari; + mean = sum * dimRatio_scale; + vari = sqr * dimRatio - mean * mean; + vari += eps; + vari = rsqrt(vari); + + short zp = inputZP; + vxc_float4 tmpData0, tmpData1; + vxc_float4 bias_f0, bias_f1, scale_f0, scale_f1; + vxc_half8 scale_h; + vxc_int4 tmpVal0, tmpVal1; + + Image img1 = create_image_from_image2d(bias, 4); + Image img2 = create_image_from_image2d(scale, 4); + + __global float* bias_ptr = (__global float*)get_image_ptr_from_coord(img1, coord.zw); + __global float* scale_ptr = (__global float*)get_image_ptr_from_coord(img2, coord.zw); + for(coord.x = 0; coord.x < width; coord.x += 8) + { + VXC_ReadImage(src0, input, coord.xy, VXC_5BITOFFSET_XY(0, 0),\ + VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); + bias_f0 = vload4(0, bias_ptr); + bias_f1 = vload4(1, bias_ptr); + scale_f0 = vload4(0, scale_ptr); + scale_f1 = vload4(1, scale_ptr); + bias_ptr += 8; + scale_ptr += 8; + + VXC_DP4x4(tmpData0, src0, zp, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0),\ + uniConvert1stUint8SubZpToFp32_4x4); + VXC_DP4x4(tmpData1, src0, zp, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0),\ + uniConvert2ndUint8SubZpToFp32_4x4); + + vxc_float4 sub, norm; + sub = tmpData0 * input_scale - mean; + norm = scale_f0 * vari * sub + bias_f0; + tmpVal0 = convert_int4_rte(norm * outputScale + output_zp); + sub = tmpData1 * input_scale - mean; + norm = scale_f1 * vari * sub + bias_f1; + tmpVal1 = convert_int4_rte(norm * outputScale + output_zp); + + VXC_DP2x8(dst, tmpVal0, tmpVal1, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 1),\ + uniConvertInt32toUint8_2x8); + VXC_WriteImage(output, coord.xy, dst, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); + } +} \ No newline at end of file diff --git a/src/tim/vx/internal/src/libnnext/ops/vx/layer_normalization_scale_f32_bf16.vx b/src/tim/vx/internal/src/libnnext/ops/vx/layer_normalization_scale_f32_bf16.vx new file mode 100644 index 0000000..76e3ed9 --- /dev/null +++ b/src/tim/vx/internal/src/libnnext/ops/vx/layer_normalization_scale_f32_bf16.vx @@ -0,0 +1,159 @@ +#include "cl_viv_vx_ext.h" + +/**************************layernorm float16***********************************/ +_viv_uniform int width; +_viv_uniform float dimRatio; +_viv_uniform VXC_512Bits uniConvBF16toF32_Part0_2x8; +_viv_uniform VXC_512Bits uniConvBF16toF32_Part1_2x8; +_viv_uniform VXC_512Bits uniExtractOddData_2x8; + +__kernel void layer_norm_BF16F32toBF16( + image2d_array_t input, image2d_t bias, image2d_t scale, + image2d_array_t output, float eps) +{ + int4 coord = (int4)(0, get_global_id(1), get_global_id(2), 0); + int4 coord_out = coord; + + int8 input_desc, output_desc; + _viv_asm(COPY, input_desc, input, sizeof(input_desc)); + int baseAddr_a = (int)get_global_id(2) * input_desc.s4 + input_desc.s0; + _viv_asm(MOV, coord.z, baseAddr_a); + + vxc_short8 zero = 
(vxc_short8)(0, 0, 0, 0, 0, 0, 0, 0); + float4 ones = (float4)(1.0, 1.0, 1.0, 1.0); + vxc_ushort8 src0, src1, src2; + vxc_float sum = 0, sqr = 0; + VXC_OP4(img_load_3d, src0, input, coord.xyzz, VXC_5BITOFFSET_XY(0, 0), \ + VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); + Image img1 = create_image_from_image2d(bias, 4); + Image img2 = create_image_from_image2d(scale, 4); + + _viv_asm(COPY, output_desc, output, sizeof(output_desc)); + int baseAddr = (int)get_global_id(2) * output_desc.s4 + output_desc.s0; + _viv_asm(MOV, coord_out.w, baseAddr); + float4 srcA, srcB; + for(coord.x = 8; coord.x < (width+8); coord.x += 8) + { + VXC_DP2x8(src1, src0, zero, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0), + uniConvBF16toF32_Part0_2x8); + VXC_DP2x8(src2, src0, zero, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0), + uniConvBF16toF32_Part1_2x8); + _viv_asm(COPY, srcA, src1, 16); + _viv_asm(COPY, srcB, src2, 16); + VXC_OP4(img_load_3d, src0, input, coord.xyzz, VXC_5BITOFFSET_XY(0, 0), \ + VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); + sum += dot(srcA, ones) + dot(srcB, ones); + sqr += dot(srcA * srcA, ones) + dot(srcB * srcB, ones); + } + vxc_float mean; + mean = sum * dimRatio; + vxc_float vari; + vari = sqr*dimRatio - mean*mean; + vari += eps; + vari = rsqrt(vari); + vxc_float4 bias_f0, bias_f1, scale_f0, scale_f1; + __global float* bias_ptr = (__global float*)get_image_ptr_from_coord(img1, coord.ww); + __global float* scale_ptr = (__global float*)get_image_ptr_from_coord(img2, coord.ww); + + for(coord.x = 0; coord.x < width; coord.x += 8) + { + VXC_OP4(img_load_3d, src0, input, coord.xyzz, VXC_5BITOFFSET_XY(0, 0), \ + VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); + bias_f0 = vload4(0, bias_ptr); + bias_f1 = vload4(1, bias_ptr); + scale_f0 = vload4(0, scale_ptr); + scale_f1 = vload4(1, scale_ptr); + bias_ptr += 8; + scale_ptr += 8; + + VXC_DP2x8(src1, src0, zero, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0), + uniConvBF16toF32_Part0_2x8); + VXC_DP2x8(src2, src0, zero, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0), + uniConvBF16toF32_Part1_2x8); + _viv_asm(COPY, srcA, src1, 16); + _viv_asm(COPY, srcB, src2, 16); + + + vxc_float4 sub0, sub1, norm0, norm1; + sub0 = srcA - mean; + sub1 = srcB - mean; + norm0 = scale_f0 * vari * sub0 + bias_f0; + norm1 = scale_f1 * vari * sub1 + bias_f1; + + _viv_asm(COPY, src0, norm0, 16); + _viv_asm(COPY, src1, norm1, 16); + VXC_DP2x8(src2, src0, src1, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0), uniExtractOddData_2x8); + + coord_out.x = coord.x; + VXC_OP4_NoDest(img_store_3d, output, coord_out.xyww, src2, \ + VXC_MODIFIER(0, 7, 0,VXC_RM_TowardZero, 0)); + } +} + +__kernel void layer_norm_BF16F32toBF16_2D( + image2d_t input, image2d_t bias, image2d_t scale, + image2d_t output, float eps) +{ + int4 coord = (int4)(0, get_global_id(1), 0, 0); + vxc_ushort8 src0, src1, src2; + vxc_float sum = 0, sqr = 0; + VXC_ReadImage(src0, input, coord.xy, VXC_5BITOFFSET_XY(0, 0),\ + VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); + + vxc_short8 zero = (vxc_short8)(0, 0, 0, 0, 0, 0, 0, 0); + float4 ones = (float4)(1.0, 1.0, 1.0, 1.0); + Image img1 = create_image_from_image2d(bias, 4); + Image img2 = create_image_from_image2d(scale, 4); + float4 srcA, srcB; + for(coord.x = 8; coord.x < (width+8); coord.x += 8) + { + VXC_DP2x8(src1, src0, zero, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0), + uniConvBF16toF32_Part0_2x8); + VXC_DP2x8(src2, src0, zero, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0), + uniConvBF16toF32_Part1_2x8); + _viv_asm(COPY, srcA, src1, 16); + _viv_asm(COPY, srcB, src2, 16); + 
VXC_ReadImage(src0, input, coord.xy, VXC_5BITOFFSET_XY(0, 0),\ + VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); + sum += dot(srcA, ones) + dot(srcB, ones); + sqr += dot(srcA * srcA, ones) + dot(srcB * srcB, ones); + } + vxc_float mean; + mean = sum * dimRatio; + vxc_float vari; + vari = sqr*dimRatio - mean*mean; + vari += eps; + vari = rsqrt(vari); + vxc_float4 bias_f0, bias_f1, scale_f0, scale_f1; + __global float* bias_ptr = (__global float*)get_image_ptr_from_coord(img1, coord.zw); + __global float* scale_ptr = (__global float*)get_image_ptr_from_coord(img2, coord.zw); + for(coord.x = 0; coord.x < width; coord.x += 8) + { + VXC_ReadImage(src0, input, coord.xy, VXC_5BITOFFSET_XY(0, 0),\ + VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); + bias_f0 = vload4(0, bias_ptr); + bias_f1 = vload4(1, bias_ptr); + scale_f0 = vload4(0, scale_ptr); + scale_f1 = vload4(1, scale_ptr); + bias_ptr += 8; + scale_ptr += 8; + + VXC_DP2x8(src1, src0, zero, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0), + uniConvBF16toF32_Part0_2x8); + VXC_DP2x8(src2, src0, zero, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0), + uniConvBF16toF32_Part1_2x8); + _viv_asm(COPY, srcA, src1, 16); + _viv_asm(COPY, srcB, src2, 16); + + vxc_float4 sub0, sub1, norm0, norm1; + sub0 = srcA - mean; + sub1 = srcB - mean; + norm0 = scale_f0 * vari * sub0 + bias_f0; + norm1 = scale_f1 * vari * sub1 + bias_f1; + + _viv_asm(COPY, src0, norm0, 16); + _viv_asm(COPY, src1, norm1, 16); + VXC_DP2x8(src2, src0, src1, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0), uniExtractOddData_2x8); + VXC_WriteImage(output, coord.xy, src2, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); + } +} \ No newline at end of file diff --git a/src/tim/vx/internal/src/libnnext/ops/vx/one_hot.vx b/src/tim/vx/internal/src/libnnext/ops/vx/one_hot.vx new file mode 100644 index 0000000..6d3cd52 --- /dev/null +++ b/src/tim/vx/internal/src/libnnext/ops/vx/one_hot.vx @@ -0,0 +1,205 @@ +#include "cl_viv_vx_ext.h" + +_viv_uniform VXC_512Bits uniDataConvert_0_4x4; +_viv_uniform VXC_512Bits uniDataConvert_1_4x4; +_viv_uniform VXC_512Bits uniExtract8Data_2x8; +_viv_uniform int depth; +#define ONE_HOT_SH_IMPL(name0, name1, src_type, copy_type, dst_type) \ +__kernel void one_hot_##name0##to##name1 \ + ( \ + __read_only image2d_array_t input, \ + __write_only image2d_array_t output, \ + int suffix_sz, \ + int on_val, \ + int off_val \ + ) \ +{ \ + int4 coord = (int4)(get_global_id(0), get_global_id(1), 0, 0); \ + \ + copy_type src; \ + src_type val; \ + \ + VXC_ReadImage(val, input, coord.xy, 0, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); \ + _viv_asm(COPY, src, val, 16); \ + \ + int4 data0, data1; \ + VXC_DP4x4(data0, src, src, \ + VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniDataConvert_0_4x4); \ + VXC_DP4x4(data1, src, src, \ + VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniDataConvert_1_4x4); \ + \ + do \ + { \ + int4 d0 = data0 == coord.zzzz ? on_val : off_val; \ + int4 d1 = data1 == coord.zzzz ? 
on_val : off_val; \ + \ + dst_type dst; \ + VXC_DP2x8(dst, d0, d1, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0), uniExtract8Data_2x8); \ + \ + VXC_WriteImage2DArray(output, coord.xzyw, dst, VXC_MODIFIER(0, 7, 0,VXC_RM_TowardZero, 0)); \ + \ + coord.z ++; \ + } while (coord.z < depth); \ +} +ONE_HOT_SH_IMPL(F16, F16, vxc_ushort8, vxc_half8, vxc_ushort8) +ONE_HOT_SH_IMPL(F16, I16, vxc_ushort8, vxc_half8, vxc_ushort8) +ONE_HOT_SH_IMPL(F16, I8, vxc_ushort8, vxc_half8, vxc_uchar8) +ONE_HOT_SH_IMPL(F16, U8, vxc_ushort8, vxc_half8, vxc_uchar8) +ONE_HOT_SH_IMPL(I16, F16, vxc_short8, vxc_short8, vxc_ushort8) +ONE_HOT_SH_IMPL(I16, I16, vxc_short8, vxc_short8, vxc_ushort8) +ONE_HOT_SH_IMPL(I8, F16, vxc_char8, vxc_char8, vxc_ushort8) +ONE_HOT_SH_IMPL(I8, I8, vxc_char8, vxc_char8, vxc_uchar8) + +#define ONE_HOT_SH_IMPL_2D(name0, name1, src_type, copy_type, dst_type) \ +__kernel void one_hot_##name0##to##name1##_2D \ + ( \ + __read_only image2d_array_t input, \ + __write_only image2d_array_t output, \ + int suffix_sz, \ + int on_val, \ + int off_val \ + ) \ +{ \ + int4 coord = (int4)(get_global_id(0), 0, get_global_id(0), get_global_id(0)); \ + \ + copy_type src; \ + src_type val; \ + \ + VXC_ReadImage(val, input, coord.xy, 0, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); \ + _viv_asm(COPY, src, val, 16); \ + \ + int4 data, data0, data1; \ + VXC_DP4x4(data, src, src, \ + VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniDataConvert_0_4x4); \ + int4 d4 = (int4)(0, 1, 2, 3); \ + \ + do \ + { \ + coord.zw = coord.xx + (int2)(0, 1); \ + dst_type dst; \ + data0 = data.xxxx == d4 ? on_val : off_val; \ + data1 = data.yyyy == d4 ? on_val : off_val; \ + \ + VXC_DP2x8(dst, data0, data1, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0), uniExtract8Data_2x8); \ + \ + VXC_WriteImage(output, coord.yz, dst, VXC_MODIFIER(0, 3, 0,VXC_RM_TowardZero, 0)); \ + VXC_WriteImage(output, coord.yw, dst, VXC_MODIFIER(4, 7, 0,VXC_RM_TowardZero, 0)); \ + coord.zw = coord.zw + (int2)(2, 2); \ + \ + data0 = data.zzzz == d4 ? on_val : off_val; \ + data1 = data.wwww == d4 ? 
on_val : off_val; \ + \ + VXC_DP2x8(dst, data0, data1, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0), uniExtract8Data_2x8); \ + \ + VXC_WriteImage(output, coord.yz, dst, VXC_MODIFIER(0, 3, 0,VXC_RM_TowardZero, 0)); \ + VXC_WriteImage(output, coord.yw, dst, VXC_MODIFIER(4, 7, 0,VXC_RM_TowardZero, 0)); \ + d4 += 4; \ + coord.y += 4; \ + } while (coord.y < depth); \ +} +ONE_HOT_SH_IMPL_2D(F16, F16, vxc_ushort8, vxc_half8, vxc_ushort8) +ONE_HOT_SH_IMPL_2D(F16, I16, vxc_ushort8, vxc_half8, vxc_ushort8) +ONE_HOT_SH_IMPL_2D(F16, I8, vxc_ushort8, vxc_half8, vxc_uchar8) +ONE_HOT_SH_IMPL_2D(F16, U8, vxc_ushort8, vxc_half8, vxc_uchar8) +ONE_HOT_SH_IMPL_2D(I16, F16, vxc_short8, vxc_short8, vxc_ushort8) +ONE_HOT_SH_IMPL_2D(I16, I16, vxc_short8, vxc_short8, vxc_ushort8) +ONE_HOT_SH_IMPL_2D(I8, F16, vxc_char8, vxc_char8, vxc_ushort8) +ONE_HOT_SH_IMPL_2D(I8, I8, vxc_char8, vxc_char8, vxc_uchar8) + +_viv_uniform float input_scale; +_viv_uniform float input_tail; +#define ONE_HOT_ASYM_SH_IMPL(name0, name1, src_type, copy_type, dst_type) \ +__kernel void one_hot_##name0##to##name1 \ + ( \ + __read_only image2d_array_t input, \ + __write_only image2d_array_t output, \ + int suffix_sz, \ + int on_val, \ + int off_val \ + ) \ +{ \ + int4 coord = (int4)(get_global_id(0), get_global_id(1), 0, 0); \ + \ + copy_type src; \ + src_type val; \ + \ + VXC_ReadImage(val, input, coord.xy, 0, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); \ + _viv_asm(COPY, src, val, 16); \ + \ + int4 data0, data1; \ + float4 v0, v1; \ + VXC_DP4x4(v0, src, src, \ + VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniDataConvert_0_4x4); \ + VXC_DP4x4(v1, src, src, \ + VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniDataConvert_1_4x4); \ + \ + data0 = convert_int4(v0 * input_scale + input_tail); \ + data1 = convert_int4(v1 * input_scale + input_tail); \ + do \ + { \ + int4 d0 = data0 == coord.zzzz ? on_val : off_val; \ + int4 d1 = data1 == coord.zzzz ? on_val : off_val; \ + \ + dst_type dst; \ + VXC_DP2x8(dst, d0, d1, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0), uniExtract8Data_2x8); \ + \ + VXC_WriteImage2DArray(output, coord.xzyw, dst, VXC_MODIFIER(0, 7, 0,VXC_RM_TowardZero, 0)); \ + \ + coord.z ++; \ + } while (coord.z < depth); \ +} +ONE_HOT_ASYM_SH_IMPL(U8, F16, vxc_uchar8, vxc_uchar8, vxc_ushort8) +ONE_HOT_ASYM_SH_IMPL(U8, U8, vxc_uchar8, vxc_uchar8, vxc_uchar8) + +#define ONE_HOT_ASYM_SH_IMPL_2D(name0, name1, src_type, copy_type, dst_type) \ +__kernel void one_hot_##name0##to##name1##_2D \ + ( \ + __read_only image2d_array_t input, \ + __write_only image2d_array_t output, \ + int suffix_sz, \ + int on_val, \ + int off_val \ + ) \ +{ \ + int4 coord = (int4)(get_global_id(0), 0, get_global_id(0), get_global_id(0)); \ + \ + copy_type src; \ + src_type val; \ + \ + VXC_ReadImage(val, input, coord.xy, 0, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); \ + _viv_asm(COPY, src, val, 16); \ + \ + int4 data, data0, data1; \ + float4 v0; \ + VXC_DP4x4(v0, src, src, \ + VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniDataConvert_0_4x4); \ + int4 d4 = (int4)(0, 1, 2, 3); \ + data = convert_int4(v0 * input_scale + input_tail); \ + \ + do \ + { \ + coord.zw = coord.xx + (int2)(0, 1); \ + dst_type dst; \ + data0 = data.xxxx == d4 ? on_val : off_val; \ + data1 = data.yyyy == d4 ? 
on_val : off_val; \ + \ + VXC_DP2x8(dst, data0, data1, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0), uniExtract8Data_2x8); \ + \ + VXC_WriteImage(output, coord.yz, dst, VXC_MODIFIER(0, 3, 0,VXC_RM_TowardZero, 0)); \ + VXC_WriteImage(output, coord.yw, dst, VXC_MODIFIER(4, 7, 0,VXC_RM_TowardZero, 0)); \ + coord.zw = coord.zw + (int2)(2, 2); \ + \ + data0 = data.zzzz == d4 ? on_val : off_val; \ + data1 = data.wwww == d4 ? on_val : off_val; \ + \ + VXC_DP2x8(dst, data0, data1, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0), uniExtract8Data_2x8); \ + \ + VXC_WriteImage(output, coord.yz, dst, VXC_MODIFIER(0, 3, 0,VXC_RM_TowardZero, 0)); \ + VXC_WriteImage(output, coord.yw, dst, VXC_MODIFIER(4, 7, 0,VXC_RM_TowardZero, 0)); \ + d4 += 4; \ + coord.y += 4; \ + } while (coord.y < depth); \ +} +ONE_HOT_ASYM_SH_IMPL_2D(U8, F16, vxc_uchar8, vxc_uchar8, vxc_ushort8) +ONE_HOT_ASYM_SH_IMPL_2D(U8, U8, vxc_uchar8, vxc_uchar8, vxc_uchar8) + diff --git a/src/tim/vx/internal/src/libnnext/ops/vx/pre_process_rgb_copy.vx b/src/tim/vx/internal/src/libnnext/ops/vx/pre_process_rgb_copy.vx index 95a43ed..c200019 100644 --- a/src/tim/vx/internal/src/libnnext/ops/vx/pre_process_rgb_copy.vx +++ b/src/tim/vx/internal/src/libnnext/ops/vx/pre_process_rgb_copy.vx @@ -6,10 +6,16 @@ _viv_uniform int r_order; _viv_uniform int b_order; _viv_uniform VXC_512Bits uniExtractRtoF32_part0_4x4; _viv_uniform VXC_512Bits uniExtractRtoF32_part1_4x4; +_viv_uniform VXC_512Bits uniExtractRtoF32_part2_4x4; +_viv_uniform VXC_512Bits uniExtractRtoF32_part3_4x4; _viv_uniform VXC_512Bits uniExtractGtoF32_part0_4x4; _viv_uniform VXC_512Bits uniExtractGtoF32_part1_4x4; +_viv_uniform VXC_512Bits uniExtractGtoF32_part2_4x4; +_viv_uniform VXC_512Bits uniExtractGtoF32_part3_4x4; _viv_uniform VXC_512Bits uniExtractBtoF32_part0_4x4; _viv_uniform VXC_512Bits uniExtractBtoF32_part1_4x4; +_viv_uniform VXC_512Bits uniExtractBtoF32_part2_4x4; +_viv_uniform VXC_512Bits uniExtractBtoF32_part3_4x4; _viv_uniform VXC_512Bits uniExtract8Data_2x8; #define IMAGE_PRE_PROCESS_COPY_16BITS(dst_name, dst_type, copy_type, convert_type) \ @@ -31,13 +37,14 @@ __kernel void pre_process_rgb_copy_U8to##dst_name \ { \ int2 coord = (int2)(get_global_id(0) * 3, get_global_id(1)); \ \ - coord.xy += (int2) (*xOffset, *yOffset); \ + coord.xy = coord.xy + (int2) (*xOffset * 3 + 16, *yOffset); \ vxc_uchar16 src0, src1; \ dst_type dst0; \ copy_type dst; \ \ - VXC_ReadImage(src0, input, coord.xy, 0, VXC_MODIFIER(0, 15, 0, VXC_RM_TowardZero, 0)); \ - VXC_ReadImage(src1, input, coord.xy, VXC_5BITOFFSET_XY(15, 0), \ + VXC_ReadImage(src0, input, coord.xy, VXC_5BITOFFSET_XY(-16, 0), \ + VXC_MODIFIER(0, 15, 0, VXC_RM_TowardZero, 0)); \ + VXC_ReadImage(src1, input, coord.xy, VXC_5BITOFFSET_XY(0, 0), \ VXC_MODIFIER(0, 15, 0, VXC_RM_TowardZero, 0)); \ \ f32Var *= outputScale; \ @@ -48,7 +55,7 @@ __kernel void pre_process_rgb_copy_U8to##dst_name \ float4 tmp0, tmp1; \ convert_type result0, result1; \ \ - VXC_DP4x4(tmp0, src0, src1, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniExtractRtoF32_part0_4x4); \ + VXC_DP4x4(tmp0, src0, src0, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniExtractRtoF32_part0_4x4); \ VXC_DP4x4(tmp1, src0, src1, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniExtractRtoF32_part1_4x4); \ tmp0 = tmp0 * paramData.w - paramData.x; \ tmp1 = tmp1 * paramData.w - paramData.x; \ @@ -59,7 +66,7 @@ __kernel void pre_process_rgb_copy_U8to##dst_name \ VXC_WriteImage2DArray(output, coord_out, dst, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); \ \ coord_out.z = 1; \ - VXC_DP4x4(tmp0, src0, 
src1, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniExtractGtoF32_part0_4x4); \ + VXC_DP4x4(tmp0, src0, src0, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniExtractGtoF32_part0_4x4); \ VXC_DP4x4(tmp1, src0, src1, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniExtractGtoF32_part1_4x4); \ tmp0 = tmp0 * paramData.w - paramData.y; \ tmp1 = tmp1 * paramData.w - paramData.y; \ @@ -70,7 +77,7 @@ __kernel void pre_process_rgb_copy_U8to##dst_name \ VXC_WriteImage2DArray(output, coord_out, dst, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); \ \ coord_out.z = b_order; \ - VXC_DP4x4(tmp0, src0, src1, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniExtractBtoF32_part0_4x4); \ + VXC_DP4x4(tmp0, src0, src0, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniExtractBtoF32_part0_4x4); \ VXC_DP4x4(tmp1, src0, src1, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniExtractBtoF32_part1_4x4); \ tmp0 = tmp0 * paramData.w - paramData.z; \ tmp1 = tmp1 * paramData.w - paramData.z; \ @@ -101,12 +108,16 @@ __kernel void pre_process_rgb_copy_U8to##dst_name \ ) \ { \ int2 coord = (int2)(get_global_id(0) * 3, get_global_id(1)); \ - coord.xy += (int2) (*xOffset, *yOffset); \ - vxc_uchar16 src0, src1; \ + coord.xy = coord.xy + (int2) (*xOffset * 3 + 16, *yOffset); \ + vxc_uchar16 src0, src1, src2; \ dst_type dst; \ \ - VXC_ReadImage(src0, input, coord.xy, 0, VXC_MODIFIER(0, 15, 0, VXC_RM_TowardZero, 0)); \ - VXC_ReadImage(src1, input, coord.xy, VXC_5BITOFFSET_XY(15, 0), \ + VXC_ReadImage(src0, input, coord.xy, VXC_5BITOFFSET_XY(-16, 0), \ + VXC_MODIFIER(0, 15, 0, VXC_RM_TowardZero, 0)); \ + VXC_ReadImage(src1, input, coord.xy, VXC_5BITOFFSET_XY(0, 0), \ + VXC_MODIFIER(0, 15, 0, VXC_RM_TowardZero, 0)); \ + coord.x += 16; \ + VXC_ReadImage(src2, input, coord.xy, VXC_5BITOFFSET_XY(0, 0), \ VXC_MODIFIER(0, 15, 0, VXC_RM_TowardZero, 0)); \ \ f32Var *= outputScale; \ @@ -117,35 +128,55 @@ __kernel void pre_process_rgb_copy_U8to##dst_name \ float4 tmp0, tmp1; \ int4 result0, result1; \ \ - VXC_DP4x4(tmp0, src0, src1, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniExtractRtoF32_part0_4x4); \ + VXC_DP4x4(tmp0, src0, src0, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniExtractRtoF32_part0_4x4); \ VXC_DP4x4(tmp1, src0, src1, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniExtractRtoF32_part1_4x4); \ tmp0 = tmp0 * paramData.w - paramData.x; \ tmp1 = tmp1 * paramData.w - paramData.x; \ result0 = convert_int4_rte(tmp0); \ result1 = convert_int4_rte(tmp1); \ VXC_DP2x8(dst, result0, result1, VXC_MODIFIER(0, 7, 0, VXC_RM_ToNearestEven, 1), uniExtract8Data_2x8); \ - VXC_WriteImage2DArray(output, coord_out, dst, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); \ - \ + VXC_DP4x4(tmp0, src1, src2, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniExtractRtoF32_part2_4x4); \ + VXC_DP4x4(tmp1, src1, src2, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniExtractRtoF32_part3_4x4); \ + tmp0 = tmp0 * paramData.w - paramData.x; \ + tmp1 = tmp1 * paramData.w - paramData.x; \ + result0 = convert_int4_rte(tmp0); \ + result1 = convert_int4_rte(tmp1); \ + VXC_DP2x8(dst, result0, result1, VXC_MODIFIER(8, 15, 0, VXC_RM_ToNearestEven, 1), uniExtract8Data_2x8); \ + VXC_WriteImage2DArray(output, coord_out, dst, VXC_MODIFIER(0, 15, 0, VXC_RM_TowardZero, 0)); \ \ coord_out.z = 1; \ - VXC_DP4x4(tmp0, src0, src1, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniExtractGtoF32_part0_4x4); \ + VXC_DP4x4(tmp0, src0, src0, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniExtractGtoF32_part0_4x4); \ VXC_DP4x4(tmp1, src0, src1, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), 
uniExtractGtoF32_part1_4x4); \ tmp0 = tmp0 * paramData.w - paramData.y; \ tmp1 = tmp1 * paramData.w - paramData.y; \ result0 = convert_int4_rte(tmp0); \ result1 = convert_int4_rte(tmp1); \ VXC_DP2x8(dst, result0, result1, VXC_MODIFIER(0, 7, 0, VXC_RM_ToNearestEven, 1), uniExtract8Data_2x8); \ - VXC_WriteImage2DArray(output, coord_out, dst, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); \ + VXC_DP4x4(tmp0, src1, src2, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniExtractGtoF32_part2_4x4); \ + VXC_DP4x4(tmp1, src1, src2, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniExtractGtoF32_part3_4x4); \ + tmp0 = tmp0 * paramData.w - paramData.y; \ + tmp1 = tmp1 * paramData.w - paramData.y; \ + result0 = convert_int4_rte(tmp0); \ + result1 = convert_int4_rte(tmp1); \ + VXC_DP2x8(dst, result0, result1, VXC_MODIFIER(8, 15, 0, VXC_RM_ToNearestEven, 1), uniExtract8Data_2x8); \ + VXC_WriteImage2DArray(output, coord_out, dst, VXC_MODIFIER(0, 15, 0, VXC_RM_TowardZero, 0)); \ \ coord_out.z = b_order; \ - VXC_DP4x4(tmp0, src0, src1, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniExtractBtoF32_part0_4x4); \ + VXC_DP4x4(tmp0, src0, src0, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniExtractBtoF32_part0_4x4); \ VXC_DP4x4(tmp1, src0, src1, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniExtractBtoF32_part1_4x4); \ tmp0 = tmp0 * paramData.w - paramData.z; \ tmp1 = tmp1 * paramData.w - paramData.z; \ result0 = convert_int4_rte(tmp0); \ result1 = convert_int4_rte(tmp1); \ VXC_DP2x8(dst, result0, result1, VXC_MODIFIER(0, 7, 0, VXC_RM_ToNearestEven, 1), uniExtract8Data_2x8); \ - VXC_WriteImage2DArray(output, coord_out, dst, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); \ + VXC_DP4x4(tmp0, src1, src2, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniExtractBtoF32_part2_4x4); \ + VXC_DP4x4(tmp1, src1, src2, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniExtractBtoF32_part3_4x4); \ + tmp0 = tmp0 * paramData.w - paramData.z; \ + tmp1 = tmp1 * paramData.w - paramData.z; \ + result0 = convert_int4_rte(tmp0); \ + result1 = convert_int4_rte(tmp1); \ + VXC_DP2x8(dst, result0, result1, VXC_MODIFIER(8, 15, 0, VXC_RM_ToNearestEven, 1), uniExtract8Data_2x8); \ + VXC_WriteImage2DArray(output, coord_out, dst, VXC_MODIFIER(0, 15, 0, VXC_RM_TowardZero, 0)); \ } IMAGE_PRE_PROCESS_COPY_8BITS(U8, vxc_uchar16) IMAGE_PRE_PROCESS_COPY_8BITS(I8, vxc_char16) diff --git a/src/tim/vx/internal/src/libnnext/ops/vx/pre_process_yuv420_copy_u8.vx b/src/tim/vx/internal/src/libnnext/ops/vx/pre_process_yuv420_copy_u8.vx index 951ee96..bce976c 100644 --- a/src/tim/vx/internal/src/libnnext/ops/vx/pre_process_yuv420_copy_u8.vx +++ b/src/tim/vx/internal/src/libnnext/ops/vx/pre_process_yuv420_copy_u8.vx @@ -53,7 +53,7 @@ __kernel void pre_process_yuv420_copy_U8toU8( ) { int4 pos = (int4)(get_global_id(0) + (*xOffset), get_global_id(1) + (*yOffset), 0, 0); - int4 pos1 = (int4)((get_global_id(0) + (*xOffset)) >> 1, (get_global_id(1) + (*yOffset)) >> 1, 0, 0); + int2 pos1 = (int2)((get_global_id(0) + (*xOffset)) >> 1, (get_global_id(1) + (*yOffset)) >> 1); vxc_uchar16 Y; vxc_uchar8 U, V; vxc_int4 C0, C1, C2, C3; @@ -132,3 +132,109 @@ __kernel void pre_process_yuv420_copy_U8toU8( VXC_WriteImage2DArray(output, pos, dst2, VXC_MODIFIER(0, 15, 0, VXC_RM_TowardZero, 0)); } +__kernel void pre_process_yuv420_copy_U8toF16( + __read_only image2d_t y_img, + __read_only image2d_t u_img, + __read_only image2d_t v_img, + __write_only image2d_array_t output, + global int * xRatio, + global int * yRatio, + global int * xOffset, + global int * yOffset, + float rMean, + 
float gMean, + float bMean, + float var, + int reverse_channel, + int trans + ) +{ + int4 pos = (int4)(get_global_id(0) + (*xOffset), get_global_id(1) + (*yOffset), 0, 0); + int2 pos1 = (int2)((get_global_id(0) + (*xOffset)) >> 1, (get_global_id(1) + (*yOffset)) >> 1); + vxc_uchar16 Y; + vxc_uchar8 U, V; + vxc_int4 C0, C1, C2, C3; + vxc_uchar16 R, G, B; + vxc_half8 dst0, dst1, dst2, dst3, dst4, dst5; + vxc_short8 out0, out1, out2, out3, out4, out5; + + VXC_ReadImage(Y, y_img, pos.xy, VXC_5BITOFFSET_XY(0, 0), VXC_MODIFIER(0, 15, 0, VXC_RM_TowardZero, 0)); + VXC_ReadImage(U, u_img, pos1.xy, VXC_5BITOFFSET_XY(0, 0), VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); + VXC_ReadImage(V, v_img, pos1.xy, VXC_5BITOFFSET_XY(0, 0), VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); + + //C = Y - 16; + //D = U - 128; + //E = V - 128; + // calculate R + // ((298 * C + 409 * E + 128) >> 8) --> [(298Y + 409V - 56992) >> 8] + int tmpV = -56992; + VXC_DP4x4(C0, Y, V, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniCalculateTmpR1st_4x4); + VXC_DP4x4(C1, Y, V, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniCalculateTmpR2nd_4x4); + VXC_DP4x4(C2, Y, V, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniCalculateTmpR3rd_4x4); + VXC_DP4x4(C3, Y, V, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniCalculateTmpR4th_4x4); + + VXC_DP4x4(R, C0, tmpV, VXC_MODIFIER(0, 3, 0, VXC_RM_ToNearestEven, 1), uniCalculateR1st_4x4); + VXC_DP4x4(R, C1, tmpV, VXC_MODIFIER(4, 7, 0, VXC_RM_ToNearestEven, 1), uniCalculateR1st_4x4); + VXC_DP4x4(R, C2, tmpV, VXC_MODIFIER(8, 11, 0, VXC_RM_ToNearestEven, 1), uniCalculateR1st_4x4); + VXC_DP4x4(R, C3, tmpV, VXC_MODIFIER(12, 15, 0, VXC_RM_ToNearestEven, 1), uniCalculateR1st_4x4); + + // calculate G + // ((298 * C - 100* D - 208 * E + 128) >> 8) --> [(298Y - 100U - 208V + 34784) >> 8] + // 298Y - 208V + VXC_DP4x4(C0, Y, V, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniCalculateTmpG1st_4x4); + VXC_DP4x4(C1, Y, V, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniCalculateTmpG2nd_4x4); + VXC_DP4x4(C2, Y, V, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniCalculateTmpG3rd_4x4); + VXC_DP4x4(C3, Y, V, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniCalculateTmpG4th_4x4); + // 34784 - 100U + ushort tmpG = 34784; + vxc_ushort8 tmpDstG; + VXC_DP2x8(tmpDstG, U, tmpG, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0), uniCalculateTmpGbyU_2x8); + VXC_DP4x4(G, C0, tmpDstG, VXC_MODIFIER(0, 3, 0, VXC_RM_ToNearestEven, 1), uniCalculateG1st_4x4); + VXC_DP4x4(G, C1, tmpDstG, VXC_MODIFIER(4, 7, 0, VXC_RM_ToNearestEven, 1), uniCalculateG2nd_4x4); + VXC_DP4x4(G, C2, tmpDstG, VXC_MODIFIER(8, 11, 0, VXC_RM_ToNearestEven, 1), uniCalculateG3rd_4x4); + VXC_DP4x4(G, C3, tmpDstG, VXC_MODIFIER(12, 15, 0, VXC_RM_ToNearestEven, 1), uniCalculateG4th_4x4); + + // calculate B + // ((298 * C + 516 * D + 128) >> 8) ==> [(298Y + 516U - 70688) >> 8] + VXC_DP4x4(C0, Y, U, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniCalculateTmpB1st_4x4); + VXC_DP4x4(C1, Y, U, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniCalculateTmpB2nd_4x4); + VXC_DP4x4(C2, Y, U, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniCalculateTmpB3rd_4x4); + VXC_DP4x4(C3, Y, U, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniCalculateTmpB4th_4x4); + tmpV = -70688; + VXC_DP4x4(B, C0, tmpV, VXC_MODIFIER(0, 3, 0, VXC_RM_ToNearestEven, 1), uniCalculateB1st_4x4); + VXC_DP4x4(B, C1, tmpV, VXC_MODIFIER(4, 7, 0, VXC_RM_ToNearestEven, 1), uniCalculateB1st_4x4); + VXC_DP4x4(B, C2, tmpV, VXC_MODIFIER(8, 11, 0, VXC_RM_ToNearestEven, 1), uniCalculateB1st_4x4); + VXC_DP4x4(B, C3, tmpV, 
VXC_MODIFIER(12, 15, 0, VXC_RM_ToNearestEven, 1), uniCalculateB1st_4x4); + + float4 paramData = (float4)(bMean * var, gMean * var,\ + rMean * var, var); + half4 paramData_f16; + _viv_asm(CONV, paramData_f16, paramData); + + VXC_DP2x8(dst0, B, paramData_f16, VXC_MODIFIER(0, 7, 0, VXC_RM_ToNearestEven, 1), uniQuantU8toU8LoB_2x8); + VXC_DP2x8(dst1, B, paramData_f16, VXC_MODIFIER(0, 7, 0, VXC_RM_ToNearestEven, 1), uniQuantU8toU8HiB_2x8); + + VXC_DP2x8(dst2, G, paramData_f16, VXC_MODIFIER(0, 7, 0, VXC_RM_ToNearestEven, 1), uniQuantU8toU8LoG_2x8); + VXC_DP2x8(dst3, G, paramData_f16, VXC_MODIFIER(0, 7, 0, VXC_RM_ToNearestEven, 1), uniQuantU8toU8HiG_2x8); + + VXC_DP2x8(dst4, R, paramData_f16, VXC_MODIFIER(0, 7, 0, VXC_RM_ToNearestEven, 1), uniQuantU8toU8LoR_2x8); + VXC_DP2x8(dst5, R, paramData_f16, VXC_MODIFIER(0, 7, 0, VXC_RM_ToNearestEven, 1), uniQuantU8toU8HiR_2x8); + + _viv_asm(COPY, out0, dst0, 16); + _viv_asm(COPY, out1, dst1, 16); + _viv_asm(COPY, out2, dst2, 16); + _viv_asm(COPY, out3, dst3, 16); + _viv_asm(COPY, out4, dst4, 16); + _viv_asm(COPY, out5, dst5, 16); + + pos = (int4)(get_global_id(0), get_global_id(1), bOrder, get_global_id(0) + 8); + VXC_WriteImage2DArray(output, pos.xyzz, out0, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); + VXC_WriteImage2DArray(output, pos.wyzz, out1, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); + pos.z = 1; + VXC_WriteImage2DArray(output, pos.xyzz, out2, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); + VXC_WriteImage2DArray(output, pos.wyzz, out3, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); + pos.z = rOrder; + VXC_WriteImage2DArray(output, pos.xyzz, out4, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); + VXC_WriteImage2DArray(output, pos.wyzz, out5, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); +} + diff --git a/src/tim/vx/internal/src/libnnext/ops/vx/pre_process_yuv444_copy_u8.vx b/src/tim/vx/internal/src/libnnext/ops/vx/pre_process_yuv444_copy_u8.vx index 20803c9..05f9973 100644 --- a/src/tim/vx/internal/src/libnnext/ops/vx/pre_process_yuv444_copy_u8.vx +++ b/src/tim/vx/internal/src/libnnext/ops/vx/pre_process_yuv444_copy_u8.vx @@ -51,7 +51,7 @@ __kernel void pre_process_yuv444_copy_U8toU8( int trans ) { - int4 pos = (int4)(get_global_id(0) + (*xOffset), get_global_id(1) + (*yOffset), 0, 0); + int2 pos = (int2)(get_global_id(0) + (*xOffset), get_global_id(1) + (*yOffset)); vxc_uchar16 Y, U, V; vxc_int4 C0, C1, C2, C3; vxc_uchar16 R, G, B; @@ -122,11 +122,116 @@ __kernel void pre_process_yuv444_copy_U8toU8( VXC_DP2x8(dst2, R, paramData_f16, VXC_MODIFIER(0, 7, 0, VXC_RM_ToNearestEven, 1), uniQuantU8toU8LoR_2x8); VXC_DP2x8(dst2, R, paramData_f16, VXC_MODIFIER(8, 15, 0, VXC_RM_ToNearestEven, 1), uniQuantU8toU8HiR_2x8); - pos = (int4)(get_global_id(0), get_global_id(1), 0, 0); - pos.z = bOrder; - VXC_WriteImage2DArray(output, pos, dst0, VXC_MODIFIER(0, 15, 0, VXC_RM_TowardZero, 0)); - pos.z = 1; - VXC_WriteImage2DArray(output, pos, dst1, VXC_MODIFIER(0, 15, 0, VXC_RM_TowardZero, 0)); - pos.z = rOrder; - VXC_WriteImage2DArray(output, pos, dst2, VXC_MODIFIER(0, 15, 0, VXC_RM_TowardZero, 0)); + int4 pos1 = (int4)(get_global_id(0), get_global_id(1), bOrder, 0); + VXC_WriteImage2DArray(output, pos1, dst0, VXC_MODIFIER(0, 15, 0, VXC_RM_TowardZero, 0)); + pos1.z = 1; + VXC_WriteImage2DArray(output, pos1, dst1, VXC_MODIFIER(0, 15, 0, VXC_RM_TowardZero, 0)); + pos1.z = rOrder; + VXC_WriteImage2DArray(output, pos1, dst2, VXC_MODIFIER(0, 15, 0, VXC_RM_TowardZero, 0)); +} + +__kernel void pre_process_yuv444_copy_U8toF16( + __read_only image2d_t y_img, + __read_only 
image2d_t u_img, + __read_only image2d_t v_img, + __write_only image2d_array_t output, + global int * xRatio, + global int * yRatio, + global int * xOffset, + global int * yOffset, + float rMean, + float gMean, + float bMean, + float var, + int reverse_channel, + int trans + ) +{ + int2 pos = (int2)(get_global_id(0) + (*xOffset), get_global_id(1) + (*yOffset)); + vxc_uchar16 Y, U, V; + vxc_int4 C0, C1, C2, C3; + vxc_uchar16 R, G, B; + vxc_half8 dst0, dst1, dst2, dst3, dst4, dst5; + vxc_short8 out0, out1, out2, out3, out4, out5; + + VXC_ReadImage(Y, y_img, pos.xy, VXC_5BITOFFSET_XY(0, 0), VXC_MODIFIER(0, 15, 0, VXC_RM_TowardZero, 0)); + VXC_ReadImage(U, u_img, pos.xy, VXC_5BITOFFSET_XY(0, 0), VXC_MODIFIER(0, 15, 0, VXC_RM_TowardZero, 0)); + VXC_ReadImage(V, v_img, pos.xy, VXC_5BITOFFSET_XY(0, 0), VXC_MODIFIER(0, 15, 0, VXC_RM_TowardZero, 0)); + + //C = Y - 16; + //D = U - 128; + //E = V - 128; + // calculate R + // ((298 * C + 409 * E + 128) >> 8) --> [(298Y + 409V - 56992) >> 8] + int tmpV = -56992; + VXC_DP4x4(C0, Y, V, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniCalculateTmpR1st_4x4); + VXC_DP4x4(C1, Y, V, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniCalculateTmpR2nd_4x4); + VXC_DP4x4(C2, Y, V, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniCalculateTmpR3rd_4x4); + VXC_DP4x4(C3, Y, V, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniCalculateTmpR4th_4x4); + + VXC_DP4x4(R, C0, tmpV, VXC_MODIFIER(0, 3, 0, VXC_RM_ToNearestEven, 1), uniCalculateR1st_4x4); + VXC_DP4x4(R, C1, tmpV, VXC_MODIFIER(4, 7, 0, VXC_RM_ToNearestEven, 1), uniCalculateR1st_4x4); + VXC_DP4x4(R, C2, tmpV, VXC_MODIFIER(8, 11, 0, VXC_RM_ToNearestEven, 1), uniCalculateR1st_4x4); + VXC_DP4x4(R, C3, tmpV, VXC_MODIFIER(12, 15, 0, VXC_RM_ToNearestEven, 1), uniCalculateR1st_4x4); + + // calculate G + // ((298 * C - 100* D - 208 * E + 128) >> 8) --> [(298Y - 100U - 208V + 34784) >> 8] + // 298Y - 208V + VXC_DP4x4(C0, Y, V, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniCalculateTmpG1st_4x4); + VXC_DP4x4(C1, Y, V, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniCalculateTmpG2nd_4x4); + VXC_DP4x4(C2, Y, V, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniCalculateTmpG3rd_4x4); + VXC_DP4x4(C3, Y, V, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniCalculateTmpG4th_4x4); + // 34784 - 100U + ushort tmpG = 34784; + vxc_ushort8 tmpDstG0, tmpDstG1; + VXC_DP2x8(tmpDstG0, U, tmpG, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0), uniCalculateTmpGbyU_2x8); + VXC_DP2x8(tmpDstG1, U, tmpG, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0), uniCalculateTmpGbyU2_2x8); + + VXC_DP4x4(G, C0, tmpDstG0, VXC_MODIFIER(0, 3, 0, VXC_RM_ToNearestEven, 1), uniCalculateG1st_4x4); + VXC_DP4x4(G, C1, tmpDstG0, VXC_MODIFIER(4, 7, 0, VXC_RM_ToNearestEven, 1), uniCalculateG2nd_4x4); + VXC_DP4x4(G, C2, tmpDstG1, VXC_MODIFIER(8, 11, 0, VXC_RM_ToNearestEven, 1), uniCalculateG1st_4x4); + VXC_DP4x4(G, C3, tmpDstG1, VXC_MODIFIER(12, 15, 0, VXC_RM_ToNearestEven, 1), uniCalculateG2nd_4x4); + + // calculate B + // ((298 * C + 516 * D + 128) >> 8) ==> [(298Y + 516U - 70688) >> 8] + VXC_DP4x4(C0, Y, U, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniCalculateTmpB1st_4x4); + VXC_DP4x4(C1, Y, U, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniCalculateTmpB2nd_4x4); + VXC_DP4x4(C2, Y, U, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniCalculateTmpB3rd_4x4); + VXC_DP4x4(C3, Y, U, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniCalculateTmpB4th_4x4); + tmpV = -70688; + VXC_DP4x4(B, C0, tmpV, VXC_MODIFIER(0, 3, 0, VXC_RM_ToNearestEven, 1), uniCalculateB1st_4x4); + VXC_DP4x4(B, C1, 
tmpV, VXC_MODIFIER(4, 7, 0, VXC_RM_ToNearestEven, 1), uniCalculateB1st_4x4); + VXC_DP4x4(B, C2, tmpV, VXC_MODIFIER(8, 11, 0, VXC_RM_ToNearestEven, 1), uniCalculateB1st_4x4); + VXC_DP4x4(B, C3, tmpV, VXC_MODIFIER(12, 15, 0, VXC_RM_ToNearestEven, 1), uniCalculateB1st_4x4); + + float4 paramData = (float4)(bMean * var, gMean * var,\ + rMean * var, var); + half4 paramData_f16; + _viv_asm(CONV, paramData_f16, paramData); + + VXC_DP2x8(dst0, B, paramData_f16, VXC_MODIFIER(0, 7, 0, VXC_RM_ToNearestEven, 1), uniQuantU8toU8LoB_2x8); + VXC_DP2x8(dst1, B, paramData_f16, VXC_MODIFIER(0, 7, 0, VXC_RM_ToNearestEven, 1), uniQuantU8toU8HiB_2x8); + + VXC_DP2x8(dst2, G, paramData_f16, VXC_MODIFIER(0, 7, 0, VXC_RM_ToNearestEven, 1), uniQuantU8toU8LoG_2x8); + VXC_DP2x8(dst3, G, paramData_f16, VXC_MODIFIER(0, 7, 0, VXC_RM_ToNearestEven, 1), uniQuantU8toU8HiG_2x8); + + VXC_DP2x8(dst4, R, paramData_f16, VXC_MODIFIER(0, 7, 0, VXC_RM_ToNearestEven, 1), uniQuantU8toU8LoR_2x8); + VXC_DP2x8(dst5, R, paramData_f16, VXC_MODIFIER(0, 7, 0, VXC_RM_ToNearestEven, 1), uniQuantU8toU8HiR_2x8); + + _viv_asm(COPY, out0, dst0, 16); + _viv_asm(COPY, out1, dst1, 16); + _viv_asm(COPY, out2, dst2, 16); + _viv_asm(COPY, out3, dst3, 16); + _viv_asm(COPY, out4, dst4, 16); + _viv_asm(COPY, out5, dst5, 16); + + int4 pos1 = (int4)(get_global_id(0), get_global_id(1), bOrder, get_global_id(0) + 8); + VXC_WriteImage2DArray(output, pos1.xyzz, out0, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); + VXC_WriteImage2DArray(output, pos1.wyzz, out1, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); + pos1.z = 1; + VXC_WriteImage2DArray(output, pos1.xyzz, out2, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); + VXC_WriteImage2DArray(output, pos1.wyzz, out3, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); + pos1.z = rOrder; + VXC_WriteImage2DArray(output, pos1.xyzz, out4, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); + VXC_WriteImage2DArray(output, pos1.wyzz, out5, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); } diff --git a/src/tim/vx/internal/src/libnnext/ops/vx/repeat.vx b/src/tim/vx/internal/src/libnnext/ops/vx/repeat.vx new file mode 100644 index 0000000..5898ea4 --- /dev/null +++ b/src/tim/vx/internal/src/libnnext/ops/vx/repeat.vx @@ -0,0 +1,224 @@ +#include "cl_viv_vx_ext.h" + +_viv_uniform VXC_512Bits uniIntegralHorAcc_4x4; +_viv_uniform VXC_512Bits uniExtract1to8Short_2x8; +_viv_uniform int width; + +// workgroup size is 32 +__kernel void preprocess_start_idx(image2d_t input, image2d_t output) +{ + int lidx = get_local_id(0); + __local int lcl_sum[32]; + __local int last_round[1]; + Image img = create_image_from_image2d(input, 4); + Image dst = create_image_from_image2d(output, 4); + __global int* index_ptr = (__global int*)img.ptr + get_global_id(0); + __global int* output_org = (__global int*)dst.ptr; + __global int* output_ptr = output_org + get_global_id(0) + 1; + + if (lidx == 0) + { + last_round[0] = 0; + output_org[0] = 0; + } + int4 accSum0, accSum1, accSum2, accSum3; + + for(int i = 0; i < width; i += 512) + { + int4 data0 = vload4(0, index_ptr + i); + int4 data1 = vload4(1, index_ptr + i); + int4 data2 = vload4(2, index_ptr + i); + int4 data3 = vload4(3, index_ptr + i); + barrier(CLK_LOCAL_MEM_FENCE); + int prevSum = last_round[0]; + + VXC_DP4x4(accSum0, data0, data0, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniIntegralHorAcc_4x4); + VXC_DP4x4(accSum1, data1, data1, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniIntegralHorAcc_4x4); + VXC_DP4x4(accSum2, data2, data2, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniIntegralHorAcc_4x4); + 
VXC_DP4x4(accSum3, data3, data3, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniIntegralHorAcc_4x4); + accSum1 += accSum0.w; + accSum2 += accSum1.w; + accSum3 += accSum2.w; + + lcl_sum[lidx] = accSum3.w; + barrier(CLK_LOCAL_MEM_FENCE); + + for(int j = 0; j < lidx; j++) + { + prevSum += lcl_sum[j]; + } + accSum0 += prevSum; + accSum1 += prevSum; + accSum2 += prevSum; + accSum3 += prevSum; + if(lidx == 31) + { + last_round[0] = accSum3.w; + } + vstore4(accSum0, 0, output_ptr + i); + vstore4(accSum1, 1, output_ptr + i); + vstore4(accSum2, 2, output_ptr + i); + vstore4(accSum3, 3, output_ptr + i); + } +} + +__kernel void repeat_I16_axis0( + image2d_array_t input0, image2d_t input1, image2d_t input2, + image2d_array_t output, int axis) +{ + int4 coord = (int4)(get_global_id(0), get_global_id(1), get_global_id(2), 0); + vxc_short8 src0; + VXC_ReadImage2DArray(src0, input0, coord, VXC_5BITOFFSET_XY(0, 0), + VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); + Image img1 = create_image_from_image2d(input1, 4); + Image img2 = create_image_from_image2d(input2, 4); + __global int* len_ptr = (__global int*)img1.ptr; + __global int* index_ptr = (__global int*)img2.ptr; + + int len = len_ptr[get_global_id(1)]; + int start = index_ptr[get_global_id(1)]; + + int8 output_desc; + _viv_asm(COPY, output_desc, output, sizeof(output_desc)); + int baseAddr = (int)get_global_id(2) * output_desc.s4 + output_desc.s0; + _viv_asm(MOV, coord.z, baseAddr); + int end = len + start; + + for(coord.y = start; coord.y < end; coord.y++) + { + VXC_OP4_NoDest(img_store_3d, output, coord, src0, \ + VXC_MODIFIER(0, 7, 0,VXC_RM_TowardZero, 0)); + } +} + +__kernel void repeat_I16_axis2( + image2d_array_t input0, image2d_t input1, image2d_t input2, + image2d_array_t output, int axis) +{ + int4 coord = (int4)(get_global_id(0), get_global_id(1), get_global_id(2), 0); + vxc_short8 src0; + VXC_ReadImage2DArray(src0, input0, coord, VXC_5BITOFFSET_XY(0, 0), + VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); + Image img1 = create_image_from_image2d(input1, 4); + Image img2 = create_image_from_image2d(input2, 4); + __global int* len_ptr = (__global int*)img1.ptr; + __global int* index_ptr = (__global int*)img2.ptr; + + int len = len_ptr[get_global_id(2)]; + int start = index_ptr[get_global_id(2)]; + int end = len + start; + + for(coord.z = start; coord.z < end; coord.z++) + { + VXC_WriteImage2DArray(output, coord, src0, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); + } +} + +#define REPEAT_1D(src0_type_name, data_type) \ +__kernel void repeat_##src0_type_name##_1D( \ + image2d_t input0, image2d_t input1, image2d_t input2, \ + image2d_t output, int axis) \ +{ \ + int2 coord = (int2)(get_global_id(0), 0); \ + data_type src0; \ + VXC_ReadImage(src0, input0, coord, VXC_5BITOFFSET_XY(0, 0), \ + VXC_MODIFIER(0, 0, 0, VXC_RM_TowardZero, 0)); \ + Image img1 = create_image_from_image2d(input1, 4); \ + Image img2 = create_image_from_image2d(input2, 4); \ + __global int* len_ptr = (__global int*)img1.ptr; \ + __global int* index_ptr = (__global int*)img2.ptr; \ + int len = len_ptr[get_global_id(0)]; \ + int start = index_ptr[get_global_id(0)]; \ + \ + int iter = len >> 3; \ + int res = len & 7; \ + int end = start + iter * 8; \ + VXC_DP2x8(src0, src0, src0, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0), uniExtract1to8Short_2x8); \ + for(coord.x = start; coord.x < end; coord.x+=8) \ + { \ + VXC_WriteImage(output, coord, src0, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); \ + } \ + \ + if(res == 7) \ + { \ + VXC_WriteImage(output, coord, src0, VXC_MODIFIER(0, 
6, 0, VXC_RM_TowardZero, 0)); \ + } \ + else if(res == 6) \ + { \ + VXC_WriteImage(output, coord, src0, VXC_MODIFIER(0, 5, 0, VXC_RM_TowardZero, 0)); \ + } \ + else if(res == 5) \ + { \ + VXC_WriteImage(output, coord, src0, VXC_MODIFIER(0, 4, 0, VXC_RM_TowardZero, 0)); \ + } \ + else if(res == 4) \ + { \ + VXC_WriteImage(output, coord, src0, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0)); \ + } \ + else if(res == 3) \ + { \ + VXC_WriteImage(output, coord, src0, VXC_MODIFIER(0, 2, 0, VXC_RM_TowardZero, 0)); \ + } \ + else if(res == 2) \ + { \ + VXC_WriteImage(output, coord, src0, VXC_MODIFIER(0, 1, 0, VXC_RM_TowardZero, 0)); \ + } \ + else if(res == 1) \ + { \ + VXC_WriteImage(output, coord, src0, VXC_MODIFIER(0, 0, 0, VXC_RM_TowardZero, 0)); \ + } \ +} +REPEAT_1D(U8, vxc_uchar16) +REPEAT_1D(I16, vxc_short8) + +__kernel void repeat_U8_axis0( + image2d_array_t input0, image2d_t input1, image2d_t input2, + image2d_array_t output, int axis) +{ + int4 coord = (int4)(get_global_id(0), get_global_id(1), get_global_id(2), 0); + vxc_uchar16 src0; + VXC_ReadImage2DArray(src0, input0, coord, VXC_5BITOFFSET_XY(0, 0), + VXC_MODIFIER(0, 15, 0, VXC_RM_TowardZero, 0)); + Image img1 = create_image_from_image2d(input1, 4); + Image img2 = create_image_from_image2d(input2, 4); + __global int* len_ptr = (__global int*)img1.ptr; + __global int* index_ptr = (__global int*)img2.ptr; + + int len = len_ptr[get_global_id(1)]; + int start = index_ptr[get_global_id(1)]; + + int8 output_desc; + _viv_asm(COPY, output_desc, output, sizeof(output_desc)); + int baseAddr = (int)get_global_id(2) * output_desc.s4 + output_desc.s0; + _viv_asm(MOV, coord.z, baseAddr); + int end = len + start; + + for(coord.y = start; coord.y < end; coord.y++) + { + VXC_OP4_NoDest(img_store_3d, output, coord, src0, \ + VXC_MODIFIER(0, 15, 0,VXC_RM_TowardZero, 0)); + } +} + +__kernel void repeat_U8_axis2( + image2d_array_t input0, image2d_t input1, image2d_t input2, + image2d_array_t output, int axis) +{ + int4 coord = (int4)(get_global_id(0), get_global_id(1), get_global_id(2), 0); + vxc_uchar16 src0; + VXC_ReadImage2DArray(src0, input0, coord, VXC_5BITOFFSET_XY(0, 0), + VXC_MODIFIER(0, 15, 0, VXC_RM_TowardZero, 0)); + Image img1 = create_image_from_image2d(input1, 4); + Image img2 = create_image_from_image2d(input2, 4); + __global int* len_ptr = (__global int*)img1.ptr; + __global int* index_ptr = (__global int*)img2.ptr; + + int len = len_ptr[get_global_id(2)]; + int start = index_ptr[get_global_id(2)]; + int end = len + start; + + for(coord.z = start; coord.z < end; coord.z++) + { + VXC_WriteImage2DArray(output, coord, src0, VXC_MODIFIER(0, 15, 0, VXC_RM_TowardZero, 0)); + } +} \ No newline at end of file diff --git a/src/tim/vx/internal/src/libnnext/ops/vx/repeat_axis1.vx b/src/tim/vx/internal/src/libnnext/ops/vx/repeat_axis1.vx new file mode 100644 index 0000000..d22a292 --- /dev/null +++ b/src/tim/vx/internal/src/libnnext/ops/vx/repeat_axis1.vx @@ -0,0 +1,232 @@ +#include "cl_viv_vx_ext.h" + +_viv_uniform VXC_512Bits uniExtract1to8Short_2x8; + +#define REPEAT_RES(end_pos) \ +coord.y = gidy; \ +VXC_OP4_NoDest(img_store_3d, output, coord, src0, VXC_MODIFIER(0, end_pos, 0,VXC_RM_TowardZero, 0)); \ +coord.y++; \ +VXC_OP4_NoDest(img_store_3d, output, coord, src1, VXC_MODIFIER(0, end_pos, 0,VXC_RM_TowardZero, 0)); \ +coord.y++; \ +VXC_OP4_NoDest(img_store_3d, output, coord, src2, VXC_MODIFIER(0, end_pos, 0,VXC_RM_TowardZero, 0)); \ +coord.y++; \ +VXC_OP4_NoDest(img_store_3d, output, coord, src3, VXC_MODIFIER(0, end_pos, 0,VXC_RM_TowardZero, 0)); \ 
+coord.y++; \ +VXC_OP4_NoDest(img_store_3d, output, coord, src4, VXC_MODIFIER(0, end_pos, 0,VXC_RM_TowardZero, 0)); \ +coord.y++; \ +VXC_OP4_NoDest(img_store_3d, output, coord, src5, VXC_MODIFIER(0, end_pos, 0,VXC_RM_TowardZero, 0)); \ +coord.y++; \ +VXC_OP4_NoDest(img_store_3d, output, coord, src6, VXC_MODIFIER(0, end_pos, 0,VXC_RM_TowardZero, 0)); \ +coord.y++; \ +VXC_OP4_NoDest(img_store_3d, output, coord, src7, VXC_MODIFIER(0, end_pos, 0,VXC_RM_TowardZero, 0)); + +__kernel void repeat_I16_axis1( + image2d_array_t input0, image2d_t input1, image2d_t input2, + image2d_array_t output, int axis) +{ + int gidy = get_global_id(1); + int4 coord = (int4)(get_global_id(0), gidy, get_global_id(2), 0); + vxc_short8 src0, src1, src2, src3, src4, src5, src6, src7; + + int8 input_desc, output_desc; + _viv_asm(COPY, input_desc, input0, sizeof(input_desc)); + int baseAddr_a = (int)get_global_id(2) * input_desc.s4 + input_desc.s0; + _viv_asm(MOV, coord.z, baseAddr_a); + _viv_asm(COPY, output_desc, output, sizeof(output_desc)); + int baseAddr = (int)get_global_id(2) * output_desc.s4 + output_desc.s0; + + VXC_OP4(img_load_3d, src0, input0, coord, VXC_5BITOFFSET_XY(0, 0), + VXC_MODIFIER(0, 0, 0, VXC_RM_TowardZero, 0)); + VXC_OP4(img_load_3d, src1, input0, coord, VXC_5BITOFFSET_XY(0, 1), + VXC_MODIFIER(0, 0, 0, VXC_RM_TowardZero, 0)); + VXC_OP4(img_load_3d, src2, input0, coord, VXC_5BITOFFSET_XY(0, 2), + VXC_MODIFIER(0, 0, 0, VXC_RM_TowardZero, 0)); + VXC_OP4(img_load_3d, src3, input0, coord, VXC_5BITOFFSET_XY(0, 3), + VXC_MODIFIER(0, 0, 0, VXC_RM_TowardZero, 0)); + VXC_OP4(img_load_3d, src4, input0, coord, VXC_5BITOFFSET_XY(0, 4), + VXC_MODIFIER(0, 0, 0, VXC_RM_TowardZero, 0)); + VXC_OP4(img_load_3d, src5, input0, coord, VXC_5BITOFFSET_XY(0, 5), + VXC_MODIFIER(0, 0, 0, VXC_RM_TowardZero, 0)); + VXC_OP4(img_load_3d, src6, input0, coord, VXC_5BITOFFSET_XY(0, 6), + VXC_MODIFIER(0, 0, 0, VXC_RM_TowardZero, 0)); + VXC_OP4(img_load_3d, src7, input0, coord, VXC_5BITOFFSET_XY(0, 7), + VXC_MODIFIER(0, 0, 0, VXC_RM_TowardZero, 0)); + + Image img1 = create_image_from_image2d(input1, 4); + Image img2 = create_image_from_image2d(input2, 4); + __global int* len_ptr = (__global int*)img1.ptr; + __global int* index_ptr = (__global int*)img2.ptr; + + int len = len_ptr[get_global_id(0)]; + int start = index_ptr[get_global_id(0)]; + + _viv_asm(MOV, coord.z, baseAddr); + int iter = len >> 3; + int res = len & 7; + coord.x = start; + + VXC_DP2x8(src0, src0, src0, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0), uniExtract1to8Short_2x8); + VXC_DP2x8(src1, src1, src1, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0), uniExtract1to8Short_2x8); + VXC_DP2x8(src2, src2, src2, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0), uniExtract1to8Short_2x8); + VXC_DP2x8(src3, src3, src3, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0), uniExtract1to8Short_2x8); + VXC_DP2x8(src4, src4, src4, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0), uniExtract1to8Short_2x8); + VXC_DP2x8(src5, src5, src5, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0), uniExtract1to8Short_2x8); + VXC_DP2x8(src6, src6, src6, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0), uniExtract1to8Short_2x8); + VXC_DP2x8(src7, src7, src7, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0), uniExtract1to8Short_2x8); + + for(int i = 0; i < iter; i++) + { + coord.y = gidy; + VXC_OP4_NoDest(img_store_3d, output, coord, src0, VXC_MODIFIER(0, 7, 0,VXC_RM_TowardZero, 0)); + coord.y++; + VXC_OP4_NoDest(img_store_3d, output, coord, src1, VXC_MODIFIER(0, 7, 0,VXC_RM_TowardZero, 0)); + coord.y++; + 
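+        // remaining stores of the unrolled 8-row copy; each srcN is one input element broadcast across 8 output columns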
VXC_OP4_NoDest(img_store_3d, output, coord, src2, VXC_MODIFIER(0, 7, 0,VXC_RM_TowardZero, 0)); + coord.y++; + VXC_OP4_NoDest(img_store_3d, output, coord, src3, VXC_MODIFIER(0, 7, 0,VXC_RM_TowardZero, 0)); + coord.y++; + VXC_OP4_NoDest(img_store_3d, output, coord, src4, VXC_MODIFIER(0, 7, 0,VXC_RM_TowardZero, 0)); + coord.y++; + VXC_OP4_NoDest(img_store_3d, output, coord, src5, VXC_MODIFIER(0, 7, 0,VXC_RM_TowardZero, 0)); + coord.y++; + VXC_OP4_NoDest(img_store_3d, output, coord, src6, VXC_MODIFIER(0, 7, 0,VXC_RM_TowardZero, 0)); + coord.y++; + VXC_OP4_NoDest(img_store_3d, output, coord, src7, VXC_MODIFIER(0, 7, 0,VXC_RM_TowardZero, 0)); + coord.x += 8; + } + + if(res == 7) + { + REPEAT_RES(6) + } + else if(res == 6) + { + REPEAT_RES(5) + } + else if(res == 5) + { + REPEAT_RES(4) + } + else if(res == 4) + { + REPEAT_RES(3) + } + else if(res == 3) + { + REPEAT_RES(2) + } + else if(res == 2) + { + REPEAT_RES(1) + } + else if(res == 1) + { + REPEAT_RES(0) + } +} + +__kernel void repeat_U8_axis1( + image2d_array_t input0, image2d_t input1, image2d_t input2, + image2d_array_t output, int axis) +{ + int gidy = get_global_id(1); + int4 coord = (int4)(get_global_id(0), gidy, get_global_id(2), 0); + vxc_uchar16 src0, src1, src2, src3, src4, src5, src6, src7; + + int8 input_desc, output_desc; + _viv_asm(COPY, input_desc, input0, sizeof(input_desc)); + int baseAddr_a = (int)get_global_id(2) * input_desc.s4 + input_desc.s0; + _viv_asm(MOV, coord.z, baseAddr_a); + _viv_asm(COPY, output_desc, output, sizeof(output_desc)); + int baseAddr = (int)get_global_id(2) * output_desc.s4 + output_desc.s0; + + VXC_OP4(img_load_3d, src0, input0, coord, VXC_5BITOFFSET_XY(0, 0), + VXC_MODIFIER(0, 0, 0, VXC_RM_TowardZero, 0)); + VXC_OP4(img_load_3d, src1, input0, coord, VXC_5BITOFFSET_XY(0, 1), + VXC_MODIFIER(0, 0, 0, VXC_RM_TowardZero, 0)); + VXC_OP4(img_load_3d, src2, input0, coord, VXC_5BITOFFSET_XY(0, 2), + VXC_MODIFIER(0, 0, 0, VXC_RM_TowardZero, 0)); + VXC_OP4(img_load_3d, src3, input0, coord, VXC_5BITOFFSET_XY(0, 3), + VXC_MODIFIER(0, 0, 0, VXC_RM_TowardZero, 0)); + VXC_OP4(img_load_3d, src4, input0, coord, VXC_5BITOFFSET_XY(0, 4), + VXC_MODIFIER(0, 0, 0, VXC_RM_TowardZero, 0)); + VXC_OP4(img_load_3d, src5, input0, coord, VXC_5BITOFFSET_XY(0, 5), + VXC_MODIFIER(0, 0, 0, VXC_RM_TowardZero, 0)); + VXC_OP4(img_load_3d, src6, input0, coord, VXC_5BITOFFSET_XY(0, 6), + VXC_MODIFIER(0, 0, 0, VXC_RM_TowardZero, 0)); + VXC_OP4(img_load_3d, src7, input0, coord, VXC_5BITOFFSET_XY(0, 7), + VXC_MODIFIER(0, 0, 0, VXC_RM_TowardZero, 0)); + + Image img1 = create_image_from_image2d(input1, 4); + Image img2 = create_image_from_image2d(input2, 4); + __global int* len_ptr = (__global int*)img1.ptr; + __global int* index_ptr = (__global int*)img2.ptr; + + int len = len_ptr[get_global_id(0)]; + int start = index_ptr[get_global_id(0)]; + + _viv_asm(MOV, coord.z, baseAddr); + int iter = len >> 3; + int res = len & 7; + coord.x = start; + + VXC_DP2x8(src0, src0, src0, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0), uniExtract1to8Short_2x8); + VXC_DP2x8(src1, src1, src1, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0), uniExtract1to8Short_2x8); + VXC_DP2x8(src2, src2, src2, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0), uniExtract1to8Short_2x8); + VXC_DP2x8(src3, src3, src3, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0), uniExtract1to8Short_2x8); + VXC_DP2x8(src4, src4, src4, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0), uniExtract1to8Short_2x8); + VXC_DP2x8(src5, src5, src5, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0), uniExtract1to8Short_2x8); + 
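+    // broadcast the remaining scalar U8 reads to 8 lanes, mirroring the I16 variant above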
VXC_DP2x8(src6, src6, src6, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0), uniExtract1to8Short_2x8); + VXC_DP2x8(src7, src7, src7, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0), uniExtract1to8Short_2x8); + + for(int i = 0; i < iter; i++) + { + coord.y = gidy; + VXC_OP4_NoDest(img_store_3d, output, coord, src0, VXC_MODIFIER(0, 7, 0,VXC_RM_TowardZero, 0)); + coord.y++; + VXC_OP4_NoDest(img_store_3d, output, coord, src1, VXC_MODIFIER(0, 7, 0,VXC_RM_TowardZero, 0)); + coord.y++; + VXC_OP4_NoDest(img_store_3d, output, coord, src2, VXC_MODIFIER(0, 7, 0,VXC_RM_TowardZero, 0)); + coord.y++; + VXC_OP4_NoDest(img_store_3d, output, coord, src3, VXC_MODIFIER(0, 7, 0,VXC_RM_TowardZero, 0)); + coord.y++; + VXC_OP4_NoDest(img_store_3d, output, coord, src4, VXC_MODIFIER(0, 7, 0,VXC_RM_TowardZero, 0)); + coord.y++; + VXC_OP4_NoDest(img_store_3d, output, coord, src5, VXC_MODIFIER(0, 7, 0,VXC_RM_TowardZero, 0)); + coord.y++; + VXC_OP4_NoDest(img_store_3d, output, coord, src6, VXC_MODIFIER(0, 7, 0,VXC_RM_TowardZero, 0)); + coord.y++; + VXC_OP4_NoDest(img_store_3d, output, coord, src7, VXC_MODIFIER(0, 7, 0,VXC_RM_TowardZero, 0)); + coord.x += 8; + } + + if(res == 7) + { + REPEAT_RES(6) + } + else if(res == 6) + { + REPEAT_RES(5) + } + else if(res == 5) + { + REPEAT_RES(4) + } + else if(res == 4) + { + REPEAT_RES(3) + } + else if(res == 3) + { + REPEAT_RES(2) + } + else if(res == 2) + { + REPEAT_RES(1) + } + else if(res == 1) + { + REPEAT_RES(0) + } +} + diff --git a/src/tim/vx/internal/src/libnnext/ops/vx/resize_bilinear_U8_UP_2X.vx b/src/tim/vx/internal/src/libnnext/ops/vx/resize_bilinear_U8_UP_2X.vx deleted file mode 100644 index 25f9350..0000000 --- a/src/tim/vx/internal/src/libnnext/ops/vx/resize_bilinear_U8_UP_2X.vx +++ /dev/null @@ -1,65 +0,0 @@ -#include "cl_viv_vx_ext.h" - -_viv_uniform VXC_512Bits uniResize2xUp_4x8; -_viv_uniform VXC_512Bits uniResize2xUpRound_2x8; -_viv_uniform int out_height; - -__kernel void resize_bilinear_U8toU8_UP_2X_half - ( - __read_only image2d_array_t input, - __write_only image2d_array_t output, - int align_corners, - int half_pixel_centers - ) -{ - int4 coord_out = (int4)(get_global_id(0), 0, get_global_id(1), 0); - int4 coord_in = (int4)(get_global_id(0), -1, get_global_id(1), 0); - coord_in.x = (coord_out.x * 2 - 1) >> 2; - coord_in.x = coord_out.x == 0 ? 
-1 : coord_in.x; - - vxc_uchar16 in0, in1, tmp, result; - vxc_ushort8 result_s, round_s = 8; - - int8 input_desc; - _viv_asm(COPY, input_desc, input, sizeof(input_desc)); - int baseAddr = (int)coord_in.z * input_desc.s4 + input_desc.s0; - _viv_asm(MOV, coord_in.w, baseAddr); - VXC_OP4(img_load_3d, in0, input, coord_in.xyww, VXC_5BITOFFSET_XY(0, 0), - VXC_MODIFIER(0, 15, 0, VXC_RM_TowardZero, 0)); - VXC_OP4(img_load_3d, in1, input, coord_in.xyww, VXC_5BITOFFSET_XY(0, 1), - VXC_MODIFIER(0, 15, 0, VXC_RM_TowardZero, 0)); - - int8 output_desc; - _viv_asm(COPY, output_desc, output, sizeof(output_desc)); - baseAddr = (int)coord_out.z * output_desc.s4 + output_desc.s0; - _viv_asm(MOV, coord_out.w, baseAddr); - - while (coord_out.y < out_height) - { - VXC_DP4x8(result_s, in0, in1, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0), uniResize2xUp_4x8); - VXC_DP2x8(result, result_s, round_s, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0), uniResize2xUpRound_2x8); - VXC_OP4_NoDest(img_store_3d, output, coord_out.xyww, result, - VXC_MODIFIER(0, 7, 0,VXC_RM_TowardZero, 0)); - coord_out.y++; - VXC_OP4(img_load_3d, in0, input, coord_in.xyww, VXC_5BITOFFSET_XY(0, 2), - VXC_MODIFIER(0, 15, 0, VXC_RM_TowardZero, 0)); - VXC_DP4x8(result_s, in0, in1, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0), uniResize2xUp_4x8); - VXC_DP2x8(result, result_s, round_s, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0), uniResize2xUpRound_2x8); - VXC_OP4_NoDest(img_store_3d, output, coord_out.xyww, result, - VXC_MODIFIER(0, 7, 0,VXC_RM_TowardZero, 0)); - coord_out.y++; - VXC_DP4x8(result_s, in1, in0, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0), uniResize2xUp_4x8); - VXC_DP2x8(result, result_s, round_s, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0), uniResize2xUpRound_2x8); - VXC_OP4_NoDest(img_store_3d, output, coord_out.xyww, result, - VXC_MODIFIER(0, 7, 0,VXC_RM_TowardZero, 0)); - VXC_OP4(img_load_3d, in1, input, coord_in.xyww, VXC_5BITOFFSET_XY(0, 3), - VXC_MODIFIER(0, 15, 0, VXC_RM_TowardZero, 0)); - coord_out.y++; - VXC_DP4x8(result_s, in1, in0, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0), uniResize2xUp_4x8); - VXC_DP2x8(result, result_s, round_s, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0), uniResize2xUpRound_2x8); - VXC_OP4_NoDest(img_store_3d, output, coord_out.xyww, result, - VXC_MODIFIER(0, 7, 0,VXC_RM_TowardZero, 0)); - coord_in.y += 2; - coord_out.y++; - } -} diff --git a/src/tim/vx/internal/src/libnnext/ops/vx/resize_bilinear_U8_half_pixel_centers.vx b/src/tim/vx/internal/src/libnnext/ops/vx/resize_bilinear_U8_half_pixel_centers.vx new file mode 100644 index 0000000..1c1071d --- /dev/null +++ b/src/tim/vx/internal/src/libnnext/ops/vx/resize_bilinear_U8_half_pixel_centers.vx @@ -0,0 +1,229 @@ +#include "cl_viv_vx_ext.h" + +_viv_uniform VXC_512Bits uniResize2xUp_0_4x8; +_viv_uniform VXC_512Bits uniResize2xUp_1_4x8; +_viv_uniform int out_height; + +__kernel void resize_bilinear_U8toU8_SAME_2x_upsample_half_pixel_centers + ( + __read_only image2d_array_t input, + __write_only image2d_array_t output, + int align_corners, + int half_pixel_centers + ) +{ + int4 coord_out = (int4)(get_global_id(0), 0, get_global_id(1), 0); + int4 coord_in = (int4)(get_global_id(0), -1, get_global_id(1), 0); + coord_in.x = (coord_out.x * 2 - 1) >> 2; + coord_in.x = coord_out.x == 0 ? 
-1 : coord_in.x; + + vxc_uchar16 in0, in1, tmp, result; + + int8 input_desc; + _viv_asm(COPY, input_desc, input, sizeof(input_desc)); + int baseAddr = (int)coord_in.z * input_desc.s4 + input_desc.s0; + _viv_asm(MOV, coord_in.w, baseAddr); + VXC_OP4(img_load_3d, in0, input, coord_in.xyww, VXC_5BITOFFSET_XY(0, 0), + VXC_MODIFIER(0, 15, 0, VXC_RM_TowardZero, 0)); + VXC_OP4(img_load_3d, in1, input, coord_in.xyww, VXC_5BITOFFSET_XY(0, 1), + VXC_MODIFIER(0, 15, 0, VXC_RM_TowardZero, 0)); + + int8 output_desc; + _viv_asm(COPY, output_desc, output, sizeof(output_desc)); + baseAddr = (int)coord_out.z * output_desc.s4 + output_desc.s0; + _viv_asm(MOV, coord_out.w, baseAddr); + + while (coord_out.y < out_height) + { + VXC_DP4x8(result, in0, in1, VXC_MODIFIER(0, 7, 0, VXC_RM_ToNearestEven, 1), uniResize2xUp_0_4x8); + VXC_DP4x8(result, in0, in1, VXC_MODIFIER(8, 15, 0, VXC_RM_ToNearestEven, 1), uniResize2xUp_1_4x8); + VXC_OP4_NoDest(img_store_3d, output, coord_out.xyww, result, + VXC_MODIFIER(0, 15, 0,VXC_RM_TowardZero, 0)); + coord_out.y++; + VXC_OP4(img_load_3d, in0, input, coord_in.xyww, VXC_5BITOFFSET_XY(0, 2), + VXC_MODIFIER(0, 15, 0, VXC_RM_TowardZero, 0)); + VXC_DP4x8(result, in0, in1, VXC_MODIFIER(0, 7, 0, VXC_RM_ToNearestEven, 1), uniResize2xUp_0_4x8); + VXC_DP4x8(result, in0, in1, VXC_MODIFIER(8, 15, 0, VXC_RM_ToNearestEven, 1), uniResize2xUp_1_4x8); + VXC_OP4_NoDest(img_store_3d, output, coord_out.xyww, result, + VXC_MODIFIER(0, 15, 0,VXC_RM_TowardZero, 0)); + coord_out.y++; + VXC_DP4x8(result, in1, in0, VXC_MODIFIER(0, 7, 0, VXC_RM_ToNearestEven, 1), uniResize2xUp_0_4x8); + VXC_DP4x8(result, in1, in0, VXC_MODIFIER(8, 15, 0, VXC_RM_ToNearestEven, 1), uniResize2xUp_1_4x8); + VXC_OP4_NoDest(img_store_3d, output, coord_out.xyww, result, + VXC_MODIFIER(0, 15, 0,VXC_RM_TowardZero, 0)); + VXC_OP4(img_load_3d, in1, input, coord_in.xyww, VXC_5BITOFFSET_XY(0, 3), + VXC_MODIFIER(0, 15, 0, VXC_RM_TowardZero, 0)); + coord_out.y++; + VXC_DP4x8(result, in1, in0, VXC_MODIFIER(0, 7, 0, VXC_RM_ToNearestEven, 1), uniResize2xUp_0_4x8); + VXC_DP4x8(result, in1, in0, VXC_MODIFIER(8, 15, 0, VXC_RM_ToNearestEven, 1), uniResize2xUp_1_4x8); + VXC_OP4_NoDest(img_store_3d, output, coord_out.xyww, result, + VXC_MODIFIER(0, 15, 0,VXC_RM_TowardZero, 0)); + coord_in.y += 2; + coord_out.y++; + } +} + +_viv_uniform VXC_512Bits uniResize4xUp_l00_4x8; +_viv_uniform VXC_512Bits uniResize4xUp_l01_4x8; +_viv_uniform VXC_512Bits uniResize4xUp_l10_4x8; +_viv_uniform VXC_512Bits uniResize4xUp_l11_4x8; +__kernel void resize_bilinear_U8toU8_SAME_4x_upsample_half_pixel_centers + ( + __read_only image2d_array_t input, + __write_only image2d_array_t output, + int align_corners, + int half_pixel_centers + ) +{ + int4 coord_out = (int4)(get_global_id(0), 0, get_global_id(1), 0); + int4 coord_in = (int4)(get_global_id(0), -1, get_global_id(1), 0); + coord_in.x = (coord_out.x * 2 - 3) >> 3; + coord_in.x = coord_out.x == 0 ? 
-1 : coord_in.x; + + vxc_uchar16 in0, in1, tmp, dst0, dst1; + + int8 input_desc; + _viv_asm(COPY, input_desc, input, sizeof(input_desc)); + int baseAddr = (int)coord_in.z * input_desc.s4 + input_desc.s0; + _viv_asm(MOV, coord_in.w, baseAddr); + VXC_OP4(img_load_3d, in0, input, coord_in.xyww, VXC_5BITOFFSET_XY(0, 0), + VXC_MODIFIER(0, 15, 0, VXC_RM_TowardZero, 0)); + VXC_OP4(img_load_3d, in1, input, coord_in.xyww, VXC_5BITOFFSET_XY(0, 1), + VXC_MODIFIER(0, 15, 0, VXC_RM_TowardZero, 0)); + + int8 output_desc; + _viv_asm(COPY, output_desc, output, sizeof(output_desc)); + baseAddr = (int)coord_out.z * output_desc.s4 + output_desc.s0; + _viv_asm(MOV, coord_out.w, baseAddr); + + while (coord_out.y < out_height) + { + VXC_DP4x8(dst0, in0, in1, VXC_MODIFIER(0, 7, 0, VXC_RM_ToNearestEven, 1), uniResize4xUp_l00_4x8); + VXC_DP4x8(dst0, in0, in1, VXC_MODIFIER(8, 15, 0, VXC_RM_ToNearestEven, 1), uniResize4xUp_l01_4x8); + VXC_DP4x8(dst1, in0, in1, VXC_MODIFIER(0, 7, 0, VXC_RM_ToNearestEven, 1), uniResize4xUp_l10_4x8); + VXC_DP4x8(dst1, in0, in1, VXC_MODIFIER(8, 15, 0, VXC_RM_ToNearestEven, 1), uniResize4xUp_l11_4x8); + VXC_OP4(img_load_3d, in0, input, coord_in.xyww, VXC_5BITOFFSET_XY(0, 2), + VXC_MODIFIER(0, 15, 0, VXC_RM_TowardZero, 0)); + VXC_OP4_NoDest(img_store_3d, output, coord_out.xyww, dst0, + VXC_MODIFIER(0, 15, 0,VXC_RM_TowardZero, 0)); + coord_out.y++; + VXC_OP4_NoDest(img_store_3d, output, coord_out.xyww, dst1, + VXC_MODIFIER(0, 15, 0,VXC_RM_TowardZero, 0)); + coord_out.y++; + VXC_DP4x8(dst0, in0, in1, VXC_MODIFIER(0, 7, 0, VXC_RM_ToNearestEven, 1), uniResize4xUp_l10_4x8); + VXC_DP4x8(dst0, in0, in1, VXC_MODIFIER(8, 15, 0, VXC_RM_ToNearestEven, 1), uniResize4xUp_l11_4x8); + VXC_DP4x8(dst1, in0, in1, VXC_MODIFIER(0, 7, 0, VXC_RM_ToNearestEven, 1), uniResize4xUp_l00_4x8); + VXC_DP4x8(dst1, in0, in1, VXC_MODIFIER(8, 15, 0, VXC_RM_ToNearestEven, 1), uniResize4xUp_l01_4x8); + VXC_OP4_NoDest(img_store_3d, output, coord_out.xyww, dst0, + VXC_MODIFIER(0, 15, 0,VXC_RM_TowardZero, 0)); + coord_out.y++; + VXC_OP4_NoDest(img_store_3d, output, coord_out.xyww, dst1, + VXC_MODIFIER(0, 15, 0,VXC_RM_TowardZero, 0)); + coord_out.y++; + VXC_DP4x8(dst0, in1, in0, VXC_MODIFIER(0, 7, 0, VXC_RM_ToNearestEven, 1), uniResize4xUp_l00_4x8); + VXC_DP4x8(dst0, in1, in0, VXC_MODIFIER(8, 15, 0, VXC_RM_ToNearestEven, 1), uniResize4xUp_l01_4x8); + VXC_DP4x8(dst1, in1, in0, VXC_MODIFIER(0, 7, 0, VXC_RM_ToNearestEven, 1), uniResize4xUp_l10_4x8); + VXC_DP4x8(dst1, in1, in0, VXC_MODIFIER(8, 15, 0, VXC_RM_ToNearestEven, 1), uniResize4xUp_l11_4x8); + VXC_OP4(img_load_3d, in1, input, coord_in.xyww, VXC_5BITOFFSET_XY(0, 3), + VXC_MODIFIER(0, 15, 0, VXC_RM_TowardZero, 0)); + VXC_OP4_NoDest(img_store_3d, output, coord_out.xyww, dst0, + VXC_MODIFIER(0, 15, 0,VXC_RM_TowardZero, 0)); + coord_out.y++; + VXC_OP4_NoDest(img_store_3d, output, coord_out.xyww, dst1, + VXC_MODIFIER(0, 15, 0,VXC_RM_TowardZero, 0)); + coord_out.y++; + VXC_DP4x8(dst0, in1, in0, VXC_MODIFIER(0, 7, 0, VXC_RM_ToNearestEven, 1), uniResize4xUp_l10_4x8); + VXC_DP4x8(dst0, in1, in0, VXC_MODIFIER(8, 15, 0, VXC_RM_ToNearestEven, 1), uniResize4xUp_l11_4x8); + VXC_DP4x8(dst1, in1, in0, VXC_MODIFIER(0, 7, 0, VXC_RM_ToNearestEven, 1), uniResize4xUp_l00_4x8); + VXC_DP4x8(dst1, in1, in0, VXC_MODIFIER(8, 15, 0, VXC_RM_ToNearestEven, 1), uniResize4xUp_l01_4x8); + VXC_OP4_NoDest(img_store_3d, output, coord_out.xyww, dst0, + VXC_MODIFIER(0, 15, 0,VXC_RM_TowardZero, 0)); + coord_out.y++; + VXC_OP4_NoDest(img_store_3d, output, coord_out.xyww, dst1, + VXC_MODIFIER(0, 15, 
0,VXC_RM_TowardZero, 0)); + coord_in.y += 2; + coord_out.y++; + } +} + +_viv_uniform VXC_512Bits uniResize3xUp_l00_2x8; +_viv_uniform VXC_512Bits uniResize3xUp_l01_2x8; +_viv_uniform VXC_512Bits uniResize3xUp_l10_4x4; +_viv_uniform VXC_512Bits uniResize3xUp_l11_4x4; +_viv_uniform VXC_512Bits uniResize3xUp_l12_4x4; +_viv_uniform VXC_512Bits uniResize3xUp_l13_4x4; +__kernel void resize_bilinear_U8toU8_SAME_3x_upsample_half_pixel_centers + ( + __read_only image2d_array_t input, + __write_only image2d_array_t output, + int align_corners, + int half_pixel_centers + ) +{ + int4 coord_out = (int4)(get_global_id(0), get_global_id(1), get_global_id(2), 0); + int4 coord_in = (int4)(get_global_id(0), get_global_id(1), get_global_id(2), 0); + coord_in.x = (short)(coord_out.x * 2 - 1) / (short)6; + coord_in.x = coord_out.x == 0 ? -1 : coord_in.x; + coord_in.y = (short)(coord_out.y * 2 - 1) / (short)6; + coord_in.y = coord_out.y == 0 ? -1 : coord_in.y; + + vxc_uchar16 in0, in1, in2, in3, tmp, dst0, dst1, dst2; + + int8 input_desc; + _viv_asm(COPY, input_desc, input, sizeof(input_desc)); + int baseAddr = (int)coord_in.z * input_desc.s4 + input_desc.s0; + _viv_asm(MOV, coord_in.w, baseAddr); + VXC_OP4(img_load_3d, in0, input, coord_in.xyww, VXC_5BITOFFSET_XY(0, 0), + VXC_MODIFIER(0, 15, 0, VXC_RM_TowardZero, 0)); + VXC_OP4(img_load_3d, in1, input, coord_in.xyww, VXC_5BITOFFSET_XY(0, 1), + VXC_MODIFIER(0, 15, 0, VXC_RM_TowardZero, 0)); + VXC_OP4(img_load_3d, in2, input, coord_in.xyww, VXC_5BITOFFSET_XY(0, 2), + VXC_MODIFIER(0, 15, 0, VXC_RM_TowardZero, 0)); + VXC_OP4(img_load_3d, in3, input, coord_in.xyww, VXC_5BITOFFSET_XY(0, 3), + VXC_MODIFIER(0, 15, 0, VXC_RM_TowardZero, 0)); + + int8 output_desc; + _viv_asm(COPY, output_desc, output, sizeof(output_desc)); + baseAddr = (int)coord_out.z * output_desc.s4 + output_desc.s0; + _viv_asm(MOV, coord_out.w, baseAddr); + + VXC_DP4x4(dst2, in1, in0, VXC_MODIFIER(0, 3, 0, VXC_RM_ToNearestEven, 1), uniResize3xUp_l10_4x4); + VXC_DP4x4(dst2, in1, in0, VXC_MODIFIER(4, 7, 0, VXC_RM_ToNearestEven, 1), uniResize3xUp_l11_4x4); + VXC_DP4x4(dst2, in1, in0, VXC_MODIFIER(8, 11, 0, VXC_RM_ToNearestEven, 1), uniResize3xUp_l12_4x4); + VXC_DP4x4(dst2, in1, in0, VXC_MODIFIER(12, 14, 0, VXC_RM_ToNearestEven, 1), uniResize3xUp_l13_4x4); + VXC_OP4_NoDest(img_store_3d, output, coord_out.xyww, dst2, + VXC_MODIFIER(0, 14, 0,VXC_RM_TowardZero, 0)); + coord_out.y++; + + VXC_DP2x8(dst0, in1, in1, VXC_MODIFIER(0, 7, 0, VXC_RM_ToNearestEven, 1), uniResize3xUp_l00_2x8); + VXC_DP2x8(dst0, in1, in1, VXC_MODIFIER(8, 14, 0, VXC_RM_ToNearestEven, 1), uniResize3xUp_l01_2x8); + VXC_DP4x4(dst1, in1, in2, VXC_MODIFIER(0, 3, 0, VXC_RM_ToNearestEven, 1), uniResize3xUp_l10_4x4); + VXC_DP4x4(dst1, in1, in2, VXC_MODIFIER(4, 7, 0, VXC_RM_ToNearestEven, 1), uniResize3xUp_l11_4x4); + VXC_DP4x4(dst1, in1, in2, VXC_MODIFIER(8, 11, 0, VXC_RM_ToNearestEven, 1), uniResize3xUp_l12_4x4); + VXC_DP4x4(dst1, in1, in2, VXC_MODIFIER(12, 14, 0, VXC_RM_ToNearestEven, 1), uniResize3xUp_l13_4x4); + VXC_DP4x4(dst2, in2, in1, VXC_MODIFIER(0, 3, 0, VXC_RM_ToNearestEven, 1), uniResize3xUp_l10_4x4); + VXC_DP4x4(dst2, in2, in1, VXC_MODIFIER(4, 7, 0, VXC_RM_ToNearestEven, 1), uniResize3xUp_l11_4x4); + VXC_DP4x4(dst2, in2, in1, VXC_MODIFIER(8, 11, 0, VXC_RM_ToNearestEven, 1), uniResize3xUp_l12_4x4); + VXC_DP4x4(dst2, in2, in1, VXC_MODIFIER(12, 14, 0, VXC_RM_ToNearestEven, 1), uniResize3xUp_l13_4x4); + VXC_OP4_NoDest(img_store_3d, output, coord_out.xyww, dst0, + VXC_MODIFIER(0, 14, 0,VXC_RM_TowardZero, 0)); + coord_out.y++; + 
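+    // store the two rows interpolated between in1 and in2 before computing the in2-based rows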
VXC_OP4_NoDest(img_store_3d, output, coord_out.xyww, dst1, + VXC_MODIFIER(0, 14, 0,VXC_RM_TowardZero, 0)); + coord_out.y++; + VXC_OP4_NoDest(img_store_3d, output, coord_out.xyww, dst2, + VXC_MODIFIER(0, 14, 0,VXC_RM_TowardZero, 0)); + coord_out.y++; + + VXC_DP2x8(dst0, in2, in2, VXC_MODIFIER(0, 7, 0, VXC_RM_ToNearestEven, 1), uniResize3xUp_l00_2x8); + VXC_DP2x8(dst0, in2, in2, VXC_MODIFIER(8, 14, 0, VXC_RM_ToNearestEven, 1), uniResize3xUp_l01_2x8); + VXC_DP4x4(dst1, in2, in3, VXC_MODIFIER(0, 3, 0, VXC_RM_ToNearestEven, 1), uniResize3xUp_l10_4x4); + VXC_DP4x4(dst1, in2, in3, VXC_MODIFIER(4, 7, 0, VXC_RM_ToNearestEven, 1), uniResize3xUp_l11_4x4); + VXC_DP4x4(dst1, in2, in3, VXC_MODIFIER(8, 11, 0, VXC_RM_ToNearestEven, 1), uniResize3xUp_l12_4x4); + VXC_DP4x4(dst1, in2, in3, VXC_MODIFIER(12, 14, 0, VXC_RM_ToNearestEven, 1), uniResize3xUp_l13_4x4); + VXC_OP4_NoDest(img_store_3d, output, coord_out.xyww, dst0, + VXC_MODIFIER(0, 14, 0,VXC_RM_TowardZero, 0)); + coord_out.y++; + VXC_OP4_NoDest(img_store_3d, output, coord_out.xyww, dst1, + VXC_MODIFIER(0, 14, 0,VXC_RM_TowardZero, 0)); +} diff --git a/src/tim/vx/internal/src/libnnext/ops/vx/sequence_mask.vx b/src/tim/vx/internal/src/libnnext/ops/vx/sequence_mask.vx new file mode 100644 index 0000000..3193485 --- /dev/null +++ b/src/tim/vx/internal/src/libnnext/ops/vx/sequence_mask.vx @@ -0,0 +1,150 @@ +#include "cl_viv_vx_ext.h" + +_viv_uniform VXC_512Bits UniFP16toFP32Lo4_dp4x4; +_viv_uniform VXC_512Bits uniConvert1stUint8SubZpToFp32_4x4; +_viv_uniform VXC_512Bits uniConvertInt32toUint8_2x8; +_viv_uniform float input_scale; +_viv_uniform int inputZP; +_viv_uniform int output_ZP; +_viv_uniform float outputVal1; + +#define SEQUENCE_MASK_QINT_TO_QINT_2D(src0_type_name, src1_type_name, read_type, write_type) \ +__kernel void sequence_mask_##src0_type_name##to##src1_type_name##_2D( \ + image2d_t input, image2d_t output, int maxLen) \ +{ \ + int gidx = get_global_id(0); \ + int2 coord = (int2)(gidx, get_global_id(1)); \ + read_type src0; \ + VXC_ReadImage(src0, input, coord.yy, 0, VXC_MODIFIER(0, 0, 0, VXC_RM_TowardZero, 0)); \ + int4 outIdx = (int4)(gidx, gidx + 1, gidx + 2, gidx + 3); \ + float4 tmpData; \ + short zp = inputZP; \ + VXC_DP4x4(tmpData, src0, zp, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), \ + uniConvert1stUint8SubZpToFp32_4x4); \ + int index = convert_int_rte(tmpData.s0 * input_scale); \ + int4 data; \ + data = outIdx < index? 
convert_int_rte(outputVal1) : output_ZP; \ + write_type dst; \ + VXC_DP2x8(dst, data, data, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 1), uniConvertInt32toUint8_2x8); \ + VXC_WriteImage(output, coord.xy, dst, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0)); \ +} +SEQUENCE_MASK_QINT_TO_QINT_2D(U8, U8, vxc_uchar16, vxc_uchar16) +SEQUENCE_MASK_QINT_TO_QINT_2D(I8, I8, vxc_char16, vxc_char16) +SEQUENCE_MASK_QINT_TO_QINT_2D(I16, I16, vxc_short8, vxc_short8) +SEQUENCE_MASK_QINT_TO_QINT_2D(I8, U8, vxc_char16, vxc_uchar16) +SEQUENCE_MASK_QINT_TO_QINT_2D(I16, U8, vxc_short8, vxc_uchar16) + +#define SEQUENCE_MASK_QINT_TO_QINT(src0_type_name, src1_type_name, read_type, write_type) \ +__kernel void sequence_mask_##src0_type_name##to##src1_type_name( \ + image2d_t input, image2d_array_t output, int maxLen) \ +{ \ + int gidx = get_global_id(0); \ + int4 coord = (int4)(gidx, get_global_id(1), get_global_id(2), 0); \ + read_type src0; \ + VXC_ReadImage(src0, input, coord.yz, 0, VXC_MODIFIER(0, 0, 0, VXC_RM_TowardZero, 0)); \ + int4 outIdx = (int4)(gidx, gidx + 1, gidx + 2, gidx + 3); \ + float4 tmpData; \ + short zp = inputZP; \ + VXC_DP4x4(tmpData, src0, zp, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), \ + uniConvert1stUint8SubZpToFp32_4x4); \ + int index = convert_int_rte(tmpData.s0 * input_scale); \ + int4 data; \ + data = outIdx < index? convert_int_rte(outputVal1) : output_ZP; \ + write_type dst; \ + VXC_DP2x8(dst, data, data, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 1), uniConvertInt32toUint8_2x8); \ + VXC_WriteImage2DArray(output, coord, data, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0)); \ +} +SEQUENCE_MASK_QINT_TO_QINT(U8, U8, vxc_uchar16, vxc_uchar16) +SEQUENCE_MASK_QINT_TO_QINT(I8, I8, vxc_char16, vxc_char16) +SEQUENCE_MASK_QINT_TO_QINT(I16, I16, vxc_short8, vxc_short8) +SEQUENCE_MASK_QINT_TO_QINT(I16, U8, vxc_short8, vxc_uchar16) +SEQUENCE_MASK_QINT_TO_QINT(I8, U8, vxc_char16, vxc_uchar16) + +__kernel void sequence_mask_F16toF16_2D( + image2d_t input, image2d_t output, int maxLen) +{ + int gidx = get_global_id(0); + int2 coord = (int2)(gidx, get_global_id(1)); + vxc_short8 src0; + vxc_half8 in_h; + VXC_ReadImage(src0, input, coord.yy, 0, VXC_MODIFIER(0, 0, 0, VXC_RM_TowardZero, 0)); + int4 outIdx = (int4)(gidx, gidx + 1, gidx + 2, gidx + 3); + _viv_asm(COPY, in_h, src0, 16); + float4 tmpData; + VXC_DP4x4(tmpData, in_h, in_h, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0),\ + UniFP16toFP32Lo4_dp4x4); + int index = convert_int_rte(tmpData.x); + float4 data; + data = outIdx < index? outputVal1 : convert_float(output_ZP); + vxc_short8 dst; + half4 tmpVal; + _viv_asm(CONV, tmpVal, data); + _viv_asm(COPY, dst, tmpVal, 16); + VXC_WriteImage(output, coord, dst.s0246, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0)); +} + +__kernel void sequence_mask_F16toF16( + image2d_t input, image2d_t output, int maxLen) +{ + int gidx = get_global_id(0); + int4 coord = (int4)(gidx, get_global_id(1), get_global_id(2), 0); + vxc_short8 src0; + vxc_half8 in_h; + VXC_ReadImage(src0, input, coord.yz, 0, VXC_MODIFIER(0, 0, 0, VXC_RM_TowardZero, 0)); + int4 outIdx = (int4)(gidx, gidx + 1, gidx + 2, gidx + 3); + _viv_asm(COPY, in_h, src0, 16); + float4 tmpData; + VXC_DP4x4(tmpData, in_h, in_h, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0),\ + UniFP16toFP32Lo4_dp4x4); + int index = convert_int_rte(tmpData.x); + float4 data; + data = outIdx < index? 
outputVal1 : convert_float(output_ZP); + vxc_short8 dst; + half4 tmpVal; + _viv_asm(CONV, tmpVal, data); + _viv_asm(COPY, dst, tmpVal, 16); + VXC_WriteImage2DArray(output, coord, dst.s0246, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0)); +} + +__kernel void sequence_mask_F16toU8_2D( + image2d_t input, image2d_t output, int maxLen) +{ + int gidx = get_global_id(0); + int2 coord = (int2)(gidx, get_global_id(1)); + vxc_short8 src0; + vxc_half8 in_h; + VXC_ReadImage(src0, input, coord.yy, 0, VXC_MODIFIER(0, 0, 0, VXC_RM_TowardZero, 0)); + int4 outIdx = (int4)(gidx, gidx + 1, gidx + 2, gidx + 3); + _viv_asm(COPY, in_h, src0, 16); + float4 tmpData; + VXC_DP4x4(tmpData, in_h, in_h, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0),\ + UniFP16toFP32Lo4_dp4x4); + int index = convert_int_rte(tmpData.x); + int4 data; + data = outIdx < index? convert_int_rte(outputVal1) : output_ZP; + vxc_uchar16 dst; + VXC_DP2x8(dst, data, data, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 1), uniConvertInt32toUint8_2x8); + VXC_WriteImage(output, coord, dst, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0)); +} + +__kernel void sequence_mask_F16toU8( + image2d_t input, image2d_t output, int maxLen) +{ + int gidx = get_global_id(0); + int4 coord = (int4)(gidx, get_global_id(1), get_global_id(2), 0); + vxc_short8 src0; + vxc_half8 in_h; + VXC_ReadImage(src0, input, coord.yz, 0, VXC_MODIFIER(0, 0, 0, VXC_RM_TowardZero, 0)); + int4 outIdx = (int4)(gidx, gidx + 1, gidx + 2, gidx + 3); + _viv_asm(COPY, in_h, src0, 16); + float4 tmpData; + VXC_DP4x4(tmpData, in_h, in_h, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0),\ + UniFP16toFP32Lo4_dp4x4); + int index = convert_int_rte(tmpData.x); + int4 data; + data = outIdx < index? convert_int_rte(outputVal1) : output_ZP; + vxc_uchar16 dst; + VXC_DP2x8(dst, data, data, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 1), uniConvertInt32toUint8_2x8); + VXC_WriteImage2DArray(output, coord, dst, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0)); +} + diff --git a/src/tim/vx/internal/src/libnnext/ops/vx/slice.vx b/src/tim/vx/internal/src/libnnext/ops/vx/slice.vx new file mode 100644 index 0000000..5717266 --- /dev/null +++ b/src/tim/vx/internal/src/libnnext/ops/vx/slice.vx @@ -0,0 +1,239 @@ +#include "cl_viv_vx_ext.h" + +#define SLICE_SAMLEFL_SH_IMPL(name, data_type, end_bin) \ +__kernel void slice_##name##_I32to##name##_SAMEFL \ + ( \ + __read_only image2d_array_t input0, \ + __read_only image2d_t input1, \ + __write_only image2d_array_t output, \ + int is_samefl \ + ) \ +{ \ + int4 coord = (int4)(get_global_id(0), get_global_id(1), get_global_id(2), 0); \ + int4 coord_in; \ + Image begin_img = create_image_from_image2d(input1, 4); \ + uchar* begin_ptr = begin_img.ptr; \ + int4 begin = ((int4 *)begin_ptr)[0]; \ + \ + coord_in = coord + begin; \ + data_type src; \ + VXC_ReadImage2DArray(src, input0, coord_in, 0, VXC_MODIFIER(0, end_bin, 0, VXC_RM_TowardZero, 0)); \ + VXC_WriteImage2DArray(output, coord, src, VXC_MODIFIER(0, end_bin, 0, VXC_RM_TowardZero, 0)); \ +} +SLICE_SAMLEFL_SH_IMPL(U8, vxc_uchar16, 15) +SLICE_SAMLEFL_SH_IMPL(I16, vxc_short8, 7) + + +#define SLICE_SAMLEFL_2D_SH_IMPL(name, data_type, end_bin) \ +__kernel void slice_##name##_I32to##name##_SAMEFL_2D \ + ( \ + __read_only image2d_array_t input0, \ + __read_only image2d_t input1, \ + __write_only image2d_array_t output, \ + int is_samefl \ + ) \ +{ \ + int2 coord = (int2)(get_global_id(0), get_global_id(1)); \ + int2 coord_in; \ + Image begin_img = create_image_from_image2d(input1, 4); \ + uchar* begin_ptr = begin_img.ptr; \ + int2 begin = ((int2 
*)begin_ptr)[0]; \ + \ + coord_in = coord + begin; \ + data_type src; \ + VXC_ReadImage(src, input0, coord_in, 0, VXC_MODIFIER(0, end_bin, 0, VXC_RM_TowardZero, 0)); \ + VXC_WriteImage(output, coord, src, VXC_MODIFIER(0, end_bin, 0, VXC_RM_TowardZero, 0)); \ +} +SLICE_SAMLEFL_2D_SH_IMPL(U8, vxc_uchar16, 15) +SLICE_SAMLEFL_2D_SH_IMPL(I16, vxc_short8, 7) + +_viv_uniform VXC_512Bits uniU8MulAndPostShift_Lo_2x8; +_viv_uniform VXC_512Bits uniU8MulAndPostShift_Hi_2x8; +_viv_uniform int2 multAndoutZP;//[0:15] multiplier, [31:63] output zp +#define SLICE_8BITSTO16BITS(name0, name1, src_type, dst_type, save_type) \ +__kernel void slice_##name0##_I32to##name1 \ + ( \ + __read_only image2d_array_t input0, \ + __read_only image2d_t input1, \ + __write_only image2d_array_t output, \ + int is_samefl \ + ) \ +{ \ + int4 coord = (int4)(get_global_id(0), get_global_id(1), get_global_id(2), 0); \ + src_type src; \ + dst_type dst0; \ + int4 coord_in; \ + Image begin_img = create_image_from_image2d(input1, 4); \ + uchar* begin_ptr = begin_img.ptr; \ + int4 begin = ((int4 *)begin_ptr)[0]; \ + \ + coord_in = coord + begin; \ + VXC_ReadImage2DArray(src, input0, coord_in, 0, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); \ + \ + vxc_ushort8 multiplier; \ + _viv_asm(COPY, multiplier, multAndoutZP, 16); \ + VXC_DP2x8(dst0, src, multiplier, VXC_MODIFIER(0, 7, 0, VXC_RM_ToNearestEven, 1), \ + uniU8MulAndPostShift_Lo_2x8); \ + save_type dst; \ + _viv_asm(COPY, dst, dst0, 16); \ + VXC_WriteImage2DArray(output, coord, dst, VXC_MODIFIER(0, 7, 0,VXC_RM_TowardZero, 0)); \ +} +SLICE_8BITSTO16BITS(I8, F16, vxc_char16, vxc_half8, vxc_short8) +SLICE_8BITSTO16BITS(U8, F16, vxc_uchar16, vxc_half8, vxc_short8) + +#define SLICE_8BITSTO16BITS_2D(name0, name1, src_type, dst_type, save_type) \ +__kernel void slice_##name0##_I32to##name1##_2D \ + ( \ + __read_only image2d_array_t input0, \ + __read_only image2d_t input1, \ + __write_only image2d_array_t output, \ + int is_samefl \ + ) \ +{ \ + int2 coord = (int2)(get_global_id(0), get_global_id(1)); \ + src_type src; \ + dst_type dst0; \ + int2 coord_in; \ + Image begin_img = create_image_from_image2d(input1, 4); \ + uchar* begin_ptr = begin_img.ptr; \ + int2 begin = ((int2 *)begin_ptr)[0]; \ + \ + coord_in = coord + begin; \ + VXC_ReadImage(src, input0, coord_in, 0, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); \ + \ + vxc_ushort8 multiplier; \ + _viv_asm(COPY, multiplier, multAndoutZP, 16); \ + VXC_DP2x8(dst0, src, multiplier, VXC_MODIFIER(0, 7, 0, VXC_RM_ToNearestEven, 1), \ + uniU8MulAndPostShift_Lo_2x8); \ + save_type dst; \ + _viv_asm(COPY, dst, dst0, 16); \ + VXC_WriteImage(output, coord.xy, dst, VXC_MODIFIER(0, 7, 0,VXC_RM_TowardZero, 0)); \ +} +SLICE_8BITSTO16BITS_2D(I8, F16, vxc_char16, vxc_half8, vxc_short8) +SLICE_8BITSTO16BITS_2D(U8, F16, vxc_uchar16, vxc_half8, vxc_short8) + +#define SLICE_8BITSTO8BITS(name0, name1, src_type, dst_type) \ +__kernel void slice_##name0##_I32to##name1 \ + ( \ + __read_only image2d_array_t input0, \ + __read_only image2d_t input1, \ + __write_only image2d_array_t output, \ + int is_samefl \ + ) \ +{ \ + int4 coord = (int4)(get_global_id(0), get_global_id(1), get_global_id(2), 0); \ + src_type src; \ + dst_type dst; \ + int4 coord_in; \ + Image begin_img = create_image_from_image2d(input1, 4); \ + uchar* begin_ptr = begin_img.ptr; \ + int4 begin = ((int4 *)begin_ptr)[0]; \ + \ + coord_in = coord + begin; \ + VXC_ReadImage2DArray(src, input0, coord_in, 0, VXC_MODIFIER(0, 15, 0, VXC_RM_TowardZero, 0)); \ + \ + vxc_ushort8 multiplier; \ + 
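+    /* multAndoutZP packs the requantize multiplier and output zero point consumed by the DP2x8 post-shift below */ \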
_viv_asm(COPY, multiplier, multAndoutZP, 16); \ + VXC_DP2x8(dst, src, multiplier, VXC_MODIFIER(0, 7, 0, VXC_RM_ToNearestEven, 1), \ + uniU8MulAndPostShift_Lo_2x8); \ + VXC_DP2x8(dst, src, multiplier, VXC_MODIFIER(8, 15, 0, VXC_RM_ToNearestEven, 1), \ + uniU8MulAndPostShift_Hi_2x8); \ + VXC_WriteImage2DArray(output, coord, dst, VXC_MODIFIER(0, 15, 0,VXC_RM_TowardZero, 0)); \ +} +SLICE_8BITSTO8BITS(I8, I8, vxc_char16, vxc_char16) +SLICE_8BITSTO8BITS(U8, U8, vxc_uchar16, vxc_uchar16) + +#define SLICE_8BITSTO8BITS_2D(name0, name1, src_type, dst_type) \ +__kernel void slice_##name0##_I32to##name1##_2D \ + ( \ + __read_only image2d_array_t input0, \ + __read_only image2d_t input1, \ + __write_only image2d_array_t output, \ + int is_samefl \ + ) \ +{ \ + int2 coord = (int2)(get_global_id(0), get_global_id(1)); \ + src_type src; \ + dst_type dst; \ + int2 coord_in; \ + Image begin_img = create_image_from_image2d(input1, 4); \ + uchar* begin_ptr = begin_img.ptr; \ + int2 begin = ((int2 *)begin_ptr)[0]; \ + \ + coord_in = coord + begin; \ + VXC_ReadImage(src, input0, coord_in, 0, VXC_MODIFIER(0, 15, 0, VXC_RM_TowardZero, 0)); \ + \ + vxc_ushort8 multiplier; \ + _viv_asm(COPY, multiplier, multAndoutZP, 16); \ + VXC_DP2x8(dst, src, multiplier, VXC_MODIFIER(0, 7, 0, VXC_RM_ToNearestEven, 1), \ + uniU8MulAndPostShift_Lo_2x8); \ + VXC_DP2x8(dst, src, multiplier, VXC_MODIFIER(8, 15, 0, VXC_RM_ToNearestEven, 1), \ + uniU8MulAndPostShift_Hi_2x8); \ + VXC_WriteImage(output, coord.xy, dst, VXC_MODIFIER(0, 15, 0,VXC_RM_TowardZero, 0)); \ +} +SLICE_8BITSTO8BITS_2D(I8, I8, vxc_char16, vxc_char16) +SLICE_8BITSTO8BITS_2D(U8, U8, vxc_uchar16, vxc_uchar16) + +#define SLICE_16BITS_TO(name0, name1, src_type, copy_type, dst_type) \ +__kernel void slice_##name0##_I32to##name1 \ + ( \ + __read_only image2d_array_t input0, \ + __read_only image2d_t input1, \ + __write_only image2d_array_t output, \ + int is_samefl \ + ) \ +{ \ + int4 coord = (int4)(get_global_id(0), get_global_id(1), get_global_id(2), 0); \ + src_type src; \ + copy_type src0; \ + dst_type dst; \ + int4 coord_in; \ + Image begin_img = create_image_from_image2d(input1, 4); \ + uchar* begin_ptr = begin_img.ptr; \ + int4 begin = ((int4 *)begin_ptr)[0]; \ + \ + coord_in = coord + begin; \ + VXC_ReadImage2DArray(src0, input0, coord_in, 0, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); \ + _viv_asm(COPY, src, src0, 16); \ + \ + vxc_ushort8 multiplier; \ + _viv_asm(COPY, multiplier, multAndoutZP, 16); \ + VXC_DP2x8(dst, src, multiplier, VXC_MODIFIER(0, 7, 0, VXC_RM_ToNearestEven, 1), \ + uniU8MulAndPostShift_Lo_2x8); \ + VXC_WriteImage2DArray(output, coord, dst, VXC_MODIFIER(0, 7, 0,VXC_RM_TowardZero, 0)); \ +} +SLICE_16BITS_TO(F16, I8, vxc_half8, vxc_short8, vxc_char16) +SLICE_16BITS_TO(F16, U8, vxc_half8, vxc_short8, vxc_uchar16) +SLICE_16BITS_TO(F16, I16, vxc_half8, vxc_short8, vxc_short8) + +#define SLICE_16BITS_TO_2D(name0, name1, src_type, copy_type, dst_type) \ +__kernel void slice_##name0##_I32to##name1##_2D \ + ( \ + __read_only image2d_array_t input0, \ + __read_only image2d_t input1, \ + __write_only image2d_array_t output, \ + int is_samefl \ + ) \ +{ \ + int2 coord = (int2)(get_global_id(0), get_global_id(1)); \ + src_type src; \ + copy_type src0; \ + dst_type dst; \ + int2 coord_in; \ + Image begin_img = create_image_from_image2d(input1, 4); \ + uchar* begin_ptr = begin_img.ptr; \ + int2 begin = ((int2 *)begin_ptr)[0]; \ + \ + coord_in = coord + begin; \ + VXC_ReadImage(src0, input0, coord_in, 0, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); \ + 
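+    /* reinterpret the raw 16-bit load as the F16 source type before the requantizing DP2x8 below */ \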
_viv_asm(COPY, src, src0, 16); \ + \ + vxc_ushort8 multiplier; \ + _viv_asm(COPY, multiplier, multAndoutZP, 16); \ + VXC_DP2x8(dst, src, multiplier, VXC_MODIFIER(0, 7, 0, VXC_RM_ToNearestEven, 1), \ + uniU8MulAndPostShift_Lo_2x8); \ + VXC_WriteImage(output, coord.xy, dst, VXC_MODIFIER(0, 7, 0,VXC_RM_TowardZero, 0)); \ +} +SLICE_16BITS_TO_2D(F16, I8, vxc_half8, vxc_short8, vxc_char16) +SLICE_16BITS_TO_2D(F16, U8, vxc_half8, vxc_short8, vxc_uchar16) +SLICE_16BITS_TO_2D(F16, I16, vxc_half8, vxc_short8, vxc_short8) \ No newline at end of file diff --git a/src/tim/vx/internal/src/libnnext/ops/vx/tile.vx b/src/tim/vx/internal/src/libnnext/ops/vx/tile.vx index 54fb828..7fd4c58 100644 --- a/src/tim/vx/internal/src/libnnext/ops/vx/tile.vx +++ b/src/tim/vx/internal/src/libnnext/ops/vx/tile.vx @@ -130,4 +130,41 @@ TILE_2D(I16, I16, 6, 5, vxc_short8) TILE_2D(I16, I16, 7, 6, vxc_short8) TILE_2D(I16, I16, 0, 7, vxc_short8) +#define TILE_2D_1TON(name0, name1, type) \ +__kernel void tile_1toN_##name0##to##name1##_2D( \ + __read_only image2d_array_t input, \ + __write_only image2d_array_t output, \ + int batchIn, \ + int depthIn, \ + int depthOut, \ + int multiples_0, \ + int multiples_1, \ + int multiples_2, \ + int multiples_3 \ +) \ +{ \ + int2 coord = (int2)(get_global_id(0), get_global_id(1)); \ + int width = get_image_width(input); \ + int height = get_image_height(input); \ + int output_width = get_image_width(output); \ + int output_height = get_image_height(output); \ + type src; \ + VXC_ReadImage(src, input, coord, VXC_5BITOFFSET_XY(0, 0), \ + VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); \ + \ + do \ + { \ + do \ + { \ + VXC_WriteImage(output, coord, src, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); \ + coord.x += 8; \ + } while (coord.x < output_width); \ + coord.x = 0; \ + coord.y += height; \ + } while (coord.y < output_height); \ +} +TILE_2D_1TON(U8, U8, vxc_uchar8) +TILE_2D_1TON(I16, I16, vxc_short8) + + diff --git a/src/tim/vx/internal/src/libnnext/ops/vx/vsi_nn_kernel_axis_aligned_bbox_transform.vx b/src/tim/vx/internal/src/libnnext/ops/vx/vsi_nn_kernel_axis_aligned_bbox_transform.vx deleted file mode 100644 index b0def7f..0000000 --- a/src/tim/vx/internal/src/libnnext/ops/vx/vsi_nn_kernel_axis_aligned_bbox_transform.vx +++ /dev/null @@ -1,8 +0,0 @@ -#include "cl_viv_vx_ext.h" - -__kernel void vxcAxis_aligned_bbox_transform( - __read_only image2d_array_t input, - __write_only image2d_array_t output) -{ - -} diff --git a/src/tim/vx/internal/src/libnnext/ops/vx/vsi_nn_kernel_generate_proposals.vx b/src/tim/vx/internal/src/libnnext/ops/vx/vsi_nn_kernel_generate_proposals.vx deleted file mode 100644 index 9b2e37d..0000000 --- a/src/tim/vx/internal/src/libnnext/ops/vx/vsi_nn_kernel_generate_proposals.vx +++ /dev/null @@ -1,8 +0,0 @@ -#include "cl_viv_vx_ext.h" - -__kernel void vxcGenerate_proposals( - __read_only image2d_array_t input, - __write_only image2d_array_t output) -{ - -} diff --git a/src/tim/vx/internal/src/libnnext/ops/vx/vsi_nn_kernel_header.vx b/src/tim/vx/internal/src/libnnext/ops/vx/vsi_nn_kernel_header.vx index b0f9565..86d1c60 100644 --- a/src/tim/vx/internal/src/libnnext/ops/vx/vsi_nn_kernel_header.vx +++ b/src/tim/vx/internal/src/libnnext/ops/vx/vsi_nn_kernel_header.vx @@ -9,6 +9,62 @@ */ #include "cl_viv_vx_ext.h" +typedef struct Image +{ + __global uchar *ptr; + int stride_x; + int stride_y; +} Image; + +inline uchar* get_image_ptr_from_coord(Image img, int2 coord) +{ + return img.ptr + coord.x * img.stride_x + coord.y * img.stride_y; +} + +inline Image 
create_image_from_image2d(image2d_t input, int stride_x) +{ + int8 desc; + _viv_asm(COPY, desc, input, sizeof(desc)); + + Image img = + { + .ptr = (uchar*)desc.s0, + .stride_x = stride_x, + .stride_y = desc.s1 + }; + + return img; +} + +typedef struct Tensor +{ + __global uchar *ptr; + int stride_x; + int stride_y; + int stride_z; +} Tensor; + +inline uchar* create_tensor_ptr_from_coord(Tensor t, int4 coord) +{ + return t.ptr + coord.x * t.stride_x + coord.y * t.stride_y + coord.z * t.stride_z; +} + +inline Tensor create_tensor_from_image2d_array(image2d_array_t input, int stride_x) +{ + int8 desc; + _viv_asm(COPY, desc, input, sizeof(desc)); + + Tensor t = + { + .ptr = (uchar*)desc.s0, + .stride_x = stride_x, + .stride_y = desc.s1, + .stride_z = desc.s4 + }; + + return t; +} + #if (VX_VERSION==1) #define VXC_DP2x8_b_(dst, src0, src1, src2, info, uniform)\ do\ diff --git a/src/tim/vx/internal/src/libnnext/vsi_nn_libnnext_resource.c b/src/tim/vx/internal/src/libnnext/vsi_nn_libnnext_resource.c index fd2db22..962644c 100644 --- a/src/tim/vx/internal/src/libnnext/vsi_nn_libnnext_resource.c +++ b/src/tim/vx/internal/src/libnnext/vsi_nn_libnnext_resource.c @@ -1525,7 +1525,7 @@ _viv_uniform float output_scale;\n\ _viv_uniform float output_zp;\n\ \n\ #define BATCH_NORM_SH_IMPL(name0, name1, src_type, read_type, conv_type, dst_type, save_type) \\\n\ -__kernel void batch_norm_##name0##to##name1##_brdcst1( \\\n\ +__kernel void batch_norm_##name0##_F16_F16_F16_F32to##name1##_brdcst1( \\\n\ __read_only image2d_array_t input, \\\n\ __read_only image2d_array_t Mean, \\\n\ __read_only image2d_array_t Variance, \\\n\ @@ -1589,7 +1589,7 @@ BATCH_NORM_SH_IMPL(I8, I8, vxc_char16, vxc_char16, int4, vxc_char16, vxc_c BATCH_NORM_SH_IMPL(I8, F16, vxc_char16, vxc_char16, half4, vxc_half8, vxc_ushort8)\n\ \n\ #define BATCH_NORM_SH_IMPL_2D(name0, name1, src_type, read_type, conv_type, dst_type, save_type) \\\n\ -__kernel void batch_norm_##name0##to##name1##_brdcst1_2D( \\\n\ +__kernel void batch_norm_##name0##_F16_F16_F16_F32to##name1##_brdcst1_2D( \\\n\ __read_only image2d_array_t input, \\\n\ __read_only image2d_t Mean, \\\n\ __read_only image2d_t Variance, \\\n\ @@ -1654,7 +1654,7 @@ BATCH_NORM_SH_IMPL_2D(I8, F16, vxc_char16, vxc_char16, half4, vxc_half8, vx \n\ \n\ #define BATCH_NORM_SH_IMPL_AXIS1(name0, name1, src_type, read_type, conv_type, dst_type, save_type) \\\n\ -__kernel void batch_norm_##name0##to##name1##_brdcst0( \\\n\ +__kernel void batch_norm_##name0##_F16_F16_F16_F32to##name1##_brdcst0( \\\n\ __read_only image2d_array_t input, \\\n\ __read_only image2d_array_t Mean, \\\n\ __read_only image2d_array_t Variance, \\\n\ @@ -1721,7 +1721,7 @@ BATCH_NORM_SH_IMPL_AXIS1(I8, I8, vxc_char16, vxc_char16, int4, vxc_char16, BATCH_NORM_SH_IMPL_AXIS1(I8, F16, vxc_char16, vxc_char16, half4, vxc_half8, vxc_ushort8)\n\ \n\ #define BATCH_NORM_SH_IMPL_AXIS1_2D(name0, name1, src_type, read_type, conv_type, dst_type, save_type) \\\n\ -__kernel void batch_norm_##name0##to##name1##_brdcst0_2D( \\\n\ +__kernel void batch_norm_##name0##_F16_F16_F16_F32to##name1##_brdcst0_2D( \\\n\ __read_only image2d_array_t input, \\\n\ __read_only image2d_t Mean, \\\n\ __read_only image2d_t Variance, \\\n\ @@ -1788,6 +1788,275 @@ BATCH_NORM_SH_IMPL_AXIS1_2D(I8, F16, vxc_char16, vxc_char16, half4, vxc_half8 \n\ "; /* end of batchnorm_single_vx*/ +static const char batchnorm_single_f32_vx[] = "#include \"cl_viv_vx_ext.h\"\n\ +\n\ +_viv_uniform VXC_512Bits uniDatatoF32_0_4x4;\n\ +_viv_uniform VXC_512Bits uniDatatoF32_1_4x4;\n\ +_viv_uniform 
VXC_512Bits uniExtract8Data_2x8;\n\ +_viv_uniform float input_scale;\n\ +_viv_uniform float input_tail;\n\ +_viv_uniform float output_scale;\n\ +_viv_uniform float output_zp;\n\ +\n\ +#define BATCH_NORM_SH_IMPL(name0, name1, src_type, read_type, conv_type, dst_type, save_type) \\\n\ +__kernel void batch_norm_##name0##_F16_F16_F32_F32to##name1##_brdcst1( \\\n\ + __read_only image2d_array_t input, \\\n\ + __read_only image2d_array_t Mean, \\\n\ + __read_only image2d_array_t Variance, \\\n\ + __read_only image2d_array_t Gamma, \\\n\ + __read_only image2d_array_t Beta, \\\n\ + __write_only image2d_array_t output, \\\n\ + float eps \\\n\ + ) \\\n\ +{ \\\n\ + int4 coord = (int4)(get_global_id(0), get_global_id(1), get_global_id(2), 0); \\\n\ + read_type vec; \\\n\ + src_type src; \\\n\ + VXC_ReadImage2DArray(vec, input, coord, 0, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); \\\n\ + _viv_asm(COPY, src, vec, 16); \\\n\ + vxc_ushort8 _mean, _var; \\\n\ + vxc_half8 mean, var; \\\n\ + VXC_ReadImage2DArray(_mean, Mean, coord, 0, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); \\\n\ + _viv_asm(COPY, mean, _mean, 16); \\\n\ + VXC_ReadImage2DArray(_var, Variance, coord, 0, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); \\\n\ + _viv_asm(COPY, var, _var, 16); \\\n\ + float4 gamma0 = read_imagef(Gamma, coord); \\\n\ + coord.x += 4; \\\n\ + float4 gamma1 = read_imagef(Gamma, coord); \\\n\ + coord.x -= 4; \\\n\ + float4 beta = read_imagef(Beta, coord); \\\n\ + \\\n\ + float4 src0, src1, m, v; \\\n\ + VXC_DP4x4(src0, src, src, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 1), uniDatatoF32_0_4x4); \\\n\ + VXC_DP4x4(m, mean, mean, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 1), uniDatatoF32_0_4x4); \\\n\ + VXC_DP4x4(v, var, var, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 1), uniDatatoF32_0_4x4); \\\n\ + gamma0 = gamma0 * rsqrt(v + eps); \\\n\ + src0 = src0 * input_scale + input_tail; \\\n\ + src0 = (src0 - m) * gamma0 + beta.xxxx; \\\n\ + src0 = src0 * output_scale + output_zp; \\\n\ + VXC_DP4x4(src1, src, src, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 1), uniDatatoF32_1_4x4); \\\n\ + VXC_DP4x4(m, mean, mean, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 1), uniDatatoF32_1_4x4); \\\n\ + VXC_DP4x4(v, var, var, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 1), uniDatatoF32_1_4x4); \\\n\ + gamma1 = gamma1 * rsqrt(v + eps); \\\n\ + src1 = src1 * input_scale + input_tail; \\\n\ + src1 = (src1 - m) * gamma1 + beta.xxxx; \\\n\ + src1 = src1 * output_scale + output_zp; \\\n\ + \\\n\ + conv_type dst0, dst1; \\\n\ + _viv_asm(CONV_RTE, dst0, src0); \\\n\ + _viv_asm(CONV_RTE, dst1, src1); \\\n\ + dst_type tmp; \\\n\ + save_type dst; \\\n\ + VXC_DP2x8(tmp, dst0, dst1, VXC_MODIFIER(0, 7, 0, VXC_RM_ToNearestEven, 1), uniExtract8Data_2x8); \\\n\ + _viv_asm(COPY, dst, tmp, 16); \\\n\ + VXC_WriteImage2DArray(output, coord, dst, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); \\\n\ +}\n\ +BATCH_NORM_SH_IMPL(F16, F16, vxc_half8, vxc_ushort8, half4, vxc_half8, vxc_ushort8)\n\ +BATCH_NORM_SH_IMPL(F16, I16, vxc_half8, vxc_ushort8, int4, vxc_short8, vxc_short8)\n\ +BATCH_NORM_SH_IMPL(F16, U8, vxc_half8, vxc_ushort8, int4, vxc_uchar16, vxc_uchar16)\n\ +BATCH_NORM_SH_IMPL(F16, I8, vxc_half8, vxc_ushort8, int4, vxc_char16, vxc_char16)\n\ +BATCH_NORM_SH_IMPL(I16, I16, vxc_short8, vxc_short8, int4, vxc_short8, vxc_short8)\n\ +BATCH_NORM_SH_IMPL(I16, F16, vxc_short8, vxc_short8, half4, vxc_half8, vxc_ushort8)\n\ +BATCH_NORM_SH_IMPL(U8, U8, vxc_uchar16, vxc_uchar16, int4, vxc_uchar16, vxc_uchar16)\n\ +BATCH_NORM_SH_IMPL(U8, F16, vxc_uchar16, vxc_uchar16, half4, 
vxc_half8, vxc_ushort8)\n\ +BATCH_NORM_SH_IMPL(I8, I8, vxc_char16, vxc_char16, int4, vxc_char16, vxc_char16)\n\ +BATCH_NORM_SH_IMPL(I8, F16, vxc_char16, vxc_char16, half4, vxc_half8, vxc_ushort8)\n\ +\n\ +#define BATCH_NORM_SH_IMPL_2D(name0, name1, src_type, read_type, conv_type, dst_type, save_type) \\\n\ +__kernel void batch_norm_##name0##_F16_F16_F32_F32to##name1##_brdcst1_2D( \\\n\ + __read_only image2d_array_t input, \\\n\ + __read_only image2d_t Mean, \\\n\ + __read_only image2d_t Variance, \\\n\ + __read_only image2d_t Gamma, \\\n\ + __read_only image2d_t Beta, \\\n\ + __write_only image2d_array_t output, \\\n\ + float eps \\\n\ + ) \\\n\ +{ \\\n\ + int4 coord = (int4)(get_global_id(0), get_global_id(1), get_global_id(2), 0); \\\n\ + read_type vec; \\\n\ + src_type src; \\\n\ + VXC_ReadImage(vec, input, coord.xy, 0, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); \\\n\ + _viv_asm(COPY, src, vec, 16); \\\n\ + coord.z = coord.x + 4; \\\n\ + vxc_ushort8 _mean, _var; \\\n\ + vxc_half8 mean, var; \\\n\ + VXC_ReadImage(_mean, Mean, coord.xy, 0, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); \\\n\ + _viv_asm(COPY, mean, _mean, 16); \\\n\ + VXC_ReadImage(_var, Variance, coord.xy, 0, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); \\\n\ + _viv_asm(COPY, var, _var, 16); \\\n\ + float4 gamma0 = read_imagef(Gamma, coord.xy); \\\n\ + float4 gamma1 = read_imagef(Gamma, coord.zy); \\\n\ + float4 beta = read_imagef(Beta, coord.xy); \\\n\ + \\\n\ + float4 src0, src1, m, v; \\\n\ + VXC_DP4x4(src0, src, src, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 1), uniDatatoF32_0_4x4); \\\n\ + VXC_DP4x4(m, mean, mean, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 1), uniDatatoF32_0_4x4); \\\n\ + VXC_DP4x4(v, var, var, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 1), uniDatatoF32_0_4x4); \\\n\ + gamma0 = gamma0 * rsqrt(v + eps); \\\n\ + src0 = src0 * input_scale + input_tail; \\\n\ + src0 = (src0 - m) * gamma0 + beta.xxxx; \\\n\ + src0 = src0 * output_scale + output_zp; \\\n\ + VXC_DP4x4(src1, src, src, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 1), uniDatatoF32_1_4x4); \\\n\ + VXC_DP4x4(m, mean, mean, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 1), uniDatatoF32_1_4x4); \\\n\ + VXC_DP4x4(v, var, var, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 1), uniDatatoF32_1_4x4); \\\n\ + gamma1 = gamma1 * rsqrt(v + eps); \\\n\ + src1 = src1 * input_scale + input_tail; \\\n\ + src1 = (src1 - m) * gamma1 + beta.xxxx; \\\n\ + src1 = src1 * output_scale + output_zp; \\\n\ + \\\n\ + conv_type dst0, dst1; \\\n\ + _viv_asm(CONV_RTE, dst0, src0); \\\n\ + _viv_asm(CONV_RTE, dst1, src1); \\\n\ + dst_type tmp; \\\n\ + save_type dst; \\\n\ + VXC_DP2x8(tmp, dst0, dst1, VXC_MODIFIER(0, 7, 0, VXC_RM_ToNearestEven, 1), uniExtract8Data_2x8); \\\n\ + _viv_asm(COPY, dst, tmp, 16); \\\n\ + VXC_WriteImage(output, coord.xy, dst, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); \\\n\ +}\n\ +BATCH_NORM_SH_IMPL_2D(F16, F16, vxc_half8, vxc_ushort8, half4, vxc_half8, vxc_ushort8)\n\ +BATCH_NORM_SH_IMPL_2D(F16, I16, vxc_half8, vxc_ushort8, int4, vxc_short8, vxc_short8)\n\ +BATCH_NORM_SH_IMPL_2D(F16, U8, vxc_half8, vxc_ushort8, int4, vxc_uchar16, vxc_uchar16)\n\ +BATCH_NORM_SH_IMPL_2D(F16, I8, vxc_half8, vxc_ushort8, int4, vxc_char16, vxc_char16)\n\ +BATCH_NORM_SH_IMPL_2D(I16, I16, vxc_short8, vxc_short8, int4, vxc_short8, vxc_short8)\n\ +BATCH_NORM_SH_IMPL_2D(I16, F16, vxc_short8, vxc_short8, half4, vxc_half8, vxc_ushort8)\n\ +BATCH_NORM_SH_IMPL_2D(U8, U8, vxc_uchar16, vxc_uchar16, int4, vxc_uchar16, vxc_uchar16)\n\ +BATCH_NORM_SH_IMPL_2D(U8, F16, vxc_uchar16, vxc_uchar16, 
half4, vxc_half8, vxc_ushort8)\n\ +BATCH_NORM_SH_IMPL_2D(I8, I8, vxc_char16, vxc_char16, int4, vxc_char16, vxc_char16)\n\ +BATCH_NORM_SH_IMPL_2D(I8, F16, vxc_char16, vxc_char16, half4, vxc_half8, vxc_ushort8)\n\ +\n\ +\n\ +#define BATCH_NORM_SH_IMPL_AXIS1(name0, name1, src_type, read_type, conv_type, dst_type, save_type) \\\n\ +__kernel void batch_norm_##name0##_F16_F16_F32_F32to##name1##_brdcst0( \\\n\ + __read_only image2d_array_t input, \\\n\ + __read_only image2d_array_t Mean, \\\n\ + __read_only image2d_array_t Variance, \\\n\ + __read_only image2d_array_t Gamma, \\\n\ + __read_only image2d_array_t Beta, \\\n\ + __write_only image2d_array_t output, \\\n\ + float eps \\\n\ + ) \\\n\ +{ \\\n\ + int4 coord = (int4)(get_global_id(0), get_global_id(1), get_global_id(2), 0); \\\n\ + read_type vec; \\\n\ + src_type src; \\\n\ + VXC_ReadImage2DArray(vec, input, coord, 0, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); \\\n\ + _viv_asm(COPY, src, vec, 16); \\\n\ + vxc_ushort8 _mean, _var; \\\n\ + vxc_half8 mean, var; \\\n\ + VXC_ReadImage2DArray(_mean, Mean, coord, 0, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); \\\n\ + _viv_asm(COPY, mean, _mean, 16); \\\n\ + VXC_ReadImage2DArray(_var, Variance, coord, 0, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); \\\n\ + _viv_asm(COPY, var, _var, 16); \\\n\ + float4 gamma0 = read_imagef(Gamma, coord); \\\n\ + float4 beta0 = read_imagef(Beta, coord); \\\n\ + coord.x += 4; \\\n\ + float4 gamma1 = read_imagef(Gamma, coord); \\\n\ + float4 beta1 = read_imagef(Beta, coord); \\\n\ + coord.x -= 4; \\\n\ + \\\n\ + float4 src0, src1, m, v; \\\n\ + VXC_DP4x4(src0, src, src, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 1), uniDatatoF32_0_4x4); \\\n\ + VXC_DP4x4(m, mean, mean, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 1), uniDatatoF32_0_4x4); \\\n\ + VXC_DP4x4(v, var, var, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 1), uniDatatoF32_0_4x4); \\\n\ + gamma0 = gamma0 * rsqrt(v + eps); \\\n\ + src0 = src0 * input_scale + input_tail; \\\n\ + src0 = (src0 - m) * gamma0 + beta0; \\\n\ + src0 = src0 * output_scale + output_zp; \\\n\ + VXC_DP4x4(src1, src, src, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 1), uniDatatoF32_1_4x4); \\\n\ + VXC_DP4x4(m, mean, mean, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 1), uniDatatoF32_1_4x4); \\\n\ + VXC_DP4x4(v, var, var, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 1), uniDatatoF32_1_4x4); \\\n\ + gamma1 = gamma1 * rsqrt(v + eps); \\\n\ + src1 = src1 * input_scale + input_tail; \\\n\ + src1 = (src1 - m) * gamma1 + beta1; \\\n\ + src1 = src1 * output_scale + output_zp; \\\n\ + \\\n\ + conv_type dst0, dst1; \\\n\ + _viv_asm(CONV_RTE, dst0, src0); \\\n\ + _viv_asm(CONV_RTE, dst1, src1); \\\n\ + dst_type tmp; \\\n\ + save_type dst; \\\n\ + VXC_DP2x8(tmp, dst0, dst1, VXC_MODIFIER(0, 7, 0, VXC_RM_ToNearestEven, 1), uniExtract8Data_2x8); \\\n\ + _viv_asm(COPY, dst, tmp, 16); \\\n\ + VXC_WriteImage2DArray(output, coord, dst, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); \\\n\ +}\n\ +BATCH_NORM_SH_IMPL_AXIS1(F16, F16, vxc_half8, vxc_ushort8, half4, vxc_half8, vxc_ushort8)\n\ +BATCH_NORM_SH_IMPL_AXIS1(F16, I16, vxc_half8, vxc_ushort8, int4, vxc_short8, vxc_short8)\n\ +BATCH_NORM_SH_IMPL_AXIS1(F16, U8, vxc_half8, vxc_ushort8, int4, vxc_uchar16, vxc_uchar16)\n\ +BATCH_NORM_SH_IMPL_AXIS1(F16, I8, vxc_half8, vxc_ushort8, int4, vxc_char16, vxc_char16)\n\ +BATCH_NORM_SH_IMPL_AXIS1(I16, I16, vxc_short8, vxc_short8, int4, vxc_short8, vxc_short8)\n\ +BATCH_NORM_SH_IMPL_AXIS1(I16, F16, vxc_short8, vxc_short8, half4, vxc_half8, vxc_ushort8)\n\ +BATCH_NORM_SH_IMPL_AXIS1(U8, U8, 
vxc_uchar16, vxc_uchar16, int4, vxc_uchar16, vxc_uchar16)\n\ +BATCH_NORM_SH_IMPL_AXIS1(U8, F16, vxc_uchar16, vxc_uchar16, half4, vxc_half8, vxc_ushort8)\n\ +BATCH_NORM_SH_IMPL_AXIS1(I8, I8, vxc_char16, vxc_char16, int4, vxc_char16, vxc_char16)\n\ +BATCH_NORM_SH_IMPL_AXIS1(I8, F16, vxc_char16, vxc_char16, half4, vxc_half8, vxc_ushort8)\n\ +\n\ +#define BATCH_NORM_SH_IMPL_AXIS1_2D(name0, name1, src_type, read_type, conv_type, dst_type, save_type) \\\n\ +__kernel void batch_norm_##name0##_F16_F16_F32_F32to##name1##_brdcst0_2D( \\\n\ + __read_only image2d_array_t input, \\\n\ + __read_only image2d_t Mean, \\\n\ + __read_only image2d_t Variance, \\\n\ + __read_only image2d_t Gamma, \\\n\ + __read_only image2d_t Beta, \\\n\ + __write_only image2d_array_t output, \\\n\ + float eps \\\n\ + ) \\\n\ +{ \\\n\ + int4 coord = (int4)(get_global_id(0), get_global_id(1), get_global_id(0), 0); \\\n\ + read_type vec; \\\n\ + src_type src; \\\n\ + VXC_ReadImage(vec, input, coord.xy, 0, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); \\\n\ + _viv_asm(COPY, src, vec, 16); \\\n\ + coord.z += 4; \\\n\ + vxc_ushort8 _mean, _var; \\\n\ + vxc_half8 mean, var; \\\n\ + VXC_ReadImage(_mean, Mean, coord.xy, 0, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); \\\n\ + _viv_asm(COPY, mean, _mean, 16); \\\n\ + VXC_ReadImage(_var, Variance, coord.xy, 0, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); \\\n\ + _viv_asm(COPY, var, _var, 16); \\\n\ + float4 gamma0 = read_imagef(Gamma, coord.xy); \\\n\ + float4 gamma1 = read_imagef(Gamma, coord.zy); \\\n\ + float4 beta0 = read_imagef(Beta, coord.xy); \\\n\ + float4 beta1 = read_imagef(Beta, coord.zy); \\\n\ + \\\n\ + float4 src0, src1, m, v; \\\n\ + VXC_DP4x4(src0, src, src, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 1), uniDatatoF32_0_4x4); \\\n\ + VXC_DP4x4(m, mean, mean, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 1), uniDatatoF32_0_4x4); \\\n\ + VXC_DP4x4(v, var, var, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 1), uniDatatoF32_0_4x4); \\\n\ + gamma0 = gamma0 * rsqrt(v + eps); \\\n\ + src0 = src0 * input_scale + input_tail; \\\n\ + src0 = (src0 - m) * gamma0 + beta0; \\\n\ + src0 = src0 * output_scale + output_zp; \\\n\ + VXC_DP4x4(src1, src, src, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 1), uniDatatoF32_1_4x4); \\\n\ + VXC_DP4x4(m, mean, mean, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 1), uniDatatoF32_1_4x4); \\\n\ + VXC_DP4x4(v, var, var, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 1), uniDatatoF32_1_4x4); \\\n\ + gamma1 = gamma1 * rsqrt(v + eps); \\\n\ + src1 = src1 * input_scale + input_tail; \\\n\ + src1 = (src1 - m) * gamma1 + beta1; \\\n\ + src1 = src1 * output_scale + output_zp; \\\n\ + \\\n\ + conv_type dst0, dst1; \\\n\ + _viv_asm(CONV_RTE, dst0, src0); \\\n\ + _viv_asm(CONV_RTE, dst1, src1); \\\n\ + dst_type tmp; \\\n\ + save_type dst; \\\n\ + VXC_DP2x8(tmp, dst0, dst1, VXC_MODIFIER(0, 7, 0, VXC_RM_ToNearestEven, 1), uniExtract8Data_2x8); \\\n\ + _viv_asm(COPY, dst, tmp, 16); \\\n\ + VXC_WriteImage(output, coord.xy, dst, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); \\\n\ +}\n\ +BATCH_NORM_SH_IMPL_AXIS1_2D(F16, F16, vxc_half8, vxc_ushort8, half4, vxc_half8, vxc_ushort8)\n\ +BATCH_NORM_SH_IMPL_AXIS1_2D(F16, I16, vxc_half8, vxc_ushort8, int4, vxc_short8, vxc_short8)\n\ +BATCH_NORM_SH_IMPL_AXIS1_2D(F16, U8, vxc_half8, vxc_ushort8, int4, vxc_uchar16, vxc_uchar16)\n\ +BATCH_NORM_SH_IMPL_AXIS1_2D(F16, I8, vxc_half8, vxc_ushort8, int4, vxc_char16, vxc_char16)\n\ +BATCH_NORM_SH_IMPL_AXIS1_2D(I16, I16, vxc_short8, vxc_short8, int4, vxc_short8, vxc_short8)\n\ +BATCH_NORM_SH_IMPL_AXIS1_2D(I16, 
F16, vxc_short8, vxc_short8, half4, vxc_half8, vxc_ushort8)\n\ +BATCH_NORM_SH_IMPL_AXIS1_2D(U8, U8, vxc_uchar16, vxc_uchar16, int4, vxc_uchar16, vxc_uchar16)\n\ +BATCH_NORM_SH_IMPL_AXIS1_2D(U8, F16, vxc_uchar16, vxc_uchar16, half4, vxc_half8, vxc_ushort8)\n\ +BATCH_NORM_SH_IMPL_AXIS1_2D(I8, I8, vxc_char16, vxc_char16, int4, vxc_char16, vxc_char16)\n\ +BATCH_NORM_SH_IMPL_AXIS1_2D(I8, F16, vxc_char16, vxc_char16, half4, vxc_half8, vxc_ushort8)\n\ +\n\ +"; /* end of batchnorm_single_f32_vx*/ + static const char cast_vx[] = "\n\ #include \"cl_viv_vx_ext.h\"\n\ \n\ @@ -2300,25 +2569,350 @@ __kernel void clip_U8toF16_2D(\n\ }\n\ "; /* end of clip_U8_vx*/ +static const char conv1d_ovxlib_vx[] = "#include \"cl_viv_vx_ext.h\"\n\ +\n\ +_viv_uniform VXC_512Bits uniConv1DK3_Lo0_4x4;\n\ +_viv_uniform VXC_512Bits uniConv1DK3_Lo1_4x4;\n\ +_viv_uniform VXC_512Bits uniConv1DK3_Lo2_4x4;\n\ +_viv_uniform VXC_512Bits uniConv1DK3_Hi0_4x4;\n\ +_viv_uniform VXC_512Bits uniConv1DK3_Hi1_4x4;\n\ +_viv_uniform VXC_512Bits uniConv1DK3_Hi2_4x4;\n\ +_viv_uniform VXC_512Bits uniDataConvK3_2x8;\n\ +_viv_uniform VXC_512Bits uniSumOrderUchar_2x8;\n\ +\n\ +_viv_uniform int input_ZP;\n\ +_viv_uniform int weight_ZP;\n\ +_viv_uniform float output_ZP;\n\ +_viv_uniform float scaleOut;\n\ +_viv_uniform int input_height;\n\ +\n\ +__kernel void conv1d_U8U8I32toU8_K3_S1(\n\ + __read_only image2d_array_t input,\n\ + __read_only image2d_array_t weight,\n\ + __read_only image2d_t bias,\n\ + __write_only image2d_array_t output,\n\ + int stride,\n\ + int pad_front,\n\ + int pad_end,\n\ + int dilation,\n\ + int overflow_policy)\n\ +{\n\ + int4 coord = (int4)(get_global_id(0), get_global_id(1), 0, 0);\n\ + int4 coord_w = (int4)(0, 0, get_global_id(1), 0);\n\ + float4 sum0, sum1, dst;\n\ + vxc_short8 weight_val_s =(short)input_ZP;\n\ + vxc_uchar16 input_val = 0, weight_val = 0;\n\ + int temp = 0, i;\n\ +\n\ + temp = read_imagei(bias, coord.yz).x;\n\ + sum0 = convert_float(temp);\n\ + sum1 = sum0;\n\ + weight_val_s.s5 = (short)weight_ZP;\n\ +\n\ + for (i = 0; i < input_height; i++)\n\ + {\n\ + VXC_ReadImage2DArray(weight_val, weight, coord_w, VXC_5BITOFFSET_XY(0, 0), \\\n\ + VXC_MODIFIER(0, 2, 0, VXC_RM_TowardZero, 0));\n\ + VXC_ReadImage(input_val, input, coord.xz, VXC_5BITOFFSET_XY(0, 0), \\\n\ + VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0));\n\ + VXC_DP2x8(weight_val_s, weight_val, weight_val_s, \\\n\ + VXC_MODIFIER(0, 5, 0, VXC_RM_TowardZero, 0), uniDataConvK3_2x8);\n\ +\n\ + VXC_DP4x4(dst, input_val, weight_val_s, \\\n\ + VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniConv1DK3_Lo0_4x4);\n\ + sum0 += dst;\n\ + VXC_DP4x4(dst, input_val, weight_val_s, \\\n\ + VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniConv1DK3_Hi0_4x4);\n\ + sum1 += dst;\n\ + coord.x += dilation;\n\ + VXC_ReadImage(input_val, input, coord.xz, VXC_5BITOFFSET_XY(0, 0), \\\n\ + VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0));\n\ + VXC_DP4x4(dst, input_val, weight_val_s, \\\n\ + VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniConv1DK3_Lo1_4x4);\n\ + sum0 += dst;\n\ + VXC_DP4x4(dst, input_val, weight_val_s, \\\n\ + VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniConv1DK3_Hi1_4x4);\n\ + sum1 += dst;\n\ + coord.x += dilation;\n\ + VXC_ReadImage(input_val, input, coord.xz, VXC_5BITOFFSET_XY(0, 0), \\\n\ + VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0));\n\ + VXC_DP4x4(dst, input_val, weight_val_s, \\\n\ + VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniConv1DK3_Lo2_4x4);\n\ + sum0 += dst;\n\ + VXC_DP4x4(dst, input_val, weight_val_s, \\\n\ + VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 
0), uniConv1DK3_Hi2_4x4);\n\ + sum1 += dst;\n\ + coord_w.y++;\n\ + coord.z++;\n\ + coord.x = get_global_id(0);\n\ + }\n\ +\n\ + sum0 = sum0 * scaleOut + output_ZP;\n\ + sum1 = sum1 * scaleOut + output_ZP;\n\ + uchar4 result0, result1;\n\ + _viv_asm(CONV_SAT_RTE, result0, sum0);\n\ + _viv_asm(CONV_SAT_RTE, result1, sum1);\n\ + vxc_uchar8 result;\n\ + VXC_DP2x8(result, result0, result1, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0), uniSumOrderUchar_2x8);\n\ + VXC_WriteImage(output, coord.xy, result, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0));\n\ +}\n\ +\n\ +__kernel void conv1d_U8U8I32toU8_K3_S1_D2_D4(\n\ + __read_only image2d_array_t input,\n\ + __read_only image2d_array_t weight,\n\ + __read_only image2d_t bias,\n\ + __write_only image2d_array_t output,\n\ + int stride,\n\ + int pad_front,\n\ + int pad_end,\n\ + int dilation,\n\ + int overflow_policy)\n\ +{\n\ + int4 coord = (int4)(get_global_id(0), get_global_id(1), 0, 0);\n\ + int4 coord_w = (int4)(0, 0, get_global_id(1), 0);\n\ + float4 sum0, sum1, dst;\n\ + vxc_short8 weight_val_s =(short)input_ZP;\n\ + vxc_uchar16 input_val = 0, weight_val = 0;\n\ + int temp = 0, i;\n\ +\n\ + temp = read_imagei(bias, coord.yz).x;\n\ + sum0 = convert_float(temp);\n\ + sum1 = sum0;\n\ + weight_val_s.s5 = (short)weight_ZP;\n\ +\n\ + for (i = 0; i < input_height; i++)\n\ + {\n\ + VXC_ReadImage2DArray(weight_val, weight, coord_w, VXC_5BITOFFSET_XY(0, 0), \\\n\ + VXC_MODIFIER(0, 2, 0, VXC_RM_TowardZero, 0));\n\ + VXC_ReadImage(input_val, input, coord.xz, VXC_5BITOFFSET_XY(0, 0), \\\n\ + VXC_MODIFIER(0, 15, 0, VXC_RM_TowardZero, 0));\n\ + VXC_DP2x8(weight_val_s, weight_val, weight_val_s, \\\n\ + VXC_MODIFIER(0, 5, 0, VXC_RM_TowardZero, 0), uniDataConvK3_2x8);\n\ +\n\ + VXC_DP4x4(dst, input_val, weight_val_s, \\\n\ + VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniConv1DK3_Lo0_4x4);\n\ + sum0 += dst;\n\ + VXC_DP4x4(dst, input_val, weight_val_s, \\\n\ + VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniConv1DK3_Hi0_4x4);\n\ + sum1 += dst;\n\ + VXC_DP4x4(dst, input_val, weight_val_s, \\\n\ + VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniConv1DK3_Lo1_4x4);\n\ + sum0 += dst;\n\ + VXC_DP4x4(dst, input_val, weight_val_s, \\\n\ + VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniConv1DK3_Hi1_4x4);\n\ + sum1 += dst;\n\ + VXC_DP4x4(dst, input_val, weight_val_s, \\\n\ + VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniConv1DK3_Lo2_4x4);\n\ + sum0 += dst;\n\ + VXC_DP4x4(dst, input_val, weight_val_s, \\\n\ + VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniConv1DK3_Hi2_4x4);\n\ + sum1 += dst;\n\ + coord_w.y++;\n\ + coord.z++;\n\ + }\n\ +\n\ + sum0 = sum0 * scaleOut + output_ZP;\n\ + sum1 = sum1 * scaleOut + output_ZP;\n\ + uchar4 result0, result1;\n\ + _viv_asm(CONV_SAT_RTE, result0, sum0);\n\ + _viv_asm(CONV_SAT_RTE, result1, sum1);\n\ + vxc_uchar8 result;\n\ + VXC_DP2x8(result, result0, result1, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0), uniSumOrderUchar_2x8);\n\ + VXC_WriteImage(output, coord.xy, result, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0));\n\ +}\n\ +"; /* end of conv1d_ovxlib_vx*/ + +static const char conv1d_ovxlib_k1024_vx[] = "#include \"cl_viv_vx_ext.h\"\n\ +\n\ +_viv_uniform VXC_512Bits uniU8SubZp_lo_2x8;\n\ +_viv_uniform VXC_512Bits uniU8SubZp_hi_2x8;\n\ +_viv_uniform VXC_512Bits uniU8Conv1d_part0_8x2;\n\ +_viv_uniform VXC_512Bits uniU8Conv1d_part1_8x2;\n\ +_viv_uniform VXC_512Bits uniU8Conv1d_part2_8x2;\n\ +_viv_uniform VXC_512Bits uniU8Conv1d_part3_8x2;\n\ +_viv_uniform VXC_512Bits uniSumOrderUchar_2x8;\n\ +\n\ +_viv_uniform int kernel_cnt_x16;\n\ 
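+// The _viv_uniform values in this block are presumably supplied by the host-side kernel setup:\n\
+// weight_ZP appears to be the weight zero-point removed via the uniU8SubZp_* tables,\n\
+// scaleOut/output_ZP requantize the accumulated sum before saturation, and kernel_cnt_x16\n\
+// bounds the inner loop that walks the weights 16 elements at a time.\n\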
+_viv_uniform int weight_ZP;\n\ +_viv_uniform float output_ZP;\n\ +_viv_uniform float scaleOut;\n\ +_viv_uniform int input_height;\n\ +_viv_uniform int input_width;\n\ +_viv_uniform int output_width;\n\ +\n\ +__kernel void conv1d_U8U8I32toU8_K1024_SMALL(\n\ + __read_only image2d_array_t input,\n\ + __read_only image2d_array_t weight,\n\ + __read_only image2d_t bias,\n\ + __write_only image2d_array_t output,\n\ + int stride,\n\ + int pad_front,\n\ + int pad_end,\n\ + int dilation,\n\ + int overflow_policy)\n\ +{\n\ + int start_x = get_global_id(0) - pad_front;\n\ + int4 coord = (int4)(start_x, get_global_id(1), 0, get_global_id(0));\n\ + int4 coord_w = (int4)(0, 0, get_global_id(1), 0);\n\ + float4 sum0, sum1, dst;\n\ + vxc_short8 coef;\n\ + vxc_short8 w_zp = (short)weight_ZP;\n\ + vxc_uchar16 input_val = 0, weight_val = 0;\n\ + int temp = 0, i, j;\n\ +\n\ + temp = read_imagei(bias, coord.yz).x;\n\ + sum0 = convert_float(temp);\n\ + sum1 = sum0;\n\ +\n\ + for (i = 0; i < input_height; i++)\n\ + {\n\ + for (j = 0; j < kernel_cnt_x16; j++)\n\ + {\n\ + VXC_ReadImage2DArray(weight_val, weight, coord_w, VXC_5BITOFFSET_XY(0, 0), \\\n\ + VXC_MODIFIER(0, 15, 0, VXC_RM_TowardZero, 0));\n\ + VXC_ReadImage(input_val, input, coord.xz, VXC_5BITOFFSET_XY(0, 0), \\\n\ + VXC_MODIFIER(0, 15, 0, VXC_RM_TowardZero, 0));\n\ + VXC_DP2x8(coef, weight_val, w_zp, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0), uniU8SubZp_lo_2x8);\n\ + VXC_DP8x2(dst, input_val, coef, VXC_MODIFIER(0, 1, 0, VXC_RM_TowardZero, 0), uniU8Conv1d_part0_8x2);\n\ + VXC_DP8x2(dst, input_val, coef, VXC_MODIFIER(2, 3, 0, VXC_RM_TowardZero, 0), uniU8Conv1d_part1_8x2);\n\ + sum0 += dst;\n\ + VXC_DP8x2(dst, input_val, coef, VXC_MODIFIER(0, 1, 0, VXC_RM_TowardZero, 0), uniU8Conv1d_part2_8x2);\n\ + VXC_DP8x2(dst, input_val, coef, VXC_MODIFIER(2, 3, 0, VXC_RM_TowardZero, 0), uniU8Conv1d_part3_8x2);\n\ + sum1 += dst;\n\ + VXC_ReadImage(input_val, input, coord.xz, VXC_5BITOFFSET_XY(8, 0), \\\n\ + VXC_MODIFIER(0, 15, 0, VXC_RM_TowardZero, 0));\n\ + VXC_DP2x8(coef, weight_val, w_zp, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0), uniU8SubZp_hi_2x8);\n\ + VXC_DP8x2(dst, input_val, coef, VXC_MODIFIER(0, 1, 0, VXC_RM_TowardZero, 0), uniU8Conv1d_part0_8x2);\n\ + VXC_DP8x2(dst, input_val, coef, VXC_MODIFIER(2, 3, 0, VXC_RM_TowardZero, 0), uniU8Conv1d_part1_8x2);\n\ + sum0 += dst;\n\ + VXC_DP8x2(dst, input_val, coef, VXC_MODIFIER(0, 1, 0, VXC_RM_TowardZero, 0), uniU8Conv1d_part2_8x2);\n\ + VXC_DP8x2(dst, input_val, coef, VXC_MODIFIER(2, 3, 0, VXC_RM_TowardZero, 0), uniU8Conv1d_part3_8x2);\n\ + sum1 += dst;\n\ + coord_w.x += 16;\n\ + coord.x += 16;\n\ + }\n\ + coord_w.x = 0;\n\ + coord_w.y++;\n\ + coord.z++;\n\ + coord.x = start_x;\n\ + }\n\ +\n\ + sum0 = sum0 * scaleOut + output_ZP;\n\ + sum1 = sum1 * scaleOut + output_ZP;\n\ + uchar4 result0, result1;\n\ + _viv_asm(CONV_SAT_RTE, result0, sum0);\n\ + _viv_asm(CONV_SAT_RTE, result1, sum1);\n\ + vxc_uchar8 result;\n\ + VXC_DP2x8(result, result0, result1, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0), uniSumOrderUchar_2x8);\n\ + VXC_WriteImage(output, coord.wy, result, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0));\n\ +}\n\ +\n\ +inline uchar* get_image2D_array_ptr(image2d_array_t input)\n\ +{\n\ + int8 desc;\n\ + _viv_asm(COPY, desc, input, sizeof(desc));\n\ + uchar *src_ptr = (uchar*)desc.s0;\n\ + return src_ptr;\n\ +}\n\ +\n\ +__kernel void conv1d_U8U8I32toU8_K1024_LARGE(\n\ + __read_only image2d_array_t input,\n\ + __read_only image2d_array_t weight,\n\ + __read_only image2d_t bias,\n\ + __write_only image2d_array_t 
output,\n\ + int stride,\n\ + int pad_front,\n\ + int pad_end,\n\ + int dilation,\n\ + int overflow_policy)\n\ +{\n\ + int start_x = get_global_id(0);\n\ + int w_left = output_width - start_x;\n\ + int out_x = w_left < 8 ? get_global_id(0) - w_left : get_global_id(0);\n\ + int4 coord = (int4)(start_x, get_global_id(1), 0, out_x);\n\ + int4 coord_w = (int4)(0, 0, get_global_id(1), 0);\n\ + float4 sum0, sum1, dst;\n\ + vxc_short8 coef;\n\ + vxc_short8 w_zp = (short)weight_ZP;\n\ + vxc_uchar16 input_val = 0, weight_val = 0;\n\ + int temp = 0, i, j;\n\ + uchar *src_ptr_base = (uchar *)get_image2D_array_ptr(input);\n\ + uchar *src_ptr;\n\ + uchar *dst_ptr = (uchar *)get_image2D_array_ptr(output);\n\ +\n\ + temp = read_imagei(bias, coord.yz).x;\n\ + sum0 = convert_float(temp);\n\ + sum1 = sum0;\n\ +\n\ + for (i = 0; i < input_height; i++)\n\ + {\n\ + src_ptr = src_ptr_base + (coord.x + coord.z * input_width);\n\ + for (j = 0; j < kernel_cnt_x16; j++)\n\ + {\n\ + VXC_ReadImage2DArray(weight_val, weight, coord_w, VXC_5BITOFFSET_XY(0, 0), \\\n\ + VXC_MODIFIER(0, 15, 0, VXC_RM_TowardZero, 0));\n\ + VXC_Vload16(input_val, src_ptr, 0);\n\ + VXC_DP2x8(coef, weight_val, w_zp, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0), uniU8SubZp_lo_2x8);\n\ + VXC_DP8x2(dst, input_val, coef, VXC_MODIFIER(0, 1, 0, VXC_RM_TowardZero, 0), uniU8Conv1d_part0_8x2);\n\ + VXC_DP8x2(dst, input_val, coef, VXC_MODIFIER(2, 3, 0, VXC_RM_TowardZero, 0), uniU8Conv1d_part1_8x2);\n\ + sum0 += dst;\n\ + VXC_DP8x2(dst, input_val, coef, VXC_MODIFIER(0, 1, 0, VXC_RM_TowardZero, 0), uniU8Conv1d_part2_8x2);\n\ + VXC_DP8x2(dst, input_val, coef, VXC_MODIFIER(2, 3, 0, VXC_RM_TowardZero, 0), uniU8Conv1d_part3_8x2);\n\ + sum1 += dst;\n\ + src_ptr += 8;\n\ + VXC_Vload16(input_val, src_ptr, 0);\n\ + VXC_DP2x8(coef, weight_val, w_zp, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0), uniU8SubZp_hi_2x8);\n\ + VXC_DP8x2(dst, input_val, coef, VXC_MODIFIER(0, 1, 0, VXC_RM_TowardZero, 0), uniU8Conv1d_part0_8x2);\n\ + VXC_DP8x2(dst, input_val, coef, VXC_MODIFIER(2, 3, 0, VXC_RM_TowardZero, 0), uniU8Conv1d_part1_8x2);\n\ + sum0 += dst;\n\ + VXC_DP8x2(dst, input_val, coef, VXC_MODIFIER(0, 1, 0, VXC_RM_TowardZero, 0), uniU8Conv1d_part2_8x2);\n\ + VXC_DP8x2(dst, input_val, coef, VXC_MODIFIER(2, 3, 0, VXC_RM_TowardZero, 0), uniU8Conv1d_part3_8x2);\n\ + sum1 += dst;\n\ + coord_w.x += 16;\n\ + coord.x += 16;\n\ + src_ptr += 8;\n\ + }\n\ + coord_w.x = 0;\n\ + coord_w.y++;\n\ + coord.z++;\n\ + coord.x = start_x;\n\ + }\n\ +\n\ + sum0 = sum0 * scaleOut + output_ZP;\n\ + sum1 = sum1 * scaleOut + output_ZP;\n\ + uchar4 result0, result1;\n\ + _viv_asm(CONV_SAT_RTE, result0, sum0);\n\ + _viv_asm(CONV_SAT_RTE, result1, sum1);\n\ + vxc_uchar8 result;\n\ + VXC_DP2x8(result, result0, result1, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0), uniSumOrderUchar_2x8);\n\ + dst_ptr = dst_ptr + (coord.w + coord.y * output_width);\n\ + VXC_Vstore8(dst_ptr, 0, result);\n\ +}\n\ +\n\ +"; /* end of conv1d_ovxlib_k1024_vx*/ + static const char depth2space_crd_vx[] = "#include \"cl_viv_vx_ext.h\"\n\ \n\ _viv_uniform int2 multAndoutZP0;//[0:15] multiplier, [31:63] output zp\n\ _viv_uniform VXC_512Bits uniU8MulAndPostShift_0_Lo_2x8;\n\ +_viv_uniform VXC_512Bits uniU8MulAndPostShift_ExLo_2x8;\n\ +_viv_uniform VXC_512Bits uniU8MulAndPostShift_ExHi_2x8;\n\ +_viv_uniform VXC_512Bits uniDepth2SpaceF16Blk2_lo_2x8;\n\ +_viv_uniform VXC_512Bits uniDepth2SpaceF16Blk2_hi_2x8;\n\ +\n\ \n\ #define DEPTH2SPACE_CRD_QINT_TO_QINT(src0_type_name, src1_type_name, read_type, write_type) \\\n\ __kernel void 
depth2space_crd_##src0_type_name##to##src1_type_name( \\\n\ - image2d_array_t input, \\\n\ - image2d_array_t output, \\\n\ - int block_size \\\n\ - ) \\\n\ + image2d_array_t input, image2d_array_t output, int block_size) \\\n\ { \\\n\ int gidx = get_global_id(0); \\\n\ int gidy = get_global_id(1); \\\n\ int gidz = get_global_id(2); \\\n\ int4 coord_out = (int4)(gidx, gidy, gidz, 0); \\\n\ int block_e2 = block_size * block_size; \\\n\ - int inx = gidx / block_size; \\\n\ - int iny = gidy / block_size; \\\n\ + ushort blk = (ushort)block_size; \\\n\ + int inx = (int)((ushort)gidx / blk); \\\n\ + int iny = (int)((ushort)gidy / blk); \\\n\ int inz = (gidx % block_size) + (gidy % block_size) * block_size + gidz * block_e2; \\\n\ int4 coord_in = (int4)(inx, iny, inz, 0); \\\n\ read_type src; \\\n\ @@ -2335,18 +2929,16 @@ DEPTH2SPACE_CRD_QINT_TO_QINT(I8, I8, vxc_char16, vxc_char16)\n\ DEPTH2SPACE_CRD_QINT_TO_QINT(I16, I16, vxc_short8, vxc_short8)\n\ \n\ __kernel void depth2space_crd_F16toF16(\n\ - image2d_array_t input,\n\ - image2d_array_t output,\n\ - int block_size\n\ - )\n\ + image2d_array_t input, image2d_array_t output, int block_size)\n\ {\n\ int gidx = get_global_id(0);\n\ int gidy = get_global_id(1);\n\ int gidz = get_global_id(2);\n\ int4 coord_out = (int4)(gidx, gidy, gidz, 0);\n\ int block_e2 = block_size * block_size;\n\ - int inx = gidx / block_size;\n\ - int iny = gidy / block_size;\n\ + ushort blk = (ushort)block_size;\n\ + int inx = (int)((ushort)gidx / blk);\n\ + int iny = (int)((ushort)gidy / blk);\n\ int inz = (gidx % block_size) + (gidy % block_size) * block_size + gidz * block_e2;\n\ int4 coord_in = (int4)(inx, iny, inz, 0);\n\ vxc_short8 data;\n\ @@ -2356,18 +2948,16 @@ __kernel void depth2space_crd_F16toF16(\n\ \n\ #define DEPTH2SPACE_CRD_QINT_TO_F16(src0_type_name, read_type) \\\n\ __kernel void depth2space_crd_##src0_type_name##toF16( \\\n\ - image2d_array_t input, \\\n\ - image2d_array_t output, \\\n\ - int block_size \\\n\ - ) \\\n\ + image2d_array_t input, image2d_array_t output, int block_size) \\\n\ { \\\n\ int gidx = get_global_id(0); \\\n\ int gidy = get_global_id(1); \\\n\ int gidz = get_global_id(2); \\\n\ int4 coord_out = (int4)(gidx, gidy, gidz, 0); \\\n\ int block_e2 = block_size * block_size; \\\n\ - int inx = gidx / block_size; \\\n\ - int iny = gidy / block_size; \\\n\ + ushort blk = (ushort)block_size; \\\n\ + int inx = (int)((ushort)gidx / blk); \\\n\ + int iny = (int)((ushort)gidy / blk); \\\n\ int inz = (gidx % block_size) + (gidy % block_size) * block_size + gidz * block_e2; \\\n\ int4 coord_in = (int4)(inx, iny, inz, 0); \\\n\ read_type src; \\\n\ @@ -2387,18 +2977,16 @@ DEPTH2SPACE_CRD_QINT_TO_F16(I16, vxc_short8)\n\ \n\ #define DEPTH2SPACE_CRD_F16_TO_QINT(src1_type_name, write_type) \\\n\ __kernel void depth2space_crd_F16to##src1_type_name( \\\n\ - image2d_array_t input, \\\n\ - image2d_array_t output, \\\n\ - int block_size \\\n\ - ) \\\n\ + image2d_array_t input, image2d_array_t output, int block_size) \\\n\ { \\\n\ int gidx = get_global_id(0); \\\n\ int gidy = get_global_id(1); \\\n\ int gidz = get_global_id(2); \\\n\ int4 coord_out = (int4)(gidx, gidy, gidz, 0); \\\n\ int block_e2 = block_size * block_size; \\\n\ - int inx = gidx / block_size; \\\n\ - int iny = gidy / block_size; \\\n\ + ushort blk = (ushort)block_size; \\\n\ + int inx = (int)((ushort)gidx / blk); \\\n\ + int iny = (int)((ushort)gidy / blk); \\\n\ int inz = (gidx % block_size) + (gidy % block_size) * block_size + gidz * block_e2; \\\n\ int4 coord_in = (int4)(inx, iny, inz, 0); 
\\\n\ vxc_short8 src; \\\n\ @@ -2414,7 +3002,202 @@ __kernel void depth2space_crd_F16to##src1_type_name( \\\n\ }\n\ DEPTH2SPACE_CRD_F16_TO_QINT(U8, vxc_uchar16)\n\ DEPTH2SPACE_CRD_F16_TO_QINT(I8, vxc_char16)\n\ -DEPTH2SPACE_CRD_F16_TO_QINT(I16, vxc_short8)"; /* end of depth2space_crd_vx*/ +DEPTH2SPACE_CRD_F16_TO_QINT(I16, vxc_short8)\n\ +\n\ +#define DEPTH2SPACE_CRD_QINT_TO_QINT_BLK2(src0_type_name, src1_type_name, read_type, write_type) \\\n\ +__kernel void depth2space_crd_##src0_type_name##to##src1_type_name##_blk2( \\\n\ + image2d_array_t input, image2d_array_t output, int block_size) \\\n\ +{ \\\n\ + int gidy = get_global_id(1); \\\n\ + int gidz = get_global_id(2); \\\n\ + int4 coord_out = (int4)(get_global_id(0), gidy, gidz, gidz); \\\n\ + int4 coord_in = coord_out >> 1; \\\n\ + coord_in.z = (gidy & 1) * 2 + gidz * 4; \\\n\ + coord_in.w = coord_in.z + 1; \\\n\ + read_type src; \\\n\ + VXC_ReadImage2DArray(src, input, coord_in.xyzz, \\\n\ + VXC_5BITOFFSET_XY(0, 0), VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); \\\n\ + VXC_ReadImage2DArray(src, input, coord_in.xyww, \\\n\ + VXC_5BITOFFSET_XY(0, 0), VXC_MODIFIER(8, 15, 0, VXC_RM_TowardZero, 0)); \\\n\ + \\\n\ + write_type dst; \\\n\ + vxc_ushort8 ms0; \\\n\ + _viv_asm(COPY, ms0, multAndoutZP0, 16); \\\n\ + VXC_DP2x8(dst, src, ms0, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 1), uniU8MulAndPostShift_ExLo_2x8); \\\n\ + VXC_DP2x8(dst, src, ms0, VXC_MODIFIER(8, 15, 0, VXC_RM_TowardZero, 1), uniU8MulAndPostShift_ExHi_2x8); \\\n\ + VXC_WriteImage2DArray(output, coord_out, dst, VXC_MODIFIER(0, 15, 0, VXC_RM_TowardZero, 0)); \\\n\ +}\n\ +DEPTH2SPACE_CRD_QINT_TO_QINT_BLK2(U8, U8, vxc_uchar16, vxc_uchar16)\n\ +DEPTH2SPACE_CRD_QINT_TO_QINT_BLK2(I8, I8, vxc_char16, vxc_char16)\n\ +\n\ +__kernel void depth2space_crd_F16toF16_blk2(\n\ + image2d_array_t input, image2d_array_t output, int block_size)\n\ +{\n\ + int gidy = get_global_id(1);\n\ + int gidz = get_global_id(2);\n\ + int4 coord_out = (int4)(get_global_id(0), gidy, gidz, gidz);\n\ + int4 coord_in = coord_out >> 1;\n\ + coord_in.z = (gidy & 1) * 2 + gidz * 4;\n\ + coord_in.w = coord_in.z + 1;\n\ + vxc_short8 data0, data1, dst0, dst1;\n\ + VXC_ReadImage2DArray(data0, input, coord_in.xyzz,\n\ + VXC_5BITOFFSET_XY(0, 0), VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0));\n\ + VXC_ReadImage2DArray(data1, input, coord_in.xyww,\n\ + VXC_5BITOFFSET_XY(0, 0), VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0));\n\ + VXC_DP2x8(dst0, data0, data1, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0), uniDepth2SpaceF16Blk2_lo_2x8);\n\ + VXC_DP2x8(dst1, data0, data1, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0), uniDepth2SpaceF16Blk2_hi_2x8);\n\ +\n\ + VXC_WriteImage2DArray(output, coord_out, dst0, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0));\n\ + coord_out.x += 8;\n\ + VXC_WriteImage2DArray(output, coord_out, dst1, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0));\n\ +}\n\ +\n\ +__kernel void depth2space_crd_I16toI16_blk2(\n\ + image2d_array_t input, image2d_array_t output, int block_size)\n\ +{\n\ + int gidy = get_global_id(1);\n\ + int gidz = get_global_id(2);\n\ + int4 coord_out = (int4)(get_global_id(0), gidy, gidz, gidz);\n\ + int4 coord_in = coord_out >> 1;\n\ + coord_in.z = (gidy & 1) * 2 + gidz * 4;\n\ + coord_in.w = coord_in.z + 1;\n\ + vxc_short8 src0, src1, data0, data1, dst0, dst1;\n\ + VXC_ReadImage2DArray(src0, input, coord_in.xyzz,\n\ + VXC_5BITOFFSET_XY(0, 0), VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0));\n\ + VXC_ReadImage2DArray(src1, input, coord_in.xyww,\n\ + VXC_5BITOFFSET_XY(0, 0), VXC_MODIFIER(0, 7, 0, 
VXC_RM_TowardZero, 0));\n\ + VXC_DP2x8(data0, src0, src1, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0), uniDepth2SpaceF16Blk2_lo_2x8);\n\ + VXC_DP2x8(data1, src0, src1, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0), uniDepth2SpaceF16Blk2_hi_2x8);\n\ +\n\ + vxc_ushort8 ms0;\n\ + _viv_asm(COPY, ms0, multAndoutZP0, 16);\n\ + VXC_DP2x8(dst0, data0, ms0, VXC_MODIFIER(0, 7, 0,VXC_RM_TowardZero, 1), uniU8MulAndPostShift_0_Lo_2x8);\n\ + VXC_DP2x8(dst1, data1, ms0, VXC_MODIFIER(0, 7, 0,VXC_RM_TowardZero, 1), uniU8MulAndPostShift_0_Lo_2x8);\n\ +\n\ + VXC_WriteImage2DArray(output, coord_out, dst0, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0));\n\ + coord_out.x += 8;\n\ + VXC_WriteImage2DArray(output, coord_out, dst1, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0));\n\ +}\n\ +\n\ +#define DEPTH2SPACE_CRD_QINT_TO_F16_BLK2(src0_type_name, read_type) \\\n\ +__kernel void depth2space_crd_##src0_type_name##toF16_blk2( \\\n\ + image2d_array_t input, image2d_array_t output, int block_size) \\\n\ +{ \\\n\ + int gidy = get_global_id(1); \\\n\ + int gidz = get_global_id(2); \\\n\ + int4 coord_out = (int4)(get_global_id(0), gidy, gidz, gidz); \\\n\ + int4 coord_in = coord_out >> 1; \\\n\ + coord_in.z = (gidy & 1) * 2 + gidz * 4; \\\n\ + coord_in.w = coord_in.z + 1; \\\n\ + read_type src; \\\n\ + VXC_ReadImage2DArray(src, input, coord_in.xyzz, \\\n\ + VXC_5BITOFFSET_XY(0, 0), VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); \\\n\ + VXC_ReadImage2DArray(src, input, coord_in.xyww, \\\n\ + VXC_5BITOFFSET_XY(0, 0), VXC_MODIFIER(8, 15, 0, VXC_RM_TowardZero, 0)); \\\n\ + \\\n\ + vxc_half8 tmpDst0, tmpDst1; \\\n\ + vxc_short8 dst0, dst1; \\\n\ + vxc_ushort8 ms0; \\\n\ + _viv_asm(COPY, ms0, multAndoutZP0, 16); \\\n\ + VXC_DP2x8(tmpDst0, src, ms0, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 1), uniU8MulAndPostShift_ExLo_2x8); \\\n\ + VXC_DP2x8(tmpDst1, src, ms0, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 1), uniU8MulAndPostShift_ExHi_2x8); \\\n\ + _viv_asm(COPY, dst0, tmpDst0, 16); \\\n\ + _viv_asm(COPY, dst1, tmpDst1, 16); \\\n\ + VXC_WriteImage2DArray(output, coord_out, dst0, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); \\\n\ + coord_out.x+=8; \\\n\ + VXC_WriteImage2DArray(output, coord_out, dst1, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); \\\n\ +}\n\ +DEPTH2SPACE_CRD_QINT_TO_F16_BLK2(U8, vxc_uchar16)\n\ +DEPTH2SPACE_CRD_QINT_TO_F16_BLK2(I8, vxc_char16)\n\ +\n\ +__kernel void depth2space_crd_I16toF16_blk2(\n\ + image2d_array_t input, image2d_array_t output, int block_size)\n\ +{\n\ + int gidy = get_global_id(1);\n\ + int gidz = get_global_id(2);\n\ + int4 coord_out = (int4)(get_global_id(0), gidy, gidz, gidz);\n\ + int4 coord_in = coord_out >> 1;\n\ + coord_in.z = (gidy & 1) * 2 + gidz * 4;\n\ + coord_in.w = coord_in.z + 1;\n\ + vxc_short8 src0, src1, data0, data1, dst0, dst1;\n\ + vxc_half8 tmpDst0, tmpDst1;\n\ + VXC_ReadImage2DArray(src0, input, coord_in.xyzz,\n\ + VXC_5BITOFFSET_XY(0, 0), VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0));\n\ + VXC_ReadImage2DArray(src1, input, coord_in.xyww,\n\ + VXC_5BITOFFSET_XY(0, 0), VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0));\n\ + VXC_DP2x8(data0, src0, src1, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0), uniDepth2SpaceF16Blk2_lo_2x8);\n\ + VXC_DP2x8(data1, src0, src1, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0), uniDepth2SpaceF16Blk2_hi_2x8);\n\ +\n\ + vxc_ushort8 ms0;\n\ + _viv_asm(COPY, ms0, multAndoutZP0, 16);\n\ + VXC_DP2x8(tmpDst0, data0, ms0, VXC_MODIFIER(0, 7, 0,VXC_RM_TowardZero, 1), uniU8MulAndPostShift_0_Lo_2x8);\n\ + VXC_DP2x8(tmpDst1, data1, ms0, VXC_MODIFIER(0, 7, 
0,VXC_RM_TowardZero, 1), uniU8MulAndPostShift_0_Lo_2x8);\n\ + _viv_asm(COPY, dst0, tmpDst0, 16);\n\ + _viv_asm(COPY, dst1, tmpDst1, 16);\n\ + VXC_WriteImage2DArray(output, coord_out, dst0, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0));\n\ + coord_out.x += 8;\n\ + VXC_WriteImage2DArray(output, coord_out, dst1, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0));\n\ +}\n\ +\n\ +#define DEPTH2SPACE_CRD_F16_TO_QINT_BLK2(src1_type_name, write_type) \\\n\ +__kernel void depth2space_crd_F16to##src1_type_name##_blk2( \\\n\ + image2d_array_t input, image2d_array_t output, int block_size) \\\n\ +{ \\\n\ + int gidy = get_global_id(1); \\\n\ + int gidz = get_global_id(2); \\\n\ + int4 coord_out = (int4)(get_global_id(0), gidy, gidz, gidz); \\\n\ + int4 coord_in = coord_out >> 1; \\\n\ + coord_in.z = (gidy & 1) * 2 + gidz * 4; \\\n\ + coord_in.w = coord_in.z + 1; \\\n\ + vxc_short8 src0, src1, data0, data1; \\\n\ + vxc_half8 tmpDst0, tmpDst1; \\\n\ + VXC_ReadImage2DArray(src0, input, coord_in.xyzz, \\\n\ + VXC_5BITOFFSET_XY(0, 0), VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); \\\n\ + VXC_ReadImage2DArray(src1, input, coord_in.xyww, \\\n\ + VXC_5BITOFFSET_XY(0, 0), VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); \\\n\ + VXC_DP2x8(data0, src0, src1, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0), uniDepth2SpaceF16Blk2_lo_2x8); \\\n\ + VXC_DP2x8(data1, src0, src1, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0), uniDepth2SpaceF16Blk2_hi_2x8); \\\n\ + \\\n\ + write_type dst; \\\n\ + vxc_ushort8 ms0; \\\n\ + _viv_asm(COPY, ms0, multAndoutZP0, 16); \\\n\ + _viv_asm(COPY, tmpDst0, data0, 16); \\\n\ + _viv_asm(COPY, tmpDst1, data1, 16); \\\n\ + VXC_DP2x8(dst, tmpDst0, ms0, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 1), uniU8MulAndPostShift_0_Lo_2x8); \\\n\ + VXC_DP2x8(dst, tmpDst1, ms0, VXC_MODIFIER(8, 15, 0, VXC_RM_TowardZero, 1), uniU8MulAndPostShift_0_Lo_2x8); \\\n\ + VXC_WriteImage2DArray(output, coord_out, dst, VXC_MODIFIER(0, 15, 0, VXC_RM_TowardZero, 0)); \\\n\ +}\n\ +DEPTH2SPACE_CRD_F16_TO_QINT_BLK2(U8, vxc_uchar16)\n\ +DEPTH2SPACE_CRD_F16_TO_QINT_BLK2(I8, vxc_char16)\n\ +\n\ +__kernel void depth2space_crd_F16toI16_blk2(\n\ + image2d_array_t input, image2d_array_t output, int block_size)\n\ +{\n\ + int gidy = get_global_id(1);\n\ + int gidz = get_global_id(2);\n\ + int4 coord_out = (int4)(get_global_id(0), gidy, gidz, gidz);\n\ + int4 coord_in = coord_out >> 1;\n\ + coord_in.z = (gidy & 1) * 2 + gidz * 4;\n\ + coord_in.w = coord_in.z + 1;\n\ + vxc_short8 src0, src1, data0, data1, dst0, dst1;\n\ + vxc_half8 tmpDst0, tmpDst1;\n\ + VXC_ReadImage2DArray(src0, input, coord_in.xyzz,\n\ + VXC_5BITOFFSET_XY(0, 0), VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0));\n\ + VXC_ReadImage2DArray(src1, input, coord_in.xyww,\n\ + VXC_5BITOFFSET_XY(0, 0), VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0));\n\ + vxc_ushort8 ms0;\n\ + _viv_asm(COPY, ms0, multAndoutZP0, 16);\n\ + VXC_DP2x8(data0, src0, src1, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0), uniDepth2SpaceF16Blk2_lo_2x8);\n\ + VXC_DP2x8(data1, src0, src1, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0), uniDepth2SpaceF16Blk2_hi_2x8);\n\ + _viv_asm(COPY, tmpDst0, data0, 16);\n\ + _viv_asm(COPY, tmpDst1, data1, 16);\n\ + VXC_DP2x8(dst0, tmpDst0, ms0, VXC_MODIFIER(0, 7, 0,VXC_RM_TowardZero, 1), uniU8MulAndPostShift_0_Lo_2x8);\n\ + VXC_DP2x8(dst1, tmpDst1, ms0, VXC_MODIFIER(0, 7, 0,VXC_RM_TowardZero, 1), uniU8MulAndPostShift_0_Lo_2x8);\n\ +\n\ + VXC_WriteImage2DArray(output, coord_out, dst0, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0));\n\ + coord_out.x += 8;\n\ + VXC_WriteImage2DArray(output, 
coord_out, dst1, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0));\n\ +}"; /* end of depth2space_crd_vx*/ static const char depthwise_conv1d_src0_vx[] = "#include \"cl_viv_vx_ext.h\"\n\ \n\ @@ -3322,6 +4105,11 @@ float4 eltwise_unary_mish(float4 x)\n\ return x;\n\ }\n\ \n\ +float4 eltwise_unary_round(float4 x)\n\ +{\n\ + return convert_float4(convert_int4_rte(x));\n\ +}\n\ +\n\ _viv_uniform float inputScale;\n\ _viv_uniform float inputTail;\n\ _viv_uniform float outputScale;\n\ @@ -3442,7 +4230,17 @@ ELTSISE_UNARY_2D(hard_sigmoid, U8, U8, vxc_uchar8, vxc_uchar8, int4, vxc_ucha ELTSISE_UNARY_2D(hard_sigmoid, U8, F16, vxc_uchar8, vxc_uchar8, half4, vxc_half8, vxc_short8)\n\ ELTSISE_UNARY_2D(hard_sigmoid, I16, I16, vxc_short8, vxc_short8, int4, vxc_short8, vxc_short8)\n\ ELTSISE_UNARY_2D(hard_sigmoid, I16, F16, vxc_short8, vxc_short8, half4, vxc_half8, vxc_short8)\n\ -\n\ +//ROUND\n\ +ELTSISE_UNARY_2D(round, F16, F16, vxc_short8, vxc_half8, half4, vxc_half8, vxc_short8)\n\ +ELTSISE_UNARY_2D(round, F16, I8, vxc_short8, vxc_half8, int4, vxc_char8, vxc_char8)\n\ +ELTSISE_UNARY_2D(round, F16, U8, vxc_short8, vxc_half8, int4, vxc_uchar8, vxc_uchar8)\n\ +ELTSISE_UNARY_2D(round, F16, I16, vxc_short8, vxc_half8, int4, vxc_short8, vxc_short8)\n\ +ELTSISE_UNARY_2D(round, I8, I8, vxc_char8, vxc_char8, int4, vxc_char8, vxc_char8)\n\ +ELTSISE_UNARY_2D(round, I8, F16, vxc_char8, vxc_char8, half4, vxc_half8, vxc_short8)\n\ +ELTSISE_UNARY_2D(round, U8, U8, vxc_uchar8, vxc_uchar8, int4, vxc_uchar8, vxc_uchar8)\n\ +ELTSISE_UNARY_2D(round, U8, F16, vxc_uchar8, vxc_uchar8, half4, vxc_half8, vxc_short8)\n\ +ELTSISE_UNARY_2D(round, I16, I16, vxc_short8, vxc_short8, int4, vxc_short8, vxc_short8)\n\ +ELTSISE_UNARY_2D(round, I16, F16, vxc_short8, vxc_short8, half4, vxc_half8, vxc_short8)\n\ \n\ _viv_uniform VXC_512Bits uniConvBF16toF32_Part0_2x8;\n\ _viv_uniform VXC_512Bits uniConvBF16toF32_Part1_2x8;\n\ @@ -3490,6 +4288,8 @@ ELTSISE_UNARY_BF16_2D(neg)\n\ ELTSISE_UNARY_BF16_2D(mish)\n\ //HARD_SIGMOID\n\ ELTSISE_UNARY_BF16_2D(hard_sigmoid)\n\ +//ROUND\n\ +ELTSISE_UNARY_BF16_2D(round)\n\ "; /* end of eltwise_unary_2d_vx*/ static const char eltwise_unary_3d_vx[] = "#include \"cl_viv_vx_ext.h\"\n\ @@ -3561,6 +4361,11 @@ float4 eltwise_unary_mish(float4 x)\n\ return x;\n\ }\n\ \n\ +float4 eltwise_unary_round(float4 x)\n\ +{\n\ + return convert_float4(convert_int4_rte(x));\n\ +}\n\ +\n\ _viv_uniform float inputScale;\n\ _viv_uniform float inputTail;\n\ _viv_uniform float outputScale;\n\ @@ -3681,6 +4486,17 @@ ELTSISE_UNARY_3D(hard_sigmoid, U8, U8, vxc_uchar8, vxc_uchar8, int4, vxc_ucha ELTSISE_UNARY_3D(hard_sigmoid, U8, F16, vxc_uchar8, vxc_uchar8, half4, vxc_half8, vxc_short8)\n\ ELTSISE_UNARY_3D(hard_sigmoid, I16, I16, vxc_short8, vxc_short8, int4, vxc_short8, vxc_short8)\n\ ELTSISE_UNARY_3D(hard_sigmoid, I16, F16, vxc_short8, vxc_short8, half4, vxc_half8, vxc_short8)\n\ +//ROUND\n\ +ELTSISE_UNARY_3D(round, F16, F16, vxc_short8, vxc_half8, half4, vxc_half8, vxc_short8)\n\ +ELTSISE_UNARY_3D(round, F16, I8, vxc_short8, vxc_half8, int4, vxc_char8, vxc_char8)\n\ +ELTSISE_UNARY_3D(round, F16, U8, vxc_short8, vxc_half8, int4, vxc_uchar8, vxc_uchar8)\n\ +ELTSISE_UNARY_3D(round, F16, I16, vxc_short8, vxc_half8, int4, vxc_short8, vxc_short8)\n\ +ELTSISE_UNARY_3D(round, I8, I8, vxc_char8, vxc_char8, int4, vxc_char8, vxc_char8)\n\ +ELTSISE_UNARY_3D(round, I8, F16, vxc_char8, vxc_char8, half4, vxc_half8, vxc_short8)\n\ +ELTSISE_UNARY_3D(round, U8, U8, vxc_uchar8, vxc_uchar8, int4, vxc_uchar8, vxc_uchar8)\n\ +ELTSISE_UNARY_3D(round, U8, 
F16, vxc_uchar8, vxc_uchar8, half4, vxc_half8, vxc_short8)\n\ +ELTSISE_UNARY_3D(round, I16, I16, vxc_short8, vxc_short8, int4, vxc_short8, vxc_short8)\n\ +ELTSISE_UNARY_3D(round, I16, F16, vxc_short8, vxc_short8, half4, vxc_half8, vxc_short8)\n\ \n\ _viv_uniform VXC_512Bits uniConvBF16toF32_Part0_2x8;\n\ _viv_uniform VXC_512Bits uniConvBF16toF32_Part1_2x8;\n\ @@ -3726,7 +4542,184 @@ ELTSISE_UNARY_BF16(neg)\n\ //MISH\n\ ELTSISE_UNARY_BF16(mish)\n\ //HARD_SIGMOID\n\ -ELTSISE_UNARY_BF16(hard_sigmoid)"; /* end of eltwise_unary_3d_vx*/ +ELTSISE_UNARY_BF16(hard_sigmoid)\n\ +//ROUND\n\ +ELTSISE_UNARY_BF16(round)"; /* end of eltwise_unary_3d_vx*/ + +static const char erf_vx[] = "#include \"cl_viv_vx_ext.h\"\n\ +\n\ +#define MUL2_RSQRTPI (1.1283791670955126f)\n\ +float eltwise_unary_erf(float x)\n\ +{\n\ + float res = 0;\n\ + float tmp = x;\n\ + float factorial = 1;\n\ + float x_pow = x;\n\ + float one = 1.0f;\n\ + float n = 1;\n\ +\n\ + while (fabs(tmp) > 1e-5)\n\ + {\n\ + res += tmp;\n\ +\n\ + factorial *= n;\n\ + one *= -1;\n\ + x_pow *= x * x;\n\ + tmp = one / factorial * x_pow / ( 2 * n + 1);\n\ +\n\ + n += 1.0f;\n\ + }\n\ + return res * MUL2_RSQRTPI;\n\ +}\n\ +\n\ +_viv_uniform float inputScale;\n\ +_viv_uniform float inputTail;\n\ +_viv_uniform float outputScale;\n\ +_viv_uniform float outputZP;\n\ +_viv_uniform VXC_512Bits uniExtract8Data_2x8;\n\ +_viv_uniform VXC_512Bits uniDatatoFp32Part0_4x4;\n\ +\n\ +#define ELTSISE_UNARY_2D(func_name, src_type_name, dst_type_name, src_type, \\\n\ + src_copy_type, convert_type, dst_type, dst_copy_type) \\\n\ + __kernel void func_name##_##src_type_name##to##dst_type_name##_2D( \\\n\ + __read_only image2d_array_t input, \\\n\ + __write_only image2d_array_t output \\\n\ + ) \\\n\ +{ \\\n\ + int2 coord = (int2)(get_global_id(0), get_global_id(1)); \\\n\ + src_type src0; \\\n\ + src_copy_type src1; \\\n\ + VXC_ReadImage(src0, input, coord, 0, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); \\\n\ + _viv_asm(COPY, src1, src0, 16); \\\n\ + \\\n\ + float4 vecA; \\\n\ + VXC_DP4x4(vecA, src1, src1, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniDatatoFp32Part0_4x4); \\\n\ + vecA = vecA * inputScale + inputTail; \\\n\ + vecA.x = eltwise_unary_##func_name(vecA.x); \\\n\ + vecA.y = eltwise_unary_##func_name(vecA.y); \\\n\ + vecA.z = eltwise_unary_##func_name(vecA.z); \\\n\ + vecA.w = eltwise_unary_##func_name(vecA.w); \\\n\ + vecA = vecA * outputScale + outputZP; \\\n\ + \\\n\ + convert_type dst0; \\\n\ + _viv_asm(CONV_RTE, dst0, vecA); \\\n\ + dst_type dst2; \\\n\ + VXC_DP2x8(dst2, dst0, dst0, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 1), uniExtract8Data_2x8); \\\n\ + dst_copy_type dst; \\\n\ + _viv_asm(COPY, dst, dst2, 16); \\\n\ + VXC_WriteImage(output, coord, dst, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0)); \\\n\ +}\n\ +//ERF\n\ +ELTSISE_UNARY_2D(erf, F16, F16, vxc_short8, vxc_half8, half4, vxc_half8, vxc_short8)\n\ +ELTSISE_UNARY_2D(erf, F16, I8, vxc_short8, vxc_half8, int4, vxc_char8, vxc_char8)\n\ +ELTSISE_UNARY_2D(erf, F16, U8, vxc_short8, vxc_half8, int4, vxc_uchar8, vxc_uchar8)\n\ +ELTSISE_UNARY_2D(erf, F16, I16, vxc_short8, vxc_half8, int4, vxc_short8, vxc_short8)\n\ +ELTSISE_UNARY_2D(erf, I8, I8, vxc_char8, vxc_char8, int4, vxc_char8, vxc_char8)\n\ +ELTSISE_UNARY_2D(erf, I8, F16, vxc_char8, vxc_char8, half4, vxc_half8, vxc_short8)\n\ +ELTSISE_UNARY_2D(erf, U8, U8, vxc_uchar8, vxc_uchar8, int4, vxc_uchar8, vxc_uchar8)\n\ +ELTSISE_UNARY_2D(erf, U8, F16, vxc_uchar8, vxc_uchar8, half4, vxc_half8, vxc_short8)\n\ +ELTSISE_UNARY_2D(erf, I16, I16, vxc_short8, 
vxc_short8, int4, vxc_short8, vxc_short8)\n\ +ELTSISE_UNARY_2D(erf, I16, F16, vxc_short8, vxc_short8, half4, vxc_half8, vxc_short8)\n\ +\n\ +\n\ +_viv_uniform VXC_512Bits uniConvBF16toF32_Part0_2x8;\n\ +_viv_uniform VXC_512Bits uniExtractOddData_2x8;\n\ +\n\ +#define ELTSISE_UNARY_BF16_2D(func_name) \\\n\ + __kernel void func_name##_BF16toBF16_2D( \\\n\ + __read_only image2d_array_t input, \\\n\ + __write_only image2d_array_t output \\\n\ + ) \\\n\ +{ \\\n\ + int2 coord = (int2)(get_global_id(0), get_global_id(1)); \\\n\ + vxc_ushort8 src0, src1, dst; \\\n\ + VXC_ReadImage(src0, input, coord, 0, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); \\\n\ + \\\n\ + float4 vecA; \\\n\ + vxc_short8 zero = (vxc_short8)(0, 0, 0, 0, 0, 0, 0, 0); \\\n\ + VXC_DP2x8(src1, src0, zero, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0), uniConvBF16toF32_Part0_2x8); \\\n\ + _viv_asm(COPY, vecA, src1, 16); \\\n\ + vecA.x = eltwise_unary_##func_name(vecA.x); \\\n\ + vecA.y = eltwise_unary_##func_name(vecA.y); \\\n\ + vecA.z = eltwise_unary_##func_name(vecA.z); \\\n\ + vecA.w = eltwise_unary_##func_name(vecA.w); \\\n\ + \\\n\ + _viv_asm(COPY, src0, vecA, 16); \\\n\ + \\\n\ + VXC_DP2x8(dst, src0, src0, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0), uniExtractOddData_2x8); \\\n\ + VXC_WriteImage(output, coord, dst, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0)); \\\n\ +}\n\ +//EXP\n\ +ELTSISE_UNARY_BF16_2D(erf)\n\ +\n\ +#define ELTSISE_UNARY_3D(func_name, src_type_name, dst_type_name, src_type, \\\n\ + src_copy_type, convert_type, dst_type, dst_copy_type) \\\n\ +__kernel void func_name##_##src_type_name##to##dst_type_name( \\\n\ +__read_only image2d_array_t input, \\\n\ +__write_only image2d_array_t output \\\n\ +) \\\n\ +{ \\\n\ + int4 coord = (int4)(get_global_id(0), get_global_id(1), get_global_id(2), 0); \\\n\ + src_type src0; \\\n\ + src_copy_type src1; \\\n\ + VXC_ReadImage2DArray(src0, input, coord, 0, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); \\\n\ + _viv_asm(COPY, src1, src0, 16); \\\n\ + \\\n\ + float4 vecA; \\\n\ + VXC_DP4x4(vecA, src1, src1, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniDatatoFp32Part0_4x4); \\\n\ + vecA = vecA * inputScale + inputTail; \\\n\ + vecA.x = eltwise_unary_##func_name(vecA.x); \\\n\ + vecA.y = eltwise_unary_##func_name(vecA.y); \\\n\ + vecA.z = eltwise_unary_##func_name(vecA.z); \\\n\ + vecA.w = eltwise_unary_##func_name(vecA.w); \\\n\ + vecA = vecA * outputScale + outputZP; \\\n\ + \\\n\ + convert_type dst0; \\\n\ + _viv_asm(CONV_RTE, dst0, vecA); \\\n\ + dst_type dst2; \\\n\ + VXC_DP2x8(dst2, dst0, dst0, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 1), uniExtract8Data_2x8); \\\n\ + dst_copy_type dst; \\\n\ + _viv_asm(COPY, dst, dst2, 16); \\\n\ + VXC_WriteImage2DArray(output, coord, dst, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0)); \\\n\ +}\n\ +//ERF\n\ +ELTSISE_UNARY_3D(erf, F16, F16, vxc_short8, vxc_half8, half4, vxc_half8, vxc_short8)\n\ +ELTSISE_UNARY_3D(erf, F16, I8, vxc_short8, vxc_half8, int4, vxc_char8, vxc_char8)\n\ +ELTSISE_UNARY_3D(erf, F16, U8, vxc_short8, vxc_half8, int4, vxc_uchar8, vxc_uchar8)\n\ +ELTSISE_UNARY_3D(erf, F16, I16, vxc_short8, vxc_half8, int4, vxc_short8, vxc_short8)\n\ +ELTSISE_UNARY_3D(erf, I8, I8, vxc_char8, vxc_char8, int4, vxc_char8, vxc_char8)\n\ +ELTSISE_UNARY_3D(erf, I8, F16, vxc_char8, vxc_char8, half4, vxc_half8, vxc_short8)\n\ +ELTSISE_UNARY_3D(erf, U8, U8, vxc_uchar8, vxc_uchar8, int4, vxc_uchar8, vxc_uchar8)\n\ +ELTSISE_UNARY_3D(erf, U8, F16, vxc_uchar8, vxc_uchar8, half4, vxc_half8, vxc_short8)\n\ +ELTSISE_UNARY_3D(erf, I16, I16, 
vxc_short8, vxc_short8, int4, vxc_short8, vxc_short8)\n\ +ELTSISE_UNARY_3D(erf, I16, F16, vxc_short8, vxc_short8, half4, vxc_half8, vxc_short8)\n\ +\n\ +#define ELTSISE_UNARY_BF16_3D(func_name) \\\n\ + __kernel void func_name##_BF16toBF16( \\\n\ + __read_only image2d_array_t input, \\\n\ + __write_only image2d_array_t output \\\n\ + ) \\\n\ +{ \\\n\ + int4 coord = (int4)(get_global_id(0), get_global_id(1), get_global_id(2), 0); \\\n\ + vxc_ushort8 src0, src1, dst; \\\n\ + VXC_ReadImage2DArray(src0, input, coord, 0, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); \\\n\ + \\\n\ + float4 vecA; \\\n\ + vxc_short8 zero = (vxc_short8)(0, 0, 0, 0, 0, 0, 0, 0); \\\n\ + VXC_DP2x8(src1, src0, zero, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0), uniConvBF16toF32_Part0_2x8); \\\n\ + _viv_asm(COPY, vecA, src1, 16); \\\n\ + vecA.x = eltwise_unary_##func_name(vecA.x); \\\n\ + vecA.y = eltwise_unary_##func_name(vecA.y); \\\n\ + vecA.z = eltwise_unary_##func_name(vecA.z); \\\n\ + vecA.w = eltwise_unary_##func_name(vecA.w); \\\n\ + \\\n\ + _viv_asm(COPY, src0, vecA, 16); \\\n\ + \\\n\ + VXC_DP2x8(dst, src0, src0, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0), uniExtractOddData_2x8); \\\n\ + VXC_WriteImage2DArray(output, coord, dst, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0)); \\\n\ +}\n\ +//ERF\n\ +ELTSISE_UNARY_BF16_3D(erf)"; /* end of erf_vx*/ static const char floordiv_vx[] = "#include \"cl_viv_vx_ext.h\"\n\ \n\ @@ -4113,6 +5106,164 @@ __kernel void gather_F16toF16_axis0(\n\ }\n\ "; /* end of gather_vx*/ +static const char gather_array_vx[] = "#include \"cl_viv_vx_ext.h\"\n\ +\n\ +_viv_uniform int indices_num;\n\ +_viv_uniform VXC_512Bits uniExtraCopyDpKeepinEvis_2x8;\n\ +\n\ +__kernel void gather_I8toI8_array(\n\ + __read_only image2d_t input0,\n\ + __read_only image2d_t input1,\n\ + __write_only image2d_t output,\n\ + int block_size,\n\ + int block_num,\n\ + int axis_num\n\ + )\n\ +{\n\ + int gidx = get_global_id(0); // block_size\n\ + int gidy = get_global_id(1); // indices_num\n\ + int gidz = get_global_id(2); // block_num\n\ +\n\ + int4 coord_in = (int4)(gidy, 0, gidx, 0);\n\ + int4 indice = read_imagei(input1, coord_in.xy);\n\ + coord_in.w = gidz * axis_num + indice.x;\n\ +\n\ + Image img1 = create_image_from_image2d(input0, 1);\n\ + Image img2 = create_image_from_image2d(output, 1);\n\ + uchar* input_ptr = get_image_ptr_from_coord(img1, coord_in.zw);\n\ + __global vxc_char16* data_ptr = (__global vxc_char16*)input_ptr;\n\ + vxc_char16 src = data_ptr[0];\n\ + int2 coord = (int2)(gidx, gidz * indices_num + gidy);\n\ + uchar* output_ptr = get_image_ptr_from_coord(img2, coord);\n\ + __global vxc_char16* dst_ptr = (__global vxc_char16*)output_ptr;\n\ + dst_ptr[0] = src;\n\ +}\n\ +\n\ +__kernel void gather_U8toU8_array(\n\ + __read_only image2d_t input0,\n\ + __read_only image2d_t input1,\n\ + __write_only image2d_t output,\n\ + int block_size,\n\ + int block_num,\n\ + int axis_num\n\ + )\n\ +{\n\ + int gidx = get_global_id(0); // block_size\n\ + int gidy = get_global_id(1); // indices_num\n\ + int gidz = get_global_id(2); // block_num\n\ +\n\ + int4 coord_in = (int4)(gidy, 0, gidx, 0);\n\ + int4 indice = read_imagei(input1, coord_in.xy);\n\ + coord_in.w = gidz * axis_num + indice.x;\n\ +\n\ + Image img1 = create_image_from_image2d(input0, 1);\n\ + Image img2 = create_image_from_image2d(output, 1);\n\ + uchar* input_ptr = get_image_ptr_from_coord(img1, coord_in.zw);\n\ + __global vxc_uchar16* data_ptr = (__global vxc_uchar16*)input_ptr;\n\ + vxc_uchar16 src = data_ptr[0];\n\ + int2 coord = (int2)(gidx, gidz 
* indices_num + gidy);\n\ + uchar* output_ptr = get_image_ptr_from_coord(img2, coord);\n\ + __global vxc_uchar16* dst_ptr = (__global vxc_uchar16*)output_ptr;\n\ + dst_ptr[0] = src;\n\ +}\n\ +\n\ +__kernel void gather_I16toI16_array(\n\ + __read_only image2d_t input0,\n\ + __read_only image2d_t input1,\n\ + __write_only image2d_t output,\n\ + int block_size,\n\ + int block_num,\n\ + int axis_num\n\ + )\n\ +{\n\ + int gidx = get_global_id(0); // block_size\n\ + int gidy = get_global_id(1); // indices_num\n\ + int gidz = get_global_id(2); // block_num\n\ +\n\ + int4 coord_in = (int4)(gidy, 0, gidx, 0);\n\ +\n\ +\n\ + int4 indice = read_imagei(input1, coord_in.xy);\n\ + coord_in.w = gidz * axis_num + indice.x;\n\ +\n\ + Image img1 = create_image_from_image2d(input0, 2);\n\ + Image img2 = create_image_from_image2d(output, 2);\n\ + uchar* input_ptr = get_image_ptr_from_coord(img1, coord_in.zw);\n\ + __global vxc_short8* data_ptr = (__global vxc_short8*)input_ptr;\n\ + vxc_short8 src = data_ptr[0];\n\ + int2 coord = (int2)(gidx, gidz * indices_num + gidy);\n\ + uchar* output_ptr = get_image_ptr_from_coord(img2, coord);\n\ + __global vxc_short8* dst_ptr = (__global vxc_short8*)output_ptr;\n\ + dst_ptr[0] = src;\n\ +}\n\ +\n\ +__kernel void gather_F16toF16_array(\n\ + __read_only image2d_t input0,\n\ + __read_only image2d_t input1,\n\ + __write_only image2d_t output,\n\ + int block_size,\n\ + int block_num,\n\ + int axis_num\n\ + )\n\ +{\n\ + int gidx = get_global_id(0); // block_size\n\ + int gidy = get_global_id(1); // indices_num\n\ + int gidz = get_global_id(2); // block_num\n\ +\n\ + int4 coord_in = (int4)(gidy, 0, gidx, 0);\n\ +\n\ + int4 indice = read_imagei(input1, coord_in.xy);\n\ + coord_in.w = gidz * axis_num + indice.x;\n\ +\n\ + Image img1 = create_image_from_image2d(input0, 2);\n\ + Image img2 = create_image_from_image2d(output, 2);\n\ + uchar* input_ptr = get_image_ptr_from_coord(img1, coord_in.zw);\n\ + __global vxc_short8* data_ptr = (__global vxc_short8*)input_ptr;\n\ + vxc_short8 src = data_ptr[0];\n\ + int2 coord = (int2)(gidx, gidz * indices_num + gidy);\n\ + uchar* output_ptr = get_image_ptr_from_coord(img2, coord);\n\ + __global vxc_short8* dst_ptr = (__global vxc_short8*)output_ptr;\n\ + dst_ptr[0] = src;\n\ +}\n\ +\n\ +#define GATHER_AXIS0_ARRAY(src0_type_name, read_type, data_type, write_type) \\\n\ +__kernel void gather_##src0_type_name##to##src0_type_name##_axis0_array( \\\n\ + __read_only image2d_t input0, \\\n\ + __read_only image2d_t input1, \\\n\ + __write_only image2d_t output, \\\n\ + int block_size, \\\n\ + int block_num, \\\n\ + int axis_num \\\n\ + ) \\\n\ +{ \\\n\ + Image img0 = create_image_from_image2d(input0, 1); \\\n\ + Image img1 = create_image_from_image2d(input1, 4); \\\n\ + Image img2 = create_image_from_image2d(output, 1); \\\n\ + int4 coord = (int4)(get_global_id(0), get_global_id(1), 0, 0); \\\n\ + uchar* index_ptr = get_image_ptr_from_coord(img1, coord.xz); \\\n\ + __global int* index = (__global int*)index_ptr; \\\n\ + int4 indices = vload4(0, index); \\\n\ + \\\n\ + read_type src, dst; \\\n\ + \\\n\ + uchar* input_ptr = get_image_ptr_from_coord(img0, coord.zy); \\\n\ + uchar* output_ptr = get_image_ptr_from_coord(img2, coord.xy); \\\n\ + __global data_type* data_ptr = (__global data_type*)input_ptr; \\\n\ + __global write_type* out_ptr = (__global write_type*)output_ptr; \\\n\ + src.s0 = data_ptr[indices.x]; \\\n\ + src.s1 = data_ptr[indices.y]; \\\n\ + src.s2 = data_ptr[indices.z]; \\\n\ + src.s3 = data_ptr[indices.w]; \\\n\ + \\\n\ + 
VXC_DP2x8(dst, src, src, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0), \\\n\ + uniExtraCopyDpKeepinEvis_2x8); \\\n\ + out_ptr[0] = dst.s0123; \\\n\ +}\n\ +GATHER_AXIS0_ARRAY(U8, vxc_uchar16, uchar, vxc_uchar4)\n\ +GATHER_AXIS0_ARRAY(I8, vxc_char16, char, vxc_char4)\n\ +GATHER_AXIS0_ARRAY(I16, vxc_short8, short, vxc_short4)\n\ +GATHER_AXIS0_ARRAY(F16, vxc_short8, short, vxc_short4)"; /* end of gather_array_vx*/ + static const char gather_mix_vx[] = "#include \"cl_viv_vx_ext.h\"\n\ \n\ _viv_uniform int indices_num;\n\ @@ -4334,7 +5485,10 @@ __kernel void gather_nd_I8toI8_1D(\n\ int gidy = get_global_id(1); // indices_num\n\ \n\ int4 coord = (int4)(0, gidy, gidx, 0);\n\ - int4 indice = read_imagei(input1, coord.xy);\n\ + Image img = create_image_from_image2d(input1, 4);\n\ + uchar* indice_ptr = get_image_ptr_from_coord(img, coord.xy);\n\ + int4 indice = ((int4 *)indice_ptr)[0];\n\ +\n\ coord.w = indice.x;\n\ \n\ vxc_char16 src;\n\ @@ -4355,7 +5509,10 @@ __kernel void gather_nd_U8toU8_1D(\n\ int gidy = get_global_id(1); // indices_num\n\ \n\ int4 coord = (int4)(0, gidy, gidx, 0);\n\ - int4 indice = read_imagei(input1, coord.xy);\n\ + Image img = create_image_from_image2d(input1, 4);\n\ + uchar* indice_ptr = get_image_ptr_from_coord(img, coord.xy);\n\ + int4 indice = ((int4 *)indice_ptr)[0];\n\ +\n\ coord.w = indice.x;\n\ \n\ vxc_uchar16 src;\n\ @@ -4375,7 +5532,10 @@ __kernel void gather_nd_I16toI16_1D(\n\ int gidy = get_global_id(1); // indices_num\n\ \n\ int4 coord = (int4)(0, gidy, gidx, 0);\n\ - int4 indice = read_imagei(input1, coord.xy);\n\ + Image img = create_image_from_image2d(input1, 4);\n\ + uchar* indice_ptr = get_image_ptr_from_coord(img, coord.xy);\n\ + int4 indice = ((int4 *)indice_ptr)[0];\n\ +\n\ coord.w = indice.x;\n\ \n\ vxc_short8 src;\n\ @@ -4395,7 +5555,10 @@ __kernel void gather_nd_F16toF16_1D(\n\ int gidy = get_global_id(1); // indices_num\n\ \n\ int4 coord = (int4)(0, gidy, gidx, 0);\n\ - int4 indice = read_imagei(input1, coord.xy);\n\ + Image img = create_image_from_image2d(input1, 4);\n\ + uchar* indice_ptr = get_image_ptr_from_coord(img, coord.xy);\n\ + int4 indice = ((int4 *)indice_ptr)[0];\n\ +\n\ coord.w = indice.x;\n\ \n\ vxc_short8 src;\n\ @@ -4418,7 +5581,10 @@ __kernel void gather_nd_I8toI8_2D(\n\ int gidy = get_global_id(1); // indices_num\n\ \n\ int4 coord = (int4)(0, gidy, gidx, 0);\n\ - int4 indice = read_imagei(input1, coord.xy);\n\ + Image img = create_image_from_image2d(input1, 4);\n\ + uchar* indice_ptr = get_image_ptr_from_coord(img, coord.xy);\n\ + int4 indice = ((int4 *)indice_ptr)[0];\n\ +\n\ indice.x = indice.x * block_size + gidx;\n\ \n\ vxc_char16 src;\n\ @@ -4439,7 +5605,10 @@ __kernel void gather_nd_U8toU8_2D(\n\ int gidy = get_global_id(1); // indices_num\n\ \n\ int4 coord = (int4)(0, gidy, gidx, 0);\n\ - int4 indice = read_imagei(input1, coord.xy);\n\ + Image img = create_image_from_image2d(input1, 4);\n\ + uchar* indice_ptr = get_image_ptr_from_coord(img, coord.xy);\n\ + int4 indice = ((int4 *)indice_ptr)[0];\n\ +\n\ indice.x = indice.x * block_size + gidx;\n\ \n\ vxc_uchar16 src;\n\ @@ -4459,7 +5628,10 @@ __kernel void gather_nd_I16toI16_2D(\n\ int gidy = get_global_id(1); // indices_num\n\ \n\ int4 coord = (int4)(0, gidy, gidx, 0);\n\ - int4 indice = read_imagei(input1, coord.xy);\n\ + Image img = create_image_from_image2d(input1, 4);\n\ + uchar* indice_ptr = get_image_ptr_from_coord(img, coord.xy);\n\ + int4 indice = ((int4 *)indice_ptr)[0];\n\ +\n\ indice.x = indice.x * block_size + gidx;\n\ \n\ vxc_short8 src;\n\ @@ -4479,7 +5651,10 @@ 
__kernel void gather_nd_F16toF16_2D(\n\ int gidy = get_global_id(1); // indices_num\n\ \n\ int4 coord = (int4)(0, gidy, gidx, 0);\n\ - int4 indice = read_imagei(input1, coord.xy);\n\ + Image img = create_image_from_image2d(input1, 4);\n\ + uchar* indice_ptr = get_image_ptr_from_coord(img, coord.xy);\n\ + int4 indice = ((int4 *)indice_ptr)[0];\n\ +\n\ indice.x = indice.x * block_size + gidx;\n\ \n\ vxc_short8 src;\n\ @@ -4516,7 +5691,10 @@ __kernel void gather_nd_##src0_type_name##toF16_2D( \\\n\ int gidy = get_global_id(1); \\\n\ \\\n\ int4 coord = (int4)(0, gidy, gidx, 0); \\\n\ - int4 indice = read_imagei(input1, coord.xy); \\\n\ + Image img = create_image_from_image2d(input1, 4); \\\n\ + uchar* indice_ptr = get_image_ptr_from_coord(img, coord.xy); \\\n\ + int4 indice = ((int4 *)indice_ptr)[0]; \\\n\ + \\\n\ indice.x = indice.x * block_size + gidx; \\\n\ \\\n\ read_type src; \\\n\ @@ -4547,7 +5725,10 @@ __kernel void gather_nd_F16to##src1_type_name##_2D( \\\n\ int gidy = get_global_id(1); \\\n\ \\\n\ int4 coord = (int4)(0, gidy, gidx, 0); \\\n\ - int4 indice = read_imagei(input1, coord.xy); \\\n\ + Image img = create_image_from_image2d(input1, 4); \\\n\ + uchar* indice_ptr = get_image_ptr_from_coord(img, coord.xy); \\\n\ + int4 indice = ((int4 *)indice_ptr)[0]; \\\n\ + \\\n\ indice.x = indice.x * block_size + gidx; \\\n\ \\\n\ vxc_short8 src; \\\n\ @@ -4580,7 +5761,10 @@ __kernel void gather_nd_I8toI8_3D(\n\ int gidy = get_global_id(1); // indices_num\n\ \n\ int4 coord = (int4)(0, gidy, gidx, 0);\n\ - int4 indice = read_imagei(input1, coord.xy);\n\ + Image img = create_image_from_image2d(input1, 4);\n\ + uchar* indice_ptr = get_image_ptr_from_coord(img, coord.xy);\n\ + int4 indice = ((int4 *)indice_ptr)[0];\n\ +\n\ indice.x = indice.x * block_size + gidx;\n\ indice.w = 0;\n\ \n\ @@ -4602,7 +5786,11 @@ __kernel void gather_nd_U8toU8_3D(\n\ int gidy = get_global_id(1); // indices_num\n\ \n\ int4 coord = (int4)(0, gidy, gidx, 0);\n\ - int4 indice = read_imagei(input1, coord.xy);\n\ +\n\ + Image img = create_image_from_image2d(input1, 4);\n\ + uchar* indice_ptr = get_image_ptr_from_coord(img, coord.xy);\n\ + int4 indice = ((int4 *)indice_ptr)[0];\n\ +\n\ indice.x = indice.x * block_size + gidx;\n\ indice.w = 0;\n\ \n\ @@ -4623,7 +5811,10 @@ __kernel void gather_nd_I16toI16_3D(\n\ int gidy = get_global_id(1); // indices_num\n\ \n\ int4 coord = (int4)(0, gidy, gidx, 0);\n\ - int4 indice = read_imagei(input1, coord.xy);\n\ + Image img = create_image_from_image2d(input1, 4);\n\ + uchar* indice_ptr = get_image_ptr_from_coord(img, coord.xy);\n\ + int4 indice = ((int4 *)indice_ptr)[0];\n\ +\n\ indice.x = indice.x * block_size + gidx;\n\ indice.w = 0;\n\ \n\ @@ -4644,7 +5835,10 @@ __kernel void gather_nd_F16toF16_3D(\n\ int gidy = get_global_id(1); // indices_num\n\ \n\ int4 coord = (int4)(0, gidy, gidx, 0);\n\ - int4 indice = read_imagei(input1, coord.xy);\n\ + Image img = create_image_from_image2d(input1, 4);\n\ + uchar* indice_ptr = get_image_ptr_from_coord(img, coord.xy);\n\ + int4 indice = ((int4 *)indice_ptr)[0];\n\ +\n\ indice.x = indice.x * block_size + gidx;\n\ indice.w = 0;\n\ \n\ @@ -4652,6 +5846,7 @@ __kernel void gather_nd_F16toF16_3D(\n\ VXC_ReadImage2DArray(src, input0, indice, 0, VXC_MODIFIER(0, 0, 0, VXC_RM_TowardZero, 0));\n\ VXC_WriteImage(output, coord.zy, src, VXC_MODIFIER(0, 0, 0, VXC_RM_TowardZero, 0));\n\ }\n\ +\n\ "; /* end of gather_nd_3d_vx*/ static const char gather_nd_3d_mix_vx[] = "#include \"cl_viv_vx_ext.h\"\n\ @@ -4679,7 +5874,10 @@ __kernel void 
gather_nd_##src0_type_name##toF16_3D( \\\n\ int gidy = get_global_id(1); \\\n\ \\\n\ int4 coord = (int4)(0, gidy, gidx, 0); \\\n\ - int4 indice = read_imagei(input1, coord.xy); \\\n\ + Image img = create_image_from_image2d(input1, 4); \\\n\ + uchar* indice_ptr = get_image_ptr_from_coord(img, coord.xy); \\\n\ + int4 indice = ((int4 *)indice_ptr)[0]; \\\n\ + \\\n\ indice.x = indice.x * block_size + gidx; \\\n\ indice.w = 0; \\\n\ \\\n\ @@ -4711,7 +5909,10 @@ __kernel void gather_nd_F16to##src1_type_name##_3D( \\\n\ int gidy = get_global_id(1); \\\n\ \\\n\ int4 coord = (int4)(0, gidy, gidx, 0); \\\n\ - int4 indice = read_imagei(input1, coord.xy); \\\n\ + Image img = create_image_from_image2d(input1, 4); \\\n\ + uchar* indice_ptr = get_image_ptr_from_coord(img, coord.xy); \\\n\ + int4 indice = ((int4 *)indice_ptr)[0]; \\\n\ + \\\n\ indice.x = indice.x * block_size + gidx; \\\n\ indice.w = 0; \\\n\ \\\n\ @@ -4760,7 +5961,10 @@ __kernel void gather_nd_##src0_type_name##toF16_1D( \\\n\ int gidy = get_global_id(1); \\\n\ \\\n\ int4 coord = (int4)(0, gidy, gidx, 0); \\\n\ - int4 indice = read_imagei(input1, coord.xy); \\\n\ + Image img = create_image_from_image2d(input1, 4); \\\n\ + uchar* indice_ptr = get_image_ptr_from_coord(img, coord.xy); \\\n\ + int4 indice = ((int4 *)indice_ptr)[0]; \\\n\ + \\\n\ coord.w = indice.x; \\\n\ \\\n\ read_type src; \\\n\ @@ -4791,7 +5995,10 @@ __kernel void gather_nd_F16to##src1_type_name##_1D( \\\n\ int gidy = get_global_id(1); \\\n\ \\\n\ int4 coord = (int4)(0, gidy, gidx, 0); \\\n\ - int4 indice = read_imagei(input1, coord.xy); \\\n\ + Image img = create_image_from_image2d(input1, 4); \\\n\ + uchar* indice_ptr = get_image_ptr_from_coord(img, coord.xy); \\\n\ + int4 indice = ((int4 *)indice_ptr)[0]; \\\n\ + \\\n\ coord.w = indice.x; \\\n\ \\\n\ vxc_short8 src; \\\n\ @@ -4811,6 +6018,1350 @@ GATHER_ND_F16_TO_QINT_1D(I16, vxc_short8)\n\ \n\ "; /* end of gather_nd_mix_vx*/ +static const char group_normalization_f16_vx[] = "#include \"cl_viv_vx_ext.h\"\n\ +\n\ +_viv_uniform int width;\n\ +_viv_uniform int height;\n\ +_viv_uniform VXC_512Bits UniFP16toFP32Lo4_dp4x4;\n\ +_viv_uniform VXC_512Bits uniConvertHalfToFp16_2x8;\n\ +_viv_uniform VXC_512Bits uniConvertInt32toUint8_2x8;\n\ +\n\ +_viv_uniform VXC_512Bits uniFp16SumSqr_dp8x2;\n\ +_viv_uniform VXC_512Bits uniConvertEndInt16Fp32_4x4;\n\ +\n\ +_viv_uniform float outputScale;\n\ +_viv_uniform int output_ZP;\n\ +\n\ +__kernel __attribute__((reqd_work_group_size(16, 1, 1))) void group_norm_sumsqr_F16(\n\ + image2d_array_t input, image2d_array_t output, float eps, int is2D)\n\ +{\n\ + int gidx = get_global_id(0) << 3;\n\ + int lidx = get_local_id(0);\n\ + int gidz = get_global_id(1);\n\ + int4 coord = (int4)(gidx, 0, gidz, 0);\n\ + vxc_short8 src0;\n\ + vxc_half8 in_h;\n\ + vxc_float4 sumsqr;\n\ + vxc_float4 tmpSumSqr = (vxc_float4)(0);\n\ +\n\ + __local float lcl_sum[16];\n\ + __local float lcl_sqr[16];\n\ +\n\ + int8 input_desc;\n\ + _viv_asm(COPY, input_desc, input, sizeof(input_desc));\n\ + int baseAddr_a = (int)get_global_id(1) * input_desc.s4 + input_desc.s0;\n\ + _viv_asm(MOV, coord.z, baseAddr_a);\n\ +\n\ + if(gidx < width)\n\ + {\n\ + for(coord.y = 0; coord.y < height;)\n\ + {\n\ + VXC_OP4(img_load_3d, src0, input, coord, VXC_5BITOFFSET_XY(0, 0), \\\n\ + VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0));\n\ + coord.y++;\n\ + _viv_asm(COPY, in_h, src0, 16);\n\ + VXC_DP8x2(sumsqr, in_h, in_h, VXC_MODIFIER(0, 1, 0, VXC_RM_TowardZero, 0),\\\n\ + uniFp16SumSqr_dp8x2);\n\ + tmpSumSqr += sumsqr;\n\ + }\n\ + }\n\ +\n\ + 
lcl_sum[lidx] = tmpSumSqr.x;\n\ + lcl_sqr[lidx] = tmpSumSqr.y;\n\ + barrier(CLK_LOCAL_MEM_FENCE);\n\ +\n\ + int4 coord_out = (int4)(get_group_id(0) << 2, gidz, 0, 0);\n\ + if(lidx == 0)\n\ + {\n\ + float4 one = (float4)(1, 1, 1, 1);\n\ + __local float4* tmp_sum = (__local float4*)lcl_sum;\n\ + __local float4* tmp_sqr = (__local float4*)lcl_sqr;\n\ +\n\ + float sum = 0;\n\ + float sqr = 0;\n\ + for(int i = 0; i < 4; i++)\n\ + {\n\ + //sum += lcl_sum[i];\n\ + //sqr += lcl_sqr[i];\n\ + sum += dot(tmp_sum[i], one);\n\ + sqr += dot(tmp_sqr[i], one);\n\ + }\n\ +\n\ + float4 data = (float4)(sum, sqr, 0, 0);\n\ + write_imagef(output, coord_out, data);\n\ + }\n\ +}\n\ +\n\ +__kernel __attribute__((reqd_work_group_size(16, 1, 1))) void group_norm_sumsqr_F16_2D(\n\ + image2d_array_t input, image2d_array_t output, float eps, int is2D)\n\ +{\n\ + int gidx = get_global_id(0) << 3;\n\ + int lidx = get_local_id(0);\n\ +\n\ + int2 coord = (int2)(gidx, get_global_id(1));\n\ + vxc_short8 src0;\n\ + vxc_half8 in_h;\n\ + vxc_float4 sumsqr = (vxc_float4)(0);\n\ +\n\ + __local float lcl_sum[16];\n\ + __local float lcl_sqr[16];\n\ +\n\ + if(gidx < width)\n\ + {\n\ + VXC_ReadImage(src0, input, coord, VXC_5BITOFFSET_XY(0, 0),\n\ + VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0));\n\ + _viv_asm(COPY, in_h, src0, 16);\n\ + VXC_DP8x2(sumsqr, in_h, in_h, VXC_MODIFIER(0, 1, 0, VXC_RM_TowardZero, 0),\\\n\ + uniFp16SumSqr_dp8x2);\n\ + }\n\ +\n\ + lcl_sum[lidx] = sumsqr.x;\n\ + lcl_sqr[lidx] = sumsqr.y;\n\ + barrier(CLK_LOCAL_MEM_FENCE);\n\ +\n\ + int4 coord_out = (int4)(get_group_id(0) << 2, get_global_id(1), 0, 0);\n\ + if(lidx == 0)\n\ + {\n\ + float4 one = (float4)(1, 1, 1, 1);\n\ + __local float4* tmp_sum = (__local float4*)lcl_sum;\n\ + __local float4* tmp_sqr = (__local float4*)lcl_sqr;\n\ +\n\ + float sum = 0;\n\ + float sqr = 0;\n\ + for(int i = 0; i < 4; i++)\n\ + {\n\ + //sum += lcl_sum[i];\n\ + //sqr += lcl_sqr[i];\n\ + sum += dot(tmp_sum[i], one);\n\ + sqr += dot(tmp_sqr[i], one);\n\ + }\n\ +\n\ + float4 data = (float4)(sum, sqr, 0, 0);\n\ + write_imagef(output, coord_out, data);\n\ + }\n\ +}\n\ +\n\ +__kernel __attribute__((reqd_work_group_size(16, 1, 1))) void group_norm_F16toF16(\n\ + image2d_array_t input, image2d_t bias, image2d_t scale, image2d_t meanVari, image2d_array_t output,\n\ + float eps, int is2D, float rSpaceOrg, int pStride)\n\ +{\n\ + int gidy = get_global_id(1);\n\ + int gidz = get_global_id(2);\n\ + int4 coord = (int4)(get_global_id(0), gidy, gidz, 0);\n\ + int4 coord_para = (int4)((convert_int(get_global_id(0) * rSpaceOrg) + gidy * pStride), gidz, 0, 0);\n\ + vxc_short8 src0;\n\ + vxc_short8 src1;\n\ + vxc_half8 scale_h, in_h;\n\ + float scale_vari, bias_val;\n\ + vxc_float4 bias_f, scale_f;\n\ +\n\ + vxc_float4 mean_vari = read_imagef(meanVari, coord_para.zy);\n\ + VXC_ReadImage(src1, scale, coord_para.xy, VXC_5BITOFFSET_XY(0, 0),\\\n\ + VXC_MODIFIER(0, 0, 0, VXC_RM_TowardZero, 0));\n\ + bias_f = read_imagef(bias, coord_para.xy);\n\ + VXC_ReadImage2DArray(src0, input, coord, VXC_5BITOFFSET_XY(0, 0), VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0));\n\ +\n\ + _viv_asm(COPY, scale_h, src1, 16);\n\ + VXC_DP4x4(scale_f, scale_h, scale_h, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), UniFP16toFP32Lo4_dp4x4);\n\ +\n\ + scale_vari = scale_f.s0 * mean_vari.s1;\n\ + vxc_float4 tmpData0, tmpData1;\n\ + vxc_short8 outval;\n\ + half4 tmpVal0, tmpVal1;\n\ + bias_val = (bias_f.s0 - scale_vari * mean_vari.s0);\n\ + vxc_half8 dst;\n\ +\n\ + _viv_asm(COPY, in_h, src0, 16);\n\ + VXC_DP4x4(tmpData0, in_h, in_h, 
VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0),\\\n\ + UniFP16toFP32Lo4_dp4x4);\n\ + VXC_DP4x4(tmpData1, in_h, in_h, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0),\\\n\ + uniConvertEndInt16Fp32_4x4);\n\ +\n\ + vxc_float4 norm;\n\ + norm = scale_vari * tmpData0 + bias_val;\n\ + _viv_asm(CONV, tmpVal0, norm);\n\ + norm = scale_vari * tmpData1 + bias_val;\n\ + _viv_asm(CONV, tmpVal1, norm);\n\ + VXC_DP2x8(dst, tmpVal0, tmpVal1, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0),\\\n\ + uniConvertHalfToFp16_2x8);\n\ + _viv_asm(COPY, outval, dst, 16);\n\ + VXC_WriteImage2DArray(output, coord, outval, VXC_MODIFIER(0, 7, 0,VXC_RM_TowardZero, 0));\n\ +}\n\ +\n\ +__kernel __attribute__((reqd_work_group_size(16, 1, 1))) void group_norm_F16toF16_2D(\n\ + image2d_array_t input, image2d_t bias, image2d_t scale, image2d_t meanVari, image2d_array_t output,\n\ + float eps, int is2D, float rSpaceOrg, int pStride)\n\ +{\n\ + int gidz = get_global_id(1);\n\ + int2 coord = (int2)(get_global_id(0), gidz);\n\ + int4 coord_para = (int4)(convert_int(get_global_id(0) * rSpaceOrg), gidz, 0, 0);\n\ + vxc_short8 src0;\n\ + vxc_short8 src1;\n\ + vxc_half8 scale_h, in_h;\n\ + float scale_vari, bias_val;\n\ + vxc_float4 bias_f, scale_f;\n\ +\n\ + vxc_float4 mean_vari = read_imagef(meanVari, coord_para.zy);\n\ + VXC_ReadImage(src1, scale, coord_para.xy, VXC_5BITOFFSET_XY(0, 0),\\\n\ + VXC_MODIFIER(0, 0, 0, VXC_RM_TowardZero, 0));\n\ + bias_f = read_imagef(bias, coord_para.xy);\n\ + VXC_ReadImage(src0, input, coord.xy, VXC_5BITOFFSET_XY(0, 0),\\\n\ + VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0));\n\ +\n\ + _viv_asm(COPY, scale_h, src1, 16);\n\ + VXC_DP4x4(scale_f, scale_h, scale_h, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), UniFP16toFP32Lo4_dp4x4);\n\ +\n\ + scale_vari = scale_f.s0 * mean_vari.s1;\n\ + vxc_float4 tmpData0, tmpData1;\n\ + vxc_short8 outval;\n\ + half4 tmpVal0, tmpVal1;\n\ + bias_val = (bias_f.s0 - scale_vari * mean_vari.s0);\n\ + vxc_half8 dst;\n\ +\n\ + _viv_asm(COPY, in_h, src0, 16);\n\ + VXC_DP4x4(tmpData0, in_h, in_h, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0),\\\n\ + UniFP16toFP32Lo4_dp4x4);\n\ + VXC_DP4x4(tmpData1, in_h, in_h, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0),\\\n\ + uniConvertEndInt16Fp32_4x4);\n\ + vxc_float4 norm;\n\ + norm = scale_vari * tmpData0 + bias_val;\n\ + _viv_asm(CONV, tmpVal0, norm);\n\ + norm = scale_vari * tmpData1 + bias_val;\n\ + _viv_asm(CONV, tmpVal1, norm);\n\ + VXC_DP2x8(dst, tmpVal0, tmpVal1, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0),\\\n\ + uniConvertHalfToFp16_2x8);\n\ + _viv_asm(COPY, outval, dst, 16);\n\ + VXC_WriteImage(output, coord.xy, outval, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0));\n\ +}\n\ +\n\ +__kernel __attribute__((reqd_work_group_size(16, 1, 1))) void group_norm_F16toU8(\n\ + image2d_array_t input, image2d_t bias, image2d_t scale, image2d_t meanVari, image2d_array_t output,\n\ + float eps, int is2D, float rSpaceOrg, int pStride)\n\ +{\n\ + int gidy = get_global_id(1);\n\ + int gidz = get_global_id(2);\n\ + int4 coord = (int4)(get_global_id(0), gidy, gidz, 0);\n\ + int4 coord_para = (int4)((convert_int(get_global_id(0) * rSpaceOrg) + gidy * pStride), gidz, 0, 0);\n\ + vxc_short8 src0;\n\ + vxc_short8 src1;\n\ + vxc_half8 scale_h, in_h;\n\ + float scale_vari, bias_val;\n\ + vxc_float4 bias_f, scale_f;\n\ +\n\ + vxc_float4 mean_vari = read_imagef(meanVari, coord_para.zy);\n\ + VXC_ReadImage(src1, scale, coord_para.xy, VXC_5BITOFFSET_XY(0, 0),\\\n\ + VXC_MODIFIER(0, 0, 0, VXC_RM_TowardZero, 0));\n\ + bias_f = read_imagef(bias, coord_para.xy);\n\ + 
VXC_ReadImage2DArray(src0, input, coord, VXC_5BITOFFSET_XY(0, 0), VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0));\n\ +\n\ + _viv_asm(COPY, scale_h, src1, 16);\n\ + VXC_DP4x4(scale_f, scale_h, scale_h, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), UniFP16toFP32Lo4_dp4x4);\n\ +\n\ + scale_vari = scale_f.s0 * mean_vari.s1;\n\ + vxc_float4 tmpData0, tmpData1;\n\ + vxc_uchar16 outval;\n\ + vxc_int4 tmpVal0, tmpVal1;\n\ + float alpha = outputScale * scale_vari;\n\ + bias_val = (bias_f.s0 - scale_vari * mean_vari.s0) * outputScale + output_ZP;\n\ +\n\ + _viv_asm(COPY, in_h, src0, 16);\n\ + VXC_DP4x4(tmpData0, in_h, in_h, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0),\\\n\ + UniFP16toFP32Lo4_dp4x4);\n\ + VXC_DP4x4(tmpData1, in_h, in_h, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0),\\\n\ + uniConvertEndInt16Fp32_4x4);\n\ +\n\ + vxc_float4 norm;\n\ + norm = alpha * tmpData0 + bias_val;\n\ + tmpVal0 = convert_int4_rte(norm);\n\ + norm = alpha * tmpData1 + bias_val;\n\ + tmpVal1 = convert_int4_rte(norm);\n\ + VXC_DP2x8(outval, tmpVal0, tmpVal1, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 1),\\\n\ + uniConvertInt32toUint8_2x8);\n\ + VXC_WriteImage2DArray(output, coord, outval, VXC_MODIFIER(0, 7, 0,VXC_RM_TowardZero, 0));\n\ +}\n\ +\n\ +__kernel __attribute__((reqd_work_group_size(16, 1, 1))) void group_norm_F16toU8_2D(\n\ + image2d_array_t input, image2d_t bias, image2d_t scale, image2d_t meanVari, image2d_array_t output,\n\ + float eps, int is2D, float rSpaceOrg, int pStride)\n\ +{\n\ + int gidz = get_global_id(1);\n\ + int2 coord = (int2)(get_global_id(0), gidz);\n\ + int4 coord_para = (int4)(convert_int(get_global_id(0) * rSpaceOrg), gidz, 0, 0);\n\ + vxc_short8 src0;\n\ + vxc_short8 src1;\n\ + vxc_half8 scale_h, in_h;\n\ + float scale_vari, bias_val;\n\ + vxc_float4 bias_f, scale_f;\n\ +\n\ + vxc_float4 mean_vari = read_imagef(meanVari, coord_para.zy);\n\ + VXC_ReadImage(src1, scale, coord_para.xy, VXC_5BITOFFSET_XY(0, 0),\\\n\ + VXC_MODIFIER(0, 0, 0, VXC_RM_TowardZero, 0));\n\ + bias_f = read_imagef(bias, coord_para.xy);\n\ + VXC_ReadImage(src0, input, coord.xy, VXC_5BITOFFSET_XY(0, 0),\\\n\ + VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0));\n\ +\n\ + _viv_asm(COPY, scale_h, src1, 16);\n\ + VXC_DP4x4(scale_f, scale_h, scale_h, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), UniFP16toFP32Lo4_dp4x4);\n\ +\n\ + scale_vari = scale_f.s0 * mean_vari.s1;\n\ + vxc_float4 tmpData0, tmpData1;\n\ + vxc_uchar16 outval;\n\ + vxc_int4 tmpVal0, tmpVal1;\n\ + float alpha = outputScale * scale_vari;\n\ + bias_val = (bias_f.s0 - scale_vari * mean_vari.s0) * outputScale + output_ZP;\n\ +\n\ + _viv_asm(COPY, in_h, src0, 16);\n\ + VXC_DP4x4(tmpData0, in_h, in_h, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0),\\\n\ + UniFP16toFP32Lo4_dp4x4);\n\ + VXC_DP4x4(tmpData1, in_h, in_h, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0),\\\n\ + uniConvertEndInt16Fp32_4x4);\n\ + vxc_float4 norm;\n\ + norm = alpha * tmpData0 + bias_val;\n\ + tmpVal0 = convert_int4_rte(norm);\n\ + norm = alpha * tmpData1 + bias_val;\n\ + tmpVal1 = convert_int4_rte(norm);\n\ + VXC_DP2x8(outval, tmpVal0, tmpVal1, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 1),\\\n\ + uniConvertInt32toUint8_2x8);\n\ + VXC_WriteImage(output, coord.xy, outval, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0));\n\ +}"; /* end of group_normalization_f16_vx*/ + +static const char group_normalization_i16_vx[] = "#include \"cl_viv_vx_ext.h\"\n\ +\n\ +_viv_uniform int width;\n\ +_viv_uniform int height;\n\ +_viv_uniform VXC_512Bits UniFP16toFP32Lo4_dp4x4;\n\ +_viv_uniform VXC_512Bits 
uniConvertHalfToFp16_2x8;\n\ +\n\ +_viv_uniform float inFlScale_s2;\n\ +_viv_uniform float input_fl_scale;\n\ +_viv_uniform float inOut_fl_scale;\n\ +_viv_uniform float output_fl_scale;\n\ +\n\ +_viv_uniform VXC_512Bits uniInt16SumSqr_dp8x2;\n\ +_viv_uniform VXC_512Bits uniConvertInt16Fp32Fst_4x4;\n\ +_viv_uniform VXC_512Bits uniConvertInt16Fp32Secd_4x4;\n\ +_viv_uniform VXC_512Bits uniConvertInt32toInt16_2x8;\n\ +\n\ +__kernel __attribute__((reqd_work_group_size(16, 1, 1))) void group_norm_sumsqr_I16(\n\ + image2d_array_t input,\n\ + image2d_array_t output,\n\ + float eps,\n\ + int is2D)\n\ +{\n\ + int gidx = get_global_id(0) << 3;\n\ + int lidx = get_local_id(0);\n\ + int gidz = get_global_id(1);\n\ + int4 coord = (int4)(gidx, 0, gidz, 0);\n\ + vxc_short8 src0;\n\ + float sum = 0, sqr = 0;\n\ + vxc_float4 sumsqr = (vxc_float4)(0);\n\ + vxc_float4 tmpSumSqr = (vxc_float4)(0);\n\ +\n\ + __local float lcl_sum[16];\n\ + __local float lcl_sqr[16];\n\ +\n\ + int8 input_desc;\n\ + _viv_asm(COPY, input_desc, input, sizeof(input_desc));\n\ + int baseAddr_a = (int)get_global_id(1) * input_desc.s4 + input_desc.s0;\n\ + _viv_asm(MOV, coord.z, baseAddr_a);\n\ + if(gidx < width)\n\ + {\n\ + for(coord.y = 0; coord.y < height;)\n\ + {\n\ + VXC_OP4(img_load_3d, src0, input, coord, VXC_5BITOFFSET_XY(0, 0), \\\n\ + VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0));\n\ + coord.y++;\n\ + VXC_DP8x2(sumsqr, src0, src0, VXC_MODIFIER(0, 1, 0, VXC_RM_TowardZero, 0),\\\n\ + uniInt16SumSqr_dp8x2);\n\ + //tmpSumSqr += sumsqr;\n\ + tmpSumSqr.x += sumsqr.x;\n\ + sqr += (sumsqr.y * inFlScale_s2);\n\ + }\n\ + sum = tmpSumSqr.x * input_fl_scale;\n\ + //sqr = tmpSumSqr.y * inFlScale_s2;\n\ + }\n\ +\n\ + lcl_sum[lidx] = sum;\n\ + lcl_sqr[lidx] = sqr;\n\ + barrier(CLK_LOCAL_MEM_FENCE);\n\ +\n\ + int4 coord_out = (int4)(get_group_id(0) << 2, gidz, 0, 0);\n\ + if(lidx == 0)\n\ + {\n\ + float4 one = (float4)(1, 1, 1, 1);\n\ + __local float4* tmp_sum = (__local float4*)lcl_sum;\n\ + __local float4* tmp_sqr = (__local float4*)lcl_sqr;\n\ +\n\ + sum = 0; sqr = 0;\n\ + for(int i = 0; i < 4; i++)\n\ + {\n\ + //sum += lcl_sum[i];\n\ + //sqr += lcl_sqr[i];\n\ + sum += dot(tmp_sum[i], one);\n\ + sqr += dot(tmp_sqr[i], one);\n\ + }\n\ +\n\ + float4 data = (float4)(sum, sqr, 0, 0);\n\ + write_imagef(output, coord_out, data);\n\ + }\n\ +}\n\ +\n\ +__kernel __attribute__((reqd_work_group_size(16, 1, 1))) void group_norm_sumsqr_I16_2D(\n\ + image2d_array_t input,\n\ + image2d_array_t output,\n\ + float eps,\n\ + int is2D)\n\ +{\n\ + int gidx = get_global_id(0) << 3;\n\ + int lidx = get_local_id(0);\n\ + int gidz = get_global_id(1);\n\ +\n\ + int2 coord = (int2)(gidx, gidz);\n\ + vxc_short8 src0;\n\ + float sum = 0, sqr = 0;\n\ + vxc_float4 sumsqr = (vxc_float4)(0);\n\ +\n\ + __local float lcl_sum[16];\n\ + __local float lcl_sqr[16];\n\ +\n\ + if(gidx < width)\n\ + {\n\ + VXC_ReadImage(src0, input, coord.xy, VXC_5BITOFFSET_XY(0, 0),\n\ + VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0));\n\ + VXC_DP8x2(sumsqr, src0, src0, VXC_MODIFIER(0, 1, 0, VXC_RM_TowardZero, 0),\\\n\ + uniInt16SumSqr_dp8x2);\n\ + sqr = sumsqr.y * inFlScale_s2;\n\ + sum = sumsqr.x * input_fl_scale;\n\ + //sqr = tmpSumSqr.y * inFlScale_s2;\n\ + }\n\ +\n\ + lcl_sum[lidx] = sum;\n\ + lcl_sqr[lidx] = sqr;\n\ + barrier(CLK_LOCAL_MEM_FENCE);\n\ +\n\ + int4 coord_out = (int4)(get_group_id(0) << 2, gidz, 0, 0);\n\ + if(lidx == 0)\n\ + {\n\ + float4 one = (float4)(1, 1, 1, 1);\n\ + __local float4* tmp_sum = (__local float4*)lcl_sum;\n\ + __local float4* tmp_sqr = (__local 
float4*)lcl_sqr;\n\ +\n\ + sum = 0; sqr = 0;\n\ + for(int i = 0; i < 4; i++)\n\ + {\n\ + //sum += lcl_sum[i];\n\ + //sqr += lcl_sqr[i];\n\ + sum += dot(tmp_sum[i], one);\n\ + sqr += dot(tmp_sqr[i], one);\n\ + }\n\ +\n\ + float4 data = (float4)(sum, sqr, 0, 0);\n\ + write_imagef(output, coord_out, data);\n\ + }\n\ +}\n\ +\n\ +__kernel __attribute__((reqd_work_group_size(16, 1, 1))) void group_norm_I16toF16(\n\ + image2d_array_t input,\n\ + image2d_t bias,\n\ + image2d_t scale,\n\ + image2d_t meanVari,\n\ + image2d_array_t output,\n\ + float eps,\n\ + int is2D,\n\ + float rSpaceOrg, int pStride)\n\ +{\n\ + int gidy = get_global_id(1);\n\ + int gidz = get_global_id(2);\n\ + int4 coord = (int4)(get_global_id(0), gidy, gidz, 0);\n\ + int4 coord_para = (int4)((convert_int(get_global_id(0) * rSpaceOrg) + gidy * pStride), gidz, 0, 0);\n\ + vxc_short8 src0;\n\ + vxc_short8 src1;\n\ + vxc_half8 scale_h;\n\ + float scale_vari, bias_val;\n\ + vxc_float4 bias_f, scale_f;\n\ +\n\ + vxc_float4 mean_vari = read_imagef(meanVari, coord_para.zy);\n\ + VXC_ReadImage(src1, scale, coord_para.xy, VXC_5BITOFFSET_XY(0, 0),\\\n\ + VXC_MODIFIER(0, 0, 0, VXC_RM_TowardZero, 0));\n\ + bias_f = read_imagef(bias, coord_para.xy);\n\ + VXC_ReadImage2DArray(src0, input, coord, VXC_5BITOFFSET_XY(0, 0), VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0));\n\ +\n\ + _viv_asm(COPY, scale_h, src1, 16);\n\ + VXC_DP4x4(scale_f, scale_h, scale_h, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0),\\\n\ + UniFP16toFP32Lo4_dp4x4);\n\ +\n\ + scale_vari = scale_f.s0 * mean_vari.s1;\n\ + vxc_float4 tmpData0, tmpData1;\n\ + vxc_short8 outval;\n\ + half4 tmpVal0, tmpVal1;\n\ + float alpha = input_fl_scale * scale_vari;\n\ + bias_val = (bias_f.s0 - scale_vari * mean_vari.s0);\n\ + vxc_half8 dst;\n\ +\n\ + VXC_DP4x4(tmpData0, src0, src0, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0),\\\n\ + uniConvertInt16Fp32Fst_4x4);\n\ + VXC_DP4x4(tmpData1, src0, src0, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0),\\\n\ + uniConvertInt16Fp32Secd_4x4);\n\ +\n\ + vxc_float4 norm;\n\ + norm = alpha * tmpData0 + bias_val;\n\ + _viv_asm(CONV, tmpVal0, norm);\n\ + norm = alpha * tmpData1 + bias_val;\n\ + _viv_asm(CONV, tmpVal1, norm);\n\ + VXC_DP2x8(dst, tmpVal0, tmpVal1, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0),\\\n\ + uniConvertHalfToFp16_2x8);\n\ + _viv_asm(COPY, outval, dst, 16);\n\ + VXC_WriteImage2DArray(output, coord, outval, VXC_MODIFIER(0, 7, 0,VXC_RM_TowardZero, 0));\n\ +}\n\ +\n\ +__kernel __attribute__((reqd_work_group_size(16, 1, 1))) void group_norm_I16toF16_2D(\n\ + image2d_array_t input,\n\ + image2d_t bias,\n\ + image2d_t scale,\n\ + image2d_t meanVari,\n\ + image2d_array_t output,\n\ + float eps,\n\ + int is2D,\n\ + float rSpaceOrg, int pStride)\n\ +{\n\ + int gidz = get_global_id(1);\n\ + int2 coord = (int2)(get_global_id(0), gidz);\n\ + int4 coord_para = (int4)(convert_int(get_global_id(0) * rSpaceOrg), gidz, 0, 0);\n\ + vxc_short8 src0;\n\ + vxc_short8 src1;\n\ + vxc_half8 scale_h;\n\ + float scale_vari, bias_val;\n\ + vxc_float4 bias_f, scale_f;\n\ +\n\ + vxc_float4 mean_vari = read_imagef(meanVari, coord_para.zy);\n\ + VXC_ReadImage(src1, scale, coord_para.xy, VXC_5BITOFFSET_XY(0, 0),\\\n\ + VXC_MODIFIER(0, 0, 0, VXC_RM_TowardZero, 0));\n\ + bias_f = read_imagef(bias, coord_para.xy);\n\ + VXC_ReadImage(src0, input, coord.xy, VXC_5BITOFFSET_XY(0, 0),\\\n\ + VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0));\n\ +\n\ + _viv_asm(COPY, scale_h, src1, 16);\n\ + VXC_DP4x4(scale_f, scale_h, scale_h, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0),\\\n\ + 
UniFP16toFP32Lo4_dp4x4);\n\ +\n\ + scale_vari = scale_f.s0 * mean_vari.s1;\n\ + vxc_float4 tmpData0, tmpData1;\n\ + vxc_short8 outval;\n\ + half4 tmpVal0, tmpVal1;\n\ + float alpha = input_fl_scale * scale_vari;\n\ + bias_val = (bias_f.s0 - scale_vari * mean_vari.s0);\n\ + vxc_half8 dst;\n\ +\n\ + VXC_DP4x4(tmpData0, src0, src0, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0),\\\n\ + uniConvertInt16Fp32Fst_4x4);\n\ + VXC_DP4x4(tmpData1, src0, src0, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0),\\\n\ + uniConvertInt16Fp32Secd_4x4);\n\ + vxc_float4 norm;\n\ + norm = alpha * tmpData0 + bias_val;\n\ + _viv_asm(CONV, tmpVal0, norm);\n\ + norm = alpha * tmpData1 + bias_val;\n\ + _viv_asm(CONV, tmpVal1, norm);\n\ + VXC_DP2x8(dst, tmpVal0, tmpVal1, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0),\\\n\ + uniConvertHalfToFp16_2x8);\n\ + _viv_asm(COPY, outval, dst, 16);\n\ + VXC_WriteImage(output, coord.xy, outval, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0));\n\ +}\n\ +\n\ +__kernel __attribute__((reqd_work_group_size(16, 1, 1))) void group_norm_I16toI16(\n\ + image2d_array_t input,\n\ + image2d_t bias,\n\ + image2d_t scale,\n\ + image2d_t meanVari,\n\ + image2d_array_t output,\n\ + float eps,\n\ + int is2D,\n\ + float rSpaceOrg, int pStride)\n\ +{\n\ + int gidy = get_global_id(1);\n\ + int gidz = get_global_id(2);\n\ + int4 coord = (int4)(get_global_id(0), gidy, gidz, 0);\n\ + int4 coord_para = (int4)((convert_int(get_global_id(0) * rSpaceOrg) + gidy * pStride), gidz, 0, 0);\n\ + vxc_short8 src0, src2;\n\ + vxc_short8 src1;\n\ + vxc_half8 scale_h;\n\ + float scale_vari, bias_val;\n\ + vxc_float4 bias_f, scale_f;\n\ +\n\ + vxc_float4 mean_vari = read_imagef(meanVari, coord_para.zy);\n\ + VXC_ReadImage(src1, scale, coord_para.xy, VXC_5BITOFFSET_XY(0, 0),\\\n\ + VXC_MODIFIER(0, 0, 0, VXC_RM_TowardZero, 0));\n\ + bias_f = read_imagef(bias, coord_para.xy);\n\ + VXC_ReadImage2DArray(src0, input, coord, VXC_5BITOFFSET_XY(0, 0), VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0));\n\ +\n\ + _viv_asm(COPY, scale_h, src1, 16);\n\ + VXC_DP4x4(scale_f, scale_h, scale_h, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), UniFP16toFP32Lo4_dp4x4);\n\ +\n\ + scale_vari = scale_f.s0 * mean_vari.s1;\n\ + vxc_int4 tmpVal0, tmpVal1;\n\ + vxc_float4 tmpData0, tmpData1;\n\ + float alpha = inOut_fl_scale * scale_vari;\n\ + bias_val = (bias_f.s0 - scale_vari * mean_vari.s0) * output_fl_scale;\n\ +\n\ + VXC_DP4x4(tmpData0, src0, src0, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0),\\\n\ + uniConvertInt16Fp32Fst_4x4);\n\ + VXC_DP4x4(tmpData1, src0, src0, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0),\\\n\ + uniConvertInt16Fp32Secd_4x4);\n\ + vxc_float4 norm;\n\ + norm = tmpData0 * alpha + bias_val;\n\ + tmpVal0 = convert_int4_rte(norm);\n\ + norm = tmpData1 * alpha + bias_val;\n\ + tmpVal1 = convert_int4_rte(norm);\n\ + VXC_DP2x8(src2, tmpVal0, tmpVal1, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 1),\\\n\ + uniConvertInt32toInt16_2x8);\n\ + VXC_WriteImage2DArray(output, coord, src2, VXC_MODIFIER(0, 7, 0,VXC_RM_TowardZero, 0));\n\ +}\n\ +\n\ +__kernel __attribute__((reqd_work_group_size(16, 1, 1))) void group_norm_I16toI16_2D(\n\ + image2d_array_t input,\n\ + image2d_t bias,\n\ + image2d_t scale,\n\ + image2d_t meanVari,\n\ + image2d_array_t output,\n\ + float eps,\n\ + int is2D,\n\ + float rSpaceOrg, int pStride)\n\ +{\n\ + int gidz = get_global_id(1);\n\ + int2 coord = (int2)(get_global_id(0), gidz);\n\ + int4 coord_para = (int4)(convert_int(get_global_id(0) * rSpaceOrg), gidz, 0, 0);\n\ + vxc_short8 src0, src2;\n\ + vxc_short8 src1;\n\ + vxc_half8 
scale_h;\n\ + float scale_vari, bias_val;\n\ + vxc_float4 bias_f, scale_f;\n\ +\n\ + vxc_float4 mean_vari = read_imagef(meanVari, coord_para.zy);\n\ + VXC_ReadImage(src1, scale, coord_para.xy, VXC_5BITOFFSET_XY(0, 0),\\\n\ + VXC_MODIFIER(0, 0, 0, VXC_RM_TowardZero, 0));\n\ + bias_f = read_imagef(bias, coord_para.xy);\n\ + VXC_ReadImage(src0, input, coord.xy, VXC_5BITOFFSET_XY(0, 0),\\\n\ + VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0));\n\ +\n\ + _viv_asm(COPY, scale_h, src1, 16);\n\ + VXC_DP4x4(scale_f, scale_h, scale_h, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), UniFP16toFP32Lo4_dp4x4);\n\ +\n\ + scale_vari = scale_f.s0 * mean_vari.s1;\n\ + vxc_int4 tmpVal0, tmpVal1;\n\ + vxc_float4 tmpData0, tmpData1;\n\ + float alpha = inOut_fl_scale * scale_vari;\n\ + bias_val = (bias_f.s0 - scale_vari * mean_vari.s0) * output_fl_scale;\n\ +\n\ + VXC_DP4x4(tmpData0, src0, src0, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0),\\\n\ + uniConvertInt16Fp32Fst_4x4);\n\ + VXC_DP4x4(tmpData1, src0, src0, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0),\\\n\ + uniConvertInt16Fp32Secd_4x4);\n\ + vxc_float4 norm;\n\ + norm = tmpData0 * alpha + bias_val;\n\ + tmpVal0 = convert_int4_rte(norm);\n\ + norm = tmpData1 * alpha + bias_val;\n\ + tmpVal1 = convert_int4_rte(norm);\n\ + VXC_DP2x8(src2, tmpVal0, tmpVal1, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 1),\\\n\ + uniConvertInt32toInt16_2x8);\n\ + VXC_WriteImage(output, coord.xy, src2, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0));\n\ +}"; /* end of group_normalization_i16_vx*/ + +static const char group_normalization_i8_vx[] = "#include \"cl_viv_vx_ext.h\"\n\ +\n\ +_viv_uniform int width;\n\ +_viv_uniform int height;\n\ +_viv_uniform VXC_512Bits UniFP16toFP32Lo4_dp4x4;\n\ +_viv_uniform VXC_512Bits uniConvertHalfToFp16_2x8;\n\ +_viv_uniform VXC_512Bits uniConvertInt32toUint8_2x8;\n\ +\n\ +_viv_uniform VXC_512Bits uniSumInt8_16x1;\n\ +_viv_uniform VXC_512Bits uniSqrSumInt8_16x1;\n\ +_viv_uniform float inFlScale_s2;\n\ +_viv_uniform float input_fl_scale;\n\ +\n\ +_viv_uniform VXC_512Bits uniConvertDirInt8Fp32_4x4;\n\ +_viv_uniform VXC_512Bits uniConvertEndInt8Fp32_4x4;\n\ +_viv_uniform VXC_512Bits uniConvertTrdInt8Fp32_4x4;\n\ +_viv_uniform VXC_512Bits uniConvertFthInt8Fp32_4x4;\n\ +\n\ +_viv_uniform float inOut_fl_scale;\n\ +_viv_uniform float output_fl_scale;\n\ +\n\ +__kernel __attribute__((reqd_work_group_size(16, 1, 1))) void group_norm_sumsqr_I8(\n\ + image2d_array_t input, image2d_array_t output, float eps, int is2D)\n\ +{\n\ + int gidx = get_global_id(0) << 4;\n\ + int lidx = get_local_id(0);\n\ + int gidz = get_global_id(1);\n\ + int4 coord = (int4)(gidx, 0, gidz, 0);\n\ + vxc_char16 src0;\n\ + float sum = 0, sqr = 0;\n\ + int tmpSum = 0, tmpSqr = 0, tmpSum1, tmpSqr1;\n\ +\n\ + __local float lcl_sum[16];\n\ + __local float lcl_sqr[16];\n\ +\n\ + int8 input_desc;\n\ + _viv_asm(COPY, input_desc, input, sizeof(input_desc));\n\ + int baseAddr_a = (int)get_global_id(1) * input_desc.s4 + input_desc.s0;\n\ + _viv_asm(MOV, coord.z, baseAddr_a);\n\ +\n\ + if(gidx < width)\n\ + {\n\ + for(coord.y = 0; coord.y < height;)\n\ + {\n\ + VXC_OP4(img_load_3d, src0, input, coord, VXC_5BITOFFSET_XY(0, 0), \\\n\ + VXC_MODIFIER(0, 15, 0, VXC_RM_TowardZero, 0));\n\ + coord.y++;\n\ + VXC_DP16x1(tmpSum1, src0, src0, VXC_MODIFIER(0, 0, 0, VXC_RM_TowardZero, 0), uniSumInt8_16x1);\n\ + tmpSum += (tmpSum1);\n\ + VXC_DP16x1(tmpSqr1, src0, src0, VXC_MODIFIER(0, 0, 0, VXC_RM_TowardZero, 0), uniSqrSumInt8_16x1);\n\ + tmpSqr += (tmpSqr1);\n\ + }\n\ + sqr = tmpSqr * inFlScale_s2;\n\ + sum = tmpSum * 
input_fl_scale;\n\ + }\n\ + lcl_sum[lidx] = sum;\n\ + lcl_sqr[lidx] = sqr;\n\ + barrier(CLK_LOCAL_MEM_FENCE);\n\ +\n\ + int4 coord_out = (int4)(get_group_id(0) << 2, gidz, 0, 0);\n\ + if(lidx == 0)\n\ + {\n\ + float4 one = (float4)(1, 1, 1, 1);\n\ + __local float4* tmp_sum = (__local float4*)lcl_sum;\n\ + __local float4* tmp_sqr = (__local float4*)lcl_sqr;\n\ +\n\ + sum = 0; sqr = 0;\n\ + for(int i = 0; i < 4; i++)\n\ + {\n\ + sum += dot(tmp_sum[i], one);\n\ + sqr += dot(tmp_sqr[i], one);\n\ + }\n\ +\n\ + float4 data = (float4)(sum, sqr, 0, 0);\n\ + write_imagef(output, coord_out, data);\n\ + }\n\ +}\n\ +\n\ +__kernel __attribute__((reqd_work_group_size(16, 1, 1))) void group_norm_sumsqr_I8_2D(\n\ + image2d_array_t input, image2d_array_t output, float eps, int is2D)\n\ +{\n\ + int gidx = get_global_id(0) << 4;\n\ + int lidx = get_local_id(0);\n\ + int gidz = get_global_id(1);\n\ +\n\ + int2 coord = (int2)(gidx, gidz);\n\ + vxc_char16 src0;\n\ + float sum = 0, sqr = 0;\n\ + int tmpSum1, tmpSqr1;\n\ +\n\ + __local float lcl_sum[16];\n\ + __local float lcl_sqr[16];\n\ +\n\ + if(gidx < width)\n\ + {\n\ + VXC_ReadImage(src0, input, coord.xy, VXC_5BITOFFSET_XY(0, 0),\n\ + VXC_MODIFIER(0, 15, 0, VXC_RM_TowardZero, 0));\n\ + VXC_DP16x1(tmpSum1, src0, src0, VXC_MODIFIER(0, 0, 0, VXC_RM_TowardZero, 0), uniSumInt8_16x1);\n\ + VXC_DP16x1(tmpSqr1, src0, src0, VXC_MODIFIER(0, 0, 0, VXC_RM_TowardZero, 0), uniSqrSumInt8_16x1);\n\ + sqr = tmpSqr1 * inFlScale_s2;\n\ + sum = tmpSum1 * input_fl_scale;\n\ + }\n\ +\n\ + lcl_sum[lidx] = sum;\n\ + lcl_sqr[lidx] = sqr;\n\ + barrier(CLK_LOCAL_MEM_FENCE);\n\ +\n\ + int4 coord_out = (int4)(get_group_id(0) << 2, gidz, 0, 0);\n\ + if(lidx == 0)\n\ + {\n\ + float4 one = (float4)(1, 1, 1, 1);\n\ + __local float4* tmp_sum = (__local float4*)lcl_sum;\n\ + __local float4* tmp_sqr = (__local float4*)lcl_sqr;\n\ +\n\ + sum = 0; sqr = 0;\n\ + for(int i = 0; i < 4; i++)\n\ + {\n\ + sum += dot(tmp_sum[i], one);\n\ + sqr += dot(tmp_sqr[i], one);\n\ + }\n\ +\n\ + float4 data = (float4)(sum, sqr, 0, 0);\n\ + write_imagef(output, coord_out, data);\n\ + }\n\ +}\n\ +\n\ +__kernel __attribute__((reqd_work_group_size(16, 1, 1))) void group_norm_I8toF16(\n\ + image2d_array_t input, image2d_t bias, image2d_t scale, image2d_t meanVari,\n\ + image2d_array_t output, float eps, int is2D, float rSpaceOrg, int pStride)\n\ +{\n\ + int gidy = get_global_id(1);\n\ + int gidz = get_global_id(2);\n\ + int4 coord = (int4)(get_global_id(0), gidy, gidz, 0);\n\ + int4 coord_para = (int4)((convert_int(get_global_id(0) * rSpaceOrg) + gidy * pStride), gidz, 0, 0);\n\ + vxc_char16 src0;\n\ + vxc_short8 src1, outval;\n\ + vxc_half8 scale_h, dst;\n\ + float scale_vari, bias_val;\n\ + vxc_float4 bias_f, scale_f;\n\ +\n\ + vxc_float4 mean_vari = read_imagef(meanVari, coord_para.zy);\n\ + VXC_ReadImage(src1, scale, coord_para.xy, VXC_5BITOFFSET_XY(0, 0),\\\n\ + VXC_MODIFIER(0, 0, 0, VXC_RM_TowardZero, 0));\n\ + bias_f = read_imagef(bias, coord_para.xy);\n\ + VXC_ReadImage2DArray(src0, input, coord, VXC_5BITOFFSET_XY(0, 0), VXC_MODIFIER(0, 15, 0, VXC_RM_TowardZero, 0));\n\ +\n\ + _viv_asm(COPY, scale_h, src1, 16);\n\ + VXC_DP4x4(scale_f, scale_h, scale_h, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), UniFP16toFP32Lo4_dp4x4);\n\ +\n\ + scale_vari = scale_f.s0 * mean_vari.s1;\n\ + vxc_float4 tmpData0, tmpData1, tmpData2, tmpData3, norm;\n\ + half4 tmpVal0, tmpVal1;\n\ + float alpha = input_fl_scale * scale_vari;\n\ + bias_val = (bias_f.s0 - scale_vari * mean_vari.s0);\n\ +\n\ + VXC_DP4x4(tmpData0, src0, src0, 
VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniConvertDirInt8Fp32_4x4);\n\ + VXC_DP4x4(tmpData1, src0, src0, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniConvertEndInt8Fp32_4x4);\n\ + VXC_DP4x4(tmpData2, src0, src0, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniConvertTrdInt8Fp32_4x4);\n\ + VXC_DP4x4(tmpData3, src0, src0, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniConvertFthInt8Fp32_4x4);\n\ +\n\ + norm = alpha * tmpData0 + bias_val;\n\ + _viv_asm(CONV, tmpVal0, norm);\n\ + norm = alpha * tmpData1 + bias_val;\n\ + _viv_asm(CONV, tmpVal1, norm);\n\ + VXC_DP2x8(dst, tmpVal0, tmpVal1, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0), uniConvertHalfToFp16_2x8);\n\ + _viv_asm(COPY, outval, dst, 16);\n\ + VXC_WriteImage2DArray(output, coord, outval, VXC_MODIFIER(0, 7, 0,VXC_RM_TowardZero, 0));\n\ + coord.x += 8;\n\ + norm = alpha * tmpData2 + bias_val;\n\ + _viv_asm(CONV, tmpVal0, norm);\n\ + norm = alpha * tmpData3 + bias_val;\n\ + _viv_asm(CONV, tmpVal1, norm);\n\ + VXC_DP2x8(dst, tmpVal0, tmpVal1, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0), uniConvertHalfToFp16_2x8);\n\ + _viv_asm(COPY, outval, dst, 16);\n\ + VXC_WriteImage2DArray(output, coord, outval, VXC_MODIFIER(0, 7, 0,VXC_RM_TowardZero, 0));\n\ +}\n\ +\n\ +__kernel __attribute__((reqd_work_group_size(16, 1, 1))) void group_norm_I8toF16_2D(\n\ + image2d_array_t input, image2d_t bias, image2d_t scale, image2d_t meanVari,\n\ + image2d_array_t output, float eps, int is2D, float rSpaceOrg, int pStride)\n\ +{\n\ + int gidz = get_global_id(1);\n\ + int2 coord = (int2)(get_global_id(0), gidz);\n\ + int4 coord_para = (int4)(convert_int(get_global_id(0) * rSpaceOrg), gidz, 0, 0);\n\ + vxc_char16 src0;\n\ + vxc_short8 src1, outval;\n\ + vxc_half8 scale_h, dst;\n\ + float scale_vari, bias_val;\n\ + vxc_float4 bias_f, scale_f;\n\ +\n\ + vxc_float4 mean_vari = read_imagef(meanVari, coord_para.zy);\n\ + VXC_ReadImage(src1, scale, coord_para.xy, VXC_5BITOFFSET_XY(0, 0),\\\n\ + VXC_MODIFIER(0, 0, 0, VXC_RM_TowardZero, 0));\n\ + bias_f = read_imagef(bias, coord_para.xy);\n\ + VXC_ReadImage(src0, input, coord.xy, VXC_5BITOFFSET_XY(0, 0),\\\n\ + VXC_MODIFIER(0, 15, 0, VXC_RM_TowardZero, 0));\n\ +\n\ + _viv_asm(COPY, scale_h, src1, 16);\n\ + VXC_DP4x4(scale_f, scale_h, scale_h, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), UniFP16toFP32Lo4_dp4x4);\n\ +\n\ + scale_vari = scale_f.s0 * mean_vari.s1;\n\ + vxc_float4 tmpData0, tmpData1, tmpData2, tmpData3, norm;\n\ + half4 tmpVal0, tmpVal1;\n\ + float alpha = input_fl_scale * scale_vari;\n\ + bias_val = (bias_f.s0 - scale_vari * mean_vari.s0);\n\ +\n\ + VXC_DP4x4(tmpData0, src0, src0, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniConvertDirInt8Fp32_4x4);\n\ + VXC_DP4x4(tmpData1, src0, src0, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniConvertEndInt8Fp32_4x4);\n\ + VXC_DP4x4(tmpData2, src0, src0, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniConvertTrdInt8Fp32_4x4);\n\ + VXC_DP4x4(tmpData3, src0, src0, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniConvertFthInt8Fp32_4x4);\n\ + norm = alpha * tmpData0 + bias_val;\n\ + _viv_asm(CONV, tmpVal0, norm);\n\ + norm = alpha * tmpData1 + bias_val;\n\ + _viv_asm(CONV, tmpVal1, norm);\n\ + VXC_DP2x8(dst, tmpVal0, tmpVal1, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0), uniConvertHalfToFp16_2x8);\n\ + _viv_asm(COPY, outval, dst, 16);\n\ + VXC_WriteImage(output, coord.xy, outval, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0));\n\ + coord.x += 8;\n\ + norm = alpha * tmpData2 + bias_val;\n\ + _viv_asm(CONV, tmpVal0, norm);\n\ + norm = alpha * tmpData3 + bias_val;\n\ + 
_viv_asm(CONV, tmpVal1, norm);\n\ + VXC_DP2x8(dst, tmpVal0, tmpVal1, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0), uniConvertHalfToFp16_2x8);\n\ + _viv_asm(COPY, outval, dst, 16);\n\ + VXC_WriteImage(output, coord.xy, outval, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0));\n\ +}\n\ +\n\ +__kernel __attribute__((reqd_work_group_size(16, 1, 1))) void group_norm_I8toI8(\n\ + image2d_array_t input, image2d_t bias, image2d_t scale, image2d_t meanVari,\n\ + image2d_array_t output, float eps, int is2D, float rSpaceOrg, int pStride)\n\ +{\n\ + int gidy = get_global_id(1);\n\ + int gidz = get_global_id(2);\n\ + int4 coord = (int4)(get_global_id(0), gidy, gidz, 0);\n\ + int4 coord_para = (int4)((convert_int(get_global_id(0) * rSpaceOrg) + gidy * pStride), gidz, 0, 0);\n\ + vxc_char16 src0, src2;\n\ + vxc_short8 src1;\n\ + vxc_half8 scale_h;\n\ + float scale_vari, bias_val;\n\ + vxc_float4 bias_f, scale_f;\n\ +\n\ + vxc_float4 mean_vari = read_imagef(meanVari, coord_para.zy);\n\ + VXC_ReadImage(src1, scale, coord_para.xy, VXC_5BITOFFSET_XY(0, 0),\\\n\ + VXC_MODIFIER(0, 0, 0, VXC_RM_TowardZero, 0));\n\ + bias_f = read_imagef(bias, coord_para.xy);\n\ + VXC_ReadImage2DArray(src0, input, coord, VXC_5BITOFFSET_XY(0, 0), VXC_MODIFIER(0, 15, 0, VXC_RM_TowardZero, 0));\n\ +\n\ + _viv_asm(COPY, scale_h, src1, 16);\n\ + VXC_DP4x4(scale_f, scale_h, scale_h, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), UniFP16toFP32Lo4_dp4x4);\n\ +\n\ + scale_vari = scale_f.s0 * mean_vari.s1;\n\ + vxc_int4 tmpVal0, tmpVal1;\n\ + vxc_float4 tmpData0, tmpData1, tmpData2, tmpData3, norm;\n\ + float alpha = inOut_fl_scale * scale_vari;\n\ + bias_val = (bias_f.s0 - scale_vari * mean_vari.s0) * output_fl_scale;\n\ +\n\ + VXC_DP4x4(tmpData0, src0, src0, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniConvertDirInt8Fp32_4x4);\n\ + VXC_DP4x4(tmpData1, src0, src0, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniConvertEndInt8Fp32_4x4);\n\ + VXC_DP4x4(tmpData2, src0, src0, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniConvertTrdInt8Fp32_4x4);\n\ + VXC_DP4x4(tmpData3, src0, src0, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniConvertFthInt8Fp32_4x4);\n\ + norm = tmpData0 * alpha + bias_val;\n\ + tmpVal0 = convert_int4_rte(norm);\n\ + norm = tmpData1 * alpha + bias_val;\n\ + tmpVal1 = convert_int4_rte(norm);\n\ + VXC_DP2x8(src2, tmpVal0, tmpVal1, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 1), uniConvertInt32toUint8_2x8);\n\ + norm = tmpData2 * alpha + bias_val;\n\ + tmpVal0 = convert_int4_rte(norm);\n\ + norm = tmpData3 * alpha + bias_val;\n\ + tmpVal1 = convert_int4_rte(norm);\n\ + VXC_DP2x8(src2, tmpVal0, tmpVal1, VXC_MODIFIER(8, 15, 0, VXC_RM_TowardZero, 1), uniConvertInt32toUint8_2x8);\n\ + VXC_WriteImage2DArray(output, coord, src2, VXC_MODIFIER(0, 15, 0,VXC_RM_TowardZero, 0));\n\ +}\n\ +\n\ +__kernel __attribute__((reqd_work_group_size(16, 1, 1))) void group_norm_I8toI8_2D(\n\ + image2d_array_t input, image2d_t bias, image2d_t scale, image2d_t meanVari,\n\ + image2d_array_t output, float eps, int is2D, float rSpaceOrg, int pStride)\n\ +{\n\ + int gidz = get_global_id(1);\n\ + int2 coord = (int2)(get_global_id(0), gidz);\n\ + int4 coord_para = (int4)(convert_int(get_global_id(0) * rSpaceOrg), gidz, 0, 0);\n\ + vxc_char16 src0, src2;\n\ + vxc_short8 src1;\n\ + vxc_half8 scale_h;\n\ + float scale_vari, bias_val;\n\ + vxc_float4 bias_f, scale_f;\n\ +\n\ + vxc_float4 mean_vari = read_imagef(meanVari, coord_para.zy);\n\ + VXC_ReadImage(src1, scale, coord_para.xy, VXC_5BITOFFSET_XY(0, 0),\\\n\ + VXC_MODIFIER(0, 0, 0, VXC_RM_TowardZero, 0));\n\ + 
bias_f = read_imagef(bias, coord_para.xy);\n\ + VXC_ReadImage(src0, input, coord.xy, VXC_5BITOFFSET_XY(0, 0), VXC_MODIFIER(0, 15, 0, VXC_RM_TowardZero, 0));\n\ +\n\ + _viv_asm(COPY, scale_h, src1, 16);\n\ + VXC_DP4x4(scale_f, scale_h, scale_h, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), UniFP16toFP32Lo4_dp4x4);\n\ +\n\ + scale_vari = scale_f.s0 * mean_vari.s1;\n\ + vxc_int4 tmpVal0, tmpVal1;\n\ + vxc_float4 tmpData0, tmpData1, tmpData2, tmpData3, norm;\n\ + float alpha = inOut_fl_scale * scale_vari;\n\ + bias_val = (bias_f.s0 - scale_vari * mean_vari.s0) * output_fl_scale;\n\ +\n\ + VXC_DP4x4(tmpData0, src0, src0, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniConvertDirInt8Fp32_4x4);\n\ + VXC_DP4x4(tmpData1, src0, src0, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniConvertEndInt8Fp32_4x4);\n\ + VXC_DP4x4(tmpData2, src0, src0, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniConvertTrdInt8Fp32_4x4);\n\ + VXC_DP4x4(tmpData3, src0, src0, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniConvertFthInt8Fp32_4x4);\n\ + norm = tmpData0 * alpha + bias_val;\n\ + tmpVal0 = convert_int4_rte(norm);\n\ + norm = tmpData1 * alpha + bias_val;\n\ + tmpVal1 = convert_int4_rte(norm);\n\ + VXC_DP2x8(src2, tmpVal0, tmpVal1, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 1), uniConvertInt32toUint8_2x8);\n\ + norm = tmpData2 * alpha + bias_val;\n\ + tmpVal0 = convert_int4_rte(norm);\n\ + norm = tmpData3 * alpha + bias_val;\n\ + tmpVal1 = convert_int4_rte(norm);\n\ + VXC_DP2x8(src2, tmpVal0, tmpVal1, VXC_MODIFIER(8, 15, 0, VXC_RM_TowardZero, 1), uniConvertInt32toUint8_2x8);\n\ + VXC_WriteImage(output, coord.xy, src2, VXC_MODIFIER(0, 15, 0, VXC_RM_TowardZero, 0));\n\ +}\n\ +"; /* end of group_normalization_i8_vx*/ + +static const char group_normalization_u8_vx[] = "#include \"cl_viv_vx_ext.h\"\n\ +\n\ +_viv_uniform int width;\n\ +_viv_uniform int height;\n\ +_viv_uniform VXC_512Bits UniFP16toFP32Lo4_dp4x4;\n\ +_viv_uniform VXC_512Bits uniConvertInt32toUint8_2x8;\n\ +\n\ +_viv_uniform VXC_512Bits uniConvert1stUint8SubZpToFp32_4x4;\n\ +_viv_uniform VXC_512Bits uniConvert2ndUint8SubZpToFp32_4x4;\n\ +_viv_uniform VXC_512Bits uniConvert3rdUint8SubZpToFp32_4x4;\n\ +_viv_uniform VXC_512Bits uniConvert4thUint8SubZpToFp32_4x4;\n\ +_viv_uniform VXC_512Bits uniSumU8_16x1;\n\ +_viv_uniform VXC_512Bits uniSqrSum_16x1;\n\ +_viv_uniform float input_scale;\n\ +_viv_uniform int inputZP;\n\ +_viv_uniform int sumInZp;\n\ +_viv_uniform int tmpZp1;\n\ +_viv_uniform float e2InScale;\n\ +_viv_uniform float rowSumScale;\n\ +_viv_uniform float scale_inOut;\n\ +_viv_uniform float outputScale;\n\ +_viv_uniform int output_ZP;\n\ +\n\ +_viv_uniform VXC_512Bits uniResetFp32_4x4;\n\ +_viv_uniform int group_stride;\n\ +\n\ +__kernel __attribute__((reqd_work_group_size(16, 1, 1))) void group_norm_sumsqr_U8(\n\ + image2d_array_t input, image2d_array_t output, float eps, int is2D)\n\ +{\n\ + int gidx = get_global_id(0) << 4;\n\ + int lidx = get_local_id(0);\n\ + int gidz = get_global_id(1);\n\ + int4 coord = (int4)(gidx, 0, gidz, 0);\n\ + vxc_uchar16 src0;\n\ + float sum = 0, sqr = 0;\n\ + int tmpSum = 0, tmpSqr = 0, tmpSum1 = 0, tmpSqr1 = 0;\n\ +\n\ + __local float lcl_sum[16];\n\ + __local float lcl_sqr[16];\n\ + int8 input_desc;\n\ + _viv_asm(COPY, input_desc, input, sizeof(input_desc));\n\ + int baseAddr_a = (int)get_global_id(1) * input_desc.s4 + input_desc.s0;\n\ + _viv_asm(MOV, coord.z, baseAddr_a);\n\ + if(gidx < width)\n\ + {\n\ + for(coord.y = 0; coord.y < height;)\n\ + {\n\ + VXC_OP4(img_load_3d, src0, input, coord, VXC_5BITOFFSET_XY(0, 0), 
\\\n\ + VXC_MODIFIER(0, 15, 0, VXC_RM_TowardZero, 0));\n\ + coord.y++;\n\ + VXC_DP16x1(tmpSum1, src0, src0, VXC_MODIFIER(0, 0, 0, VXC_RM_TowardZero, 0), uniSumU8_16x1);\n\ + tmpSum += (tmpSum1);\n\ + VXC_DP16x1(tmpSqr1, src0, src0, VXC_MODIFIER(0, 0, 0, VXC_RM_TowardZero, 0), uniSqrSum_16x1);\n\ + tmpSqr += (tmpSqr1 + tmpZp1 * tmpSum1);\n\ + }\n\ + sqr += (tmpSqr * e2InScale + rowSumScale);\n\ + sum = (tmpSum + sumInZp) * input_scale;\n\ + }\n\ + lcl_sum[lidx] = sum;\n\ + lcl_sqr[lidx] = sqr;\n\ + barrier(CLK_LOCAL_MEM_FENCE);\n\ +\n\ + int4 coord_out = (int4)(get_group_id(0) << 2, gidz, 0, 0);\n\ + if(lidx == 0)\n\ + {\n\ + float4 one = (float4)(1, 1, 1, 1);\n\ + __local float4* tmp_sum = (__local float4*)lcl_sum;\n\ + __local float4* tmp_sqr = (__local float4*)lcl_sqr;\n\ + sum = 0; sqr = 0;\n\ + for(int i = 0; i < 4; i++)\n\ + {\n\ + sum += dot(tmp_sum[i], one);\n\ + sqr += dot(tmp_sqr[i], one);\n\ + }\n\ + float4 data = (float4)(sum, sqr, 0, 0);\n\ + write_imagef(output, coord_out, data);\n\ + }\n\ +}\n\ +\n\ +__kernel __attribute__((reqd_work_group_size(16, 1, 1))) void group_norm_sumsqr_U8_2D(\n\ + image2d_array_t input, image2d_array_t output, float eps, int is2D)\n\ +{\n\ + int gidx = get_global_id(0) << 4;\n\ + int lidx = get_local_id(0);\n\ +\n\ + int2 coord = (int2)(gidx, get_global_id(1));\n\ + vxc_uchar16 src0;\n\ + float sum = 0, sqr = 0;\n\ + int tmpSqr, tmpSum1, tmpSqr1;\n\ +\n\ + __local float lcl_sum[16];\n\ + __local float lcl_sqr[16];\n\ + if(gidx < width)\n\ + {\n\ + VXC_ReadImage(src0, input, coord, VXC_5BITOFFSET_XY(0, 0),\n\ + VXC_MODIFIER(0, 15, 0, VXC_RM_TowardZero, 0));\n\ + VXC_DP16x1(tmpSum1, src0, src0, VXC_MODIFIER(0, 0, 0, VXC_RM_TowardZero, 0), uniSumU8_16x1);\n\ + VXC_DP16x1(tmpSqr1, src0, src0, VXC_MODIFIER(0, 0, 0, VXC_RM_TowardZero, 0), uniSqrSum_16x1);\n\ + tmpSqr = tmpSqr1 + tmpZp1 * tmpSum1;\n\ + sqr = (tmpSqr * e2InScale + rowSumScale);\n\ + sum = (tmpSum1 + sumInZp) * input_scale;\n\ + }\n\ + lcl_sum[lidx] = sum;\n\ + lcl_sqr[lidx] = sqr;\n\ + barrier(CLK_LOCAL_MEM_FENCE);\n\ +\n\ + int4 coord_out = (int4)(get_group_id(0) << 2, get_global_id(1), 0, 0);\n\ + if(lidx == 0)\n\ + {\n\ + float4 one = (float4)(1, 1, 1, 1);\n\ + __local float4* tmp_sum = (__local float4*)lcl_sum;\n\ + __local float4* tmp_sqr = (__local float4*)lcl_sqr;\n\ + sum = 0; sqr = 0;\n\ + for(int i = 0; i < 4; i++)\n\ + {\n\ + sum += dot(tmp_sum[i], one);\n\ + sqr += dot(tmp_sqr[i], one);\n\ + }\n\ + float4 data = (float4)(sum, sqr, 0, 0);\n\ + write_imagef(output, coord_out, data);\n\ + }\n\ +}\n\ +\n\ +__kernel __attribute__((reqd_work_group_size(16, 1, 1))) void group_norm_meanvari(\n\ + image2d_t input, image2d_t output, float eps, float group_ratio)\n\ +{\n\ + int gidx = get_global_id(0);\n\ + int lidx = get_local_id(0);\n\ +\n\ + int2 coord = (int2)(gidx, get_global_id(1));\n\ + vxc_uchar16 src0;\n\ + float2 sum_sqr = (float2)(0);\n\ + vxc_float4 mean_vari;\n\ + VXC_DP4x4(mean_vari, src0, src0, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniResetFp32_4x4);\n\ +\n\ + __local float2 lcl_data[16];\n\ + __local float2 lcl_sum[4];\n\ +\n\ + for(; coord.x < group_stride; coord.x += 64)\n\ + {\n\ + mean_vari += read_imagef(input, coord);\n\ + }\n\ + lcl_data[lidx] = mean_vari.xy;\n\ + barrier(CLK_LOCAL_MEM_FENCE);\n\ + if(lidx < 4)\n\ + {\n\ + float2 tmpSum = (float2)(0);\n\ + for(int i = lidx; i < 16; i+=4)\n\ + {\n\ + tmpSum += lcl_data[i];\n\ + }\n\ + lcl_sum[lidx] = tmpSum;\n\ + }\n\ + barrier(CLK_LOCAL_MEM_FENCE);\n\ + if(lidx == 0)\n\ + {\n\ + for(int i = 0; i < 4; i++)\n\ + 
{\n\ + sum_sqr += lcl_sum[i];\n\ + }\n\ + mean_vari.xy = sum_sqr * group_ratio;\n\ + mean_vari.s1 = mean_vari.s1 - mean_vari.s0 * mean_vari.s0 + eps;\n\ + mean_vari.s1 = rsqrt(mean_vari.s1);\n\ +\n\ + coord.x = 0;\n\ + write_imagef(output, coord, mean_vari);\n\ + }\n\ +}\n\ +\n\ +__kernel __attribute__((reqd_work_group_size(16, 1, 1))) void group_norm_U8toU8(\n\ + image2d_array_t input, image2d_t bias, image2d_t scale, image2d_t meanVari,\n\ + image2d_array_t output, float eps, int is2D, float rSpaceOrg, int pStride)\n\ +{\n\ + int gidy = get_global_id(1);\n\ + int gidz = get_global_id(2);\n\ + int4 coord = (int4)(get_global_id(0), gidy, gidz, 0);\n\ + int4 coord_para = (int4)((convert_int(get_global_id(0) * rSpaceOrg) + gidy * pStride), gidz, 0, 0);\n\ + vxc_uchar16 src0, src2;\n\ + vxc_short8 src1;\n\ + vxc_half8 scale_h;\n\ + float scale_vari, bias_val;\n\ + vxc_float4 bias_f, scale_f;\n\ +\n\ + vxc_float4 mean_vari = read_imagef(meanVari, coord_para.zy);\n\ + VXC_ReadImage(src1, scale, coord_para.xy, VXC_5BITOFFSET_XY(0, 0),\\\n\ + VXC_MODIFIER(0, 0, 0, VXC_RM_TowardZero, 0));\n\ + bias_f = read_imagef(bias, coord_para.xy);\n\ + VXC_ReadImage2DArray(src0, input, coord, VXC_5BITOFFSET_XY(0, 0), VXC_MODIFIER(0, 15, 0, VXC_RM_TowardZero, 0));\n\ +\n\ + _viv_asm(COPY, scale_h, src1, 16);\n\ + VXC_DP4x4(scale_f, scale_h, scale_h, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), UniFP16toFP32Lo4_dp4x4);\n\ +\n\ + scale_vari = scale_f.s0 * mean_vari.s1;\n\ + short zp = inputZP;\n\ + vxc_int4 tmpVal0, tmpVal1;\n\ + vxc_float4 tmpData0, tmpData1, tmpData2, tmpData3, norm;\n\ + float alpha = scale_inOut * scale_vari;\n\ + bias_val = (bias_f.s0 - scale_vari * mean_vari.s0) * outputScale + output_ZP;\n\ +\n\ + VXC_DP4x4(tmpData0, src0, zp, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniConvert1stUint8SubZpToFp32_4x4);\n\ + VXC_DP4x4(tmpData1, src0, zp, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniConvert2ndUint8SubZpToFp32_4x4);\n\ + VXC_DP4x4(tmpData2, src0, zp, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniConvert3rdUint8SubZpToFp32_4x4);\n\ + VXC_DP4x4(tmpData3, src0, zp, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniConvert4thUint8SubZpToFp32_4x4);\n\ + norm = tmpData0 * alpha + bias_val;\n\ + tmpVal0 = convert_int4_rte(norm);\n\ + norm = tmpData1 * alpha + bias_val;\n\ + tmpVal1 = convert_int4_rte(norm);\n\ + VXC_DP2x8(src2, tmpVal0, tmpVal1, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 1), uniConvertInt32toUint8_2x8);\n\ + norm = tmpData2 * alpha + bias_val;\n\ + tmpVal0 = convert_int4_rte(norm);\n\ + norm = tmpData3 * alpha + bias_val;\n\ + tmpVal1 = convert_int4_rte(norm);\n\ + VXC_DP2x8(src2, tmpVal0, tmpVal1, VXC_MODIFIER(8, 15, 0, VXC_RM_TowardZero, 1), uniConvertInt32toUint8_2x8);\n\ + VXC_WriteImage2DArray(output, coord, src2, VXC_MODIFIER(0, 15, 0,VXC_RM_TowardZero, 0));\n\ +}\n\ +\n\ +__kernel __attribute__((reqd_work_group_size(16, 1, 1))) void group_norm_U8toU8_2D(\n\ + image2d_array_t input, image2d_t bias, image2d_t scale, image2d_t meanVari,\n\ + image2d_array_t output, float eps, int is2D, float rSpaceOrg, int pStride)\n\ +{\n\ + int gidz = get_global_id(1);\n\ + int2 coord = (int2)(get_global_id(0), gidz);\n\ + int4 coord_para = (int4)(convert_int(get_global_id(0) * rSpaceOrg), gidz, 0, 0);\n\ + vxc_uchar16 src0, src2;\n\ + vxc_short8 src1;\n\ + vxc_half8 scale_h;\n\ + float scale_vari, bias_val;\n\ + vxc_float4 bias_f, scale_f;\n\ +\n\ + vxc_float4 mean_vari = read_imagef(meanVari, coord_para.zy);\n\ + VXC_ReadImage(src1, scale, coord_para.xy, VXC_5BITOFFSET_XY(0, 0),\\\n\ + 
VXC_MODIFIER(0, 0, 0, VXC_RM_TowardZero, 0));\n\ + bias_f = read_imagef(bias, coord_para.xy);\n\ + VXC_ReadImage(src0, input, coord.xy, VXC_5BITOFFSET_XY(0, 0), VXC_MODIFIER(0, 15, 0, VXC_RM_TowardZero, 0));\n\ +\n\ + _viv_asm(COPY, scale_h, src1, 16);\n\ + VXC_DP4x4(scale_f, scale_h, scale_h, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), UniFP16toFP32Lo4_dp4x4);\n\ +\n\ + scale_vari = scale_f.s0 * mean_vari.s1;\n\ + short zp = inputZP;\n\ + vxc_int4 tmpVal0, tmpVal1;\n\ + vxc_float4 tmpData0, tmpData1, tmpData2, tmpData3, norm;\n\ + float alpha = scale_inOut * scale_vari;\n\ + bias_val = (bias_f.s0 - scale_vari * mean_vari.s0) * outputScale + output_ZP;\n\ +\n\ + VXC_DP4x4(tmpData0, src0, zp, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniConvert1stUint8SubZpToFp32_4x4);\n\ + VXC_DP4x4(tmpData1, src0, zp, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniConvert2ndUint8SubZpToFp32_4x4);\n\ + VXC_DP4x4(tmpData2, src0, zp, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniConvert3rdUint8SubZpToFp32_4x4);\n\ + VXC_DP4x4(tmpData3, src0, zp, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniConvert4thUint8SubZpToFp32_4x4);\n\ + norm = tmpData0 * alpha + bias_val;\n\ + tmpVal0 = convert_int4_rte(norm);\n\ + norm = tmpData1 * alpha + bias_val;\n\ + tmpVal1 = convert_int4_rte(norm);\n\ + VXC_DP2x8(src2, tmpVal0, tmpVal1, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 1), uniConvertInt32toUint8_2x8);\n\ + norm = tmpData2 * alpha + bias_val;\n\ + tmpVal0 = convert_int4_rte(norm);\n\ + norm = tmpData3 * alpha + bias_val;\n\ + tmpVal1 = convert_int4_rte(norm);\n\ + VXC_DP2x8(src2, tmpVal0, tmpVal1, VXC_MODIFIER(8, 15, 0, VXC_RM_TowardZero, 1), uniConvertInt32toUint8_2x8);\n\ + VXC_WriteImage(output, coord.xy, src2, VXC_MODIFIER(0, 15, 0, VXC_RM_TowardZero, 0));\n\ +}"; /* end of group_normalization_u8_vx*/ + +static const char group_normalization_u8_f16_vx[] = "#include \"cl_viv_vx_ext.h\"\n\ +\n\ +_viv_uniform int width;\n\ +_viv_uniform int height;\n\ +_viv_uniform VXC_512Bits UniFP16toFP32Lo4_dp4x4;\n\ +_viv_uniform VXC_512Bits uniConvertHalfToFp16_2x8;\n\ +\n\ +_viv_uniform VXC_512Bits uniConvert1stUint8SubZpToFp32_4x4;\n\ +_viv_uniform VXC_512Bits uniConvert2ndUint8SubZpToFp32_4x4;\n\ +_viv_uniform VXC_512Bits uniConvert3rdUint8SubZpToFp32_4x4;\n\ +_viv_uniform VXC_512Bits uniConvert4thUint8SubZpToFp32_4x4;\n\ +_viv_uniform float input_scale;\n\ +_viv_uniform int inputZP;\n\ +\n\ +__kernel __attribute__((reqd_work_group_size(16, 1, 1))) void group_norm_U8toF16(\n\ + image2d_array_t input, image2d_t bias, image2d_t scale, image2d_t meanVari,\n\ + image2d_array_t output, float eps, int is2D, float rSpaceOrg, int pStride)\n\ +{\n\ + int gidy = get_global_id(1);\n\ + int gidz = get_global_id(2);\n\ + int4 coord = (int4)(get_global_id(0), gidy, gidz, 0);\n\ + int4 coord_para = (int4)((convert_int(get_global_id(0) * rSpaceOrg) + gidy * pStride), gidz, 0, 0);\n\ + vxc_uchar16 src0;\n\ + vxc_short8 src1, outval;\n\ + vxc_half8 scale_h, dst;\n\ + float scale_vari, bias_val;\n\ + vxc_float4 bias_f, scale_f;\n\ +\n\ + vxc_float4 mean_vari = read_imagef(meanVari, coord_para.zy);\n\ + VXC_ReadImage(src1, scale, coord_para.xy, VXC_5BITOFFSET_XY(0, 0),\\\n\ + VXC_MODIFIER(0, 0, 0, VXC_RM_TowardZero, 0));\n\ + bias_f = read_imagef(bias, coord_para.xy);\n\ + VXC_ReadImage2DArray(src0, input, coord, VXC_5BITOFFSET_XY(0, 0), VXC_MODIFIER(0, 15, 0, VXC_RM_TowardZero, 0));\n\ +\n\ + _viv_asm(COPY, scale_h, src1, 16);\n\ + VXC_DP4x4(scale_f, scale_h, scale_h, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), 
UniFP16toFP32Lo4_dp4x4);\n\ +\n\ + scale_vari = scale_f.s0 * mean_vari.s1;\n\ + short zp = inputZP;\n\ + vxc_float4 tmpData0, tmpData1, tmpData2, tmpData3, norm;\n\ + half4 tmpVal0, tmpVal1;\n\ + float alpha = input_scale * scale_vari;\n\ + bias_val = (bias_f.s0 - scale_vari * mean_vari.s0);\n\ +\n\ + VXC_DP4x4(tmpData0, src0, zp, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniConvert1stUint8SubZpToFp32_4x4);\n\ + VXC_DP4x4(tmpData1, src0, zp, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniConvert2ndUint8SubZpToFp32_4x4);\n\ + VXC_DP4x4(tmpData2, src0, zp, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniConvert3rdUint8SubZpToFp32_4x4);\n\ + VXC_DP4x4(tmpData3, src0, zp, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniConvert4thUint8SubZpToFp32_4x4);\n\ + norm = alpha * tmpData0 + bias_val;\n\ + _viv_asm(CONV, tmpVal0, norm);\n\ + norm = alpha * tmpData1 + bias_val;\n\ + _viv_asm(CONV, tmpVal1, norm);\n\ + VXC_DP2x8(dst, tmpVal0, tmpVal1, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0), uniConvertHalfToFp16_2x8);\n\ + _viv_asm(COPY, outval, dst, 16);\n\ + VXC_WriteImage2DArray(output, coord, outval, VXC_MODIFIER(0, 7, 0,VXC_RM_TowardZero, 0));\n\ + coord.x += 8;\n\ + norm = alpha * tmpData2 + bias_val;\n\ + _viv_asm(CONV, tmpVal0, norm);\n\ + norm = alpha * tmpData3 + bias_val;\n\ + _viv_asm(CONV, tmpVal1, norm);\n\ + VXC_DP2x8(dst, tmpVal0, tmpVal1, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0), uniConvertHalfToFp16_2x8);\n\ + _viv_asm(COPY, outval, dst, 16);\n\ + VXC_WriteImage2DArray(output, coord, outval, VXC_MODIFIER(0, 7, 0,VXC_RM_TowardZero, 0));\n\ +}\n\ +\n\ +__kernel __attribute__((reqd_work_group_size(16, 1, 1))) void group_norm_U8toF16_2D(\n\ + image2d_array_t input, image2d_t bias, image2d_t scale, image2d_t meanVari,\n\ + image2d_array_t output, float eps, int is2D, float rSpaceOrg, int pStride)\n\ +{\n\ + int gidz = get_global_id(1);\n\ + int2 coord = (int2)(get_global_id(0), gidz);\n\ + int4 coord_para = (int4)(convert_int(get_global_id(0) * rSpaceOrg), gidz, 0, 0);\n\ + vxc_uchar16 src0;\n\ + vxc_short8 src1, outval;\n\ + vxc_half8 scale_h, dst;\n\ + float scale_vari, bias_val;\n\ + vxc_float4 bias_f, scale_f;\n\ +\n\ + vxc_float4 mean_vari = read_imagef(meanVari, coord_para.zy);\n\ + VXC_ReadImage(src1, scale, coord_para.xy, VXC_5BITOFFSET_XY(0, 0),\\\n\ + VXC_MODIFIER(0, 0, 0, VXC_RM_TowardZero, 0));\n\ + bias_f = read_imagef(bias, coord_para.xy);\n\ + VXC_ReadImage(src0, input, coord.xy, VXC_5BITOFFSET_XY(0, 0), VXC_MODIFIER(0, 15, 0, VXC_RM_TowardZero, 0));\n\ +\n\ + _viv_asm(COPY, scale_h, src1, 16);\n\ + VXC_DP4x4(scale_f, scale_h, scale_h, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), UniFP16toFP32Lo4_dp4x4);\n\ +\n\ + scale_vari = scale_f.s0 * mean_vari.s1;\n\ + short zp = inputZP;\n\ + vxc_float4 tmpData0, tmpData1, tmpData2, tmpData3, norm;\n\ + half4 tmpVal0, tmpVal1;\n\ + float alpha = input_scale * scale_vari;\n\ + bias_val = (bias_f.s0 - scale_vari * mean_vari.s0);\n\ +\n\ + VXC_DP4x4(tmpData0, src0, zp, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniConvert1stUint8SubZpToFp32_4x4);\n\ + VXC_DP4x4(tmpData1, src0, zp, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniConvert2ndUint8SubZpToFp32_4x4);\n\ + VXC_DP4x4(tmpData2, src0, zp, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniConvert3rdUint8SubZpToFp32_4x4);\n\ + VXC_DP4x4(tmpData3, src0, zp, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniConvert4thUint8SubZpToFp32_4x4);\n\ + norm = alpha * tmpData0 + bias_val;\n\ + _viv_asm(CONV, tmpVal0, norm);\n\ + norm = alpha * tmpData1 + bias_val;\n\ + _viv_asm(CONV, 
tmpVal1, norm);\n\ + VXC_DP2x8(dst, tmpVal0, tmpVal1, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0), uniConvertHalfToFp16_2x8);\n\ + _viv_asm(COPY, outval, dst, 16);\n\ + VXC_WriteImage(output, coord.xy, outval, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0));\n\ + coord.x += 8;\n\ + norm = alpha * tmpData2 + bias_val;\n\ + _viv_asm(CONV, tmpVal0, norm);\n\ + norm = alpha * tmpData3 + bias_val;\n\ + _viv_asm(CONV, tmpVal1, norm);\n\ + VXC_DP2x8(dst, tmpVal0, tmpVal1, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0), uniConvertHalfToFp16_2x8);\n\ + _viv_asm(COPY, outval, dst, 16);\n\ + VXC_WriteImage(output, coord.xy, outval, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0));\n\ +}\n\ +"; /* end of group_normalization_u8_f16_vx*/ + static const char grucell_activation_vx[] = "#include \"cl_viv_vx_ext.h\"\n\ \n\ #define logE (1.44269502f)\n\ @@ -5930,10 +8481,7 @@ _viv_uniform VXC_512Bits uniFp16SumSqr_dp8x2;\n\ _viv_uniform VXC_512Bits uniConvertEndInt16Fp32_4x4;\n\ \n\ __kernel __attribute__((reqd_work_group_size(16, 1, 1))) void instance_norm_meanvari_F16(\n\ - image2d_array_t input,\n\ - image2d_array_t output,\n\ - float eps,\n\ - int rsFlg)\n\ + image2d_array_t input, image2d_array_t output, float eps, int rsFlg)\n\ {\n\ int gidx = get_global_id(0) << 3;\n\ int lidx = get_local_id(0);\n\ @@ -5946,13 +8494,18 @@ __kernel __attribute__((reqd_work_group_size(16, 1, 1))) void instance_norm_mean \n\ __local float lcl_sum[16];\n\ __local float lcl_sqr[16];\n\ +\n\ + int8 input_desc;\n\ + _viv_asm(COPY, input_desc, input, sizeof(input_desc));\n\ + int baseAddr_a = (int)get_global_id(1) * input_desc.s4 + input_desc.s0;\n\ + _viv_asm(MOV, coord.z, baseAddr_a);\n\ \n\ if(gidx < width)\n\ {\n\ for(coord.y = 0; coord.y < height;)\n\ {\n\ - VXC_ReadImage2DArray(src0, input, coord, VXC_5BITOFFSET_XY(0, 0),\n\ - VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0));\n\ + VXC_OP4(img_load_3d, src0, input, coord, VXC_5BITOFFSET_XY(0, 0), \\\n\ + VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0));\n\ coord.y++;\n\ _viv_asm(COPY, in_h, src0, 16);\n\ VXC_DP8x2(sumsqr, in_h, in_h, VXC_MODIFIER(0, 1, 0, VXC_RM_TowardZero, 0),\\\n\ @@ -5988,10 +8541,7 @@ __kernel __attribute__((reqd_work_group_size(16, 1, 1))) void instance_norm_mean }\n\ \n\ __kernel __attribute__((reqd_work_group_size(16, 1, 1))) void instance_norm_meanvari_F16_2D(\n\ - image2d_array_t input,\n\ - image2d_array_t output,\n\ - float eps,\n\ - int rsFlg)\n\ + image2d_array_t input, image2d_array_t output, float eps, int rsFlg)\n\ {\n\ int gidx = get_global_id(0) << 3;\n\ int lidx = get_local_id(0);\n\ @@ -6049,13 +8599,8 @@ __kernel __attribute__((reqd_work_group_size(16, 1, 1))) void instance_norm_mean }\n\ \n\ __kernel __attribute__((reqd_work_group_size(16, 1, 1))) void instance_norm_F16toF16(\n\ - image2d_array_t input,\n\ - image2d_array_t bias,\n\ - image2d_array_t scale,\n\ - image2d_array_t meanVari,\n\ - image2d_array_t output,\n\ - float eps,\n\ - int rsFlg)\n\ + image2d_array_t input, image2d_array_t bias, image2d_array_t scale, image2d_t meanVari,\n\ + image2d_array_t output, float eps, int rsFlg)\n\ {\n\ int gidz = get_global_id(1);\n\ int4 coord = (int4)(get_global_id(0), 0, gidz, 0);\n\ @@ -6072,12 +8617,10 @@ __kernel __attribute__((reqd_work_group_size(16, 1, 1))) void instance_norm_F16t VXC_DP4x4(scale_f, scale_h, scale_h, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), UniFP16toFP32Lo4_dp4x4);\n\ bias_f = read_imagef(bias, coord_para);\n\ \n\ - coord_para.x = 0;\n\ - coord_para.y = gidz;\n\ for(int i = 0; i < group_num; i++)\n\ {\n\ - mean_vari += 
read_imagef(meanVari, coord_para);\n\ - coord_para.x += 4;\n\ + mean_vari += read_imagef(meanVari, coord_para.yx);\n\ + coord_para.y += 4;\n\ }\n\ mean_vari *= dimRatio;\n\ mean_vari.s1 = mean_vari.s1 - mean_vari.s0 * mean_vari.s0 + eps;\n\ @@ -6089,11 +8632,20 @@ __kernel __attribute__((reqd_work_group_size(16, 1, 1))) void instance_norm_F16t half4 tmpVal0, tmpVal1;\n\ bias_val = (bias_f.s0 - scale_vari * mean_vari.s0);\n\ vxc_half8 dst;\n\ +\n\ + int8 input_desc, output_desc;\n\ + _viv_asm(COPY, input_desc, input, sizeof(input_desc));\n\ + int baseAddr_a = (int)get_global_id(1) * input_desc.s4 + input_desc.s0;\n\ + _viv_asm(MOV, coord.z, baseAddr_a);\n\ +\n\ + _viv_asm(COPY, output_desc, output, sizeof(output_desc));\n\ + int baseAddr = (int)get_global_id(1) * output_desc.s4 + output_desc.s0;\n\ + _viv_asm(MOV, coord.w, baseAddr);\n\ \n\ for(coord.y = 0; coord.y < height; coord.y++)\n\ {\n\ - VXC_ReadImage2DArray(src0, input, coord, VXC_5BITOFFSET_XY(0, 0),\\\n\ - VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0));\n\ + VXC_OP4(img_load_3d, src0, input, coord.xyzz, VXC_5BITOFFSET_XY(0, 0), \\\n\ + VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0));\n\ _viv_asm(COPY, in_h, src0, 16);\n\ \n\ VXC_DP4x4(tmpData0, in_h, in_h, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0),\\\n\ @@ -6109,18 +8661,14 @@ __kernel __attribute__((reqd_work_group_size(16, 1, 1))) void instance_norm_F16t VXC_DP2x8(dst, tmpVal0, tmpVal1, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0),\\\n\ uniConvertHalfToFp16_2x8);\n\ _viv_asm(COPY, outval, dst, 16);\n\ - VXC_WriteImage2DArray(output, coord, outval, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0));\n\ + VXC_OP4_NoDest(img_store_3d, output, coord.xyww, outval, \\\n\ + VXC_MODIFIER(0, 7, 0,VXC_RM_TowardZero, 0));\n\ }\n\ }\n\ \n\ __kernel __attribute__((reqd_work_group_size(16, 1, 1))) void instance_norm_F16toF16_2D(\n\ - image2d_array_t input,\n\ - image2d_array_t bias,\n\ - image2d_array_t scale,\n\ - image2d_array_t meanVari,\n\ - image2d_array_t output,\n\ - float eps,\n\ - int rsFlg)\n\ + image2d_array_t input, image2d_array_t bias, image2d_array_t scale, image2d_t meanVari,\n\ + image2d_array_t output, float eps, int rsFlg)\n\ {\n\ int gidz = get_global_id(1);\n\ int gidy = gidz * height;\n\ @@ -6139,12 +8687,10 @@ __kernel __attribute__((reqd_work_group_size(16, 1, 1))) void instance_norm_F16t VXC_DP4x4(scale_f, scale_h, scale_h, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), UniFP16toFP32Lo4_dp4x4);\n\ bias_f = read_imagef(bias, coord_para);\n\ \n\ - coord_para.x = 0;\n\ - coord_para.y = gidz;\n\ for(int i = 0; i < group_num; i++)\n\ {\n\ - mean_vari += read_imagef(meanVari, coord_para);\n\ - coord_para.x += 4;\n\ + mean_vari += read_imagef(meanVari, coord_para.yx);\n\ + coord_para.y += 4;\n\ }\n\ mean_vari *= dimRatio;\n\ mean_vari.s1 = mean_vari.s1 - mean_vari.s0 * mean_vari.s0 + eps;\n\ @@ -6216,12 +8762,16 @@ __kernel __attribute__((reqd_work_group_size(16, 1, 1))) void instance_norm_mean __local float lcl_sum[16];\n\ __local float lcl_sqr[16];\n\ \n\ + int8 input_desc;\n\ + _viv_asm(COPY, input_desc, input, sizeof(input_desc));\n\ + int baseAddr_a = (int)get_global_id(1) * input_desc.s4 + input_desc.s0;\n\ + _viv_asm(MOV, coord.z, baseAddr_a);\n\ if(gidx < width)\n\ {\n\ for(coord.y = 0; coord.y < height;)\n\ {\n\ - VXC_ReadImage2DArray(src0, input, coord, VXC_5BITOFFSET_XY(0, 0),\n\ - VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0));\n\ + VXC_OP4(img_load_3d, src0, input, coord, VXC_5BITOFFSET_XY(0, 0), \\\n\ + VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0));\n\ coord.y++;\n\ 
VXC_DP8x2(sumsqr, src0, src0, VXC_MODIFIER(0, 1, 0, VXC_RM_TowardZero, 0),\\\n\ uniInt16SumSqr_dp8x2);\n\ @@ -6325,7 +8875,7 @@ __kernel __attribute__((reqd_work_group_size(16, 1, 1))) void instance_norm_I16t image2d_array_t input,\n\ image2d_array_t bias,\n\ image2d_array_t scale,\n\ - image2d_array_t meanVari,\n\ + image2d_t meanVari,\n\ image2d_array_t output,\n\ float eps,\n\ int rsFlg)\n\ @@ -6347,12 +8897,10 @@ __kernel __attribute__((reqd_work_group_size(16, 1, 1))) void instance_norm_I16t \n\ bias_f = read_imagef(bias, coord_para);\n\ \n\ - coord_para.x = 0;\n\ - coord_para.y = gidz;\n\ for(int i = 0; i < group_num; i++)\n\ {\n\ - mean_vari += read_imagef(meanVari, coord_para);\n\ - coord_para.x += 4;\n\ + mean_vari += read_imagef(meanVari, coord_para.yx);\n\ + coord_para.y += 4;\n\ }\n\ mean_vari *= dimRatio;\n\ mean_vari.s1 = mean_vari.s1 - mean_vari.s0 * mean_vari.s0 + eps;\n\ @@ -6365,12 +8913,20 @@ __kernel __attribute__((reqd_work_group_size(16, 1, 1))) void instance_norm_I16t float alpha = input_fl_scale * scale_vari;\n\ bias_val = (bias_f.s0 - scale_vari * mean_vari.s0);\n\ vxc_half8 dst;\n\ +\n\ + int8 input_desc, output_desc;\n\ + _viv_asm(COPY, input_desc, input, sizeof(input_desc));\n\ + int baseAddr_a = (int)get_global_id(1) * input_desc.s4 + input_desc.s0;\n\ + _viv_asm(MOV, coord.z, baseAddr_a);\n\ +\n\ + _viv_asm(COPY, output_desc, output, sizeof(output_desc));\n\ + int baseAddr = (int)get_global_id(1) * output_desc.s4 + output_desc.s0;\n\ + _viv_asm(MOV, coord.w, baseAddr);\n\ \n\ for(coord.y = 0; coord.y < height; coord.y++)\n\ {\n\ - VXC_ReadImage2DArray(src0, input, coord, VXC_5BITOFFSET_XY(0, 0),\\\n\ - VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0));\n\ -\n\ + VXC_OP4(img_load_3d, src0, input, coord.xyzz, VXC_5BITOFFSET_XY(0, 0), \\\n\ + VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0));\n\ VXC_DP4x4(tmpData0, src0, src0, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0),\\\n\ uniConvertInt16Fp32Fst_4x4);\n\ VXC_DP4x4(tmpData1, src0, src0, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0),\\\n\ @@ -6384,7 +8940,8 @@ __kernel __attribute__((reqd_work_group_size(16, 1, 1))) void instance_norm_I16t VXC_DP2x8(dst, tmpVal0, tmpVal1, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0),\\\n\ uniConvertHalfToFp16_2x8);\n\ _viv_asm(COPY, outval, dst, 16);\n\ - VXC_WriteImage2DArray(output, coord, outval, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0));\n\ + VXC_OP4_NoDest(img_store_3d, output, coord.xyww, outval, \\\n\ + VXC_MODIFIER(0, 7, 0,VXC_RM_TowardZero, 0));\n\ }\n\ }\n\ \n\ @@ -6392,7 +8949,7 @@ __kernel __attribute__((reqd_work_group_size(16, 1, 1))) void instance_norm_I16t image2d_array_t input,\n\ image2d_array_t bias,\n\ image2d_array_t scale,\n\ - image2d_array_t meanVari,\n\ + image2d_t meanVari,\n\ image2d_array_t output,\n\ float eps,\n\ int rsFlg)\n\ @@ -6416,12 +8973,10 @@ __kernel __attribute__((reqd_work_group_size(16, 1, 1))) void instance_norm_I16t \n\ bias_f = read_imagef(bias, coord_para);\n\ \n\ - coord_para.x = 0;\n\ - coord_para.y = gidz;\n\ for(int i = 0; i < group_num; i++)\n\ {\n\ - mean_vari += read_imagef(meanVari, coord_para);\n\ - coord_para.x += 4;\n\ + mean_vari += read_imagef(meanVari, coord_para.yx);\n\ + coord_para.y += 4;\n\ }\n\ mean_vari *= dimRatio;\n\ mean_vari.s1 = mean_vari.s1 - mean_vari.s0 * mean_vari.s0 + eps;\n\ @@ -6460,7 +9015,7 @@ __kernel __attribute__((reqd_work_group_size(16, 1, 1))) void instance_norm_I16t image2d_array_t input,\n\ image2d_array_t bias,\n\ image2d_array_t scale,\n\ - image2d_array_t meanVari,\n\ + image2d_t meanVari,\n\ 
image2d_array_t output,\n\ float eps,\n\ int rsFlg)\n\ @@ -6480,12 +9035,10 @@ __kernel __attribute__((reqd_work_group_size(16, 1, 1))) void instance_norm_I16t VXC_DP4x4(scale_f, scale_h, scale_h, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), UniFP16toFP32Lo4_dp4x4);\n\ \n\ bias_f = read_imagef(bias, coord_para);\n\ - coord_para.x = 0;\n\ - coord_para.y = gidz;\n\ for(int i = 0; i < group_num; i++)\n\ {\n\ - mean_vari += read_imagef(meanVari, coord_para);\n\ - coord_para.x += 4;\n\ + mean_vari += read_imagef(meanVari, coord_para.yx);\n\ + coord_para.y += 4;\n\ }\n\ mean_vari *= dimRatio;\n\ mean_vari.s1 = mean_vari.s1 - mean_vari.s0 * mean_vari.s0 + eps;\n\ @@ -6497,10 +9050,18 @@ __kernel __attribute__((reqd_work_group_size(16, 1, 1))) void instance_norm_I16t float alpha = inOut_fl_scale * scale_vari;\n\ bias_val = (bias_f.s0 - scale_vari * mean_vari.s0) * output_fl_scale;\n\ \n\ + int8 input_desc, output_desc;\n\ + _viv_asm(COPY, input_desc, input, sizeof(input_desc));\n\ + int baseAddr_a = (int)get_global_id(1) * input_desc.s4 + input_desc.s0;\n\ + _viv_asm(MOV, coord.z, baseAddr_a);\n\ +\n\ + _viv_asm(COPY, output_desc, output, sizeof(output_desc));\n\ + int baseAddr = (int)get_global_id(1) * output_desc.s4 + output_desc.s0;\n\ + _viv_asm(MOV, coord.w, baseAddr);\n\ for(coord.y = 0; coord.y < height; coord.y++)\n\ {\n\ - VXC_ReadImage2DArray(src0, input, coord, VXC_5BITOFFSET_XY(0, 0),\\\n\ - VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0));\n\ + VXC_OP4(img_load_3d, src0, input, coord.xyzz, VXC_5BITOFFSET_XY(0, 0), \\\n\ + VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0));\n\ VXC_DP4x4(tmpData0, src0, src0, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0),\\\n\ uniConvertInt16Fp32Fst_4x4);\n\ VXC_DP4x4(tmpData1, src0, src0, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0),\\\n\ @@ -6512,7 +9073,8 @@ __kernel __attribute__((reqd_work_group_size(16, 1, 1))) void instance_norm_I16t tmpVal1 = convert_int4_rte(norm);\n\ VXC_DP2x8(src2, tmpVal0, tmpVal1, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 1),\\\n\ uniConvertInt32toInt16_2x8);\n\ - VXC_WriteImage2DArray(output, coord, src2, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0));\n\ + VXC_OP4_NoDest(img_store_3d, output, coord.xyww, src2, \\\n\ + VXC_MODIFIER(0, 7, 0,VXC_RM_TowardZero, 0));\n\ }\n\ }\n\ \n\ @@ -6520,7 +9082,7 @@ __kernel __attribute__((reqd_work_group_size(16, 1, 1))) void instance_norm_I16t image2d_array_t input,\n\ image2d_array_t bias,\n\ image2d_array_t scale,\n\ - image2d_array_t meanVari,\n\ + image2d_t meanVari,\n\ image2d_array_t output,\n\ float eps,\n\ int rsFlg)\n\ @@ -6542,12 +9104,10 @@ __kernel __attribute__((reqd_work_group_size(16, 1, 1))) void instance_norm_I16t VXC_DP4x4(scale_f, scale_h, scale_h, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), UniFP16toFP32Lo4_dp4x4);\n\ \n\ bias_f = read_imagef(bias, coord_para);\n\ - coord_para.x = 0;\n\ - coord_para.y = gidz;\n\ for(int i = 0; i < group_num; i++)\n\ {\n\ - mean_vari += read_imagef(meanVari, coord_para);\n\ - coord_para.x += 4;\n\ + mean_vari += read_imagef(meanVari, coord_para.yx);\n\ + coord_para.y += 4;\n\ }\n\ mean_vari *= dimRatio;\n\ mean_vari.s1 = mean_vari.s1 - mean_vari.s0 * mean_vari.s0 + eps;\n\ @@ -6602,10 +9162,7 @@ _viv_uniform float inOut_fl_scale;\n\ _viv_uniform float output_fl_scale;\n\ \n\ __kernel __attribute__((reqd_work_group_size(16, 1, 1))) void instance_norm_meanvari_I8(\n\ - image2d_array_t input,\n\ - image2d_array_t output,\n\ - float eps,\n\ - int rsFlg)\n\ + image2d_array_t input, image2d_array_t output, float eps, int rsFlg)\n\ {\n\ int gidx = 
get_global_id(0) << 4;\n\ int lidx = get_local_id(0);\n\ @@ -6613,18 +9170,22 @@ __kernel __attribute__((reqd_work_group_size(16, 1, 1))) void instance_norm_mean int4 coord = (int4)(gidx, 0, gidz, 0);\n\ vxc_char16 src0;\n\ float sum = 0, sqr = 0;\n\ - int tmpSum = 0, tmpSqr = 0;\n\ - int tmpSum1, tmpSqr1;\n\ + int tmpSum = 0, tmpSqr = 0, tmpSum1, tmpSqr1;\n\ \n\ __local float lcl_sum[16];\n\ __local float lcl_sqr[16];\n\ +\n\ + int8 input_desc;\n\ + _viv_asm(COPY, input_desc, input, sizeof(input_desc));\n\ + int baseAddr_a = (int)get_global_id(1) * input_desc.s4 + input_desc.s0;\n\ + _viv_asm(MOV, coord.z, baseAddr_a);\n\ \n\ if(gidx < width)\n\ {\n\ for(coord.y = 0; coord.y < height;)\n\ {\n\ - VXC_ReadImage2DArray(src0, input, coord, VXC_5BITOFFSET_XY(0, 0),\n\ - VXC_MODIFIER(0, 15, 0, VXC_RM_TowardZero, 0));\n\ + VXC_OP4(img_load_3d, src0, input, coord, VXC_5BITOFFSET_XY(0, 0), \\\n\ + VXC_MODIFIER(0, 15, 0, VXC_RM_TowardZero, 0));\n\ coord.y++;\n\ VXC_DP16x1(tmpSum1, src0, src0, VXC_MODIFIER(0, 0, 0, VXC_RM_TowardZero, 0), uniSumInt8_16x1);\n\ tmpSum += (tmpSum1);\n\ @@ -6634,7 +9195,6 @@ __kernel __attribute__((reqd_work_group_size(16, 1, 1))) void instance_norm_mean sqr = tmpSqr * inFlScale_s2;\n\ sum = tmpSum * input_fl_scale;\n\ }\n\ -\n\ lcl_sum[lidx] = sum;\n\ lcl_sqr[lidx] = sqr;\n\ barrier(CLK_LOCAL_MEM_FENCE);\n\ @@ -6649,8 +9209,6 @@ __kernel __attribute__((reqd_work_group_size(16, 1, 1))) void instance_norm_mean sum = 0; sqr = 0;\n\ for(int i = 0; i < 4; i++)\n\ {\n\ - //sum += lcl_sum[i];\n\ - //sqr += lcl_sqr[i];\n\ sum += dot(tmp_sum[i], one);\n\ sqr += dot(tmp_sqr[i], one);\n\ }\n\ @@ -6661,10 +9219,7 @@ __kernel __attribute__((reqd_work_group_size(16, 1, 1))) void instance_norm_mean }\n\ \n\ __kernel __attribute__((reqd_work_group_size(16, 1, 1))) void instance_norm_meanvari_I8_2D(\n\ - image2d_array_t input,\n\ - image2d_array_t output,\n\ - float eps,\n\ - int rsFlg)\n\ + image2d_array_t input, image2d_array_t output, float eps, int rsFlg)\n\ {\n\ int gidx = get_global_id(0) << 4;\n\ int lidx = get_local_id(0);\n\ @@ -6674,8 +9229,7 @@ __kernel __attribute__((reqd_work_group_size(16, 1, 1))) void instance_norm_mean int2 coord = (int2)(gidx, gidy);\n\ vxc_char16 src0;\n\ float sum = 0, sqr = 0;\n\ - int tmpSum = 0, tmpSqr = 0;\n\ - int tmpSum1, tmpSqr1;\n\ + int tmpSum = 0, tmpSqr = 0, tmpSum1, tmpSqr1;\n\ \n\ __local float lcl_sum[16];\n\ __local float lcl_sqr[16];\n\ @@ -6683,7 +9237,6 @@ __kernel __attribute__((reqd_work_group_size(16, 1, 1))) void instance_norm_mean int endH = gidy + height;\n\ if(gidx < width)\n\ {\n\ - tmpSqr = 0;\n\ for(; coord.y < endH;)\n\ {\n\ VXC_ReadImage(src0, input, coord, VXC_5BITOFFSET_XY(0, 0),\n\ @@ -6712,8 +9265,6 @@ __kernel __attribute__((reqd_work_group_size(16, 1, 1))) void instance_norm_mean sum = 0; sqr = 0;\n\ for(int i = 0; i < 4; i++)\n\ {\n\ - //sum += lcl_sum[i];\n\ - //sqr += lcl_sqr[i];\n\ sum += dot(tmp_sum[i], one);\n\ sqr += dot(tmp_sqr[i], one);\n\ }\n\ @@ -6724,94 +9275,81 @@ __kernel __attribute__((reqd_work_group_size(16, 1, 1))) void instance_norm_mean }\n\ \n\ __kernel __attribute__((reqd_work_group_size(16, 1, 1))) void instance_norm_I8toF16(\n\ - image2d_array_t input,\n\ - image2d_array_t bias,\n\ - image2d_array_t scale,\n\ - image2d_array_t meanVari,\n\ - image2d_array_t output,\n\ - float eps,\n\ - int rsFlg)\n\ + image2d_array_t input, image2d_array_t bias, image2d_array_t scale, image2d_t meanVari,\n\ + image2d_array_t output, float eps, int rsFlg)\n\ {\n\ int gidz = get_global_id(1);\n\ int4 coord = 
(int4)(get_global_id(0), 0, gidz, 0);\n\ int4 coord_para = (int4)(gidz, 0, 0, 0);\n\ vxc_char16 src0;\n\ - vxc_short8 src1;\n\ - vxc_half8 scale_h;\n\ + vxc_short8 src1, outval;\n\ + vxc_half8 scale_h, dst;\n\ float scale_vari, bias_val;\n\ vxc_float4 bias_f, scale_f, mean_vari = (vxc_float4)(0);\n\ \n\ VXC_ReadImage(src1, scale, coord_para.xy, VXC_5BITOFFSET_XY(0, 0),\\\n\ VXC_MODIFIER(0, 0, 0, VXC_RM_TowardZero, 0));\n\ _viv_asm(COPY, scale_h, src1, 16);\n\ - VXC_DP4x4(scale_f, scale_h, scale_h, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0),\\\n\ - UniFP16toFP32Lo4_dp4x4);\n\ -\n\ + VXC_DP4x4(scale_f, scale_h, scale_h, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), UniFP16toFP32Lo4_dp4x4);\n\ bias_f = read_imagef(bias, coord_para);\n\ \n\ - coord_para.x = 0;\n\ - coord_para.y = gidz;\n\ for(int i = 0; i < group_num; i++)\n\ {\n\ - mean_vari += read_imagef(meanVari, coord_para);\n\ - coord_para.x += 4;\n\ + mean_vari += read_imagef(meanVari, coord_para.yx);\n\ + coord_para.y += 4;\n\ }\n\ mean_vari *= dimRatio;\n\ mean_vari.s1 = mean_vari.s1 - mean_vari.s0 * mean_vari.s0 + eps;\n\ mean_vari.s1 = rsqrt(mean_vari.s1);\n\ \n\ scale_vari = scale_f.s0 * mean_vari.s1;\n\ - vxc_float4 tmpData0, tmpData1, tmpData2, tmpData3;\n\ - vxc_short8 outval;\n\ + vxc_float4 tmpData0, tmpData1, tmpData2, tmpData3, norm;\n\ half4 tmpVal0, tmpVal1;\n\ float alpha = input_fl_scale * scale_vari;\n\ bias_val = (bias_f.s0 - scale_vari * mean_vari.s0);\n\ - vxc_half8 dst;\n\ +\n\ + coord_para = coord;\n\ + int8 input_desc, output_desc;\n\ + _viv_asm(COPY, input_desc, input, sizeof(input_desc));\n\ + int baseAddr_a = (int)get_global_id(1) * input_desc.s4 + input_desc.s0;\n\ + _viv_asm(MOV, coord.z, baseAddr_a);\n\ +\n\ + _viv_asm(COPY, output_desc, output, sizeof(output_desc));\n\ + int baseAddr = (int)get_global_id(1) * output_desc.s4 + output_desc.s0;\n\ + _viv_asm(MOV, coord_para.z, baseAddr);\n\ \n\ for(coord.y = 0; coord.y < height;)\n\ {\n\ - coord_para = coord;\n\ - VXC_ReadImage2DArray(src0, input, coord, VXC_5BITOFFSET_XY(0, 0),\\\n\ - VXC_MODIFIER(0, 15, 0, VXC_RM_TowardZero, 0));\n\ + VXC_OP4(img_load_3d, src0, input, coord, VXC_5BITOFFSET_XY(0, 0), \\\n\ + VXC_MODIFIER(0, 15, 0, VXC_RM_TowardZero, 0));\n\ + coord_para.xy = coord.xy;\n\ coord.y++;\n\ - VXC_DP4x4(tmpData0, src0, src0, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0),\\\n\ - uniConvertDirInt8Fp32_4x4);\n\ - VXC_DP4x4(tmpData1, src0, src0, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0),\\\n\ - uniConvertEndInt8Fp32_4x4);\n\ - VXC_DP4x4(tmpData2, src0, src0, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0),\\\n\ - uniConvertTrdInt8Fp32_4x4);\n\ - VXC_DP4x4(tmpData3, src0, src0, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0),\\\n\ - uniConvertFthInt8Fp32_4x4);\n\ + VXC_DP4x4(tmpData0, src0, src0, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniConvertDirInt8Fp32_4x4);\n\ + VXC_DP4x4(tmpData1, src0, src0, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniConvertEndInt8Fp32_4x4);\n\ + VXC_DP4x4(tmpData2, src0, src0, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniConvertTrdInt8Fp32_4x4);\n\ + VXC_DP4x4(tmpData3, src0, src0, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniConvertFthInt8Fp32_4x4);\n\ \n\ - vxc_float4 norm;\n\ norm = alpha * tmpData0 + bias_val;\n\ _viv_asm(CONV, tmpVal0, norm);\n\ norm = alpha * tmpData1 + bias_val;\n\ _viv_asm(CONV, tmpVal1, norm);\n\ - VXC_DP2x8(dst, tmpVal0, tmpVal1, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0),\\\n\ - uniConvertHalfToFp16_2x8);\n\ + VXC_DP2x8(dst, tmpVal0, tmpVal1, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0), 
uniConvertHalfToFp16_2x8);\n\ _viv_asm(COPY, outval, dst, 16);\n\ - VXC_WriteImage2DArray(output, coord_para, outval, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0));\n\ + VXC_OP4_NoDest(img_store_3d, output, coord_para, outval, VXC_MODIFIER(0, 7, 0,VXC_RM_TowardZero, 0));\n\ coord_para.x += 8;\n\ norm = alpha * tmpData2 + bias_val;\n\ _viv_asm(CONV, tmpVal0, norm);\n\ norm = alpha * tmpData3 + bias_val;\n\ _viv_asm(CONV, tmpVal1, norm);\n\ - VXC_DP2x8(dst, tmpVal0, tmpVal1, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0),\\\n\ - uniConvertHalfToFp16_2x8);\n\ + VXC_DP2x8(dst, tmpVal0, tmpVal1, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0), uniConvertHalfToFp16_2x8);\n\ _viv_asm(COPY, outval, dst, 16);\n\ - VXC_WriteImage2DArray(output, coord_para, outval, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0));\n\ + VXC_OP4_NoDest(img_store_3d, output, coord_para, outval, VXC_MODIFIER(0, 7, 0,VXC_RM_TowardZero, 0));\n\ }\n\ }\n\ \n\ __kernel __attribute__((reqd_work_group_size(16, 1, 1))) void instance_norm_I8toF16_2D(\n\ - image2d_array_t input,\n\ - image2d_array_t bias,\n\ - image2d_array_t scale,\n\ - image2d_array_t meanVari,\n\ - image2d_array_t output,\n\ - float eps,\n\ - int rsFlg)\n\ + image2d_array_t input, image2d_array_t bias, image2d_array_t scale, image2d_t meanVari,\n\ + image2d_array_t output, float eps, int rsFlg)\n\ {\n\ int gidz = get_global_id(1);\n\ int gidy = gidz * height;\n\ @@ -6819,59 +9357,48 @@ __kernel __attribute__((reqd_work_group_size(16, 1, 1))) void instance_norm_I8to int4 coord_para = (int4)(gidz, 0, 0, 0);\n\ int endH = gidy + height;\n\ vxc_char16 src0;\n\ - vxc_short8 src1;\n\ - vxc_half8 scale_h;\n\ + vxc_short8 src1, outval;\n\ + vxc_half8 scale_h, dst;\n\ float scale_vari, bias_val;\n\ vxc_float4 bias_f, scale_f, mean_vari = (vxc_float4)(0);\n\ \n\ VXC_ReadImage(src1, scale, coord_para.xy, VXC_5BITOFFSET_XY(0, 0),\\\n\ VXC_MODIFIER(0, 0, 0, VXC_RM_TowardZero, 0));\n\ _viv_asm(COPY, scale_h, src1, 16);\n\ - VXC_DP4x4(scale_f, scale_h, scale_h, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0),\\\n\ - UniFP16toFP32Lo4_dp4x4);\n\ + VXC_DP4x4(scale_f, scale_h, scale_h, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), UniFP16toFP32Lo4_dp4x4);\n\ \n\ bias_f = read_imagef(bias, coord_para);\n\ \n\ - coord_para.x = 0;\n\ - coord_para.y = gidz;\n\ for(int i = 0; i < group_num; i++)\n\ {\n\ - mean_vari += read_imagef(meanVari, coord_para);\n\ - coord_para.x += 4;\n\ + mean_vari += read_imagef(meanVari, coord_para.yx);\n\ + coord_para.y += 4;\n\ }\n\ mean_vari *= dimRatio;\n\ mean_vari.s1 = mean_vari.s1 - mean_vari.s0 * mean_vari.s0 + eps;\n\ mean_vari.s1 = rsqrt(mean_vari.s1);\n\ \n\ scale_vari = scale_f.s0 * mean_vari.s1;\n\ - vxc_float4 tmpData0, tmpData1, tmpData2, tmpData3;\n\ - vxc_short8 outval;\n\ + vxc_float4 tmpData0, tmpData1, tmpData2, tmpData3, norm;\n\ half4 tmpVal0, tmpVal1;\n\ float alpha = input_fl_scale * scale_vari;\n\ bias_val = (bias_f.s0 - scale_vari * mean_vari.s0);\n\ - vxc_half8 dst;\n\ \n\ for(; coord.y < endH;)\n\ {\n\ - coord_para = coord;\n\ VXC_ReadImage(src0, input, coord.xy, VXC_5BITOFFSET_XY(0, 0),\\\n\ VXC_MODIFIER(0, 15, 0, VXC_RM_TowardZero, 0));\n\ + coord_para = coord;\n\ coord.y++;\n\ - VXC_DP4x4(tmpData0, src0, src0, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0),\\\n\ - uniConvertDirInt8Fp32_4x4);\n\ - VXC_DP4x4(tmpData1, src0, src0, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0),\\\n\ - uniConvertEndInt8Fp32_4x4);\n\ - VXC_DP4x4(tmpData2, src0, src0, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0),\\\n\ - uniConvertTrdInt8Fp32_4x4);\n\ - VXC_DP4x4(tmpData3, 
src0, src0, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0),\\\n\ - uniConvertFthInt8Fp32_4x4);\n\ - vxc_float4 norm;\n\ + VXC_DP4x4(tmpData0, src0, src0, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniConvertDirInt8Fp32_4x4);\n\ + VXC_DP4x4(tmpData1, src0, src0, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniConvertEndInt8Fp32_4x4);\n\ + VXC_DP4x4(tmpData2, src0, src0, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniConvertTrdInt8Fp32_4x4);\n\ + VXC_DP4x4(tmpData3, src0, src0, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniConvertFthInt8Fp32_4x4);\n\ norm = alpha * tmpData0 + bias_val;\n\ _viv_asm(CONV, tmpVal0, norm);\n\ norm = alpha * tmpData1 + bias_val;\n\ _viv_asm(CONV, tmpVal1, norm);\n\ - VXC_DP2x8(dst, tmpVal0, tmpVal1, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0),\\\n\ - uniConvertHalfToFp16_2x8);\n\ + VXC_DP2x8(dst, tmpVal0, tmpVal1, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0), uniConvertHalfToFp16_2x8);\n\ _viv_asm(COPY, outval, dst, 16);\n\ VXC_WriteImage(output, coord_para.xy, outval, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0));\n\ coord_para.x += 8;\n\ @@ -6879,21 +9406,15 @@ __kernel __attribute__((reqd_work_group_size(16, 1, 1))) void instance_norm_I8to _viv_asm(CONV, tmpVal0, norm);\n\ norm = alpha * tmpData3 + bias_val;\n\ _viv_asm(CONV, tmpVal1, norm);\n\ - VXC_DP2x8(dst, tmpVal0, tmpVal1, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0),\\\n\ - uniConvertHalfToFp16_2x8);\n\ + VXC_DP2x8(dst, tmpVal0, tmpVal1, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0), uniConvertHalfToFp16_2x8);\n\ _viv_asm(COPY, outval, dst, 16);\n\ VXC_WriteImage(output, coord_para.xy, outval, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0));\n\ }\n\ }\n\ \n\ __kernel __attribute__((reqd_work_group_size(16, 1, 1))) void instance_norm_I8toI8(\n\ - image2d_array_t input,\n\ - image2d_array_t bias,\n\ - image2d_array_t scale,\n\ - image2d_array_t meanVari,\n\ - image2d_array_t output,\n\ - float eps,\n\ - int rsFlg)\n\ + image2d_array_t input, image2d_array_t bias, image2d_array_t scale, image2d_t meanVari,\n\ + image2d_array_t output, float eps, int rsFlg)\n\ {\n\ int gidz = get_global_id(1);\n\ int4 coord = (int4)(get_global_id(0), 0, gidz, 0);\n\ @@ -6910,12 +9431,10 @@ __kernel __attribute__((reqd_work_group_size(16, 1, 1))) void instance_norm_I8to VXC_DP4x4(scale_f, scale_h, scale_h, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), UniFP16toFP32Lo4_dp4x4);\n\ \n\ bias_f = read_imagef(bias, coord_para);\n\ - coord_para.x = 0;\n\ - coord_para.y = gidz;\n\ for(int i = 0; i < group_num; i++)\n\ {\n\ - mean_vari += read_imagef(meanVari, coord_para);\n\ - coord_para.x += 4;\n\ + mean_vari += read_imagef(meanVari, coord_para.yx);\n\ + coord_para.y += 4;\n\ }\n\ mean_vari *= dimRatio;\n\ mean_vari.s1 = mean_vari.s1 - mean_vari.s0 * mean_vari.s0 + eps;\n\ @@ -6923,47 +9442,44 @@ __kernel __attribute__((reqd_work_group_size(16, 1, 1))) void instance_norm_I8to \n\ scale_vari = scale_f.s0 * mean_vari.s1;\n\ vxc_int4 tmpVal0, tmpVal1;\n\ - vxc_float4 tmpData0, tmpData1, tmpData2, tmpData3;\n\ + vxc_float4 tmpData0, tmpData1, tmpData2, tmpData3, norm;\n\ float alpha = inOut_fl_scale * scale_vari;\n\ bias_val = (bias_f.s0 - scale_vari * mean_vari.s0) * output_fl_scale;\n\ +\n\ + int8 input_desc, output_desc;\n\ + _viv_asm(COPY, input_desc, input, sizeof(input_desc));\n\ + int baseAddr_a = (int)get_global_id(1) * input_desc.s4 + input_desc.s0;\n\ + _viv_asm(MOV, coord.z, baseAddr_a);\n\ +\n\ + _viv_asm(COPY, output_desc, output, sizeof(output_desc));\n\ + int baseAddr = (int)get_global_id(1) * output_desc.s4 + 
output_desc.s0;\n\ + _viv_asm(MOV, coord.w, baseAddr);\n\ \n\ for(coord.y = 0; coord.y < height; coord.y++)\n\ {\n\ - VXC_ReadImage2DArray(src0, input, coord, VXC_5BITOFFSET_XY(0, 0),\\\n\ - VXC_MODIFIER(0, 15, 0, VXC_RM_TowardZero, 0));\n\ - VXC_DP4x4(tmpData0, src0, src0, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0),\\\n\ - uniConvertDirInt8Fp32_4x4);\n\ - VXC_DP4x4(tmpData1, src0, src0, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0),\\\n\ - uniConvertEndInt8Fp32_4x4);\n\ - VXC_DP4x4(tmpData2, src0, src0, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0),\\\n\ - uniConvertTrdInt8Fp32_4x4);\n\ - VXC_DP4x4(tmpData3, src0, src0, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0),\\\n\ - uniConvertFthInt8Fp32_4x4);\n\ - vxc_float4 norm;\n\ + VXC_OP4(img_load_3d, src0, input, coord.xyzz, VXC_5BITOFFSET_XY(0, 0), \\\n\ + VXC_MODIFIER(0, 15, 0, VXC_RM_TowardZero, 0));\n\ + VXC_DP4x4(tmpData0, src0, src0, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniConvertDirInt8Fp32_4x4);\n\ + VXC_DP4x4(tmpData1, src0, src0, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniConvertEndInt8Fp32_4x4);\n\ + VXC_DP4x4(tmpData2, src0, src0, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniConvertTrdInt8Fp32_4x4);\n\ + VXC_DP4x4(tmpData3, src0, src0, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniConvertFthInt8Fp32_4x4);\n\ norm = tmpData0 * alpha + bias_val;\n\ tmpVal0 = convert_int4_rte(norm);\n\ norm = tmpData1 * alpha + bias_val;\n\ tmpVal1 = convert_int4_rte(norm);\n\ - VXC_DP2x8(src2, tmpVal0, tmpVal1, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 1),\\\n\ - uniConvertInt32toUint8_2x8);\n\ + VXC_DP2x8(src2, tmpVal0, tmpVal1, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 1), uniConvertInt32toUint8_2x8);\n\ norm = tmpData2 * alpha + bias_val;\n\ tmpVal0 = convert_int4_rte(norm);\n\ norm = tmpData3 * alpha + bias_val;\n\ tmpVal1 = convert_int4_rte(norm);\n\ - VXC_DP2x8(src2, tmpVal0, tmpVal1, VXC_MODIFIER(8, 15, 0, VXC_RM_TowardZero, 1),\\\n\ - uniConvertInt32toUint8_2x8);\n\ - VXC_WriteImage2DArray(output, coord, src2, VXC_MODIFIER(0, 15, 0, VXC_RM_TowardZero, 0));\n\ + VXC_DP2x8(src2, tmpVal0, tmpVal1, VXC_MODIFIER(8, 15, 0, VXC_RM_TowardZero, 1), uniConvertInt32toUint8_2x8);\n\ + VXC_OP4_NoDest(img_store_3d, output, coord.xyww, src2, VXC_MODIFIER(0, 15, 0,VXC_RM_TowardZero, 0));\n\ }\n\ }\n\ \n\ __kernel __attribute__((reqd_work_group_size(16, 1, 1))) void instance_norm_I8toI8_2D(\n\ - image2d_array_t input,\n\ - image2d_array_t bias,\n\ - image2d_array_t scale,\n\ - image2d_array_t meanVari,\n\ - image2d_array_t output,\n\ - float eps,\n\ - int rsFlg)\n\ + image2d_array_t input, image2d_array_t bias, image2d_array_t scale, image2d_t meanVari,\n\ + image2d_array_t output, float eps, int rsFlg)\n\ {\n\ int gidz = get_global_id(1);\n\ int gidy = gidz * height;\n\ @@ -6982,12 +9498,10 @@ __kernel __attribute__((reqd_work_group_size(16, 1, 1))) void instance_norm_I8to VXC_DP4x4(scale_f, scale_h, scale_h, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), UniFP16toFP32Lo4_dp4x4);\n\ \n\ bias_f = read_imagef(bias, coord_para);\n\ - coord_para.x = 0;\n\ - coord_para.y = gidz;\n\ for(int i = 0; i < group_num; i++)\n\ {\n\ - mean_vari += read_imagef(meanVari, coord_para);\n\ - coord_para.x += 4;\n\ + mean_vari += read_imagef(meanVari, coord_para.yx);\n\ + coord_para.y += 4;\n\ }\n\ mean_vari *= dimRatio;\n\ mean_vari.s1 = mean_vari.s1 - mean_vari.s0 * mean_vari.s0 + eps;\n\ @@ -6995,39 +9509,715 @@ __kernel __attribute__((reqd_work_group_size(16, 1, 1))) void instance_norm_I8to \n\ scale_vari = scale_f.s0 * mean_vari.s1;\n\ vxc_int4 tmpVal0, 
tmpVal1;\n\ - vxc_float4 tmpData0, tmpData1, tmpData2, tmpData3;\n\ + vxc_float4 tmpData0, tmpData1, tmpData2, tmpData3, norm;\n\ float alpha = inOut_fl_scale * scale_vari;\n\ bias_val = (bias_f.s0 - scale_vari * mean_vari.s0) * output_fl_scale;\n\ \n\ for(; coord.y < endH; coord.y++)\n\ {\n\ - VXC_ReadImage(src0, input, coord, VXC_5BITOFFSET_XY(0, 0),\\\n\ - VXC_MODIFIER(0, 15, 0, VXC_RM_TowardZero, 0));\n\ + VXC_ReadImage(src0, input, coord, VXC_5BITOFFSET_XY(0, 0), VXC_MODIFIER(0, 15, 0, VXC_RM_TowardZero, 0));\n\ + VXC_DP4x4(tmpData0, src0, src0, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniConvertDirInt8Fp32_4x4);\n\ + VXC_DP4x4(tmpData1, src0, src0, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniConvertEndInt8Fp32_4x4);\n\ + VXC_DP4x4(tmpData2, src0, src0, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniConvertTrdInt8Fp32_4x4);\n\ + VXC_DP4x4(tmpData3, src0, src0, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniConvertFthInt8Fp32_4x4);\n\ + norm = tmpData0 * alpha + bias_val;\n\ + tmpVal0 = convert_int4_rte(norm);\n\ + norm = tmpData1 * alpha + bias_val;\n\ + tmpVal1 = convert_int4_rte(norm);\n\ + VXC_DP2x8(src2, tmpVal0, tmpVal1, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 1), uniConvertInt32toUint8_2x8);\n\ + norm = tmpData2 * alpha + bias_val;\n\ + tmpVal0 = convert_int4_rte(norm);\n\ + norm = tmpData3 * alpha + bias_val;\n\ + tmpVal1 = convert_int4_rte(norm);\n\ + VXC_DP2x8(src2, tmpVal0, tmpVal1, VXC_MODIFIER(8, 15, 0, VXC_RM_TowardZero, 1), uniConvertInt32toUint8_2x8);\n\ + VXC_WriteImage(output, coord, src2, VXC_MODIFIER(0, 15, 0, VXC_RM_TowardZero, 0));\n\ + }\n\ +}\n\ +"; /* end of instance_normalization_i8_vx*/ + +static const char instance_normalization_scale_f32_vx[] = "#include \"cl_viv_vx_ext.h\"\n\ +\n\ +_viv_uniform int height;\n\ +_viv_uniform float dimRatio;\n\ +_viv_uniform int group_num;\n\ +_viv_uniform VXC_512Bits uniConvertInt32toUint8_2x8;\n\ +\n\ +_viv_uniform VXC_512Bits uniConvert1stUint8SubZpToFp32_4x4;\n\ +_viv_uniform VXC_512Bits uniConvert2ndUint8SubZpToFp32_4x4;\n\ +_viv_uniform VXC_512Bits uniConvert3rdUint8SubZpToFp32_4x4;\n\ +_viv_uniform VXC_512Bits uniConvert4thUint8SubZpToFp32_4x4;\n\ +_viv_uniform int inputZP;\n\ +_viv_uniform float scale_inOut;\n\ +_viv_uniform float outputScale;\n\ +_viv_uniform int output_ZP;\n\ +_viv_uniform float inOut_fl_scale;\n\ +_viv_uniform float output_fl_scale;\n\ +_viv_uniform VXC_512Bits uniConvertInt16Fp32Fst_4x4;\n\ +_viv_uniform VXC_512Bits uniConvertInt16Fp32Secd_4x4;\n\ +_viv_uniform VXC_512Bits uniConvertInt32toInt16_2x8;\n\ +\n\ +#define INSTANCENORM_8BITS_F32(src1_type_name, read_type) \\\n\ +__kernel void instance_norm_##src1_type_name##F32to##src1_type_name( \\\n\ + image2d_array_t input, image2d_t bias, image2d_t scale, image2d_t meanVari, \\\n\ + image2d_array_t output, float eps, int rsFlg) \\\n\ +{ \\\n\ + int gidz = get_global_id(1); \\\n\ + int4 coord = (int4)(get_global_id(0), 0, gidz, 0); \\\n\ + int2 coord_para = (int2)(gidz, 0); \\\n\ + read_type src0, src2; \\\n\ + float scale_vari, bias_val; \\\n\ + vxc_float4 mean_vari = (vxc_float4)(0); \\\n\ + \\\n\ + Image img1 = create_image_from_image2d(bias, 4); \\\n\ + Image img2 = create_image_from_image2d(scale, 4); \\\n\ + Image img3 = create_image_from_image2d(meanVari, 4); \\\n\ + __global float* bias_ptr = (__global float*)img1.ptr; \\\n\ + __global float* scal_ptr = (__global float*)img2.ptr; \\\n\ + __global uchar* sumVari_ptr = (__global uchar*)get_image_ptr_from_coord(img3, coord_para.yx); \\\n\ + __global float4* vari_ptr = (__global 
float4*)sumVari_ptr; \\\n\ + \\\n\ + float bval = bias_ptr[gidz]; \\\n\ + float sval = scal_ptr[gidz]; \\\n\ + \\\n\ + for(int i = 0; i < group_num; i++) \\\n\ + { \\\n\ + mean_vari += vari_ptr[i]; \\\n\ + } \\\n\ + mean_vari *= dimRatio; \\\n\ + mean_vari.s1 = mean_vari.s1 - mean_vari.s0 * mean_vari.s0 + eps; \\\n\ + mean_vari.s1 = rsqrt(mean_vari.s1); \\\n\ + \\\n\ + scale_vari = sval * mean_vari.s1; \\\n\ + short zp = inputZP; \\\n\ + vxc_int4 tmpVal0, tmpVal1; \\\n\ + vxc_float4 tmpData0, tmpData1, tmpData2, tmpData3, norm; \\\n\ + float alpha = scale_inOut * scale_vari; \\\n\ + bias_val = (bval - scale_vari * mean_vari.s0) * outputScale + output_ZP; \\\n\ + \\\n\ + int8 input_desc, output_desc; \\\n\ + _viv_asm(COPY, input_desc, input, sizeof(input_desc)); \\\n\ + int baseAddr_a = (int)get_global_id(1) * input_desc.s4 + input_desc.s0; \\\n\ + _viv_asm(MOV, coord.z, baseAddr_a); \\\n\ + _viv_asm(COPY, output_desc, output, sizeof(output_desc)); \\\n\ + int baseAddr = (int)get_global_id(1) * output_desc.s4 + output_desc.s0; \\\n\ + _viv_asm(MOV, coord.w, baseAddr); \\\n\ + \\\n\ + for(coord.y = 0; coord.y < height; coord.y++) \\\n\ + { \\\n\ + VXC_OP4(img_load_3d, src0, input, coord.xyzz, VXC_5BITOFFSET_XY(0, 0), \\\n\ + VXC_MODIFIER(0, 15, 0, VXC_RM_TowardZero, 0)); \\\n\ + VXC_DP4x4(tmpData0, src0, zp, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), \\\n\ + uniConvert1stUint8SubZpToFp32_4x4); \\\n\ + VXC_DP4x4(tmpData1, src0, zp, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), \\\n\ + uniConvert2ndUint8SubZpToFp32_4x4); \\\n\ + VXC_DP4x4(tmpData2, src0, zp, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), \\\n\ + uniConvert3rdUint8SubZpToFp32_4x4); \\\n\ + VXC_DP4x4(tmpData3, src0, zp, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), \\\n\ + uniConvert4thUint8SubZpToFp32_4x4); \\\n\ + norm = tmpData0 * alpha + bias_val; \\\n\ + tmpVal0 = convert_int4_rte(norm); \\\n\ + norm = tmpData1 * alpha + bias_val; \\\n\ + tmpVal1 = convert_int4_rte(norm); \\\n\ + VXC_DP2x8(src2, tmpVal0, tmpVal1, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 1), uniConvertInt32toUint8_2x8); \\\n\ + norm = tmpData2 * alpha + bias_val; \\\n\ + tmpVal0 = convert_int4_rte(norm); \\\n\ + norm = tmpData3 * alpha + bias_val; \\\n\ + tmpVal1 = convert_int4_rte(norm); \\\n\ + VXC_DP2x8(src2, tmpVal0, tmpVal1, VXC_MODIFIER(8, 15, 0, VXC_RM_TowardZero, 1), uniConvertInt32toUint8_2x8); \\\n\ + VXC_OP4_NoDest(img_store_3d, output, coord.xyww, src2, VXC_MODIFIER(0, 15, 0,VXC_RM_TowardZero, 0)); \\\n\ + } \\\n\ +}\n\ +INSTANCENORM_8BITS_F32(U8, vxc_uchar16)\n\ +INSTANCENORM_8BITS_F32(I8, vxc_char16)\n\ +\n\ +#define INSTANCENORM_8BITS_F32_2D(src1_type_name, read_type) \\\n\ +__kernel void instance_norm_##src1_type_name##F32to##src1_type_name##_2D( \\\n\ + image2d_array_t input, image2d_t bias, image2d_t scale, image2d_t meanVari, \\\n\ + image2d_array_t output, float eps, int rsFlg) \\\n\ +{ \\\n\ + int gidz = get_global_id(1); \\\n\ + int gidy = gidz * height; \\\n\ + int2 coord = (int2)(get_global_id(0), gidy); \\\n\ + int2 coord_para = (int2)(gidz, 0); \\\n\ + int endH = gidy + height; \\\n\ + read_type src0, src2; \\\n\ + float scale_vari, bias_val; \\\n\ + vxc_float4 mean_vari = (vxc_float4)(0); \\\n\ + \\\n\ + Image img1 = create_image_from_image2d(bias, 4); \\\n\ + Image img2 = create_image_from_image2d(scale, 4); \\\n\ + Image img3 = create_image_from_image2d(meanVari, 4); \\\n\ + __global float* bias_ptr = (__global float*)img1.ptr; \\\n\ + __global float* scal_ptr = (__global float*)img2.ptr; \\\n\ + __global uchar* sumVari_ptr = 
(__global uchar*)get_image_ptr_from_coord(img3, coord_para.yx); \\\n\ + __global float4* vari_ptr = (__global float4*)sumVari_ptr; \\\n\ + \\\n\ + float bval = bias_ptr[gidz]; \\\n\ + float sval = scal_ptr[gidz]; \\\n\ + \\\n\ + for(int i = 0; i < group_num; i++) \\\n\ + { \\\n\ + mean_vari += vari_ptr[i]; \\\n\ + } \\\n\ + \\\n\ + mean_vari *= dimRatio; \\\n\ + mean_vari.s1 = mean_vari.s1 - mean_vari.s0 * mean_vari.s0 + eps; \\\n\ + mean_vari.s1 = rsqrt(mean_vari.s1); \\\n\ + \\\n\ + scale_vari = sval * mean_vari.s1; \\\n\ + short zp = inputZP; \\\n\ + vxc_int4 tmpVal0, tmpVal1; \\\n\ + vxc_float4 tmpData0, tmpData1, tmpData2, tmpData3, norm; \\\n\ + float alpha = scale_inOut * scale_vari; \\\n\ + bias_val = (bval - scale_vari * mean_vari.s0) * outputScale + output_ZP; \\\n\ + \\\n\ + for(; coord.y < endH; coord.y++) \\\n\ + { \\\n\ + VXC_ReadImage(src0, input, coord, VXC_5BITOFFSET_XY(0, 0), VXC_MODIFIER(0, 15, 0, VXC_RM_TowardZero, 0)); \\\n\ + VXC_DP4x4(tmpData0, src0, zp, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), \\\n\ + uniConvert1stUint8SubZpToFp32_4x4); \\\n\ + VXC_DP4x4(tmpData1, src0, zp, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), \\\n\ + uniConvert2ndUint8SubZpToFp32_4x4); \\\n\ + VXC_DP4x4(tmpData2, src0, zp, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), \\\n\ + uniConvert3rdUint8SubZpToFp32_4x4); \\\n\ + VXC_DP4x4(tmpData3, src0, zp, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), \\\n\ + uniConvert4thUint8SubZpToFp32_4x4); \\\n\ + norm = tmpData0 * alpha + bias_val; \\\n\ + tmpVal0 = convert_int4_rte(norm); \\\n\ + norm = tmpData1 * alpha + bias_val; \\\n\ + tmpVal1 = convert_int4_rte(norm); \\\n\ + VXC_DP2x8(src2, tmpVal0, tmpVal1, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 1), uniConvertInt32toUint8_2x8); \\\n\ + norm = tmpData2 * alpha + bias_val; \\\n\ + tmpVal0 = convert_int4_rte(norm); \\\n\ + norm = tmpData3 * alpha + bias_val; \\\n\ + tmpVal1 = convert_int4_rte(norm); \\\n\ + VXC_DP2x8(src2, tmpVal0, tmpVal1, VXC_MODIFIER(8, 15, 0, VXC_RM_TowardZero, 1), uniConvertInt32toUint8_2x8); \\\n\ + VXC_WriteImage(output, coord, src2, VXC_MODIFIER(0, 15, 0, VXC_RM_TowardZero, 0)); \\\n\ + } \\\n\ +}\n\ +INSTANCENORM_8BITS_F32_2D(U8, vxc_uchar16)\n\ +INSTANCENORM_8BITS_F32_2D(I8, vxc_char16)\n\ +\n\ +__kernel __attribute__((reqd_work_group_size(16, 1, 1))) void instance_norm_I16F32toI16(\n\ + image2d_array_t input, image2d_t bias, image2d_t scale, image2d_t meanVari,\n\ + image2d_array_t output, float eps, int rsFlg)\n\ +{\n\ + int gidz = get_global_id(1);\n\ + int4 coord = (int4)(get_global_id(0), 0, gidz, 0);\n\ + int2 coord_para = (int2)(gidz, 0);\n\ + vxc_short8 src0, src2;\n\ + float scale_vari, bias_val;\n\ + vxc_float4 mean_vari = (vxc_float4)(0);\n\ +\n\ + Image img1 = create_image_from_image2d(bias, 4);\n\ + Image img2 = create_image_from_image2d(scale, 4);\n\ + Image img3 = create_image_from_image2d(meanVari, 4);\n\ + __global float* bias_ptr = (__global float*)img1.ptr;\n\ + __global float* scal_ptr = (__global float*)img2.ptr;\n\ + __global uchar* sumVari_ptr = (__global uchar*)get_image_ptr_from_coord(img3, coord_para.yx);\n\ + __global float4* vari_ptr = (__global float4*)sumVari_ptr;\n\ +\n\ + float bval = bias_ptr[gidz];\n\ + float sval = scal_ptr[gidz];\n\ +\n\ + for(int i = 0; i < group_num; i++)\n\ + {\n\ + mean_vari += vari_ptr[i];\n\ + }\n\ +\n\ + mean_vari *= dimRatio;\n\ + mean_vari.s1 = mean_vari.s1 - mean_vari.s0 * mean_vari.s0 + eps;\n\ + mean_vari.s1 = rsqrt(mean_vari.s1);\n\ +\n\ + scale_vari = sval * mean_vari.s1;\n\ + vxc_int4 tmpVal0, tmpVal1;\n\ + 
vxc_float4 tmpData0, tmpData1;\n\ + float alpha = inOut_fl_scale * scale_vari;\n\ + bias_val = (bval - scale_vari * mean_vari.s0) * output_fl_scale;\n\ +\n\ + int8 input_desc, output_desc;\n\ + _viv_asm(COPY, input_desc, input, sizeof(input_desc));\n\ + int baseAddr_a = (int)get_global_id(1) * input_desc.s4 + input_desc.s0;\n\ + _viv_asm(MOV, coord.z, baseAddr_a);\n\ +\n\ + _viv_asm(COPY, output_desc, output, sizeof(output_desc));\n\ + int baseAddr = (int)get_global_id(1) * output_desc.s4 + output_desc.s0;\n\ + _viv_asm(MOV, coord.w, baseAddr);\n\ + for(coord.y = 0; coord.y < height; coord.y++)\n\ + {\n\ + VXC_OP4(img_load_3d, src0, input, coord.xyzz, VXC_5BITOFFSET_XY(0, 0), \\\n\ + VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0));\n\ VXC_DP4x4(tmpData0, src0, src0, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0),\\\n\ - uniConvertDirInt8Fp32_4x4);\n\ + uniConvertInt16Fp32Fst_4x4);\n\ VXC_DP4x4(tmpData1, src0, src0, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0),\\\n\ - uniConvertEndInt8Fp32_4x4);\n\ - VXC_DP4x4(tmpData2, src0, src0, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0),\\\n\ - uniConvertTrdInt8Fp32_4x4);\n\ - VXC_DP4x4(tmpData3, src0, src0, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0),\\\n\ - uniConvertFthInt8Fp32_4x4);\n\ + uniConvertInt16Fp32Secd_4x4);\n\ vxc_float4 norm;\n\ norm = tmpData0 * alpha + bias_val;\n\ tmpVal0 = convert_int4_rte(norm);\n\ norm = tmpData1 * alpha + bias_val;\n\ tmpVal1 = convert_int4_rte(norm);\n\ VXC_DP2x8(src2, tmpVal0, tmpVal1, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 1),\\\n\ - uniConvertInt32toUint8_2x8);\n\ - norm = tmpData2 * alpha + bias_val;\n\ - tmpVal0 = convert_int4_rte(norm);\n\ - norm = tmpData3 * alpha + bias_val;\n\ - tmpVal1 = convert_int4_rte(norm);\n\ - VXC_DP2x8(src2, tmpVal0, tmpVal1, VXC_MODIFIER(8, 15, 0, VXC_RM_TowardZero, 1),\\\n\ - uniConvertInt32toUint8_2x8);\n\ - VXC_WriteImage(output, coord, src2, VXC_MODIFIER(0, 15, 0, VXC_RM_TowardZero, 0));\n\ + uniConvertInt32toInt16_2x8);\n\ + VXC_OP4_NoDest(img_store_3d, output, coord.xyww, src2, \\\n\ + VXC_MODIFIER(0, 7, 0,VXC_RM_TowardZero, 0));\n\ }\n\ }\n\ -"; /* end of instance_normalization_i8_vx*/ +\n\ +__kernel __attribute__((reqd_work_group_size(16, 1, 1))) void instance_norm_I16F32toI16_2D(\n\ + image2d_t input, image2d_t bias, image2d_t scale,\n\ + image2d_t meanVari, image2d_array_t output, float eps, int rsFlg)\n\ +{\n\ + int gidz = get_global_id(1);\n\ + int gidy = gidz * height;\n\ + int2 coord = (int2)(get_global_id(0), gidy);\n\ + int2 coord_para = (int2)(gidz, 0);\n\ + int endH = gidy + height;\n\ + vxc_short8 src0, src2;\n\ + float scale_vari, bias_val;\n\ + vxc_float4 mean_vari = (vxc_float4)(0);\n\ +\n\ + Image img1 = create_image_from_image2d(bias, 4);\n\ + Image img2 = create_image_from_image2d(scale, 4);\n\ + Image img3 = create_image_from_image2d(meanVari, 4);\n\ + __global float* bias_ptr = (__global float*)img1.ptr;\n\ + __global float* scal_ptr = (__global float*)img2.ptr;\n\ + __global uchar* sumVari_ptr = (__global uchar*)get_image_ptr_from_coord(img3, coord_para.yx);\n\ + __global float4* vari_ptr = (__global float4*)sumVari_ptr;\n\ +\n\ + float bval = bias_ptr[gidz];\n\ + float sval = scal_ptr[gidz];\n\ +\n\ + for(int i = 0; i < group_num; i++)\n\ + {\n\ + mean_vari += vari_ptr[i];\n\ + }\n\ +\n\ + mean_vari *= dimRatio;\n\ + mean_vari.s1 = mean_vari.s1 - mean_vari.s0 * mean_vari.s0 + eps;\n\ + mean_vari.s1 = rsqrt(mean_vari.s1);\n\ +\n\ + scale_vari = sval * mean_vari.s1;\n\ + vxc_int4 tmpVal0, tmpVal1;\n\ + vxc_float4 tmpData0, tmpData1;\n\ + float alpha = 
inOut_fl_scale * scale_vari;\n\ + bias_val = (bval - scale_vari * mean_vari.s0) * output_fl_scale;\n\ +\n\ + for(; coord.y < endH; coord.y++)\n\ + {\n\ + VXC_ReadImage(src0, input, coord, VXC_5BITOFFSET_XY(0, 0),\\\n\ + VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0));\n\ + VXC_DP4x4(tmpData0, src0, src0, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0),\\\n\ + uniConvertInt16Fp32Fst_4x4);\n\ + VXC_DP4x4(tmpData1, src0, src0, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0),\\\n\ + uniConvertInt16Fp32Secd_4x4);\n\ + vxc_float4 norm;\n\ + norm = tmpData0 * alpha + bias_val;\n\ + tmpVal0 = convert_int4_rte(norm);\n\ + norm = tmpData1 * alpha + bias_val;\n\ + tmpVal1 = convert_int4_rte(norm);\n\ + VXC_DP2x8(src2, tmpVal0, tmpVal1, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 1),\\\n\ + uniConvertInt32toInt16_2x8);\n\ + VXC_WriteImage(output, coord, src2, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0));\n\ + }\n\ +}"; /* end of instance_normalization_scale_f32_vx*/ + +static const char instance_normalization_scale_f32_bf16_vx[] = "#include \"cl_viv_vx_ext.h\"\n\ +\n\ +_viv_uniform int width;\n\ +_viv_uniform int height;\n\ +_viv_uniform float dimRatio;\n\ +_viv_uniform int group_num;\n\ +_viv_uniform VXC_512Bits uniConvBF16toF32_Part0_2x8;\n\ +_viv_uniform VXC_512Bits uniConvBF16toF32_Part1_2x8;\n\ +_viv_uniform VXC_512Bits uniExtractOddData_2x8;\n\ +\n\ +constant vxc_short8 zero = (vxc_short8)(0, 0, 0, 0, 0, 0, 0, 0);\n\ +constant float4 one = (float4)(1.0, 1.0, 1.0, 1.0);\n\ +\n\ +__kernel __attribute__((reqd_work_group_size(16, 1, 1))) void instance_norm_meanvari_BF16(\n\ + image2d_array_t input, image2d_array_t output, float eps, int rsFlg)\n\ +{\n\ + int gidx = get_global_id(0) << 3;\n\ + int lidx = get_local_id(0);\n\ + int gidz = get_global_id(1);\n\ + int4 coord = (int4)(gidx, 0, gidz, 0);\n\ + vxc_short8 src0, src1, src2;\n\ + float4 srcA, srcB;\n\ + vxc_float sum = 0, sqr = 0;\n\ +\n\ + __local float lcl_sum[16];\n\ + __local float lcl_sqr[16];\n\ +\n\ + int8 input_desc;\n\ + _viv_asm(COPY, input_desc, input, sizeof(input_desc));\n\ + int baseAddr_a = (int)get_global_id(1) * input_desc.s4 + input_desc.s0;\n\ + _viv_asm(MOV, coord.z, baseAddr_a);\n\ +\n\ + if(gidx < width)\n\ + {\n\ + for(coord.y = 0; coord.y < height;)\n\ + {\n\ + VXC_OP4(img_load_3d, src0, input, coord, VXC_5BITOFFSET_XY(0, 0), \\\n\ + VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0));\n\ + coord.y++;\n\ + VXC_DP2x8(src1, src0, zero, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0),\n\ + uniConvBF16toF32_Part0_2x8);\n\ + VXC_DP2x8(src2, src0, zero, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0),\n\ + uniConvBF16toF32_Part1_2x8);\n\ + _viv_asm(COPY, srcA, src1, 16);\n\ + _viv_asm(COPY, srcB, src2, 16);\n\ + sum += dot(srcA, one) + dot(srcB, one);\n\ + sqr += dot(srcA * srcA, one) + dot(srcB * srcB, one);\n\ + }\n\ + }\n\ +\n\ + lcl_sum[lidx] = sum;\n\ + lcl_sqr[lidx] = sqr;\n\ + barrier(CLK_LOCAL_MEM_FENCE);\n\ +\n\ + int4 coord_out = (int4)(get_group_id(0) << 2, gidz, 0, 0);\n\ + if(lidx == 0)\n\ + {\n\ + __local float4* tmp_sum = (__local float4*)lcl_sum;\n\ + __local float4* tmp_sqr = (__local float4*)lcl_sqr;\n\ +\n\ + sum = 0;\n\ + sqr = 0;\n\ + for(int i = 0; i < 4; i++)\n\ + {\n\ + sum += dot(tmp_sum[i], one);\n\ + sqr += dot(tmp_sqr[i], one);\n\ + }\n\ +\n\ + float4 data = (float4)(sum, sqr, 0, 0);\n\ + write_imagef(output, coord_out, data);\n\ + }\n\ +}\n\ +\n\ +__kernel __attribute__((reqd_work_group_size(16, 1, 1))) void instance_norm_meanvari_BF16_2D(\n\ + image2d_array_t input, image2d_array_t output, float eps, int rsFlg)\n\ +{\n\ + int 
gidx = get_global_id(0) << 3;\n\ + int lidx = get_local_id(0);\n\ + int gidz = get_global_id(1);\n\ + int gidy = gidz * height;\n\ +\n\ + int2 coord = (int2)(gidx, gidy);\n\ + vxc_short8 src0, src1, src2;\n\ + float4 srcA, srcB;\n\ + vxc_float sum = 0, sqr = 0;\n\ +\n\ + __local float lcl_sum[16];\n\ + __local float lcl_sqr[16];\n\ +\n\ + int endH = gidy + height;\n\ + if(gidx < width)\n\ + {\n\ + for(; coord.y < endH;)\n\ + {\n\ + VXC_ReadImage(src0, input, coord, VXC_5BITOFFSET_XY(0, 0),\n\ + VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0));\n\ + coord.y++;\n\ + VXC_DP2x8(src1, src0, zero, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0),\n\ + uniConvBF16toF32_Part0_2x8);\n\ + VXC_DP2x8(src2, src0, zero, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0),\n\ + uniConvBF16toF32_Part1_2x8);\n\ + _viv_asm(COPY, srcA, src1, 16);\n\ + _viv_asm(COPY, srcB, src2, 16);\n\ + sum += dot(srcA, one) + dot(srcB, one);\n\ + sqr += dot(srcA * srcA, one) + dot(srcB * srcB, one);\n\ + }\n\ + }\n\ +\n\ + lcl_sum[lidx] = sum;\n\ + lcl_sqr[lidx] = sqr;\n\ + barrier(CLK_LOCAL_MEM_FENCE);\n\ +\n\ + int4 coord_out = (int4)(get_group_id(0) << 2, gidz, 0, 0);\n\ + if(lidx == 0)\n\ + {\n\ + __local float4* tmp_sum = (__local float4*)lcl_sum;\n\ + __local float4* tmp_sqr = (__local float4*)lcl_sqr;\n\ +\n\ + sum = 0;\n\ + sqr = 0;\n\ + for(int i = 0; i < 4; i++)\n\ + {\n\ + sum += dot(tmp_sum[i], one);\n\ + sqr += dot(tmp_sqr[i], one);\n\ + }\n\ +\n\ + float4 data = (float4)(sum, sqr, 0, 0);\n\ + write_imagef(output, coord_out, data);\n\ + }\n\ +}\n\ +\n\ +__kernel __attribute__((reqd_work_group_size(16, 1, 1))) void instance_norm_BF16F32toBF16(\n\ + image2d_array_t input, image2d_t bias, image2d_t scale, image2d_t meanVari,\n\ + image2d_array_t output, float eps, int rsFlg)\n\ +{\n\ + int gidz = get_global_id(1);\n\ + int4 coord = (int4)(get_global_id(0), 0, gidz, 0);\n\ + vxc_short8 src0, src1, src2;\n\ + float scale_vari, bias_val;\n\ + vxc_float4 mean_vari = (vxc_float4)(0);\n\ +\n\ + Image img1 = create_image_from_image2d(bias, 4);\n\ + Image img2 = create_image_from_image2d(scale, 4);\n\ + Image img3 = create_image_from_image2d(meanVari, 4);\n\ + __global float* bias_ptr = (__global float*)img1.ptr;\n\ + __global float* scal_ptr = (__global float*)img2.ptr;\n\ + __global uchar* sumVari_ptr = (__global uchar*)get_image_ptr_from_coord(img3, coord.wz);\n\ + __global float4* vari_ptr = (__global float4*)sumVari_ptr;\n\ +\n\ + float bval = bias_ptr[gidz];\n\ + float sval = scal_ptr[gidz];\n\ +\n\ + for(int i = 0; i < group_num; i++)\n\ + {\n\ + mean_vari += vari_ptr[i];\n\ + }\n\ +\n\ + mean_vari *= dimRatio;\n\ + mean_vari.s1 = mean_vari.s1 - mean_vari.s0 * mean_vari.s0 + eps;\n\ + mean_vari.s1 = rsqrt(mean_vari.s1);\n\ +\n\ + scale_vari = sval * mean_vari.s1;\n\ + vxc_float4 tmpData0, tmpData1;\n\ + bias_val = (bval - scale_vari * mean_vari.s0);\n\ +\n\ + int8 input_desc, output_desc;\n\ + _viv_asm(COPY, input_desc, input, sizeof(input_desc));\n\ + int baseAddr_a = (int)get_global_id(1) * input_desc.s4 + input_desc.s0;\n\ + _viv_asm(MOV, coord.z, baseAddr_a);\n\ +\n\ + _viv_asm(COPY, output_desc, output, sizeof(output_desc));\n\ + int baseAddr = (int)get_global_id(1) * output_desc.s4 + output_desc.s0;\n\ + _viv_asm(MOV, coord.w, baseAddr);\n\ +\n\ + for(coord.y = 0; coord.y < height; coord.y++)\n\ + {\n\ + VXC_OP4(img_load_3d, src0, input, coord.xyzz, VXC_5BITOFFSET_XY(0, 0), \\\n\ + VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0));\n\ + VXC_DP2x8(src1, src0, zero, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0),\n\ + 
uniConvBF16toF32_Part0_2x8);\n\ + VXC_DP2x8(src2, src0, zero, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0),\n\ + uniConvBF16toF32_Part1_2x8);\n\ + _viv_asm(COPY, tmpData0, src1, 16);\n\ + _viv_asm(COPY, tmpData1, src2, 16);\n\ +\n\ + vxc_float4 norm;\n\ + norm = scale_vari * tmpData0 + bias_val;\n\ + _viv_asm(COPY, src0, norm, 16);\n\ + norm = scale_vari * tmpData1 + bias_val;\n\ + _viv_asm(COPY, src1, norm, 16);\n\ + VXC_DP2x8(src2, src0, src1, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0), uniExtractOddData_2x8);\n\ + VXC_OP4_NoDest(img_store_3d, output, coord.xyww, src2, \\\n\ + VXC_MODIFIER(0, 7, 0,VXC_RM_TowardZero, 0));\n\ + }\n\ +}\n\ +\n\ +__kernel __attribute__((reqd_work_group_size(16, 1, 1))) void instance_norm_BF16F32toBF16_2D(\n\ + image2d_array_t input, image2d_t bias, image2d_t scale, image2d_t meanVari,\n\ + image2d_array_t output, float eps, int rsFlg)\n\ +{\n\ + int gidz = get_global_id(1);\n\ + int gidy = gidz * height;\n\ + int2 coord = (int2)(get_global_id(0), gidy);\n\ + int2 coord_para = (int2)(gidz, 0);\n\ + int endH = gidy + height;\n\ + vxc_short8 src0, src1, src2;\n\ + float scale_vari, bias_val;\n\ + vxc_float4 mean_vari = (vxc_float4)(0);\n\ +\n\ + Image img1 = create_image_from_image2d(bias, 4);\n\ + Image img2 = create_image_from_image2d(scale, 4);\n\ + Image img3 = create_image_from_image2d(meanVari, 4);\n\ + __global float* bias_ptr = (__global float*)img1.ptr;\n\ + __global float* scal_ptr = (__global float*)img2.ptr;\n\ + __global uchar* sumVari_ptr = (__global uchar*)get_image_ptr_from_coord(img3, coord_para.yx);\n\ + __global float4* vari_ptr = (__global float4*)sumVari_ptr;\n\ +\n\ + float bval = bias_ptr[gidz];\n\ + float sval = scal_ptr[gidz];\n\ +\n\ + for(int i = 0; i < group_num; i++)\n\ + {\n\ + mean_vari += vari_ptr[i];\n\ + }\n\ +\n\ + mean_vari *= dimRatio;\n\ + mean_vari.s1 = mean_vari.s1 - mean_vari.s0 * mean_vari.s0 + eps;\n\ + mean_vari.s1 = rsqrt(mean_vari.s1);\n\ +\n\ + scale_vari = sval * mean_vari.s1;\n\ + vxc_float4 tmpData0, tmpData1;\n\ + bias_val = (bval - scale_vari * mean_vari.s0);\n\ +\n\ + for(; coord.y < endH; coord.y++)\n\ + {\n\ + VXC_ReadImage(src0, input, coord.xy, VXC_5BITOFFSET_XY(0, 0),\\\n\ + VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0));\n\ + VXC_DP2x8(src1, src0, zero, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0),\n\ + uniConvBF16toF32_Part0_2x8);\n\ + VXC_DP2x8(src2, src0, zero, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0),\n\ + uniConvBF16toF32_Part1_2x8);\n\ + _viv_asm(COPY, tmpData0, src1, 16);\n\ + _viv_asm(COPY, tmpData1, src2, 16);\n\ +\n\ + vxc_float4 norm;\n\ + norm = scale_vari * tmpData0 + bias_val;\n\ + _viv_asm(COPY, src0, norm, 16);\n\ + norm = scale_vari * tmpData1 + bias_val;\n\ + _viv_asm(COPY, src1, norm, 16);\n\ + VXC_DP2x8(src2, src0, src1, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0), uniExtractOddData_2x8);\n\ + VXC_WriteImage(output, coord.xy, src2, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0));\n\ + }\n\ +}"; /* end of instance_normalization_scale_f32_bf16_vx*/ + +static const char instance_normalization_scale_f32_f16_vx[] = "#include \"cl_viv_vx_ext.h\"\n\ +\n\ +_viv_uniform int height;\n\ +_viv_uniform float dimRatio;\n\ +_viv_uniform int group_num;\n\ +_viv_uniform VXC_512Bits UniFP16toFP32Lo4_dp4x4;\n\ +_viv_uniform VXC_512Bits uniConvertHalfToFp16_2x8;\n\ +\n\ +_viv_uniform VXC_512Bits uniConvertEndInt16Fp32_4x4;\n\ +\n\ +__kernel __attribute__((reqd_work_group_size(16, 1, 1))) void instance_norm_F16F32toF16(\n\ + image2d_array_t input, image2d_t bias, image2d_t scale, image2d_t meanVari,\n\ + 
image2d_array_t output, float eps, int rsFlg)\n\ +{\n\ + int gidz = get_global_id(1);\n\ + int4 coord = (int4)(get_global_id(0), 0, gidz, 0);\n\ + vxc_short8 src0;\n\ + vxc_half8 in_h;\n\ + float scale_vari, bias_val;\n\ + vxc_float4 mean_vari = (vxc_float4)(0);\n\ +\n\ + Image img1 = create_image_from_image2d(bias, 4);\n\ + Image img2 = create_image_from_image2d(scale, 4);\n\ + Image img3 = create_image_from_image2d(meanVari, 4);\n\ + __global float* bias_ptr = (__global float*)img1.ptr;\n\ + __global float* scal_ptr = (__global float*)img2.ptr;\n\ + __global uchar* sumVari_ptr = (__global uchar*)get_image_ptr_from_coord(img3, coord.wz);\n\ + __global float4* vari_ptr = (__global float4*)sumVari_ptr;\n\ +\n\ + float bval = bias_ptr[gidz];\n\ + float sval = scal_ptr[gidz];\n\ +\n\ + for(int i = 0; i < group_num; i++)\n\ + {\n\ + mean_vari += vari_ptr[i];\n\ + }\n\ +\n\ + mean_vari *= dimRatio;\n\ + mean_vari.s1 = mean_vari.s1 - mean_vari.s0 * mean_vari.s0 + eps;\n\ + mean_vari.s1 = rsqrt(mean_vari.s1);\n\ +\n\ + scale_vari = sval * mean_vari.s1;\n\ + vxc_float4 tmpData0, tmpData1;\n\ + vxc_short8 outval;\n\ + half4 tmpVal0, tmpVal1;\n\ + bias_val = (bval - scale_vari * mean_vari.s0);\n\ + vxc_half8 dst;\n\ +\n\ + int8 input_desc, output_desc;\n\ + _viv_asm(COPY, input_desc, input, sizeof(input_desc));\n\ + int baseAddr_a = (int)get_global_id(1) * input_desc.s4 + input_desc.s0;\n\ + _viv_asm(MOV, coord.z, baseAddr_a);\n\ +\n\ + _viv_asm(COPY, output_desc, output, sizeof(output_desc));\n\ + int baseAddr = (int)get_global_id(1) * output_desc.s4 + output_desc.s0;\n\ + _viv_asm(MOV, coord.w, baseAddr);\n\ +\n\ + for(coord.y = 0; coord.y < height; coord.y++)\n\ + {\n\ + VXC_OP4(img_load_3d, src0, input, coord.xyzz, VXC_5BITOFFSET_XY(0, 0), \\\n\ + VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0));\n\ + _viv_asm(COPY, in_h, src0, 16);\n\ +\n\ + VXC_DP4x4(tmpData0, in_h, in_h, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0),\\\n\ + UniFP16toFP32Lo4_dp4x4);\n\ + VXC_DP4x4(tmpData1, in_h, in_h, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0),\\\n\ + uniConvertEndInt16Fp32_4x4);\n\ +\n\ + vxc_float4 norm;\n\ + norm = scale_vari * tmpData0 + bias_val;\n\ + _viv_asm(CONV, tmpVal0, norm);\n\ + norm = scale_vari * tmpData1 + bias_val;\n\ + _viv_asm(CONV, tmpVal1, norm);\n\ + VXC_DP2x8(dst, tmpVal0, tmpVal1, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0),\\\n\ + uniConvertHalfToFp16_2x8);\n\ + _viv_asm(COPY, outval, dst, 16);\n\ + VXC_OP4_NoDest(img_store_3d, output, coord.xyww, outval, \\\n\ + VXC_MODIFIER(0, 7, 0,VXC_RM_TowardZero, 0));\n\ + }\n\ +}\n\ +\n\ +__kernel __attribute__((reqd_work_group_size(16, 1, 1))) void instance_norm_F16F32toF16_2D(\n\ + image2d_array_t input, image2d_t bias, image2d_t scale, image2d_t meanVari,\n\ + image2d_array_t output, float eps, int rsFlg)\n\ +{\n\ + int gidz = get_global_id(1);\n\ + int gidy = gidz * height;\n\ + int2 coord = (int2)(get_global_id(0), gidy);\n\ + int2 coord_para = (int2)(gidz, 0);\n\ + int endH = gidy + height;\n\ + vxc_short8 src0;\n\ + vxc_half8 in_h;\n\ + float scale_vari, bias_val;\n\ + vxc_float4 mean_vari = (vxc_float4)(0);\n\ +\n\ + Image img1 = create_image_from_image2d(bias, 4);\n\ + Image img2 = create_image_from_image2d(scale, 4);\n\ + Image img3 = create_image_from_image2d(meanVari, 4);\n\ + __global float* bias_ptr = (__global float*)img1.ptr;\n\ + __global float* scal_ptr = (__global float*)img2.ptr;\n\ + __global uchar* sumVari_ptr = (__global uchar*)get_image_ptr_from_coord(img3, coord_para.yx);\n\ + __global float4* vari_ptr = (__global 
float4*)sumVari_ptr;\n\ +\n\ + float bval = bias_ptr[gidz];\n\ + float sval = scal_ptr[gidz];\n\ +\n\ + for(int i = 0; i < group_num; i++)\n\ + {\n\ + mean_vari += vari_ptr[i];\n\ + }\n\ +\n\ + mean_vari *= dimRatio;\n\ + mean_vari.s1 = mean_vari.s1 - mean_vari.s0 * mean_vari.s0 + eps;\n\ + mean_vari.s1 = rsqrt(mean_vari.s1);\n\ +\n\ + scale_vari = sval * mean_vari.s1;\n\ + vxc_float4 tmpData0, tmpData1;\n\ + vxc_short8 outval;\n\ + half4 tmpVal0, tmpVal1;\n\ + bias_val = (bval - scale_vari * mean_vari.s0);\n\ + vxc_half8 dst;\n\ +\n\ + for(; coord.y < endH; coord.y++)\n\ + {\n\ + VXC_ReadImage(src0, input, coord.xy, VXC_5BITOFFSET_XY(0, 0),\\\n\ + VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0));\n\ + _viv_asm(COPY, in_h, src0, 16);\n\ +\n\ + VXC_DP4x4(tmpData0, in_h, in_h, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0),\\\n\ + UniFP16toFP32Lo4_dp4x4);\n\ + VXC_DP4x4(tmpData1, in_h, in_h, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0),\\\n\ + uniConvertEndInt16Fp32_4x4);\n\ + vxc_float4 norm;\n\ + norm = scale_vari * tmpData0 + bias_val;\n\ + _viv_asm(CONV, tmpVal0, norm);\n\ + norm = scale_vari * tmpData1 + bias_val;\n\ + _viv_asm(CONV, tmpVal1, norm);\n\ + VXC_DP2x8(dst, tmpVal0, tmpVal1, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0),\\\n\ + uniConvertHalfToFp16_2x8);\n\ + _viv_asm(COPY, outval, dst, 16);\n\ + VXC_WriteImage(output, coord.xy, outval, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0));\n\ + }\n\ +}"; /* end of instance_normalization_scale_f32_f16_vx*/ static const char instance_normalization_u8_vx[] = "#include \"cl_viv_vx_ext.h\"\n\ \n\ @@ -7036,7 +10226,6 @@ _viv_uniform int height;\n\ _viv_uniform float dimRatio;\n\ _viv_uniform int group_num;\n\ _viv_uniform VXC_512Bits UniFP16toFP32Lo4_dp4x4;\n\ -_viv_uniform VXC_512Bits uniConvertHalfToFp16_2x8;\n\ _viv_uniform VXC_512Bits uniConvertInt32toUint8_2x8;\n\ \n\ _viv_uniform VXC_512Bits uniConvert1stUint8SubZpToFp32_4x4;\n\ @@ -7056,9 +10245,7 @@ _viv_uniform float outputScale;\n\ _viv_uniform int output_ZP;\n\ \n\ __kernel __attribute__((reqd_work_group_size(16, 1, 1))) void instance_norm_meanvari_U8(\n\ - image2d_array_t input,\n\ - image2d_array_t output,\n\ - float eps, int rsFlg)\n\ + image2d_array_t input, image2d_array_t output, float eps, int rsFlg)\n\ {\n\ int gidx = get_global_id(0) << 4;\n\ int lidx = get_local_id(0);\n\ @@ -7066,17 +10253,20 @@ __kernel __attribute__((reqd_work_group_size(16, 1, 1))) void instance_norm_mean int4 coord = (int4)(gidx, 0, gidz, 0);\n\ vxc_uchar16 src0;\n\ float sum = 0, sqr = 0;\n\ - int tmpSum = 0, tmpSqr = 0, tmpSum1, tmpSqr1;\n\ + int tmpSum = 0, tmpSqr = 0, tmpSum1 = 0, tmpSqr1 = 0;\n\ \n\ __local float lcl_sum[16];\n\ __local float lcl_sqr[16];\n\ -\n\ + int8 input_desc;\n\ + _viv_asm(COPY, input_desc, input, sizeof(input_desc));\n\ + int baseAddr_a = (int)get_global_id(1) * input_desc.s4 + input_desc.s0;\n\ + _viv_asm(MOV, coord.z, baseAddr_a);\n\ if(gidx < width)\n\ {\n\ for(coord.y = 0; coord.y < height;)\n\ {\n\ - VXC_ReadImage2DArray(src0, input, coord, VXC_5BITOFFSET_XY(0, 0),\n\ - VXC_MODIFIER(0, 15, 0, VXC_RM_TowardZero, 0));\n\ + VXC_OP4(img_load_3d, src0, input, coord, VXC_5BITOFFSET_XY(0, 0), \\\n\ + VXC_MODIFIER(0, 15, 0, VXC_RM_TowardZero, 0));\n\ coord.y++;\n\ VXC_DP16x1(tmpSum1, src0, src0, VXC_MODIFIER(0, 0, 0, VXC_RM_TowardZero, 0), uniSumU8_16x1);\n\ tmpSum += (tmpSum1);\n\ @@ -7086,7 +10276,6 @@ __kernel __attribute__((reqd_work_group_size(16, 1, 1))) void instance_norm_mean sqr += (tmpSqr * e2InScale + rowSumScale);\n\ sum = (tmpSum + sumInZp) * input_scale;\n\ }\n\ -\n\ 
lcl_sum[lidx] = sum;\n\ lcl_sqr[lidx] = sqr;\n\ barrier(CLK_LOCAL_MEM_FENCE);\n\ @@ -7097,23 +10286,19 @@ __kernel __attribute__((reqd_work_group_size(16, 1, 1))) void instance_norm_mean float4 one = (float4)(1, 1, 1, 1);\n\ __local float4* tmp_sum = (__local float4*)lcl_sum;\n\ __local float4* tmp_sqr = (__local float4*)lcl_sqr;\n\ -\n\ sum = 0; sqr = 0;\n\ for(int i = 0; i < 4; i++)\n\ {\n\ sum += dot(tmp_sum[i], one);\n\ sqr += dot(tmp_sqr[i], one);\n\ }\n\ -\n\ float4 data = (float4)(sum, sqr, 0, 0);\n\ write_imagef(output, coord_out, data);\n\ }\n\ }\n\ \n\ __kernel __attribute__((reqd_work_group_size(16, 1, 1))) void instance_norm_meanvari_U8_2D(\n\ - image2d_array_t input,\n\ - image2d_array_t output,\n\ - float eps, int rsFlg)\n\ + image2d_array_t input, image2d_array_t output, float eps, int rsFlg)\n\ {\n\ int gidx = get_global_id(0) << 4;\n\ int lidx = get_local_id(0);\n\ @@ -7124,17 +10309,16 @@ __kernel __attribute__((reqd_work_group_size(16, 1, 1))) void instance_norm_mean vxc_uchar16 src0;\n\ float sum = 0, sqr = 0;\n\ int tmpSum = 0, tmpSqr = 0, tmpSum1, tmpSqr1;\n\ + int endH = gidy + height;\n\ \n\ __local float lcl_sum[16];\n\ __local float lcl_sqr[16];\n\ -\n\ - int endH = gidy + height;\n\ if(gidx < width)\n\ {\n\ for(; coord.y < endH;)\n\ {\n\ VXC_ReadImage(src0, input, coord, VXC_5BITOFFSET_XY(0, 0),\n\ - VXC_MODIFIER(0, 15, 0, VXC_RM_TowardZero, 0));\n\ + VXC_MODIFIER(0, 15, 0, VXC_RM_TowardZero, 0));\n\ coord.y++;\n\ VXC_DP16x1(tmpSum1, src0, src0, VXC_MODIFIER(0, 0, 0, VXC_RM_TowardZero, 0), uniSumU8_16x1);\n\ tmpSum += (tmpSum1);\n\ @@ -7144,7 +10328,6 @@ __kernel __attribute__((reqd_work_group_size(16, 1, 1))) void instance_norm_mean sqr += (tmpSqr * e2InScale + rowSumScale);\n\ sum = (tmpSum + sumInZp) * input_scale;\n\ }\n\ -\n\ lcl_sum[lidx] = sum;\n\ lcl_sqr[lidx] = sqr;\n\ barrier(CLK_LOCAL_MEM_FENCE);\n\ @@ -7155,192 +10338,20 @@ __kernel __attribute__((reqd_work_group_size(16, 1, 1))) void instance_norm_mean float4 one = (float4)(1, 1, 1, 1);\n\ __local float4* tmp_sum = (__local float4*)lcl_sum;\n\ __local float4* tmp_sqr = (__local float4*)lcl_sqr;\n\ -\n\ sum = 0; sqr = 0;\n\ for(int i = 0; i < 4; i++)\n\ {\n\ sum += dot(tmp_sum[i], one);\n\ sqr += dot(tmp_sqr[i], one);\n\ }\n\ -\n\ float4 data = (float4)(sum, sqr, 0, 0);\n\ write_imagef(output, coord_out, data);\n\ }\n\ }\n\ \n\ -__kernel __attribute__((reqd_work_group_size(16, 1, 1))) void instance_norm_U8toF16(\n\ - image2d_array_t input,\n\ - image2d_array_t bias,\n\ - image2d_array_t scale,\n\ - image2d_array_t meanVari,\n\ - image2d_array_t output,\n\ - float eps,\n\ - int rsFlg)\n\ -{\n\ - int gidz = get_global_id(1);\n\ - int4 coord = (int4)(get_global_id(0), 0, gidz, 0);\n\ - int4 coord_para = (int4)(gidz, 0, 0, 0);\n\ - vxc_uchar16 src0;\n\ - vxc_short8 src1;\n\ - vxc_half8 scale_h;\n\ - float scale_vari, bias_val;\n\ - vxc_float4 bias_f, scale_f, mean_vari = (vxc_float4)(0);\n\ -\n\ - VXC_ReadImage(src1, scale, coord_para.xy, VXC_5BITOFFSET_XY(0, 0),\\\n\ - VXC_MODIFIER(0, 0, 0, VXC_RM_TowardZero, 0));\n\ - _viv_asm(COPY, scale_h, src1, 16);\n\ - VXC_DP4x4(scale_f, scale_h, scale_h, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0),\\\n\ - UniFP16toFP32Lo4_dp4x4);\n\ -\n\ - bias_f = read_imagef(bias, coord_para);\n\ -\n\ - coord_para.x = 0;\n\ - coord_para.y = gidz;\n\ - for(int i = 0; i < group_num; i++)\n\ - {\n\ - mean_vari += read_imagef(meanVari, coord_para);\n\ - coord_para.x += 4;\n\ - }\n\ - mean_vari *= dimRatio;\n\ - mean_vari.s1 = mean_vari.s1 - mean_vari.s0 * mean_vari.s0 + eps;\n\ - 
mean_vari.s1 = rsqrt(mean_vari.s1);\n\ -\n\ - scale_vari = scale_f.s0 * mean_vari.s1;\n\ - short zp = inputZP;\n\ - vxc_float4 tmpData0, tmpData1, tmpData2, tmpData3;\n\ - vxc_short8 outval;\n\ - half4 tmpVal0, tmpVal1;\n\ - float alpha = input_scale * scale_vari;\n\ - bias_val = (bias_f.s0 - scale_vari * mean_vari.s0);\n\ - vxc_half8 dst;\n\ -\n\ - for(coord.y = 0; coord.y < height;)\n\ - {\n\ - coord_para = coord;\n\ - VXC_ReadImage2DArray(src0, input, coord, VXC_5BITOFFSET_XY(0, 0),\\\n\ - VXC_MODIFIER(0, 15, 0, VXC_RM_TowardZero, 0));\n\ - coord.y++;\n\ -\n\ - VXC_DP4x4(tmpData0, src0, zp, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0),\\\n\ - uniConvert1stUint8SubZpToFp32_4x4);\n\ - VXC_DP4x4(tmpData1, src0, zp, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0),\\\n\ - uniConvert2ndUint8SubZpToFp32_4x4);\n\ - VXC_DP4x4(tmpData2, src0, zp, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0),\\\n\ - uniConvert3rdUint8SubZpToFp32_4x4);\n\ - VXC_DP4x4(tmpData3, src0, zp, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0),\\\n\ - uniConvert4thUint8SubZpToFp32_4x4);\n\ - vxc_float4 norm;\n\ - norm = alpha * tmpData0 + bias_val;\n\ - _viv_asm(CONV, tmpVal0, norm);\n\ - norm = alpha * tmpData1 + bias_val;\n\ - _viv_asm(CONV, tmpVal1, norm);\n\ - VXC_DP2x8(dst, tmpVal0, tmpVal1, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0),\\\n\ - uniConvertHalfToFp16_2x8);\n\ - _viv_asm(COPY, outval, dst, 16);\n\ - VXC_WriteImage2DArray(output, coord_para, outval, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0));\n\ - coord_para.x += 8;\n\ - norm = alpha * tmpData2 + bias_val;\n\ - _viv_asm(CONV, tmpVal0, norm);\n\ - norm = alpha * tmpData3 + bias_val;\n\ - _viv_asm(CONV, tmpVal1, norm);\n\ - VXC_DP2x8(dst, tmpVal0, tmpVal1, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0),\\\n\ - uniConvertHalfToFp16_2x8);\n\ - _viv_asm(COPY, outval, dst, 16);\n\ - VXC_WriteImage2DArray(output, coord_para, outval, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0));\n\ - }\n\ -}\n\ -\n\ -__kernel __attribute__((reqd_work_group_size(16, 1, 1))) void instance_norm_U8toF16_2D(\n\ - image2d_array_t input,\n\ - image2d_array_t bias,\n\ - image2d_array_t scale,\n\ - image2d_array_t meanVari,\n\ - image2d_array_t output,\n\ - float eps,\n\ - int rsFlg)\n\ -{\n\ - int gidz = get_global_id(1);\n\ - int gidy = gidz * height;\n\ - int4 coord = (int4)(get_global_id(0), gidy, 0, 0);\n\ - int4 coord_para = (int4)(gidz, 0, 0, 0);\n\ - int endH = gidy + height;\n\ - vxc_uchar16 src0;\n\ - vxc_short8 src1;\n\ - vxc_half8 scale_h;\n\ - float scale_vari, bias_val;\n\ - vxc_float4 bias_f, scale_f, mean_vari = (vxc_float4)(0);\n\ -\n\ - VXC_ReadImage(src1, scale, coord_para.xy, VXC_5BITOFFSET_XY(0, 0),\\\n\ - VXC_MODIFIER(0, 0, 0, VXC_RM_TowardZero, 0));\n\ - _viv_asm(COPY, scale_h, src1, 16);\n\ - VXC_DP4x4(scale_f, scale_h, scale_h, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0),\\\n\ - UniFP16toFP32Lo4_dp4x4);\n\ -\n\ - bias_f = read_imagef(bias, coord_para);\n\ -\n\ - coord_para.x = 0;\n\ - coord_para.y = gidz;\n\ - for(int i = 0; i < group_num; i++)\n\ - {\n\ - mean_vari += read_imagef(meanVari, coord_para);\n\ - coord_para.x += 4;\n\ - }\n\ - mean_vari *= dimRatio;\n\ - mean_vari.s1 = mean_vari.s1 - mean_vari.s0 * mean_vari.s0 + eps;\n\ - mean_vari.s1 = rsqrt(mean_vari.s1);\n\ -\n\ - scale_vari = scale_f.s0 * mean_vari.s1;\n\ - short zp = inputZP;\n\ - vxc_float4 tmpData0, tmpData1, tmpData2, tmpData3;\n\ - vxc_short8 outval;\n\ - half4 tmpVal0, tmpVal1;\n\ - float alpha = input_scale * scale_vari;\n\ - bias_val = (bias_f.s0 - scale_vari * mean_vari.s0);\n\ - vxc_half8 dst;\n\ -\n\ 
- for(; coord.y < endH;)\n\ - {\n\ - coord_para = coord;\n\ - VXC_ReadImage(src0, input, coord.xy, VXC_5BITOFFSET_XY(0, 0),\\\n\ - VXC_MODIFIER(0, 15, 0, VXC_RM_TowardZero, 0));\n\ - coord.y++;\n\ - VXC_DP4x4(tmpData0, src0, zp, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0),\\\n\ - uniConvert1stUint8SubZpToFp32_4x4);\n\ - VXC_DP4x4(tmpData1, src0, zp, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0),\\\n\ - uniConvert2ndUint8SubZpToFp32_4x4);\n\ - VXC_DP4x4(tmpData2, src0, zp, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0),\\\n\ - uniConvert3rdUint8SubZpToFp32_4x4);\n\ - VXC_DP4x4(tmpData3, src0, zp, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0),\\\n\ - uniConvert4thUint8SubZpToFp32_4x4);\n\ - vxc_float4 norm;\n\ - norm = alpha * tmpData0 + bias_val;\n\ - _viv_asm(CONV, tmpVal0, norm);\n\ - norm = alpha * tmpData1 + bias_val;\n\ - _viv_asm(CONV, tmpVal1, norm);\n\ - VXC_DP2x8(dst, tmpVal0, tmpVal1, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0),\\\n\ - uniConvertHalfToFp16_2x8);\n\ - _viv_asm(COPY, outval, dst, 16);\n\ - VXC_WriteImage(output, coord_para.xy, outval, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0));\n\ - coord_para.x += 8;\n\ - norm = alpha * tmpData2 + bias_val;\n\ - _viv_asm(CONV, tmpVal0, norm);\n\ - norm = alpha * tmpData3 + bias_val;\n\ - _viv_asm(CONV, tmpVal1, norm);\n\ - VXC_DP2x8(dst, tmpVal0, tmpVal1, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0),\\\n\ - uniConvertHalfToFp16_2x8);\n\ - _viv_asm(COPY, outval, dst, 16);\n\ - VXC_WriteImage(output, coord_para.xy, outval, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0));\n\ - }\n\ -}\n\ -\n\ __kernel __attribute__((reqd_work_group_size(16, 1, 1))) void instance_norm_U8toU8(\n\ - image2d_array_t input,\n\ - image2d_array_t bias,\n\ - image2d_array_t scale,\n\ - image2d_array_t meanVari,\n\ - image2d_array_t output,\n\ - float eps,\n\ - int rsFlg)\n\ + image2d_array_t input, image2d_array_t bias, image2d_array_t scale, image2d_t meanVari,\n\ + image2d_array_t output, float eps, int rsFlg)\n\ {\n\ int gidz = get_global_id(1);\n\ int4 coord = (int4)(get_global_id(0), 0, gidz, 0);\n\ @@ -7357,12 +10368,10 @@ __kernel __attribute__((reqd_work_group_size(16, 1, 1))) void instance_norm_U8to VXC_DP4x4(scale_f, scale_h, scale_h, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), UniFP16toFP32Lo4_dp4x4);\n\ \n\ bias_f = read_imagef(bias, coord_para);\n\ - coord_para.x = 0;\n\ - coord_para.y = gidz;\n\ for(int i = 0; i < group_num; i++)\n\ {\n\ - mean_vari += read_imagef(meanVari, coord_para);\n\ - coord_para.x += 4;\n\ + mean_vari += read_imagef(meanVari, coord_para.yx);\n\ + coord_para.y += 4;\n\ }\n\ mean_vari *= dimRatio;\n\ mean_vari.s1 = mean_vari.s1 - mean_vari.s0 * mean_vari.s0 + eps;\n\ @@ -7371,47 +10380,43 @@ __kernel __attribute__((reqd_work_group_size(16, 1, 1))) void instance_norm_U8to scale_vari = scale_f.s0 * mean_vari.s1;\n\ short zp = inputZP;\n\ vxc_int4 tmpVal0, tmpVal1;\n\ - vxc_float4 tmpData0, tmpData1, tmpData2, tmpData3;\n\ + vxc_float4 tmpData0, tmpData1, tmpData2, tmpData3, norm;\n\ float alpha = scale_inOut * scale_vari;\n\ bias_val = (bias_f.s0 - scale_vari * mean_vari.s0) * outputScale + output_ZP;\n\ \n\ - for(coord.y = 0; coord.y < height;coord.y++)\n\ + int8 input_desc, output_desc;\n\ + _viv_asm(COPY, input_desc, input, sizeof(input_desc));\n\ + int baseAddr_a = (int)get_global_id(1) * input_desc.s4 + input_desc.s0;\n\ + _viv_asm(MOV, coord.z, baseAddr_a);\n\ + _viv_asm(COPY, output_desc, output, sizeof(output_desc));\n\ + int baseAddr = (int)get_global_id(1) * output_desc.s4 + output_desc.s0;\n\ + _viv_asm(MOV, coord.w, 
baseAddr);\n\ +\n\ + for(coord.y = 0; coord.y < height; coord.y++)\n\ {\n\ - VXC_ReadImage2DArray(src0, input, coord, VXC_5BITOFFSET_XY(0, 0),\\\n\ - VXC_MODIFIER(0, 15, 0, VXC_RM_TowardZero, 0));\n\ - VXC_DP4x4(tmpData0, src0, zp, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0),\\\n\ - uniConvert1stUint8SubZpToFp32_4x4);\n\ - VXC_DP4x4(tmpData1, src0, zp, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0),\\\n\ - uniConvert2ndUint8SubZpToFp32_4x4);\n\ - VXC_DP4x4(tmpData2, src0, zp, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0),\\\n\ - uniConvert3rdUint8SubZpToFp32_4x4);\n\ - VXC_DP4x4(tmpData3, src0, zp, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0),\\\n\ - uniConvert4thUint8SubZpToFp32_4x4);\n\ - vxc_float4 norm;\n\ + VXC_OP4(img_load_3d, src0, input, coord.xyzz, VXC_5BITOFFSET_XY(0, 0), \\\n\ + VXC_MODIFIER(0, 15, 0, VXC_RM_TowardZero, 0));\n\ + VXC_DP4x4(tmpData0, src0, zp, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniConvert1stUint8SubZpToFp32_4x4);\n\ + VXC_DP4x4(tmpData1, src0, zp, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniConvert2ndUint8SubZpToFp32_4x4);\n\ + VXC_DP4x4(tmpData2, src0, zp, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniConvert3rdUint8SubZpToFp32_4x4);\n\ + VXC_DP4x4(tmpData3, src0, zp, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniConvert4thUint8SubZpToFp32_4x4);\n\ norm = tmpData0 * alpha + bias_val;\n\ tmpVal0 = convert_int4_rte(norm);\n\ norm = tmpData1 * alpha + bias_val;\n\ tmpVal1 = convert_int4_rte(norm);\n\ - VXC_DP2x8(src2, tmpVal0, tmpVal1, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 1),\\\n\ - uniConvertInt32toUint8_2x8);\n\ + VXC_DP2x8(src2, tmpVal0, tmpVal1, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 1), uniConvertInt32toUint8_2x8);\n\ norm = tmpData2 * alpha + bias_val;\n\ tmpVal0 = convert_int4_rte(norm);\n\ norm = tmpData3 * alpha + bias_val;\n\ tmpVal1 = convert_int4_rte(norm);\n\ - VXC_DP2x8(src2, tmpVal0, tmpVal1, VXC_MODIFIER(8, 15, 0, VXC_RM_TowardZero, 1),\\\n\ - uniConvertInt32toUint8_2x8);\n\ - VXC_WriteImage2DArray(output, coord, src2, VXC_MODIFIER(0, 15, 0, VXC_RM_TowardZero, 0));\n\ + VXC_DP2x8(src2, tmpVal0, tmpVal1, VXC_MODIFIER(8, 15, 0, VXC_RM_TowardZero, 1), uniConvertInt32toUint8_2x8);\n\ + VXC_OP4_NoDest(img_store_3d, output, coord.xyww, src2, VXC_MODIFIER(0, 15, 0,VXC_RM_TowardZero, 0));\n\ }\n\ }\n\ \n\ __kernel __attribute__((reqd_work_group_size(16, 1, 1))) void instance_norm_U8toU8_2D(\n\ - image2d_array_t input,\n\ - image2d_array_t bias,\n\ - image2d_array_t scale,\n\ - image2d_array_t meanVari,\n\ - image2d_array_t output,\n\ - float eps,\n\ - int rsFlg)\n\ + image2d_array_t input, image2d_array_t bias, image2d_array_t scale, image2d_t meanVari,\n\ + image2d_array_t output, float eps, int rsFlg)\n\ {\n\ int gidz = get_global_id(1);\n\ int gidy = gidz * height;\n\ @@ -7430,12 +10435,10 @@ __kernel __attribute__((reqd_work_group_size(16, 1, 1))) void instance_norm_U8to VXC_DP4x4(scale_f, scale_h, scale_h, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), UniFP16toFP32Lo4_dp4x4);\n\ \n\ bias_f = read_imagef(bias, coord_para);\n\ - coord_para.x = 0;\n\ - coord_para.y = gidz;\n\ for(int i = 0; i < group_num; i++)\n\ {\n\ - mean_vari += read_imagef(meanVari, coord_para);\n\ - coord_para.x += 4;\n\ + mean_vari += read_imagef(meanVari, coord_para.yx);\n\ + coord_para.y += 4;\n\ }\n\ mean_vari *= dimRatio;\n\ mean_vari.s1 = mean_vari.s1 - mean_vari.s0 * mean_vari.s0 + eps;\n\ @@ -7444,39 +10447,180 @@ __kernel __attribute__((reqd_work_group_size(16, 1, 1))) void instance_norm_U8to scale_vari = scale_f.s0 * mean_vari.s1;\n\ short zp = inputZP;\n\ 
vxc_int4 tmpVal0, tmpVal1;\n\ - vxc_float4 tmpData0, tmpData1, tmpData2, tmpData3;\n\ + vxc_float4 tmpData0, tmpData1, tmpData2, tmpData3, norm;\n\ float alpha = scale_inOut * scale_vari;\n\ bias_val = (bias_f.s0 - scale_vari * mean_vari.s0) * outputScale + output_ZP;\n\ \n\ for(; coord.y < endH; coord.y++)\n\ {\n\ - VXC_ReadImage(src0, input, coord, VXC_5BITOFFSET_XY(0, 0),\\\n\ - VXC_MODIFIER(0, 15, 0, VXC_RM_TowardZero, 0));\n\ - VXC_DP4x4(tmpData0, src0, zp, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0),\\\n\ - uniConvert1stUint8SubZpToFp32_4x4);\n\ - VXC_DP4x4(tmpData1, src0, zp, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0),\\\n\ - uniConvert2ndUint8SubZpToFp32_4x4);\n\ - VXC_DP4x4(tmpData2, src0, zp, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0),\\\n\ - uniConvert3rdUint8SubZpToFp32_4x4);\n\ - VXC_DP4x4(tmpData3, src0, zp, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0),\\\n\ - uniConvert4thUint8SubZpToFp32_4x4);\n\ - vxc_float4 norm;\n\ + VXC_ReadImage(src0, input, coord, VXC_5BITOFFSET_XY(0, 0), VXC_MODIFIER(0, 15, 0, VXC_RM_TowardZero, 0));\n\ + VXC_DP4x4(tmpData0, src0, zp, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniConvert1stUint8SubZpToFp32_4x4);\n\ + VXC_DP4x4(tmpData1, src0, zp, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniConvert2ndUint8SubZpToFp32_4x4);\n\ + VXC_DP4x4(tmpData2, src0, zp, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniConvert3rdUint8SubZpToFp32_4x4);\n\ + VXC_DP4x4(tmpData3, src0, zp, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniConvert4thUint8SubZpToFp32_4x4);\n\ norm = tmpData0 * alpha + bias_val;\n\ tmpVal0 = convert_int4_rte(norm);\n\ norm = tmpData1 * alpha + bias_val;\n\ tmpVal1 = convert_int4_rte(norm);\n\ - VXC_DP2x8(src2, tmpVal0, tmpVal1, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 1),\\\n\ - uniConvertInt32toUint8_2x8);\n\ + VXC_DP2x8(src2, tmpVal0, tmpVal1, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 1), uniConvertInt32toUint8_2x8);\n\ norm = tmpData2 * alpha + bias_val;\n\ tmpVal0 = convert_int4_rte(norm);\n\ norm = tmpData3 * alpha + bias_val;\n\ tmpVal1 = convert_int4_rte(norm);\n\ - VXC_DP2x8(src2, tmpVal0, tmpVal1, VXC_MODIFIER(8, 15, 0, VXC_RM_TowardZero, 1),\\\n\ - uniConvertInt32toUint8_2x8);\n\ + VXC_DP2x8(src2, tmpVal0, tmpVal1, VXC_MODIFIER(8, 15, 0, VXC_RM_TowardZero, 1), uniConvertInt32toUint8_2x8);\n\ VXC_WriteImage(output, coord, src2, VXC_MODIFIER(0, 15, 0, VXC_RM_TowardZero, 0));\n\ }\n\ }"; /* end of instance_normalization_u8_vx*/ +static const char instance_normalization_u8_f16_vx[] = "#include \"cl_viv_vx_ext.h\"\n\ +\n\ +_viv_uniform int width;\n\ +_viv_uniform int height;\n\ +_viv_uniform float dimRatio;\n\ +_viv_uniform int group_num;\n\ +_viv_uniform VXC_512Bits UniFP16toFP32Lo4_dp4x4;\n\ +_viv_uniform VXC_512Bits uniConvertHalfToFp16_2x8;\n\ +\n\ +_viv_uniform VXC_512Bits uniConvert1stUint8SubZpToFp32_4x4;\n\ +_viv_uniform VXC_512Bits uniConvert2ndUint8SubZpToFp32_4x4;\n\ +_viv_uniform VXC_512Bits uniConvert3rdUint8SubZpToFp32_4x4;\n\ +_viv_uniform VXC_512Bits uniConvert4thUint8SubZpToFp32_4x4;\n\ +_viv_uniform float input_scale;\n\ +_viv_uniform int inputZP;\n\ +\n\ +__kernel __attribute__((reqd_work_group_size(16, 1, 1))) void instance_norm_U8toF16(\n\ + image2d_array_t input, image2d_array_t bias, image2d_array_t scale, image2d_t meanVari,\n\ + image2d_array_t output, float eps, int rsFlg)\n\ +{\n\ + int gidz = get_global_id(1);\n\ + int4 coord = (int4)(get_global_id(0), 0, gidz, 0);\n\ + int4 coord_para = (int4)(gidz, 0, 0, 0);\n\ + vxc_uchar16 src0;\n\ + vxc_short8 src1, outval;\n\ + vxc_half8 scale_h, dst;\n\ + float 
scale_vari, bias_val;\n\ + vxc_float4 bias_f, scale_f, mean_vari = (vxc_float4)(0);\n\ +\n\ + VXC_ReadImage(src1, scale, coord_para.xy, VXC_5BITOFFSET_XY(0, 0),\\\n\ + VXC_MODIFIER(0, 0, 0, VXC_RM_TowardZero, 0));\n\ + _viv_asm(COPY, scale_h, src1, 16);\n\ + VXC_DP4x4(scale_f, scale_h, scale_h, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), UniFP16toFP32Lo4_dp4x4);\n\ + bias_f = read_imagef(bias, coord_para);\n\ + for(int i = 0; i < group_num; i++)\n\ + {\n\ + mean_vari += read_imagef(meanVari, coord_para.yx);\n\ + coord_para.y += 4;\n\ + }\n\ + mean_vari *= dimRatio;\n\ + mean_vari.s1 = mean_vari.s1 - mean_vari.s0 * mean_vari.s0 + eps;\n\ + mean_vari.s1 = rsqrt(mean_vari.s1);\n\ + scale_vari = scale_f.s0 * mean_vari.s1;\n\ + short zp = inputZP;\n\ + vxc_float4 tmpData0, tmpData1, tmpData2, tmpData3, norm;\n\ + half4 tmpVal0, tmpVal1;\n\ + float alpha = input_scale * scale_vari;\n\ + bias_val = (bias_f.s0 - scale_vari * mean_vari.s0);\n\ +\n\ + coord_para = coord;\n\ + int8 input_desc, output_desc;\n\ + _viv_asm(COPY, input_desc, input, sizeof(input_desc));\n\ + int baseAddr_a = (int)get_global_id(1) * input_desc.s4 + input_desc.s0;\n\ + _viv_asm(MOV, coord.z, baseAddr_a);\n\ +\n\ + _viv_asm(COPY, output_desc, output, sizeof(output_desc));\n\ + int baseAddr = (int)get_global_id(1) * output_desc.s4 + output_desc.s0;\n\ + _viv_asm(MOV, coord_para.z, baseAddr);\n\ + for(coord.y = 0; coord.y < height;)\n\ + {\n\ + VXC_OP4(img_load_3d, src0, input, coord, VXC_5BITOFFSET_XY(0, 0), \\\n\ + VXC_MODIFIER(0, 15, 0, VXC_RM_TowardZero, 0));\n\ + coord_para.xy = coord.xy;\n\ + coord.y++;\n\ + VXC_DP4x4(tmpData0, src0, zp, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniConvert1stUint8SubZpToFp32_4x4);\n\ + VXC_DP4x4(tmpData1, src0, zp, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniConvert2ndUint8SubZpToFp32_4x4);\n\ + VXC_DP4x4(tmpData2, src0, zp, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniConvert3rdUint8SubZpToFp32_4x4);\n\ + VXC_DP4x4(tmpData3, src0, zp, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniConvert4thUint8SubZpToFp32_4x4);\n\ + norm = alpha * tmpData0 + bias_val;\n\ + _viv_asm(CONV, tmpVal0, norm);\n\ + norm = alpha * tmpData1 + bias_val;\n\ + _viv_asm(CONV, tmpVal1, norm);\n\ + VXC_DP2x8(dst, tmpVal0, tmpVal1, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0), uniConvertHalfToFp16_2x8);\n\ + _viv_asm(COPY, outval, dst, 16);\n\ + VXC_OP4_NoDest(img_store_3d, output, coord_para, outval, VXC_MODIFIER(0, 7, 0,VXC_RM_TowardZero, 0));\n\ + coord_para.x += 8;\n\ + norm = alpha * tmpData2 + bias_val;\n\ + _viv_asm(CONV, tmpVal0, norm);\n\ + norm = alpha * tmpData3 + bias_val;\n\ + _viv_asm(CONV, tmpVal1, norm);\n\ + VXC_DP2x8(dst, tmpVal0, tmpVal1, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0), uniConvertHalfToFp16_2x8);\n\ + _viv_asm(COPY, outval, dst, 16);\n\ + VXC_OP4_NoDest(img_store_3d, output, coord_para, outval, VXC_MODIFIER(0, 7, 0,VXC_RM_TowardZero, 0));\n\ + }\n\ +}\n\ +\n\ +__kernel __attribute__((reqd_work_group_size(16, 1, 1))) void instance_norm_U8toF16_2D(\n\ + image2d_array_t input, image2d_array_t bias, image2d_array_t scale, image2d_t meanVari,\n\ + image2d_array_t output, float eps, int rsFlg)\n\ +{\n\ + int gidz = get_global_id(1);\n\ + int gidy = gidz * height;\n\ + int4 coord = (int4)(get_global_id(0), gidy, 0, 0);\n\ + int4 coord_para = (int4)(gidz, 0, 0, 0);\n\ + int endH = gidy + height;\n\ + vxc_uchar16 src0;\n\ + vxc_short8 src1, outval;\n\ + vxc_half8 scale_h, dst;\n\ + float scale_vari, bias_val;\n\ + vxc_float4 bias_f, scale_f, mean_vari = (vxc_float4)(0);\n\ +\n\ + 
VXC_ReadImage(src1, scale, coord_para.xy, VXC_5BITOFFSET_XY(0, 0),\\\n\ + VXC_MODIFIER(0, 0, 0, VXC_RM_TowardZero, 0));\n\ + _viv_asm(COPY, scale_h, src1, 16);\n\ + VXC_DP4x4(scale_f, scale_h, scale_h, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), UniFP16toFP32Lo4_dp4x4);\n\ + bias_f = read_imagef(bias, coord_para);\n\ + for(int i = 0; i < group_num; i++)\n\ + {\n\ + mean_vari += read_imagef(meanVari, coord_para.yx);\n\ + coord_para.y += 4;\n\ + }\n\ + mean_vari *= dimRatio;\n\ + mean_vari.s1 = mean_vari.s1 - mean_vari.s0 * mean_vari.s0 + eps;\n\ + mean_vari.s1 = rsqrt(mean_vari.s1);\n\ +\n\ + scale_vari = scale_f.s0 * mean_vari.s1;\n\ + short zp = inputZP;\n\ + vxc_float4 tmpData0, tmpData1, tmpData2, tmpData3, norm;\n\ + half4 tmpVal0, tmpVal1;\n\ + float alpha = input_scale * scale_vari;\n\ + bias_val = (bias_f.s0 - scale_vari * mean_vari.s0);\n\ + for(; coord.y < endH;)\n\ + {\n\ + VXC_ReadImage(src0, input, coord.xy, VXC_5BITOFFSET_XY(0, 0), VXC_MODIFIER(0, 15, 0, VXC_RM_TowardZero, 0));\n\ + coord_para = coord;\n\ + coord.y++;\n\ + VXC_DP4x4(tmpData0, src0, zp, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniConvert1stUint8SubZpToFp32_4x4);\n\ + VXC_DP4x4(tmpData1, src0, zp, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniConvert2ndUint8SubZpToFp32_4x4);\n\ + VXC_DP4x4(tmpData2, src0, zp, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniConvert3rdUint8SubZpToFp32_4x4);\n\ + VXC_DP4x4(tmpData3, src0, zp, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniConvert4thUint8SubZpToFp32_4x4);\n\ + norm = alpha * tmpData0 + bias_val;\n\ + _viv_asm(CONV, tmpVal0, norm);\n\ + norm = alpha * tmpData1 + bias_val;\n\ + _viv_asm(CONV, tmpVal1, norm);\n\ + VXC_DP2x8(dst, tmpVal0, tmpVal1, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0), uniConvertHalfToFp16_2x8);\n\ + _viv_asm(COPY, outval, dst, 16);\n\ + VXC_WriteImage(output, coord_para.xy, outval, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0));\n\ + coord_para.x += 8;\n\ + norm = alpha * tmpData2 + bias_val;\n\ + _viv_asm(CONV, tmpVal0, norm);\n\ + norm = alpha * tmpData3 + bias_val;\n\ + _viv_asm(CONV, tmpVal1, norm);\n\ + VXC_DP2x8(dst, tmpVal0, tmpVal1, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0), uniConvertHalfToFp16_2x8);\n\ + _viv_asm(COPY, outval, dst, 16);\n\ + VXC_WriteImage(output, coord_para.xy, outval, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0));\n\ + }\n\ +}\n\ +"; /* end of instance_normalization_u8_f16_vx*/ + static const char l2normalizescale_axis0_vx[] = "#include \"cl_viv_vx_ext.h\"\n\ \n\ #define VXC_Vstore3(Pointer, Offset, Data) \\\n\ @@ -8738,6 +11882,680 @@ __kernel void layer_norm_I16toI16_2D(\n\ }\n\ "; /* end of layer_normalization_i16_vx*/ +static const char layer_normalization_scale_f32_vx[] = "#include \"cl_viv_vx_ext.h\"\n\ +\n\ +/**************************layernorm float16***********************************/\n\ +_viv_uniform int width;\n\ +_viv_uniform float dimRatio;\n\ +_viv_uniform VXC_512Bits uniFp16SumSqr_dp8x2;\n\ +_viv_uniform VXC_512Bits UniFP16toFP32Lo4_dp4x4;\n\ +_viv_uniform VXC_512Bits uniExtractHalf4_dp4x4;\n\ +\n\ +__kernel void layer_norm_F16F32toF16(\n\ + image2d_array_t input, image2d_t bias, image2d_t scale,\n\ + image2d_array_t output, float eps)\n\ +{\n\ + int4 coord = (int4)(0, get_global_id(1), get_global_id(2), 0);\n\ + int4 coord_out = coord;\n\ +\n\ + int8 input_desc, output_desc;\n\ + _viv_asm(COPY, input_desc, input, sizeof(input_desc));\n\ + int baseAddr_a = (int)get_global_id(2) * input_desc.s4 + input_desc.s0;\n\ + _viv_asm(MOV, coord.z, baseAddr_a);\n\ +\n\ + vxc_short8 src0;\n\ + vxc_float sum 
= 0, sqr = 0;\n\ + VXC_OP4(img_load_3d, src0, input, coord.xyzz, VXC_5BITOFFSET_XY(0, 0), \\\n\ + VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0));\n\ + Image img1 = create_image_from_image2d(bias, 4);\n\ + Image img2 = create_image_from_image2d(scale, 4);\n\ +\n\ + _viv_asm(COPY, output_desc, output, sizeof(output_desc));\n\ + int baseAddr = (int)get_global_id(2) * output_desc.s4 + output_desc.s0;\n\ + _viv_asm(MOV, coord_out.w, baseAddr);\n\ +\n\ + for(coord.x = 8; coord.x < (width+8); coord.x += 8)\n\ + {\n\ + vxc_half8 val0_h;\n\ + _viv_asm(COPY, val0_h, src0, 16);\n\ + VXC_OP4(img_load_3d, src0, input, coord.xyzz, VXC_5BITOFFSET_XY(0, 0), \\\n\ + VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0));\n\ + vxc_float4 sumsqr;\n\ + VXC_DP8x2(sumsqr, val0_h, val0_h, VXC_MODIFIER(0, 1, 0, VXC_RM_TowardZero, 0),\\\n\ + uniFp16SumSqr_dp8x2);\n\ + sum += sumsqr.x;\n\ + sqr += sumsqr.y;\n\ + }\n\ + vxc_float mean;\n\ + mean = sum * dimRatio;\n\ + vxc_float vari;\n\ + vari = sqr*dimRatio - mean*mean;\n\ + vari += eps;\n\ + vari = rsqrt(vari);\n\ + vxc_float4 bias_f, scale_f, in_f;\n\ + __global float* bias_ptr = (__global float*)get_image_ptr_from_coord(img1, coord.ww);\n\ + __global float* scale_ptr = (__global float*)get_image_ptr_from_coord(img2, coord.ww);\n\ + for(coord.x = 0; coord.x < width; coord.x += 4)\n\ + {\n\ + VXC_OP4(img_load_3d, src0, input, coord.xyzz, VXC_5BITOFFSET_XY(0, 0), \\\n\ + VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0));\n\ + bias_f = vload4(0, bias_ptr + coord.x);\n\ + scale_f = vload4(0, scale_ptr + coord.x);\n\ + vxc_half8 in_h;\n\ + _viv_asm(COPY, in_h, src0, 16);\n\ + VXC_DP4x4(in_f, in_h, in_h, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0),\\\n\ + UniFP16toFP32Lo4_dp4x4);\n\ + vxc_float4 sub, norm;\n\ + sub = in_f - mean;\n\ + norm = scale_f * vari * sub + bias_f;\n\ + half4 norm_h;\n\ + _viv_asm(CONV, norm_h, norm);\n\ + vxc_half8 dst;\n\ + VXC_DP4x4(dst, norm_h, norm_h, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0),\\\n\ + uniExtractHalf4_dp4x4);\n\ + vxc_short8 dstval;\n\ + _viv_asm(COPY, dstval, dst, 16);\n\ + coord_out.x = coord.x;\n\ + VXC_OP4_NoDest(img_store_3d, output, coord_out.xyww, dstval, \\\n\ + VXC_MODIFIER(0, 3, 0,VXC_RM_TowardZero, 0));\n\ + }\n\ +}\n\ +/*****************************layernorm uint8 to uint8****************************/\n\ +_viv_uniform VXC_512Bits uniConvert1stUint8SubZpToFp32_4x4;\n\ +_viv_uniform VXC_512Bits uniConvert2ndUint8SubZpToFp32_4x4;\n\ +_viv_uniform VXC_512Bits uniConvert3rdUint8SubZpToFp32_4x4;\n\ +_viv_uniform VXC_512Bits uniConvert4thUint8SubZpToFp32_4x4;\n\ +_viv_uniform VXC_512Bits uniSumU8_16x1;\n\ +_viv_uniform VXC_512Bits uniSqrSum_16x1;\n\ +_viv_uniform float input_scale;\n\ +_viv_uniform int inputZP;\n\ +_viv_uniform float outputScale;\n\ +_viv_uniform float output_zp;\n\ +_viv_uniform int sumInZp;\n\ +_viv_uniform int tmpZp1;\n\ +_viv_uniform int tmpZp2;\n\ +_viv_uniform float e2InScale;\n\ +_viv_uniform VXC_512Bits uniConvertInt32toUint8_2x8;\n\ +\n\ +_viv_uniform VXC_512Bits uniInt16SumSqr_dp8x2;\n\ +_viv_uniform float dimRatio_scale;\n\ +\n\ +__kernel void layer_norm_U8F32toU8(\n\ + image2d_array_t input, image2d_t bias, image2d_t scale,\n\ + image2d_array_t output, float eps)\n\ +{\n\ + int4 coord = (int4)(0, get_global_id(1), get_global_id(2), 0);\n\ + int4 coord_out = coord;\n\ +\n\ + vxc_uchar16 src0, src2;\n\ + float sum = 0, sqr = 0;\n\ + vxc_float4 bias_f0, bias_f1, bias_f2, bias_f3, scale_f0, scale_f1, scale_f2, scale_f3;\n\ + int tmpSum = 0, tmpSqr = 0;\n\ + vxc_int4 tmpSum1;\n\ + vxc_int4 tmpSqr1;\n\ + short zp 
= inputZP;\n\ +\n\ + int8 input_desc, output_desc;\n\ + _viv_asm(COPY, input_desc, input, sizeof(input_desc));\n\ + int baseAddr_a = (int)get_global_id(2) * input_desc.s4 + input_desc.s0;\n\ + _viv_asm(MOV, coord.z, baseAddr_a);\n\ +\n\ + _viv_asm(COPY, output_desc, output, sizeof(output_desc));\n\ + int baseAddr = (int)get_global_id(2) * output_desc.s4 + output_desc.s0;\n\ + _viv_asm(MOV, coord_out.w, baseAddr);\n\ +\n\ + for(coord.x = 0; coord.x < width; coord.x += 16)\n\ + {\n\ + VXC_OP4(img_load_3d, src0, input, coord.xyzz, VXC_5BITOFFSET_XY(0, 0), \\\n\ + VXC_MODIFIER(0, 15, 0, VXC_RM_TowardZero, 0));\n\ + VXC_DP16x1(tmpSum1, src0, src0, VXC_MODIFIER(0, 0, 0, VXC_RM_TowardZero, 0), uniSumU8_16x1);\n\ + tmpSum += (tmpSum1.x);\n\ + VXC_DP16x1(tmpSqr1, src0, src0, VXC_MODIFIER(0, 0, 0, VXC_RM_TowardZero, 0), uniSqrSum_16x1);\n\ + tmpSqr += (tmpSqr1.x + tmpZp1 * tmpSum1.x);\n\ + }\n\ + sum = (tmpSum + sumInZp) * input_scale;\n\ + sqr = (tmpSqr + tmpZp2) * e2InScale;\n\ +\n\ + float mean, vari;\n\ + mean = sum * dimRatio;\n\ + vari = sqr*dimRatio - mean*mean;\n\ + vari += eps;\n\ + vari = rsqrt(vari);\n\ + vxc_int4 tmpVal0, tmpVal1;\n\ + vxc_float4 tmpData0, tmpData1, tmpData2, tmpData3;\n\ +\n\ + Image img1 = create_image_from_image2d(bias, 4);\n\ + Image img2 = create_image_from_image2d(scale, 4);\n\ + __global float* bias_ptr = (__global float*)get_image_ptr_from_coord(img1, coord.ww);\n\ + __global float* scale_ptr = (__global float*)get_image_ptr_from_coord(img2, coord.ww);\n\ + for(coord.x = 0; coord.x < width; coord.x += 16)\n\ + {\n\ + VXC_OP4(img_load_3d, src0, input, coord.xyzz, VXC_5BITOFFSET_XY(0, 0), \\\n\ + VXC_MODIFIER(0, 15, 0, VXC_RM_TowardZero, 0));\n\ + bias_f0 = vload4(0, bias_ptr);\n\ + bias_f1 = vload4(1, bias_ptr);\n\ + bias_f2 = vload4(2, bias_ptr);\n\ + bias_f3 = vload4(3, bias_ptr);\n\ + scale_f0 = vload4(0, scale_ptr);\n\ + scale_f1 = vload4(1, scale_ptr);\n\ + scale_f2 = vload4(2, scale_ptr);\n\ + scale_f3 = vload4(3, scale_ptr);\n\ + bias_ptr += 16;\n\ + scale_ptr += 16;\n\ +\n\ + VXC_DP4x4(tmpData0, src0, zp, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0),\\\n\ + uniConvert1stUint8SubZpToFp32_4x4);\n\ + VXC_DP4x4(tmpData1, src0, zp, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0),\\\n\ + uniConvert2ndUint8SubZpToFp32_4x4);\n\ + VXC_DP4x4(tmpData2, src0, zp, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0),\\\n\ + uniConvert3rdUint8SubZpToFp32_4x4);\n\ + VXC_DP4x4(tmpData3, src0, zp, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0),\\\n\ + uniConvert4thUint8SubZpToFp32_4x4);\n\ + tmpData0 *= input_scale;\n\ + tmpData1 *= input_scale;\n\ + tmpData2 *= input_scale;\n\ + tmpData3 *= input_scale;\n\ +\n\ + vxc_float4 norm;\n\ + tmpData0 -= mean;\n\ + norm = scale_f0 * vari * tmpData0 + bias_f0;\n\ + tmpVal0 = convert_int4_rte(norm * outputScale + output_zp);\n\ +\n\ + tmpData1 -= mean;\n\ + norm = scale_f1 * vari * tmpData1 + bias_f1;\n\ + tmpVal1 = convert_int4_rte(norm * outputScale + output_zp);\n\ + VXC_DP2x8(src2, tmpVal0, tmpVal1, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 1),\\\n\ + uniConvertInt32toUint8_2x8);\n\ +\n\ + tmpData2 -= mean;\n\ + norm = scale_f2 * vari * tmpData2 + bias_f2;\n\ + tmpVal0 = convert_int4_rte(norm * outputScale + output_zp);\n\ +\n\ + tmpData3 -= mean;\n\ + norm = scale_f3 * vari * tmpData3 + bias_f3;\n\ + tmpVal1 = convert_int4_rte(norm * outputScale + output_zp);\n\ + VXC_DP2x8(src2, tmpVal0, tmpVal1, VXC_MODIFIER(8, 15, 0, VXC_RM_TowardZero, 1),\\\n\ + uniConvertInt32toUint8_2x8);\n\ + coord_out.x = coord.x;\n\ + VXC_OP4_NoDest(img_store_3d, 
output, coord_out.xyww, src2, \\\n\ + VXC_MODIFIER(0, 15, 0,VXC_RM_TowardZero, 0));\n\ + }\n\ +}\n\ +\n\ +__kernel void layer_norm_I16F32toI16(\n\ + image2d_array_t input, image2d_t bias, image2d_t scale,\n\ + image2d_array_t output, float eps)\n\ +{\n\ + int4 coord = (int4)(0, get_global_id(1), get_global_id(2), 0);\n\ +\n\ + int8 input_desc, output_desc;\n\ + _viv_asm(COPY, input_desc, input, sizeof(input_desc));\n\ + int baseAddr_a = (int)get_global_id(2) * input_desc.s4 + input_desc.s0;\n\ + _viv_asm(MOV, coord.z, baseAddr_a);\n\ +\n\ + _viv_asm(COPY, output_desc, output, sizeof(output_desc));\n\ + int baseAddr = (int)get_global_id(2) * output_desc.s4 + output_desc.s0;\n\ + _viv_asm(MOV, coord.w, baseAddr);\n\ +\n\ + vxc_short8 src0, dst;\n\ + vxc_float sum = 0, sqr = 0;\n\ + for(; coord.x < width;)\n\ + {\n\ + VXC_OP4(img_load_3d, src0, input, coord.xyzz, VXC_5BITOFFSET_XY(0, 0), \\\n\ + VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0));\n\ + coord.x += 8;\n\ + vxc_float4 sumsqr;\n\ + VXC_DP8x2(sumsqr, src0, src0, VXC_MODIFIER(0, 1, 0, VXC_RM_TowardZero, 0),\\\n\ + uniInt16SumSqr_dp8x2);\n\ + sum += sumsqr.x;\n\ + sqr = sqr + sumsqr.y * e2InScale;\n\ + }\n\ + vxc_float mean;\n\ + mean = sum * dimRatio_scale;\n\ + vxc_float vari;\n\ + vari = sqr*dimRatio - mean*mean;\n\ + vari += eps;\n\ + vari = rsqrt(vari);\n\ +\n\ + short zp = inputZP;\n\ + vxc_float4 tmpData0, tmpData1;\n\ + vxc_float4 bias_f0, bias_f1, scale_f0, scale_f1;\n\ + vxc_int4 tmpVal0, tmpVal1;\n\ +\n\ + int2 coord_bias = (int2)(0, 0);\n\ + Image img1 = create_image_from_image2d(bias, 4);\n\ + Image img2 = create_image_from_image2d(scale, 4);\n\ + __global float* bias_ptr = (__global float*)get_image_ptr_from_coord(img1, coord_bias);\n\ + __global float* scale_ptr = (__global float*)get_image_ptr_from_coord(img2, coord_bias);\n\ + for(coord.x = 0; coord.x < width; coord.x += 8)\n\ + {\n\ + VXC_OP4(img_load_3d, src0, input, coord.xyzz, VXC_5BITOFFSET_XY(0, 0), \\\n\ + VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0));\n\ + bias_f0 = vload4(0, bias_ptr);\n\ + bias_f1 = vload4(1, bias_ptr);\n\ + scale_f0 = vload4(0, scale_ptr);\n\ + scale_f1 = vload4(1, scale_ptr);\n\ + bias_ptr += 8;\n\ + scale_ptr += 8;\n\ +\n\ + VXC_DP4x4(tmpData0, src0, zp, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0),\\\n\ + uniConvert1stUint8SubZpToFp32_4x4);\n\ + VXC_DP4x4(tmpData1, src0, zp, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0),\\\n\ + uniConvert2ndUint8SubZpToFp32_4x4);\n\ +\n\ + vxc_float4 sub, norm;\n\ + sub = tmpData0 * input_scale - mean;\n\ + norm = scale_f0 * vari * sub + bias_f0;\n\ + tmpVal0 = convert_int4_rte(norm * outputScale + output_zp);\n\ + sub = tmpData1 * input_scale - mean;\n\ + norm = scale_f1 * vari * sub + bias_f1;\n\ + tmpVal1 = convert_int4_rte(norm * outputScale + output_zp);\n\ +\n\ + VXC_DP2x8(dst, tmpVal0, tmpVal1, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 1),\\\n\ + uniConvertInt32toUint8_2x8);\n\ + VXC_OP4_NoDest(img_store_3d, output, coord.xyww, dst, \\\n\ + VXC_MODIFIER(0, 7, 0,VXC_RM_TowardZero, 0));\n\ + }\n\ +}"; /* end of layer_normalization_scale_f32_vx*/ + +static const char layer_normalization_scale_f32_2d_vx[] = "#include \"cl_viv_vx_ext.h\"\n\ +\n\ +/**************************layernorm float16***********************************/\n\ +_viv_uniform int width;\n\ +_viv_uniform float dimRatio;\n\ +_viv_uniform VXC_512Bits uniFp16SumSqr_dp8x2;\n\ +_viv_uniform VXC_512Bits UniFP16toFP32Lo4_dp4x4;\n\ +_viv_uniform VXC_512Bits uniExtractHalf4_dp4x4;\n\ +\n\ +__kernel void layer_norm_F16F32toF16_2D(\n\ + image2d_t input, 
image2d_t bias, image2d_t scale,\n\ + image2d_t output, float eps)\n\ +{\n\ + int4 coord = (int4)(0, get_global_id(1), 0, 0);\n\ + vxc_short8 src0, src1;\n\ + vxc_float sum = 0, sqr = 0;\n\ + VXC_ReadImage(src0, input, coord.xy, VXC_5BITOFFSET_XY(0, 0),\\\n\ + VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0));\n\ +\n\ + Image img1 = create_image_from_image2d(bias, 4);\n\ + Image img2 = create_image_from_image2d(scale, 4);\n\ +\n\ + for(coord.x = 8; coord.x < (width+8); coord.x += 8)\n\ + {\n\ + vxc_half8 val0_h;\n\ + _viv_asm(COPY, val0_h, src0, 16);\n\ + VXC_ReadImage(src0, input, coord.xy, VXC_5BITOFFSET_XY(0, 0),\\\n\ + VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0));\n\ + vxc_float4 sumsqr;\n\ + VXC_DP8x2(sumsqr, val0_h, val0_h, VXC_MODIFIER(0, 1, 0, VXC_RM_TowardZero, 0),\\\n\ + uniFp16SumSqr_dp8x2);\n\ + sum += sumsqr.x;\n\ + sqr += sumsqr.y;\n\ + }\n\ + vxc_float mean;\n\ + mean = sum * dimRatio;\n\ + vxc_float vari;\n\ + vari = sqr*dimRatio - mean*mean;\n\ + vari += eps;\n\ + vari = rsqrt(vari);\n\ + vxc_float4 bias_f, scale_f, in_f;\n\ + __global float* bias_ptr = (__global float*)get_image_ptr_from_coord(img1, coord.zw);\n\ + __global float* scale_ptr = (__global float*)get_image_ptr_from_coord(img2, coord.zw);\n\ + for(coord.x = 0; coord.x < width; coord.x += 4)\n\ + {\n\ + VXC_ReadImage(src0, input, coord.xy, VXC_5BITOFFSET_XY(0, 0),\\\n\ + VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0));\n\ + bias_f = vload4(0, bias_ptr + coord.x);\n\ + scale_f = vload4(0, scale_ptr + coord.x);\n\ +\n\ + vxc_half8 in_h;\n\ + _viv_asm(COPY, in_h, src0, 16);\n\ + VXC_DP4x4(in_f, in_h, in_h, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0),\\\n\ + UniFP16toFP32Lo4_dp4x4);\n\ + vxc_float4 sub, norm;\n\ + sub = in_f - mean;\n\ + norm = scale_f * vari * sub + bias_f;\n\ + half4 norm_h;\n\ + _viv_asm(CONV, norm_h, norm);\n\ + vxc_half8 dst;\n\ + VXC_DP4x4(dst, norm_h, norm_h, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0),\\\n\ + uniExtractHalf4_dp4x4);\n\ + vxc_short8 dstval;\n\ + _viv_asm(COPY, dstval, dst, 16);\n\ + VXC_WriteImage(output, coord.xy, dstval, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0));\n\ + }\n\ +}\n\ +/*****************************layernorm uint8 to uint8****************************/\n\ +_viv_uniform VXC_512Bits uniConvert1stUint8SubZpToFp32_4x4;\n\ +_viv_uniform VXC_512Bits uniConvert2ndUint8SubZpToFp32_4x4;\n\ +_viv_uniform VXC_512Bits uniConvert3rdUint8SubZpToFp32_4x4;\n\ +_viv_uniform VXC_512Bits uniConvert4thUint8SubZpToFp32_4x4;\n\ +_viv_uniform VXC_512Bits uniSumU8_16x1;\n\ +_viv_uniform VXC_512Bits uniSqrSum_16x1;\n\ +_viv_uniform float input_scale;\n\ +_viv_uniform int inputZP;\n\ +_viv_uniform float outputScale;\n\ +_viv_uniform float output_zp;\n\ +_viv_uniform int sumInZp;\n\ +_viv_uniform int tmpZp1;\n\ +_viv_uniform int tmpZp2;\n\ +_viv_uniform float e2InScale;\n\ +_viv_uniform VXC_512Bits uniConvertInt32toUint8_2x8;\n\ +\n\ +_viv_uniform VXC_512Bits uniInt16SumSqr_dp8x2;\n\ +_viv_uniform float dimRatio_scale;\n\ +\n\ +__kernel void layer_norm_U8F32toU8_2D(\n\ + image2d_t input, image2d_t bias, image2d_t scale,\n\ + image2d_t output, float eps)\n\ +{\n\ + int4 coord = (int4)(0, get_global_id(1), 0, 0);\n\ + vxc_uchar16 src0, src2;\n\ + float sum = 0, sqr = 0;\n\ + vxc_float4 bias_f0, bias_f1, bias_f2, bias_f3, scale_f0, scale_f1, scale_f2, scale_f3;\n\ + int tmpSum = 0, tmpSqr = 0;\n\ + vxc_int4 tmpSum1;\n\ + vxc_int4 tmpSqr1;\n\ + short zp = inputZP;\n\ +\n\ + for(coord.x = 0; coord.x < width; coord.x += 16)\n\ + {\n\ + VXC_ReadImage(src0, input, coord.xy, VXC_5BITOFFSET_XY(0, 
0),\\\n\ + VXC_MODIFIER(0, 15, 0, VXC_RM_TowardZero, 0));\n\ + VXC_DP16x1(tmpSum1, src0, src0, VXC_MODIFIER(0, 0, 0, VXC_RM_TowardZero, 0), uniSumU8_16x1);\n\ + tmpSum += (tmpSum1.x);\n\ + VXC_DP16x1(tmpSqr1, src0, src0, VXC_MODIFIER(0, 0, 0, VXC_RM_TowardZero, 0), uniSqrSum_16x1);\n\ + tmpSqr += (tmpSqr1.x + tmpZp1 * tmpSum1.x);\n\ + }\n\ + sum = (tmpSum + sumInZp) * input_scale;\n\ + sqr = (tmpSqr + tmpZp2) * e2InScale;\n\ +\n\ + float mean, vari;\n\ + mean = sum * dimRatio;\n\ + vari = sqr*dimRatio - mean*mean;\n\ + vari += eps;\n\ + vari = rsqrt(vari);\n\ + vxc_int4 tmpVal0, tmpVal1;\n\ + vxc_float4 tmpData0, tmpData1, tmpData2, tmpData3;\n\ +\n\ + Image img1 = create_image_from_image2d(bias, 4);\n\ + Image img2 = create_image_from_image2d(scale, 4);\n\ + __global float* bias_ptr = (__global float*)get_image_ptr_from_coord(img1, coord.zw);\n\ + __global float* scale_ptr = (__global float*)get_image_ptr_from_coord(img2, coord.zw);\n\ + for(coord.x = 0; coord.x < width; coord.x += 16)\n\ + {\n\ + VXC_ReadImage(src0, input, coord.xy, VXC_5BITOFFSET_XY(0, 0),\\\n\ + VXC_MODIFIER(0, 15, 0, VXC_RM_TowardZero, 0));\n\ + bias_f0 = vload4(0, bias_ptr);\n\ + bias_f1 = vload4(1, bias_ptr);\n\ + bias_f2 = vload4(2, bias_ptr);\n\ + bias_f3 = vload4(3, bias_ptr);\n\ + scale_f0 = vload4(0, scale_ptr);\n\ + scale_f1 = vload4(1, scale_ptr);\n\ + scale_f2 = vload4(2, scale_ptr);\n\ + scale_f3 = vload4(3, scale_ptr);\n\ + bias_ptr += 16;\n\ + scale_ptr += 16;\n\ +\n\ + VXC_DP4x4(tmpData0, src0, zp, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0),\\\n\ + uniConvert1stUint8SubZpToFp32_4x4);\n\ + VXC_DP4x4(tmpData1, src0, zp, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0),\\\n\ + uniConvert2ndUint8SubZpToFp32_4x4);\n\ + VXC_DP4x4(tmpData2, src0, zp, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0),\\\n\ + uniConvert3rdUint8SubZpToFp32_4x4);\n\ + VXC_DP4x4(tmpData3, src0, zp, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0),\\\n\ + uniConvert4thUint8SubZpToFp32_4x4);\n\ + tmpData0 = tmpData0 * input_scale - mean;\n\ + tmpData1 = tmpData1 * input_scale - mean;\n\ + tmpData2 = tmpData2 * input_scale - mean;\n\ + tmpData3 = tmpData3 * input_scale - mean;\n\ +\n\ + vxc_float4 norm;\n\ + norm = scale_f0 * vari * tmpData0 + bias_f0;\n\ + tmpVal0 = convert_int4_rte(norm * outputScale + output_zp);\n\ +\n\ + norm = scale_f1 * vari * tmpData1 + bias_f1;\n\ + tmpVal1 = convert_int4_rte(norm * outputScale + output_zp);\n\ + VXC_DP2x8(src2, tmpVal0, tmpVal1, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 1),\\\n\ + uniConvertInt32toUint8_2x8);\n\ +\n\ + norm = scale_f2 * vari * tmpData2 + bias_f2;\n\ + tmpVal0 = convert_int4_rte(norm * outputScale + output_zp);\n\ +\n\ + norm = scale_f3 * vari * tmpData3 + bias_f3;\n\ + tmpVal1 = convert_int4_rte(norm * outputScale + output_zp);\n\ + VXC_DP2x8(src2, tmpVal0, tmpVal1, VXC_MODIFIER(8, 15, 0, VXC_RM_TowardZero, 1),\\\n\ + uniConvertInt32toUint8_2x8);\n\ + VXC_WriteImage(output, coord.xy, src2, VXC_MODIFIER(0, 15, 0, VXC_RM_TowardZero, 0));\n\ + }\n\ +}\n\ +\n\ +__kernel void layer_norm_I16F32toI16_2D(\n\ + image2d_t input, image2d_t bias, image2d_t scale,\n\ + image2d_t output, float eps)\n\ +{\n\ + int4 coord = (int4)(0, get_global_id(1), 0, 0);\n\ +\n\ + vxc_short8 src0, src1, dst;\n\ + vxc_float sum = 0, sqr = 0;\n\ + for(; coord.x < width;)\n\ + {\n\ + VXC_ReadImage(src0, input, coord.xy, VXC_5BITOFFSET_XY(0, 0),\\\n\ + VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0));\n\ + coord.x += 8;\n\ + vxc_float4 sumsqr;\n\ + VXC_DP8x2(sumsqr, src0, src0, VXC_MODIFIER(0, 1, 0, VXC_RM_TowardZero, 0),\\\n\ 
+ uniInt16SumSqr_dp8x2);\n\ + sum += sumsqr.x;\n\ + sqr = sqr + sumsqr.y * e2InScale;\n\ + }\n\ + vxc_float mean, vari;\n\ + mean = sum * dimRatio_scale;\n\ + vari = sqr * dimRatio - mean * mean;\n\ + vari += eps;\n\ + vari = rsqrt(vari);\n\ +\n\ + short zp = inputZP;\n\ + vxc_float4 tmpData0, tmpData1;\n\ + vxc_float4 bias_f0, bias_f1, scale_f0, scale_f1;\n\ + vxc_half8 scale_h;\n\ + vxc_int4 tmpVal0, tmpVal1;\n\ +\n\ + Image img1 = create_image_from_image2d(bias, 4);\n\ + Image img2 = create_image_from_image2d(scale, 4);\n\ +\n\ + __global float* bias_ptr = (__global float*)get_image_ptr_from_coord(img1, coord.zw);\n\ + __global float* scale_ptr = (__global float*)get_image_ptr_from_coord(img2, coord.zw);\n\ + for(coord.x = 0; coord.x < width; coord.x += 8)\n\ + {\n\ + VXC_ReadImage(src0, input, coord.xy, VXC_5BITOFFSET_XY(0, 0),\\\n\ + VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0));\n\ + bias_f0 = vload4(0, bias_ptr);\n\ + bias_f1 = vload4(1, bias_ptr);\n\ + scale_f0 = vload4(0, scale_ptr);\n\ + scale_f1 = vload4(1, scale_ptr);\n\ + bias_ptr += 8;\n\ + scale_ptr += 8;\n\ +\n\ + VXC_DP4x4(tmpData0, src0, zp, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0),\\\n\ + uniConvert1stUint8SubZpToFp32_4x4);\n\ + VXC_DP4x4(tmpData1, src0, zp, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0),\\\n\ + uniConvert2ndUint8SubZpToFp32_4x4);\n\ +\n\ + vxc_float4 sub, norm;\n\ + sub = tmpData0 * input_scale - mean;\n\ + norm = scale_f0 * vari * sub + bias_f0;\n\ + tmpVal0 = convert_int4_rte(norm * outputScale + output_zp);\n\ + sub = tmpData1 * input_scale - mean;\n\ + norm = scale_f1 * vari * sub + bias_f1;\n\ + tmpVal1 = convert_int4_rte(norm * outputScale + output_zp);\n\ +\n\ + VXC_DP2x8(dst, tmpVal0, tmpVal1, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 1),\\\n\ + uniConvertInt32toUint8_2x8);\n\ + VXC_WriteImage(output, coord.xy, dst, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0));\n\ + }\n\ +}"; /* end of layer_normalization_scale_f32_2d_vx*/ + +static const char layer_normalization_scale_f32_bf16_vx[] = "#include \"cl_viv_vx_ext.h\"\n\ +\n\ +/**************************layernorm float16***********************************/\n\ +_viv_uniform int width;\n\ +_viv_uniform float dimRatio;\n\ +_viv_uniform VXC_512Bits uniConvBF16toF32_Part0_2x8;\n\ +_viv_uniform VXC_512Bits uniConvBF16toF32_Part1_2x8;\n\ +_viv_uniform VXC_512Bits uniExtractOddData_2x8;\n\ +\n\ +__kernel void layer_norm_BF16F32toBF16(\n\ + image2d_array_t input, image2d_t bias, image2d_t scale,\n\ + image2d_array_t output, float eps)\n\ +{\n\ + int4 coord = (int4)(0, get_global_id(1), get_global_id(2), 0);\n\ + int4 coord_out = coord;\n\ +\n\ + int8 input_desc, output_desc;\n\ + _viv_asm(COPY, input_desc, input, sizeof(input_desc));\n\ + int baseAddr_a = (int)get_global_id(2) * input_desc.s4 + input_desc.s0;\n\ + _viv_asm(MOV, coord.z, baseAddr_a);\n\ +\n\ + vxc_short8 zero = (vxc_short8)(0, 0, 0, 0, 0, 0, 0, 0);\n\ + float4 ones = (float4)(1.0, 1.0, 1.0, 1.0);\n\ + vxc_ushort8 src0, src1, src2;\n\ + vxc_float sum = 0, sqr = 0;\n\ + VXC_OP4(img_load_3d, src0, input, coord.xyzz, VXC_5BITOFFSET_XY(0, 0), \\\n\ + VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0));\n\ + Image img1 = create_image_from_image2d(bias, 4);\n\ + Image img2 = create_image_from_image2d(scale, 4);\n\ +\n\ + _viv_asm(COPY, output_desc, output, sizeof(output_desc));\n\ + int baseAddr = (int)get_global_id(2) * output_desc.s4 + output_desc.s0;\n\ + _viv_asm(MOV, coord_out.w, baseAddr);\n\ + float4 srcA, srcB;\n\ + for(coord.x = 8; coord.x < (width+8); coord.x += 8)\n\ + {\n\ + VXC_DP2x8(src1, src0, 
zero, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0),\n\ + uniConvBF16toF32_Part0_2x8);\n\ + VXC_DP2x8(src2, src0, zero, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0),\n\ + uniConvBF16toF32_Part1_2x8);\n\ + _viv_asm(COPY, srcA, src1, 16);\n\ + _viv_asm(COPY, srcB, src2, 16);\n\ + VXC_OP4(img_load_3d, src0, input, coord.xyzz, VXC_5BITOFFSET_XY(0, 0), \\\n\ + VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0));\n\ + sum += dot(srcA, ones) + dot(srcB, ones);\n\ + sqr += dot(srcA * srcA, ones) + dot(srcB * srcB, ones);\n\ + }\n\ + vxc_float mean;\n\ + mean = sum * dimRatio;\n\ + vxc_float vari;\n\ + vari = sqr*dimRatio - mean*mean;\n\ + vari += eps;\n\ + vari = rsqrt(vari);\n\ + vxc_float4 bias_f0, bias_f1, scale_f0, scale_f1;\n\ + __global float* bias_ptr = (__global float*)get_image_ptr_from_coord(img1, coord.ww);\n\ + __global float* scale_ptr = (__global float*)get_image_ptr_from_coord(img2, coord.ww);\n\ +\n\ + for(coord.x = 0; coord.x < width; coord.x += 8)\n\ + {\n\ + VXC_OP4(img_load_3d, src0, input, coord.xyzz, VXC_5BITOFFSET_XY(0, 0), \\\n\ + VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0));\n\ + bias_f0 = vload4(0, bias_ptr);\n\ + bias_f1 = vload4(1, bias_ptr);\n\ + scale_f0 = vload4(0, scale_ptr);\n\ + scale_f1 = vload4(1, scale_ptr);\n\ + bias_ptr += 8;\n\ + scale_ptr += 8;\n\ +\n\ + VXC_DP2x8(src1, src0, zero, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0),\n\ + uniConvBF16toF32_Part0_2x8);\n\ + VXC_DP2x8(src2, src0, zero, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0),\n\ + uniConvBF16toF32_Part1_2x8);\n\ + _viv_asm(COPY, srcA, src1, 16);\n\ + _viv_asm(COPY, srcB, src2, 16);\n\ +\n\ +\n\ + vxc_float4 sub0, sub1, norm0, norm1;\n\ + sub0 = srcA - mean;\n\ + sub1 = srcB - mean;\n\ + norm0 = scale_f0 * vari * sub0 + bias_f0;\n\ + norm1 = scale_f1 * vari * sub1 + bias_f1;\n\ +\n\ + _viv_asm(COPY, src0, norm0, 16);\n\ + _viv_asm(COPY, src1, norm1, 16);\n\ + VXC_DP2x8(src2, src0, src1, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0), uniExtractOddData_2x8);\n\ +\n\ + coord_out.x = coord.x;\n\ + VXC_OP4_NoDest(img_store_3d, output, coord_out.xyww, src2, \\\n\ + VXC_MODIFIER(0, 7, 0,VXC_RM_TowardZero, 0));\n\ + }\n\ +}\n\ +\n\ +__kernel void layer_norm_BF16F32toBF16_2D(\n\ + image2d_t input, image2d_t bias, image2d_t scale,\n\ + image2d_t output, float eps)\n\ +{\n\ + int4 coord = (int4)(0, get_global_id(1), 0, 0);\n\ + vxc_ushort8 src0, src1, src2;\n\ + vxc_float sum = 0, sqr = 0;\n\ + VXC_ReadImage(src0, input, coord.xy, VXC_5BITOFFSET_XY(0, 0),\\\n\ + VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0));\n\ +\n\ + vxc_short8 zero = (vxc_short8)(0, 0, 0, 0, 0, 0, 0, 0);\n\ + float4 ones = (float4)(1.0, 1.0, 1.0, 1.0);\n\ + Image img1 = create_image_from_image2d(bias, 4);\n\ + Image img2 = create_image_from_image2d(scale, 4);\n\ + float4 srcA, srcB;\n\ + for(coord.x = 8; coord.x < (width+8); coord.x += 8)\n\ + {\n\ + VXC_DP2x8(src1, src0, zero, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0),\n\ + uniConvBF16toF32_Part0_2x8);\n\ + VXC_DP2x8(src2, src0, zero, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0),\n\ + uniConvBF16toF32_Part1_2x8);\n\ + _viv_asm(COPY, srcA, src1, 16);\n\ + _viv_asm(COPY, srcB, src2, 16);\n\ + VXC_ReadImage(src0, input, coord.xy, VXC_5BITOFFSET_XY(0, 0),\\\n\ + VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0));\n\ + sum += dot(srcA, ones) + dot(srcB, ones);\n\ + sqr += dot(srcA * srcA, ones) + dot(srcB * srcB, ones);\n\ + }\n\ + vxc_float mean;\n\ + mean = sum * dimRatio;\n\ + vxc_float vari;\n\ + vari = sqr*dimRatio - mean*mean;\n\ + vari += eps;\n\ + vari = rsqrt(vari);\n\ + vxc_float4 bias_f0, bias_f1, 
scale_f0, scale_f1;\n\ + __global float* bias_ptr = (__global float*)get_image_ptr_from_coord(img1, coord.zw);\n\ + __global float* scale_ptr = (__global float*)get_image_ptr_from_coord(img2, coord.zw);\n\ + for(coord.x = 0; coord.x < width; coord.x += 8)\n\ + {\n\ + VXC_ReadImage(src0, input, coord.xy, VXC_5BITOFFSET_XY(0, 0),\\\n\ + VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0));\n\ + bias_f0 = vload4(0, bias_ptr);\n\ + bias_f1 = vload4(1, bias_ptr);\n\ + scale_f0 = vload4(0, scale_ptr);\n\ + scale_f1 = vload4(1, scale_ptr);\n\ + bias_ptr += 8;\n\ + scale_ptr += 8;\n\ +\n\ + VXC_DP2x8(src1, src0, zero, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0),\n\ + uniConvBF16toF32_Part0_2x8);\n\ + VXC_DP2x8(src2, src0, zero, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0),\n\ + uniConvBF16toF32_Part1_2x8);\n\ + _viv_asm(COPY, srcA, src1, 16);\n\ + _viv_asm(COPY, srcB, src2, 16);\n\ +\n\ + vxc_float4 sub0, sub1, norm0, norm1;\n\ + sub0 = srcA - mean;\n\ + sub1 = srcB - mean;\n\ + norm0 = scale_f0 * vari * sub0 + bias_f0;\n\ + norm1 = scale_f1 * vari * sub1 + bias_f1;\n\ +\n\ + _viv_asm(COPY, src0, norm0, 16);\n\ + _viv_asm(COPY, src1, norm1, 16);\n\ + VXC_DP2x8(src2, src0, src1, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0), uniExtractOddData_2x8);\n\ + VXC_WriteImage(output, coord.xy, src2, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0));\n\ + }\n\ +}"; /* end of layer_normalization_scale_f32_bf16_vx*/ + static const char layer_normalization_u8_f16_vx[] = "#include \"cl_viv_vx_ext.h\"\n\ \n\ /*****************************layernorm uint8 to fp16****************************/\n\ @@ -20838,6 +24656,213 @@ __kernel void moments_axis2_F16toF16(\n\ VXC_WriteImage(output_vari, coord_out, dst.s4567, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0));\n\ }"; /* end of moments_axis2_vx*/ +static const char one_hot_vx[] = "#include \"cl_viv_vx_ext.h\"\n\ +\n\ +_viv_uniform VXC_512Bits uniDataConvert_0_4x4;\n\ +_viv_uniform VXC_512Bits uniDataConvert_1_4x4;\n\ +_viv_uniform VXC_512Bits uniExtract8Data_2x8;\n\ +_viv_uniform int depth;\n\ +#define ONE_HOT_SH_IMPL(name0, name1, src_type, copy_type, dst_type) \\\n\ +__kernel void one_hot_##name0##to##name1 \\\n\ + ( \\\n\ + __read_only image2d_array_t input, \\\n\ + __write_only image2d_array_t output, \\\n\ + int suffix_sz, \\\n\ + int on_val, \\\n\ + int off_val \\\n\ + ) \\\n\ +{ \\\n\ + int4 coord = (int4)(get_global_id(0), get_global_id(1), 0, 0); \\\n\ + \\\n\ + copy_type src; \\\n\ + src_type val; \\\n\ + \\\n\ + VXC_ReadImage(val, input, coord.xy, 0, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); \\\n\ + _viv_asm(COPY, src, val, 16); \\\n\ + \\\n\ + int4 data0, data1; \\\n\ + VXC_DP4x4(data0, src, src, \\\n\ + VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniDataConvert_0_4x4); \\\n\ + VXC_DP4x4(data1, src, src, \\\n\ + VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniDataConvert_1_4x4); \\\n\ + \\\n\ + do \\\n\ + { \\\n\ + int4 d0 = data0 == coord.zzzz ? on_val : off_val; \\\n\ + int4 d1 = data1 == coord.zzzz ? 
on_val : off_val; \\\n\ + \\\n\ + dst_type dst; \\\n\ + VXC_DP2x8(dst, d0, d1, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0), uniExtract8Data_2x8); \\\n\ + \\\n\ + VXC_WriteImage2DArray(output, coord.xzyw, dst, VXC_MODIFIER(0, 7, 0,VXC_RM_TowardZero, 0)); \\\n\ + \\\n\ + coord.z ++; \\\n\ + } while (coord.z < depth); \\\n\ +}\n\ +ONE_HOT_SH_IMPL(F16, F16, vxc_ushort8, vxc_half8, vxc_ushort8)\n\ +ONE_HOT_SH_IMPL(F16, I16, vxc_ushort8, vxc_half8, vxc_ushort8)\n\ +ONE_HOT_SH_IMPL(F16, I8, vxc_ushort8, vxc_half8, vxc_uchar8)\n\ +ONE_HOT_SH_IMPL(F16, U8, vxc_ushort8, vxc_half8, vxc_uchar8)\n\ +ONE_HOT_SH_IMPL(I16, F16, vxc_short8, vxc_short8, vxc_ushort8)\n\ +ONE_HOT_SH_IMPL(I16, I16, vxc_short8, vxc_short8, vxc_ushort8)\n\ +ONE_HOT_SH_IMPL(I8, F16, vxc_char8, vxc_char8, vxc_ushort8)\n\ +ONE_HOT_SH_IMPL(I8, I8, vxc_char8, vxc_char8, vxc_uchar8)\n\ +\n\ +#define ONE_HOT_SH_IMPL_2D(name0, name1, src_type, copy_type, dst_type) \\\n\ +__kernel void one_hot_##name0##to##name1##_2D \\\n\ + ( \\\n\ + __read_only image2d_array_t input, \\\n\ + __write_only image2d_array_t output, \\\n\ + int suffix_sz, \\\n\ + int on_val, \\\n\ + int off_val \\\n\ + ) \\\n\ +{ \\\n\ + int4 coord = (int4)(get_global_id(0), 0, get_global_id(0), get_global_id(0)); \\\n\ + \\\n\ + copy_type src; \\\n\ + src_type val; \\\n\ + \\\n\ + VXC_ReadImage(val, input, coord.xy, 0, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); \\\n\ + _viv_asm(COPY, src, val, 16); \\\n\ + \\\n\ + int4 data, data0, data1; \\\n\ + VXC_DP4x4(data, src, src, \\\n\ + VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniDataConvert_0_4x4); \\\n\ + int4 d4 = (int4)(0, 1, 2, 3); \\\n\ + \\\n\ + do \\\n\ + { \\\n\ + coord.zw = coord.xx + (int2)(0, 1); \\\n\ + dst_type dst; \\\n\ + data0 = data.xxxx == d4 ? on_val : off_val; \\\n\ + data1 = data.yyyy == d4 ? on_val : off_val; \\\n\ + \\\n\ + VXC_DP2x8(dst, data0, data1, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0), uniExtract8Data_2x8); \\\n\ + \\\n\ + VXC_WriteImage(output, coord.yz, dst, VXC_MODIFIER(0, 3, 0,VXC_RM_TowardZero, 0)); \\\n\ + VXC_WriteImage(output, coord.yw, dst, VXC_MODIFIER(4, 7, 0,VXC_RM_TowardZero, 0)); \\\n\ + coord.zw = coord.zw + (int2)(2, 2); \\\n\ + \\\n\ + data0 = data.zzzz == d4 ? on_val : off_val; \\\n\ + data1 = data.wwww == d4 ? 
on_val : off_val; \\\n\ + \\\n\ + VXC_DP2x8(dst, data0, data1, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0), uniExtract8Data_2x8); \\\n\ + \\\n\ + VXC_WriteImage(output, coord.yz, dst, VXC_MODIFIER(0, 3, 0,VXC_RM_TowardZero, 0)); \\\n\ + VXC_WriteImage(output, coord.yw, dst, VXC_MODIFIER(4, 7, 0,VXC_RM_TowardZero, 0)); \\\n\ + d4 += 4; \\\n\ + coord.y += 4; \\\n\ + } while (coord.y < depth); \\\n\ +}\n\ +ONE_HOT_SH_IMPL_2D(F16, F16, vxc_ushort8, vxc_half8, vxc_ushort8)\n\ +ONE_HOT_SH_IMPL_2D(F16, I16, vxc_ushort8, vxc_half8, vxc_ushort8)\n\ +ONE_HOT_SH_IMPL_2D(F16, I8, vxc_ushort8, vxc_half8, vxc_uchar8)\n\ +ONE_HOT_SH_IMPL_2D(F16, U8, vxc_ushort8, vxc_half8, vxc_uchar8)\n\ +ONE_HOT_SH_IMPL_2D(I16, F16, vxc_short8, vxc_short8, vxc_ushort8)\n\ +ONE_HOT_SH_IMPL_2D(I16, I16, vxc_short8, vxc_short8, vxc_ushort8)\n\ +ONE_HOT_SH_IMPL_2D(I8, F16, vxc_char8, vxc_char8, vxc_ushort8)\n\ +ONE_HOT_SH_IMPL_2D(I8, I8, vxc_char8, vxc_char8, vxc_uchar8)\n\ +\n\ +_viv_uniform float input_scale;\n\ +_viv_uniform float input_tail;\n\ +#define ONE_HOT_ASYM_SH_IMPL(name0, name1, src_type, copy_type, dst_type) \\\n\ +__kernel void one_hot_##name0##to##name1 \\\n\ + ( \\\n\ + __read_only image2d_array_t input, \\\n\ + __write_only image2d_array_t output, \\\n\ + int suffix_sz, \\\n\ + int on_val, \\\n\ + int off_val \\\n\ + ) \\\n\ +{ \\\n\ + int4 coord = (int4)(get_global_id(0), get_global_id(1), 0, 0); \\\n\ + \\\n\ + copy_type src; \\\n\ + src_type val; \\\n\ + \\\n\ + VXC_ReadImage(val, input, coord.xy, 0, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); \\\n\ + _viv_asm(COPY, src, val, 16); \\\n\ + \\\n\ + int4 data0, data1; \\\n\ + float4 v0, v1; \\\n\ + VXC_DP4x4(v0, src, src, \\\n\ + VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniDataConvert_0_4x4); \\\n\ + VXC_DP4x4(v1, src, src, \\\n\ + VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniDataConvert_1_4x4); \\\n\ + \\\n\ + data0 = convert_int4(v0 * input_scale + input_tail); \\\n\ + data1 = convert_int4(v1 * input_scale + input_tail); \\\n\ + do \\\n\ + { \\\n\ + int4 d0 = data0 == coord.zzzz ? on_val : off_val; \\\n\ + int4 d1 = data1 == coord.zzzz ? 
on_val : off_val; \\\n\ + \\\n\ + dst_type dst; \\\n\ + VXC_DP2x8(dst, d0, d1, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0), uniExtract8Data_2x8); \\\n\ + \\\n\ + VXC_WriteImage2DArray(output, coord.xzyw, dst, VXC_MODIFIER(0, 7, 0,VXC_RM_TowardZero, 0)); \\\n\ + \\\n\ + coord.z ++; \\\n\ + } while (coord.z < depth); \\\n\ +}\n\ +ONE_HOT_ASYM_SH_IMPL(U8, F16, vxc_uchar8, vxc_uchar8, vxc_ushort8)\n\ +ONE_HOT_ASYM_SH_IMPL(U8, U8, vxc_uchar8, vxc_uchar8, vxc_uchar8)\n\ +\n\ +#define ONE_HOT_ASYM_SH_IMPL_2D(name0, name1, src_type, copy_type, dst_type) \\\n\ +__kernel void one_hot_##name0##to##name1##_2D \\\n\ + ( \\\n\ + __read_only image2d_array_t input, \\\n\ + __write_only image2d_array_t output, \\\n\ + int suffix_sz, \\\n\ + int on_val, \\\n\ + int off_val \\\n\ + ) \\\n\ +{ \\\n\ + int4 coord = (int4)(get_global_id(0), 0, get_global_id(0), get_global_id(0)); \\\n\ + \\\n\ + copy_type src; \\\n\ + src_type val; \\\n\ + \\\n\ + VXC_ReadImage(val, input, coord.xy, 0, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); \\\n\ + _viv_asm(COPY, src, val, 16); \\\n\ + \\\n\ + int4 data, data0, data1; \\\n\ + float4 v0; \\\n\ + VXC_DP4x4(v0, src, src, \\\n\ + VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniDataConvert_0_4x4); \\\n\ + int4 d4 = (int4)(0, 1, 2, 3); \\\n\ + data = convert_int4(v0 * input_scale + input_tail); \\\n\ + \\\n\ + do \\\n\ + { \\\n\ + coord.zw = coord.xx + (int2)(0, 1); \\\n\ + dst_type dst; \\\n\ + data0 = data.xxxx == d4 ? on_val : off_val; \\\n\ + data1 = data.yyyy == d4 ? on_val : off_val; \\\n\ + \\\n\ + VXC_DP2x8(dst, data0, data1, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0), uniExtract8Data_2x8); \\\n\ + \\\n\ + VXC_WriteImage(output, coord.yz, dst, VXC_MODIFIER(0, 3, 0,VXC_RM_TowardZero, 0)); \\\n\ + VXC_WriteImage(output, coord.yw, dst, VXC_MODIFIER(4, 7, 0,VXC_RM_TowardZero, 0)); \\\n\ + coord.zw = coord.zw + (int2)(2, 2); \\\n\ + \\\n\ + data0 = data.zzzz == d4 ? on_val : off_val; \\\n\ + data1 = data.wwww == d4 ? 
on_val : off_val; \\\n\ + \\\n\ + VXC_DP2x8(dst, data0, data1, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0), uniExtract8Data_2x8); \\\n\ + \\\n\ + VXC_WriteImage(output, coord.yz, dst, VXC_MODIFIER(0, 3, 0,VXC_RM_TowardZero, 0)); \\\n\ + VXC_WriteImage(output, coord.yw, dst, VXC_MODIFIER(4, 7, 0,VXC_RM_TowardZero, 0)); \\\n\ + d4 += 4; \\\n\ + coord.y += 4; \\\n\ + } while (coord.y < depth); \\\n\ +}\n\ +ONE_HOT_ASYM_SH_IMPL_2D(U8, F16, vxc_uchar8, vxc_uchar8, vxc_ushort8)\n\ +ONE_HOT_ASYM_SH_IMPL_2D(U8, U8, vxc_uchar8, vxc_uchar8, vxc_uchar8)\n\ +\n\ +"; /* end of one_hot_vx*/ + static const char poolwithargmax_F16_vx[] = "#include \"cl_viv_vx_ext.h\"\n\ \n\ //-------------------max pooling with argmax---------------\n\ @@ -24667,10 +28692,16 @@ _viv_uniform int r_order;\n\ _viv_uniform int b_order;\n\ _viv_uniform VXC_512Bits uniExtractRtoF32_part0_4x4;\n\ _viv_uniform VXC_512Bits uniExtractRtoF32_part1_4x4;\n\ +_viv_uniform VXC_512Bits uniExtractRtoF32_part2_4x4;\n\ +_viv_uniform VXC_512Bits uniExtractRtoF32_part3_4x4;\n\ _viv_uniform VXC_512Bits uniExtractGtoF32_part0_4x4;\n\ _viv_uniform VXC_512Bits uniExtractGtoF32_part1_4x4;\n\ +_viv_uniform VXC_512Bits uniExtractGtoF32_part2_4x4;\n\ +_viv_uniform VXC_512Bits uniExtractGtoF32_part3_4x4;\n\ _viv_uniform VXC_512Bits uniExtractBtoF32_part0_4x4;\n\ _viv_uniform VXC_512Bits uniExtractBtoF32_part1_4x4;\n\ +_viv_uniform VXC_512Bits uniExtractBtoF32_part2_4x4;\n\ +_viv_uniform VXC_512Bits uniExtractBtoF32_part3_4x4;\n\ _viv_uniform VXC_512Bits uniExtract8Data_2x8;\n\ \n\ #define IMAGE_PRE_PROCESS_COPY_16BITS(dst_name, dst_type, copy_type, convert_type) \\\n\ @@ -24692,13 +28723,14 @@ __kernel void pre_process_rgb_copy_U8to##dst_name \\\n\ { \\\n\ int2 coord = (int2)(get_global_id(0) * 3, get_global_id(1)); \\\n\ \\\n\ - coord.xy += (int2) (*xOffset, *yOffset); \\\n\ + coord.xy = coord.xy + (int2) (*xOffset * 3 + 16, *yOffset); \\\n\ vxc_uchar16 src0, src1; \\\n\ dst_type dst0; \\\n\ copy_type dst; \\\n\ \\\n\ - VXC_ReadImage(src0, input, coord.xy, 0, VXC_MODIFIER(0, 15, 0, VXC_RM_TowardZero, 0)); \\\n\ - VXC_ReadImage(src1, input, coord.xy, VXC_5BITOFFSET_XY(15, 0), \\\n\ + VXC_ReadImage(src0, input, coord.xy, VXC_5BITOFFSET_XY(-16, 0), \\\n\ + VXC_MODIFIER(0, 15, 0, VXC_RM_TowardZero, 0)); \\\n\ + VXC_ReadImage(src1, input, coord.xy, VXC_5BITOFFSET_XY(0, 0), \\\n\ VXC_MODIFIER(0, 15, 0, VXC_RM_TowardZero, 0)); \\\n\ \\\n\ f32Var *= outputScale; \\\n\ @@ -24709,7 +28741,7 @@ __kernel void pre_process_rgb_copy_U8to##dst_name \\\n\ float4 tmp0, tmp1; \\\n\ convert_type result0, result1; \\\n\ \\\n\ - VXC_DP4x4(tmp0, src0, src1, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniExtractRtoF32_part0_4x4); \\\n\ + VXC_DP4x4(tmp0, src0, src0, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniExtractRtoF32_part0_4x4); \\\n\ VXC_DP4x4(tmp1, src0, src1, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniExtractRtoF32_part1_4x4); \\\n\ tmp0 = tmp0 * paramData.w - paramData.x; \\\n\ tmp1 = tmp1 * paramData.w - paramData.x; \\\n\ @@ -24720,7 +28752,7 @@ __kernel void pre_process_rgb_copy_U8to##dst_name \\\n\ VXC_WriteImage2DArray(output, coord_out, dst, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); \\\n\ \\\n\ coord_out.z = 1; \\\n\ - VXC_DP4x4(tmp0, src0, src1, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniExtractGtoF32_part0_4x4); \\\n\ + VXC_DP4x4(tmp0, src0, src0, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniExtractGtoF32_part0_4x4); \\\n\ VXC_DP4x4(tmp1, src0, src1, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniExtractGtoF32_part1_4x4); \\\n\ tmp0 = 
tmp0 * paramData.w - paramData.y; \\\n\ tmp1 = tmp1 * paramData.w - paramData.y; \\\n\ @@ -24731,7 +28763,7 @@ __kernel void pre_process_rgb_copy_U8to##dst_name \\\n\ VXC_WriteImage2DArray(output, coord_out, dst, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); \\\n\ \\\n\ coord_out.z = b_order; \\\n\ - VXC_DP4x4(tmp0, src0, src1, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniExtractBtoF32_part0_4x4); \\\n\ + VXC_DP4x4(tmp0, src0, src0, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniExtractBtoF32_part0_4x4); \\\n\ VXC_DP4x4(tmp1, src0, src1, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniExtractBtoF32_part1_4x4); \\\n\ tmp0 = tmp0 * paramData.w - paramData.z; \\\n\ tmp1 = tmp1 * paramData.w - paramData.z; \\\n\ @@ -24762,12 +28794,16 @@ __kernel void pre_process_rgb_copy_U8to##dst_name \\\n\ ) \\\n\ { \\\n\ int2 coord = (int2)(get_global_id(0) * 3, get_global_id(1)); \\\n\ - coord.xy += (int2) (*xOffset, *yOffset); \\\n\ - vxc_uchar16 src0, src1; \\\n\ + coord.xy = coord.xy + (int2) (*xOffset * 3 + 16, *yOffset); \\\n\ + vxc_uchar16 src0, src1, src2; \\\n\ dst_type dst; \\\n\ \\\n\ - VXC_ReadImage(src0, input, coord.xy, 0, VXC_MODIFIER(0, 15, 0, VXC_RM_TowardZero, 0)); \\\n\ - VXC_ReadImage(src1, input, coord.xy, VXC_5BITOFFSET_XY(15, 0), \\\n\ + VXC_ReadImage(src0, input, coord.xy, VXC_5BITOFFSET_XY(-16, 0), \\\n\ + VXC_MODIFIER(0, 15, 0, VXC_RM_TowardZero, 0)); \\\n\ + VXC_ReadImage(src1, input, coord.xy, VXC_5BITOFFSET_XY(0, 0), \\\n\ + VXC_MODIFIER(0, 15, 0, VXC_RM_TowardZero, 0)); \\\n\ + coord.x += 16; \\\n\ + VXC_ReadImage(src2, input, coord.xy, VXC_5BITOFFSET_XY(0, 0), \\\n\ VXC_MODIFIER(0, 15, 0, VXC_RM_TowardZero, 0)); \\\n\ \\\n\ f32Var *= outputScale; \\\n\ @@ -24778,35 +28814,55 @@ __kernel void pre_process_rgb_copy_U8to##dst_name \\\n\ float4 tmp0, tmp1; \\\n\ int4 result0, result1; \\\n\ \\\n\ - VXC_DP4x4(tmp0, src0, src1, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniExtractRtoF32_part0_4x4); \\\n\ + VXC_DP4x4(tmp0, src0, src0, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniExtractRtoF32_part0_4x4); \\\n\ VXC_DP4x4(tmp1, src0, src1, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniExtractRtoF32_part1_4x4); \\\n\ tmp0 = tmp0 * paramData.w - paramData.x; \\\n\ tmp1 = tmp1 * paramData.w - paramData.x; \\\n\ result0 = convert_int4_rte(tmp0); \\\n\ result1 = convert_int4_rte(tmp1); \\\n\ VXC_DP2x8(dst, result0, result1, VXC_MODIFIER(0, 7, 0, VXC_RM_ToNearestEven, 1), uniExtract8Data_2x8); \\\n\ - VXC_WriteImage2DArray(output, coord_out, dst, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); \\\n\ - \\\n\ + VXC_DP4x4(tmp0, src1, src2, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniExtractRtoF32_part2_4x4); \\\n\ + VXC_DP4x4(tmp1, src1, src2, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniExtractRtoF32_part3_4x4); \\\n\ + tmp0 = tmp0 * paramData.w - paramData.x; \\\n\ + tmp1 = tmp1 * paramData.w - paramData.x; \\\n\ + result0 = convert_int4_rte(tmp0); \\\n\ + result1 = convert_int4_rte(tmp1); \\\n\ + VXC_DP2x8(dst, result0, result1, VXC_MODIFIER(8, 15, 0, VXC_RM_ToNearestEven, 1), uniExtract8Data_2x8); \\\n\ + VXC_WriteImage2DArray(output, coord_out, dst, VXC_MODIFIER(0, 15, 0, VXC_RM_TowardZero, 0)); \\\n\ \\\n\ coord_out.z = 1; \\\n\ - VXC_DP4x4(tmp0, src0, src1, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniExtractGtoF32_part0_4x4); \\\n\ + VXC_DP4x4(tmp0, src0, src0, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniExtractGtoF32_part0_4x4); \\\n\ VXC_DP4x4(tmp1, src0, src1, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniExtractGtoF32_part1_4x4); \\\n\ tmp0 = tmp0 * 
paramData.w - paramData.y; \\\n\ tmp1 = tmp1 * paramData.w - paramData.y; \\\n\ result0 = convert_int4_rte(tmp0); \\\n\ result1 = convert_int4_rte(tmp1); \\\n\ VXC_DP2x8(dst, result0, result1, VXC_MODIFIER(0, 7, 0, VXC_RM_ToNearestEven, 1), uniExtract8Data_2x8); \\\n\ - VXC_WriteImage2DArray(output, coord_out, dst, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); \\\n\ + VXC_DP4x4(tmp0, src1, src2, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniExtractGtoF32_part2_4x4); \\\n\ + VXC_DP4x4(tmp1, src1, src2, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniExtractGtoF32_part3_4x4); \\\n\ + tmp0 = tmp0 * paramData.w - paramData.y; \\\n\ + tmp1 = tmp1 * paramData.w - paramData.y; \\\n\ + result0 = convert_int4_rte(tmp0); \\\n\ + result1 = convert_int4_rte(tmp1); \\\n\ + VXC_DP2x8(dst, result0, result1, VXC_MODIFIER(8, 15, 0, VXC_RM_ToNearestEven, 1), uniExtract8Data_2x8); \\\n\ + VXC_WriteImage2DArray(output, coord_out, dst, VXC_MODIFIER(0, 15, 0, VXC_RM_TowardZero, 0)); \\\n\ \\\n\ coord_out.z = b_order; \\\n\ - VXC_DP4x4(tmp0, src0, src1, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniExtractBtoF32_part0_4x4); \\\n\ + VXC_DP4x4(tmp0, src0, src0, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniExtractBtoF32_part0_4x4); \\\n\ VXC_DP4x4(tmp1, src0, src1, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniExtractBtoF32_part1_4x4); \\\n\ tmp0 = tmp0 * paramData.w - paramData.z; \\\n\ tmp1 = tmp1 * paramData.w - paramData.z; \\\n\ result0 = convert_int4_rte(tmp0); \\\n\ result1 = convert_int4_rte(tmp1); \\\n\ VXC_DP2x8(dst, result0, result1, VXC_MODIFIER(0, 7, 0, VXC_RM_ToNearestEven, 1), uniExtract8Data_2x8); \\\n\ - VXC_WriteImage2DArray(output, coord_out, dst, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); \\\n\ + VXC_DP4x4(tmp0, src1, src2, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniExtractBtoF32_part2_4x4); \\\n\ + VXC_DP4x4(tmp1, src1, src2, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniExtractBtoF32_part3_4x4); \\\n\ + tmp0 = tmp0 * paramData.w - paramData.z; \\\n\ + tmp1 = tmp1 * paramData.w - paramData.z; \\\n\ + result0 = convert_int4_rte(tmp0); \\\n\ + result1 = convert_int4_rte(tmp1); \\\n\ + VXC_DP2x8(dst, result0, result1, VXC_MODIFIER(8, 15, 0, VXC_RM_ToNearestEven, 1), uniExtract8Data_2x8); \\\n\ + VXC_WriteImage2DArray(output, coord_out, dst, VXC_MODIFIER(0, 15, 0, VXC_RM_TowardZero, 0)); \\\n\ }\n\ IMAGE_PRE_PROCESS_COPY_8BITS(U8, vxc_uchar16)\n\ IMAGE_PRE_PROCESS_COPY_8BITS(I8, vxc_char16)\n\ @@ -24867,7 +28923,7 @@ __kernel void pre_process_yuv420_copy_U8toU8(\n\ )\n\ {\n\ int4 pos = (int4)(get_global_id(0) + (*xOffset), get_global_id(1) + (*yOffset), 0, 0);\n\ - int4 pos1 = (int4)((get_global_id(0) + (*xOffset)) >> 1, (get_global_id(1) + (*yOffset)) >> 1, 0, 0);\n\ + int2 pos1 = (int2)((get_global_id(0) + (*xOffset)) >> 1, (get_global_id(1) + (*yOffset)) >> 1);\n\ vxc_uchar16 Y;\n\ vxc_uchar8 U, V;\n\ vxc_int4 C0, C1, C2, C3;\n\ @@ -24946,6 +29002,112 @@ __kernel void pre_process_yuv420_copy_U8toU8(\n\ VXC_WriteImage2DArray(output, pos, dst2, VXC_MODIFIER(0, 15, 0, VXC_RM_TowardZero, 0));\n\ }\n\ \n\ +__kernel void pre_process_yuv420_copy_U8toF16(\n\ + __read_only image2d_t y_img,\n\ + __read_only image2d_t u_img,\n\ + __read_only image2d_t v_img,\n\ + __write_only image2d_array_t output,\n\ + global int * xRatio,\n\ + global int * yRatio,\n\ + global int * xOffset,\n\ + global int * yOffset,\n\ + float rMean,\n\ + float gMean,\n\ + float bMean,\n\ + float var,\n\ + int reverse_channel,\n\ + int trans\n\ + )\n\ +{\n\ + int4 pos = (int4)(get_global_id(0) + (*xOffset), 
get_global_id(1) + (*yOffset), 0, 0);\n\ + int2 pos1 = (int2)((get_global_id(0) + (*xOffset)) >> 1, (get_global_id(1) + (*yOffset)) >> 1);\n\ + vxc_uchar16 Y;\n\ + vxc_uchar8 U, V;\n\ + vxc_int4 C0, C1, C2, C3;\n\ + vxc_uchar16 R, G, B;\n\ + vxc_half8 dst0, dst1, dst2, dst3, dst4, dst5;\n\ + vxc_short8 out0, out1, out2, out3, out4, out5;\n\ +\n\ + VXC_ReadImage(Y, y_img, pos.xy, VXC_5BITOFFSET_XY(0, 0), VXC_MODIFIER(0, 15, 0, VXC_RM_TowardZero, 0));\n\ + VXC_ReadImage(U, u_img, pos1.xy, VXC_5BITOFFSET_XY(0, 0), VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0));\n\ + VXC_ReadImage(V, v_img, pos1.xy, VXC_5BITOFFSET_XY(0, 0), VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0));\n\ +\n\ + //C = Y - 16;\n\ + //D = U - 128;\n\ + //E = V - 128;\n\ + // calculate R\n\ + // ((298 * C + 409 * E + 128) >> 8) --> [(298Y + 409V - 56992) >> 8]\n\ + int tmpV = -56992;\n\ + VXC_DP4x4(C0, Y, V, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniCalculateTmpR1st_4x4);\n\ + VXC_DP4x4(C1, Y, V, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniCalculateTmpR2nd_4x4);\n\ + VXC_DP4x4(C2, Y, V, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniCalculateTmpR3rd_4x4);\n\ + VXC_DP4x4(C3, Y, V, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniCalculateTmpR4th_4x4);\n\ +\n\ + VXC_DP4x4(R, C0, tmpV, VXC_MODIFIER(0, 3, 0, VXC_RM_ToNearestEven, 1), uniCalculateR1st_4x4);\n\ + VXC_DP4x4(R, C1, tmpV, VXC_MODIFIER(4, 7, 0, VXC_RM_ToNearestEven, 1), uniCalculateR1st_4x4);\n\ + VXC_DP4x4(R, C2, tmpV, VXC_MODIFIER(8, 11, 0, VXC_RM_ToNearestEven, 1), uniCalculateR1st_4x4);\n\ + VXC_DP4x4(R, C3, tmpV, VXC_MODIFIER(12, 15, 0, VXC_RM_ToNearestEven, 1), uniCalculateR1st_4x4);\n\ +\n\ + // calculate G\n\ + // ((298 * C - 100* D - 208 * E + 128) >> 8) --> [(298Y - 100U - 208V + 34784) >> 8]\n\ + // 298Y - 208V\n\ + VXC_DP4x4(C0, Y, V, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniCalculateTmpG1st_4x4);\n\ + VXC_DP4x4(C1, Y, V, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniCalculateTmpG2nd_4x4);\n\ + VXC_DP4x4(C2, Y, V, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniCalculateTmpG3rd_4x4);\n\ + VXC_DP4x4(C3, Y, V, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniCalculateTmpG4th_4x4);\n\ + // 34784 - 100U\n\ + ushort tmpG = 34784;\n\ + vxc_ushort8 tmpDstG;\n\ + VXC_DP2x8(tmpDstG, U, tmpG, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0), uniCalculateTmpGbyU_2x8);\n\ + VXC_DP4x4(G, C0, tmpDstG, VXC_MODIFIER(0, 3, 0, VXC_RM_ToNearestEven, 1), uniCalculateG1st_4x4);\n\ + VXC_DP4x4(G, C1, tmpDstG, VXC_MODIFIER(4, 7, 0, VXC_RM_ToNearestEven, 1), uniCalculateG2nd_4x4);\n\ + VXC_DP4x4(G, C2, tmpDstG, VXC_MODIFIER(8, 11, 0, VXC_RM_ToNearestEven, 1), uniCalculateG3rd_4x4);\n\ + VXC_DP4x4(G, C3, tmpDstG, VXC_MODIFIER(12, 15, 0, VXC_RM_ToNearestEven, 1), uniCalculateG4th_4x4);\n\ +\n\ + // calculate B\n\ + // ((298 * C + 516 * D + 128) >> 8) ==> [(298Y + 516U - 70688) >> 8]\n\ + VXC_DP4x4(C0, Y, U, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniCalculateTmpB1st_4x4);\n\ + VXC_DP4x4(C1, Y, U, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniCalculateTmpB2nd_4x4);\n\ + VXC_DP4x4(C2, Y, U, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniCalculateTmpB3rd_4x4);\n\ + VXC_DP4x4(C3, Y, U, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniCalculateTmpB4th_4x4);\n\ + tmpV = -70688;\n\ + VXC_DP4x4(B, C0, tmpV, VXC_MODIFIER(0, 3, 0, VXC_RM_ToNearestEven, 1), uniCalculateB1st_4x4);\n\ + VXC_DP4x4(B, C1, tmpV, VXC_MODIFIER(4, 7, 0, VXC_RM_ToNearestEven, 1), uniCalculateB1st_4x4);\n\ + VXC_DP4x4(B, C2, tmpV, VXC_MODIFIER(8, 11, 0, VXC_RM_ToNearestEven, 1), uniCalculateB1st_4x4);\n\ + 
VXC_DP4x4(B, C3, tmpV, VXC_MODIFIER(12, 15, 0, VXC_RM_ToNearestEven, 1), uniCalculateB1st_4x4);\n\ +\n\ + float4 paramData = (float4)(bMean * var, gMean * var,\\\n\ + rMean * var, var);\n\ + half4 paramData_f16;\n\ + _viv_asm(CONV, paramData_f16, paramData);\n\ +\n\ + VXC_DP2x8(dst0, B, paramData_f16, VXC_MODIFIER(0, 7, 0, VXC_RM_ToNearestEven, 1), uniQuantU8toU8LoB_2x8);\n\ + VXC_DP2x8(dst1, B, paramData_f16, VXC_MODIFIER(0, 7, 0, VXC_RM_ToNearestEven, 1), uniQuantU8toU8HiB_2x8);\n\ +\n\ + VXC_DP2x8(dst2, G, paramData_f16, VXC_MODIFIER(0, 7, 0, VXC_RM_ToNearestEven, 1), uniQuantU8toU8LoG_2x8);\n\ + VXC_DP2x8(dst3, G, paramData_f16, VXC_MODIFIER(0, 7, 0, VXC_RM_ToNearestEven, 1), uniQuantU8toU8HiG_2x8);\n\ +\n\ + VXC_DP2x8(dst4, R, paramData_f16, VXC_MODIFIER(0, 7, 0, VXC_RM_ToNearestEven, 1), uniQuantU8toU8LoR_2x8);\n\ + VXC_DP2x8(dst5, R, paramData_f16, VXC_MODIFIER(0, 7, 0, VXC_RM_ToNearestEven, 1), uniQuantU8toU8HiR_2x8);\n\ +\n\ + _viv_asm(COPY, out0, dst0, 16);\n\ + _viv_asm(COPY, out1, dst1, 16);\n\ + _viv_asm(COPY, out2, dst2, 16);\n\ + _viv_asm(COPY, out3, dst3, 16);\n\ + _viv_asm(COPY, out4, dst4, 16);\n\ + _viv_asm(COPY, out5, dst5, 16);\n\ +\n\ + pos = (int4)(get_global_id(0), get_global_id(1), bOrder, get_global_id(0) + 8);\n\ + VXC_WriteImage2DArray(output, pos.xyzz, out0, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0));\n\ + VXC_WriteImage2DArray(output, pos.wyzz, out1, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0));\n\ + pos.z = 1;\n\ + VXC_WriteImage2DArray(output, pos.xyzz, out2, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0));\n\ + VXC_WriteImage2DArray(output, pos.wyzz, out3, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0));\n\ + pos.z = rOrder;\n\ + VXC_WriteImage2DArray(output, pos.xyzz, out4, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0));\n\ + VXC_WriteImage2DArray(output, pos.wyzz, out5, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0));\n\ +}\n\ +\n\ "; /* end of pre_process_yuv420_copy_u8_vx*/ static const char pre_process_yuv420_scale_fp16_vx[] = "#include \"cl_viv_vx_ext.h\"\n\ @@ -25919,7 +30081,7 @@ __kernel void pre_process_yuv444_copy_U8toU8(\n\ int trans\n\ )\n\ {\n\ - int4 pos = (int4)(get_global_id(0) + (*xOffset), get_global_id(1) + (*yOffset), 0, 0);\n\ + int2 pos = (int2)(get_global_id(0) + (*xOffset), get_global_id(1) + (*yOffset));\n\ vxc_uchar16 Y, U, V;\n\ vxc_int4 C0, C1, C2, C3;\n\ vxc_uchar16 R, G, B;\n\ @@ -25990,13 +30152,118 @@ __kernel void pre_process_yuv444_copy_U8toU8(\n\ VXC_DP2x8(dst2, R, paramData_f16, VXC_MODIFIER(0, 7, 0, VXC_RM_ToNearestEven, 1), uniQuantU8toU8LoR_2x8);\n\ VXC_DP2x8(dst2, R, paramData_f16, VXC_MODIFIER(8, 15, 0, VXC_RM_ToNearestEven, 1), uniQuantU8toU8HiR_2x8);\n\ \n\ - pos = (int4)(get_global_id(0), get_global_id(1), 0, 0);\n\ - pos.z = bOrder;\n\ - VXC_WriteImage2DArray(output, pos, dst0, VXC_MODIFIER(0, 15, 0, VXC_RM_TowardZero, 0));\n\ - pos.z = 1;\n\ - VXC_WriteImage2DArray(output, pos, dst1, VXC_MODIFIER(0, 15, 0, VXC_RM_TowardZero, 0));\n\ - pos.z = rOrder;\n\ - VXC_WriteImage2DArray(output, pos, dst2, VXC_MODIFIER(0, 15, 0, VXC_RM_TowardZero, 0));\n\ + int4 pos1 = (int4)(get_global_id(0), get_global_id(1), bOrder, 0);\n\ + VXC_WriteImage2DArray(output, pos1, dst0, VXC_MODIFIER(0, 15, 0, VXC_RM_TowardZero, 0));\n\ + pos1.z = 1;\n\ + VXC_WriteImage2DArray(output, pos1, dst1, VXC_MODIFIER(0, 15, 0, VXC_RM_TowardZero, 0));\n\ + pos1.z = rOrder;\n\ + VXC_WriteImage2DArray(output, pos1, dst2, VXC_MODIFIER(0, 15, 0, VXC_RM_TowardZero, 0));\n\ +}\n\ +\n\ +__kernel void pre_process_yuv444_copy_U8toF16(\n\ + __read_only image2d_t 
y_img,\n\ + __read_only image2d_t u_img,\n\ + __read_only image2d_t v_img,\n\ + __write_only image2d_array_t output,\n\ + global int * xRatio,\n\ + global int * yRatio,\n\ + global int * xOffset,\n\ + global int * yOffset,\n\ + float rMean,\n\ + float gMean,\n\ + float bMean,\n\ + float var,\n\ + int reverse_channel,\n\ + int trans\n\ + )\n\ +{\n\ + int2 pos = (int2)(get_global_id(0) + (*xOffset), get_global_id(1) + (*yOffset));\n\ + vxc_uchar16 Y, U, V;\n\ + vxc_int4 C0, C1, C2, C3;\n\ + vxc_uchar16 R, G, B;\n\ + vxc_half8 dst0, dst1, dst2, dst3, dst4, dst5;\n\ + vxc_short8 out0, out1, out2, out3, out4, out5;\n\ +\n\ + VXC_ReadImage(Y, y_img, pos.xy, VXC_5BITOFFSET_XY(0, 0), VXC_MODIFIER(0, 15, 0, VXC_RM_TowardZero, 0));\n\ + VXC_ReadImage(U, u_img, pos.xy, VXC_5BITOFFSET_XY(0, 0), VXC_MODIFIER(0, 15, 0, VXC_RM_TowardZero, 0));\n\ + VXC_ReadImage(V, v_img, pos.xy, VXC_5BITOFFSET_XY(0, 0), VXC_MODIFIER(0, 15, 0, VXC_RM_TowardZero, 0));\n\ +\n\ + //C = Y - 16;\n\ + //D = U - 128;\n\ + //E = V - 128;\n\ + // calculate R\n\ + // ((298 * C + 409 * E + 128) >> 8) --> [(298Y + 409V - 56992) >> 8]\n\ + int tmpV = -56992;\n\ + VXC_DP4x4(C0, Y, V, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniCalculateTmpR1st_4x4);\n\ + VXC_DP4x4(C1, Y, V, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniCalculateTmpR2nd_4x4);\n\ + VXC_DP4x4(C2, Y, V, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniCalculateTmpR3rd_4x4);\n\ + VXC_DP4x4(C3, Y, V, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniCalculateTmpR4th_4x4);\n\ +\n\ + VXC_DP4x4(R, C0, tmpV, VXC_MODIFIER(0, 3, 0, VXC_RM_ToNearestEven, 1), uniCalculateR1st_4x4);\n\ + VXC_DP4x4(R, C1, tmpV, VXC_MODIFIER(4, 7, 0, VXC_RM_ToNearestEven, 1), uniCalculateR1st_4x4);\n\ + VXC_DP4x4(R, C2, tmpV, VXC_MODIFIER(8, 11, 0, VXC_RM_ToNearestEven, 1), uniCalculateR1st_4x4);\n\ + VXC_DP4x4(R, C3, tmpV, VXC_MODIFIER(12, 15, 0, VXC_RM_ToNearestEven, 1), uniCalculateR1st_4x4);\n\ +\n\ + // calculate G\n\ + // ((298 * C - 100* D - 208 * E + 128) >> 8) --> [(298Y - 100U - 208V + 34784) >> 8]\n\ + // 298Y - 208V\n\ + VXC_DP4x4(C0, Y, V, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniCalculateTmpG1st_4x4);\n\ + VXC_DP4x4(C1, Y, V, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniCalculateTmpG2nd_4x4);\n\ + VXC_DP4x4(C2, Y, V, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniCalculateTmpG3rd_4x4);\n\ + VXC_DP4x4(C3, Y, V, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniCalculateTmpG4th_4x4);\n\ + // 34784 - 100U\n\ + ushort tmpG = 34784;\n\ + vxc_ushort8 tmpDstG0, tmpDstG1;\n\ + VXC_DP2x8(tmpDstG0, U, tmpG, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0), uniCalculateTmpGbyU_2x8);\n\ + VXC_DP2x8(tmpDstG1, U, tmpG, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0), uniCalculateTmpGbyU2_2x8);\n\ +\n\ + VXC_DP4x4(G, C0, tmpDstG0, VXC_MODIFIER(0, 3, 0, VXC_RM_ToNearestEven, 1), uniCalculateG1st_4x4);\n\ + VXC_DP4x4(G, C1, tmpDstG0, VXC_MODIFIER(4, 7, 0, VXC_RM_ToNearestEven, 1), uniCalculateG2nd_4x4);\n\ + VXC_DP4x4(G, C2, tmpDstG1, VXC_MODIFIER(8, 11, 0, VXC_RM_ToNearestEven, 1), uniCalculateG1st_4x4);\n\ + VXC_DP4x4(G, C3, tmpDstG1, VXC_MODIFIER(12, 15, 0, VXC_RM_ToNearestEven, 1), uniCalculateG2nd_4x4);\n\ +\n\ + // calculate B\n\ + // ((298 * C + 516 * D + 128) >> 8) ==> [(298Y + 516U - 70688) >> 8]\n\ + VXC_DP4x4(C0, Y, U, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniCalculateTmpB1st_4x4);\n\ + VXC_DP4x4(C1, Y, U, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniCalculateTmpB2nd_4x4);\n\ + VXC_DP4x4(C2, Y, U, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniCalculateTmpB3rd_4x4);\n\ + 
VXC_DP4x4(C3, Y, U, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniCalculateTmpB4th_4x4);\n\ + tmpV = -70688;\n\ + VXC_DP4x4(B, C0, tmpV, VXC_MODIFIER(0, 3, 0, VXC_RM_ToNearestEven, 1), uniCalculateB1st_4x4);\n\ + VXC_DP4x4(B, C1, tmpV, VXC_MODIFIER(4, 7, 0, VXC_RM_ToNearestEven, 1), uniCalculateB1st_4x4);\n\ + VXC_DP4x4(B, C2, tmpV, VXC_MODIFIER(8, 11, 0, VXC_RM_ToNearestEven, 1), uniCalculateB1st_4x4);\n\ + VXC_DP4x4(B, C3, tmpV, VXC_MODIFIER(12, 15, 0, VXC_RM_ToNearestEven, 1), uniCalculateB1st_4x4);\n\ +\n\ + float4 paramData = (float4)(bMean * var, gMean * var,\\\n\ + rMean * var, var);\n\ + half4 paramData_f16;\n\ + _viv_asm(CONV, paramData_f16, paramData);\n\ +\n\ + VXC_DP2x8(dst0, B, paramData_f16, VXC_MODIFIER(0, 7, 0, VXC_RM_ToNearestEven, 1), uniQuantU8toU8LoB_2x8);\n\ + VXC_DP2x8(dst1, B, paramData_f16, VXC_MODIFIER(0, 7, 0, VXC_RM_ToNearestEven, 1), uniQuantU8toU8HiB_2x8);\n\ +\n\ + VXC_DP2x8(dst2, G, paramData_f16, VXC_MODIFIER(0, 7, 0, VXC_RM_ToNearestEven, 1), uniQuantU8toU8LoG_2x8);\n\ + VXC_DP2x8(dst3, G, paramData_f16, VXC_MODIFIER(0, 7, 0, VXC_RM_ToNearestEven, 1), uniQuantU8toU8HiG_2x8);\n\ +\n\ + VXC_DP2x8(dst4, R, paramData_f16, VXC_MODIFIER(0, 7, 0, VXC_RM_ToNearestEven, 1), uniQuantU8toU8LoR_2x8);\n\ + VXC_DP2x8(dst5, R, paramData_f16, VXC_MODIFIER(0, 7, 0, VXC_RM_ToNearestEven, 1), uniQuantU8toU8HiR_2x8);\n\ +\n\ + _viv_asm(COPY, out0, dst0, 16);\n\ + _viv_asm(COPY, out1, dst1, 16);\n\ + _viv_asm(COPY, out2, dst2, 16);\n\ + _viv_asm(COPY, out3, dst3, 16);\n\ + _viv_asm(COPY, out4, dst4, 16);\n\ + _viv_asm(COPY, out5, dst5, 16);\n\ +\n\ + int4 pos1 = (int4)(get_global_id(0), get_global_id(1), bOrder, get_global_id(0) + 8);\n\ + VXC_WriteImage2DArray(output, pos1.xyzz, out0, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0));\n\ + VXC_WriteImage2DArray(output, pos1.wyzz, out1, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0));\n\ + pos1.z = 1;\n\ + VXC_WriteImage2DArray(output, pos1.xyzz, out2, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0));\n\ + VXC_WriteImage2DArray(output, pos1.wyzz, out3, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0));\n\ + pos1.z = rOrder;\n\ + VXC_WriteImage2DArray(output, pos1.xyzz, out4, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0));\n\ + VXC_WriteImage2DArray(output, pos1.wyzz, out5, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0));\n\ }\n\ "; /* end of pre_process_yuv444_copy_u8_vx*/ @@ -29189,6 +33456,465 @@ TENSOR_KERAS_RELU(U8, U8, _2D, Image, U8toF32, F32toU8, vxc_uchar TENSOR_KERAS_RELU(U8, F16, _2D, Image, U8toF32, F32toF16, vxc_uchar8, vxc_ushort8)\n\ "; /* end of relu_keras_vx*/ +static const char repeat_vx[] = "#include \"cl_viv_vx_ext.h\"\n\ +\n\ +_viv_uniform VXC_512Bits uniIntegralHorAcc_4x4;\n\ +_viv_uniform VXC_512Bits uniExtract1to8Short_2x8;\n\ +_viv_uniform int width;\n\ +\n\ +// workgroup size is 32\n\ +__kernel void preprocess_start_idx(image2d_t input, image2d_t output)\n\ +{\n\ + int lidx = get_local_id(0);\n\ + __local int lcl_sum[32];\n\ + __local int last_round[1];\n\ + Image img = create_image_from_image2d(input, 4);\n\ + Image dst = create_image_from_image2d(output, 4);\n\ + __global int* index_ptr = (__global int*)img.ptr + get_global_id(0);\n\ + __global int* output_org = (__global int*)dst.ptr;\n\ + __global int* output_ptr = output_org + get_global_id(0) + 1;\n\ +\n\ + if (lidx == 0)\n\ + {\n\ + last_round[0] = 0;\n\ + output_org[0] = 0;\n\ + }\n\ + int4 accSum0, accSum1, accSum2, accSum3;\n\ +\n\ + for(int i = 0; i < width; i += 512)\n\ + {\n\ + int4 data0 = vload4(0, index_ptr + i);\n\ + int4 data1 = vload4(1, index_ptr + 
i);\n\ + int4 data2 = vload4(2, index_ptr + i);\n\ + int4 data3 = vload4(3, index_ptr + i);\n\ + barrier(CLK_LOCAL_MEM_FENCE);\n\ + int prevSum = last_round[0];\n\ +\n\ + VXC_DP4x4(accSum0, data0, data0, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniIntegralHorAcc_4x4);\n\ + VXC_DP4x4(accSum1, data1, data1, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniIntegralHorAcc_4x4);\n\ + VXC_DP4x4(accSum2, data2, data2, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniIntegralHorAcc_4x4);\n\ + VXC_DP4x4(accSum3, data3, data3, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniIntegralHorAcc_4x4);\n\ + accSum1 += accSum0.w;\n\ + accSum2 += accSum1.w;\n\ + accSum3 += accSum2.w;\n\ +\n\ + lcl_sum[lidx] = accSum3.w;\n\ + barrier(CLK_LOCAL_MEM_FENCE);\n\ +\n\ + for(int j = 0; j < lidx; j++)\n\ + {\n\ + prevSum += lcl_sum[j];\n\ + }\n\ + accSum0 += prevSum;\n\ + accSum1 += prevSum;\n\ + accSum2 += prevSum;\n\ + accSum3 += prevSum;\n\ + if(lidx == 31)\n\ + {\n\ + last_round[0] = accSum3.w;\n\ + }\n\ + vstore4(accSum0, 0, output_ptr + i);\n\ + vstore4(accSum1, 1, output_ptr + i);\n\ + vstore4(accSum2, 2, output_ptr + i);\n\ + vstore4(accSum3, 3, output_ptr + i);\n\ + }\n\ +}\n\ +\n\ +__kernel void repeat_I16_axis0(\n\ + image2d_array_t input0, image2d_t input1, image2d_t input2,\n\ + image2d_array_t output, int axis)\n\ +{\n\ + int4 coord = (int4)(get_global_id(0), get_global_id(1), get_global_id(2), 0);\n\ + vxc_short8 src0;\n\ + VXC_ReadImage2DArray(src0, input0, coord, VXC_5BITOFFSET_XY(0, 0),\n\ + VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0));\n\ + Image img1 = create_image_from_image2d(input1, 4);\n\ + Image img2 = create_image_from_image2d(input2, 4);\n\ + __global int* len_ptr = (__global int*)img1.ptr;\n\ + __global int* index_ptr = (__global int*)img2.ptr;\n\ +\n\ + int len = len_ptr[get_global_id(1)];\n\ + int start = index_ptr[get_global_id(1)];\n\ +\n\ + int8 output_desc;\n\ + _viv_asm(COPY, output_desc, output, sizeof(output_desc));\n\ + int baseAddr = (int)get_global_id(2) * output_desc.s4 + output_desc.s0;\n\ + _viv_asm(MOV, coord.z, baseAddr);\n\ + int end = len + start;\n\ +\n\ + for(coord.y = start; coord.y < end; coord.y++)\n\ + {\n\ + VXC_OP4_NoDest(img_store_3d, output, coord, src0, \\\n\ + VXC_MODIFIER(0, 7, 0,VXC_RM_TowardZero, 0));\n\ + }\n\ +}\n\ +\n\ +__kernel void repeat_I16_axis2(\n\ + image2d_array_t input0, image2d_t input1, image2d_t input2,\n\ + image2d_array_t output, int axis)\n\ +{\n\ + int4 coord = (int4)(get_global_id(0), get_global_id(1), get_global_id(2), 0);\n\ + vxc_short8 src0;\n\ + VXC_ReadImage2DArray(src0, input0, coord, VXC_5BITOFFSET_XY(0, 0),\n\ + VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0));\n\ + Image img1 = create_image_from_image2d(input1, 4);\n\ + Image img2 = create_image_from_image2d(input2, 4);\n\ + __global int* len_ptr = (__global int*)img1.ptr;\n\ + __global int* index_ptr = (__global int*)img2.ptr;\n\ +\n\ + int len = len_ptr[get_global_id(2)];\n\ + int start = index_ptr[get_global_id(2)];\n\ + int end = len + start;\n\ +\n\ + for(coord.z = start; coord.z < end; coord.z++)\n\ + {\n\ + VXC_WriteImage2DArray(output, coord, src0, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0));\n\ + }\n\ +}\n\ +\n\ +#define REPEAT_1D(src0_type_name, data_type) \\\n\ +__kernel void repeat_##src0_type_name##_1D( \\\n\ + image2d_t input0, image2d_t input1, image2d_t input2, \\\n\ + image2d_t output, int axis) \\\n\ +{ \\\n\ + int2 coord = (int2)(get_global_id(0), 0); \\\n\ + data_type src0; \\\n\ + VXC_ReadImage(src0, input0, coord, VXC_5BITOFFSET_XY(0, 0), \\\n\ + 
VXC_MODIFIER(0, 0, 0, VXC_RM_TowardZero, 0)); \\\n\ + Image img1 = create_image_from_image2d(input1, 4); \\\n\ + Image img2 = create_image_from_image2d(input2, 4); \\\n\ + __global int* len_ptr = (__global int*)img1.ptr; \\\n\ + __global int* index_ptr = (__global int*)img2.ptr; \\\n\ + int len = len_ptr[get_global_id(0)]; \\\n\ + int start = index_ptr[get_global_id(0)]; \\\n\ + \\\n\ + int iter = len >> 3; \\\n\ + int res = len & 7; \\\n\ + int end = start + iter * 8; \\\n\ + VXC_DP2x8(src0, src0, src0, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0), uniExtract1to8Short_2x8); \\\n\ + for(coord.x = start; coord.x < end; coord.x+=8) \\\n\ + { \\\n\ + VXC_WriteImage(output, coord, src0, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); \\\n\ + } \\\n\ + \\\n\ + if(res == 7) \\\n\ + { \\\n\ + VXC_WriteImage(output, coord, src0, VXC_MODIFIER(0, 6, 0, VXC_RM_TowardZero, 0)); \\\n\ + } \\\n\ + else if(res == 6) \\\n\ + { \\\n\ + VXC_WriteImage(output, coord, src0, VXC_MODIFIER(0, 5, 0, VXC_RM_TowardZero, 0)); \\\n\ + } \\\n\ + else if(res == 5) \\\n\ + { \\\n\ + VXC_WriteImage(output, coord, src0, VXC_MODIFIER(0, 4, 0, VXC_RM_TowardZero, 0)); \\\n\ + } \\\n\ + else if(res == 4) \\\n\ + { \\\n\ + VXC_WriteImage(output, coord, src0, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0)); \\\n\ + } \\\n\ + else if(res == 3) \\\n\ + { \\\n\ + VXC_WriteImage(output, coord, src0, VXC_MODIFIER(0, 2, 0, VXC_RM_TowardZero, 0)); \\\n\ + } \\\n\ + else if(res == 2) \\\n\ + { \\\n\ + VXC_WriteImage(output, coord, src0, VXC_MODIFIER(0, 1, 0, VXC_RM_TowardZero, 0)); \\\n\ + } \\\n\ + else if(res == 1) \\\n\ + { \\\n\ + VXC_WriteImage(output, coord, src0, VXC_MODIFIER(0, 0, 0, VXC_RM_TowardZero, 0)); \\\n\ + } \\\n\ +}\n\ +REPEAT_1D(U8, vxc_uchar16)\n\ +REPEAT_1D(I16, vxc_short8)\n\ +\n\ +__kernel void repeat_U8_axis0(\n\ + image2d_array_t input0, image2d_t input1, image2d_t input2,\n\ + image2d_array_t output, int axis)\n\ +{\n\ + int4 coord = (int4)(get_global_id(0), get_global_id(1), get_global_id(2), 0);\n\ + vxc_uchar16 src0;\n\ + VXC_ReadImage2DArray(src0, input0, coord, VXC_5BITOFFSET_XY(0, 0),\n\ + VXC_MODIFIER(0, 15, 0, VXC_RM_TowardZero, 0));\n\ + Image img1 = create_image_from_image2d(input1, 4);\n\ + Image img2 = create_image_from_image2d(input2, 4);\n\ + __global int* len_ptr = (__global int*)img1.ptr;\n\ + __global int* index_ptr = (__global int*)img2.ptr;\n\ +\n\ + int len = len_ptr[get_global_id(1)];\n\ + int start = index_ptr[get_global_id(1)];\n\ +\n\ + int8 output_desc;\n\ + _viv_asm(COPY, output_desc, output, sizeof(output_desc));\n\ + int baseAddr = (int)get_global_id(2) * output_desc.s4 + output_desc.s0;\n\ + _viv_asm(MOV, coord.z, baseAddr);\n\ + int end = len + start;\n\ +\n\ + for(coord.y = start; coord.y < end; coord.y++)\n\ + {\n\ + VXC_OP4_NoDest(img_store_3d, output, coord, src0, \\\n\ + VXC_MODIFIER(0, 15, 0,VXC_RM_TowardZero, 0));\n\ + }\n\ +}\n\ +\n\ +__kernel void repeat_U8_axis2(\n\ + image2d_array_t input0, image2d_t input1, image2d_t input2,\n\ + image2d_array_t output, int axis)\n\ +{\n\ + int4 coord = (int4)(get_global_id(0), get_global_id(1), get_global_id(2), 0);\n\ + vxc_uchar16 src0;\n\ + VXC_ReadImage2DArray(src0, input0, coord, VXC_5BITOFFSET_XY(0, 0),\n\ + VXC_MODIFIER(0, 15, 0, VXC_RM_TowardZero, 0));\n\ + Image img1 = create_image_from_image2d(input1, 4);\n\ + Image img2 = create_image_from_image2d(input2, 4);\n\ + __global int* len_ptr = (__global int*)img1.ptr;\n\ + __global int* index_ptr = (__global int*)img2.ptr;\n\ +\n\ + int len = len_ptr[get_global_id(2)];\n\ + int start = 
index_ptr[get_global_id(2)];\n\ + int end = len + start;\n\ +\n\ + for(coord.z = start; coord.z < end; coord.z++)\n\ + {\n\ + VXC_WriteImage2DArray(output, coord, src0, VXC_MODIFIER(0, 15, 0, VXC_RM_TowardZero, 0));\n\ + }\n\ +}"; /* end of repeat_vx*/ + +static const char repeat_axis1_vx[] = "#include \"cl_viv_vx_ext.h\"\n\ +\n\ +_viv_uniform VXC_512Bits uniExtract1to8Short_2x8;\n\ +\n\ +#define REPEAT_RES(end_pos) \\\n\ +coord.y = gidy; \\\n\ +VXC_OP4_NoDest(img_store_3d, output, coord, src0, VXC_MODIFIER(0, end_pos, 0,VXC_RM_TowardZero, 0)); \\\n\ +coord.y++; \\\n\ +VXC_OP4_NoDest(img_store_3d, output, coord, src1, VXC_MODIFIER(0, end_pos, 0,VXC_RM_TowardZero, 0)); \\\n\ +coord.y++; \\\n\ +VXC_OP4_NoDest(img_store_3d, output, coord, src2, VXC_MODIFIER(0, end_pos, 0,VXC_RM_TowardZero, 0)); \\\n\ +coord.y++; \\\n\ +VXC_OP4_NoDest(img_store_3d, output, coord, src3, VXC_MODIFIER(0, end_pos, 0,VXC_RM_TowardZero, 0)); \\\n\ +coord.y++; \\\n\ +VXC_OP4_NoDest(img_store_3d, output, coord, src4, VXC_MODIFIER(0, end_pos, 0,VXC_RM_TowardZero, 0)); \\\n\ +coord.y++; \\\n\ +VXC_OP4_NoDest(img_store_3d, output, coord, src5, VXC_MODIFIER(0, end_pos, 0,VXC_RM_TowardZero, 0)); \\\n\ +coord.y++; \\\n\ +VXC_OP4_NoDest(img_store_3d, output, coord, src6, VXC_MODIFIER(0, end_pos, 0,VXC_RM_TowardZero, 0)); \\\n\ +coord.y++; \\\n\ +VXC_OP4_NoDest(img_store_3d, output, coord, src7, VXC_MODIFIER(0, end_pos, 0,VXC_RM_TowardZero, 0));\n\ +\n\ +__kernel void repeat_I16_axis1(\n\ + image2d_array_t input0, image2d_t input1, image2d_t input2,\n\ + image2d_array_t output, int axis)\n\ +{\n\ + int gidy = get_global_id(1);\n\ + int4 coord = (int4)(get_global_id(0), gidy, get_global_id(2), 0);\n\ + vxc_short8 src0, src1, src2, src3, src4, src5, src6, src7;\n\ +\n\ + int8 input_desc, output_desc;\n\ + _viv_asm(COPY, input_desc, input0, sizeof(input_desc));\n\ + int baseAddr_a = (int)get_global_id(2) * input_desc.s4 + input_desc.s0;\n\ + _viv_asm(MOV, coord.z, baseAddr_a);\n\ + _viv_asm(COPY, output_desc, output, sizeof(output_desc));\n\ + int baseAddr = (int)get_global_id(2) * output_desc.s4 + output_desc.s0;\n\ +\n\ + VXC_OP4(img_load_3d, src0, input0, coord, VXC_5BITOFFSET_XY(0, 0),\n\ + VXC_MODIFIER(0, 0, 0, VXC_RM_TowardZero, 0));\n\ + VXC_OP4(img_load_3d, src1, input0, coord, VXC_5BITOFFSET_XY(0, 1),\n\ + VXC_MODIFIER(0, 0, 0, VXC_RM_TowardZero, 0));\n\ + VXC_OP4(img_load_3d, src2, input0, coord, VXC_5BITOFFSET_XY(0, 2),\n\ + VXC_MODIFIER(0, 0, 0, VXC_RM_TowardZero, 0));\n\ + VXC_OP4(img_load_3d, src3, input0, coord, VXC_5BITOFFSET_XY(0, 3),\n\ + VXC_MODIFIER(0, 0, 0, VXC_RM_TowardZero, 0));\n\ + VXC_OP4(img_load_3d, src4, input0, coord, VXC_5BITOFFSET_XY(0, 4),\n\ + VXC_MODIFIER(0, 0, 0, VXC_RM_TowardZero, 0));\n\ + VXC_OP4(img_load_3d, src5, input0, coord, VXC_5BITOFFSET_XY(0, 5),\n\ + VXC_MODIFIER(0, 0, 0, VXC_RM_TowardZero, 0));\n\ + VXC_OP4(img_load_3d, src6, input0, coord, VXC_5BITOFFSET_XY(0, 6),\n\ + VXC_MODIFIER(0, 0, 0, VXC_RM_TowardZero, 0));\n\ + VXC_OP4(img_load_3d, src7, input0, coord, VXC_5BITOFFSET_XY(0, 7),\n\ + VXC_MODIFIER(0, 0, 0, VXC_RM_TowardZero, 0));\n\ +\n\ + Image img1 = create_image_from_image2d(input1, 4);\n\ + Image img2 = create_image_from_image2d(input2, 4);\n\ + __global int* len_ptr = (__global int*)img1.ptr;\n\ + __global int* index_ptr = (__global int*)img2.ptr;\n\ +\n\ + int len = len_ptr[get_global_id(0)];\n\ + int start = index_ptr[get_global_id(0)];\n\ +\n\ + _viv_asm(MOV, coord.z, baseAddr);\n\ + int iter = len >> 3;\n\ + int res = len & 7;\n\ + coord.x = start;\n\ +\n\ + 
VXC_DP2x8(src0, src0, src0, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0), uniExtract1to8Short_2x8);\n\ + VXC_DP2x8(src1, src1, src1, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0), uniExtract1to8Short_2x8);\n\ + VXC_DP2x8(src2, src2, src2, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0), uniExtract1to8Short_2x8);\n\ + VXC_DP2x8(src3, src3, src3, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0), uniExtract1to8Short_2x8);\n\ + VXC_DP2x8(src4, src4, src4, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0), uniExtract1to8Short_2x8);\n\ + VXC_DP2x8(src5, src5, src5, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0), uniExtract1to8Short_2x8);\n\ + VXC_DP2x8(src6, src6, src6, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0), uniExtract1to8Short_2x8);\n\ + VXC_DP2x8(src7, src7, src7, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0), uniExtract1to8Short_2x8);\n\ +\n\ + for(int i = 0; i < iter; i++)\n\ + {\n\ + coord.y = gidy;\n\ + VXC_OP4_NoDest(img_store_3d, output, coord, src0, VXC_MODIFIER(0, 7, 0,VXC_RM_TowardZero, 0));\n\ + coord.y++;\n\ + VXC_OP4_NoDest(img_store_3d, output, coord, src1, VXC_MODIFIER(0, 7, 0,VXC_RM_TowardZero, 0));\n\ + coord.y++;\n\ + VXC_OP4_NoDest(img_store_3d, output, coord, src2, VXC_MODIFIER(0, 7, 0,VXC_RM_TowardZero, 0));\n\ + coord.y++;\n\ + VXC_OP4_NoDest(img_store_3d, output, coord, src3, VXC_MODIFIER(0, 7, 0,VXC_RM_TowardZero, 0));\n\ + coord.y++;\n\ + VXC_OP4_NoDest(img_store_3d, output, coord, src4, VXC_MODIFIER(0, 7, 0,VXC_RM_TowardZero, 0));\n\ + coord.y++;\n\ + VXC_OP4_NoDest(img_store_3d, output, coord, src5, VXC_MODIFIER(0, 7, 0,VXC_RM_TowardZero, 0));\n\ + coord.y++;\n\ + VXC_OP4_NoDest(img_store_3d, output, coord, src6, VXC_MODIFIER(0, 7, 0,VXC_RM_TowardZero, 0));\n\ + coord.y++;\n\ + VXC_OP4_NoDest(img_store_3d, output, coord, src7, VXC_MODIFIER(0, 7, 0,VXC_RM_TowardZero, 0));\n\ + coord.x += 8;\n\ + }\n\ +\n\ + if(res == 7)\n\ + {\n\ + REPEAT_RES(6)\n\ + }\n\ + else if(res == 6)\n\ + {\n\ + REPEAT_RES(5)\n\ + }\n\ + else if(res == 5)\n\ + {\n\ + REPEAT_RES(4)\n\ + }\n\ + else if(res == 4)\n\ + {\n\ + REPEAT_RES(3)\n\ + }\n\ + else if(res == 3)\n\ + {\n\ + REPEAT_RES(2)\n\ + }\n\ + else if(res == 2)\n\ + {\n\ + REPEAT_RES(1)\n\ + }\n\ + else if(res == 1)\n\ + {\n\ + REPEAT_RES(0)\n\ + }\n\ +}\n\ +\n\ +__kernel void repeat_U8_axis1(\n\ + image2d_array_t input0, image2d_t input1, image2d_t input2,\n\ + image2d_array_t output, int axis)\n\ +{\n\ + int gidy = get_global_id(1);\n\ + int4 coord = (int4)(get_global_id(0), gidy, get_global_id(2), 0);\n\ + vxc_uchar16 src0, src1, src2, src3, src4, src5, src6, src7;\n\ +\n\ + int8 input_desc, output_desc;\n\ + _viv_asm(COPY, input_desc, input0, sizeof(input_desc));\n\ + int baseAddr_a = (int)get_global_id(2) * input_desc.s4 + input_desc.s0;\n\ + _viv_asm(MOV, coord.z, baseAddr_a);\n\ + _viv_asm(COPY, output_desc, output, sizeof(output_desc));\n\ + int baseAddr = (int)get_global_id(2) * output_desc.s4 + output_desc.s0;\n\ +\n\ + VXC_OP4(img_load_3d, src0, input0, coord, VXC_5BITOFFSET_XY(0, 0),\n\ + VXC_MODIFIER(0, 0, 0, VXC_RM_TowardZero, 0));\n\ + VXC_OP4(img_load_3d, src1, input0, coord, VXC_5BITOFFSET_XY(0, 1),\n\ + VXC_MODIFIER(0, 0, 0, VXC_RM_TowardZero, 0));\n\ + VXC_OP4(img_load_3d, src2, input0, coord, VXC_5BITOFFSET_XY(0, 2),\n\ + VXC_MODIFIER(0, 0, 0, VXC_RM_TowardZero, 0));\n\ + VXC_OP4(img_load_3d, src3, input0, coord, VXC_5BITOFFSET_XY(0, 3),\n\ + VXC_MODIFIER(0, 0, 0, VXC_RM_TowardZero, 0));\n\ + VXC_OP4(img_load_3d, src4, input0, coord, VXC_5BITOFFSET_XY(0, 4),\n\ + VXC_MODIFIER(0, 0, 0, VXC_RM_TowardZero, 0));\n\ + 
VXC_OP4(img_load_3d, src5, input0, coord, VXC_5BITOFFSET_XY(0, 5),\n\ + VXC_MODIFIER(0, 0, 0, VXC_RM_TowardZero, 0));\n\ + VXC_OP4(img_load_3d, src6, input0, coord, VXC_5BITOFFSET_XY(0, 6),\n\ + VXC_MODIFIER(0, 0, 0, VXC_RM_TowardZero, 0));\n\ + VXC_OP4(img_load_3d, src7, input0, coord, VXC_5BITOFFSET_XY(0, 7),\n\ + VXC_MODIFIER(0, 0, 0, VXC_RM_TowardZero, 0));\n\ +\n\ + Image img1 = create_image_from_image2d(input1, 4);\n\ + Image img2 = create_image_from_image2d(input2, 4);\n\ + __global int* len_ptr = (__global int*)img1.ptr;\n\ + __global int* index_ptr = (__global int*)img2.ptr;\n\ +\n\ + int len = len_ptr[get_global_id(0)];\n\ + int start = index_ptr[get_global_id(0)];\n\ +\n\ + _viv_asm(MOV, coord.z, baseAddr);\n\ + int iter = len >> 3;\n\ + int res = len & 7;\n\ + coord.x = start;\n\ +\n\ + VXC_DP2x8(src0, src0, src0, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0), uniExtract1to8Short_2x8);\n\ + VXC_DP2x8(src1, src1, src1, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0), uniExtract1to8Short_2x8);\n\ + VXC_DP2x8(src2, src2, src2, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0), uniExtract1to8Short_2x8);\n\ + VXC_DP2x8(src3, src3, src3, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0), uniExtract1to8Short_2x8);\n\ + VXC_DP2x8(src4, src4, src4, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0), uniExtract1to8Short_2x8);\n\ + VXC_DP2x8(src5, src5, src5, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0), uniExtract1to8Short_2x8);\n\ + VXC_DP2x8(src6, src6, src6, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0), uniExtract1to8Short_2x8);\n\ + VXC_DP2x8(src7, src7, src7, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0), uniExtract1to8Short_2x8);\n\ +\n\ + for(int i = 0; i < iter; i++)\n\ + {\n\ + coord.y = gidy;\n\ + VXC_OP4_NoDest(img_store_3d, output, coord, src0, VXC_MODIFIER(0, 7, 0,VXC_RM_TowardZero, 0));\n\ + coord.y++;\n\ + VXC_OP4_NoDest(img_store_3d, output, coord, src1, VXC_MODIFIER(0, 7, 0,VXC_RM_TowardZero, 0));\n\ + coord.y++;\n\ + VXC_OP4_NoDest(img_store_3d, output, coord, src2, VXC_MODIFIER(0, 7, 0,VXC_RM_TowardZero, 0));\n\ + coord.y++;\n\ + VXC_OP4_NoDest(img_store_3d, output, coord, src3, VXC_MODIFIER(0, 7, 0,VXC_RM_TowardZero, 0));\n\ + coord.y++;\n\ + VXC_OP4_NoDest(img_store_3d, output, coord, src4, VXC_MODIFIER(0, 7, 0,VXC_RM_TowardZero, 0));\n\ + coord.y++;\n\ + VXC_OP4_NoDest(img_store_3d, output, coord, src5, VXC_MODIFIER(0, 7, 0,VXC_RM_TowardZero, 0));\n\ + coord.y++;\n\ + VXC_OP4_NoDest(img_store_3d, output, coord, src6, VXC_MODIFIER(0, 7, 0,VXC_RM_TowardZero, 0));\n\ + coord.y++;\n\ + VXC_OP4_NoDest(img_store_3d, output, coord, src7, VXC_MODIFIER(0, 7, 0,VXC_RM_TowardZero, 0));\n\ + coord.x += 8;\n\ + }\n\ +\n\ + if(res == 7)\n\ + {\n\ + REPEAT_RES(6)\n\ + }\n\ + else if(res == 6)\n\ + {\n\ + REPEAT_RES(5)\n\ + }\n\ + else if(res == 5)\n\ + {\n\ + REPEAT_RES(4)\n\ + }\n\ + else if(res == 4)\n\ + {\n\ + REPEAT_RES(3)\n\ + }\n\ + else if(res == 3)\n\ + {\n\ + REPEAT_RES(2)\n\ + }\n\ + else if(res == 2)\n\ + {\n\ + REPEAT_RES(1)\n\ + }\n\ + else if(res == 1)\n\ + {\n\ + REPEAT_RES(0)\n\ + }\n\ +}\n\ +\n\ +"; /* end of repeat_axis1_vx*/ + static const char resize_1d_bilinear_BF16_vx[] = "#include \"cl_viv_vx_ext.h\"\n\ \n\ _viv_uniform float scale_x;\n\ @@ -31997,13 +36723,13 @@ __kernel void resize_bilinear_U8toU8_DOWN\n\ }\n\ "; /* end of resize_bilinear_U8_vx*/ -static const char resize_bilinear_U8_UP_2X_vx[] = "#include \"cl_viv_vx_ext.h\"\n\ +static const char resize_bilinear_U8_half_pixel_centers_vx[] = "#include \"cl_viv_vx_ext.h\"\n\ \n\ -_viv_uniform VXC_512Bits uniResize2xUp_4x8;\n\ 
-_viv_uniform VXC_512Bits uniResize2xUpRound_2x8;\n\ +_viv_uniform VXC_512Bits uniResize2xUp_0_4x8;\n\ +_viv_uniform VXC_512Bits uniResize2xUp_1_4x8;\n\ _viv_uniform int out_height;\n\ \n\ -__kernel void resize_bilinear_U8toU8_UP_2X_half\n\ +__kernel void resize_bilinear_U8toU8_SAME_2x_upsample_half_pixel_centers\n\ (\n\ __read_only image2d_array_t input,\n\ __write_only image2d_array_t output,\n\ @@ -32017,7 +36743,6 @@ __kernel void resize_bilinear_U8toU8_UP_2X_half\n\ coord_in.x = coord_out.x == 0 ? -1 : coord_in.x;\n\ \n\ vxc_uchar16 in0, in1, tmp, result;\n\ - vxc_ushort8 result_s, round_s = 8;\n\ \n\ int8 input_desc;\n\ _viv_asm(COPY, input_desc, input, sizeof(input_desc));\n\ @@ -32035,34 +36760,199 @@ __kernel void resize_bilinear_U8toU8_UP_2X_half\n\ \n\ while (coord_out.y < out_height)\n\ {\n\ - VXC_DP4x8(result_s, in0, in1, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0), uniResize2xUp_4x8);\n\ - VXC_DP2x8(result, result_s, round_s, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0), uniResize2xUpRound_2x8);\n\ + VXC_DP4x8(result, in0, in1, VXC_MODIFIER(0, 7, 0, VXC_RM_ToNearestEven, 1), uniResize2xUp_0_4x8);\n\ + VXC_DP4x8(result, in0, in1, VXC_MODIFIER(8, 15, 0, VXC_RM_ToNearestEven, 1), uniResize2xUp_1_4x8);\n\ VXC_OP4_NoDest(img_store_3d, output, coord_out.xyww, result,\n\ - VXC_MODIFIER(0, 7, 0,VXC_RM_TowardZero, 0));\n\ + VXC_MODIFIER(0, 15, 0,VXC_RM_TowardZero, 0));\n\ coord_out.y++;\n\ VXC_OP4(img_load_3d, in0, input, coord_in.xyww, VXC_5BITOFFSET_XY(0, 2),\n\ VXC_MODIFIER(0, 15, 0, VXC_RM_TowardZero, 0));\n\ - VXC_DP4x8(result_s, in0, in1, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0), uniResize2xUp_4x8);\n\ - VXC_DP2x8(result, result_s, round_s, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0), uniResize2xUpRound_2x8);\n\ + VXC_DP4x8(result, in0, in1, VXC_MODIFIER(0, 7, 0, VXC_RM_ToNearestEven, 1), uniResize2xUp_0_4x8);\n\ + VXC_DP4x8(result, in0, in1, VXC_MODIFIER(8, 15, 0, VXC_RM_ToNearestEven, 1), uniResize2xUp_1_4x8);\n\ VXC_OP4_NoDest(img_store_3d, output, coord_out.xyww, result,\n\ - VXC_MODIFIER(0, 7, 0,VXC_RM_TowardZero, 0));\n\ + VXC_MODIFIER(0, 15, 0,VXC_RM_TowardZero, 0));\n\ coord_out.y++;\n\ - VXC_DP4x8(result_s, in1, in0, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0), uniResize2xUp_4x8);\n\ - VXC_DP2x8(result, result_s, round_s, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0), uniResize2xUpRound_2x8);\n\ + VXC_DP4x8(result, in1, in0, VXC_MODIFIER(0, 7, 0, VXC_RM_ToNearestEven, 1), uniResize2xUp_0_4x8);\n\ + VXC_DP4x8(result, in1, in0, VXC_MODIFIER(8, 15, 0, VXC_RM_ToNearestEven, 1), uniResize2xUp_1_4x8);\n\ VXC_OP4_NoDest(img_store_3d, output, coord_out.xyww, result,\n\ - VXC_MODIFIER(0, 7, 0,VXC_RM_TowardZero, 0));\n\ + VXC_MODIFIER(0, 15, 0,VXC_RM_TowardZero, 0));\n\ VXC_OP4(img_load_3d, in1, input, coord_in.xyww, VXC_5BITOFFSET_XY(0, 3),\n\ VXC_MODIFIER(0, 15, 0, VXC_RM_TowardZero, 0));\n\ coord_out.y++;\n\ - VXC_DP4x8(result_s, in1, in0, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0), uniResize2xUp_4x8);\n\ - VXC_DP2x8(result, result_s, round_s, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0), uniResize2xUpRound_2x8);\n\ + VXC_DP4x8(result, in1, in0, VXC_MODIFIER(0, 7, 0, VXC_RM_ToNearestEven, 1), uniResize2xUp_0_4x8);\n\ + VXC_DP4x8(result, in1, in0, VXC_MODIFIER(8, 15, 0, VXC_RM_ToNearestEven, 1), uniResize2xUp_1_4x8);\n\ VXC_OP4_NoDest(img_store_3d, output, coord_out.xyww, result,\n\ - VXC_MODIFIER(0, 7, 0,VXC_RM_TowardZero, 0));\n\ + VXC_MODIFIER(0, 15, 0,VXC_RM_TowardZero, 0));\n\ coord_in.y += 2;\n\ coord_out.y++;\n\ }\n\ }\n\ -"; /* end of resize_bilinear_U8_UP_2X_vx*/ +\n\ 
+_viv_uniform VXC_512Bits uniResize4xUp_l00_4x8;\n\ +_viv_uniform VXC_512Bits uniResize4xUp_l01_4x8;\n\ +_viv_uniform VXC_512Bits uniResize4xUp_l10_4x8;\n\ +_viv_uniform VXC_512Bits uniResize4xUp_l11_4x8;\n\ +__kernel void resize_bilinear_U8toU8_SAME_4x_upsample_half_pixel_centers\n\ + (\n\ + __read_only image2d_array_t input,\n\ + __write_only image2d_array_t output,\n\ + int align_corners,\n\ + int half_pixel_centers\n\ + )\n\ +{\n\ + int4 coord_out = (int4)(get_global_id(0), 0, get_global_id(1), 0);\n\ + int4 coord_in = (int4)(get_global_id(0), -1, get_global_id(1), 0);\n\ + coord_in.x = (coord_out.x * 2 - 3) >> 3;\n\ + coord_in.x = coord_out.x == 0 ? -1 : coord_in.x;\n\ +\n\ + vxc_uchar16 in0, in1, tmp, dst0, dst1;\n\ +\n\ + int8 input_desc;\n\ + _viv_asm(COPY, input_desc, input, sizeof(input_desc));\n\ + int baseAddr = (int)coord_in.z * input_desc.s4 + input_desc.s0;\n\ + _viv_asm(MOV, coord_in.w, baseAddr);\n\ + VXC_OP4(img_load_3d, in0, input, coord_in.xyww, VXC_5BITOFFSET_XY(0, 0),\n\ + VXC_MODIFIER(0, 15, 0, VXC_RM_TowardZero, 0));\n\ + VXC_OP4(img_load_3d, in1, input, coord_in.xyww, VXC_5BITOFFSET_XY(0, 1),\n\ + VXC_MODIFIER(0, 15, 0, VXC_RM_TowardZero, 0));\n\ +\n\ + int8 output_desc;\n\ + _viv_asm(COPY, output_desc, output, sizeof(output_desc));\n\ + baseAddr = (int)coord_out.z * output_desc.s4 + output_desc.s0;\n\ + _viv_asm(MOV, coord_out.w, baseAddr);\n\ +\n\ + while (coord_out.y < out_height)\n\ + {\n\ + VXC_DP4x8(dst0, in0, in1, VXC_MODIFIER(0, 7, 0, VXC_RM_ToNearestEven, 1), uniResize4xUp_l00_4x8);\n\ + VXC_DP4x8(dst0, in0, in1, VXC_MODIFIER(8, 15, 0, VXC_RM_ToNearestEven, 1), uniResize4xUp_l01_4x8);\n\ + VXC_DP4x8(dst1, in0, in1, VXC_MODIFIER(0, 7, 0, VXC_RM_ToNearestEven, 1), uniResize4xUp_l10_4x8);\n\ + VXC_DP4x8(dst1, in0, in1, VXC_MODIFIER(8, 15, 0, VXC_RM_ToNearestEven, 1), uniResize4xUp_l11_4x8);\n\ + VXC_OP4(img_load_3d, in0, input, coord_in.xyww, VXC_5BITOFFSET_XY(0, 2),\n\ + VXC_MODIFIER(0, 15, 0, VXC_RM_TowardZero, 0));\n\ + VXC_OP4_NoDest(img_store_3d, output, coord_out.xyww, dst0,\n\ + VXC_MODIFIER(0, 15, 0,VXC_RM_TowardZero, 0));\n\ + coord_out.y++;\n\ + VXC_OP4_NoDest(img_store_3d, output, coord_out.xyww, dst1,\n\ + VXC_MODIFIER(0, 15, 0,VXC_RM_TowardZero, 0));\n\ + coord_out.y++;\n\ + VXC_DP4x8(dst0, in0, in1, VXC_MODIFIER(0, 7, 0, VXC_RM_ToNearestEven, 1), uniResize4xUp_l10_4x8);\n\ + VXC_DP4x8(dst0, in0, in1, VXC_MODIFIER(8, 15, 0, VXC_RM_ToNearestEven, 1), uniResize4xUp_l11_4x8);\n\ + VXC_DP4x8(dst1, in0, in1, VXC_MODIFIER(0, 7, 0, VXC_RM_ToNearestEven, 1), uniResize4xUp_l00_4x8);\n\ + VXC_DP4x8(dst1, in0, in1, VXC_MODIFIER(8, 15, 0, VXC_RM_ToNearestEven, 1), uniResize4xUp_l01_4x8);\n\ + VXC_OP4_NoDest(img_store_3d, output, coord_out.xyww, dst0,\n\ + VXC_MODIFIER(0, 15, 0,VXC_RM_TowardZero, 0));\n\ + coord_out.y++;\n\ + VXC_OP4_NoDest(img_store_3d, output, coord_out.xyww, dst1,\n\ + VXC_MODIFIER(0, 15, 0,VXC_RM_TowardZero, 0));\n\ + coord_out.y++;\n\ + VXC_DP4x8(dst0, in1, in0, VXC_MODIFIER(0, 7, 0, VXC_RM_ToNearestEven, 1), uniResize4xUp_l00_4x8);\n\ + VXC_DP4x8(dst0, in1, in0, VXC_MODIFIER(8, 15, 0, VXC_RM_ToNearestEven, 1), uniResize4xUp_l01_4x8);\n\ + VXC_DP4x8(dst1, in1, in0, VXC_MODIFIER(0, 7, 0, VXC_RM_ToNearestEven, 1), uniResize4xUp_l10_4x8);\n\ + VXC_DP4x8(dst1, in1, in0, VXC_MODIFIER(8, 15, 0, VXC_RM_ToNearestEven, 1), uniResize4xUp_l11_4x8);\n\ + VXC_OP4(img_load_3d, in1, input, coord_in.xyww, VXC_5BITOFFSET_XY(0, 3),\n\ + VXC_MODIFIER(0, 15, 0, VXC_RM_TowardZero, 0));\n\ + VXC_OP4_NoDest(img_store_3d, output, coord_out.xyww, dst0,\n\ + 
VXC_MODIFIER(0, 15, 0,VXC_RM_TowardZero, 0));\n\ + coord_out.y++;\n\ + VXC_OP4_NoDest(img_store_3d, output, coord_out.xyww, dst1,\n\ + VXC_MODIFIER(0, 15, 0,VXC_RM_TowardZero, 0));\n\ + coord_out.y++;\n\ + VXC_DP4x8(dst0, in1, in0, VXC_MODIFIER(0, 7, 0, VXC_RM_ToNearestEven, 1), uniResize4xUp_l10_4x8);\n\ + VXC_DP4x8(dst0, in1, in0, VXC_MODIFIER(8, 15, 0, VXC_RM_ToNearestEven, 1), uniResize4xUp_l11_4x8);\n\ + VXC_DP4x8(dst1, in1, in0, VXC_MODIFIER(0, 7, 0, VXC_RM_ToNearestEven, 1), uniResize4xUp_l00_4x8);\n\ + VXC_DP4x8(dst1, in1, in0, VXC_MODIFIER(8, 15, 0, VXC_RM_ToNearestEven, 1), uniResize4xUp_l01_4x8);\n\ + VXC_OP4_NoDest(img_store_3d, output, coord_out.xyww, dst0,\n\ + VXC_MODIFIER(0, 15, 0,VXC_RM_TowardZero, 0));\n\ + coord_out.y++;\n\ + VXC_OP4_NoDest(img_store_3d, output, coord_out.xyww, dst1,\n\ + VXC_MODIFIER(0, 15, 0,VXC_RM_TowardZero, 0));\n\ + coord_in.y += 2;\n\ + coord_out.y++;\n\ + }\n\ +}\n\ +\n\ +_viv_uniform VXC_512Bits uniResize3xUp_l00_2x8;\n\ +_viv_uniform VXC_512Bits uniResize3xUp_l01_2x8;\n\ +_viv_uniform VXC_512Bits uniResize3xUp_l10_4x4;\n\ +_viv_uniform VXC_512Bits uniResize3xUp_l11_4x4;\n\ +_viv_uniform VXC_512Bits uniResize3xUp_l12_4x4;\n\ +_viv_uniform VXC_512Bits uniResize3xUp_l13_4x4;\n\ +__kernel void resize_bilinear_U8toU8_SAME_3x_upsample_half_pixel_centers\n\ + (\n\ + __read_only image2d_array_t input,\n\ + __write_only image2d_array_t output,\n\ + int align_corners,\n\ + int half_pixel_centers\n\ + )\n\ +{\n\ + int4 coord_out = (int4)(get_global_id(0), get_global_id(1), get_global_id(2), 0);\n\ + int4 coord_in = (int4)(get_global_id(0), get_global_id(1), get_global_id(2), 0);\n\ + coord_in.x = (short)(coord_out.x * 2 - 1) / (short)6;\n\ + coord_in.x = coord_out.x == 0 ? -1 : coord_in.x;\n\ + coord_in.y = (short)(coord_out.y * 2 - 1) / (short)6;\n\ + coord_in.y = coord_out.y == 0 ? 
-1 : coord_in.y;\n\ +\n\ + vxc_uchar16 in0, in1, in2, in3, tmp, dst0, dst1, dst2;\n\ +\n\ + int8 input_desc;\n\ + _viv_asm(COPY, input_desc, input, sizeof(input_desc));\n\ + int baseAddr = (int)coord_in.z * input_desc.s4 + input_desc.s0;\n\ + _viv_asm(MOV, coord_in.w, baseAddr);\n\ + VXC_OP4(img_load_3d, in0, input, coord_in.xyww, VXC_5BITOFFSET_XY(0, 0),\n\ + VXC_MODIFIER(0, 15, 0, VXC_RM_TowardZero, 0));\n\ + VXC_OP4(img_load_3d, in1, input, coord_in.xyww, VXC_5BITOFFSET_XY(0, 1),\n\ + VXC_MODIFIER(0, 15, 0, VXC_RM_TowardZero, 0));\n\ + VXC_OP4(img_load_3d, in2, input, coord_in.xyww, VXC_5BITOFFSET_XY(0, 2),\n\ + VXC_MODIFIER(0, 15, 0, VXC_RM_TowardZero, 0));\n\ + VXC_OP4(img_load_3d, in3, input, coord_in.xyww, VXC_5BITOFFSET_XY(0, 3),\n\ + VXC_MODIFIER(0, 15, 0, VXC_RM_TowardZero, 0));\n\ +\n\ + int8 output_desc;\n\ + _viv_asm(COPY, output_desc, output, sizeof(output_desc));\n\ + baseAddr = (int)coord_out.z * output_desc.s4 + output_desc.s0;\n\ + _viv_asm(MOV, coord_out.w, baseAddr);\n\ +\n\ + VXC_DP4x4(dst2, in1, in0, VXC_MODIFIER(0, 3, 0, VXC_RM_ToNearestEven, 1), uniResize3xUp_l10_4x4);\n\ + VXC_DP4x4(dst2, in1, in0, VXC_MODIFIER(4, 7, 0, VXC_RM_ToNearestEven, 1), uniResize3xUp_l11_4x4);\n\ + VXC_DP4x4(dst2, in1, in0, VXC_MODIFIER(8, 11, 0, VXC_RM_ToNearestEven, 1), uniResize3xUp_l12_4x4);\n\ + VXC_DP4x4(dst2, in1, in0, VXC_MODIFIER(12, 14, 0, VXC_RM_ToNearestEven, 1), uniResize3xUp_l13_4x4);\n\ + VXC_OP4_NoDest(img_store_3d, output, coord_out.xyww, dst2,\n\ + VXC_MODIFIER(0, 14, 0,VXC_RM_TowardZero, 0));\n\ + coord_out.y++;\n\ +\n\ + VXC_DP2x8(dst0, in1, in1, VXC_MODIFIER(0, 7, 0, VXC_RM_ToNearestEven, 1), uniResize3xUp_l00_2x8);\n\ + VXC_DP2x8(dst0, in1, in1, VXC_MODIFIER(8, 14, 0, VXC_RM_ToNearestEven, 1), uniResize3xUp_l01_2x8);\n\ + VXC_DP4x4(dst1, in1, in2, VXC_MODIFIER(0, 3, 0, VXC_RM_ToNearestEven, 1), uniResize3xUp_l10_4x4);\n\ + VXC_DP4x4(dst1, in1, in2, VXC_MODIFIER(4, 7, 0, VXC_RM_ToNearestEven, 1), uniResize3xUp_l11_4x4);\n\ + VXC_DP4x4(dst1, in1, in2, VXC_MODIFIER(8, 11, 0, VXC_RM_ToNearestEven, 1), uniResize3xUp_l12_4x4);\n\ + VXC_DP4x4(dst1, in1, in2, VXC_MODIFIER(12, 14, 0, VXC_RM_ToNearestEven, 1), uniResize3xUp_l13_4x4);\n\ + VXC_DP4x4(dst2, in2, in1, VXC_MODIFIER(0, 3, 0, VXC_RM_ToNearestEven, 1), uniResize3xUp_l10_4x4);\n\ + VXC_DP4x4(dst2, in2, in1, VXC_MODIFIER(4, 7, 0, VXC_RM_ToNearestEven, 1), uniResize3xUp_l11_4x4);\n\ + VXC_DP4x4(dst2, in2, in1, VXC_MODIFIER(8, 11, 0, VXC_RM_ToNearestEven, 1), uniResize3xUp_l12_4x4);\n\ + VXC_DP4x4(dst2, in2, in1, VXC_MODIFIER(12, 14, 0, VXC_RM_ToNearestEven, 1), uniResize3xUp_l13_4x4);\n\ + VXC_OP4_NoDest(img_store_3d, output, coord_out.xyww, dst0,\n\ + VXC_MODIFIER(0, 14, 0,VXC_RM_TowardZero, 0));\n\ + coord_out.y++;\n\ + VXC_OP4_NoDest(img_store_3d, output, coord_out.xyww, dst1,\n\ + VXC_MODIFIER(0, 14, 0,VXC_RM_TowardZero, 0));\n\ + coord_out.y++;\n\ + VXC_OP4_NoDest(img_store_3d, output, coord_out.xyww, dst2,\n\ + VXC_MODIFIER(0, 14, 0,VXC_RM_TowardZero, 0));\n\ + coord_out.y++;\n\ +\n\ + VXC_DP2x8(dst0, in2, in2, VXC_MODIFIER(0, 7, 0, VXC_RM_ToNearestEven, 1), uniResize3xUp_l00_2x8);\n\ + VXC_DP2x8(dst0, in2, in2, VXC_MODIFIER(8, 14, 0, VXC_RM_ToNearestEven, 1), uniResize3xUp_l01_2x8);\n\ + VXC_DP4x4(dst1, in2, in3, VXC_MODIFIER(0, 3, 0, VXC_RM_ToNearestEven, 1), uniResize3xUp_l10_4x4);\n\ + VXC_DP4x4(dst1, in2, in3, VXC_MODIFIER(4, 7, 0, VXC_RM_ToNearestEven, 1), uniResize3xUp_l11_4x4);\n\ + VXC_DP4x4(dst1, in2, in3, VXC_MODIFIER(8, 11, 0, VXC_RM_ToNearestEven, 1), uniResize3xUp_l12_4x4);\n\ + VXC_DP4x4(dst1, in2, 
in3, VXC_MODIFIER(12, 14, 0, VXC_RM_ToNearestEven, 1), uniResize3xUp_l13_4x4);\n\ + VXC_OP4_NoDest(img_store_3d, output, coord_out.xyww, dst0,\n\ + VXC_MODIFIER(0, 14, 0,VXC_RM_TowardZero, 0));\n\ + coord_out.y++;\n\ + VXC_OP4_NoDest(img_store_3d, output, coord_out.xyww, dst1,\n\ + VXC_MODIFIER(0, 14, 0,VXC_RM_TowardZero, 0));\n\ +}\n\ +"; /* end of resize_bilinear_U8_half_pixel_centers_vx*/ static const char resize_bilinear_U8_opt_vx[] = "#include \"cl_viv_vx_ext.h\"\n\ \n\ @@ -32832,6 +37722,398 @@ __kernel void select_I8_U8_U8toU8_2D(\n\ }\n\ "; /* end of select_vx*/ +static const char sequence_mask_vx[] = "#include \"cl_viv_vx_ext.h\"\n\ +\n\ +_viv_uniform VXC_512Bits UniFP16toFP32Lo4_dp4x4;\n\ +_viv_uniform VXC_512Bits uniConvert1stUint8SubZpToFp32_4x4;\n\ +_viv_uniform VXC_512Bits uniConvertInt32toUint8_2x8;\n\ +_viv_uniform float input_scale;\n\ +_viv_uniform int inputZP;\n\ +_viv_uniform int output_ZP;\n\ +_viv_uniform float outputVal1;\n\ +\n\ +#define SEQUENCE_MASK_QINT_TO_QINT_2D(src0_type_name, src1_type_name, read_type, write_type) \\\n\ +__kernel void sequence_mask_##src0_type_name##to##src1_type_name##_2D( \\\n\ + image2d_t input, image2d_t output, int maxLen) \\\n\ +{ \\\n\ + int gidx = get_global_id(0); \\\n\ + int2 coord = (int2)(gidx, get_global_id(1)); \\\n\ + read_type src0; \\\n\ + VXC_ReadImage(src0, input, coord.yy, 0, VXC_MODIFIER(0, 0, 0, VXC_RM_TowardZero, 0)); \\\n\ + int4 outIdx = (int4)(gidx, gidx + 1, gidx + 2, gidx + 3); \\\n\ + float4 tmpData; \\\n\ + short zp = inputZP; \\\n\ + VXC_DP4x4(tmpData, src0, zp, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), \\\n\ + uniConvert1stUint8SubZpToFp32_4x4); \\\n\ + int index = convert_int_rte(tmpData.s0 * input_scale); \\\n\ + int4 data; \\\n\ + data = outIdx < index? convert_int_rte(outputVal1) : output_ZP; \\\n\ + write_type dst; \\\n\ + VXC_DP2x8(dst, data, data, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 1), uniConvertInt32toUint8_2x8); \\\n\ + VXC_WriteImage(output, coord.xy, dst, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0)); \\\n\ +}\n\ +SEQUENCE_MASK_QINT_TO_QINT_2D(U8, U8, vxc_uchar16, vxc_uchar16)\n\ +SEQUENCE_MASK_QINT_TO_QINT_2D(I8, I8, vxc_char16, vxc_char16)\n\ +SEQUENCE_MASK_QINT_TO_QINT_2D(I16, I16, vxc_short8, vxc_short8)\n\ +SEQUENCE_MASK_QINT_TO_QINT_2D(I8, U8, vxc_char16, vxc_uchar16)\n\ +SEQUENCE_MASK_QINT_TO_QINT_2D(I16, U8, vxc_short8, vxc_uchar16)\n\ +\n\ +#define SEQUENCE_MASK_QINT_TO_QINT(src0_type_name, src1_type_name, read_type, write_type) \\\n\ +__kernel void sequence_mask_##src0_type_name##to##src1_type_name( \\\n\ + image2d_t input, image2d_array_t output, int maxLen) \\\n\ +{ \\\n\ + int gidx = get_global_id(0); \\\n\ + int4 coord = (int4)(gidx, get_global_id(1), get_global_id(2), 0); \\\n\ + read_type src0; \\\n\ + VXC_ReadImage(src0, input, coord.yz, 0, VXC_MODIFIER(0, 0, 0, VXC_RM_TowardZero, 0)); \\\n\ + int4 outIdx = (int4)(gidx, gidx + 1, gidx + 2, gidx + 3); \\\n\ + float4 tmpData; \\\n\ + short zp = inputZP; \\\n\ + VXC_DP4x4(tmpData, src0, zp, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), \\\n\ + uniConvert1stUint8SubZpToFp32_4x4); \\\n\ + int index = convert_int_rte(tmpData.s0 * input_scale); \\\n\ + int4 data; \\\n\ + data = outIdx < index? 
convert_int_rte(outputVal1) : output_ZP; \\\n\ + write_type dst; \\\n\ + VXC_DP2x8(dst, data, data, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 1), uniConvertInt32toUint8_2x8); \\\n\ + VXC_WriteImage2DArray(output, coord, data, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0)); \\\n\ +}\n\ +SEQUENCE_MASK_QINT_TO_QINT(U8, U8, vxc_uchar16, vxc_uchar16)\n\ +SEQUENCE_MASK_QINT_TO_QINT(I8, I8, vxc_char16, vxc_char16)\n\ +SEQUENCE_MASK_QINT_TO_QINT(I16, I16, vxc_short8, vxc_short8)\n\ +SEQUENCE_MASK_QINT_TO_QINT(I16, U8, vxc_short8, vxc_uchar16)\n\ +SEQUENCE_MASK_QINT_TO_QINT(I8, U8, vxc_char16, vxc_uchar16)\n\ +\n\ +__kernel void sequence_mask_F16toF16_2D(\n\ + image2d_t input, image2d_t output, int maxLen)\n\ +{\n\ + int gidx = get_global_id(0);\n\ + int2 coord = (int2)(gidx, get_global_id(1));\n\ + vxc_short8 src0;\n\ + vxc_half8 in_h;\n\ + VXC_ReadImage(src0, input, coord.yy, 0, VXC_MODIFIER(0, 0, 0, VXC_RM_TowardZero, 0));\n\ + int4 outIdx = (int4)(gidx, gidx + 1, gidx + 2, gidx + 3);\n\ + _viv_asm(COPY, in_h, src0, 16);\n\ + float4 tmpData;\n\ + VXC_DP4x4(tmpData, in_h, in_h, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0),\\\n\ + UniFP16toFP32Lo4_dp4x4);\n\ + int index = convert_int_rte(tmpData.x);\n\ + float4 data;\n\ + data = outIdx < index? outputVal1 : convert_float(output_ZP);\n\ + vxc_short8 dst;\n\ + half4 tmpVal;\n\ + _viv_asm(CONV, tmpVal, data);\n\ + _viv_asm(COPY, dst, tmpVal, 16);\n\ + VXC_WriteImage(output, coord, dst.s0246, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0));\n\ +}\n\ +\n\ +__kernel void sequence_mask_F16toF16(\n\ + image2d_t input, image2d_t output, int maxLen)\n\ +{\n\ + int gidx = get_global_id(0);\n\ + int4 coord = (int4)(gidx, get_global_id(1), get_global_id(2), 0);\n\ + vxc_short8 src0;\n\ + vxc_half8 in_h;\n\ + VXC_ReadImage(src0, input, coord.yz, 0, VXC_MODIFIER(0, 0, 0, VXC_RM_TowardZero, 0));\n\ + int4 outIdx = (int4)(gidx, gidx + 1, gidx + 2, gidx + 3);\n\ + _viv_asm(COPY, in_h, src0, 16);\n\ + float4 tmpData;\n\ + VXC_DP4x4(tmpData, in_h, in_h, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0),\\\n\ + UniFP16toFP32Lo4_dp4x4);\n\ + int index = convert_int_rte(tmpData.x);\n\ + float4 data;\n\ + data = outIdx < index? outputVal1 : convert_float(output_ZP);\n\ + vxc_short8 dst;\n\ + half4 tmpVal;\n\ + _viv_asm(CONV, tmpVal, data);\n\ + _viv_asm(COPY, dst, tmpVal, 16);\n\ + VXC_WriteImage2DArray(output, coord, dst.s0246, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0));\n\ +}\n\ +\n\ +__kernel void sequence_mask_F16toU8_2D(\n\ + image2d_t input, image2d_t output, int maxLen)\n\ +{\n\ + int gidx = get_global_id(0);\n\ + int2 coord = (int2)(gidx, get_global_id(1));\n\ + vxc_short8 src0;\n\ + vxc_half8 in_h;\n\ + VXC_ReadImage(src0, input, coord.yy, 0, VXC_MODIFIER(0, 0, 0, VXC_RM_TowardZero, 0));\n\ + int4 outIdx = (int4)(gidx, gidx + 1, gidx + 2, gidx + 3);\n\ + _viv_asm(COPY, in_h, src0, 16);\n\ + float4 tmpData;\n\ + VXC_DP4x4(tmpData, in_h, in_h, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0),\\\n\ + UniFP16toFP32Lo4_dp4x4);\n\ + int index = convert_int_rte(tmpData.x);\n\ + int4 data;\n\ + data = outIdx < index? 
convert_int_rte(outputVal1) : output_ZP;\n\ + vxc_uchar16 dst;\n\ + VXC_DP2x8(dst, data, data, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 1), uniConvertInt32toUint8_2x8);\n\ + VXC_WriteImage(output, coord, dst, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0));\n\ +}\n\ +\n\ +__kernel void sequence_mask_F16toU8(\n\ + image2d_t input, image2d_t output, int maxLen)\n\ +{\n\ + int gidx = get_global_id(0);\n\ + int4 coord = (int4)(gidx, get_global_id(1), get_global_id(2), 0);\n\ + vxc_short8 src0;\n\ + vxc_half8 in_h;\n\ + VXC_ReadImage(src0, input, coord.yz, 0, VXC_MODIFIER(0, 0, 0, VXC_RM_TowardZero, 0));\n\ + int4 outIdx = (int4)(gidx, gidx + 1, gidx + 2, gidx + 3);\n\ + _viv_asm(COPY, in_h, src0, 16);\n\ + float4 tmpData;\n\ + VXC_DP4x4(tmpData, in_h, in_h, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0),\\\n\ + UniFP16toFP32Lo4_dp4x4);\n\ + int index = convert_int_rte(tmpData.x);\n\ + int4 data;\n\ + data = outIdx < index? convert_int_rte(outputVal1) : output_ZP;\n\ + vxc_uchar16 dst;\n\ + VXC_DP2x8(dst, data, data, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 1), uniConvertInt32toUint8_2x8);\n\ + VXC_WriteImage2DArray(output, coord, dst, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0));\n\ +}\n\ +\n\ +"; /* end of sequence_mask_vx*/ + +static const char slice_vx[] = "#include \"cl_viv_vx_ext.h\"\n\ +\n\ +#define SLICE_SAMLEFL_SH_IMPL(name, data_type, end_bin) \\\n\ +__kernel void slice_##name##_I32to##name##_SAMEFL \\\n\ + ( \\\n\ + __read_only image2d_array_t input0, \\\n\ + __read_only image2d_t input1, \\\n\ + __write_only image2d_array_t output, \\\n\ + int is_samefl \\\n\ + ) \\\n\ +{ \\\n\ + int4 coord = (int4)(get_global_id(0), get_global_id(1), get_global_id(2), 0); \\\n\ + int4 coord_in; \\\n\ + Image begin_img = create_image_from_image2d(input1, 4); \\\n\ + uchar* begin_ptr = begin_img.ptr; \\\n\ + int4 begin = ((int4 *)begin_ptr)[0]; \\\n\ + \\\n\ + coord_in = coord + begin; \\\n\ + data_type src; \\\n\ + VXC_ReadImage2DArray(src, input0, coord_in, 0, VXC_MODIFIER(0, end_bin, 0, VXC_RM_TowardZero, 0)); \\\n\ + VXC_WriteImage2DArray(output, coord, src, VXC_MODIFIER(0, end_bin, 0, VXC_RM_TowardZero, 0)); \\\n\ +}\n\ +SLICE_SAMLEFL_SH_IMPL(U8, vxc_uchar16, 15)\n\ +SLICE_SAMLEFL_SH_IMPL(I16, vxc_short8, 7)\n\ +\n\ +\n\ +#define SLICE_SAMLEFL_2D_SH_IMPL(name, data_type, end_bin) \\\n\ +__kernel void slice_##name##_I32to##name##_SAMEFL_2D \\\n\ + ( \\\n\ + __read_only image2d_array_t input0, \\\n\ + __read_only image2d_t input1, \\\n\ + __write_only image2d_array_t output, \\\n\ + int is_samefl \\\n\ + ) \\\n\ +{ \\\n\ + int2 coord = (int2)(get_global_id(0), get_global_id(1)); \\\n\ + int2 coord_in; \\\n\ + Image begin_img = create_image_from_image2d(input1, 4); \\\n\ + uchar* begin_ptr = begin_img.ptr; \\\n\ + int2 begin = ((int2 *)begin_ptr)[0]; \\\n\ + \\\n\ + coord_in = coord + begin; \\\n\ + data_type src; \\\n\ + VXC_ReadImage(src, input0, coord_in, 0, VXC_MODIFIER(0, end_bin, 0, VXC_RM_TowardZero, 0)); \\\n\ + VXC_WriteImage(output, coord, src, VXC_MODIFIER(0, end_bin, 0, VXC_RM_TowardZero, 0)); \\\n\ +}\n\ +SLICE_SAMLEFL_2D_SH_IMPL(U8, vxc_uchar16, 15)\n\ +SLICE_SAMLEFL_2D_SH_IMPL(I16, vxc_short8, 7)\n\ +\n\ +_viv_uniform VXC_512Bits uniU8MulAndPostShift_Lo_2x8;\n\ +_viv_uniform VXC_512Bits uniU8MulAndPostShift_Hi_2x8;\n\ +_viv_uniform int2 multAndoutZP;//[0:15] multiplier, [31:63] output zp\n\ +#define SLICE_8BITSTO16BITS(name0, name1, src_type, dst_type, save_type) \\\n\ +__kernel void slice_##name0##_I32to##name1 \\\n\ + ( \\\n\ + __read_only image2d_array_t input0, \\\n\ + __read_only 
image2d_t input1, \\\n\ + __write_only image2d_array_t output, \\\n\ + int is_samefl \\\n\ + ) \\\n\ +{ \\\n\ + int4 coord = (int4)(get_global_id(0), get_global_id(1), get_global_id(2), 0); \\\n\ + src_type src; \\\n\ + dst_type dst0; \\\n\ + int4 coord_in; \\\n\ + Image begin_img = create_image_from_image2d(input1, 4); \\\n\ + uchar* begin_ptr = begin_img.ptr; \\\n\ + int4 begin = ((int4 *)begin_ptr)[0]; \\\n\ + \\\n\ + coord_in = coord + begin; \\\n\ + VXC_ReadImage2DArray(src, input0, coord_in, 0, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); \\\n\ + \\\n\ + vxc_ushort8 multiplier; \\\n\ + _viv_asm(COPY, multiplier, multAndoutZP, 16); \\\n\ + VXC_DP2x8(dst0, src, multiplier, VXC_MODIFIER(0, 7, 0, VXC_RM_ToNearestEven, 1), \\\n\ + uniU8MulAndPostShift_Lo_2x8); \\\n\ + save_type dst; \\\n\ + _viv_asm(COPY, dst, dst0, 16); \\\n\ + VXC_WriteImage2DArray(output, coord, dst, VXC_MODIFIER(0, 7, 0,VXC_RM_TowardZero, 0)); \\\n\ +}\n\ +SLICE_8BITSTO16BITS(I8, F16, vxc_char16, vxc_half8, vxc_short8)\n\ +SLICE_8BITSTO16BITS(U8, F16, vxc_uchar16, vxc_half8, vxc_short8)\n\ +\n\ +#define SLICE_8BITSTO16BITS_2D(name0, name1, src_type, dst_type, save_type) \\\n\ +__kernel void slice_##name0##_I32to##name1##_2D \\\n\ + ( \\\n\ + __read_only image2d_array_t input0, \\\n\ + __read_only image2d_t input1, \\\n\ + __write_only image2d_array_t output, \\\n\ + int is_samefl \\\n\ + ) \\\n\ +{ \\\n\ + int2 coord = (int2)(get_global_id(0), get_global_id(1)); \\\n\ + src_type src; \\\n\ + dst_type dst0; \\\n\ + int2 coord_in; \\\n\ + Image begin_img = create_image_from_image2d(input1, 4); \\\n\ + uchar* begin_ptr = begin_img.ptr; \\\n\ + int2 begin = ((int2 *)begin_ptr)[0]; \\\n\ + \\\n\ + coord_in = coord + begin; \\\n\ + VXC_ReadImage(src, input0, coord_in, 0, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); \\\n\ + \\\n\ + vxc_ushort8 multiplier; \\\n\ + _viv_asm(COPY, multiplier, multAndoutZP, 16); \\\n\ + VXC_DP2x8(dst0, src, multiplier, VXC_MODIFIER(0, 7, 0, VXC_RM_ToNearestEven, 1), \\\n\ + uniU8MulAndPostShift_Lo_2x8); \\\n\ + save_type dst; \\\n\ + _viv_asm(COPY, dst, dst0, 16); \\\n\ + VXC_WriteImage(output, coord.xy, dst, VXC_MODIFIER(0, 7, 0,VXC_RM_TowardZero, 0)); \\\n\ +}\n\ +SLICE_8BITSTO16BITS_2D(I8, F16, vxc_char16, vxc_half8, vxc_short8)\n\ +SLICE_8BITSTO16BITS_2D(U8, F16, vxc_uchar16, vxc_half8, vxc_short8)\n\ +\n\ +#define SLICE_8BITSTO8BITS(name0, name1, src_type, dst_type) \\\n\ +__kernel void slice_##name0##_I32to##name1 \\\n\ + ( \\\n\ + __read_only image2d_array_t input0, \\\n\ + __read_only image2d_t input1, \\\n\ + __write_only image2d_array_t output, \\\n\ + int is_samefl \\\n\ + ) \\\n\ +{ \\\n\ + int4 coord = (int4)(get_global_id(0), get_global_id(1), get_global_id(2), 0); \\\n\ + src_type src; \\\n\ + dst_type dst; \\\n\ + int4 coord_in; \\\n\ + Image begin_img = create_image_from_image2d(input1, 4); \\\n\ + uchar* begin_ptr = begin_img.ptr; \\\n\ + int4 begin = ((int4 *)begin_ptr)[0]; \\\n\ + \\\n\ + coord_in = coord + begin; \\\n\ + VXC_ReadImage2DArray(src, input0, coord_in, 0, VXC_MODIFIER(0, 15, 0, VXC_RM_TowardZero, 0)); \\\n\ + \\\n\ + vxc_ushort8 multiplier; \\\n\ + _viv_asm(COPY, multiplier, multAndoutZP, 16); \\\n\ + VXC_DP2x8(dst, src, multiplier, VXC_MODIFIER(0, 7, 0, VXC_RM_ToNearestEven, 1), \\\n\ + uniU8MulAndPostShift_Lo_2x8); \\\n\ + VXC_DP2x8(dst, src, multiplier, VXC_MODIFIER(8, 15, 0, VXC_RM_ToNearestEven, 1), \\\n\ + uniU8MulAndPostShift_Hi_2x8); \\\n\ + VXC_WriteImage2DArray(output, coord, dst, VXC_MODIFIER(0, 15, 0,VXC_RM_TowardZero, 0)); \\\n\ +}\n\ 
+SLICE_8BITSTO8BITS(I8, I8, vxc_char16, vxc_char16)\n\ +SLICE_8BITSTO8BITS(U8, U8, vxc_uchar16, vxc_uchar16)\n\ +\n\ +#define SLICE_8BITSTO8BITS_2D(name0, name1, src_type, dst_type) \\\n\ +__kernel void slice_##name0##_I32to##name1##_2D \\\n\ + ( \\\n\ + __read_only image2d_array_t input0, \\\n\ + __read_only image2d_t input1, \\\n\ + __write_only image2d_array_t output, \\\n\ + int is_samefl \\\n\ + ) \\\n\ +{ \\\n\ + int2 coord = (int2)(get_global_id(0), get_global_id(1)); \\\n\ + src_type src; \\\n\ + dst_type dst; \\\n\ + int2 coord_in; \\\n\ + Image begin_img = create_image_from_image2d(input1, 4); \\\n\ + uchar* begin_ptr = begin_img.ptr; \\\n\ + int2 begin = ((int2 *)begin_ptr)[0]; \\\n\ + \\\n\ + coord_in = coord + begin; \\\n\ + VXC_ReadImage(src, input0, coord_in, 0, VXC_MODIFIER(0, 15, 0, VXC_RM_TowardZero, 0)); \\\n\ + \\\n\ + vxc_ushort8 multiplier; \\\n\ + _viv_asm(COPY, multiplier, multAndoutZP, 16); \\\n\ + VXC_DP2x8(dst, src, multiplier, VXC_MODIFIER(0, 7, 0, VXC_RM_ToNearestEven, 1), \\\n\ + uniU8MulAndPostShift_Lo_2x8); \\\n\ + VXC_DP2x8(dst, src, multiplier, VXC_MODIFIER(8, 15, 0, VXC_RM_ToNearestEven, 1), \\\n\ + uniU8MulAndPostShift_Hi_2x8); \\\n\ + VXC_WriteImage(output, coord.xy, dst, VXC_MODIFIER(0, 15, 0,VXC_RM_TowardZero, 0)); \\\n\ +}\n\ +SLICE_8BITSTO8BITS_2D(I8, I8, vxc_char16, vxc_char16)\n\ +SLICE_8BITSTO8BITS_2D(U8, U8, vxc_uchar16, vxc_uchar16)\n\ +\n\ +#define SLICE_16BITS_TO(name0, name1, src_type, copy_type, dst_type) \\\n\ +__kernel void slice_##name0##_I32to##name1 \\\n\ + ( \\\n\ + __read_only image2d_array_t input0, \\\n\ + __read_only image2d_t input1, \\\n\ + __write_only image2d_array_t output, \\\n\ + int is_samefl \\\n\ + ) \\\n\ +{ \\\n\ + int4 coord = (int4)(get_global_id(0), get_global_id(1), get_global_id(2), 0); \\\n\ + src_type src; \\\n\ + copy_type src0; \\\n\ + dst_type dst; \\\n\ + int4 coord_in; \\\n\ + Image begin_img = create_image_from_image2d(input1, 4); \\\n\ + uchar* begin_ptr = begin_img.ptr; \\\n\ + int4 begin = ((int4 *)begin_ptr)[0]; \\\n\ + \\\n\ + coord_in = coord + begin; \\\n\ + VXC_ReadImage2DArray(src0, input0, coord_in, 0, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); \\\n\ + _viv_asm(COPY, src, src0, 16); \\\n\ + \\\n\ + vxc_ushort8 multiplier; \\\n\ + _viv_asm(COPY, multiplier, multAndoutZP, 16); \\\n\ + VXC_DP2x8(dst, src, multiplier, VXC_MODIFIER(0, 7, 0, VXC_RM_ToNearestEven, 1), \\\n\ + uniU8MulAndPostShift_Lo_2x8); \\\n\ + VXC_WriteImage2DArray(output, coord, dst, VXC_MODIFIER(0, 7, 0,VXC_RM_TowardZero, 0)); \\\n\ +}\n\ +SLICE_16BITS_TO(F16, I8, vxc_half8, vxc_short8, vxc_char16)\n\ +SLICE_16BITS_TO(F16, U8, vxc_half8, vxc_short8, vxc_uchar16)\n\ +SLICE_16BITS_TO(F16, I16, vxc_half8, vxc_short8, vxc_short8)\n\ +\n\ +#define SLICE_16BITS_TO_2D(name0, name1, src_type, copy_type, dst_type) \\\n\ +__kernel void slice_##name0##_I32to##name1##_2D \\\n\ + ( \\\n\ + __read_only image2d_array_t input0, \\\n\ + __read_only image2d_t input1, \\\n\ + __write_only image2d_array_t output, \\\n\ + int is_samefl \\\n\ + ) \\\n\ +{ \\\n\ + int2 coord = (int2)(get_global_id(0), get_global_id(1)); \\\n\ + src_type src; \\\n\ + copy_type src0; \\\n\ + dst_type dst; \\\n\ + int2 coord_in; \\\n\ + Image begin_img = create_image_from_image2d(input1, 4); \\\n\ + uchar* begin_ptr = begin_img.ptr; \\\n\ + int2 begin = ((int2 *)begin_ptr)[0]; \\\n\ + \\\n\ + coord_in = coord + begin; \\\n\ + VXC_ReadImage(src0, input0, coord_in, 0, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); \\\n\ + _viv_asm(COPY, src, src0, 16); \\\n\ + \\\n\ + 
vxc_ushort8 multiplier; \\\n\ + _viv_asm(COPY, multiplier, multAndoutZP, 16); \\\n\ + VXC_DP2x8(dst, src, multiplier, VXC_MODIFIER(0, 7, 0, VXC_RM_ToNearestEven, 1), \\\n\ + uniU8MulAndPostShift_Lo_2x8); \\\n\ + VXC_WriteImage(output, coord.xy, dst, VXC_MODIFIER(0, 7, 0,VXC_RM_TowardZero, 0)); \\\n\ +}\n\ +SLICE_16BITS_TO_2D(F16, I8, vxc_half8, vxc_short8, vxc_char16)\n\ +SLICE_16BITS_TO_2D(F16, U8, vxc_half8, vxc_short8, vxc_uchar16)\n\ +SLICE_16BITS_TO_2D(F16, I16, vxc_half8, vxc_short8, vxc_short8)"; /* end of slice_vx*/ + static const char space2depth_internal_vx[] = "#include \"cl_viv_vx_ext.h\"\n\ \n\ _viv_uniform VXC_512Bits uniExtractEvenUint8Stride2_2x8;\n\ @@ -33245,6 +38527,43 @@ TILE_2D(I16, I16, 6, 5, vxc_short8)\n\ TILE_2D(I16, I16, 7, 6, vxc_short8)\n\ TILE_2D(I16, I16, 0, 7, vxc_short8)\n\ \n\ +#define TILE_2D_1TON(name0, name1, type) \\\n\ +__kernel void tile_1toN_##name0##to##name1##_2D( \\\n\ + __read_only image2d_array_t input, \\\n\ + __write_only image2d_array_t output, \\\n\ + int batchIn, \\\n\ + int depthIn, \\\n\ + int depthOut, \\\n\ + int multiples_0, \\\n\ + int multiples_1, \\\n\ + int multiples_2, \\\n\ + int multiples_3 \\\n\ +) \\\n\ +{ \\\n\ + int2 coord = (int2)(get_global_id(0), get_global_id(1)); \\\n\ + int width = get_image_width(input); \\\n\ + int height = get_image_height(input); \\\n\ + int output_width = get_image_width(output); \\\n\ + int output_height = get_image_height(output); \\\n\ + type src; \\\n\ + VXC_ReadImage(src, input, coord, VXC_5BITOFFSET_XY(0, 0), \\\n\ + VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); \\\n\ + \\\n\ + do \\\n\ + { \\\n\ + do \\\n\ + { \\\n\ + VXC_WriteImage(output, coord, src, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); \\\n\ + coord.x += 8; \\\n\ + } while (coord.x < output_width); \\\n\ + coord.x = 0; \\\n\ + coord.y += height; \\\n\ + } while (coord.y < output_height); \\\n\ +}\n\ +TILE_2D_1TON(U8, U8, vxc_uchar8)\n\ +TILE_2D_1TON(I16, I16, vxc_short8)\n\ +\n\ +\n\ \n\ "; /* end of tile_vx*/ @@ -34604,16 +39923,6 @@ UPSAMPLE_SCALETO16B_FUN(I16, F16, vxc_short8, vxc_short8, vxc_half8, vxc_sho UPSAMPLE_SCALETO16B_FUN(I16, I16, vxc_short8, vxc_short8, vxc_short8, vxc_short8)\n\ "; /* end of upsamplescale_k2_vx*/ -static const char vsi_nn_kernel_axis_aligned_bbox_transform_vx[] = "#include \"cl_viv_vx_ext.h\"\n\ -\n\ -__kernel void vxcAxis_aligned_bbox_transform(\n\ - __read_only image2d_array_t input,\n\ - __write_only image2d_array_t output)\n\ -{\n\ -\n\ -}\n\ -"; /* end of vsi_nn_kernel_axis_aligned_bbox_transform_vx*/ - static const char vsi_nn_kernel_box_with_nms_limit_vx[] = "#include \"cl_viv_vx_ext.h\"\n\ \n\ __kernel void vxcBox_with_nms_limit(\n\ @@ -34679,16 +39988,6 @@ __kernel void vxcExtra_ending_u8(\n\ }\n\ "; /* end of vsi_nn_kernel_extra_ending_vx*/ -static const char vsi_nn_kernel_generate_proposals_vx[] = "#include \"cl_viv_vx_ext.h\"\n\ -\n\ -__kernel void vxcGenerate_proposals(\n\ - __read_only image2d_array_t input,\n\ - __write_only image2d_array_t output)\n\ -{\n\ -\n\ -}\n\ -"; /* end of vsi_nn_kernel_generate_proposals_vx*/ - static const char vsi_nn_kernel_header_vx[] = "/*\n\ ============================================================================\n\ Name : libNNExt.vx\n\ @@ -34700,6 +39999,62 @@ static const char vsi_nn_kernel_header_vx[] = "/*\n\ */\n\ #include \"cl_viv_vx_ext.h\"\n\ \n\ +typedef struct Image\n\ +{\n\ + __global uchar *ptr;\n\ + int stride_x;\n\ + int stride_y;\n\ +} Image;\n\ +\n\ +inline uchar* get_image_ptr_from_coord(Image img, int2 coord)\n\ +{\n\ + return 
img.ptr + coord.x * img.stride_x + coord.y * img.stride_y;\n\ +}\n\ +\n\ +inline Image create_image_from_image2d(image2d_t input, int stride_x)\n\ +{\n\ + int8 desc;\n\ + _viv_asm(COPY, desc, input, sizeof(desc));\n\ +\n\ + Image img =\n\ + {\n\ + .ptr = (uchar*)desc.s0,\n\ + .stride_x = stride_x,\n\ + .stride_y = desc.s1\n\ + };\n\ +\n\ + return img;\n\ +}\n\ +\n\ +typedef struct Tensor\n\ +{\n\ + __global uchar *ptr;\n\ + int stride_x;\n\ + int stride_y;\n\ + int stride_z;\n\ +} Tensor;\n\ +\n\ +inline uchar* create_tensor_ptr_from_coord(Tensor t, int4 coord)\n\ +{\n\ + return t.ptr + coord.x * t.stride_x + coord.y * t.stride_y + coord.z * t.stride_z;\n\ +}\n\ +\n\ +inline Tensor create_tensor_from_image2d_array(image2d_array_t input, int stride_x)\n\ +{\n\ + int8 desc;\n\ + _viv_asm(COPY, desc, input, sizeof(desc));\n\ +\n\ + Tensor t =\n\ + {\n\ + .ptr = (uchar*)desc.s0,\n\ + .stride_x = stride_x,\n\ + .stride_y = desc.s1,\n\ + .stride_z = desc.s4\n\ + };\n\ +\n\ + return t;\n\ +}\n\ +\n\ #if (VX_VERSION==1)\n\ #define VXC_DP2x8_b_(dst, src0, src1, src2, info, uniform)\\\n\ do\\\n\ @@ -36666,16 +42021,6 @@ __kernel void vxcTensorStackConcat8Bits(\n\ VXC_WriteImage2DArray(output, coord, src1, VXC_MODIFIER(0, 15, 0, VXC_RM_TowardZero, 0));\n\ }"; /* end of vsi_nn_kernel_tensorstackconcat_vx*/ -static const char vsi_nn_kernel_topk_vx[] = "#include \"cl_viv_vx_ext.h\"\n\ -\n\ -__kernel void vxcTopk(\n\ - __read_only image2d_array_t input,\n\ - __write_only image2d_array_t output)\n\ -{\n\ -\n\ -}\n\ -"; /* end of vsi_nn_kernel_topk_vx*/ - static const char vsi_nn_kernel_transform_gemm_vx[] = "/*\n\ ============================================================================\n\ Name : gemm.vx\n\ @@ -38334,6 +43679,62 @@ __kernel void detect_post_box_U8_U8toF32(\n\ static const char eltwise_ops_helper_cl[] = "#pragma OPENCL EXTENSION CL_VIV_asm : enable\n\ #pragma OPENCL EXTENSION cl_viv_vx_extension : enable\n\ \n\ +typedef struct Image\n\ +{\n\ + __global uchar *ptr;\n\ + int stride_x;\n\ + int stride_y;\n\ +} Image;\n\ +\n\ +inline uchar* get_image_ptr_from_coord(Image img, int2 coord)\n\ +{\n\ + return img.ptr + coord.x * img.stride_x + coord.y * img.stride_y;\n\ +}\n\ +\n\ +inline Image create_image_from_image2d(image2d_t input, int stride_x)\n\ +{\n\ + int8 desc;\n\ + _viv_asm(COPY, desc, input, sizeof(desc));\n\ +\n\ + Image img =\n\ + {\n\ + .ptr = (uchar*)desc.s0,\n\ + .stride_x = stride_x,\n\ + .stride_y = desc.s1\n\ + };\n\ +\n\ + return img;\n\ +}\n\ +\n\ +typedef struct Tensor\n\ +{\n\ + __global uchar *ptr;\n\ + int stride_x;\n\ + int stride_y;\n\ + int stride_z;\n\ +} Tensor;\n\ +\n\ +inline uchar* create_tensor_ptr_from_coord(Tensor t, int4 coord)\n\ +{\n\ + return t.ptr + coord.x * t.stride_x + coord.y * t.stride_y + coord.z * t.stride_z;\n\ +}\n\ +\n\ +inline Tensor create_tensor_from_image2d_array(image2d_array_t input, int stride_x)\n\ +{\n\ + int8 desc;\n\ + _viv_asm(COPY, desc, input, sizeof(desc));\n\ +\n\ + Tensor t =\n\ + {\n\ + .ptr = (uchar*)desc.s0,\n\ + .stride_x = stride_x,\n\ + .stride_y = desc.s1,\n\ + .stride_z = desc.s4\n\ + };\n\ +\n\ + return t;\n\ +}\n\ +\n\ #define readImage2DArray(Dest, Image, Coord) \\\n\ do { \\\n\ int8 desc; \\\n\ @@ -38431,6 +43832,11 @@ float4 eltwise_unary_mish(float4 x, float alpha)\n\ return x;\n\ }\n\ \n\ +float4 eltwise_unary_round(float4 x, float alpha)\n\ +{\n\ + return convert_float4(convert_int4_rte(x));\n\ +}\n\ +\n\ #define ELTWISE_UNARY_F32(func_name) \\\n\ __kernel void func_name##_F32toF32 \\\n\ ( \\\n\ @@ -38458,6 
+43864,7 @@ ELTWISE_UNARY_F32(elu)\n\ ELTWISE_UNARY_F32(neg)\n\ ELTWISE_UNARY_F32(mish)\n\ ELTWISE_UNARY_F32(hard_sigmoid)\n\ +ELTWISE_UNARY_F32(round)\n\ \n\ #define ELTWISE_UNARY_F32_2D(func_name) \\\n\ __kernel void func_name##_F32toF32_2D \\\n\ @@ -38486,6 +43893,7 @@ ELTWISE_UNARY_F32_2D(elu)\n\ ELTWISE_UNARY_F32_2D(neg)\n\ ELTWISE_UNARY_F32_2D(mish)\n\ ELTWISE_UNARY_F32_2D(hard_sigmoid)\n\ +ELTWISE_UNARY_F32_2D(round)\n\ \n\ #define ELTWISE_UNARY_U8(func_name) \\\n\ __kernel void func_name##_U8toU8 \\\n\ @@ -38516,6 +43924,7 @@ ELTWISE_UNARY_U8(elu)\n\ ELTWISE_UNARY_U8(neg)\n\ ELTWISE_UNARY_U8(mish)\n\ ELTWISE_UNARY_U8(hard_sigmoid)\n\ +ELTWISE_UNARY_U8(round)\n\ \n\ #define ELTWISE_UNARY_U8_2D(func_name) \\\n\ __kernel void func_name##_U8toU8_2D \\\n\ @@ -38546,7 +43955,7 @@ ELTWISE_UNARY_U8_2D(elu)\n\ ELTWISE_UNARY_U8_2D(neg)\n\ ELTWISE_UNARY_U8_2D(mish)\n\ ELTWISE_UNARY_U8_2D(hard_sigmoid)\n\ -\n\ +ELTWISE_UNARY_U8_2D(round)\n\ \n\ __kernel void neg_I32toI32\n\ (\n\ @@ -38587,6 +43996,121 @@ __kernel void neg_I32toI32_2D\n\ }\n\ "; /* end of eltwise_unary_cl*/ +static const char erf_cl[] = "#define MUL2_RSQRTPI (1.1283791670955126f)\n\ +float eltwise_unary_erf(float x)\n\ +{\n\ + float res = 0;\n\ + float tmp = x;\n\ + float factorial = 1;\n\ + float x_pow = x;\n\ + float one = 1.0f;\n\ + float n = 1;\n\ +\n\ + while (fabs(tmp) > 1e-5)\n\ + {\n\ + res += tmp;\n\ +\n\ + factorial *= n;\n\ + one *= -1;\n\ + x_pow *= x * x;\n\ + tmp = one / factorial * x_pow / ( 2 * n + 1);\n\ +\n\ + n += 1.0f;\n\ + }\n\ + return res * MUL2_RSQRTPI;\n\ +}\n\ +\n\ +#define ELTWISE_UNARY_F32(func_name) \\\n\ +__kernel void func_name##_F32toF32 \\\n\ + ( \\\n\ + __read_only image2d_array_t input, \\\n\ + __write_only image2d_array_t output, \\\n\ + float inputScale, \\\n\ + float inputTail, \\\n\ + float outputScale, \\\n\ + float outputZP \\\n\ + ) \\\n\ +{ \\\n\ + int4 coord = (int4)(get_global_id(0), get_global_id(1), get_global_id(2), 0); \\\n\ + \\\n\ + float4 src = read_imagef(input, coord); \\\n\ + \\\n\ + float4 dst = 0; \\\n\ + dst.x = eltwise_unary_##func_name(src.x); \\\n\ + \\\n\ + write_imagef(output, coord, dst); \\\n\ +}\n\ +ELTWISE_UNARY_F32(erf)\n\ +\n\ +#define ELTWISE_UNARY_F32_2D(func_name) \\\n\ +__kernel void func_name##_F32toF32_2D \\\n\ + ( \\\n\ + __read_only image2d_t input, \\\n\ + __write_only image2d_t output, \\\n\ + float inputScale, \\\n\ + float inputTail, \\\n\ + float outputScale, \\\n\ + float outputZP \\\n\ + ) \\\n\ +{ \\\n\ + int2 coord = (int2)(get_global_id(0), get_global_id(1)); \\\n\ + \\\n\ + float4 src = read_imagef(input, coord); \\\n\ + \\\n\ + float4 dst = 0; \\\n\ + dst.x = eltwise_unary_##func_name(src.x); \\\n\ + \\\n\ + write_imagef(output, coord, dst); \\\n\ +}\n\ +ELTWISE_UNARY_F32_2D(erf)\n\ +\n\ +#define ELTWISE_UNARY_U8(func_name) \\\n\ +__kernel void func_name##_U8toU8 \\\n\ + ( \\\n\ + __read_only image2d_array_t input, \\\n\ + __write_only image2d_array_t output, \\\n\ + float inputScale, \\\n\ + float inputTail, \\\n\ + float outputScale, \\\n\ + float outputZP \\\n\ + ) \\\n\ +{ \\\n\ + int4 coord = (int4)(get_global_id(0), get_global_id(1), get_global_id(2), 0); \\\n\ + \\\n\ + uint4 src = read_imageui(input, coord); \\\n\ + float4 data = convert_float4(src) * inputScale - inputTail; \\\n\ + \\\n\ + data.x = eltwise_unary_##func_name(data.x); \\\n\ + uint4 dst = convert_uint4(data * outputScale + outputZP); \\\n\ + \\\n\ + write_imageui(output, coord, dst); \\\n\ +}\n\ +ELTWISE_UNARY_U8(erf)\n\ +\n\ +#define ELTWISE_UNARY_U8_2D(func_name) 
\\\n\ +__kernel void func_name##_U8toU8_2D \\\n\ + ( \\\n\ + __read_only image2d_t input, \\\n\ + __write_only image2d_t output, \\\n\ + float inputScale, \\\n\ + float inputTail, \\\n\ + float outputScale, \\\n\ + float outputZP \\\n\ + ) \\\n\ +{ \\\n\ + int2 coord = (int2)(get_global_id(0), get_global_id(1)); \\\n\ + \\\n\ + uint4 src = read_imageui(input, coord); \\\n\ + float4 data = convert_float4(src) * inputScale - inputTail; \\\n\ + \\\n\ + data.x = eltwise_unary_##func_name(data.x); \\\n\ + uint4 dst = convert_uint4(data * outputScale + outputZP); \\\n\ + \\\n\ + write_imageui(output, coord, dst); \\\n\ +}\n\ +ELTWISE_UNARY_U8_2D(erf)\n\ +"; /* end of erf_cl*/ + static const char floordiv_cl[] = "__kernel void floordiv_F32F32toF32(\n\ __read_only image2d_array_t input,\n\ __read_only image2d_array_t input1,\n\ @@ -38639,6 +44163,44 @@ __kernel void floordiv_I32I32toI32_2D(\n\ write_imagei(output, coord, dst);\n\ }\n\ \n\ +__kernel void floordiv_I32I32toU8(\n\ + __read_only image2d_array_t input,\n\ + __read_only image2d_array_t input1,\n\ + __write_only image2d_array_t output,\n\ + float input0Scale,\n\ + float input0Tail,\n\ + float input1Scale,\n\ + float input1Tail,\n\ + float outputScale,\n\ + float outputTail )\n\ +{\n\ + int4 coord = (int4)(get_global_id(0), get_global_id(1), get_global_id(2), 0);\n\ + int4 src0;\n\ + int4 src1;\n\ + readImage2DArray(src0, input, coord);\n\ + readImage2DArray(src1, input1, coord);\n\ + uint4 dst = convert_uint4(floor(convert_float4(src0) / convert_float4(src1)) * outputScale + outputTail);\n\ + write_imageui(output, coord, dst);\n\ +}\n\ +\n\ +__kernel void floordiv_I32I32toU8_2D(\n\ + __read_only image2d_t input,\n\ + __read_only image2d_t input1,\n\ + __write_only image2d_t output,\n\ + float input0Scale,\n\ + float input0Tail,\n\ + float input1Scale,\n\ + float input1Tail,\n\ + float outputScale,\n\ + float outputTail )\n\ +{\n\ + int2 coord = (int2)(get_global_id(0), get_global_id(1));\n\ + int4 src0 = read_imagei(input, coord);\n\ + int4 src1 = read_imagei(input1, coord);\n\ + uint4 dst = convert_uint4(floor(convert_float4(src0) / convert_float4(src1)) * outputScale + outputTail);\n\ + write_imageui(output, coord, dst);\n\ +}\n\ +\n\ __kernel void floordiv_U8U8toU8(\n\ __read_only image2d_array_t input,\n\ __read_only image2d_array_t input1,\n\ @@ -38683,6 +44245,52 @@ __kernel void floordiv_U8U8toU8_2D(\n\ uint4 dst = convert_uint4(out);\n\ write_imageui(output, coord, dst);\n\ }\n\ +\n\ +__kernel void floordiv_U8I32toU8(\n\ + __read_only image2d_array_t input,\n\ + __read_only image2d_array_t input1,\n\ + __write_only image2d_array_t output,\n\ + float input0Scale,\n\ + float input0Tail,\n\ + float input1Scale,\n\ + float input1Tail,\n\ + float outputScale,\n\ + float outputTail )\n\ +{\n\ + int4 coord = (int4)(get_global_id(0), get_global_id(1), get_global_id(2), 0);\n\ + uint4 src0;\n\ + int4 src1;\n\ + float4 in0, in1, out;\n\ + readImage2DArray(src0, input, coord);\n\ + readImage2DArray(src1, input1, coord);\n\ + in0 = convert_float4(src0) * input0Scale + input0Tail;\n\ + in1 = convert_float4(src1);\n\ + out = floor(in0 / in1) * outputScale + outputTail;\n\ + uint4 dst = convert_uint4(out);\n\ + write_imageui(output, coord, dst);\n\ +}\n\ +\n\ +__kernel void floordiv_U8I32toU8_2D(\n\ + __read_only image2d_t input,\n\ + __read_only image2d_t input1,\n\ + __write_only image2d_t output,\n\ + float input0Scale,\n\ + float input0Tail,\n\ + float input1Scale,\n\ + float input1Tail,\n\ + float outputScale,\n\ + float outputTail )\n\ 
+{\n\ + int2 coord = (int2)(get_global_id(0), get_global_id(1));\n\ + uint4 src0 = read_imageui(input, coord);\n\ + int4 src1 = read_imagei(input1, coord);\n\ + float4 in0, in1, out;\n\ + in0 = convert_float4(src0) * input0Scale + input0Tail;\n\ + in1 = convert_float4(src1);\n\ + out = floor(in0 / in1) * outputScale + outputTail;\n\ + uint4 dst = convert_uint4(out);\n\ + write_imageui(output, coord, dst);\n\ +}\n\ "; /* end of floordiv_cl*/ static const char gather_cl[] = "__kernel void gather_U8toU8(\n\ @@ -39036,6 +44644,825 @@ __kernel void gather_nd_F32toF32_3D(\n\ }\n\ "; /* end of gather_nd_3d_cl*/ +static const char group_normalization_f32_cl[] = "__kernel void group_norm_sumsqr_F32(\n\ + __read_only image2d_array_t input,\n\ + __write_only image2d_t output,\n\ + float eps,\n\ + int is2d,\n\ + float input_zp,\n\ + float input_scale,\n\ + int width,\n\ + int height\n\ + )\n\ +{\n\ + int gidx = get_global_id(0);\n\ + int gidz = get_global_id(1);\n\ + int lidx = get_local_id(0);\n\ +\n\ + int4 coord = (int4)(gidx, 0, gidz, 0);\n\ + float4 data;\n\ + float sum = 0, sqr = 0;\n\ +\n\ + __local float lcl_sum[16];\n\ + __local float lcl_sqr[16];\n\ +\n\ + if(gidx < width)\n\ + {\n\ + for(coord.y = 0; coord.y < height;)\n\ + {\n\ + data = read_imagef(input, coord);\n\ + coord.y++;\n\ + sum += data.x;\n\ + sqr += data.x * data.x;\n\ + }\n\ + }\n\ + lcl_sum[lidx] = sum;\n\ + lcl_sqr[lidx] = sqr;\n\ + barrier(CLK_LOCAL_MEM_FENCE);\n\ +\n\ + int4 coord_out = (int4)(get_group_id(0) << 2, gidz, 0, 0);\n\ + if(lidx == 0)\n\ + {\n\ + float4 one = (float4)(1, 1, 1, 1);\n\ + __local float4* tmp_sum = (__local float4*)lcl_sum;\n\ + __local float4* tmp_sqr = (__local float4*)lcl_sqr;\n\ +\n\ + sum = 0; sqr = 0;\n\ + for(int i = 0; i < 4; i++)\n\ + {\n\ + sum += dot(tmp_sum[i], one);\n\ + sqr += dot(tmp_sqr[i], one);\n\ + }\n\ +\n\ + float4 dst = (float4)(0);\n\ + dst.x = sum;\n\ + write_imagef(output, coord_out.xy, dst);\n\ + coord_out.x++;\n\ + dst.x = sqr;\n\ + write_imagef(output, coord_out.xy, dst);\n\ + }\n\ +}\n\ +\n\ +__kernel void group_norm_sumsqr_F32_2D(\n\ + __read_only image2d_t input,\n\ + __write_only image2d_t output,\n\ + float eps,\n\ + int is2d,\n\ + float input_zp,\n\ + float input_scale,\n\ + int width,\n\ + int height\n\ + )\n\ +{\n\ + int gidx = get_global_id(0);\n\ + int gidz = get_global_id(1);\n\ + int lidx = get_local_id(0);\n\ +\n\ + int2 coord = (int2)(gidx, gidz);\n\ + float4 data;\n\ + float sum = 0, sqr = 0;\n\ +\n\ + __local float lcl_sum[16];\n\ + __local float lcl_sqr[16];\n\ +\n\ + if(gidx < width)\n\ + {\n\ + data = read_imagef(input, coord);\n\ + sum = data.x;\n\ + sqr = data.x * data.x;\n\ + }\n\ + lcl_sum[lidx] = sum;\n\ + lcl_sqr[lidx] = sqr;\n\ + barrier(CLK_LOCAL_MEM_FENCE);\n\ +\n\ + int4 coord_out = (int4)(get_group_id(0) << 2, gidz, 0, 0);\n\ + if(lidx == 0)\n\ + {\n\ + float4 one = (float4)(1, 1, 1, 1);\n\ + __local float4* tmp_sum = (__local float4*)lcl_sum;\n\ + __local float4* tmp_sqr = (__local float4*)lcl_sqr;\n\ +\n\ + sum = 0; sqr = 0;\n\ + for(int i = 0; i < 4; i++)\n\ + {\n\ + sum += dot(tmp_sum[i], one);\n\ + sqr += dot(tmp_sqr[i], one);\n\ + }\n\ +\n\ + float4 dst = (float4)(0);\n\ + dst.x = sum;\n\ + write_imagef(output, coord_out.xy, dst);\n\ + coord_out.x++;\n\ + dst.x = sqr;\n\ + write_imagef(output, coord_out.xy, dst);\n\ + }\n\ +}\n\ +\n\ +__kernel void group_norm_meanvari(\n\ + __read_only image2d_t input,\n\ + __write_only image2d_t output,\n\ + float eps,\n\ + float group_ratio,\n\ + int group_stride\n\ + )\n\ +{\n\ + int gidx = 
get_global_id(0);\n\ + int lidx = get_local_id(0);\n\ + int2 coord = (int2)(gidx, get_global_id(1));\n\ +\n\ + float2 sum_sqr = (float2)(0);\n\ + float4 mean_vari = (float4)(0);\n\ +\n\ + __local float2 lcl_data[16];\n\ + __local float2 lcl_sum[4];\n\ +\n\ + for(; coord.x < group_stride;)\n\ + {\n\ + mean_vari.x += read_imagef(input, coord).x;\n\ + coord.x++;\n\ + mean_vari.y += read_imagef(input, coord).x;\n\ + coord.x+=63;\n\ + }\n\ + lcl_data[lidx] = mean_vari.xy;\n\ + barrier(CLK_LOCAL_MEM_FENCE);\n\ + if(lidx < 4)\n\ + {\n\ + float2 tmpSum = (float2)(0);\n\ + for(int i = lidx; i < 16; i+=4)\n\ + {\n\ + tmpSum += lcl_data[i];\n\ + }\n\ + lcl_sum[lidx] = tmpSum;\n\ + }\n\ + barrier(CLK_LOCAL_MEM_FENCE);\n\ + if(lidx == 0)\n\ + {\n\ + for(int i = 0; i < 4; i++)\n\ + {\n\ + sum_sqr += lcl_sum[i];\n\ + }\n\ + mean_vari.xy = sum_sqr * group_ratio;\n\ + mean_vari.s1 = mean_vari.s1 - mean_vari.s0 * mean_vari.s0 + eps;\n\ + mean_vari.s1 = rsqrt(mean_vari.s1);\n\ +\n\ + coord.x = 0;\n\ + write_imagef(output, coord, mean_vari);\n\ + coord.x++;\n\ + float4 data;\n\ + data.x = mean_vari.y;\n\ + write_imagef(output, coord, data);\n\ + }\n\ +}\n\ +\n\ +__kernel void group_norm_F32toF32(\n\ + __read_only image2d_array_t input,\n\ + __read_only image2d_t bias,\n\ + __read_only image2d_t scale,\n\ + __read_only image2d_t meanVari,\n\ + __write_only image2d_array_t output,\n\ + float eps,\n\ + int is2d,\n\ + float input_zp,\n\ + float input_scale,\n\ + float output_zp,\n\ + float output_scale,\n\ + float rSpaceOrg,\n\ + int width,\n\ + int height,\n\ + int pStride\n\ + )\n\ +{\n\ + int gidy = get_global_id(1);\n\ + int gidz = get_global_id(2);\n\ + int4 coord = (int4)(get_global_id(0), gidy, gidz, 0);\n\ + int4 coord_para = (int4)((convert_int(get_global_id(0) * rSpaceOrg) + gidy * pStride), gidz, 0, 1);\n\ +\n\ + float4 gamma = read_imagef(scale, coord_para.xy);\n\ + float4 beta = read_imagef(bias, coord_para.xy);\n\ + float4 mean_vari = read_imagef(meanVari, coord_para.zy);\n\ + mean_vari.y = read_imagef(meanVari, coord_para.wy).x;\n\ + float4 data = read_imagef(input, coord);\n\ +\n\ + float scale_vari, bias_val;\n\ +\n\ + scale_vari = gamma.s0 * mean_vari.s1;\n\ + bias_val = (beta.s0 - scale_vari * mean_vari.s0);\n\ +\n\ + float4 dst;\n\ +\n\ + dst.x = data.x * scale_vari + bias_val;\n\ + write_imagef(output, coord, dst);\n\ +}\n\ +\n\ +__kernel void group_norm_F32toF32_2D(\n\ + __read_only image2d_t input,\n\ + __read_only image2d_t bias,\n\ + __read_only image2d_t scale,\n\ + __read_only image2d_t meanVari,\n\ + __write_only image2d_t output,\n\ + float eps,\n\ + int is2d,\n\ + float input_zp,\n\ + float input_scale,\n\ + float output_zp,\n\ + float output_scale,\n\ + float rSpaceOrg,\n\ + int width,\n\ + int height,\n\ + int pStride\n\ + )\n\ +{\n\ + int gidz = get_global_id(1);\n\ + int2 coord = (int2)(get_global_id(0), gidz);\n\ + int4 coord_para = (int4)(convert_int(get_global_id(0) * rSpaceOrg), gidz, 0, 1);\n\ +\n\ + float4 gamma = read_imagef(scale, coord_para.xy);\n\ + float4 beta = read_imagef(bias, coord_para.xy);\n\ + float4 mean_vari = read_imagef(meanVari, coord_para.zy);\n\ + mean_vari.y = read_imagef(meanVari, coord_para.wy).x;\n\ + float4 data = read_imagef(input, coord);\n\ +\n\ + float scale_vari, bias_val;\n\ +\n\ + scale_vari = gamma.s0 * mean_vari.s1;\n\ + bias_val = beta.s0 - scale_vari * mean_vari.s0;\n\ +\n\ + float4 dst;\n\ +\n\ + dst.x = data.x * scale_vari + bias_val;\n\ + write_imagef(output, coord, dst);\n\ +}\n\ +"; /* end of group_normalization_f32_cl*/ + +static 
const char group_normalization_i32_cl[] = "__kernel void group_norm_sumsqr_I32(\n\ + __read_only image2d_array_t input,\n\ + __write_only image2d_t output,\n\ + float eps,\n\ + int is2d,\n\ + float input_zp,\n\ + float input_scale,\n\ + int width,\n\ + int height\n\ + )\n\ +{\n\ + int gidx = get_global_id(0);\n\ + int gidz = get_global_id(1);\n\ + int lidx = get_local_id(0);\n\ +\n\ + int4 coord = (int4)(gidx, 0, gidz, 0);\n\ + float4 data;\n\ + float sum = 0, sqr = 0;\n\ + float tmpSum = 0;\n\ + float e2InScale = input_scale * input_scale;\n\ +\n\ + __local float lcl_sum[16];\n\ + __local float lcl_sqr[16];\n\ +\n\ + if(gidx < width)\n\ + {\n\ + for(coord.y = 0; coord.y < height;)\n\ + {\n\ + data = convert_float4(read_imagei(input, coord));\n\ + coord.y++;\n\ + tmpSum += data.x;\n\ + sqr += (data.x * data.x * e2InScale);\n\ + }\n\ + sum = tmpSum * input_scale;\n\ + }\n\ + lcl_sum[lidx] = sum;\n\ + lcl_sqr[lidx] = sqr;\n\ + barrier(CLK_LOCAL_MEM_FENCE);\n\ +\n\ + int4 coord_out = (int4)(get_group_id(0) << 2, gidz, 0, 0);\n\ + if(lidx == 0)\n\ + {\n\ + float4 one = (float4)(1, 1, 1, 1);\n\ + __local float4* tmp_sum = (__local float4*)lcl_sum;\n\ + __local float4* tmp_sqr = (__local float4*)lcl_sqr;\n\ +\n\ + sum = 0; sqr = 0;\n\ + for(int i = 0; i < 4; i++)\n\ + {\n\ + sum += dot(tmp_sum[i], one);\n\ + sqr += dot(tmp_sqr[i], one);\n\ + }\n\ +\n\ + float4 dst = (float4)(0);\n\ + dst.x = sum;\n\ + write_imagef(output, coord_out.xy, dst);\n\ + coord_out.x++;\n\ + dst.x = sqr;\n\ + write_imagef(output, coord_out.xy, dst);\n\ + }\n\ +}\n\ +\n\ +__kernel void group_norm_sumsqr_I32_2D(\n\ + __read_only image2d_t input,\n\ + __write_only image2d_t output,\n\ + float eps,\n\ + int is2d,\n\ + float input_zp,\n\ + float input_scale,\n\ + int width,\n\ + int height\n\ + )\n\ +{\n\ + int gidx = get_global_id(0);\n\ + int gidz = get_global_id(1);\n\ + int lidx = get_local_id(0);\n\ +\n\ + int2 coord = (int2)(gidx, gidz);\n\ + float4 data;\n\ + float sum = 0, sqr = 0;\n\ +\n\ + __local float lcl_sum[16];\n\ + __local float lcl_sqr[16];\n\ +\n\ + if(gidx < width)\n\ + {\n\ + data = convert_float4(read_imagei(input, coord));\n\ + sum = data.x * input_scale;\n\ + sqr = sum * sum;\n\ + }\n\ + lcl_sum[lidx] = sum;\n\ + lcl_sqr[lidx] = sqr;\n\ + barrier(CLK_LOCAL_MEM_FENCE);\n\ +\n\ + int4 coord_out = (int4)(get_group_id(0) << 2, gidz, 0, 0);\n\ + if(lidx == 0)\n\ + {\n\ + float4 one = (float4)(1, 1, 1, 1);\n\ + __local float4* tmp_sum = (__local float4*)lcl_sum;\n\ + __local float4* tmp_sqr = (__local float4*)lcl_sqr;\n\ +\n\ + sum = 0; sqr = 0;\n\ + for(int i = 0; i < 4; i++)\n\ + {\n\ + sum += dot(tmp_sum[i], one);\n\ + sqr += dot(tmp_sqr[i], one);\n\ + }\n\ +\n\ + float4 dst = (float4)(0);\n\ + dst.x = sum;\n\ + write_imagef(output, coord_out.xy, dst);\n\ + coord_out.x++;\n\ + dst.x = sqr;\n\ + write_imagef(output, coord_out.xy, dst);\n\ + }\n\ +}\n\ +\n\ +__kernel void group_norm_I32toI32(\n\ + __read_only image2d_array_t input,\n\ + __read_only image2d_t bias,\n\ + __read_only image2d_t scale,\n\ + __read_only image2d_t meanVari,\n\ + __write_only image2d_array_t output,\n\ + float eps,\n\ + int is2d,\n\ + float input_zp,\n\ + float input_scale,\n\ + float output_zp,\n\ + float output_scale,\n\ + float rSpaceOrg,\n\ + int width,\n\ + int height,\n\ + int pStride\n\ + )\n\ +{\n\ + int gidy = get_global_id(1);\n\ + int gidz = get_global_id(2);\n\ + int4 coord = (int4)(get_global_id(0), gidy, gidz, 0);\n\ + int4 coord_para = (int4)((convert_int(get_global_id(0) * rSpaceOrg) + gidy * pStride), gidz, 0, 
1);\n\ +\n\ + float4 gamma = read_imagef(scale, coord_para.xy);\n\ + float4 beta = read_imagef(bias, coord_para.xy);\n\ + float4 mean_vari = read_imagef(meanVari, coord_para.zy);\n\ + mean_vari.y = read_imagef(meanVari, coord_para.wy).x;\n\ + float4 data = convert_float4(read_imagei(input, coord));\n\ +\n\ + float scale_vari, bias_val;\n\ +\n\ + scale_vari = gamma.s0 * mean_vari.s1;\n\ + float alpha = input_scale * output_scale * scale_vari;\n\ + bias_val = (beta.s0 - scale_vari * mean_vari.s0) * output_scale;\n\ +\n\ + int4 dst;\n\ + float4 norm;\n\ + norm.x = data.x * alpha + bias_val;\n\ + dst = convert_int4_rte(norm);\n\ + write_imagei(output, coord, dst);\n\ +}\n\ +\n\ +__kernel void group_norm_I32toI32_2D(\n\ + __read_only image2d_t input,\n\ + __read_only image2d_t bias,\n\ + __read_only image2d_t scale,\n\ + __read_only image2d_t meanVari,\n\ + __write_only image2d_t output,\n\ + float eps,\n\ + int is2d,\n\ + float input_zp,\n\ + float input_scale,\n\ + float output_zp,\n\ + float output_scale,\n\ + float rSpaceOrg,\n\ + int width,\n\ + int height,\n\ + int pStride\n\ + )\n\ +{\n\ + int gidz = get_global_id(1);\n\ + int2 coord = (int2)(get_global_id(0), gidz);\n\ + int4 coord_para = (int4)(convert_int(get_global_id(0) * rSpaceOrg), gidz, 0, 1);\n\ +\n\ + float4 gamma = read_imagef(scale, coord_para.xy);\n\ + float4 beta = read_imagef(bias, coord_para.xy);\n\ + float4 mean_vari = read_imagef(meanVari, coord_para.zy);\n\ + mean_vari.y = read_imagef(meanVari, coord_para.wy).x;\n\ + float4 data = convert_float4(read_imagei(input, coord));\n\ +\n\ + float scale_vari, bias_val;\n\ +\n\ + scale_vari = gamma.s0 * mean_vari.s1;\n\ + float alpha = input_scale * output_scale * scale_vari;\n\ + bias_val = (beta.s0 - scale_vari * mean_vari.s0) * output_scale;\n\ +\n\ + int4 dst;\n\ + float4 norm;\n\ + norm.x = data.x * alpha + bias_val;\n\ + dst = convert_int4_rte(norm);\n\ + write_imagei(output, coord, dst);\n\ +}\n\ +\n\ +__kernel void group_norm_I32toF32(\n\ + __read_only image2d_array_t input,\n\ + __read_only image2d_t bias,\n\ + __read_only image2d_t scale,\n\ + __read_only image2d_t meanVari,\n\ + __write_only image2d_array_t output,\n\ + float eps,\n\ + int is2d,\n\ + float input_zp,\n\ + float input_scale,\n\ + float output_zp,\n\ + float output_scale,\n\ + float rSpaceOrg,\n\ + int width,\n\ + int height,\n\ + int pStride\n\ + )\n\ +{\n\ + int gidy = get_global_id(1);\n\ + int gidz = get_global_id(2);\n\ + int4 coord = (int4)(get_global_id(0), gidy, gidz, 0);\n\ + int4 coord_para = (int4)((convert_int(get_global_id(0) * rSpaceOrg) + gidy * pStride), gidz, 0, 1);\n\ +\n\ + float4 gamma = read_imagef(scale, coord_para.xy);\n\ + float4 beta = read_imagef(bias, coord_para.xy);\n\ + float4 mean_vari = read_imagef(meanVari, coord_para.zy);\n\ + mean_vari.y = read_imagef(meanVari, coord_para.wy).x;\n\ + float4 data = convert_float4(read_imagei(input, coord));\n\ +\n\ + float scale_vari, bias_val;\n\ +\n\ + scale_vari = gamma.s0 * mean_vari.s1;\n\ + float alpha = input_scale * scale_vari;\n\ + bias_val = (beta.s0 - scale_vari * mean_vari.s0);\n\ +\n\ + float4 norm;\n\ + norm.x = data.x * alpha + bias_val;\n\ + write_imagef(output, coord, norm);\n\ +}\n\ +\n\ +__kernel void group_norm_I32toF32_2D(\n\ + __read_only image2d_t input,\n\ + __read_only image2d_t bias,\n\ + __read_only image2d_t scale,\n\ + __read_only image2d_t meanVari,\n\ + __write_only image2d_t output,\n\ + float eps,\n\ + int is2d,\n\ + float input_zp,\n\ + float input_scale,\n\ + float output_zp,\n\ + float output_scale,\n\ 
+ float rSpaceOrg,\n\ + int width,\n\ + int height,\n\ + int pStride\n\ + )\n\ +{\n\ + int gidz = get_global_id(1);\n\ + int2 coord = (int2)(get_global_id(0), gidz);\n\ + int4 coord_para = (int4)(convert_int(get_global_id(0) * rSpaceOrg), gidz, 0, 1);\n\ +\n\ + float4 gamma = read_imagef(scale, coord_para.xy);\n\ + float4 beta = read_imagef(bias, coord_para.xy);\n\ + float4 mean_vari = read_imagef(meanVari, coord_para.zy);\n\ + mean_vari.y = read_imagef(meanVari, coord_para.wy).x;\n\ + float4 data = convert_float4(read_imagei(input, coord));\n\ +\n\ + float scale_vari, bias_val;\n\ +\n\ + scale_vari = gamma.s0 * mean_vari.s1;\n\ + float alpha = input_scale * scale_vari;\n\ + bias_val = beta.s0 - scale_vari * mean_vari.s0;\n\ +\n\ + float4 norm;\n\ + norm.x = data.x * alpha + bias_val;\n\ + write_imagef(output, coord, norm);\n\ +}\n\ +"; /* end of group_normalization_i32_cl*/ + +static const char group_normalization_u8_cl[] = "__kernel void group_norm_sumsqr_U8(\n\ + __read_only image2d_array_t input,\n\ + __write_only image2d_t output,\n\ + float eps,\n\ + int is2d,\n\ + float input_zp,\n\ + float input_scale,\n\ + int width,\n\ + int height\n\ + )\n\ +{\n\ + int gidx = get_global_id(0);\n\ + int gidz = get_global_id(1);\n\ + int lidx = get_local_id(0);\n\ +\n\ + int4 coord = (int4)(gidx, 0, gidz, 0);\n\ + float4 data;\n\ + float sum = 0, sqr = 0;\n\ + float tmpSum = 0, tmpSqr = 0;\n\ + float e2InScale = input_scale * input_scale;\n\ +\n\ + __local float lcl_sum[16];\n\ + __local float lcl_sqr[16];\n\ +\n\ + if(gidx < width)\n\ + {\n\ + for(coord.y = 0; coord.y < height;)\n\ + {\n\ + data = convert_float4(read_imageui(input, coord));\n\ + coord.y++;\n\ + tmpSum += data.x;\n\ + tmpSqr += data.x * data.x;\n\ + }\n\ + sqr = (tmpSqr - 2 * input_zp * tmpSum + height * input_zp * input_zp) * e2InScale;\n\ + sum = (tmpSum - height * input_zp) * input_scale;\n\ + }\n\ + lcl_sum[lidx] = sum;\n\ + lcl_sqr[lidx] = sqr;\n\ + barrier(CLK_LOCAL_MEM_FENCE);\n\ +\n\ + int4 coord_out = (int4)(get_group_id(0) << 2, gidz, 0, 0);\n\ + if(lidx == 0)\n\ + {\n\ + float4 one = (float4)(1, 1, 1, 1);\n\ + __local float4* tmp_sum = (__local float4*)lcl_sum;\n\ + __local float4* tmp_sqr = (__local float4*)lcl_sqr;\n\ +\n\ + sum = 0; sqr = 0;\n\ + for(int i = 0; i < 4; i++)\n\ + {\n\ + sum += dot(tmp_sum[i], one);\n\ + sqr += dot(tmp_sqr[i], one);\n\ + }\n\ +\n\ + float4 dst = (float4)(0);\n\ + dst.x = sum;\n\ + write_imagef(output, coord_out.xy, dst);\n\ + coord_out.x++;\n\ + dst.x = sqr;\n\ + write_imagef(output, coord_out.xy, dst);\n\ + }\n\ +}\n\ +\n\ +__kernel void group_norm_sumsqr_U8_2D(\n\ + __read_only image2d_t input,\n\ + __write_only image2d_t output,\n\ + float eps,\n\ + int is2d,\n\ + float input_zp,\n\ + float input_scale,\n\ + int width,\n\ + int height\n\ + )\n\ +{\n\ + int gidx = get_global_id(0);\n\ + int gidz = get_global_id(1);\n\ + int lidx = get_local_id(0);\n\ +\n\ + int2 coord = (int2)(gidx, gidz);\n\ + float4 data;\n\ + float sum = 0, sqr = 0;\n\ +\n\ + __local float lcl_sum[16];\n\ + __local float lcl_sqr[16];\n\ +\n\ + if(gidx < width)\n\ + {\n\ + data = convert_float4(read_imageui(input, coord));\n\ + sum = (data.x - input_zp) * input_scale;\n\ + sqr = sum * sum;\n\ + }\n\ + lcl_sum[lidx] = sum;\n\ + lcl_sqr[lidx] = sqr;\n\ + barrier(CLK_LOCAL_MEM_FENCE);\n\ +\n\ + int4 coord_out = (int4)(get_group_id(0) << 2, gidz, 0, 0);\n\ + if(lidx == 0)\n\ + {\n\ + float4 one = (float4)(1, 1, 1, 1);\n\ + __local float4* tmp_sum = (__local float4*)lcl_sum;\n\ + __local float4* tmp_sqr = (__local 
float4*)lcl_sqr;\n\ +\n\ + sum = 0; sqr = 0;\n\ + for(int i = 0; i < 4; i++)\n\ + {\n\ + sum += dot(tmp_sum[i], one);\n\ + sqr += dot(tmp_sqr[i], one);\n\ + }\n\ +\n\ + float4 dst = (float4)(0);\n\ + dst.x = sum;\n\ + write_imagef(output, coord_out.xy, dst);\n\ + coord_out.x++;\n\ + dst.x = sqr;\n\ + write_imagef(output, coord_out.xy, dst);\n\ + }\n\ +}\n\ +\n\ +__kernel void group_norm_U8toU8(\n\ + __read_only image2d_array_t input,\n\ + __read_only image2d_t bias,\n\ + __read_only image2d_t scale,\n\ + __read_only image2d_t meanVari,\n\ + __write_only image2d_array_t output,\n\ + float eps,\n\ + int is2d,\n\ + float input_zp,\n\ + float input_scale,\n\ + float output_zp,\n\ + float output_scale,\n\ + float rSpaceOrg,\n\ + int width,\n\ + int height,\n\ + int pStride\n\ + )\n\ +{\n\ + int gidy = get_global_id(1);\n\ + int gidz = get_global_id(2);\n\ + int4 coord = (int4)(get_global_id(0), gidy, gidz, 0);\n\ + int4 coord_para = (int4)((convert_int(get_global_id(0) * rSpaceOrg) + gidy * pStride), gidz, 0, 1);\n\ +\n\ + float4 gamma = read_imagef(scale, coord_para.xy);\n\ + float4 beta = read_imagef(bias, coord_para.xy);\n\ + float4 mean_vari = read_imagef(meanVari, coord_para.zy);\n\ + mean_vari.y = read_imagef(meanVari, coord_para.wy).x;\n\ + float4 data = convert_float4(read_imageui(input, coord));\n\ +\n\ + float scale_vari, bias_val;\n\ + float scale_inOut = input_scale * output_scale;\n\ +\n\ + scale_vari = gamma.s0 * mean_vari.s1;\n\ + float alpha = scale_inOut * scale_vari;\n\ + bias_val = (beta.s0 - scale_vari * mean_vari.s0) * output_scale + output_zp;\n\ +\n\ + uint4 dst;\n\ + data.x -= input_zp;\n\ + float4 norm;\n\ + norm.x = data.x * alpha + bias_val;\n\ + dst = convert_uint4_rte(norm);\n\ + write_imageui(output, coord, dst);\n\ +}\n\ +\n\ +__kernel void group_norm_U8toU8_2D(\n\ + __read_only image2d_t input,\n\ + __read_only image2d_t bias,\n\ + __read_only image2d_t scale,\n\ + __read_only image2d_t meanVari,\n\ + __write_only image2d_t output,\n\ + float eps,\n\ + int is2d,\n\ + float input_zp,\n\ + float input_scale,\n\ + float output_zp,\n\ + float output_scale,\n\ + float rSpaceOrg,\n\ + int width,\n\ + int height,\n\ + int pStride\n\ + )\n\ +{\n\ + int gidz = get_global_id(1);\n\ + int2 coord = (int2)(get_global_id(0), gidz);\n\ + int4 coord_para = (int4)(convert_int(get_global_id(0) * rSpaceOrg), gidz, 0, 1);\n\ +\n\ + float4 gamma = read_imagef(scale, coord_para.xy);\n\ + float4 beta = read_imagef(bias, coord_para.xy);\n\ + float4 mean_vari = read_imagef(meanVari, coord_para.zy);\n\ + mean_vari.y = read_imagef(meanVari, coord_para.wy).x;\n\ + float4 data = convert_float4(read_imageui(input, coord));\n\ +\n\ + float scale_vari, bias_val;\n\ + float scale_inOut = input_scale * output_scale;\n\ +\n\ + scale_vari = gamma.s0 * mean_vari.s1;\n\ + float alpha = scale_inOut * scale_vari;\n\ + bias_val = (beta.s0 - scale_vari * mean_vari.s0) * output_scale + output_zp;\n\ +\n\ + uint4 dst;\n\ + data.x -= input_zp;\n\ + float4 norm;\n\ + norm.x = data.x * alpha + bias_val;\n\ + dst = convert_uint4_rte(norm);\n\ + write_imageui(output, coord, dst);\n\ +}\n\ +\n\ +__kernel void group_norm_U8toF32(\n\ + __read_only image2d_array_t input,\n\ + __read_only image2d_t bias,\n\ + __read_only image2d_t scale,\n\ + __read_only image2d_t meanVari,\n\ + __write_only image2d_array_t output,\n\ + float eps,\n\ + int is2d,\n\ + float input_zp,\n\ + float input_scale,\n\ + float output_zp,\n\ + float output_scale,\n\ + float rSpaceOrg,\n\ + int width,\n\ + int height,\n\ + int pStride\n\ + )\n\ 
+{\n\ + int gidy = get_global_id(1);\n\ + int gidz = get_global_id(2);\n\ + int4 coord = (int4)(get_global_id(0), gidy, gidz, 0);\n\ + int4 coord_para = (int4)((convert_int(get_global_id(0) * rSpaceOrg) + gidy * pStride), gidz, 0, 1);\n\ +\n\ + float4 gamma = read_imagef(scale, coord_para.xy);\n\ + float4 beta = read_imagef(bias, coord_para.xy);\n\ + float4 mean_vari = read_imagef(meanVari, coord_para.zy);\n\ + mean_vari.y = read_imagef(meanVari, coord_para.wy).x;\n\ + float4 data = convert_float4(read_imageui(input, coord));\n\ +\n\ + float scale_vari, bias_val;\n\ + float scale_inOut = input_scale * output_scale;\n\ +\n\ + scale_vari = gamma.s0 * mean_vari.s1;\n\ + float alpha = scale_inOut * scale_vari;\n\ + bias_val = (beta.s0 - scale_vari * mean_vari.s0) * output_scale + output_zp;\n\ +\n\ + data.x -= input_zp;\n\ + float4 norm;\n\ + norm.x = data.x * alpha + bias_val;\n\ + write_imagef(output, coord, norm);\n\ +}\n\ +\n\ +__kernel void group_norm_U8toF32_2D(\n\ + __read_only image2d_t input,\n\ + __read_only image2d_t bias,\n\ + __read_only image2d_t scale,\n\ + __read_only image2d_t meanVari,\n\ + __write_only image2d_t output,\n\ + float eps,\n\ + int is2d,\n\ + float input_zp,\n\ + float input_scale,\n\ + float output_zp,\n\ + float output_scale,\n\ + float rSpaceOrg,\n\ + int width,\n\ + int height,\n\ + int pStride\n\ + )\n\ +{\n\ + int gidz = get_global_id(1);\n\ + int2 coord = (int2)(get_global_id(0), gidz);\n\ + int4 coord_para = (int4)(convert_int(get_global_id(0) * rSpaceOrg), gidz, 0, 1);\n\ +\n\ + float4 gamma = read_imagef(scale, coord_para.xy);\n\ + float4 beta = read_imagef(bias, coord_para.xy);\n\ + float4 mean_vari = read_imagef(meanVari, coord_para.zy);\n\ + mean_vari.y = read_imagef(meanVari, coord_para.wy).x;\n\ + float4 data = convert_float4(read_imageui(input, coord));\n\ +\n\ + float scale_vari, bias_val;\n\ + float scale_inOut = input_scale * output_scale;\n\ +\n\ + scale_vari = gamma.s0 * mean_vari.s1;\n\ + float alpha = scale_inOut * scale_vari;\n\ + bias_val = (beta.s0 - scale_vari * mean_vari.s0) * output_scale + output_zp;\n\ +\n\ + data.x -= input_zp;\n\ + float4 norm;\n\ + norm.x = data.x * alpha + bias_val;\n\ + write_imagef(output, coord, norm);\n\ +}\n\ +"; /* end of group_normalization_u8_cl*/ + static const char grucell_activation_cl[] = "__kernel void grucell_activation(\n\ __read_only image2d_array_t input,\n\ __write_only image2d_array_t output)\n\ @@ -44329,7 +50756,7 @@ __kernel void gemm_transb_F32F32toF32_3D(\n\ \n\ coord_a.x = get_global_id(0);\n\ coord_a.z = get_global_id(2);\n\ - write_imagef(output, coord_b, sum);\n\ + write_imagef(output, coord_a, sum);\n\ }\n\ \n\ __kernel void gemm_transb_F32I8toF32_2D(\n\ @@ -44405,7 +50832,7 @@ __kernel void gemm_transb_F32I8toF32_3D(\n\ \n\ coord_a.x = get_global_id(0);\n\ coord_a.z = get_global_id(2);\n\ - write_imagef(output, coord_b, sum);\n\ + write_imagef(output, coord_a, sum);\n\ }\n\ "; /* end of matrixmul_cl*/ @@ -45510,6 +51937,138 @@ __kernel void moments_axis2_I32toF32(\n\ write_imagef(output_vari, coord_out, vari);\n\ }"; /* end of moments_axis2_cl*/ +static const char one_hot_cl[] = "__kernel void one_hot_F32toF32\n\ + (\n\ + __read_only image2d_t input,\n\ + __write_only image2d_array_t output,\n\ + int depth,\n\ + float on_value,\n\ + float off_value,\n\ + float inputScale,\n\ + float inputTail\n\ + )\n\ +{\n\ + int4 coord = (int4)(get_global_id(0), get_global_id(1), 0, 0);\n\ +\n\ + float4 val = read_imagef(input, coord.xy);\n\ +\n\ + do\n\ + {\n\ + float4 dst;\n\ + dst.x = 
convert_int(val.x) == coord.z ? on_value : off_value;\n\ +\n\ + write_imagef(output, coord.xzyw, dst.xxxx);\n\ +\n\ + coord.z ++;\n\ + } while (coord.z < depth);\n\ +}\n\ +\n\ +__kernel void one_hot_I32toI32\n\ + (\n\ + __read_only image2d_t input,\n\ + __write_only image2d_array_t output,\n\ + int depth,\n\ + int on_value,\n\ + int off_value,\n\ + float inputScale,\n\ + float inputTail\n\ + )\n\ +{\n\ + int4 coord = (int4)(get_global_id(0), get_global_id(1), 0, 0);\n\ +\n\ + int4 val = read_imagei(input, coord.xy);\n\ +\n\ + do\n\ + {\n\ + int4 dst;\n\ + dst.x = val.x == coord.z ? on_value : off_value;\n\ +\n\ + write_imagei(output, coord.xzyw, dst.xxxx);\n\ +\n\ + coord.z ++;\n\ + } while (coord.z < depth);\n\ +}\n\ +\n\ +__kernel void one_hot_I32toU8\n\ + (\n\ + __read_only image2d_t input,\n\ + __write_only image2d_array_t output,\n\ + int depth,\n\ + uint on_value,\n\ + uint off_value,\n\ + float inputScale,\n\ + float inputTail\n\ + )\n\ +{\n\ + int4 coord = (int4)(get_global_id(0), get_global_id(1), 0, 0);\n\ +\n\ + int4 val = read_imagei(input, coord.xy);\n\ + do\n\ + {\n\ + uint4 dst;\n\ + dst.x = val.x == coord.z ? on_value : off_value;\n\ +\n\ + write_imageui(output, coord.xzyw, dst.xxxx);\n\ +\n\ + coord.z ++;\n\ + } while (coord.z < depth);\n\ +}\n\ +\n\ +__kernel void one_hot_I32toF32\n\ + (\n\ + __read_only image2d_t input,\n\ + __write_only image2d_array_t output,\n\ + int depth,\n\ + float on_value,\n\ + float off_value,\n\ + float inputScale,\n\ + float inputTail\n\ + )\n\ +{\n\ + int4 coord = (int4)(get_global_id(0), get_global_id(1), 0, 0);\n\ +\n\ + int4 val = read_imagei(input, coord.xy);\n\ +\n\ + do\n\ + {\n\ + float4 dst;\n\ + dst.x = val.x == coord.z ? on_value : off_value;\n\ +\n\ + write_imagef(output, coord.xzyw, dst.xxxx);\n\ +\n\ + coord.z ++;\n\ + } while (coord.z < depth);\n\ +}\n\ +\n\ +__kernel void one_hot_U8toU8\n\ + (\n\ + __read_only image2d_t input,\n\ + __write_only image2d_array_t output,\n\ + int depth,\n\ + uint on_value,\n\ + uint off_value,\n\ + float inputScale,\n\ + float inputTail\n\ + )\n\ +{\n\ + int4 coord = (int4)(get_global_id(0), get_global_id(1), 0, 0);\n\ +\n\ + uint4 src = read_imageui(input, coord.xy);\n\ +\n\ + int val = convert_int(convert_float(src.x) * inputScale - inputTail);\n\ +\n\ + do\n\ + {\n\ + uint4 dst;\n\ + dst.x = val == coord.z ? 
on_value : off_value;\n\ +\n\ + write_imageui(output, coord.xzyw, dst.xxxx);\n\ +\n\ + coord.z ++;\n\ + } while (coord.z < depth);\n\ +}\n\ +"; /* end of one_hot_cl*/ + static const char poolwithargmax_cl[] = "\n\ #define POOLWITHARGMAX_PROCESS(data_type, read_fun, write_fun0, write_fun1) \\\n\ data_type src = 0; \\\n\ @@ -47876,6 +54435,184 @@ __kernel void relu_keras_U8toF32_2D(\n\ write_imagef(output, coord, dst);\n\ }"; /* end of relu_keras_cl*/ +static const char repeat_cl[] = "__kernel void repeat_I32_axis0(\n\ + __read_only image2d_array_t input0,\n\ + __read_only image2d_t input1,\n\ + __write_only image2d_array_t output,\n\ + int width, int height, int channel, int axis)\n\ +{\n\ + int4 coord = (int4)(get_global_id(0), 0, get_global_id(2), 0);\n\ + int4 coord_out = coord;\n\ +\n\ + for(coord.y = 0; coord.y < height;)\n\ + {\n\ + int4 data = read_imagei(input0, coord);\n\ + int4 len = read_imagei(input1, coord.yw);\n\ + coord.y++;\n\ + for(int i = 0; i < len.x; i++)\n\ + {\n\ + write_imagei(output, coord_out, data);\n\ + coord_out.y++;\n\ + }\n\ + }\n\ +}\n\ +\n\ +__kernel void repeat_I32_axis1(\n\ + __read_only image2d_array_t input0,\n\ + __read_only image2d_t input1,\n\ + __write_only image2d_array_t output,\n\ + int width, int height, int channel, int axis)\n\ +{\n\ + int4 coord = (int4)(0, get_global_id(1), get_global_id(2), 0);\n\ + int4 coord_out = coord;\n\ +\n\ + for(coord.x = 0; coord.x < width;)\n\ + {\n\ + int4 data = read_imagei(input0, coord);\n\ + int4 len = read_imagei(input1, coord.xw);\n\ + coord.x++;\n\ + for(int i = 0; i < len.x; i++)\n\ + {\n\ + write_imagei(output, coord_out, data);\n\ + coord_out.x++;\n\ + }\n\ + }\n\ +}\n\ +\n\ +__kernel void repeat_I32_axis2(\n\ + __read_only image2d_array_t input0,\n\ + __read_only image2d_t input1,\n\ + __write_only image2d_array_t output,\n\ + int width, int height, int channel, int axis)\n\ +{\n\ + int4 coord = (int4)(get_global_id(0), get_global_id(1), 0, 0);\n\ + int4 coord_out = coord;\n\ +\n\ + for(coord.z = 0; coord.z < channel;)\n\ + {\n\ + int4 data = read_imagei(input0, coord);\n\ + int4 len = read_imagei(input1, coord.zw);\n\ + coord.z++;\n\ + for(int i = 0; i < len.x; i++)\n\ + {\n\ + write_imagei(output, coord_out, data);\n\ + coord_out.z++;\n\ + }\n\ + }\n\ +}\n\ +\n\ +__kernel void repeat_I32_1D(\n\ + __read_only image2d_t input0,\n\ + __read_only image2d_t input1,\n\ + __write_only image2d_t output,\n\ + int width, int height, int channel, int axis)\n\ +{\n\ + int2 coord = (int2)(0, 0);\n\ + int2 coord_out = coord;\n\ +\n\ + for(coord.x = 0; coord.x < width;)\n\ + {\n\ + int4 data = read_imagei(input0, coord);\n\ + int4 len = read_imagei(input1, coord.xy);\n\ + coord.x++;\n\ + for(int i = 0; i < len.x; i++)\n\ + {\n\ + write_imagei(output, coord_out, data);\n\ + coord_out.x++;\n\ + }\n\ + }\n\ +}\n\ +\n\ +__kernel void repeat_F32_axis0(\n\ + __read_only image2d_array_t input0,\n\ + __read_only image2d_t input1,\n\ + __write_only image2d_array_t output,\n\ + int width, int height, int channel, int axis)\n\ +{\n\ + int4 coord = (int4)(get_global_id(0), 0, get_global_id(2), 0);\n\ + int4 coord_out = coord;\n\ +\n\ + for(coord.y = 0; coord.y < height;)\n\ + {\n\ + float4 data = read_imagef(input0, coord);\n\ + int4 len = read_imagei(input1, coord.yw);\n\ + coord.y++;\n\ + for(int i = 0; i < len.x; i++)\n\ + {\n\ + write_imagef(output, coord_out, data);\n\ + coord_out.y++;\n\ + }\n\ + }\n\ +}\n\ +\n\ +__kernel void repeat_F32_axis1(\n\ + __read_only image2d_array_t input0,\n\ + __read_only image2d_t input1,\n\ 
+ __write_only image2d_array_t output,\n\ + int width, int height, int channel, int axis)\n\ +{\n\ + int4 coord = (int4)(0, get_global_id(1), get_global_id(2), 0);\n\ + int4 coord_out = coord;\n\ +\n\ + for(coord.x = 0; coord.x < width;)\n\ + {\n\ + float4 data = read_imagef(input0, coord);\n\ + int4 len = read_imagei(input1, coord.xw);\n\ + coord.x++;\n\ + for(int i = 0; i < len.x; i++)\n\ + {\n\ + write_imagef(output, coord_out, data);\n\ + coord_out.x++;\n\ + }\n\ + }\n\ +}\n\ +\n\ +__kernel void repeat_F32_axis2(\n\ + __read_only image2d_array_t input0,\n\ + __read_only image2d_t input1,\n\ + __write_only image2d_array_t output,\n\ + int width, int height, int channel, int axis)\n\ +{\n\ + int4 coord = (int4)(get_global_id(0), get_global_id(1), 0, 0);\n\ + int4 coord_out = coord;\n\ +\n\ + for(coord.z = 0; coord.z < channel;)\n\ + {\n\ + float4 data = read_imagef(input0, coord);\n\ + int4 len = read_imagei(input1, coord.zw);\n\ + coord.z++;\n\ + for(int i = 0; i < len.x; i++)\n\ + {\n\ + write_imagef(output, coord_out, data);\n\ + coord_out.z++;\n\ + }\n\ + }\n\ +}\n\ +\n\ +__kernel void repeat_F32_1D(\n\ + __read_only image2d_t input0,\n\ + __read_only image2d_t input1,\n\ + __write_only image2d_t output,\n\ + int width, int height, int channel, int axis)\n\ +{\n\ + int2 coord = (int2)(0, 0);\n\ + int2 coord_out = coord;\n\ +\n\ + for(coord.x = 0; coord.x < width;)\n\ + {\n\ + float4 data = read_imagef(input0, coord);\n\ + int4 len = read_imagei(input1, coord.xy);\n\ + coord.x++;\n\ + for(int i = 0; i < len.x; i++)\n\ + {\n\ + write_imagef(output, coord_out, data);\n\ + coord_out.x++;\n\ + }\n\ + }\n\ +}\n\ +\n\ +"; /* end of repeat_cl*/ + static const char resize_1d_bilinear_cl[] = "__kernel void resize_1d_bilinear_F32toF32(\n\ __read_only image2d_array_t input,\n\ __write_only image2d_array_t output,\n\ @@ -48572,6 +55309,225 @@ __kernel void select_I8_F32_F32toF32_2D(\n\ }\n\ "; /* end of select_cl*/ +static const char sequence_mask_cl[] = "\n\ +__kernel void sequence_mask_I32toU8(\n\ + image2d_t input, image2d_array_t output, int maxLen,\n\ + float input_scale, float input_zpScale, float outputVal1, int output_ZP)\n\ +{\n\ + int gidx = get_global_id(0);\n\ + int4 coord = (int4)(gidx, get_global_id(1), get_global_id(2), 0);\n\ + int4 index = read_imagei(input, coord.yz);\n\ + uint4 data;\n\ + data.x = gidx < index.x ? convert_uint_rte(outputVal1) : (uint)(output_ZP);\n\ + write_imageui(output, coord, data);\n\ +}\n\ +\n\ +__kernel void sequence_mask_I32toU8_2D(\n\ + image2d_t input, image2d_t output, int maxLen,\n\ + float input_scale, float input_zpScale, float outputVal1, int output_ZP)\n\ +{\n\ + int gidx = get_global_id(0);\n\ + int2 coord = (int2)(gidx, get_global_id(1));\n\ + int4 index = read_imagei(input, coord.yy);\n\ + uint4 data;\n\ + data.x = gidx < index.x ? convert_uint_rte(outputVal1) : (uint)(output_ZP);\n\ + write_imageui(output, coord, data);\n\ +}\n\ +\n\ +__kernel void sequence_mask_I32toI32(\n\ + image2d_t input, image2d_array_t output, int maxLen,\n\ + float input_scale, float input_zpScale, float outputVal1, int output_ZP)\n\ +{\n\ + int gidx = get_global_id(0);\n\ + int4 coord = (int4)(gidx, get_global_id(1), get_global_id(2), 0);\n\ + int4 index = read_imagei(input, coord.yz);\n\ + int4 data;\n\ + data = gidx < index.x ? 
(int4)(1) : (int4)(0);\n\ + write_imagei(output, coord, data);\n\ +}\n\ +\n\ +__kernel void sequence_mask_I32toI32_2D(\n\ + image2d_t input, image2d_t output, int maxLen,\n\ + float input_scale, float input_zpScale, float outputVal1, int output_ZP)\n\ +{\n\ + int gidx = get_global_id(0);\n\ + int2 coord = (int2)(gidx, get_global_id(1));\n\ + int4 index = read_imagei(input, coord.yy);\n\ + int4 data;\n\ + data = gidx < index.x ? (int4)(1) : (int4)(0);\n\ + write_imagei(output, coord, data);\n\ +}\n\ +\n\ +__kernel void sequence_mask_I32toF32(\n\ + image2d_t input, image2d_array_t output, int maxLen,\n\ + float input_scale, float input_zpScale, float outputVal1, int output_ZP)\n\ +{\n\ + int gidx = get_global_id(0);\n\ + int4 coord = (int4)(gidx, get_global_id(1), get_global_id(2), 0);\n\ + int4 index = read_imagei(input, coord.yz);\n\ + float4 data;\n\ + data = gidx < index.x ? (float4)(1.0f) : (float4)(0.0f);\n\ + write_imagef(output, coord, data);\n\ +}\n\ +\n\ +__kernel void sequence_mask_I32toF32_2D(\n\ + image2d_t input, image2d_t output, int maxLen,\n\ + float input_scale, float input_zpScale, float outputVal1, int output_ZP)\n\ +{\n\ + int gidx = get_global_id(0);\n\ + int2 coord = (int2)(gidx, get_global_id(1));\n\ + int4 index = read_imagei(input, coord.yy);\n\ + float4 data;\n\ + data = gidx < index.x ? (float4)(1.0f) : (float4)(0.0f);\n\ + write_imagef(output, coord, data);\n\ +}"; /* end of sequence_mask_cl*/ + +static const char slice_cl[] = "__kernel void slice_F32_I32toF32\n\ + (\n\ + __read_only image2d_array_t input0,\n\ + __read_only image2d_t input1,\n\ + __write_only image2d_array_t output,\n\ + float inputScale,\n\ + float inputTail,\n\ + float outputScale,\n\ + float outputZP\n\ + )\n\ +{\n\ + int4 coord = (int4)(get_global_id(0), get_global_id(1), get_global_id(2), 0);\n\ + int4 coord_in;\n\ + Image begin_img = create_image_from_image2d(input1, 4);\n\ + uchar* begin_ptr = begin_img.ptr;\n\ + int4 begin = ((int4 *)begin_ptr)[0];\n\ +\n\ + coord_in = coord + begin;\n\ + float4 src = read_imagef(input0, coord_in);\n\ +\n\ + write_imagef(output, coord, src);\n\ +}\n\ +\n\ +__kernel void slice_F32_I32toF32_2D\n\ + (\n\ + __read_only image2d_t input0,\n\ + __read_only image2d_t input1,\n\ + __write_only image2d_t output,\n\ + float inputScale,\n\ + float inputTail,\n\ + float outputScale,\n\ + float outputZP\n\ + )\n\ +{\n\ + int2 coord = (int2)(get_global_id(0), get_global_id(1));\n\ + int2 coord_in;\n\ + Image begin_img = create_image_from_image2d(input1, 4);\n\ + uchar* begin_ptr = begin_img.ptr;\n\ + int2 begin = ((int2 *)begin_ptr)[0];\n\ +\n\ + coord_in = coord + begin;\n\ + float4 src = read_imagef(input0, coord_in);\n\ +\n\ + write_imagef(output, coord, src);\n\ +}\n\ +\n\ +__kernel void slice_U8_I32toU8\n\ + (\n\ + __read_only image2d_array_t input0,\n\ + __read_only image2d_t input1,\n\ + __write_only image2d_array_t output,\n\ + float inputScale,\n\ + float inputTail,\n\ + float outputScale,\n\ + float outputZP\n\ + )\n\ +{\n\ + int4 coord = (int4)(get_global_id(0), get_global_id(1), get_global_id(2), 0);\n\ + int4 coord_in;\n\ + Image begin_img = create_image_from_image2d(input1, 4);\n\ + uchar* begin_ptr = begin_img.ptr;\n\ + int4 begin = ((int4 *)begin_ptr)[0];\n\ +\n\ + coord_in = coord + begin;\n\ + uint4 src = read_imageui(input0, coord_in);\n\ +\n\ + float4 data = convert_float4(src) * inputScale - inputTail;\n\ + uint4 dst = convert_uint4(data * outputScale + outputZP);\n\ +\n\ + write_imageui(output, coord, dst);\n\ +}\n\ +\n\ +__kernel void 
slice_U8_I32toU8_2D\n\ + (\n\ + __read_only image2d_t input0,\n\ + __read_only image2d_t input1,\n\ + __write_only image2d_t output,\n\ + float inputScale,\n\ + float inputTail,\n\ + float outputScale,\n\ + float outputZP\n\ + )\n\ +{\n\ + int2 coord = (int2)(get_global_id(0), get_global_id(1));\n\ + int2 coord_in;\n\ + Image begin_img = create_image_from_image2d(input1, 4);\n\ + uchar* begin_ptr = begin_img.ptr;\n\ + int2 begin = ((int2 *)begin_ptr)[0];\n\ +\n\ + coord_in = coord + begin;\n\ + uint4 src = read_imageui(input0, coord_in);\n\ +\n\ + float4 data = convert_float4(src) * inputScale - inputTail;\n\ + uint4 dst = convert_uint4(data * outputScale + outputZP);\n\ +\n\ + write_imageui(output, coord, dst);\n\ +}\n\ +\n\ +__kernel void slice_I32_I32toI32\n\ + (\n\ + __read_only image2d_array_t input0,\n\ + __read_only image2d_t input1,\n\ + __write_only image2d_array_t output,\n\ + float inputScale,\n\ + float inputTail,\n\ + float outputScale,\n\ + float outputZP\n\ + )\n\ +{\n\ + int4 coord = (int4)(get_global_id(0), get_global_id(1), get_global_id(2), 0);\n\ + int4 coord_in;\n\ + Image begin_img = create_image_from_image2d(input1, 4);\n\ + uchar* begin_ptr = begin_img.ptr;\n\ + int4 begin = ((int4 *)begin_ptr)[0];\n\ +\n\ + coord_in = coord + begin;\n\ + int4 src = read_imagei(input0, coord_in);\n\ +\n\ + write_imagei(output, coord, src);\n\ +}\n\ +\n\ +__kernel void slice_I32_I32toI32_2D\n\ + (\n\ + __read_only image2d_t input0,\n\ + __read_only image2d_t input1,\n\ + __write_only image2d_t output,\n\ + float inputScale,\n\ + float inputTail,\n\ + float outputScale,\n\ + float outputZP\n\ + )\n\ +{\n\ + int2 coord = (int2)(get_global_id(0), get_global_id(1));\n\ + int2 coord_in;\n\ + Image begin_img = create_image_from_image2d(input1, 4);\n\ + uchar* begin_ptr = begin_img.ptr;\n\ + int2 begin = ((int2 *)begin_ptr)[0];\n\ +\n\ + coord_in = coord + begin;\n\ + int4 src = read_imagei(input0, coord_in);\n\ +\n\ + write_imagei(output, coord, src);\n\ +}\n\ +\n\ +"; /* end of slice_cl*/ + static const char space2depth_internal_cl[] = "\n\ __kernel void space2depth_internal_F32toF32 (\n\ image2d_array_t input,\n\ @@ -49102,11 +56058,14 @@ static const source_map_t evis_resource[] = {"argmin_axis1_vx", argmin_axis1_vx}, {"argmin_axis2_vx", argmin_axis2_vx}, {"batchnorm_single_vx", batchnorm_single_vx}, + {"batchnorm_single_f32_vx", batchnorm_single_f32_vx}, {"cast_vx", cast_vx}, {"clip_F16_vx", clip_F16_vx}, {"clip_I16_vx", clip_I16_vx}, {"clip_I8_vx", clip_I8_vx}, {"clip_U8_vx", clip_U8_vx}, + {"conv1d_ovxlib_vx", conv1d_ovxlib_vx}, + {"conv1d_ovxlib_k1024_vx", conv1d_ovxlib_k1024_vx}, {"depth2space_crd_vx", depth2space_crd_vx}, {"depthwise_conv1d_src0_vx", depthwise_conv1d_src0_vx}, {"depthwise_conv1d_src1_vx", depthwise_conv1d_src1_vx}, @@ -49115,8 +56074,10 @@ static const source_map_t evis_resource[] = {"detect_post_box_vx", detect_post_box_vx}, {"eltwise_unary_2d_vx", eltwise_unary_2d_vx}, {"eltwise_unary_3d_vx", eltwise_unary_3d_vx}, + {"erf_vx", erf_vx}, {"floordiv_vx", floordiv_vx}, {"gather_vx", gather_vx}, + {"gather_array_vx", gather_array_vx}, {"gather_mix_vx", gather_mix_vx}, {"gather_nd_vx", gather_nd_vx}, {"gather_nd_2d_vx", gather_nd_2d_vx}, @@ -49124,6 +56085,11 @@ static const source_map_t evis_resource[] = {"gather_nd_3d_vx", gather_nd_3d_vx}, {"gather_nd_3d_mix_vx", gather_nd_3d_mix_vx}, {"gather_nd_mix_vx", gather_nd_mix_vx}, + {"group_normalization_f16_vx", group_normalization_f16_vx}, + {"group_normalization_i16_vx", group_normalization_i16_vx}, + 
{"group_normalization_i8_vx", group_normalization_i8_vx}, + {"group_normalization_u8_vx", group_normalization_u8_vx}, + {"group_normalization_u8_f16_vx", group_normalization_u8_f16_vx}, {"grucell_activation_vx", grucell_activation_vx}, {"grucell_activation_sma_vx", grucell_activation_sma_vx}, {"grucell_cdnn_activation_vx", grucell_cdnn_activation_vx}, @@ -49132,12 +56098,19 @@ static const source_map_t evis_resource[] = {"instance_normalization_f16_vx", instance_normalization_f16_vx}, {"instance_normalization_i16_vx", instance_normalization_i16_vx}, {"instance_normalization_i8_vx", instance_normalization_i8_vx}, + {"instance_normalization_scale_f32_vx", instance_normalization_scale_f32_vx}, + {"instance_normalization_scale_f32_bf16_vx", instance_normalization_scale_f32_bf16_vx}, + {"instance_normalization_scale_f32_f16_vx", instance_normalization_scale_f32_f16_vx}, {"instance_normalization_u8_vx", instance_normalization_u8_vx}, + {"instance_normalization_u8_f16_vx", instance_normalization_u8_f16_vx}, {"l2normalizescale_axis0_vx", l2normalizescale_axis0_vx}, {"l2normalizescale_axis1_vx", l2normalizescale_axis1_vx}, {"layer_normalization_vx", layer_normalization_vx}, {"layer_normalization_2d_vx", layer_normalization_2d_vx}, {"layer_normalization_i16_vx", layer_normalization_i16_vx}, + {"layer_normalization_scale_f32_vx", layer_normalization_scale_f32_vx}, + {"layer_normalization_scale_f32_2d_vx", layer_normalization_scale_f32_2d_vx}, + {"layer_normalization_scale_f32_bf16_vx", layer_normalization_scale_f32_bf16_vx}, {"layer_normalization_u8_f16_vx", layer_normalization_u8_f16_vx}, {"layer_normalization_wh_f16_vx", layer_normalization_wh_f16_vx}, {"layer_normalization_wh_i16_vx", layer_normalization_wh_i16_vx}, @@ -49194,6 +56167,7 @@ static const source_map_t evis_resource[] = {"moments_axis012_vx", moments_axis012_vx}, {"moments_axis1_vx", moments_axis1_vx}, {"moments_axis2_vx", moments_axis2_vx}, + {"one_hot_vx", one_hot_vx}, {"poolwithargmax_F16_vx", poolwithargmax_F16_vx}, {"poolwithargmax_I16_vx", poolwithargmax_I16_vx}, {"poolwithargmax_I8_vx", poolwithargmax_I8_vx}, @@ -49241,6 +56215,8 @@ static const source_map_t evis_resource[] = {"relational_ops_2d_vx", relational_ops_2d_vx}, {"relational_ops_3d_vx", relational_ops_3d_vx}, {"relu_keras_vx", relu_keras_vx}, + {"repeat_vx", repeat_vx}, + {"repeat_axis1_vx", repeat_axis1_vx}, {"resize_1d_bilinear_BF16_vx", resize_1d_bilinear_BF16_vx}, {"resize_1d_bilinear_DOWN_NX_vx", resize_1d_bilinear_DOWN_NX_vx}, {"resize_1d_bilinear_F16_vx", resize_1d_bilinear_F16_vx}, @@ -49255,12 +56231,14 @@ static const source_map_t evis_resource[] = {"resize_bilinear_I16_vx", resize_bilinear_I16_vx}, {"resize_bilinear_I8_vx", resize_bilinear_I8_vx}, {"resize_bilinear_U8_vx", resize_bilinear_U8_vx}, - {"resize_bilinear_U8_UP_2X_vx", resize_bilinear_U8_UP_2X_vx}, + {"resize_bilinear_U8_half_pixel_centers_vx", resize_bilinear_U8_half_pixel_centers_vx}, {"resize_bilinear_U8_opt_vx", resize_bilinear_U8_opt_vx}, {"resize_nearest_vx", resize_nearest_vx}, {"scatter_nd_vx", scatter_nd_vx}, {"scatter_nd_big_vx", scatter_nd_big_vx}, {"select_vx", select_vx}, + {"sequence_mask_vx", sequence_mask_vx}, + {"slice_vx", slice_vx}, {"space2depth_internal_vx", space2depth_internal_vx}, {"swish_vx", swish_vx}, {"tile_vx", tile_vx}, @@ -49271,11 +56249,9 @@ static const source_map_t evis_resource[] = {"upsample_U8_vx", upsample_U8_vx}, {"upsamplescale_vx", upsamplescale_vx}, {"upsamplescale_k2_vx", upsamplescale_k2_vx}, - {"vsi_nn_kernel_axis_aligned_bbox_transform_vx", 
vsi_nn_kernel_axis_aligned_bbox_transform_vx}, {"vsi_nn_kernel_box_with_nms_limit_vx", vsi_nn_kernel_box_with_nms_limit_vx}, {"vsi_nn_kernel_detection_postprocess_vx", vsi_nn_kernel_detection_postprocess_vx}, {"vsi_nn_kernel_extra_ending_vx", vsi_nn_kernel_extra_ending_vx}, - {"vsi_nn_kernel_generate_proposals_vx", vsi_nn_kernel_generate_proposals_vx}, {"vsi_nn_kernel_header_vx", vsi_nn_kernel_header_vx}, {"vsi_nn_kernel_heatmap_max_keypoint_vx", vsi_nn_kernel_heatmap_max_keypoint_vx}, {"vsi_nn_kernel_imageprocess_vx", vsi_nn_kernel_imageprocess_vx}, @@ -49286,7 +56262,6 @@ static const source_map_t evis_resource[] = {"vsi_nn_kernel_roi_align_vx", vsi_nn_kernel_roi_align_vx}, {"vsi_nn_kernel_signalframe_vx", vsi_nn_kernel_signalframe_vx}, {"vsi_nn_kernel_tensorstackconcat_vx", vsi_nn_kernel_tensorstackconcat_vx}, - {"vsi_nn_kernel_topk_vx", vsi_nn_kernel_topk_vx}, {"vsi_nn_kernel_transform_gemm_vx", vsi_nn_kernel_transform_gemm_vx}, {"vsi_nn_kernel_transform_interp_vx", vsi_nn_kernel_transform_interp_vx}, {"vsi_nn_kernel_transform_setupThres_vx", vsi_nn_kernel_transform_setupThres_vx}, @@ -49308,10 +56283,14 @@ static const source_map_t cl_resource[] = {"detect_post_box_cl", detect_post_box_cl}, {"eltwise_ops_helper_cl", eltwise_ops_helper_cl}, {"eltwise_unary_cl", eltwise_unary_cl}, + {"erf_cl", erf_cl}, {"floordiv_cl", floordiv_cl}, {"gather_cl", gather_cl}, {"gather_nd_cl", gather_nd_cl}, {"gather_nd_3d_cl", gather_nd_3d_cl}, + {"group_normalization_f32_cl", group_normalization_f32_cl}, + {"group_normalization_i32_cl", group_normalization_i32_cl}, + {"group_normalization_u8_cl", group_normalization_u8_cl}, {"grucell_activation_cl", grucell_activation_cl}, {"grucell_activation_sma_cl", grucell_activation_sma_cl}, {"hswish_cl", hswish_cl}, @@ -49358,6 +56337,7 @@ static const source_map_t cl_resource[] = {"moments_axis012_cl", moments_axis012_cl}, {"moments_axis1_cl", moments_axis1_cl}, {"moments_axis2_cl", moments_axis2_cl}, + {"one_hot_cl", one_hot_cl}, {"poolwithargmax_cl", poolwithargmax_cl}, {"pow_cl", pow_cl}, {"prelu_cl", prelu_cl}, @@ -49379,6 +56359,7 @@ static const source_map_t cl_resource[] = {"reduceprod_internal_axis2_cl", reduceprod_internal_axis2_cl}, {"relational_ops_cl", relational_ops_cl}, {"relu_keras_cl", relu_keras_cl}, + {"repeat_cl", repeat_cl}, {"resize_1d_bilinear_cl", resize_1d_bilinear_cl}, {"resize_1d_nearest_cl", resize_1d_nearest_cl}, {"resize_bilinear_cl", resize_bilinear_cl}, @@ -49386,6 +56367,8 @@ static const source_map_t cl_resource[] = {"roi_align_cl", roi_align_cl}, {"scatter_nd_cl", scatter_nd_cl}, {"select_cl", select_cl}, + {"sequence_mask_cl", sequence_mask_cl}, + {"slice_cl", slice_cl}, {"space2depth_internal_cl", space2depth_internal_cl}, {"swish_cl", swish_cl}, {"tile_cl", tile_cl}, diff --git a/src/tim/vx/internal/src/client/vsi_nn_vxkernel.c b/src/tim/vx/internal/src/libnnext/vsi_nn_vxkernel.c similarity index 98% rename from src/tim/vx/internal/src/client/vsi_nn_vxkernel.c rename to src/tim/vx/internal/src/libnnext/vsi_nn_vxkernel.c index 60c05b6..f79b691 100644 --- a/src/tim/vx/internal/src/client/vsi_nn_vxkernel.c +++ b/src/tim/vx/internal/src/libnnext/vsi_nn_vxkernel.c @@ -30,7 +30,7 @@ #include "vsi_nn_platform.h" #include "vsi_nn_prv.h" #include "vsi_nn_log.h" -#include "client/vsi_nn_vxkernel.h" +#include "libnnext/vsi_nn_vxkernel.h" #include "kernel/vsi_nn_kernel.h" #include "libnnext/vsi_nn_libnnext_resource.h" #if VSI_USE_VXC_BINARY @@ -308,7 +308,8 @@ static vsi_status vsi_nn_RegisterBinKernel context = graph->ctx; evis = 
context->config.evis.ver; - program_ptr = vsi_nn_VxBinResourceGetResource(kernel_info->resource_name[kernel_info->resource_num - 1], &program_len); + program_ptr = vsi_nn_VxBinResourceGetResource( + kernel_info->resource_name[kernel_info->resource_num - 1], &program_len); program = vxCreateProgramWithBinary(ctx, (const vx_uint8 *)program_ptr, program_len); status = vxGetStatus((vx_reference)program); diff --git a/src/tim/vx/internal/src/makefile.linux b/src/tim/vx/internal/src/makefile.linux index 799a920..45e11b8 100644 --- a/src/tim/vx/internal/src/makefile.linux +++ b/src/tim/vx/internal/src/makefile.linux @@ -60,9 +60,6 @@ OBJECTS = $(OBJ_DIR)/vsi_nn_context.o \ $(OBJ_DIR)/vsi_nn_graph_optimization.o \ $(OBJ_DIR)/vsi_nn_pre_post_process.o -vpath %.c client -OBJECTS += $(OBJ_DIR)/vsi_nn_vxkernel.o - vpath %.c utils OBJECTS += $(OBJ_DIR)/vsi_nn_code_generator.o \ $(OBJ_DIR)/vsi_nn_binary_tree.o \ @@ -92,7 +89,8 @@ OBJECTS += $(OBJ_DIR)/vsi_nn_post_fasterrcnn.o \ $(OBJ_DIR)/vsi_nn_post_cmupose.o vpath %.c libnnext -OBJECTS += $(OBJ_DIR)/vsi_nn_libnnext_resource.o +OBJECTS += $(OBJ_DIR)/vsi_nn_libnnext_resource.o \ + $(OBJ_DIR)/vsi_nn_vxkernel.o vpath %.c libnnext/ops/kernel SRCS += ${notdir ${wildcard libnnext/ops/kernel/*.c}} @@ -118,8 +116,14 @@ SRCS += ${notdir ${wildcard kernel/vx/*.c}} vpath %.c custom/ops SRCS += ${notdir ${wildcard custom/ops/*.c}} -vpath %.c custom/ops/kernel -SRCS += ${notdir ${wildcard custom/ops/kernel/*.c}} +vpath %.c custom/ops/kernel/evis +SRCS += ${notdir ${wildcard custom/ops/kernel/evis/*.c}} + +vpath %.c custom/ops/kernel/cl +SRCS += ${notdir ${wildcard custom/ops/kernel/cl/*.c}} + +vpath %.c custom/ops/kernel/cpu +SRCS += ${notdir ${wildcard custom/ops/kernel/cpu/*.c}} OBJECTS += ${patsubst %.c, $(OBJ_DIR)/%.o, $(SRCS)} diff --git a/src/tim/vx/internal/src/ops/vsi_nn_op_argmaxmin.c b/src/tim/vx/internal/src/ops/vsi_nn_op_argmaxmin.c index 0a260d2..c96a2c8 100644 --- a/src/tim/vx/internal/src/ops/vsi_nn_op_argmaxmin.c +++ b/src/tim/vx/internal/src/ops/vsi_nn_op_argmaxmin.c @@ -217,6 +217,7 @@ static vsi_bool op_check IO_TYPE(D_I16|Q_DFP, D_U8) IO_TYPE(D_I16|Q_DFP, D_I16) IO_TYPE(D_F32, D_I32) + IO_TYPE(D_F32, D_I16) IO_TYPE(D_F16, D_I32) IO_TYPE(D_I32, D_I32) IO_TYPE(D_I8|Q_DFP, D_I32) diff --git a/src/tim/vx/internal/src/ops/vsi_nn_op_axis_aligned_bbox_transform.c b/src/tim/vx/internal/src/ops/vsi_nn_op_axis_aligned_bbox_transform.c index 83b3664..2324875 100644 --- a/src/tim/vx/internal/src/ops/vsi_nn_op_axis_aligned_bbox_transform.c +++ b/src/tim/vx/internal/src/ops/vsi_nn_op_axis_aligned_bbox_transform.c @@ -34,159 +34,11 @@ #include "vsi_nn_ops.h" #include "vsi_nn_tensor.h" #include "vsi_nn_tensor_util.h" -#include "client/vsi_nn_vxkernel.h" +#include "kernel/vsi_nn_kernel.h" -#define _ARG_NUM (0) #define _INPUT_NUM (4) #define _OUTPUT_NUM (1) #define _IO_NUM (_INPUT_NUM + _OUTPUT_NUM) -#define _PARAM_NUM (_ARG_NUM + _IO_NUM) - -extern vx_kernel_description_t * vx_kernel_AXIS_ALIGNED_BBOX_TRANSFORM_list[]; - -static void _set_inputs_outputs - ( - vx_reference * params, - vsi_nn_tensor_t ** inputs, - vsi_nn_tensor_t ** outputs - ) -{ - uint32_t i; - uint32_t cnt; - - /* Set inputs */ - cnt = 0; - for( i = 0; i < _INPUT_NUM; i ++, cnt ++ ) - { - params[cnt] = (vx_reference)inputs[i]->t; - } - - /* Set outputs */ - for( i = 0; i < _OUTPUT_NUM; i ++, cnt ++ ) - { - params[cnt] = (vx_reference)outputs[i]->t; - } -} /* _set_inputs_outputs() */ - -static vsi_status _create_params - ( - vsi_nn_node_t * node, - vx_reference * params, - uint32_t num - ) -{ - 
vsi_status status = VSI_SUCCESS; -#if 0 - vx_context ctx; - vsi_nn_axis_aligned_bbox_transform_param * p; - if( 0 == num ) - { - return VSI_SUCCESS; - } - memset( params, 0, sizeof( vx_reference * ) * num ); - p = &(node->nn_param.axis_aligned_bbox_transform); - ctx = vxGetContext( (vx_reference)node->graph->g ); - /* Init parameters */ - #define _SET_PARAM( i, type, arg ) do{ \ - params[i] = (vx_reference)vxCreateScalar( ctx, type, &p->arg ); \ - status = vxGetStatus( params[i] ); \ - if( VSI_SUCCESS != status ) { \ - goto set_param_error; \ - } \ - } while(0) - _SET_PARAM( 0, VX_TYPE_INT32, type ); - #undef _SET_PARAM -set_param_error: -#endif - return status; -} /* _create_params */ - -static void _release_params - ( - vx_reference * params, - uint32_t num - ) -{ - uint32_t i; - vx_scalar scalar; - for( i = 0; i < num; i ++ ) - { - scalar = (vx_scalar)params[i]; - vxReleaseScalar( &scalar ); - } -} /* _release_params() */ - -static vsi_status cpu_op_compute - ( - vsi_nn_node_t * self, - vsi_nn_tensor_t ** inputs, - vsi_nn_tensor_t ** outputs - ) -{ - vsi_status status = VSI_SUCCESS; - vx_reference params[_PARAM_NUM]; - vx_reference * args; - - args = ¶ms[_IO_NUM]; - - if( NULL == self->n ) - { - return VSI_FAILURE; - } - - /* Set inputs and outputs */ - _set_inputs_outputs( params, inputs, outputs ); - - /* Init parameters. */ - _create_params( self, args, _ARG_NUM ); - - /* Pass parameters to node. */ - status = vsi_nn_ClientNodePassParameters( self->n, params, _PARAM_NUM ); - - _release_params( args, _ARG_NUM ); - - return status; -} - -static vsi_status vx_op_compute - ( - vsi_nn_node_t * self, - vsi_nn_tensor_t ** inputs, - vsi_nn_tensor_t ** outputs - ) -{ - vsi_status status = VSI_SUCCESS; - vx_reference params[_PARAM_NUM]; - vx_reference * args; - - args = ¶ms[_IO_NUM]; - - if( NULL == self->n ) - { - return VSI_FAILURE; - } - - /* Set inputs and outputs */ - _set_inputs_outputs( params, inputs, outputs ); - /*TODO: Add code if need to change your parameter*/ - - /* Init parameters. */ - _create_params( self, args, _ARG_NUM ); - - /* Pass parameters to node. 
*/ - status = vsi_nn_ClientNodePassParameters( self->n, params, _PARAM_NUM ); - - _release_params( args, _ARG_NUM ); - - return status; -} - -static vsi_nn_op_compute_t op_compute_list[] = -{ - cpu_op_compute, - vx_op_compute, - NULL -}; static vsi_status op_compute ( @@ -195,46 +47,18 @@ static vsi_status op_compute vsi_nn_tensor_t ** outputs ) { - vsi_status status; - vsi_nn_kernel_info_t kernel_info; - char *path = NULL; + vsi_status status = VSI_FAILURE; - memset(&kernel_info, 0x0, sizeof(vsi_nn_kernel_info_t)); - status = VSI_FAILURE; - kernel_info.type = VX_KERNEL_TYPE_CPU; - kernel_info.kernel = vx_kernel_AXIS_ALIGNED_BBOX_TRANSFORM_list; - kernel_info.resource_num = 1; - kernel_info.resource_name = (char **)malloc(kernel_info.resource_num * sizeof(char *)); - kernel_info.resource_name[0] = "vsi_nn_kernel_axis_aligned_bbox_transform"; - path = getenv("USER_VX_SOURCE_PATH"); - if(path) - vsi_nn_VxResourceSetPath(path); + self->n = (vx_node)vsi_nn_kernel_selector( self->graph, + "axis_aligned_bbox_transform", + inputs, _INPUT_NUM, + outputs, _OUTPUT_NUM, NULL ); - if( kernel_info.type == VX_KERNEL_TYPE_VX) + if ( self->n ) { - kernel_info.kernel_index = 1; - kernel_info.init_index = 1; - } - else /*kernel_info.type = VX_KERNEL_TYPE_CPU;*/ - { - kernel_info.kernel_index = 0; - kernel_info.init_index = 0; + status = VSI_SUCCESS; } - self->n = vsi_nn_RegisterClientKernelAndNewNode( - self->graph, &kernel_info); - if (kernel_info.resource_name) - { - free(kernel_info.resource_name); - } - if( NULL == self->n ) - { - return VSI_FAILURE; - } - if (NULL != op_compute_list[kernel_info.init_index]) - { - status = op_compute_list[kernel_info.init_index](self, inputs, outputs); - } return status; } /* op_compute() */ diff --git a/src/tim/vx/internal/src/ops/vsi_nn_op_batch_norm.c b/src/tim/vx/internal/src/ops/vsi_nn_op_batch_norm.c index 64b94ee..ed63df6 100644 --- a/src/tim/vx/internal/src/ops/vsi_nn_op_batch_norm.c +++ b/src/tim/vx/internal/src/ops/vsi_nn_op_batch_norm.c @@ -33,6 +33,8 @@ #include "vsi_nn_log.h" #include "utils/vsi_nn_util.h" #include "utils/vsi_nn_constraint_check.h" +#include "kernel/vsi_nn_kernel.h" +#include "kernel/vsi_nn_kernel_eltwise.h" static vsi_status _try_set_high_presision_tensor ( @@ -107,7 +109,21 @@ static vsi_bool _is_3d_batchnorm } } -static vsi_status op_compute +static vsi_bool _is_dynamic_batchnorm + ( + vsi_nn_tensor_t ** inputs + ) +{ + uint32_t i = 0; + for (i = 1; i < 5 ; i++) { + if (FALSE == inputs[i]->attr.is_const) { + return TRUE; + } + } + return FALSE; +} + +static vsi_status _static_batchnorm ( vsi_nn_node_t * self, vsi_nn_tensor_t ** inputs, @@ -150,6 +166,115 @@ static vsi_status op_compute status = VSI_FAILURE; } return status; +} + +static vsi_status _dynamic_batchnorm + ( + vsi_nn_node_t * self, + vsi_nn_tensor_t ** inputs, + vsi_nn_tensor_t ** outputs + ) +{ + vsi_status status = VSI_FAILURE; + vsi_nn_kernel_param_t * param = NULL; + int32_t shapes[4][VSI_NN_MAX_DIM_NUM] = {{ 1 }}; + int32_t* shapes_ptr[4] = {NULL}; + int32_t *shapes_in[3] = {NULL}; + size_t rank_in[3] = {0}; + uint32_t new_rank = 0; + vsi_nn_tensor_t* reshape_tensors[6] = { NULL }; + vsi_bool ret = TRUE; + uint32_t i = 0; + + param = vsi_nn_kernel_param_create(); + vsi_nn_kernel_param_add_float32( param, "eps", self->nn_param.batch_norm.eps ); + + rank_in[0] = (size_t)inputs[0]->attr.dim_num; + rank_in[1] = (size_t)inputs[1]->attr.dim_num; + rank_in[2] = (size_t)inputs[3]->attr.dim_num; + shapes_in[0] = (int32_t *)inputs[0]->attr.size; + shapes_in[1] = (int32_t 
*)inputs[1]->attr.size; + shapes_in[2] = (int32_t *)inputs[3]->attr.size; + for (i = 0; i < 4; i++) + { + shapes_ptr[i] = shapes[i]; + } + + ret = vsi_nn_kernel_optimize_broadcast_shape( + (const int32_t**)shapes_in, (const size_t*)rank_in, 3, + (int32_t *)outputs[0]->attr.size, outputs[0]->attr.dim_num, + shapes_ptr, shapes[3], &new_rank); + + if( ret ) + { + reshape_tensors[0] = vsi_nn_reshape_tensor( self->graph, + inputs[0], (uint32_t*)shapes[0], new_rank ); + reshape_tensors[1] = vsi_nn_reshape_tensor( self->graph, + inputs[1], (uint32_t*)shapes[1], new_rank ); + reshape_tensors[2] = vsi_nn_reshape_tensor( self->graph, + inputs[2], (uint32_t*)shapes[1], new_rank ); + reshape_tensors[3] = vsi_nn_reshape_tensor( self->graph, + inputs[3], (uint32_t*)shapes[2], new_rank ); + reshape_tensors[4] = vsi_nn_reshape_tensor( self->graph, + inputs[4], (uint32_t*)shapes[2], new_rank ); + + reshape_tensors[5] = vsi_nn_reshape_tensor( self->graph, + outputs[0], (uint32_t*)shapes[3], new_rank ); + } + else + { + reshape_tensors[0] = inputs[0]; + reshape_tensors[1] = inputs[1]; + reshape_tensors[2] = inputs[2]; + reshape_tensors[3] = inputs[3]; + reshape_tensors[4] = inputs[4]; + + reshape_tensors[5] = outputs[0]; + } + + self->n = (vx_node)vsi_nn_kernel_selector( self->graph, + "batchnorm_single", + reshape_tensors, 5, + &reshape_tensors[5], 1, param ); + + if( self->n ) + { + status = VSI_SUCCESS; + } + + if (ret) + { + for ( i = 0; i < 6; i++) + { + if (reshape_tensors[i]) + { + vsi_nn_ReleaseTensor( &reshape_tensors[i] ); + } + } + } + + vsi_nn_kernel_param_release( ¶m ); + + return status; +} + +static vsi_status op_compute + ( + vsi_nn_node_t * self, + vsi_nn_tensor_t ** inputs, + vsi_nn_tensor_t ** outputs + ) +{ + vsi_status status; + if (_is_dynamic_batchnorm(inputs)) + { + status = _dynamic_batchnorm(self, inputs, outputs); + } + else + { + status = _static_batchnorm(self, inputs, outputs); + } + return status; } /* op_compute() */ static vsi_status op_optimize @@ -204,7 +329,62 @@ static vsi_status op_optimize return VSI_SUCCESS; } /* op_optimize() */ -static vsi_bool op_check +static vsi_bool _dynamic_check + ( + vsi_nn_node_t * self, + vsi_nn_tensor_t ** inputs, + vsi_nn_tensor_t ** outputs + ) +{ + /*TODO: Check tensor shapes. 
*/ + uint32_t i = 0; + uint32_t j = 0; + uint32_t rank = inputs[0]->attr.dim_num; + + /* check inputs outputs data type */ + BEGIN_IO_TYPE_DECL(BATCHNORM_SINGLE, 5, 1) + IO_TYPE(D_F16, D_F16, D_F16, D_F16, D_F32, D_U8|Q_ASYM) + IO_TYPE(D_F16, D_F16, D_F16, D_F16, D_F32, D_I16|Q_DFP) + IO_TYPE(D_F16, D_F16, D_F16, D_F16, D_F32, D_I8|Q_DFP) + IO_TYPE(D_F16, D_F16, D_F16, D_F16, D_F32, D_F16) + IO_TYPE(D_U8|Q_ASYM, D_F16, D_F16, D_F16, D_F32, D_U8|Q_ASYM) + IO_TYPE(D_U8|Q_ASYM, D_F16, D_F16, D_F16, D_F32, D_F16) + IO_TYPE(D_I8|Q_DFP, D_F16, D_F16, D_F16, D_F32, D_I8|Q_DFP) + IO_TYPE(D_I8|Q_DFP, D_F16, D_F16, D_F16, D_F32, D_F16) + IO_TYPE(D_I16|Q_DFP, D_F16, D_F16, D_F16, D_F32, D_I16|Q_DFP) + IO_TYPE(D_I16|Q_DFP, D_F16, D_F16, D_F16, D_F32, D_F16) + IO_TYPE(D_F32, D_F32, D_F32, D_F32, D_F32, D_F32) + IO_TYPE(D_F16, D_F32, D_F32, D_F32, D_F32, D_F16) + IO_TYPE(D_U8|Q_ASYM, D_F32, D_F32, D_F32, D_F32, D_U8|Q_ASYM) + END_IO_TYPE_DECL(BATCHNORM_SINGLE) + if(!VALIDATE_OP_IO_TYPES(BATCHNORM_SINGLE, self, inputs, self->input.num, outputs, self->output.num)) { + char* desc = generate_op_io_types_desc(inputs, + self->input.num, outputs, self->output.num); + VSILOGE("Inputs/Outputs data type not support: %s", desc); + destroy_op_io_types_desc(desc); + return FALSE; + } + + for(i = 0; i < rank; i++) + { + vx_int32 shape0 = inputs[0]->attr.size[i]; + + for ( j = 1; j < self->input.num; j++) + { + uint32_t rank1 = inputs[j]->attr.dim_num; + vx_int32 shape1 = rank1 > i ? inputs[j]->attr.size[i] : 1; + + if(shape0 != shape1 && shape1 != 1) + { + VSILOGE("Invalid broadcast for inputs[%d] size[%u]", j, shape1); + return FALSE; + } + } + } + return TRUE; +} + +static vsi_bool _static_check ( vsi_nn_node_t * self, vsi_nn_tensor_t ** inputs, @@ -240,8 +420,26 @@ static vsi_bool op_check return FALSE; } return TRUE; +} + +static vsi_bool op_check + ( + vsi_nn_node_t * self, + vsi_nn_tensor_t ** inputs, + vsi_nn_tensor_t ** outputs + ) +{ + if (_is_dynamic_batchnorm(inputs)) + { + return _dynamic_check(self, inputs, outputs); + } + else + { + return _static_check(self, inputs, outputs); + } } /* op_check() */ + static vsi_bool op_setup ( vsi_nn_node_t * self, diff --git a/src/tim/vx/internal/src/ops/vsi_nn_op_batchnorm_single.c b/src/tim/vx/internal/src/ops/vsi_nn_op_batchnorm_single.c index a945e61..a773a5b 100644 --- a/src/tim/vx/internal/src/ops/vsi_nn_op_batchnorm_single.c +++ b/src/tim/vx/internal/src/ops/vsi_nn_op_batchnorm_single.c @@ -148,18 +148,28 @@ static vsi_bool op_check /* check inputs outputs data type */ BEGIN_IO_TYPE_DECL(BATCHNORM_SINGLE, 5, 1) - IO_TYPE(D_F16, D_F16, D_F16, D_F16, D_F32, D_U8|Q_ASYM) - IO_TYPE(D_F16, D_F16, D_F16, D_F16, D_F32, D_I16|Q_DFP) - IO_TYPE(D_F16, D_F16, D_F16, D_F16, D_F32, D_I8|Q_DFP) - IO_TYPE(D_F16, D_F16, D_F16, D_F16, D_F32, D_F16) + IO_TYPE(D_F16, D_F16, D_F16, D_F16, D_F32, D_U8|Q_ASYM) + IO_TYPE(D_F16, D_F16, D_F16, D_F16, D_F32, D_I16|Q_DFP) + IO_TYPE(D_F16, D_F16, D_F16, D_F16, D_F32, D_I8|Q_DFP) + IO_TYPE(D_F16, D_F16, D_F16, D_F16, D_F32, D_F16) IO_TYPE(D_U8|Q_ASYM, D_F16, D_F16, D_F16, D_F32, D_U8|Q_ASYM) IO_TYPE(D_U8|Q_ASYM, D_F16, D_F16, D_F16, D_F32, D_F16) - IO_TYPE(D_I8|Q_DFP, D_F16, D_F16, D_F16, D_F32, D_I8|Q_DFP) - IO_TYPE(D_I8|Q_DFP, D_F16, D_F16, D_F16, D_F32, D_F16) + IO_TYPE(D_I8|Q_DFP, D_F16, D_F16, D_F16, D_F32, D_I8|Q_DFP) + IO_TYPE(D_I8|Q_DFP, D_F16, D_F16, D_F16, D_F32, D_F16) IO_TYPE(D_I16|Q_DFP, D_F16, D_F16, D_F16, D_F32, D_I16|Q_DFP) IO_TYPE(D_I16|Q_DFP, D_F16, D_F16, D_F16, D_F32, D_F16) - IO_TYPE(D_F32, D_F32, D_F32, D_F32, D_F32, 
D_F32) - IO_TYPE(D_F16, D_F32, D_F32, D_F32, D_F32, D_F16) + IO_TYPE(D_F16, D_F16, D_F16, D_F32, D_F32, D_U8|Q_ASYM) + IO_TYPE(D_F16, D_F16, D_F16, D_F32, D_F32, D_I16|Q_DFP) + IO_TYPE(D_F16, D_F16, D_F16, D_F32, D_F32, D_I8|Q_DFP) + IO_TYPE(D_F16, D_F16, D_F16, D_F32, D_F32, D_F16) + IO_TYPE(D_U8|Q_ASYM, D_F16, D_F16, D_F32, D_F32, D_U8|Q_ASYM) + IO_TYPE(D_U8|Q_ASYM, D_F16, D_F16, D_F32, D_F32, D_F16) + IO_TYPE(D_I8|Q_DFP, D_F16, D_F16, D_F32, D_F32, D_I8|Q_DFP) + IO_TYPE(D_I8|Q_DFP, D_F16, D_F16, D_F32, D_F32, D_F16) + IO_TYPE(D_I16|Q_DFP, D_F16, D_F16, D_F32, D_F32, D_I16|Q_DFP) + IO_TYPE(D_I16|Q_DFP, D_F16, D_F16, D_F32, D_F32, D_F16) + IO_TYPE(D_F32, D_F32, D_F32, D_F32, D_F32, D_F32) + IO_TYPE(D_F16, D_F32, D_F32, D_F32, D_F32, D_F16) IO_TYPE(D_U8|Q_ASYM, D_F32, D_F32, D_F32, D_F32, D_U8|Q_ASYM) END_IO_TYPE_DECL(BATCHNORM_SINGLE) if(!VALIDATE_OP_IO_TYPES(BATCHNORM_SINGLE, self, inputs, self->input.num, outputs, self->output.num)) { diff --git a/src/tim/vx/internal/src/ops/vsi_nn_op_bidirectional_sequence_lstm.c b/src/tim/vx/internal/src/ops/vsi_nn_op_bidirectional_sequence_lstm.c index b490e0e..fdc508b 100644 --- a/src/tim/vx/internal/src/ops/vsi_nn_op_bidirectional_sequence_lstm.c +++ b/src/tim/vx/internal/src/ops/vsi_nn_op_bidirectional_sequence_lstm.c @@ -35,7 +35,7 @@ #include "vsi_nn_ops.h" #include "vsi_nn_tensor.h" #include "vsi_nn_tensor_util.h" -#include "client/vsi_nn_vxkernel.h" +#include "libnnext/vsi_nn_vxkernel.h" #include "vsi_nn_internal_node.h" #include "vsi_nn_rnn_helper.h" @@ -380,20 +380,22 @@ static vsi_bool op_setup vsi_nn_tensor_t* lstmcell_out2 = NULL; /* lstmcell output */ + /* if merge_outputs is true, there will be only 1 output, so use the attr + of the fw for the bw, since they are always same as each other.*/ vsi_nn_internal_init_tensor_attr(&attr, - &outputs[BI_LSTM_BW_OUTPUT_OUTPUT]->attr.dtype, use_virtual_tensor); + &outputs[BI_LSTM_FW_OUTPUT_OUTPUT]->attr.dtype, use_virtual_tensor); output_tensor = vsi_nn_internal_new_tensor( self, &attr, 0.0f ); lstmcell_out0 = output_tensor->t; /* lstmcell output h_state */ vsi_nn_internal_init_tensor_attr(&attr, - &outputs[BI_LSTM_BW_OUTPUT_OUTPUT]->attr.dtype, use_virtual_tensor); + &outputs[BI_LSTM_FW_OUTPUT_OUTPUT]->attr.dtype, use_virtual_tensor); output_tensor = vsi_nn_internal_new_tensor( self, &attr, 0.0f ); lstmcell_out1 = output_tensor->t; /* lstmcell output c_state */ vsi_nn_internal_init_tensor_attr(&attr, - &outputs[BI_LSTM_BW_OUTPUT_OUTPUT]->attr.dtype, use_virtual_tensor); + &outputs[BI_LSTM_FW_OUTPUT_OUTPUT]->attr.dtype, use_virtual_tensor); output_tensor = vsi_nn_internal_new_tensor( self, &attr, 0.0f ); lstmcell_out2 = output_tensor->t; diff --git a/src/tim/vx/internal/src/ops/vsi_nn_op_bidirectional_sequence_rnn.c b/src/tim/vx/internal/src/ops/vsi_nn_op_bidirectional_sequence_rnn.c index 710acf9..cb85606 100644 --- a/src/tim/vx/internal/src/ops/vsi_nn_op_bidirectional_sequence_rnn.c +++ b/src/tim/vx/internal/src/ops/vsi_nn_op_bidirectional_sequence_rnn.c @@ -35,7 +35,7 @@ #include "vsi_nn_ops.h" #include "vsi_nn_tensor.h" #include "vsi_nn_tensor_util.h" -#include "client/vsi_nn_vxkernel.h" +#include "libnnext/vsi_nn_vxkernel.h" #include "vsi_nn_internal_node.h" #include "vsi_nn_rnn_helper.h" diff --git a/src/tim/vx/internal/src/ops/vsi_nn_op_box_with_nms_limit.c b/src/tim/vx/internal/src/ops/vsi_nn_op_box_with_nms_limit.c index fb6b0e1..d376212 100644 --- a/src/tim/vx/internal/src/ops/vsi_nn_op_box_with_nms_limit.c +++ b/src/tim/vx/internal/src/ops/vsi_nn_op_box_with_nms_limit.c @@ -34,7 +34,7 
@@ #include "vsi_nn_ops.h" #include "vsi_nn_tensor.h" #include "vsi_nn_tensor_util.h" -#include "client/vsi_nn_vxkernel.h" +#include "libnnext/vsi_nn_vxkernel.h" #define _ARG_NUM (6) #define _INPUT_NUM (3) diff --git a/src/tim/vx/internal/src/ops/vsi_nn_op_cast.c b/src/tim/vx/internal/src/ops/vsi_nn_op_cast.c index 25a8787..34eb9cf 100644 --- a/src/tim/vx/internal/src/ops/vsi_nn_op_cast.c +++ b/src/tim/vx/internal/src/ops/vsi_nn_op_cast.c @@ -128,56 +128,77 @@ static vsi_bool op_check ) { BEGIN_IO_TYPE_DECL(CAST, 1, 1) - IO_TYPE(D_F32, D_F32) - IO_TYPE(D_F32, D_I32) - IO_TYPE(D_F32, D_U32) - IO_TYPE(D_F32, D_F16) - IO_TYPE(D_F32, D_BOOL8) - IO_TYPE(D_I32, D_F32) - IO_TYPE(D_I32, D_I32) - IO_TYPE(D_I32, D_U32) - IO_TYPE(D_I32, D_F16) - IO_TYPE(D_I32, D_BOOL8) - IO_TYPE(D_U32, D_F32) - IO_TYPE(D_U32, D_I32) - IO_TYPE(D_U32, D_U32) - IO_TYPE(D_U32, D_BOOL8) - IO_TYPE(D_F16, D_I16|Q_DFP) - IO_TYPE(D_F16, D_I8|Q_DFP) - IO_TYPE(D_F16, D_U8|Q_ASYM) - IO_TYPE(D_F16, D_BOOL8) + IO_TYPE(D_F32, D_F32) + IO_TYPE(D_F32, D_I32) + IO_TYPE(D_F32, D_U32) + IO_TYPE(D_F32, D_F16) + IO_TYPE(D_F32, D_BOOL8) + IO_TYPE(D_I32, D_F32) + IO_TYPE(D_I32, D_I32) + IO_TYPE(D_I32, D_U32) + IO_TYPE(D_I32, D_F16) + IO_TYPE(D_I32, D_BOOL8) + IO_TYPE(D_BOOL8, D_F32) + IO_TYPE(D_BOOL8, D_I32) + IO_TYPE(D_BOOL8, D_U32) + IO_TYPE(D_U32, D_F32) + IO_TYPE(D_U32, D_I32) + IO_TYPE(D_U32, D_U32) + IO_TYPE(D_U32, D_BOOL8) + IO_TYPE(D_F16, D_I16|Q_DFP) + IO_TYPE(D_F16, D_I8|Q_DFP) + IO_TYPE(D_F16, D_U8|Q_ASYM) + IO_TYPE(D_F16, D_BOOL8) IO_TYPE(D_I16|Q_DFP, D_F16) IO_TYPE(D_I16|Q_DFP, D_I8|Q_DFP) IO_TYPE(D_I16|Q_DFP, D_U8|Q_ASYM) - IO_TYPE(D_I16, D_BOOL8) - IO_TYPE(D_I8|Q_DFP, D_F16) - IO_TYPE(D_I8|Q_DFP, D_I16|Q_DFP) - IO_TYPE(D_I8|Q_DFP, D_U8|Q_ASYM) - IO_TYPE(D_I8, D_BOOL8) + IO_TYPE(D_I16|Q_DFP, D_BOOL8) + IO_TYPE(D_I16, D_F16) + IO_TYPE(D_I16, D_I8|Q_DFP) + IO_TYPE(D_I16, D_U8|Q_ASYM) + IO_TYPE(D_I16, D_BOOL8) + IO_TYPE(D_I8|Q_DFP, D_F16) + IO_TYPE(D_I8|Q_DFP, D_I16|Q_DFP) + IO_TYPE(D_I8|Q_DFP, D_U8|Q_ASYM) + IO_TYPE(D_I8|Q_DFP, D_BOOL8) + IO_TYPE(D_I8, D_F16) + IO_TYPE(D_I8, D_I16|Q_DFP) + IO_TYPE(D_I8, D_U8|Q_ASYM) + IO_TYPE(D_I8, D_BOOL8) IO_TYPE(D_U8|Q_ASYM, D_F16) IO_TYPE(D_U8|Q_ASYM, D_I16|Q_DFP) IO_TYPE(D_U8|Q_ASYM, D_I8|Q_DFP) - IO_TYPE(D_U8, D_BOOL8) - IO_TYPE(D_F32, D_I16|Q_DFP) - IO_TYPE(D_F32, D_I8|Q_DFP) - IO_TYPE(D_F32, D_U8|Q_ASYM) - IO_TYPE(D_I32, D_I16|Q_DFP) - IO_TYPE(D_I32, D_I8|Q_DFP) - IO_TYPE(D_I32, D_U8|Q_ASYM) - IO_TYPE(D_F16, D_F32) - IO_TYPE(D_F16, D_I32) - IO_TYPE(D_F16, D_U8) - IO_TYPE(D_F16, D_I8) - IO_TYPE(D_F16, D_F16) - IO_TYPE(D_BOOL8, D_F16) - IO_TYPE(D_BOOL8, D_BOOL8) + IO_TYPE(D_U8|Q_ASYM, D_BOOL8) + IO_TYPE(D_U8, D_F16) + IO_TYPE(D_U8, D_I16|Q_DFP) + IO_TYPE(D_U8, D_I8|Q_DFP) + IO_TYPE(D_U8, D_BOOL8) + IO_TYPE(D_F32, D_I16|Q_DFP) + IO_TYPE(D_F32, D_I8|Q_DFP) + IO_TYPE(D_F32, D_U8|Q_ASYM) + IO_TYPE(D_I32, D_I16|Q_DFP) + IO_TYPE(D_I32, D_I8|Q_DFP) + IO_TYPE(D_I32, D_U8|Q_ASYM) + IO_TYPE(D_F16, D_F32) + IO_TYPE(D_F16, D_I32) + IO_TYPE(D_F16, D_I16) + IO_TYPE(D_F16, D_U8) + IO_TYPE(D_F16, D_I8) + IO_TYPE(D_F16, D_F16) + IO_TYPE(D_BOOL8, D_F16) + IO_TYPE(D_BOOL8, D_I16|Q_DFP) + IO_TYPE(D_BOOL8, D_I8|Q_DFP) + IO_TYPE(D_BOOL8, D_U8|Q_ASYM) + IO_TYPE(D_BOOL8, D_BOOL8) + IO_TYPE(D_BOOL8, D_I16) + IO_TYPE(D_BOOL8, D_I8) + IO_TYPE(D_BOOL8, D_U8) IO_TYPE(D_U8|Q_ASYM, D_U8|Q_ASYM) IO_TYPE(D_I8|Q_DFP, D_I8|Q_DFP) IO_TYPE(D_I16|Q_DFP, D_I16|Q_DFP) IO_TYPE(D_U8|Q_ASYM, D_F32) IO_TYPE(D_U8|Q_ASYM, D_I32) - IO_TYPE(D_BF16, D_BF16) - IO_TYPE(D_U8, D_F16) + IO_TYPE(D_BF16, D_BF16) END_IO_TYPE_DECL(CAST) 
if(!VALIDATE_OP_IO_TYPES(CAST, self, inputs, self->input.num, outputs, self->output.num)) { char* desc = generate_op_io_types_desc(inputs, diff --git a/src/tim/vx/internal/src/ops/vsi_nn_op_ceil.c b/src/tim/vx/internal/src/ops/vsi_nn_op_ceil.c new file mode 100644 index 0000000..69dbfd5 --- /dev/null +++ b/src/tim/vx/internal/src/ops/vsi_nn_op_ceil.c @@ -0,0 +1,110 @@ +/**************************************************************************** +* +* Copyright (c) 2020 Vivante Corporation +* +* Permission is hereby granted, free of charge, to any person obtaining a +* copy of this software and associated documentation files (the "Software"), +* to deal in the Software without restriction, including without limitation +* the rights to use, copy, modify, merge, publish, distribute, sublicense, +* and/or sell copies of the Software, and to permit persons to whom the +* Software is furnished to do so, subject to the following conditions: +* +* The above copyright notice and this permission notice shall be included in +* all copies or substantial portions of the Software. +* +* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +* AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING +* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER +* DEALINGS IN THE SOFTWARE. +* +*****************************************************************************/ +#include +#include + +#include "vsi_nn_types.h" +#include "vsi_nn_platform.h" +#include "vsi_nn_log.h" +#include "vsi_nn_graph.h" +#include "vsi_nn_node.h" +#include "vsi_nn_prv.h" +#include "vsi_nn_ops.h" +#include "vsi_nn_tensor.h" +#include "utils/vsi_nn_util.h" +#include "utils/vsi_nn_constraint_check.h" + +static vsi_bool op_check + ( + vsi_nn_node_t * self, + vsi_nn_tensor_t ** inputs, + vsi_nn_tensor_t ** outputs + ) +{ + BEGIN_IO_TYPE_DECL(CEIL, 1, 1) + IO_TYPE(D_BF16, D_BF16) + IO_TYPE(D_F32, D_F32) + IO_TYPE(D_F16, D_F16) + IO_TYPE(D_F16, D_U8|Q_ASYM) + IO_TYPE(D_F16, D_I16|Q_DFP) + IO_TYPE(D_F16, D_I8|Q_DFP) + IO_TYPE(D_U8|Q_ASYM, D_F16) + IO_TYPE(D_U8|Q_ASYM, D_U8|Q_ASYM) + IO_TYPE(D_I16|Q_DFP, D_F16) + IO_TYPE(D_I16|Q_DFP, D_I16|Q_DFP) + IO_TYPE(D_I8|Q_DFP, D_F16) + IO_TYPE(D_I8|Q_DFP, D_I8|Q_DFP) + END_IO_TYPE_DECL(CEIL) + if (!VALIDATE_OP_IO_TYPES(CEIL, self, inputs, self->input.num, outputs, self->output.num)) + { + char* desc = generate_op_io_types_desc(inputs, + self->input.num, outputs, self->output.num); + VSILOGE("Inputs/Outputs data type not support: %s", desc); + destroy_op_io_types_desc(desc); + return FALSE; + } + + return TRUE; +} /* op_check() */ + +static vsi_status op_compute + ( + vsi_nn_node_t * self, + vsi_nn_tensor_t ** inputs, + vsi_nn_tensor_t ** outputs + ) +{ + vsi_status status = VSI_SUCCESS; + vx_nn_rounding_params_t p; + + memset(&p, 0, sizeof(p)); + p.mode = VX_NN_DS_SIZE_ROUNDING_CEILING; + self->n = vxTensorRoundingNode(self->graph->g, inputs[0]->t, &p, sizeof(p), outputs[0]->t); + if ( !self->n ) + { + status = VSI_FAILURE; + } + + return status; +} /* op_compute() */ + +#ifdef __cpluplus +extern "C" { +#endif +/* Registrar */ +DEF_OP_REG + ( + /* op_name */ CEIL, + /* init */ NULL, + /* compute */ op_compute, + /* deinit */ vsi_nn_op_common_deinit, + /* check */ op_check, + /* setup */ vsi_nn_op_common_setup, + /* optimize 
*/ NULL, + /* input_num */ 1, + /* output_num */ 1 + ); +#ifdef __cplusplus +} +#endif diff --git a/src/tim/vx/internal/src/ops/vsi_nn_op_clip.c b/src/tim/vx/internal/src/ops/vsi_nn_op_clip.c index 06898c1..9c151f6 100644 --- a/src/tim/vx/internal/src/ops/vsi_nn_op_clip.c +++ b/src/tim/vx/internal/src/ops/vsi_nn_op_clip.c @@ -35,7 +35,7 @@ #include "vsi_nn_ops.h" #include "vsi_nn_tensor.h" #include "vsi_nn_tensor_util.h" -#include "client/vsi_nn_vxkernel.h" +#include "libnnext/vsi_nn_vxkernel.h" #include "kernel/vsi_nn_kernel.h" #include "vsi_nn_internal_node.h" #include "kernel/vsi_nn_kernel_gpu_shape_optimize.h" @@ -127,6 +127,7 @@ static vsi_bool op_check IO_TYPE(D_I8|Q_DFP, D_F16) IO_TYPE(D_I16|Q_DFP, D_I16|Q_DFP) IO_TYPE(D_I16|Q_DFP, D_F16) + IO_TYPE(D_BF16, D_BF16) END_IO_TYPE_DECL(CLIP) if(!VALIDATE_OP_IO_TYPES(CLIP, self, inputs, self->input.num, outputs, self->output.num)) { char* desc = generate_op_io_types_desc(inputs, diff --git a/src/tim/vx/internal/src/ops/vsi_nn_op_conv1d.c b/src/tim/vx/internal/src/ops/vsi_nn_op_conv1d.c index 2d6f510..f902832 100644 --- a/src/tim/vx/internal/src/ops/vsi_nn_op_conv1d.c +++ b/src/tim/vx/internal/src/ops/vsi_nn_op_conv1d.c @@ -33,6 +33,12 @@ #include "kernel/vsi_nn_kernel.h" #include "utils/vsi_nn_constraint_check.h" +typedef struct _conv1d_local_data_t { + vsi_bool use_ext_pad; + vsi_bool use_ovxlib_kernel; + vsi_nn_internal_tensor_t* pad_output; +} conv1d_local_data_t; + static vsi_status op_compute ( vsi_nn_node_t * self, @@ -45,21 +51,110 @@ static vsi_status op_compute param = vsi_nn_kernel_param_create(); - vsi_nn_kernel_param_add_int32( param, "stride", self->nn_param.conv1d.stride ); - vsi_nn_kernel_param_add_int32( param, "pad_front", self->nn_param.conv1d.pad[0] ); - vsi_nn_kernel_param_add_int32( param, "pad_end", self->nn_param.conv1d.pad[1] ); - vsi_nn_kernel_param_add_int32( param, "dilation", self->nn_param.conv1d.dilation); - vsi_nn_kernel_param_add_int32( param, "overflow_policy", self->vx_param.overflow_policy ); - vsi_nn_kernel_param_add_int32( param, "rounding_policy", self->vx_param.rounding_policy ); - vsi_nn_kernel_param_add_int32( param, - "down_scale_size_rounding", self->vx_param.down_scale_size_rounding ); - self->n = (vx_node)vsi_nn_kernel_selector( self->graph, "conv1d", - inputs, 3, outputs, 1, param ); + if(self->nn_param.conv1d.local->use_ovxlib_kernel) + { + vsi_nn_tensor_t* new_inputs[3] = { NULL }; + vsi_nn_tensor_t* reshape_tensors[3] = { NULL }; + uint32_t shape[VSI_NN_MAX_DIM_NUM] = { 0 }; + int32_t new_rank = 0; + int32_t pad_front = self->nn_param.conv1d.pad[0]; + int32_t pad_end = self->nn_param.conv1d.pad[1]; + + if (1 == inputs[0]->attr.dim_num) + { + shape[0] = inputs[0]->attr.size[0]; + shape[1] = 1; + new_rank = 2; + reshape_tensors[0] = vsi_nn_reshape_tensor( self->graph, + inputs[0], (uint32_t*)shape, new_rank ); + new_inputs[0] = reshape_tensors[0]; + } + else + { + new_inputs[0] = inputs[0]; + } + + if (1 == inputs[1]->attr.dim_num) + { + shape[0] = inputs[1]->attr.size[0]; + shape[1] = 1; + new_rank = 2; + reshape_tensors[1] = vsi_nn_reshape_tensor( self->graph, + inputs[1], (uint32_t*)shape, new_rank ); + new_inputs[1] = reshape_tensors[1]; + } + else + { + new_inputs[1] = inputs[1]; + } + + if (1 == inputs[2]->attr.dim_num) + { + shape[0] = inputs[2]->attr.size[0]; + shape[1] = 1; + new_rank = 2; + reshape_tensors[2] = vsi_nn_reshape_tensor( self->graph, + inputs[2], (uint32_t*)shape, new_rank ); + new_inputs[2] = reshape_tensors[2]; + } + else + { + new_inputs[2] = inputs[2]; + } + + /* 
overwrite input[0] with padded tensor */ + if(self->nn_param.conv1d.local->use_ext_pad) + { + vsi_nn_internal_compute_node( self ); + new_inputs[0] = self->nn_param.conv1d.local->pad_output->t; + pad_front = 0; + pad_end = 0; + } + + vsi_nn_kernel_param_add_int32( param, "stride", self->nn_param.conv1d.stride ); + vsi_nn_kernel_param_add_int32( param, "pad_front", pad_front ); + vsi_nn_kernel_param_add_int32( param, "pad_end", pad_end ); + vsi_nn_kernel_param_add_int32( param, "dilation", self->nn_param.conv1d.dilation ); + vsi_nn_kernel_param_add_int32( param, "overflow_policy", self->vx_param.overflow_policy ); + vsi_nn_kernel_param_add_int32( param, "rounding_policy", self->vx_param.rounding_policy ); + vsi_nn_kernel_param_add_int32( param, + "down_scale_size_rounding", self->vx_param.down_scale_size_rounding ); + self->n = (vx_node)vsi_nn_kernel_selector( self->graph, "conv1d_ovxlib", + new_inputs, 3, outputs, 1, param ); + + if (reshape_tensors[0]) vsi_nn_ReleaseTensor( &reshape_tensors[0] ); + if (reshape_tensors[1]) vsi_nn_ReleaseTensor( &reshape_tensors[1] ); + if (reshape_tensors[2]) vsi_nn_ReleaseTensor( &reshape_tensors[2] ); + } + else + { + vsi_nn_kernel_param_add_int32( param, "stride", self->nn_param.conv1d.stride ); + vsi_nn_kernel_param_add_int32( param, "pad_front", self->nn_param.conv1d.pad[0] ); + vsi_nn_kernel_param_add_int32( param, "pad_end", self->nn_param.conv1d.pad[1] ); + vsi_nn_kernel_param_add_int32( param, "dilation", self->nn_param.conv1d.dilation ); + vsi_nn_kernel_param_add_int32( param, "overflow_policy", self->vx_param.overflow_policy ); + vsi_nn_kernel_param_add_int32( param, "rounding_policy", self->vx_param.rounding_policy ); + vsi_nn_kernel_param_add_int32( param, + "down_scale_size_rounding", self->vx_param.down_scale_size_rounding ); + if( self->nn_param.conv1d.multiplier > 0 ) + { + vsi_nn_kernel_param_add_int32( param, "multiplier", + self->nn_param.conv1d.multiplier ); + self->n = (vx_node)vsi_nn_kernel_selector( self->graph, "depthwise_conv1d", + inputs, 3, outputs, 1, param ); + } + else + { + self->n = (vx_node)vsi_nn_kernel_selector( self->graph, "conv1d", + inputs, 3, outputs, 1, param ); + } + } if( self->n ) { status = VSI_SUCCESS; } + vsi_nn_kernel_param_release( ¶m ); return status; } /* op_compute() */ @@ -151,6 +246,51 @@ static vsi_bool op_setup outputs[0]->attr.size[2] = inputs[0]->attr.size[2]; outputs[0]->attr.dim_num = 3; } + + if ( (self->nn_param.conv1d.ksize == 1024 && self->nn_param.conv1d.dilation == 1) + || (self->nn_param.conv1d.ksize == 3 && self->nn_param.conv1d.dilation > 7) ) + { + if (self->nn_param.conv1d.stride == 1 && self->nn_param.conv1d.multiplier == 0) + { + self->nn_param.conv1d.local->use_ovxlib_kernel = TRUE; + if ((p->pad[0] || p->pad[1]) && (inputs[0]->attr.size[0] >= 65535)) + { + vsi_nn_tensor_attr_t attr; + vsi_nn_internal_node_t* curr = NULL; + vsi_nn_internal_tensor_t* tensor = NULL; + uint32_t *front_data = NULL; + uint32_t *back_data = NULL; + memset(&attr, 0, sizeof(vsi_nn_tensor_attr_t)); + vsi_nn_internal_init_tensor_attr(&attr, &inputs[0]->attr.dtype, TRUE); + tensor = vsi_nn_internal_new_tensor( self, &attr, 0.0f ); + + curr = vsi_nn_internal_new_node(self, VSI_NN_OP_PAD, 0, 0); + front_data = (uint32_t*)\ + vsi_nn_internal_new_node_param(curr, sizeof(uint32_t) * inputs[0]->attr.dim_num); + back_data = (uint32_t*)\ + vsi_nn_internal_new_node_param(curr, sizeof(uint32_t) * inputs[0]->attr.dim_num); + + front_data[0] = p->pad[0]; + front_data[1] = 0; + front_data[2] = 0; + back_data[0] = 
p->pad[1]; + back_data[1] = 0; + back_data[2] = 0; + curr->node->nn_param.pad.front_size = front_data; + curr->node->nn_param.pad.back_size = back_data; + curr->node->nn_param.pad.dim_num = 3; + curr->node->nn_param.pad.const_val = 0; + curr->node->nn_param.pad.mode = VSI_NN_PAD_MODE_CONSTANT; + curr->inputs[0] = inputs[0]; + curr->outputs[0] = tensor->t; + vsi_nn_internal_setup_node(self, curr); + + self->nn_param.conv1d.local->use_ext_pad = TRUE; + self->nn_param.conv1d.local->pad_output = tensor; + } + } + } + return TRUE; } /* op_setup() */ @@ -159,9 +299,30 @@ static vsi_status op_deinit vsi_nn_node_t * self ) { + vsi_nn_internal_deinit_node_wksp(self); + + vsi_nn_safe_free(self->nn_param.gru_ovxlib.local); + return vsi_nn_op_common_deinit(self); } +static vsi_status op_init + ( + vsi_nn_node_t * self + ) +{ + vsi_status status = VSI_SUCCESS; + + vsi_nn_internal_init_node_wksp(self); + + self->nn_param.conv1d.local = (conv1d_local_data_t *)malloc(sizeof(conv1d_local_data_t)); + memset(self->nn_param.conv1d.local, 0x00, sizeof(conv1d_local_data_t)); + self->nn_param.conv1d.local->use_ext_pad = FALSE; + self->nn_param.conv1d.local->use_ovxlib_kernel = FALSE; + + return status; +} /* op_init() */ + #ifdef __cplusplus extern "C" { #endif @@ -169,7 +330,7 @@ extern "C" { DEF_OP_REG ( /* op_name */ CONV1D, - /* init */ NULL, + /* init */ op_init, /* compute */ op_compute, /* deinit */ op_deinit, /* check */ op_check, diff --git a/src/tim/vx/internal/src/ops/vsi_nn_op_conv2d.c b/src/tim/vx/internal/src/ops/vsi_nn_op_conv2d.c index fe9c4a3..bc8540d 100644 --- a/src/tim/vx/internal/src/ops/vsi_nn_op_conv2d.c +++ b/src/tim/vx/internal/src/ops/vsi_nn_op_conv2d.c @@ -118,6 +118,7 @@ static vsi_bool op_check BEGIN_IO_TYPE_DECL(CONV2D, 3, 1) /* IO_TYPE(INPUT, WEIGHT, BIAS, OUTPUT) */ IO_TYPE(D_F32, D_F32, D_F32, D_F32) + IO_TYPE(D_F32, D_F32, D_F32, D_F16) IO_TYPE(D_F16, D_F16, D_F16, D_F16) IO_TYPE(D_F16, D_F16, D_F32, D_F16) diff --git a/src/tim/vx/internal/src/ops/vsi_nn_op_crop.c b/src/tim/vx/internal/src/ops/vsi_nn_op_crop.c index cb1cda3..f99aa44 100644 --- a/src/tim/vx/internal/src/ops/vsi_nn_op_crop.c +++ b/src/tim/vx/internal/src/ops/vsi_nn_op_crop.c @@ -35,7 +35,7 @@ #include "vsi_nn_tensor_util.h" #include "vsi_nn_prv.h" #include "vsi_nn_log.h" -#include "client/vsi_nn_vxkernel.h" +#include "libnnext/vsi_nn_vxkernel.h" #include "utils/vsi_nn_constraint_check.h" #define _ARG_NUM (3) diff --git a/src/tim/vx/internal/src/ops/vsi_nn_op_dataconvert.c b/src/tim/vx/internal/src/ops/vsi_nn_op_dataconvert.c index e86fe3d..79631e3 100644 --- a/src/tim/vx/internal/src/ops/vsi_nn_op_dataconvert.c +++ b/src/tim/vx/internal/src/ops/vsi_nn_op_dataconvert.c @@ -176,54 +176,117 @@ static vsi_bool op_check ) { BEGIN_IO_TYPE_DECL(DATACONVERT, 1, 1) - IO_TYPE(D_F16, D_F16) - IO_TYPE(D_F16, D_F32) - IO_TYPE(D_F16, D_I32) - IO_TYPE(D_F16, D_BF16) - IO_TYPE(D_F16, D_I16|Q_DFP) - IO_TYPE(D_F16, D_I8|Q_DFP) - IO_TYPE(D_F16, D_U8|Q_ASYM) - IO_TYPE(D_F32, D_F32) - IO_TYPE(D_F32, D_I32) - IO_TYPE(D_F32, D_F16) - IO_TYPE(D_F32, D_BF16) - IO_TYPE(D_F32, D_I16|Q_DFP) - IO_TYPE(D_F32, D_I8|Q_DFP) - IO_TYPE(D_F32, D_U8|Q_ASYM) + IO_TYPE(D_F16, D_F16) + IO_TYPE(D_F16, D_F32) + IO_TYPE(D_F16, D_I32) + IO_TYPE(D_F16, D_U32) + IO_TYPE(D_F16, D_BF16) + IO_TYPE(D_F16, D_I16|Q_DFP) + IO_TYPE(D_F16, D_I8|Q_DFP) + IO_TYPE(D_F16, D_U8|Q_ASYM) + IO_TYPE(D_F16, D_I16) + IO_TYPE(D_F16, D_I8) + IO_TYPE(D_F16, D_U8) + IO_TYPE(D_F32, D_F32) + IO_TYPE(D_F32, D_I32) + IO_TYPE(D_F32, D_U32) + IO_TYPE(D_F32, D_F16) + IO_TYPE(D_F32, D_BF16) 
+ IO_TYPE(D_F32, D_I16|Q_DFP) + IO_TYPE(D_F32, D_I8|Q_DFP) + IO_TYPE(D_F32, D_U8|Q_ASYM) + IO_TYPE(D_F32, D_I16) + IO_TYPE(D_F32, D_I8) + IO_TYPE(D_F32, D_U8) + IO_TYPE(D_I16|Q_DFP, D_F32) IO_TYPE(D_I16|Q_DFP, D_I32) + IO_TYPE(D_I16|Q_DFP, D_U32) IO_TYPE(D_I16|Q_DFP, D_I16|Q_DFP) IO_TYPE(D_I16|Q_DFP, D_I8|Q_DFP) IO_TYPE(D_I16|Q_DFP, D_U8|Q_ASYM) IO_TYPE(D_I16|Q_DFP, D_F16) - IO_TYPE(D_I8|Q_DFP, D_I8|Q_DFP) - IO_TYPE(D_I8|Q_DFP, D_I8|Q_ASYM) - IO_TYPE(D_I8|Q_ASYM, D_I8|Q_ASYM) - IO_TYPE(D_I8|Q_ASYM, D_I8|Q_DFP) - IO_TYPE(D_I8|Q_DFP, D_I16|Q_DFP) - IO_TYPE(D_I8|Q_DFP, D_U8|Q_ASYM) - IO_TYPE(D_I8|Q_ASYM, D_U8|Q_ASYM) + IO_TYPE(D_I16|Q_DFP, D_I16) + IO_TYPE(D_I16|Q_DFP, D_I8) + IO_TYPE(D_I16|Q_DFP, D_U8) + IO_TYPE(D_I16, D_F32) + IO_TYPE(D_I16, D_I32) + IO_TYPE(D_I16, D_U32) + IO_TYPE(D_I16, D_I16|Q_DFP) + IO_TYPE(D_I16, D_I8|Q_DFP) + IO_TYPE(D_I16, D_U8|Q_ASYM) + IO_TYPE(D_I16, D_F16) + IO_TYPE(D_I16, D_I16) + IO_TYPE(D_I16, D_I8) + IO_TYPE(D_I16, D_U8) + IO_TYPE(D_I8|Q_DFP, D_F32) IO_TYPE(D_I8|Q_DFP, D_F16) IO_TYPE(D_I8|Q_DFP, D_I32) + IO_TYPE(D_I8|Q_DFP, D_U32) + IO_TYPE(D_I8|Q_DFP, D_I8|Q_DFP) + IO_TYPE(D_I8|Q_DFP, D_I8|Q_ASYM) + IO_TYPE(D_I8|Q_DFP, D_I16|Q_DFP) + IO_TYPE(D_I8|Q_DFP, D_U8|Q_ASYM) + IO_TYPE(D_I8|Q_DFP, D_I8) + IO_TYPE(D_I8|Q_DFP, D_I8) + IO_TYPE(D_I8|Q_DFP, D_I16) + IO_TYPE(D_I8|Q_DFP, D_U8) + IO_TYPE(D_I8, D_F32) + IO_TYPE(D_I8, D_F16) + IO_TYPE(D_I8, D_I32) + IO_TYPE(D_I8, D_U32) + IO_TYPE(D_I8, D_I8|Q_DFP) + IO_TYPE(D_I8, D_I8|Q_ASYM) + IO_TYPE(D_I8, D_I16|Q_DFP) + IO_TYPE(D_I8, D_U8|Q_ASYM) + IO_TYPE(D_I8, D_I8) + IO_TYPE(D_I8, D_I16) + IO_TYPE(D_I8|Q_ASYM, D_I8|Q_ASYM) + IO_TYPE(D_I8|Q_ASYM, D_I8|Q_DFP) + IO_TYPE(D_I8|Q_ASYM, D_U8|Q_ASYM) IO_TYPE(D_U8|Q_ASYM, D_U8|Q_ASYM) IO_TYPE(D_U8|Q_ASYM, D_I8|Q_ASYM) IO_TYPE(D_U8|Q_ASYM, D_I8|Q_DFP) IO_TYPE(D_U8|Q_ASYM, D_I16|Q_DFP) IO_TYPE(D_U8|Q_ASYM, D_F16) IO_TYPE(D_U8|Q_ASYM, D_I32) + IO_TYPE(D_U8|Q_ASYM, D_U32) IO_TYPE(D_U8|Q_ASYM, D_F32) - IO_TYPE(D_BOOL8, D_BOOL8) - IO_TYPE(D_BOOL8, D_U8|Q_ASYM) - IO_TYPE(D_BOOL8, D_I8|Q_ASYM) - IO_TYPE(D_BOOL8, D_I8|Q_DFP) - IO_TYPE(D_BOOL8, D_I16|Q_DFP) - IO_TYPE(D_BF16, D_BF16) - IO_TYPE(D_BF16, D_F16) - IO_TYPE(D_BF16, D_F32) - IO_TYPE(D_I32, D_I32) - IO_TYPE(D_I32, D_I16|Q_DFP) - IO_TYPE(D_I16, D_I16|Q_DFP) - IO_TYPE(D_I8, D_I8|Q_DFP) - IO_TYPE(D_U8, D_U8|Q_ASYM) + IO_TYPE(D_U8, D_U8|Q_ASYM) + IO_TYPE(D_U8, D_I16|Q_DFP) + IO_TYPE(D_U8, D_F16) + IO_TYPE(D_U8, D_I32) + IO_TYPE(D_U8, D_U32) + IO_TYPE(D_U8, D_F32) + IO_TYPE(D_BOOL8, D_BOOL8) + IO_TYPE(D_BOOL8, D_U8|Q_ASYM) + IO_TYPE(D_BOOL8, D_I8|Q_ASYM) + IO_TYPE(D_BOOL8, D_I8|Q_DFP) + IO_TYPE(D_BOOL8, D_I16|Q_DFP) + IO_TYPE(D_BOOL8, D_U8) + IO_TYPE(D_BOOL8, D_I8) + IO_TYPE(D_BOOL8, D_I8) + IO_TYPE(D_BOOL8, D_I16) + IO_TYPE(D_BF16, D_BF16) + IO_TYPE(D_BF16, D_F16) + IO_TYPE(D_BF16, D_F32) + IO_TYPE(D_I32, D_I32) + IO_TYPE(D_I32, D_I16|Q_DFP) + IO_TYPE(D_I32, D_I16) + IO_TYPE(D_I32, D_I8|Q_DFP) + IO_TYPE(D_I32, D_I8) + IO_TYPE(D_I32, D_U32) + IO_TYPE(D_I32, D_U16) + IO_TYPE(D_I32, D_U8|Q_ASYM) + IO_TYPE(D_I32, D_U8) + IO_TYPE(D_U32, D_U32) + IO_TYPE(D_U32, D_I16|Q_DFP) + IO_TYPE(D_U32, D_I16) + IO_TYPE(D_U32, D_I8|Q_DFP) + IO_TYPE(D_U32, D_I8) + IO_TYPE(D_U32, D_I32) + IO_TYPE(D_U32, D_U16) + IO_TYPE(D_U32, D_U8|Q_ASYM) + IO_TYPE(D_U32, D_U8) END_IO_TYPE_DECL(DATACONVERT) if (!VALIDATE_OP_IO_TYPES(DATACONVERT, self, inputs, self->input.num, outputs, self->output.num)) { diff --git a/src/tim/vx/internal/src/ops/vsi_nn_op_deconvolution.c b/src/tim/vx/internal/src/ops/vsi_nn_op_deconvolution.c index 4e33da4..133141d 100644 --- 
a/src/tim/vx/internal/src/ops/vsi_nn_op_deconvolution.c +++ b/src/tim/vx/internal/src/ops/vsi_nn_op_deconvolution.c @@ -33,6 +33,7 @@ #include "vsi_nn_tensor_util.h" #include "utils/vsi_nn_util.h" #include "vsi_nn_log.h" +#include "utils/vsi_nn_dtype_util.h" #include "utils/vsi_nn_constraint_check.h" #define COMPUTE_DECONV_SZ( in, ksize, pad_1, pad_2, stride, output_padding )\ @@ -181,6 +182,8 @@ static vsi_bool op_check vsi_nn_tensor_t ** outputs ) { + vsi_bool ret = FALSE; + BEGIN_IO_TYPE_DECL(DECONVOLUTION, 3, 1) IO_TYPE(D_F16, D_F16, D_NONE, D_F16) IO_TYPE(D_F16, D_F16, D_F32, D_F16) @@ -197,6 +200,8 @@ static vsi_bool op_check IO_TYPE(D_U8|Q_ASYM, D_I8|Q_DFP, D_I32|Q_DFP, D_U8|Q_ASYM) IO_TYPE(D_U8|Q_ASYM, D_I8|Q_SYM_PC, D_I32|Q_SYM_PC, D_U8|Q_ASYM) IO_TYPE(D_I8|Q_ASYM, D_I8|Q_SYM_PC, D_I32|Q_SYM_PC, D_I8|Q_ASYM) + IO_TYPE(D_I8|Q_ASYM, D_I8|Q_SYM_PC, D_NONE, D_I8|Q_ASYM) + IO_TYPE(D_U8|Q_ASYM, D_I8|Q_SYM_PC, D_NONE, D_U8|Q_ASYM) IO_TYPE(D_U8|Q_ASYM, D_U8|Q_ASYM, D_NONE, D_U8|Q_ASYM) IO_TYPE(D_U8|Q_ASYM, D_U8|Q_ASYM, D_I32|Q_ASYM, D_U8|Q_ASYM) IO_TYPE(D_U8|Q_ASYM, D_U8|Q_ASYM, D_I32|Q_ASYM, D_I8|Q_DFP) @@ -225,7 +230,10 @@ static vsi_bool op_check return FALSE; } - return TRUE; + /* Check fl and scale*/ + ret = vsi_nn_QuantCheck(inputs[0], inputs[1], inputs[2]); + + return ret; } /* op_check() */ static vsi_bool op_setup diff --git a/src/tim/vx/internal/src/ops/vsi_nn_op_deconvolution1d.c b/src/tim/vx/internal/src/ops/vsi_nn_op_deconvolution1d.c index cf8b2a7..3e971e7 100644 --- a/src/tim/vx/internal/src/ops/vsi_nn_op_deconvolution1d.c +++ b/src/tim/vx/internal/src/ops/vsi_nn_op_deconvolution1d.c @@ -33,6 +33,7 @@ #include "vsi_nn_tensor_util.h" #include "utils/vsi_nn_util.h" #include "vsi_nn_log.h" +#include "utils/vsi_nn_dtype_util.h" #include "kernel/vsi_nn_kernel.h" #define COMPUTE_DECONV_SZ( in, ksize, pad_1, pad_2, stride, output_padding )\ @@ -57,8 +58,24 @@ static vsi_status op_compute weight_attr.size[2] = weight_attr.size[1]; weight_attr.size[1] = 1; weight_attr.dim_num = 4; - weight_tensor = vsi_nn_CreateTensor( self->graph, &weight_attr ); - vsi_nn_ReshapeTensor( self->graph, inputs[1], weight_tensor, weight_attr.size, 4 ); + if (inputs[1]->attr.dtype.qnt_type != VSI_NN_QNT_TYPE_AFFINE_PERCHANNEL_SYMMETRIC) + { + weight_tensor = vsi_nn_reshape_tensor( self->graph, inputs[1], weight_attr.size, 4 ); + } + else + { + uint8_t * data = NULL; + data = vsi_nn_ConvertTensorToData( self->graph, inputs[1] ); + if (NULL == data) + { + VSILOGE("Convert data fail.\n"); + status = VSI_FAILURE; + return status; + } + weight_attr.dtype.channel_dim = inputs[1]->attr.dtype.channel_dim + 1; + weight_tensor = vsi_nn_CreateTensorFromData(self->graph, data, &weight_attr); + vsi_nn_safe_free( data ); + } #ifdef VX_DECONVOLUTION_WEIGHT_LAYOUT_COMPATIBLE_KHRONOS if ( vsi_nn_compareVersion(self->graph, 1, 1, 21) == -1 && TRUE == weight_tensor->attr.is_const ) @@ -118,8 +135,11 @@ static vsi_bool op_check vsi_nn_tensor_t ** outputs ) { - //TODO: Check tensor shapes. 
- return TRUE; + vsi_bool ret = FALSE; + + ret = vsi_nn_OpCheck(VSI_NN_OP_DECONVOLUTION, self, inputs, outputs); + + return ret; } /* op_check() */ static vsi_bool op_setup diff --git a/src/tim/vx/internal/src/ops/vsi_nn_op_dropout.c b/src/tim/vx/internal/src/ops/vsi_nn_op_dropout.c index ea76c4b..f63db97 100644 --- a/src/tim/vx/internal/src/ops/vsi_nn_op_dropout.c +++ b/src/tim/vx/internal/src/ops/vsi_nn_op_dropout.c @@ -34,7 +34,7 @@ #include "vsi_nn_tensor_util.h" #include "vsi_nn_prv.h" #include "vsi_nn_log.h" -#include "client/vsi_nn_vxkernel.h" +#include "libnnext/vsi_nn_vxkernel.h" #include "vsi_nn_internal_node.h" #include "utils/vsi_nn_constraint_check.h" diff --git a/src/tim/vx/internal/src/ops/vsi_nn_op_eltwise.c b/src/tim/vx/internal/src/ops/vsi_nn_op_eltwise.c index ea69316..528a72f 100644 --- a/src/tim/vx/internal/src/ops/vsi_nn_op_eltwise.c +++ b/src/tim/vx/internal/src/ops/vsi_nn_op_eltwise.c @@ -195,19 +195,29 @@ static vsi_bool op_check_minimum /* check inputs outputs data type */ BEGIN_IO_TYPE_DECL(MINIMUM, 2, 1) IO_TYPE(D_F16, D_F16, D_F16) - IO_TYPE(D_BF16, D_BF16, D_BF16) - IO_TYPE(D_F16, D_F16, D_I8|Q_DFP) - IO_TYPE(D_I8|Q_DFP, D_I8|Q_DFP, D_I8|Q_DFP) - IO_TYPE(D_U8|Q_ASYM, D_U8|Q_ASYM, D_U8|Q_ASYM) - IO_TYPE(D_I16|Q_DFP, D_I16|Q_DFP, D_I16|Q_DFP) - IO_TYPE(D_F16, D_F16, D_U8|Q_ASYM) - IO_TYPE(D_I8|Q_DFP, D_F16, D_I8|Q_DFP) - IO_TYPE(D_I8|Q_DFP, D_F16, D_F16) - IO_TYPE(D_U8|Q_ASYM, D_F16, D_U8|Q_ASYM) - IO_TYPE(D_U8|Q_ASYM, D_F16, D_F16) - IO_TYPE(D_I16|Q_DFP, D_F16, D_I16|Q_DFP) - IO_TYPE(D_I16|Q_DFP, D_F16, D_F16) IO_TYPE(D_F16, D_F16, D_I16|Q_DFP) + IO_TYPE(D_F16, D_F16, D_I8|Q_DFP) + IO_TYPE(D_F16, D_F16, D_U8|Q_ASYM) + + IO_TYPE(D_F16, D_I16|Q_DFP, D_F16) + IO_TYPE(D_F16, D_I16|Q_DFP, D_I16|Q_DFP) + IO_TYPE(D_F16, D_I8|Q_DFP, D_F16) + IO_TYPE(D_F16, D_I8|Q_DFP, D_I8|Q_DFP) + IO_TYPE(D_F16, D_U8|Q_ASYM, D_F16) + IO_TYPE(D_F16, D_U8|Q_ASYM, D_U8|Q_ASYM) + + IO_TYPE(D_I16|Q_DFP, D_F16, D_F16) + IO_TYPE(D_I16|Q_DFP, D_F16, D_I16|Q_DFP) + IO_TYPE(D_I8|Q_DFP, D_F16, D_F16) + IO_TYPE(D_I8|Q_DFP, D_F16, D_I8|Q_DFP) + IO_TYPE(D_U8|Q_ASYM, D_F16, D_F16) + IO_TYPE(D_U8|Q_ASYM, D_F16, D_U8|Q_ASYM) + + IO_TYPE(D_I16|Q_DFP, D_I16|Q_DFP, D_I16|Q_DFP) + IO_TYPE(D_I8|Q_DFP, D_I8|Q_DFP, D_I8|Q_DFP) + IO_TYPE(D_U8|Q_ASYM, D_U8|Q_ASYM, D_U8|Q_ASYM) + + IO_TYPE(D_BF16, D_BF16, D_BF16) IO_TYPE(D_F32, D_F32, D_F32) IO_TYPE(D_I32, D_I32, D_I32) END_IO_TYPE_DECL(MINIMUM) @@ -232,19 +242,29 @@ static vsi_bool op_check_maximum /* check inputs outputs data type */ BEGIN_IO_TYPE_DECL(MAXIMUM, 2, 1) IO_TYPE(D_F16, D_F16, D_F16) - IO_TYPE(D_BF16, D_BF16, D_BF16) - IO_TYPE(D_F16, D_F16, D_I8|Q_DFP) - IO_TYPE(D_I8|Q_DFP, D_I8|Q_DFP, D_I8|Q_DFP) - IO_TYPE(D_U8|Q_ASYM, D_U8|Q_ASYM, D_U8|Q_ASYM) - IO_TYPE(D_I16|Q_DFP, D_I16|Q_DFP, D_I16|Q_DFP) - IO_TYPE(D_F16, D_F16, D_U8|Q_ASYM) - IO_TYPE(D_I8|Q_DFP, D_F16, D_I8|Q_DFP) - IO_TYPE(D_I8|Q_DFP, D_F16, D_F16) - IO_TYPE(D_U8|Q_ASYM, D_F16, D_U8|Q_ASYM) - IO_TYPE(D_U8|Q_ASYM, D_F16, D_F16) - IO_TYPE(D_I16|Q_DFP, D_F16, D_I16|Q_DFP) - IO_TYPE(D_I16|Q_DFP, D_F16, D_F16) IO_TYPE(D_F16, D_F16, D_I16|Q_DFP) + IO_TYPE(D_F16, D_F16, D_I8|Q_DFP) + IO_TYPE(D_F16, D_F16, D_U8|Q_ASYM) + + IO_TYPE(D_F16, D_I16|Q_DFP, D_F16) + IO_TYPE(D_F16, D_I16|Q_DFP, D_I16|Q_DFP) + IO_TYPE(D_F16, D_I8|Q_DFP, D_F16) + IO_TYPE(D_F16, D_I8|Q_DFP, D_I8|Q_DFP) + IO_TYPE(D_F16, D_U8|Q_ASYM, D_F16) + IO_TYPE(D_F16, D_U8|Q_ASYM, D_U8|Q_ASYM) + + IO_TYPE(D_I16|Q_DFP, D_F16, D_F16) + IO_TYPE(D_I16|Q_DFP, D_F16, D_I16|Q_DFP) + IO_TYPE(D_I8|Q_DFP, D_F16, D_F16) + IO_TYPE(D_I8|Q_DFP, D_F16, 
D_I8|Q_DFP) + IO_TYPE(D_U8|Q_ASYM, D_F16, D_F16) + IO_TYPE(D_U8|Q_ASYM, D_F16, D_U8|Q_ASYM) + + IO_TYPE(D_I16|Q_DFP, D_I16|Q_DFP, D_I16|Q_DFP) + IO_TYPE(D_I8|Q_DFP, D_I8|Q_DFP, D_I8|Q_DFP) + IO_TYPE(D_U8|Q_ASYM, D_U8|Q_ASYM, D_U8|Q_ASYM) + + IO_TYPE(D_BF16, D_BF16, D_BF16) IO_TYPE(D_F32, D_F32, D_F32) IO_TYPE(D_I32, D_I32, D_I32) END_IO_TYPE_DECL(MAXIMUM) @@ -338,6 +358,8 @@ static vsi_bool op_check_add IO_TYPE(D_I8|Q_DFP, D_I8|Q_DFP, D_I8|Q_DFP) IO_TYPE(D_I8|Q_DFP, D_I8|Q_DFP, D_U8|Q_ASYM) IO_TYPE(D_I16|Q_DFP, D_I16|Q_DFP, D_I16|Q_DFP) + IO_TYPE(D_I16, D_I16|Q_DFP, D_I16|Q_DFP) + IO_TYPE(D_I16|Q_DFP, D_I16, D_I16|Q_DFP) IO_TYPE(D_I16|Q_DFP, D_I16|Q_DFP, D_F16) IO_TYPE(D_U8|Q_ASYM, D_U8|Q_ASYM, D_U8|Q_ASYM) IO_TYPE(D_U8|Q_ASYM, D_U8|Q_ASYM, D_F16) @@ -359,6 +381,8 @@ static vsi_bool op_check_add IO_TYPE(D_F16, D_F32, D_F16) IO_TYPE(D_F16, D_F16, D_F32) IO_TYPE(D_I32, D_I32, D_I32) + IO_TYPE(D_I16, D_I32, D_I32) + IO_TYPE(D_I32, D_I16, D_I32) IO_TYPE(D_I32, D_I32, D_U8|Q_ASYM) IO_TYPE(D_I32, D_I32, D_I16|Q_DFP) IO_TYPE(D_I32, D_I32, D_I8|Q_DFP) @@ -409,6 +433,8 @@ static vsi_bool op_check_sub IO_TYPE(D_I8|Q_DFP, D_I8|Q_DFP, D_I8|Q_DFP) IO_TYPE(D_I8|Q_DFP, D_I8|Q_DFP, D_U8|Q_ASYM) IO_TYPE(D_I16|Q_DFP, D_I16|Q_DFP, D_I16|Q_DFP) + IO_TYPE(D_I16, D_I16|Q_DFP, D_I16|Q_DFP) + IO_TYPE(D_I16|Q_DFP, D_I16, D_I16|Q_DFP) IO_TYPE(D_I16|Q_DFP, D_I16|Q_DFP, D_F16) IO_TYPE(D_U8|Q_ASYM, D_U8|Q_ASYM, D_U8|Q_ASYM) IO_TYPE(D_U8|Q_ASYM, D_U8|Q_ASYM, D_F16) @@ -422,6 +448,8 @@ static vsi_bool op_check_sub IO_TYPE(D_F16, D_F32, D_F16) IO_TYPE(D_F16, D_F16, D_F32) IO_TYPE(D_I32, D_I32, D_I32) + IO_TYPE(D_I16, D_I32, D_I32) + IO_TYPE(D_I32, D_I16, D_I32) END_IO_TYPE_DECL(SUBTRACT) if(!VALIDATE_OP_IO_TYPES(SUBTRACT, self, inputs, self->input.num, outputs, self->output.num)) { char* desc = generate_op_io_types_desc(inputs, @@ -446,27 +474,33 @@ static vsi_bool op_check_div { /* check inputs outputs data type */ BEGIN_IO_TYPE_DECL(DIVIDE, 2, 1) - IO_TYPE(D_BF16, D_BF16, D_BF16) - IO_TYPE(D_F16, D_F16, D_F16) - IO_TYPE(D_F16, D_I8|Q_DFP, D_I8|Q_DFP) - IO_TYPE(D_F16, D_I8|Q_DFP, D_F16) - IO_TYPE(D_F16, D_F16, D_I8|Q_DFP) - IO_TYPE(D_F16, D_F16, D_I16|Q_DFP) - IO_TYPE(D_F16, D_F16, D_U8|Q_ASYM) + IO_TYPE(D_BF16, D_BF16, D_BF16) + IO_TYPE(D_F16, D_F16, D_F16) + IO_TYPE(D_F16, D_F16, D_I16|Q_DFP) + IO_TYPE(D_F16, D_F16, D_I8|Q_DFP) + IO_TYPE(D_F16, D_F16, D_U8|Q_ASYM) + IO_TYPE(D_F16, D_I8|Q_DFP, D_I8|Q_DFP) + IO_TYPE(D_F16, D_I8|Q_DFP, D_F16) + IO_TYPE(D_F16, D_U8|Q_ASYM, D_U8|Q_ASYM) + IO_TYPE(D_F16, D_U8|Q_ASYM, D_F16) + IO_TYPE(D_F16, D_I16|Q_DFP, D_I16|Q_DFP) + IO_TYPE(D_F16, D_I16|Q_DFP, D_F16) IO_TYPE(D_U8|Q_ASYM, D_U8|Q_ASYM, D_U8|Q_ASYM) IO_TYPE(D_U8|Q_ASYM, D_U8|Q_ASYM, D_F16) - IO_TYPE(D_U8|Q_ASYM, D_F16, D_U8|Q_ASYM) - IO_TYPE(D_U8|Q_ASYM, D_F16, D_F16) + IO_TYPE(D_U8|Q_ASYM, D_F16, D_U8|Q_ASYM) + IO_TYPE(D_U8|Q_ASYM, D_F16, D_F16) IO_TYPE(D_U8|Q_ASYM, D_I16|Q_DFP, D_U8|Q_ASYM) IO_TYPE(D_U8|Q_ASYM, D_I16|Q_DFP, D_I16|Q_DFP) IO_TYPE(D_I8|Q_DFP, D_I8|Q_DFP, D_I8|Q_DFP) IO_TYPE(D_I8|Q_DFP, D_I8|Q_DFP, D_F16) - IO_TYPE(D_I8|Q_DFP, D_F16, D_I8|Q_DFP) - IO_TYPE(D_I8|Q_DFP, D_F16, D_F16) + IO_TYPE(D_I8|Q_DFP, D_F16, D_I8|Q_DFP) + IO_TYPE(D_I8|Q_DFP, D_F16, D_F16) IO_TYPE(D_I16|Q_DFP, D_I16|Q_DFP, D_I16|Q_DFP) + IO_TYPE(D_I16, D_I16|Q_DFP, D_I16|Q_DFP) + IO_TYPE(D_I16|Q_DFP, D_I16, D_I16|Q_DFP) IO_TYPE(D_I16|Q_DFP, D_I16|Q_DFP, D_F16) - IO_TYPE(D_I16|Q_DFP, D_F16, D_I16|Q_DFP) - IO_TYPE(D_I16|Q_DFP, D_F16, D_F16) + IO_TYPE(D_I16|Q_DFP, D_F16, D_I16|Q_DFP) + IO_TYPE(D_I16|Q_DFP, D_F16, D_F16) IO_TYPE(D_F32, D_F32, 
D_F32) IO_TYPE(D_F32, D_F32, D_F16) IO_TYPE(D_F32, D_F16, D_F32) @@ -475,6 +509,8 @@ static vsi_bool op_check_div IO_TYPE(D_F16, D_F32, D_F16) IO_TYPE(D_F16, D_F16, D_F32) IO_TYPE(D_I32, D_I32, D_I32) + IO_TYPE(D_I16, D_I32, D_I32) + IO_TYPE(D_I32, D_I16, D_I32) END_IO_TYPE_DECL(DIVIDE) if(!VALIDATE_OP_IO_TYPES(DIVIDE, self, inputs, self->input.num, outputs, self->output.num)) { char* desc = generate_op_io_types_desc(inputs, @@ -521,6 +557,8 @@ static vsi_bool op_check_mul IO_TYPE(D_I8|Q_DFP, D_I8|Q_DFP, D_F16) IO_TYPE(D_I8|Q_DFP, D_I8|Q_DFP, D_I8|Q_DFP) IO_TYPE(D_I16|Q_DFP, D_I16|Q_DFP, D_I16|Q_DFP) + IO_TYPE(D_I16, D_I16|Q_DFP, D_I16|Q_DFP) + IO_TYPE(D_I16|Q_DFP, D_I16, D_I16|Q_DFP) IO_TYPE(D_I16|Q_DFP, D_I16|Q_DFP, D_F16) IO_TYPE(D_U8|Q_ASYM, D_U8|Q_ASYM, D_U8|Q_ASYM) IO_TYPE(D_U8|Q_ASYM, D_U8|Q_ASYM, D_F16) @@ -538,6 +576,8 @@ static vsi_bool op_check_mul IO_TYPE(D_F16, D_F32, D_F16) IO_TYPE(D_F16, D_F16, D_F32) IO_TYPE(D_I32, D_I32, D_I32) + IO_TYPE(D_I16, D_I32, D_I32) + IO_TYPE(D_I32, D_I16, D_I32) END_IO_TYPE_DECL(MULTIPLY) if(!VALIDATE_OP_IO_TYPES(MULTIPLY, self, inputs, self->input.num, outputs, self->output.num)) { char* desc = generate_op_io_types_desc(inputs, diff --git a/src/tim/vx/internal/src/ops/vsi_nn_op_eltwise_unary.c b/src/tim/vx/internal/src/ops/vsi_nn_op_eltwise_unary.c index c74da7a..d55ac92 100644 --- a/src/tim/vx/internal/src/ops/vsi_nn_op_eltwise_unary.c +++ b/src/tim/vx/internal/src/ops/vsi_nn_op_eltwise_unary.c @@ -196,6 +196,7 @@ DEF_ELEMENT_WISE_UNARY_OP( ELU, elu ); DEF_ELEMENT_WISE_UNARY_OP( NEG, neg ); DEF_ELEMENT_WISE_UNARY_OP( HARD_SIGMOID, hard_sigmoid ); DEF_ELEMENT_WISE_UNARY_OP( MISH, mish ); +DEF_ELEMENT_WISE_UNARY_OP( ROUND, round ); #undef DEF_ELEMENT_UNARY_WISE_OP diff --git a/src/tim/vx/internal/src/ops/vsi_nn_op_erf.c b/src/tim/vx/internal/src/ops/vsi_nn_op_erf.c new file mode 100644 index 0000000..5da991f --- /dev/null +++ b/src/tim/vx/internal/src/ops/vsi_nn_op_erf.c @@ -0,0 +1,128 @@ +/**************************************************************************** +* +* Copyright (c) 2020 Vivante Corporation +* +* Permission is hereby granted, free of charge, to any person obtaining a +* copy of this software and associated documentation files (the "Software"), +* to deal in the Software without restriction, including without limitation +* the rights to use, copy, modify, merge, publish, distribute, sublicense, +* and/or sell copies of the Software, and to permit persons to whom the +* Software is furnished to do so, subject to the following conditions: +* +* The above copyright notice and this permission notice shall be included in +* all copies or substantial portions of the Software. +* +* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +* AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING +* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER +* DEALINGS IN THE SOFTWARE. 
+* +*****************************************************************************/ + + +#include +#include + +#include "vsi_nn_types.h" +#include "vsi_nn_log.h" +#include "vsi_nn_node.h" +#include "vsi_nn_prv.h" +#include "vsi_nn_ops.h" +#include "vsi_nn_tensor.h" +#include "utils/vsi_nn_util.h" +#include "kernel/vsi_nn_kernel.h" +#include "utils/vsi_nn_constraint_check.h" + +/* + Declare number of input and output. + */ +#define _INPUT_NUM (1) +#define _OUTPUT_NUM (1) + +static vsi_status op_compute + ( + vsi_nn_node_t * self, + vsi_nn_tensor_t ** inputs, + vsi_nn_tensor_t ** outputs + ) +{ + vsi_status status = VSI_FAILURE; + + self->n = (vx_node)vsi_nn_kernel_selector( self->graph, "erf", + inputs, 1, outputs, 1, NULL ); + + if( self->n ) + { + status = VSI_SUCCESS; + } + + return status; +} /* op_compute() */ + +static vsi_bool op_check + ( + vsi_nn_node_t * self, + vsi_nn_tensor_t ** inputs, + vsi_nn_tensor_t ** outputs + ) +{ + BEGIN_IO_TYPE_DECL(ERF, 1, 1) + /* IO_TYPE(INPUT, OUTPUT) */ + IO_TYPE(D_F32, D_F32) + IO_TYPE(D_F32, D_F16) + IO_TYPE(D_F32, D_BF16) + + IO_TYPE(D_F16, D_F32) + IO_TYPE(D_F16, D_F16) + IO_TYPE(D_F16, D_U8|Q_ASYM) + IO_TYPE(D_F16, D_I8|Q_DFP) + IO_TYPE(D_F16, D_I16|Q_DFP) + + IO_TYPE(D_BF16, D_BF16) + IO_TYPE(D_BF16, D_F32) + + IO_TYPE(D_U8|Q_ASYM, D_U8|Q_ASYM) + IO_TYPE(D_U8|Q_ASYM, D_F16) + + IO_TYPE(D_I8|Q_ASYM, D_I8|Q_ASYM) + IO_TYPE(D_I8|Q_ASYM, D_F16) + + IO_TYPE(D_I8|Q_DFP, D_I8|Q_DFP) + IO_TYPE(D_I8|Q_DFP, D_F16) + + IO_TYPE(D_I16|Q_DFP, D_I16|Q_DFP) + IO_TYPE(D_I16|Q_DFP, D_F16) + END_IO_TYPE_DECL(ERF) + if (!VALIDATE_OP_IO_TYPES(ERF, self, inputs, self->input.num, outputs, self->output.num)) + { + char* desc = generate_op_io_types_desc(inputs, + self->input.num, outputs, self->output.num); + VSILOGE("Inputs/Outputs data type not support: %s", desc); + destroy_op_io_types_desc(desc); + return FALSE; + } + + return TRUE; +} /* op_check() */ + + +__BEGIN_DECLS + +/* Registrar */ +DEF_OP_REG + ( + /* op_name */ ERF, + /* init */ NULL, + /* compute */ op_compute, + /* deinit */ vsi_nn_op_common_deinit, + /* check */ op_check, + /* setup */ vsi_nn_op_common_setup, + /* optimize */ NULL, + /* input_num */ _INPUT_NUM, + /* output_num */ _OUTPUT_NUM + ); + +__END_DECLS diff --git a/src/tim/vx/internal/src/ops/vsi_nn_op_extra_ending.c b/src/tim/vx/internal/src/ops/vsi_nn_op_extra_ending.c index a3b7fb7..009e75d 100644 --- a/src/tim/vx/internal/src/ops/vsi_nn_op_extra_ending.c +++ b/src/tim/vx/internal/src/ops/vsi_nn_op_extra_ending.c @@ -34,7 +34,7 @@ #include "vsi_nn_ops.h" #include "vsi_nn_tensor.h" #include "vsi_nn_tensor_util.h" -#include "client/vsi_nn_vxkernel.h" +#include "libnnext/vsi_nn_vxkernel.h" #include "utils/vsi_nn_constraint_check.h" #define _INPUT_NUM (1) diff --git a/src/tim/vx/internal/src/ops/vsi_nn_op_floordiv.c b/src/tim/vx/internal/src/ops/vsi_nn_op_floordiv.c index 9cd0bd2..4c19f01 100644 --- a/src/tim/vx/internal/src/ops/vsi_nn_op_floordiv.c +++ b/src/tim/vx/internal/src/ops/vsi_nn_op_floordiv.c @@ -34,7 +34,7 @@ #include "vsi_nn_tensor_util.h" #include "vsi_nn_prv.h" #include "vsi_nn_log.h" -#include "client/vsi_nn_vxkernel.h" +#include "libnnext/vsi_nn_vxkernel.h" #include "kernel/vsi_nn_kernel_eltwise.h" #include "utils/vsi_nn_constraint_check.h" @@ -68,7 +68,6 @@ static vsi_status op_compute if( ret ) { - reshape_tensors[0] = vsi_nn_reshape_tensor( self->graph, inputs[0], (uint32_t*)shapes[0], new_rank ); reshape_tensors[1] = vsi_nn_reshape_tensor( self->graph, @@ -105,6 +104,7 @@ static vsi_bool op_check IO_TYPE(D_F16, D_F16, 
D_F16) IO_TYPE(D_F32, D_F32, D_F32) IO_TYPE(D_I32, D_I32, D_I32) + IO_TYPE(D_I32, D_I32, D_U8|Q_ASYM) IO_TYPE(D_BF16, D_BF16, D_BF16) IO_TYPE(D_U8|Q_ASYM, D_U8|Q_ASYM, D_U8|Q_ASYM) IO_TYPE(D_U8|Q_ASYM, D_U8|Q_ASYM, D_F16) @@ -112,6 +112,7 @@ static vsi_bool op_check IO_TYPE(D_I8|Q_DFP, D_I8|Q_DFP, D_F16) IO_TYPE(D_I16|Q_DFP, D_I16|Q_DFP, D_I16|Q_DFP) IO_TYPE(D_I16|Q_DFP, D_I16|Q_DFP, D_F16) + IO_TYPE(D_U8|Q_ASYM, D_I32, D_U8|Q_ASYM) END_IO_TYPE_DECL(FLOORDIV) if(!VALIDATE_OP_IO_TYPES(FLOORDIV, self, inputs, self->input.num, outputs, self->output.num)) { char* desc = generate_op_io_types_desc(inputs, @@ -189,4 +190,3 @@ DEF_OP_REG #ifdef __cplusplus } #endif - diff --git a/src/tim/vx/internal/src/ops/vsi_nn_op_fullconnect.c b/src/tim/vx/internal/src/ops/vsi_nn_op_fullconnect.c index b2023b6..7d95b97 100644 --- a/src/tim/vx/internal/src/ops/vsi_nn_op_fullconnect.c +++ b/src/tim/vx/internal/src/ops/vsi_nn_op_fullconnect.c @@ -115,7 +115,12 @@ static vsi_bool op_check /* Check fl and scale*/ ret = vsi_nn_QuantCheck(inputs[0], inputs[1], inputs[2]); - ret = ret && vsi_nn_OpCheck(VSI_NN_OP_FCL_RELU, self, inputs, outputs); + if (!ret) + { + return ret; + } + + ret = vsi_nn_OpCheck(VSI_NN_OP_FCL_RELU, self, inputs, outputs); if(!ret) { /* check inputs outputs data type */ diff --git a/src/tim/vx/internal/src/ops/vsi_nn_op_fullconnect2.c b/src/tim/vx/internal/src/ops/vsi_nn_op_fullconnect2.c index d1af0cd..25e951c 100644 --- a/src/tim/vx/internal/src/ops/vsi_nn_op_fullconnect2.c +++ b/src/tim/vx/internal/src/ops/vsi_nn_op_fullconnect2.c @@ -33,7 +33,7 @@ #include "vsi_nn_ops.h" #include "vsi_nn_tensor.h" #include "vsi_nn_tensor_util.h" -#include "client/vsi_nn_vxkernel.h" +#include "libnnext/vsi_nn_vxkernel.h" #include "utils/vsi_nn_util.h" #define _ARG_NUM (2) diff --git a/src/tim/vx/internal/src/ops/vsi_nn_op_gather.c b/src/tim/vx/internal/src/ops/vsi_nn_op_gather.c index 81af2ce..d373015 100644 --- a/src/tim/vx/internal/src/ops/vsi_nn_op_gather.c +++ b/src/tim/vx/internal/src/ops/vsi_nn_op_gather.c @@ -99,13 +99,15 @@ static vsi_bool op_check ) { BEGIN_IO_TYPE_DECL(GATHER, 2, 1) - IO_TYPE(D_I32, D_I32, D_I32) - IO_TYPE(D_F32, D_I32, D_F32) - IO_TYPE(D_F16, D_I32, D_U8|Q_ASYM) - IO_TYPE(D_F16, D_I32, D_I16|Q_DFP) - IO_TYPE(D_F16, D_I32, D_I8|Q_DFP) - IO_TYPE(D_F16, D_I32, D_F16) + IO_TYPE(D_I32, D_I32, D_I32) + IO_TYPE(D_F32, D_I32, D_F32) + IO_TYPE(D_F16, D_I32, D_U8|Q_ASYM) + IO_TYPE(D_F16, D_I32, D_I16|Q_DFP) + IO_TYPE(D_F16, D_I32, D_I8|Q_DFP) + IO_TYPE(D_F16, D_I32, D_F16) + IO_TYPE(D_BF16, D_I32, D_BF16) IO_TYPE(D_U8|Q_ASYM, D_I32, D_U8|Q_ASYM) + IO_TYPE(D_U8, D_I32, D_U8) IO_TYPE(D_U8|Q_ASYM, D_I32, D_F16) IO_TYPE(D_I8|Q_DFP, D_I32, D_I8|Q_DFP) IO_TYPE(D_I8|Q_DFP, D_I32, D_F16) diff --git a/src/tim/vx/internal/src/ops/vsi_nn_op_gather_nd.c b/src/tim/vx/internal/src/ops/vsi_nn_op_gather_nd.c index 9d5341a..6d55fad 100644 --- a/src/tim/vx/internal/src/ops/vsi_nn_op_gather_nd.c +++ b/src/tim/vx/internal/src/ops/vsi_nn_op_gather_nd.c @@ -98,10 +98,11 @@ static vsi_bool op_check BEGIN_IO_TYPE_DECL(GATHER_ND, 2, 1) IO_TYPE(D_I32, D_I32, D_I32) IO_TYPE(D_F32, D_I32, D_F32) - IO_TYPE(D_F16, D_I32, D_U8|Q_ASYM) - IO_TYPE(D_F16, D_I32, D_I16|Q_DFP) - IO_TYPE(D_F16, D_I32, D_I8|Q_DFP) - IO_TYPE(D_F16, D_I32, D_F16) + IO_TYPE(D_F16, D_I32, D_U8|Q_ASYM) + IO_TYPE(D_F16, D_I32, D_I16|Q_DFP) + IO_TYPE(D_F16, D_I32, D_I8|Q_DFP) + IO_TYPE(D_F16, D_I32, D_F16) + IO_TYPE(D_BF16, D_I32, D_BF16) IO_TYPE(D_U8|Q_ASYM, D_I32, D_U8|Q_ASYM) IO_TYPE(D_U8|Q_ASYM, D_I32, D_F16) IO_TYPE(D_I8|Q_DFP, D_I32, D_I8|Q_DFP) 
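Reading aid (not part of the patch): the ERF hunk above and the GENERATE_PROPOSALS and GROUP_NORM hunks below all move op_compute onto the vsi_nn_kernel_param / vsi_nn_kernel_selector flow that this update introduces. A minimal sketch of that calling pattern, assuming a hypothetical single-input op — the name "my_op", the "alpha" parameter, and my_op_compute itself are illustrative only and do not exist in this patch:

/* Sketch only: hypothetical op_compute using the kernel-selector flow shown in this patch. */
#include "vsi_nn_types.h"
#include "kernel/vsi_nn_kernel.h"

static vsi_status my_op_compute
    (
    vsi_nn_node_t * self,
    vsi_nn_tensor_t ** inputs,
    vsi_nn_tensor_t ** outputs
    )
{
    vsi_status status = VSI_FAILURE;
    vsi_nn_kernel_param_t * param = NULL;

    /* Scalar attributes travel through a key/value parameter list. */
    param = vsi_nn_kernel_param_create();
    vsi_nn_kernel_param_add_float32( param, "alpha", 0.5f ); /* "alpha" is a hypothetical attribute */

    /* The selector is expected to dispatch to an evis/cl/cpu kernel registered under this name. */
    self->n = (vx_node)vsi_nn_kernel_selector( self->graph, "my_op",
            inputs, 1, outputs, 1, param );
    if( self->n )
    {
        status = VSI_SUCCESS;
    }

    vsi_nn_kernel_param_release( &param );

    return status;
} /* my_op_compute() */

The same create / add / select / release sequence appears verbatim in the real hunks (e.g. "eps" and "group_num" for GROUP_NORM, the stride and NMS parameters for GENERATE_PROPOSALS); only the op name, parameter keys, and tensor counts differ per op.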
diff --git a/src/tim/vx/internal/src/ops/vsi_nn_op_generate_proposals.c b/src/tim/vx/internal/src/ops/vsi_nn_op_generate_proposals.c index 927123b..cc42045 100644 --- a/src/tim/vx/internal/src/ops/vsi_nn_op_generate_proposals.c +++ b/src/tim/vx/internal/src/ops/vsi_nn_op_generate_proposals.c @@ -34,163 +34,11 @@ #include "vsi_nn_ops.h" #include "vsi_nn_tensor.h" #include "vsi_nn_tensor_util.h" -#include "client/vsi_nn_vxkernel.h" +#include "kernel/vsi_nn_kernel.h" -#define _ARG_NUM (6) #define _INPUT_NUM (4) #define _OUTPUT_NUM (3) #define _IO_NUM (_INPUT_NUM + _OUTPUT_NUM) -#define _PARAM_NUM (_ARG_NUM + _IO_NUM) - -extern vx_kernel_description_t * vx_kernel_GENERATE_PROPOSALS_list[]; - -static void _set_inputs_outputs - ( - vx_reference * params, - vsi_nn_tensor_t ** inputs, - vsi_nn_tensor_t ** outputs - ) -{ - uint32_t i; - uint32_t cnt; - - /* Set inputs */ - cnt = 0; - for( i = 0; i < _INPUT_NUM; i ++, cnt ++ ) - { - params[cnt] = (vx_reference)inputs[i]->t; - } - - /* Set outputs */ - for( i = 0; i < _OUTPUT_NUM; i ++, cnt ++ ) - { - params[cnt] = (vx_reference)outputs[i]->t; - } -} /* _set_inputs_outputs() */ - -static vsi_status _create_params - ( - vsi_nn_node_t * node, - vx_reference * params, - uint32_t num - ) -{ - vsi_status status; - vx_context ctx; - vsi_nn_generate_proposals_param * p; - if( 0 == num ) - { - return VSI_SUCCESS; - } - memset( params, 0, sizeof( vx_reference * ) * num ); - p = &(node->nn_param.generate_proposals); - ctx = vxGetContext( (vx_reference)node->graph->g ); - /* Init parameters */ - #define _SET_PARAM( i, type, arg ) do{ \ - params[i] = (vx_reference)vxCreateScalar( ctx, type, &p->arg ); \ - status = vxGetStatus( params[i] ); \ - if( VSI_SUCCESS != status ) { \ - goto set_param_error; \ - } \ - } while(0) - _SET_PARAM( 0, VX_TYPE_FLOAT32, height_stride ); - _SET_PARAM( 1, VX_TYPE_FLOAT32, width_stride ); - _SET_PARAM( 2, VX_TYPE_INT32, pre_nms_top_n ); - _SET_PARAM( 3, VX_TYPE_INT32, post_nms_top_n ); - _SET_PARAM( 4, VX_TYPE_FLOAT32, iou_threshold ); - _SET_PARAM( 5, VX_TYPE_FLOAT32, min_size ); - #undef _SET_PARAM -set_param_error: - - return status; -} /* _create_params */ - -static void _release_params - ( - vx_reference * params, - uint32_t num - ) -{ - uint32_t i; - vx_scalar scalar; - for( i = 0; i < num; i ++ ) - { - scalar = (vx_scalar)params[i]; - vxReleaseScalar( &scalar ); - } -} /* _release_params() */ - -static vsi_status cpu_op_compute - ( - vsi_nn_node_t * self, - vsi_nn_tensor_t ** inputs, - vsi_nn_tensor_t ** outputs - ) -{ - vsi_status status = VSI_SUCCESS; - vx_reference params[_PARAM_NUM]; - vx_reference * args; - - args = ¶ms[_IO_NUM]; - - if( NULL == self->n ) - { - return VSI_FAILURE; - } - - /* Set inputs and outputs */ - _set_inputs_outputs( params, inputs, outputs ); - - /* Init parameters. */ - _create_params( self, args, _ARG_NUM ); - - /* Pass parameters to node. */ - status = vsi_nn_ClientNodePassParameters( self->n, params, _PARAM_NUM ); - - _release_params( args, _ARG_NUM ); - - return status; -} - -static vsi_status vx_op_compute - ( - vsi_nn_node_t * self, - vsi_nn_tensor_t ** inputs, - vsi_nn_tensor_t ** outputs - ) -{ - vsi_status status = VSI_SUCCESS; - vx_reference params[_PARAM_NUM]; - vx_reference * args; - - args = ¶ms[_IO_NUM]; - - if( NULL == self->n ) - { - return VSI_FAILURE; - } - - /* Set inputs and outputs */ - _set_inputs_outputs( params, inputs, outputs ); - /*TODO: Add code if need to change your parameter*/ - - /* Init parameters. 
*/ - _create_params( self, args, _ARG_NUM ); - - /* Pass parameters to node. */ - status = vsi_nn_ClientNodePassParameters( self->n, params, _PARAM_NUM ); - - _release_params( args, _ARG_NUM ); - - return status; -} - -static vsi_nn_op_compute_t op_compute_list[] = -{ - cpu_op_compute, - vx_op_compute, - NULL -}; static vsi_status op_compute ( @@ -199,46 +47,27 @@ static vsi_status op_compute vsi_nn_tensor_t ** outputs ) { - vsi_status status; - vsi_nn_kernel_info_t kernel_info; - char *path = NULL; + vsi_status status = VSI_FAILURE; + vsi_nn_kernel_param_t * param = NULL; - memset(&kernel_info, 0x0, sizeof(vsi_nn_kernel_info_t)); - status = VSI_FAILURE; - kernel_info.type = VX_KERNEL_TYPE_CPU; - kernel_info.kernel = vx_kernel_GENERATE_PROPOSALS_list; - kernel_info.resource_num = 1; - kernel_info.resource_name = (char **)malloc(kernel_info.resource_num * sizeof(char *)); - kernel_info.resource_name[0] = "vsi_nn_kernel_generate_proposals"; - path = getenv("USER_VX_SOURCE_PATH"); - if(path) - vsi_nn_VxResourceSetPath(path); + param = vsi_nn_kernel_param_create(); - if( kernel_info.type == VX_KERNEL_TYPE_VX) + vsi_nn_kernel_param_add_float32( param, "height_stride", self->nn_param.generate_proposals.height_stride ); + vsi_nn_kernel_param_add_float32( param, "width_stride", self->nn_param.generate_proposals.width_stride ); + vsi_nn_kernel_param_add_int32( param, "pre_nms_top_n", self->nn_param.generate_proposals.pre_nms_top_n); + vsi_nn_kernel_param_add_int32( param, "post_nms_top_n", self->nn_param.generate_proposals.post_nms_top_n); + vsi_nn_kernel_param_add_float32( param, "iou_threshold", self->nn_param.generate_proposals.iou_threshold ); + vsi_nn_kernel_param_add_float32( param, "min_size", self->nn_param.generate_proposals.min_size ); + + self->n = (vx_node)vsi_nn_kernel_selector( self->graph, "cpu beckend conv2d", + inputs, _INPUT_NUM, outputs, _OUTPUT_NUM, param ); + + if( self->n ) { - kernel_info.kernel_index = 1; - kernel_info.init_index = 1; - } - else /*kernel_info.type = VX_KERNEL_TYPE_CPU;*/ - { - kernel_info.kernel_index = 0; - kernel_info.init_index = 0; + status = VSI_SUCCESS; } - self->n = vsi_nn_RegisterClientKernelAndNewNode( - self->graph, &kernel_info); - if (kernel_info.resource_name) - { - free(kernel_info.resource_name); - } - if( NULL == self->n ) - { - return VSI_FAILURE; - } - if (NULL != op_compute_list[kernel_info.init_index]) - { - status = op_compute_list[kernel_info.init_index](self, inputs, outputs); - } + vsi_nn_kernel_param_release( ¶m ); return status; } /* op_compute() */ diff --git a/src/tim/vx/internal/src/ops/vsi_nn_op_grouped_conv1d.c b/src/tim/vx/internal/src/ops/vsi_nn_op_grouped_conv1d.c new file mode 100644 index 0000000..d979662 --- /dev/null +++ b/src/tim/vx/internal/src/ops/vsi_nn_op_grouped_conv1d.c @@ -0,0 +1,207 @@ +/**************************************************************************** +* +* Copyright (c) 2020 Vivante Corporation +* +* Permission is hereby granted, free of charge, to any person obtaining a +* copy of this software and associated documentation files (the "Software"), +* to deal in the Software without restriction, including without limitation +* the rights to use, copy, modify, merge, publish, distribute, sublicense, +* and/or sell copies of the Software, and to permit persons to whom the +* Software is furnished to do so, subject to the following conditions: +* +* The above copyright notice and this permission notice shall be included in +* all copies or substantial portions of the Software. 
+* +* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +* AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING +* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER +* DEALINGS IN THE SOFTWARE. +* +*****************************************************************************/ + + +#include +#include + +#include "vsi_nn_types.h" +#include "vsi_nn_log.h" +#include "vsi_nn_node.h" +#include "vsi_nn_prv.h" +#include "vsi_nn_ops.h" +#include "vsi_nn_tensor.h" +#include "utils/vsi_nn_util.h" +#include "kernel/vsi_nn_kernel.h" +#include "vsi_nn_internal_node.h" +#include "vsi_nn_tensor_util.h" +#include "utils/vsi_nn_constraint_check.h" + +#define _INPUT_NUM (3) +#define _OUTPUT_NUM (1) + +static vsi_nn_tensor_t * _expand_tensor_dim + ( vsi_nn_graph_t * graph, vsi_nn_tensor_t *tensor, uint32_t * shape, size_t rank, int32_t expand_dim ) +{ + uint32_t new_shape[VSI_NN_MAX_DIM_NUM] = { 0 }; + uint32_t i, cnt; + if ( expand_dim < 0 ) + { + expand_dim = (int32_t)rank + expand_dim; + } + if ( expand_dim < 0 || (uint32_t)expand_dim > rank ) + { + VSILOGE("Run dim to expand %d, rank is %lu", expand_dim, rank); + return NULL; + } + for ( i = 0, cnt = 0; i < rank; i ++ ) + { + if ( i == (uint32_t)expand_dim ) + { + new_shape[cnt] = 1; + cnt ++; + } + new_shape[cnt] = shape[i]; + cnt ++; + } + if ( (uint32_t)expand_dim == rank ) + { + new_shape[cnt] = 1; + } + + return vsi_nn_reshape_tensor( graph, tensor, new_shape, (uint32_t)rank + 1 ); +} /* _expand_tensor_dim() */ + +static vsi_status op_compute + ( + vsi_nn_node_t * self, + vsi_nn_tensor_t ** inputs, + vsi_nn_tensor_t ** outputs + ) +{ + return vsi_nn_internal_compute_node( self ); +} /* op_compute() */ + +static vsi_bool op_check + ( + vsi_nn_node_t * self, + vsi_nn_tensor_t ** inputs, + vsi_nn_tensor_t ** outputs + ) +{ + vsi_bool ret = FALSE; + + ret = vsi_nn_OpCheck(VSI_NN_OP_CONV2D, self, inputs, outputs); + + return ret; +} /* op_check() */ + +static vsi_bool op_setup + ( + vsi_nn_node_t * self, + vsi_nn_tensor_t ** inputs, + vsi_nn_tensor_t ** outputs + ) +{ + vsi_nn_internal_node_t* curr = NULL; + vsi_nn_grouped_conv1d_param* p = &self->nn_param.grouped_conv1d; + + vsi_nn_internal_init_node_wksp(self); + + if ( VSI_NN_DIM_AUTO == outputs[0]->attr.dim_num ) + { + outputs[0]->attr.size[0] = vsi_nn_ComputeFilterSize + ( + inputs[0]->attr.size[0], + inputs[1]->attr.size[0], + p->pad, + p->stride, + p->dilation, + VSI_NN_ROUND_FLOOR + ); + + outputs[0]->attr.size[1] = inputs[1]->attr.size[2]; + outputs[0]->attr.size[2] = inputs[0]->attr.size[2]; + outputs[0]->attr.dim_num = 3; + } + + p->local->input = _expand_tensor_dim( self->graph, inputs[0], + inputs[0]->attr.size, inputs[0]->attr.dim_num, 0 ); + p->local->weight = _expand_tensor_dim( self->graph, inputs[1], + inputs[1]->attr.size, inputs[1]->attr.dim_num, 0 ); + p->local->output = _expand_tensor_dim( self->graph, outputs[0], + outputs[0]->attr.size, outputs[0]->attr.dim_num, 0 ); + + + curr = vsi_nn_internal_new_node(self, VSI_NN_OP_GROUPED_CONV2D, 0, 0); + curr->inputs[0] = p->local->input; + curr->inputs[1] = p->local->weight; + curr->inputs[2] = inputs[2]; + curr->outputs[0] = p->local->output; + curr->node->nn_param.grouped_conv2d.ksize[0] = 1; + curr->node->nn_param.grouped_conv2d.ksize[1] = p->ksize; + 
curr->node->nn_param.grouped_conv2d.dilation[0] = 1; + curr->node->nn_param.grouped_conv2d.dilation[1] = p->dilation; + curr->node->nn_param.grouped_conv2d.pad[0] = 0; + curr->node->nn_param.grouped_conv2d.pad[1] = p->pad[0]; + curr->node->nn_param.grouped_conv2d.pad[2] = 0; + curr->node->nn_param.grouped_conv2d.pad[3] = p->pad[1]; + curr->node->nn_param.grouped_conv2d.stride[0] = 1; + curr->node->nn_param.grouped_conv2d.stride[1] = p->stride; + curr->node->nn_param.grouped_conv2d.group = p->group; + curr->node->nn_param.grouped_conv2d.multiplier = p->multiplier; + curr->node->nn_param.grouped_conv2d.weights = p->weights; + curr->node->nn_param.grouped_conv2d.pad_type = p->pad_type; + + vsi_nn_internal_setup_node(self, curr); + + return TRUE; +} /* op_setup() */ + +static vsi_status op_init + ( + vsi_nn_node_t* self + ) +{ + + self->nn_param.grouped_conv1d.local = (grouped_conv1d_local_data_t *)malloc(sizeof(grouped_conv1d_local_data_t)); + memset(self->nn_param.grouped_conv1d.local, 0x00, sizeof(grouped_conv1d_local_data_t)); + + return VSI_SUCCESS; +} /* op_init() */ + +static vsi_status op_deinit + ( + vsi_nn_node_t* self + ) +{ + vsi_nn_grouped_conv1d_param* p = &self->nn_param.grouped_conv1d; + vsi_nn_internal_deinit_node_wksp(self); + + vsi_safe_release_tensor(p->local->input); + vsi_safe_release_tensor(p->local->weight); + vsi_safe_release_tensor(p->local->output); + vsi_nn_safe_free(p->local); + + return vsi_nn_op_common_deinit(self); +} /* op_deinit() */ + +__BEGIN_DECLS + +/* Registrar */ +DEF_OP_REG + ( + /* op_name */ GROUPED_CONV1D, + /* init */ op_init, + /* compute */ op_compute, + /* deinit */ op_deinit, + /* check */ op_check, + /* setup */ op_setup, + /* optimize */ NULL, + /* input_num */ _INPUT_NUM, + /* output_num */ _OUTPUT_NUM + ); + +__END_DECLS + diff --git a/src/tim/vx/internal/src/ops/vsi_nn_op_grouped_conv2d.c b/src/tim/vx/internal/src/ops/vsi_nn_op_grouped_conv2d.c index 30e6f94..32b62af 100644 --- a/src/tim/vx/internal/src/ops/vsi_nn_op_grouped_conv2d.c +++ b/src/tim/vx/internal/src/ops/vsi_nn_op_grouped_conv2d.c @@ -36,7 +36,7 @@ #include "vsi_nn_tensor_util.h" #include "utils/vsi_nn_util.h" #include "utils/vsi_nn_dtype_util.h" -#include "client/vsi_nn_vxkernel.h" +#include "libnnext/vsi_nn_vxkernel.h" #include "utils/vsi_nn_constraint_check.h" #define _ARG_NUM (1) @@ -207,7 +207,11 @@ static vsi_bool op_check vsi_nn_tensor_t ** outputs ) { - return vsi_nn_OpCheck(VSI_NN_OP_CONV2D, self, inputs, outputs); + vsi_bool ret = FALSE; + + ret = vsi_nn_OpCheck(VSI_NN_OP_CONV2D, self, inputs, outputs); + + return ret; } /* op_check() */ static vsi_bool op_setup diff --git a/src/tim/vx/internal/src/ops/vsi_nn_op_groupnormalize.c b/src/tim/vx/internal/src/ops/vsi_nn_op_groupnormalize.c new file mode 100644 index 0000000..a217600 --- /dev/null +++ b/src/tim/vx/internal/src/ops/vsi_nn_op_groupnormalize.c @@ -0,0 +1,297 @@ +/**************************************************************************** +* +* Copyright (c) 2020 Vivante Corporation +* +* Permission is hereby granted, free of charge, to any person obtaining a +* copy of this software and associated documentation files (the "Software"), +* to deal in the Software without restriction, including without limitation +* the rights to use, copy, modify, merge, publish, distribute, sublicense, +* and/or sell copies of the Software, and to permit persons to whom the +* Software is furnished to do so, subject to the following conditions: +* +* The above copyright notice and this permission notice shall be included in +* 
all copies or substantial portions of the Software. +* +* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +* AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING +* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER +* DEALINGS IN THE SOFTWARE. +* +*****************************************************************************/ +#include +#include + +#include "vsi_nn_types.h" +#include "vsi_nn_platform.h" +#include "vsi_nn_graph.h" +#include "vsi_nn_node.h" +#include "utils/vsi_nn_math.h" +#include "vsi_nn_ops.h" +#include "vsi_nn_tensor.h" +#include "vsi_nn_tensor_util.h" +#include "vsi_nn_prv.h" +#include "vsi_nn_log.h" +#include "libnnext/vsi_nn_vxkernel.h" +#include "kernel/vsi_nn_kernel.h" +#include "utils/vsi_nn_constraint_check.h" + +#define _INPUT_NUM (3) +#define _OUTPUT_NUM (1) + +static vsi_status _try_set_high_presision_tensor + ( + vsi_nn_tensor_t **inputs + ) +{ + vsi_status status; + vsi_nn_vxtensor_attr_t attr; + + status = VSI_SUCCESS; + attr = VSI_NN_TENSOR_ATTR_HIGH_PRECISION; + + if (VSI_NN_TYPE_FLOAT32 == inputs[1]->attr.dtype.vx_type) + { + status = vsi_nn_SetTensorAttr(inputs[1], attr); + if (VSI_SUCCESS != status) + { + return status; + } + } + if (VSI_NN_TYPE_FLOAT32 == inputs[2]->attr.dtype.vx_type) + { + status = vsi_nn_SetTensorAttr(inputs[2], attr); + if (VSI_SUCCESS != status) + { + return status; + } + } + + return status; +} + +static vsi_bool _is_3d_group_norm + ( + vsi_nn_node_t * self, + vsi_nn_tensor_t ** inputs + ) +{ + if ( 3 == inputs[0]->attr.dim_num ) + { + return TRUE; + } + return FALSE; +} /* _is_3d_group_norm() */ + +static vsi_status _op_compute + ( + vsi_nn_node_t * self, + vsi_nn_tensor_t ** inputs, + vsi_nn_tensor_t ** outputs + ) +{ + vsi_status status = VSI_FAILURE; + vsi_nn_kernel_param_t * param = NULL; + vsi_nn_kernel_node_t n = NULL; + float eps = self->nn_param.groupnorm.eps; + int32_t group_num = self->nn_param.groupnorm.group_num; + vsi_nn_tensor_t * tmp_inputs[3] = {NULL, NULL, NULL}; + vsi_nn_tensor_t * tmp_outputs[1] = {NULL}; + vsi_nn_groupnorm_lcl_data *local = self->nn_param.groupnorm.lcl_data; + + status = _try_set_high_presision_tensor(inputs); + if (status != VSI_SUCCESS) + { + VSILOGE("Set tensor attr of high presision fail"); + return status; + } + + if (_is_3d_group_norm(self, inputs)) + { + tmp_inputs[0] = local->reshaped_input; + tmp_outputs[0] = local->reshaped_output; + tmp_inputs[1] = inputs[1]; + tmp_inputs[2] = inputs[2]; + } + else + { + tmp_inputs[0] = inputs[0]; + tmp_outputs[0] = outputs[0]; + tmp_inputs[1] = inputs[1]; + tmp_inputs[2] = inputs[2]; + } + + param = vsi_nn_kernel_param_create(); + vsi_nn_kernel_param_add_float32( param, "eps", eps ); + vsi_nn_kernel_param_add_int32( param, "group_num", group_num ); + n = vsi_nn_kernel_selector( self->graph, "group_norm", + tmp_inputs, _INPUT_NUM, tmp_outputs, _OUTPUT_NUM, param ); + if ( n != NULL ) + { + self->n = (vx_node)n; + status = VSI_SUCCESS; + } + + if (param != NULL) + { + vsi_nn_kernel_param_release( ¶m ); + } + + return status; +} /* op_compute() */ + +static vsi_status _op_optimize + ( + vsi_nn_node_t * self, + vsi_nn_tensor_t ** inputs, + vsi_nn_tensor_t ** outputs, + vsi_nn_opt_direction_e direction + ) +{ + uint32_t dim = 0; + vsi_nn_groupnorm_lcl_data* local = 
NULL; + uint32_t shape[VSI_NN_MAX_DIM_NUM]; + char tensor_name[128]; + + if (_is_3d_group_norm(self, inputs) == FALSE) + { + return VSI_SUCCESS; + } + + VSILOGD("Optimize 3D %s, uid %u", vsi_nn_OpGetName(self->op), self->uid); + /* + insert a reshape node before and after 3D group_norm + */ + shape[0] = 1; + shape[1] = inputs[0]->attr.size[0]; + shape[2] = inputs[0]->attr.size[1]; + shape[3] = inputs[0]->attr.size[2]; + dim = 4; + local = self->nn_param.groupnorm.lcl_data; + if (VSI_NN_OPTIMIZE_FORWARD == direction) + { + /* reshape 3d input (xcn) --> 4d input (whcn) */ + local->reshaped_input = vsi_nn_reshape_tensor(self->graph, inputs[0], shape, dim); + } + else + { + /* reshape 3d output(xcn) --> 4d output(whcn) */ + local->reshaped_output = vsi_nn_reshape_tensor(self->graph, outputs[0], shape, dim); + if (local->reshaped_output && local->reshaped_output->t) + { + memset(tensor_name, 0, sizeof(tensor_name)); + snprintf(tensor_name, sizeof(tensor_name), "uid_%u_reshape_out_0", self->uid); + if (vxSetReferenceName((vx_reference)local->reshaped_output->t, tensor_name) == VSI_FAILURE) + { + VSILOGW("Set uid %u groupnorm reshaped output name fail", self->uid); + return VSI_FAILURE; + } + } + } + + return VSI_SUCCESS; +} /* op_optimize() */ + +static vsi_bool _op_check + ( + vsi_nn_node_t * self, + vsi_nn_tensor_t ** inputs, + vsi_nn_tensor_t ** outputs + ) +{ + BEGIN_IO_TYPE_DECL(GROUP_NORM, 3, 1) + IO_TYPE(D_F16, D_F32, D_F16, D_F16) + IO_TYPE(D_F16, D_F32, D_F16, D_U8|Q_ASYM) + IO_TYPE(D_F32, D_F32, D_F16, D_F32) + IO_TYPE(D_F32, D_F32, D_F32, D_F32) + IO_TYPE(D_I32, D_F32, D_F16, D_I32) + IO_TYPE(D_I32, D_F32, D_F16, D_F32) + IO_TYPE(D_I8|Q_DFP, D_F32, D_F16, D_F16) + IO_TYPE(D_I8|Q_DFP, D_F32, D_F16, D_I8|Q_DFP) + IO_TYPE(D_U8|Q_ASYM, D_F32, D_F16, D_F16) + IO_TYPE(D_U8|Q_ASYM, D_F32, D_F16, D_U8|Q_ASYM) + IO_TYPE(D_I16|Q_DFP, D_F32, D_F16, D_F16) + IO_TYPE(D_I16|Q_DFP, D_F32, D_F16, D_I16|Q_DFP) + END_IO_TYPE_DECL(GROUP_NORM) + if (!VALIDATE_OP_IO_TYPES(GROUP_NORM, self, inputs, self->input.num, outputs, self->output.num)) + { + char* desc = generate_op_io_types_desc(inputs, + self->input.num, outputs, self->output.num); + VSILOGE("Inputs/Outputs data type not support: %s", desc); + destroy_op_io_types_desc(desc); + return FALSE; + } + + return TRUE; +} /* op_check() */ + +static vsi_status _op_init + ( + vsi_nn_node_t * self + ) +{ + vsi_status status = VSI_SUCCESS; + + self->nn_param.groupnorm.lcl_data = + (vsi_nn_groupnorm_lcl_data *)malloc(sizeof(vsi_nn_groupnorm_lcl_data)); + if (NULL == self->nn_param.groupnorm.lcl_data) + { + return VX_ERROR_NO_MEMORY; + } + + memset( self->nn_param.groupnorm.lcl_data, 0, sizeof(vsi_nn_groupnorm_lcl_data) ); + + self->nn_param.groupnorm.lcl_data->reshaped_input = NULL; + self->nn_param.groupnorm.lcl_data->reshaped_output = NULL; + + return status; +} /* op_init() */ + +static vsi_status _op_deinit + ( + vsi_nn_node_t * self + ) +{ + vsi_nn_groupnormalize_param *p = &(self->nn_param.groupnorm); + if (p->lcl_data->reshaped_input) + { + vsi_nn_ReleaseTensor(&(p->lcl_data->reshaped_input)); + p->lcl_data->reshaped_input = NULL; + } + if (p->lcl_data->reshaped_output) + { + vsi_nn_ReleaseTensor(&(p->lcl_data->reshaped_output)); + p->lcl_data->reshaped_output = NULL; + } + if (self->nn_param.groupnorm.lcl_data) + { + free(self->nn_param.groupnorm.lcl_data); + self->nn_param.groupnorm.lcl_data = NULL; + } + vsi_nn_op_common_deinit(self); + + return VSI_SUCCESS; +} /* op_deinit() */ + +#ifdef __cplusplus +extern "C" { +#endif +/* Registrar */ +DEF_OP_REG 
+ ( + /* op_name */ GROUP_NORM, + /* init */ _op_init, + /* compute */ _op_compute, + /* deinit */ _op_deinit, + /* check */ _op_check, + /* setup */ vsi_nn_op_common_setup, + /* optimize */ _op_optimize, + /* input_num */ _INPUT_NUM, + /* output_num */ _OUTPUT_NUM + ); +#ifdef __cplusplus +} +#endif + diff --git a/src/tim/vx/internal/src/ops/vsi_nn_op_gru_ovxlib.c b/src/tim/vx/internal/src/ops/vsi_nn_op_gru_ovxlib.c index 82ad745..d4ac7a2 100644 --- a/src/tim/vx/internal/src/ops/vsi_nn_op_gru_ovxlib.c +++ b/src/tim/vx/internal/src/ops/vsi_nn_op_gru_ovxlib.c @@ -35,7 +35,7 @@ #include "vsi_nn_ops.h" #include "vsi_nn_tensor.h" #include "vsi_nn_tensor_util.h" -#include "client/vsi_nn_vxkernel.h" +#include "libnnext/vsi_nn_vxkernel.h" #include "vsi_nn_internal_node.h" #include "vsi_nn_rnn_helper.h" @@ -265,6 +265,17 @@ static vsi_bool op_setup_default curr->node->nn_param.grucell_ovxlib.activation = curr_param->activation; curr->node->nn_param.grucell_ovxlib.recurrent_activation = curr_param->recurrent_activation; curr->node->nn_param.grucell_ovxlib.linear_before_reset = curr_param->linear_before_reset; + if ( reshape_output->attr.dtype.vx_type == VSI_NN_TYPE_BFLOAT16 ) + { + int32_t k = 0; + for (k = 0; k < sizeof( curr_param->internal_dtype ) / sizeof(curr_param->internal_dtype[0]); k++) + { + if (curr_param->internal_dtype[k].vx_type == VSI_NN_TYPE_NONE) + { + curr_param->internal_dtype[k].vx_type = VSI_NN_TYPE_BFLOAT16; + } + } + } memcpy( curr->node->nn_param.grucell_ovxlib.internal_dtype, curr_param->internal_dtype, sizeof( curr_param->internal_dtype ) ); curr->node->nn_param.grucell_ovxlib.use_cudnn_implementation = curr_param->use_cudnn_implementation; diff --git a/src/tim/vx/internal/src/ops/vsi_nn_op_grucell_ovxlib.c b/src/tim/vx/internal/src/ops/vsi_nn_op_grucell_ovxlib.c index d5797d9..c1d60b6 100644 --- a/src/tim/vx/internal/src/ops/vsi_nn_op_grucell_ovxlib.c +++ b/src/tim/vx/internal/src/ops/vsi_nn_op_grucell_ovxlib.c @@ -35,7 +35,7 @@ #include "vsi_nn_ops.h" #include "vsi_nn_tensor.h" #include "vsi_nn_tensor_util.h" -#include "client/vsi_nn_vxkernel.h" +#include "libnnext/vsi_nn_vxkernel.h" #include "ops/vsi_nn_op_grucell_ovxlib.h" #include "vsi_nn_internal_node.h" #include "vsi_nn_rnn_helper.h" @@ -236,7 +236,15 @@ static vsi_bool op_setup_float use_virtual_tensor, inputs[GRUCELL_INPUT_INPUT], inputs[GRUCELL_INPUT_H_STATE]); dtype.qnt_type = VSI_NN_QNT_TYPE_NONE; - dtype.vx_type = VSI_NN_TYPE_FLOAT16; + if ( input_hstate->t->attr.dtype.vx_type == VSI_NN_TYPE_BFLOAT16 || + input_hstate->t->attr.dtype.vx_type == VSI_NN_TYPE_FLOAT32 ) + { + dtype.vx_type = input_hstate->t->attr.dtype.vx_type; + } + else + { + dtype.vx_type = VSI_NN_TYPE_FLOAT16; + } tmp_tensor = vsi_nn_rnn_create_tp_fc(self, input_hstate->t, p->local->weights_z_r, p->local->bias_z_r, &dtype, use_virtual_tensor); @@ -261,7 +269,15 @@ static vsi_bool op_setup_float inputs[GRUCELL_INPUT_INPUT], tensor_rt->t); dtype.qnt_type = VSI_NN_QNT_TYPE_NONE; - dtype.vx_type = VSI_NN_TYPE_FLOAT16; + if ( tmp_tensor->t->attr.dtype.vx_type == VSI_NN_TYPE_BFLOAT16 || + tmp_tensor->t->attr.dtype.vx_type == VSI_NN_TYPE_FLOAT32 ) + { + dtype.vx_type = input_hstate->t->attr.dtype.vx_type; + } + else + { + dtype.vx_type = VSI_NN_TYPE_FLOAT16; + } /* W{c} x [x{t}, r{t}] */ tmp_tensor = vsi_nn_rnn_create_tp_fc(self, tmp_tensor->t, p->local->weights_c, p->local->bias_c, &dtype, use_virtual_tensor); @@ -270,7 +286,15 @@ static vsi_bool op_setup_float else { dtype.qnt_type = VSI_NN_QNT_TYPE_NONE; - dtype.vx_type = VSI_NN_TYPE_FLOAT16; + if 
( inputs[GRUCELL_INPUT_H_STATE]->attr.dtype.vx_type == VSI_NN_TYPE_BFLOAT16 || + inputs[GRUCELL_INPUT_H_STATE]->attr.dtype.vx_type == VSI_NN_TYPE_FLOAT32 ) + { + dtype.vx_type = inputs[GRUCELL_INPUT_H_STATE]->attr.dtype.vx_type; + } + else + { + dtype.vx_type = VSI_NN_TYPE_FLOAT16; + } /* r.(hstate*w_hc + b_hc) */ tmp_tensor = vsi_nn_rnn_create_tp_fc(self, inputs[GRUCELL_INPUT_H_STATE], inputs[GRUCELL_INPUT_WEIGHT_H2C], inputs[GRUCELL_INPUT_BIAS_H2C], &dtype, use_virtual_tensor); @@ -635,12 +659,28 @@ static vsi_bool op_setup_float_cudnn_v2 use_virtual_tensor, inputs[GRUCELL_INPUT_INPUT], inputs[GRUCELL_INPUT_H_STATE]); dtype.qnt_type = VSI_NN_QNT_TYPE_NONE; - dtype.vx_type = VSI_NN_TYPE_FLOAT16; + if ( concated_input->t->attr.dtype.vx_type == VSI_NN_TYPE_BFLOAT16 || + concated_input->t->attr.dtype.vx_type == VSI_NN_TYPE_FLOAT32 ) + { + dtype.vx_type = concated_input->t->attr.dtype.vx_type; + } + else + { + dtype.vx_type = VSI_NN_TYPE_FLOAT16; + } tmp_tensor = vsi_nn_rnn_create_tp_fc(self, concated_input->t, p->local->weights_input, p->local->bias_z_r, &dtype, use_virtual_tensor); dtype.qnt_type = VSI_NN_QNT_TYPE_NONE; - dtype.vx_type = VSI_NN_TYPE_FLOAT16; + if ( splited_input_fc_output_tensors[0]->t->attr.dtype.vx_type == VSI_NN_TYPE_BFLOAT16 || + splited_input_fc_output_tensors[0]->t->attr.dtype.vx_type == VSI_NN_TYPE_FLOAT32 ) + { + dtype.vx_type = splited_input_fc_output_tensors[0]->t->attr.dtype.vx_type; + } + else + { + dtype.vx_type = VSI_NN_TYPE_FLOAT16; + } { uint32_t _slices[] = { inputs[GRUCELL_INPUT_INPUT]->attr.size[0], inputs[GRUCELL_INPUT_H_STATE]->attr.size[0] }; @@ -651,7 +691,15 @@ static vsi_bool op_setup_float_cudnn_v2 inputs[GRUCELL_INPUT_WEIGHT_I2C], inputs[GRUCELL_INPUT_BIAS_I2C], &dtype, use_virtual_tensor); dtype.qnt_type = VSI_NN_QNT_TYPE_NONE; - dtype.vx_type = VSI_NN_TYPE_FLOAT16; + if ( inputs[GRUCELL_INPUT_H_STATE]->attr.dtype.vx_type == VSI_NN_TYPE_BFLOAT16 || + inputs[GRUCELL_INPUT_H_STATE]->attr.dtype.vx_type == VSI_NN_TYPE_FLOAT32 ) + { + dtype.vx_type = inputs[GRUCELL_INPUT_H_STATE]->attr.dtype.vx_type; + } + else + { + dtype.vx_type = VSI_NN_TYPE_FLOAT16; + } recurrent2cand_output = vsi_nn_rnn_create_tp_fc(self, inputs[GRUCELL_INPUT_H_STATE], inputs[GRUCELL_INPUT_WEIGHT_H2C], inputs[GRUCELL_INPUT_BIAS_H2C], &dtype, use_virtual_tensor); @@ -668,7 +716,15 @@ static vsi_bool op_setup_float_cudnn_v2 attr.vtl = use_virtual_tensor; attr.is_const = FALSE; attr.dtype.qnt_type = VSI_NN_QNT_TYPE_NONE; - attr.dtype.vx_type = VSI_NN_TYPE_FLOAT16; + if ( splited_input_fc_output_tensors[0]->t->attr.dtype.vx_type == VSI_NN_TYPE_BFLOAT16 || + splited_input_fc_output_tensors[0]->t->attr.dtype.vx_type == VSI_NN_TYPE_FLOAT32 ) + { + dtype.vx_type = splited_input_fc_output_tensors[0]->t->attr.dtype.vx_type; + } + else + { + dtype.vx_type = VSI_NN_TYPE_FLOAT16; + } tmp_tensor = vsi_nn_internal_new_tensor(self, &attr, 0.0f); curr = vsi_nn_internal_new_node( self, VSI_NN_OP_A_TIMES_B_PLUS_C, 0, 0 ); @@ -969,7 +1025,15 @@ static vsi_bool op_setup_default memcpy(&attr, &(inputs[GRUCELL_INPUT_WEIGHT_H2C]->attr), sizeof(attr)); attr.dtype.qnt_type = VSI_NN_QNT_TYPE_NONE; - attr.dtype.vx_type = VSI_NN_TYPE_FLOAT16; + if ( rh_mul_outputs->t->attr.dtype.vx_type == VSI_NN_TYPE_BFLOAT16 || + rh_mul_outputs->t->attr.dtype.vx_type == VSI_NN_TYPE_FLOAT32 ) + { + attr.dtype.vx_type = rh_mul_outputs->t->attr.dtype.vx_type; + } + else + { + attr.dtype.vx_type = VSI_NN_TYPE_FLOAT16; + } wei_r2c_tensor = vsi_nn_ConvertTensorDtype(self->graph, inputs[GRUCELL_INPUT_WEIGHT_H2C], &(attr.dtype)); 
rh_cand_fc_output = vsi_nn_rnn_create_tp_fc(self, @@ -1049,7 +1113,16 @@ static vsi_bool op_setup_default attr.vtl = use_virtual_tensor; attr.is_const = FALSE; attr.dtype.qnt_type = VSI_NN_QNT_TYPE_NONE; - attr.dtype.vx_type = VSI_NN_TYPE_FLOAT16; + if ( input_tensor->t->attr.dtype.vx_type == VSI_NN_TYPE_BFLOAT16 || + input_tensor->t->attr.dtype.vx_type == VSI_NN_TYPE_FLOAT32 ) + { + attr.dtype.vx_type = input_tensor->t->attr.dtype.vx_type; + } + else + { + attr.dtype.vx_type = VSI_NN_TYPE_FLOAT16; + } + tmp_tensor = vsi_nn_internal_new_tensor(self, &attr, 0.0f); /* create internal tensor sub node (1-zt)*c */ diff --git a/src/tim/vx/internal/src/ops/vsi_nn_op_heatmap_max_keypoint.c b/src/tim/vx/internal/src/ops/vsi_nn_op_heatmap_max_keypoint.c index d588b12..0a01f72 100644 --- a/src/tim/vx/internal/src/ops/vsi_nn_op_heatmap_max_keypoint.c +++ b/src/tim/vx/internal/src/ops/vsi_nn_op_heatmap_max_keypoint.c @@ -34,7 +34,7 @@ #include "vsi_nn_ops.h" #include "vsi_nn_tensor.h" #include "vsi_nn_tensor_util.h" -#include "client/vsi_nn_vxkernel.h" +#include "libnnext/vsi_nn_vxkernel.h" #define _ARG_NUM (1) #define _INPUT_NUM (2) diff --git a/src/tim/vx/internal/src/ops/vsi_nn_op_imageprocess.c b/src/tim/vx/internal/src/ops/vsi_nn_op_imageprocess.c index cc8103c..8883c35 100644 --- a/src/tim/vx/internal/src/ops/vsi_nn_op_imageprocess.c +++ b/src/tim/vx/internal/src/ops/vsi_nn_op_imageprocess.c @@ -34,7 +34,7 @@ #include "vsi_nn_ops.h" #include "vsi_nn_tensor.h" #include "vsi_nn_tensor_util.h" -#include "client/vsi_nn_vxkernel.h" +#include "libnnext/vsi_nn_vxkernel.h" #include "utils/vsi_nn_constraint_check.h" #define _ARG_NUM (14) diff --git a/src/tim/vx/internal/src/ops/vsi_nn_op_instancenormalize.c b/src/tim/vx/internal/src/ops/vsi_nn_op_instancenormalize.c index 2a0f6a2..8f54a50 100644 --- a/src/tim/vx/internal/src/ops/vsi_nn_op_instancenormalize.c +++ b/src/tim/vx/internal/src/ops/vsi_nn_op_instancenormalize.c @@ -34,7 +34,7 @@ #include "vsi_nn_tensor_util.h" #include "vsi_nn_prv.h" #include "vsi_nn_log.h" -#include "client/vsi_nn_vxkernel.h" +#include "libnnext/vsi_nn_vxkernel.h" #include "kernel/vsi_nn_kernel.h" #include "utils/vsi_nn_constraint_check.h" @@ -211,16 +211,23 @@ static vsi_bool op_check { BEGIN_IO_TYPE_DECL(INSTANCE_NORM, 3, 1) IO_TYPE(D_F16, D_F32, D_F16, D_F16) + IO_TYPE(D_F16, D_F32, D_F32, D_F16) + IO_TYPE(D_F16, D_F16, D_F16, D_F16) IO_TYPE(D_F32, D_F32, D_F16, D_F32) + IO_TYPE(D_F32, D_F16, D_F16, D_F32) IO_TYPE(D_F32, D_F32, D_F32, D_F32) IO_TYPE(D_I32, D_F32, D_F16, D_I32) IO_TYPE(D_I32, D_F32, D_F16, D_F32) + IO_TYPE(D_BF16, D_F32, D_F32, D_BF16) IO_TYPE(D_I8|Q_DFP, D_F32, D_F16, D_F16) IO_TYPE(D_I8|Q_DFP, D_F32, D_F16, D_I8|Q_DFP) + IO_TYPE(D_I8|Q_DFP, D_F32, D_F32, D_I8|Q_DFP) IO_TYPE(D_U8|Q_ASYM, D_F32, D_F16, D_F16) IO_TYPE(D_U8|Q_ASYM, D_F32, D_F16, D_U8|Q_ASYM) + IO_TYPE(D_U8|Q_ASYM, D_F32, D_F32, D_U8|Q_ASYM) IO_TYPE(D_I16|Q_DFP, D_F32, D_F16, D_F16) IO_TYPE(D_I16|Q_DFP, D_F32, D_F16, D_I16|Q_DFP) + IO_TYPE(D_I16|Q_DFP, D_F32, D_F32, D_I16|Q_DFP) END_IO_TYPE_DECL(INSTANCE_NORM) if (!VALIDATE_OP_IO_TYPES(INSTANCE_NORM, self, inputs, self->input.num, outputs, self->output.num)) { diff --git a/src/tim/vx/internal/src/ops/vsi_nn_op_interp.c b/src/tim/vx/internal/src/ops/vsi_nn_op_interp.c index cc38677..e11ba4e 100644 --- a/src/tim/vx/internal/src/ops/vsi_nn_op_interp.c +++ b/src/tim/vx/internal/src/ops/vsi_nn_op_interp.c @@ -174,6 +174,7 @@ static vsi_bool op_setup curr->node->nn_param.strided_slice.begin_mask = 0; curr->node->nn_param.strided_slice.end_mask = 0; 
curr->node->nn_param.strided_slice.shrink_axis_mask = 0; + curr->node->nn_param.strided_slice.new_axis_mask = 0; begin_dims = (int32_t *)vsi_nn_internal_new_node_param(curr, VSI_NN_MAX_DIM_NUM * sizeof(uint32_t)); end_dims = (int32_t *)vsi_nn_internal_new_node_param(curr, diff --git a/src/tim/vx/internal/src/ops/vsi_nn_op_l2normalizescale.c b/src/tim/vx/internal/src/ops/vsi_nn_op_l2normalizescale.c index 04e5610..4057e45 100644 --- a/src/tim/vx/internal/src/ops/vsi_nn_op_l2normalizescale.c +++ b/src/tim/vx/internal/src/ops/vsi_nn_op_l2normalizescale.c @@ -34,11 +34,11 @@ #include "vsi_nn_tensor_util.h" #include "vsi_nn_prv.h" #include "vsi_nn_log.h" -#include "client/vsi_nn_vxkernel.h" #include "kernel/vsi_nn_kernel.h" #include "utils/vsi_nn_util.h" #include "kernel/vsi_nn_kernel_gpu_shape_optimize.h" #include "utils/vsi_nn_constraint_check.h" +#include "utils/vsi_nn_dtype_util.h" #define _INPUT_NUM (2) #define _OUTPUT_NUM (1) @@ -84,7 +84,8 @@ static vsi_nn_tensor_t* _expand_scale_tensor attr.size[0] = scale_size_out; attr.size[1] = 1; attr.dim_num = 2; - attr.dtype.vx_type = scale->attr.dtype.vx_type; + out_dtype.qnt_type = VSI_NN_QNT_TYPE_NONE; + attr.dtype.vx_type = VSI_NN_TYPE_FLOAT16; attr.vtl = FALSE; scale_tensor = vsi_nn_CreateTensor(graph, &attr); out_dtype = scale->attr.dtype; @@ -154,6 +155,65 @@ static vsi_bool _check_value_is_equal_to_one return ret; } +static vsi_bool _tensor_data_convert + ( + vsi_nn_graph_t* graph, + vsi_nn_tensor_t* in_tensor, + vsi_nn_tensor_t* out_tensor + ) +{ + vsi_bool ret = TRUE; + float* tensor_data = NULL; + uint32_t size = 0; + uint32_t stride[VSI_NN_MAX_DIM_NUM] = { 0 }; + uint8_t* data = NULL; + + tensor_data = vsi_nn_ConvertTensorToFloat32Data( graph, in_tensor ); + if ( NULL == tensor_data ) + { + VSILOGE( "Convert data fail." 
); + return FALSE; + } + + size = vsi_nn_GetStrideSize( &out_tensor->attr, stride ); + data = (uint8_t *)malloc( size ); + + if ( data ) + { + uint32_t i = 0; + uint32_t elements = size / stride[0]; + vsi_status status = VSI_SUCCESS; + + for ( i = 0; i < elements; i ++ ) + { + status = vsi_nn_Float32ToDtype( tensor_data[i], &data[stride[0] * i], &out_tensor->attr.dtype ); + if( VSI_FAILURE == status ) + { + VSILOGE("Convert default_value to dtype fail"); + break; + } + } + + status = vsi_nn_CopyDataToTensor( graph, out_tensor, data ); + free( data ); + data = NULL; + if ( VSI_FAILURE == status ) + { + VSILOGE("Copy data to tensor fail"); + } + } + + if ( !in_tensor->attr.is_created_from_handle ) + { + if ( tensor_data ) + { + free(tensor_data); + } + } + + return ret; +} + static vsi_status op_compute ( vsi_nn_node_t * self, @@ -180,7 +240,10 @@ static vsi_status op_compute p = &(self->nn_param.l2normalizescale); axis = p->axis; - if ( inputs[1]->attr.is_const == TRUE && _check_value_is_equal_to_one(self->graph, inputs[1]) ) + if ( (inputs[1]->attr.is_const == TRUE && _check_value_is_equal_to_one(self->graph, inputs[1])) || + ( inputs[0]->attr.dtype.vx_type == VSI_NN_TYPE_BFLOAT16 && + outputs[0]->attr.dtype.vx_type == VSI_NN_TYPE_BFLOAT16 ) + ) { return vsi_nn_internal_compute_node( self ); } @@ -203,10 +266,10 @@ static vsi_status op_compute shapes[1][2] = 1; shapes[1][3] = 1; scale_size = shapes[0][new_axis]; - is_expand_scale = (vx_bool)((size < scale_size) && (TRUE == inputs[1]->attr.is_const)); + is_expand_scale = (vx_bool)(TRUE == inputs[1]->attr.is_const); vsi_nn_kernel_param_add_int32( param, "axis", new_axis ); - if( ret ) + if ( ret ) { reshape_tensors[0] = vsi_nn_reshape_tensor( self->graph, inputs[0], (uint32_t*)shapes[0], rank_in ); @@ -249,13 +312,20 @@ static vsi_bool op_check ) { BEGIN_IO_TYPE_DECL(L2NORMALIZESCALE, _INPUT_NUM, _OUTPUT_NUM) - IO_TYPE(D_F16, D_F16, D_F16) - IO_TYPE(D_I8|Q_DFP, D_F16, D_I8|Q_DFP) - IO_TYPE(D_I8|Q_DFP, D_F16, D_F16) + IO_TYPE(D_F16, D_F16, D_F16) + IO_TYPE(D_F16, D_F32, D_F16) + IO_TYPE(D_BF16, D_BF16, D_BF16) + IO_TYPE(D_BF16, D_F32, D_BF16) + IO_TYPE(D_I8|Q_DFP, D_F16, D_I8|Q_DFP) + IO_TYPE(D_I8|Q_DFP, D_F16, D_F16) + IO_TYPE(D_I8|Q_DFP, D_F32, D_I8|Q_DFP) + IO_TYPE(D_I8|Q_DFP, D_F32, D_F16) IO_TYPE(D_U8|Q_ASYM, D_F16, D_U8|Q_ASYM) IO_TYPE(D_U8|Q_ASYM, D_F16, D_F16) IO_TYPE(D_I16|Q_DFP, D_F16, D_I16|Q_DFP) IO_TYPE(D_I16|Q_DFP, D_F16, D_F16) + IO_TYPE(D_I16|Q_DFP, D_F32, D_I16|Q_DFP) + IO_TYPE(D_I16|Q_DFP, D_F32, D_F16) IO_TYPE(D_F32, D_F32, D_F32) IO_TYPE(D_U8|Q_ASYM, D_F32, D_U8|Q_ASYM) END_IO_TYPE_DECL(L2NORMALIZESCALE) @@ -328,8 +398,53 @@ static vsi_bool op_setup curr->outputs[0] = outputs[0]; vsi_nn_internal_setup_node( self, curr ); } + else if ( inputs[0]->attr.dtype.vx_type == VSI_NN_TYPE_BFLOAT16 && + outputs[0]->attr.dtype.vx_type == VSI_NN_TYPE_BFLOAT16 ) + { + vsi_nn_internal_tensor_t* output_tensor = NULL; + vsi_nn_internal_tensor_t* reshape_tensor = NULL; + vsi_nn_tensor_attr_t attr; + int32_t dim_num = inputs[0]->attr.dim_num; + int32_t i = 0; - ret = vsi_nn_op_common_setup(self, inputs, outputs); + memcpy( &attr, &outputs[0]->attr, sizeof( attr ) ); + attr.vtl = TRUE; + attr.is_const = FALSE; + output_tensor = vsi_nn_internal_new_tensor( self, &attr, 0.0f ); + + curr = vsi_nn_internal_new_node(self, VSI_NN_OP_L2_NORMALIZE, 0, 0); + curr->node->nn_param.l2_normalize.axis = self->nn_param.l2normalizescale.axis; + curr->inputs[0] = inputs[0]; + curr->outputs[0] = output_tensor->t; + vsi_nn_internal_setup_node( self, curr ); + + 
memcpy( &attr, &inputs[1]->attr, sizeof( attr ) ); + for (i = 0; i < dim_num; i++) + { + attr.size[i] = i == self->nn_param.l2normalizescale.axis ? inputs[0]->attr.size[i] : 1; + } + attr.dim_num = dim_num; + if (attr.dtype.vx_type != VSI_NN_TYPE_BFLOAT16) + { + attr.dtype.vx_type = VSI_NN_TYPE_BFLOAT16; + attr.dtype.qnt_type = VSI_NN_QNT_TYPE_NONE; + } + reshape_tensor = vsi_nn_internal_new_tensor(self, &attr, 0.0f); + _tensor_data_convert(self->graph, inputs[1], reshape_tensor->t); + + curr = vsi_nn_internal_new_node(self, VSI_NN_OP_MULTIPLY, 0, 0); + curr->inputs[0] = output_tensor->t; + curr->inputs[1] = reshape_tensor->t; + curr->node->nn_param.multiply.scale = 1.0f; + curr->node->vx_param.overflow_policy = VX_CONVERT_POLICY_SATURATE; + curr->node->vx_param.rounding_policy = VX_ROUND_POLICY_TO_NEAREST_EVEN; + curr->outputs[0] = outputs[0]; + vsi_nn_internal_setup_node( self, curr ); + } + else + { + ret = vsi_nn_op_common_setup(self, inputs, outputs); + } return ret; } diff --git a/src/tim/vx/internal/src/ops/vsi_nn_op_layernormalize.c b/src/tim/vx/internal/src/ops/vsi_nn_op_layernormalize.c index 7cc8663..be46f09 100644 --- a/src/tim/vx/internal/src/ops/vsi_nn_op_layernormalize.c +++ b/src/tim/vx/internal/src/ops/vsi_nn_op_layernormalize.c @@ -34,7 +34,7 @@ #include "vsi_nn_tensor_util.h" #include "vsi_nn_prv.h" #include "vsi_nn_log.h" -#include "client/vsi_nn_vxkernel.h" +#include "libnnext/vsi_nn_vxkernel.h" #include "kernel/vsi_nn_kernel.h" #include "utils/vsi_nn_constraint_check.h" @@ -51,28 +51,11 @@ static vsi_status op_compute vsi_status status = VSI_FAILURE; vsi_nn_kernel_param_t * param = NULL; vsi_nn_kernel_node_t n = NULL; - float eps = self->nn_param.instancenorm.eps; - uint32_t *input_size = inputs[0]->attr.size; - uint32_t dims_num = inputs[0]->attr.dim_num; - int32_t rs_flg = 0; - int32_t wh_flg = 0; + float eps = self->nn_param.layernorm.eps; - param =vsi_nn_kernel_param_create(); - - if (input_size[0] >= GPU_TENSOR_MAX_WIDTH) - { - wh_flg = 1; - } - - if ((input_size[1] * input_size[2] < GPU_TENSOR_MAX_WIDTH) - && dims_num > 2) - { - rs_flg = 1; - } + param = vsi_nn_kernel_param_create(); vsi_nn_kernel_param_add_float32( param, "eps", eps ); - vsi_nn_kernel_param_add_int32( param, "reshape_flg", rs_flg ); - vsi_nn_kernel_param_add_int32( param, "wh_flg", wh_flg ); n = vsi_nn_kernel_selector( self->graph, "layer_norm", inputs, _INPUT_NUM, outputs, _OUTPUT_NUM, param ); if ( n != NULL ) @@ -99,10 +82,14 @@ static vsi_bool op_check BEGIN_IO_TYPE_DECL(LAYER_NORM, 3, 1) IO_TYPE(D_F32, D_F32, D_F32, D_F32) IO_TYPE(D_F16, D_F32, D_F16, D_F16) + IO_TYPE(D_F16, D_F32, D_F32, D_F16) IO_TYPE(D_F16, D_F32, D_F16, D_U8|Q_ASYM) + IO_TYPE(D_BF16, D_F32, D_F32, D_BF16) IO_TYPE(D_U8|Q_ASYM, D_F32, D_F16, D_F16) IO_TYPE(D_U8|Q_ASYM, D_F32, D_F16, D_U8|Q_ASYM) IO_TYPE(D_I16|Q_DFP, D_F32, D_F16, D_I16|Q_DFP) + IO_TYPE(D_U8|Q_ASYM, D_F32, D_F32, D_U8|Q_ASYM) + IO_TYPE(D_I16|Q_DFP, D_F32, D_F32, D_I16|Q_DFP) END_IO_TYPE_DECL(LAYER_NORM) if (!VALIDATE_OP_IO_TYPES(LAYER_NORM, self, inputs, self->input.num, outputs, self->output.num)) { diff --git a/src/tim/vx/internal/src/ops/vsi_nn_op_lstm_ovxlib.c b/src/tim/vx/internal/src/ops/vsi_nn_op_lstm_ovxlib.c index 899711c..5910a92 100644 --- a/src/tim/vx/internal/src/ops/vsi_nn_op_lstm_ovxlib.c +++ b/src/tim/vx/internal/src/ops/vsi_nn_op_lstm_ovxlib.c @@ -35,7 +35,7 @@ #include "vsi_nn_ops.h" #include "vsi_nn_tensor.h" #include "vsi_nn_tensor_util.h" -#include "client/vsi_nn_vxkernel.h" +#include "libnnext/vsi_nn_vxkernel.h" #include 
"vsi_nn_internal_node.h" #include "vsi_nn_rnn_helper.h" diff --git a/src/tim/vx/internal/src/ops/vsi_nn_op_lstmunit_activation.c b/src/tim/vx/internal/src/ops/vsi_nn_op_lstmunit_activation.c index d95f48d..3db70e8 100644 --- a/src/tim/vx/internal/src/ops/vsi_nn_op_lstmunit_activation.c +++ b/src/tim/vx/internal/src/ops/vsi_nn_op_lstmunit_activation.c @@ -35,7 +35,7 @@ #include "vsi_nn_ops.h" #include "vsi_nn_tensor.h" #include "vsi_nn_tensor_util.h" -#include "client/vsi_nn_vxkernel.h" +#include "libnnext/vsi_nn_vxkernel.h" #include "kernel/vsi_nn_kernel.h" #include "utils/vsi_nn_util.h" @@ -153,8 +153,7 @@ static vsi_bool op_setup memcpy(&attr, &(inputs[LSTMUNIT_ACT_DATA_BI + i]->attr), sizeof(vsi_nn_tensor_attr_t)); attr.size[1] = 1; attr.dim_num = 2; - t0 = vsi_nn_CreateTensor( self->graph, &attr ); - vsi_nn_ReshapeTensor(self->graph, inputs[LSTMUNIT_ACT_DATA_BI + i], t0, attr.size, attr.dim_num); + t0 = vsi_nn_reshape_tensor(self->graph, inputs[LSTMUNIT_ACT_DATA_BI + i], attr.size, attr.dim_num); if( dst_dtype.vx_type != t0->attr.dtype.vx_type && dst_dtype.qnt_type != t0->attr.dtype.qnt_type ) @@ -176,8 +175,7 @@ static vsi_bool op_setup memcpy(&attr, &(inputs[LSTMUNIT_ACT_LN_WI + i]->attr), sizeof(vsi_nn_tensor_attr_t)); attr.size[1] = 1; attr.dim_num = 2; - t1 = vsi_nn_CreateTensor( self->graph, &attr ); - vsi_nn_ReshapeTensor(self->graph, inputs[LSTMUNIT_ACT_LN_WI + i], t1, attr.size, attr.dim_num); + t1 = vsi_nn_reshape_tensor(self->graph, inputs[LSTMUNIT_ACT_LN_WI + i], attr.size, attr.dim_num); if( dst_dtype.vx_type != t1->attr.dtype.vx_type && dst_dtype.qnt_type != t1->attr.dtype.qnt_type ) diff --git a/src/tim/vx/internal/src/ops/vsi_nn_op_lstmunit_ovxlib.c b/src/tim/vx/internal/src/ops/vsi_nn_op_lstmunit_ovxlib.c index 07b6ca2..f57eddb 100644 --- a/src/tim/vx/internal/src/ops/vsi_nn_op_lstmunit_ovxlib.c +++ b/src/tim/vx/internal/src/ops/vsi_nn_op_lstmunit_ovxlib.c @@ -35,7 +35,7 @@ #include "vsi_nn_ops.h" #include "vsi_nn_tensor.h" #include "vsi_nn_tensor_util.h" -#include "client/vsi_nn_vxkernel.h" +#include "libnnext/vsi_nn_vxkernel.h" #include "ops/vsi_nn_op_lstmunit_ovxlib.h" #include "vsi_nn_internal_node.h" #include "vsi_nn_rnn_helper.h" @@ -307,7 +307,7 @@ static vsi_bool op_setup p->local->use_cifg = ( NULL == inputs[LSTMUNIT_INPUT_WEIGHT_I2I] ); p->local->use_layer_norm = ( NULL != inputs[LSTMUNIT_INPUT_LAYERNORM_F] ); p->local->use_projection = ( NULL != inputs[LSTMUNIT_INPUT_WEIGHT_PROJ] ); - p->local->use_projection_bias = FALSE;//NULL != inputs[19]; + p->local->use_projection_bias = ( NULL != inputs[LSTMUNIT_INPUT_BIAS_PROJ] ); p->local->multi_batch = ( inputs[LSTMUNIT_INPUT_INPUT]->attr.size[1] > 1 ); p->local->use_peephole = ( NULL != inputs[LSTMUNIT_INPUT_WEIGHT_C2O] ); ifco_start_index = p->local->use_cifg ? 
1 : 0; @@ -621,8 +621,6 @@ static vsi_bool op_setup curr->inputs[1] = inputs[LSTMUNIT_INPUT_WEIGHT_PROJ]; curr->inputs[2] = zero_bias_tensor; - tmp_tensor = output_tensor; - /* Save output to h_state first and copy to output */ if( p->local->use_hybrid && p->local->use_projection_bias ) { @@ -636,6 +634,8 @@ static vsi_bool op_setup curr->outputs[0] = outputs[LSTMUNIT_OUTPUT_H_STATE]; } + tmp_tensor = output_tensor; + vsi_nn_internal_setup_node(self, curr); if( p->local->use_hybrid && p->local->use_projection_bias ) diff --git a/src/tim/vx/internal/src/ops/vsi_nn_op_matrixmul.c b/src/tim/vx/internal/src/ops/vsi_nn_op_matrixmul.c index f4b8efe..eaeaaa5 100644 --- a/src/tim/vx/internal/src/ops/vsi_nn_op_matrixmul.c +++ b/src/tim/vx/internal/src/ops/vsi_nn_op_matrixmul.c @@ -106,6 +106,9 @@ static vsi_bool op_check IO_TYPE(D_F16, D_I16|Q_DFP, D_I16|Q_DFP) IO_TYPE(D_F16, D_I8|Q_DFP, D_F16) IO_TYPE(D_F16, D_U8|Q_ASYM, D_U8|Q_ASYM) + IO_TYPE(D_F16, D_I8|Q_DFP, D_I8|Q_DFP) + IO_TYPE(D_F16, D_U8, D_U8) + IO_TYPE(D_F16, D_I8, D_I8) IO_TYPE(D_F16, D_F16, D_F16) IO_TYPE(D_F32, D_F32, D_F32) IO_TYPE(D_F32, D_I8|Q_DFP, D_F32) diff --git a/src/tim/vx/internal/src/ops/vsi_nn_op_moments.c b/src/tim/vx/internal/src/ops/vsi_nn_op_moments.c index 985f2da..fbcdd0b 100644 --- a/src/tim/vx/internal/src/ops/vsi_nn_op_moments.c +++ b/src/tim/vx/internal/src/ops/vsi_nn_op_moments.c @@ -37,6 +37,31 @@ #define _INPUT_NUM 1 #define _OUTPUT_NUM 2 +static void _squeeze_axis + ( + vsi_nn_tensor_t *input, + const int32_t* axis_in, + int32_t axis_num, + int32_t* axis_out, + int32_t *axis_num_out + ) +{ + int32_t i = 0; + + memcpy(axis_out, axis_in, sizeof(int32_t) * axis_num); + *axis_num_out = axis_num; + + for (i = 0; i < axis_num; i++) + { + if (axis_in[i] == 3 && input->attr.size[3] == 1) + { + *axis_num_out = axis_num - 1; + axis_out[i] = 0; + break; + } + } +} + static vsi_status op_compute ( vsi_nn_node_t * self, @@ -47,22 +72,25 @@ static vsi_status op_compute vsi_status status = VSI_FAILURE; vsi_nn_kernel_param_t * param = NULL; vsi_nn_kernel_node_t n = NULL; - int32_t* axis = self->nn_param.moments.axis; + int32_t axes_copy[VSI_NN_MAX_DIM_NUM] = { 0 }; + const int32_t* axis = self->nn_param.moments.axis; int32_t axis_num = self->nn_param.moments.axis_num; int32_t keep_dim = self->nn_param.moments.keep_dim ? 
1 : 0; - param =vsi_nn_kernel_param_create(); + _squeeze_axis(inputs[0], axis, axis_num, axes_copy, &axis_num); - vsi_nn_kernel_param_add_buffer( param, "axis", axis, axis_num); + param = vsi_nn_kernel_param_create(); + + vsi_nn_kernel_param_add_buffer( param, "axis", axes_copy, axis_num); vsi_nn_kernel_param_add_int32( param, "keep_dim", keep_dim); n = vsi_nn_kernel_selector( self->graph, "moments", inputs, _INPUT_NUM, outputs, _OUTPUT_NUM, param ); - if( n != NULL ) + if (n != NULL) { self->n = (vx_node)n; status = VSI_SUCCESS; } - if(param != NULL) + if (param != NULL) { vsi_nn_kernel_param_release( ¶m ); } @@ -77,6 +105,10 @@ static vsi_bool op_check vsi_nn_tensor_t ** outputs ) { + int32_t axes_copy[VSI_NN_MAX_DIM_NUM] = { 0 }; + int32_t axes_num = 0; + int32_t i = 0; + BEGIN_IO_TYPE_DECL(MOMENTS, 1, 2) IO_TYPE(D_U8|Q_ASYM, D_F16, D_F16) IO_TYPE(D_I8|Q_DFP, D_F16, D_F16) @@ -85,7 +117,7 @@ static vsi_bool op_check IO_TYPE(D_F32, D_F32, D_F32) IO_TYPE(D_I32, D_F32, D_F32) END_IO_TYPE_DECL(MOMENTS) - if(!VALIDATE_OP_IO_TYPES(MOMENTS, self, inputs, self->input.num, outputs, self->output.num)) { + if (!VALIDATE_OP_IO_TYPES(MOMENTS, self, inputs, self->input.num, outputs, self->output.num)) { char* desc = generate_op_io_types_desc(inputs, self->input.num, outputs, self->output.num); VSILOGE("Inputs/Outputs data type not support: %s", desc); @@ -93,6 +125,18 @@ static vsi_bool op_check return FALSE; } + _squeeze_axis(inputs[0], self->nn_param.moments.axis, + self->nn_param.moments.axis_num, axes_copy, &axes_num); + + for (i = 0; i < axes_num; i++) + { + if (axes_copy[i] > 2) + { + VSILOGE("moments shader path not support axis: %d", axes_copy[i]); + return FALSE; + } + } + return TRUE; } /* op_check() */ @@ -107,23 +151,15 @@ static vsi_bool op_setup int32_t i = 0, j = 0; vsi_nn_moments_param * p = NULL; - if( VSI_NN_DIM_AUTO == outputs[0]->attr.dim_num ) + if (VSI_NN_DIM_AUTO == outputs[0]->attr.dim_num) { - int32_t* axis = NULL; + const int32_t* axis = NULL; int32_t axis_num = 0; p = &(self->nn_param.moments); axis = p->axis; axis_num = p->axis_num; - for(i = 0; i < axis_num; i++) - { - if(axis[i] > 2) - { - return FALSE; - } - } - - if(p->keep_dim) + if (p->keep_dim) { outputs[0]->attr.dim_num = inputs[0]->attr.dim_num; outputs[1]->attr.dim_num = inputs[0]->attr.dim_num; @@ -133,45 +169,35 @@ static vsi_bool op_setup outputs[0]->attr.size[i] = inputs[0]->attr.size[i]; outputs[1]->attr.size[i] = inputs[0]->attr.size[i]; } - switch(axis_num) + + for (i = 0; i < axis_num; i++) { - case 1: - outputs[0]->attr.size[axis[0]] = 1; - outputs[1]->attr.size[axis[0]] = 1; - break; - case 2: - outputs[0]->attr.size[axis[0]] = 1; - outputs[0]->attr.size[axis[1]] = 1; - outputs[1]->attr.size[axis[0]] = 1; - outputs[1]->attr.size[axis[1]] = 1; - break; - case 3: - outputs[0]->attr.size[axis[0]] = 1; - outputs[0]->attr.size[axis[1]] = 1; - outputs[0]->attr.size[axis[2]] = 1; - outputs[1]->attr.size[axis[0]] = 1; - outputs[1]->attr.size[axis[1]] = 1; - outputs[1]->attr.size[axis[2]] = 1; - break; - default: - return FALSE; + outputs[0]->attr.size[axis[i]] = 1; + outputs[1]->attr.size[axis[i]] = 1; } } else { + int32_t idx = 0; + outputs[0]->attr.dim_num = inputs[0]->attr.dim_num - axis_num; outputs[1]->attr.dim_num = inputs[0]->attr.dim_num - axis_num; - for (i = 0; i < axis[0]; i++) + for (i = 0; i < (int32_t)inputs[0]->attr.dim_num; i++) { - outputs[0]->attr.size[i] = inputs[0]->attr.size[i]; - outputs[1]->attr.size[i] = inputs[0]->attr.size[i]; - } + for (j = 0; j < axis_num; j++) + { + if ( i == 
axis[j] ) + { + break; + } + } - for (j = axis[0] + axis_num; j < (int32_t)inputs[0]->attr.dim_num; j++) - { - outputs[0]->attr.size[i] = inputs[0]->attr.size[j]; - outputs[1]->attr.size[i++] = inputs[0]->attr.size[j]; + if (j == axis_num) + { + outputs[0]->attr.size[idx] = inputs[0]->attr.size[i]; + outputs[1]->attr.size[idx++] = inputs[0]->attr.size[i]; + } } } } diff --git a/src/tim/vx/internal/src/ops/vsi_nn_op_nms.c b/src/tim/vx/internal/src/ops/vsi_nn_op_nms.c new file mode 100644 index 0000000..b5e8f4f --- /dev/null +++ b/src/tim/vx/internal/src/ops/vsi_nn_op_nms.c @@ -0,0 +1,136 @@ +/**************************************************************************** +* +* Copyright (c) 2020 Vivante Corporation +* +* Permission is hereby granted, free of charge, to any person obtaining a +* copy of this software and associated documentation files (the "Software"), +* to deal in the Software without restriction, including without limitation +* the rights to use, copy, modify, merge, publish, distribute, sublicense, +* and/or sell copies of the Software, and to permit persons to whom the +* Software is furnished to do so, subject to the following conditions: +* +* The above copyright notice and this permission notice shall be included in +* all copies or substantial portions of the Software. +* +* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +* AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING +* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER +* DEALINGS IN THE SOFTWARE. +* +*****************************************************************************/ + + +#include +#include + +#include "vsi_nn_types.h" +#include "vsi_nn_log.h" +#include "vsi_nn_node.h" +#include "vsi_nn_prv.h" +#include "vsi_nn_ops.h" +#include "vsi_nn_tensor.h" +#include "utils/vsi_nn_util.h" +#include "kernel/vsi_nn_kernel.h" + +typedef struct _nms_local_data_t { + int32_t placeholder; +} nms_local_data_t; + +/* + Declare number of input and output. + */ +#define _INPUT_NUM (2) +#define _OUTPUT_NUM (3) + +static vsi_status op_compute + ( + vsi_nn_node_t * self, + vsi_nn_tensor_t ** inputs, + vsi_nn_tensor_t ** outputs + ) +{ + vsi_status status = VSI_FAILURE; + vsi_nn_nms_param * p = &(self->nn_param.nms); + vsi_nn_kernel_param_t * param = NULL; + + param = vsi_nn_kernel_param_create(); + + vsi_nn_kernel_param_add_int32( param, "max_output_size", p->max_output_size ); + vsi_nn_kernel_param_add_float32( param, "iou_threshold", p->iou_threshold ); + vsi_nn_kernel_param_add_float32( param, "score_threshold", p->score_threshold ); + vsi_nn_kernel_param_add_float32( param, "soft_nms_sigma", p->soft_nms_sigma ); + + self->n = (vx_node)vsi_nn_kernel_selector( self->graph, + "nms", + inputs, _INPUT_NUM, + outputs, _OUTPUT_NUM, param ); + + if ( self->n ) + { + status = VSI_SUCCESS; + } + + vsi_nn_kernel_param_release( &param ); + + return status; +} /* op_compute() */ + +static vsi_bool op_check + ( + vsi_nn_node_t * self, + vsi_nn_tensor_t ** inputs, + vsi_nn_tensor_t ** outputs + ) +{ + /*TODO: Check tensor shapes. 
*/ + return TRUE; +} /* op_check() */ + +static vsi_bool op_setup + ( + vsi_nn_node_t * self, + vsi_nn_tensor_t ** inputs, + vsi_nn_tensor_t ** outputs + ) +{ + if ( VSI_NN_DIM_AUTO == outputs[0]->attr.dim_num ) + { + outputs[0]->attr.dim_num = 1; + outputs[0]->attr.size[0] = self->nn_param.nms.max_output_size; + } + + if ( VSI_NN_DIM_AUTO == outputs[1]->attr.dim_num ) + { + outputs[1]->attr.dim_num = 1; + outputs[1]->attr.size[0] = self->nn_param.nms.max_output_size; + } + + if ( VSI_NN_DIM_AUTO == outputs[2]->attr.dim_num ) + { + outputs[2]->attr.dim_num = 1; + outputs[2]->attr.size[0] = 1; + } + + return TRUE; +} /* op_setup() */ + +__BEGIN_DECLS + +/* Registrar */ +DEF_OP_REG + ( + /* op_name */ NMS, + /* init */ NULL, + /* compute */ op_compute, + /* deinit */ vsi_nn_op_common_deinit, + /* check */ op_check, + /* setup */ op_setup, + /* optimize */ NULL, + /* input_num */ _INPUT_NUM, + /* output_num */ _OUTPUT_NUM + ); + +__END_DECLS diff --git a/src/tim/vx/internal/src/ops/vsi_nn_op_one_hot.c b/src/tim/vx/internal/src/ops/vsi_nn_op_one_hot.c new file mode 100644 index 0000000..bddcc12 --- /dev/null +++ b/src/tim/vx/internal/src/ops/vsi_nn_op_one_hot.c @@ -0,0 +1,176 @@ +/**************************************************************************** +* +* Copyright (c) 2020 Vivante Corporation +* +* Permission is hereby granted, free of charge, to any person obtaining a +* copy of this software and associated documentation files (the "Software"), +* to deal in the Software without restriction, including without limitation +* the rights to use, copy, modify, merge, publish, distribute, sublicense, +* and/or sell copies of the Software, and to permit persons to whom the +* Software is furnished to do so, subject to the following conditions: +* +* The above copyright notice and this permission notice shall be included in +* all copies or substantial portions of the Software. +* +* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +* AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING +* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER +* DEALINGS IN THE SOFTWARE. +* +*****************************************************************************/ + + +#include +#include + +#include "vsi_nn_types.h" +#include "vsi_nn_log.h" +#include "vsi_nn_node.h" +#include "vsi_nn_prv.h" +#include "vsi_nn_ops.h" +#include "vsi_nn_tensor.h" +#include "utils/vsi_nn_util.h" +#include "kernel/vsi_nn_kernel.h" +#include "utils/vsi_nn_constraint_check.h" +/* + Declare number of input and output. 
+ */ +#define _INPUT_NUM (1) +#define _OUTPUT_NUM (1) + +static vsi_status op_compute + ( + vsi_nn_node_t * self, + vsi_nn_tensor_t ** inputs, + vsi_nn_tensor_t ** outputs + ) +{ + vsi_status status = VSI_FAILURE; + vsi_nn_kernel_param_t * param = NULL; + + param = vsi_nn_kernel_param_create(); + + vsi_nn_kernel_param_add_int32( param, "depth", self->nn_param.one_hot.depth ); + vsi_nn_kernel_param_add_float32( param, "on_value", self->nn_param.one_hot.on_value ); + vsi_nn_kernel_param_add_float32( param, "off_value", self->nn_param.one_hot.off_value ); + vsi_nn_kernel_param_add_int32( param, "axis", self->nn_param.one_hot.axis ); + + self->n = (vx_node)vsi_nn_kernel_selector( self->graph, "one_hot", + inputs, 1, outputs, 1, param ); + + if( self->n ) + { + status = VSI_SUCCESS; + } + + vsi_nn_kernel_param_release( ¶m ); + + return status; +} /* op_compute() */ + +static vsi_bool op_check + ( + vsi_nn_node_t * self, + vsi_nn_tensor_t ** inputs, + vsi_nn_tensor_t ** outputs + ) +{ + BEGIN_IO_TYPE_DECL(ONE_HOT, 1, 1) + /* IO_TYPE(INPUT, OUTPUT) */ + IO_TYPE(D_F32, D_F32) + IO_TYPE(D_F32, D_F16) + + IO_TYPE(D_F16, D_F32) + IO_TYPE(D_F16, D_F16) + IO_TYPE(D_F16, D_U8|Q_ASYM) + IO_TYPE(D_F16, D_I8|Q_DFP) + IO_TYPE(D_F16, D_I16|Q_DFP) + + IO_TYPE(D_I32, D_F32) + IO_TYPE(D_I32, D_F16) + IO_TYPE(D_I32, D_U8|Q_ASYM) + IO_TYPE(D_I32, D_I8|Q_DFP) + IO_TYPE(D_I32, D_I16|Q_DFP) + IO_TYPE(D_I32, D_I32) + + IO_TYPE(D_U8|Q_ASYM, D_U8|Q_ASYM) + IO_TYPE(D_U8|Q_ASYM, D_F16) + + IO_TYPE(D_I8|Q_ASYM, D_I8|Q_ASYM) + IO_TYPE(D_I8|Q_ASYM, D_F16) + + IO_TYPE(D_I8|Q_DFP, D_I8|Q_DFP) + IO_TYPE(D_I8|Q_DFP, D_F16) + + IO_TYPE(D_I16|Q_DFP, D_I16|Q_DFP) + IO_TYPE(D_I16|Q_DFP, D_F16) + END_IO_TYPE_DECL(ONE_HOT) + if (!VALIDATE_OP_IO_TYPES(ONE_HOT, self, inputs, self->input.num, outputs, self->output.num)) + { + char* desc = generate_op_io_types_desc(inputs, + self->input.num, outputs, self->output.num); + VSILOGE("Inputs/Outputs data type not support: %s", desc); + destroy_op_io_types_desc(desc); + return FALSE; + } + + return TRUE; +} /* op_check() */ + +static vsi_bool op_setup + ( + vsi_nn_node_t * self, + vsi_nn_tensor_t ** inputs, + vsi_nn_tensor_t ** outputs + ) +{ + vsi_nn_one_hot_param* p = &self->nn_param.one_hot; + int32_t i = 0; + int32_t axis = p->axis; + int32_t depth = p->depth; + + if ( VSI_NN_DIM_AUTO == outputs[0]->attr.dim_num ) + { + outputs[0]->attr.dim_num = inputs[0]->attr.dim_num + 1; + axis = (axis == -1) ? 
0 : axis; + + for (i = 0; i < (int32_t)outputs[0]->attr.dim_num; i++) + { + if ( i < axis) + { + outputs[0]->attr.size[i] = inputs[0]->attr.size[i]; + } + else if ( i == axis) + { + outputs[0]->attr.size[i] = depth; + } + else + { + outputs[0]->attr.size[i] = inputs[0]->attr.size[i - 1]; + } + } + } + + return TRUE; +} /* op_setup() */ + +__BEGIN_DECLS + +/* Registrar */ +DEF_OP_REG + ( + /* op_name */ ONE_HOT, + /* init */ NULL, + /* compute */ op_compute, + /* deinit */ vsi_nn_op_common_deinit, + /* check */ op_check, + /* setup */ op_setup, + /* optimize */ NULL, + /* input_num */ _INPUT_NUM, + /* output_num */ _OUTPUT_NUM + ); + +__END_DECLS diff --git a/src/tim/vx/internal/src/ops/vsi_nn_op_pool.c b/src/tim/vx/internal/src/ops/vsi_nn_op_pool.c index 5ec4a6c..d90d7a2 100644 --- a/src/tim/vx/internal/src/ops/vsi_nn_op_pool.c +++ b/src/tim/vx/internal/src/ops/vsi_nn_op_pool.c @@ -22,6 +22,7 @@ * *****************************************************************************/ #include +#include #include "vsi_nn_types.h" #include "vsi_nn_platform.h" @@ -35,6 +36,32 @@ #include "utils/vsi_nn_util.h" #include "utils/vsi_nn_constraint_check.h" +static vsi_bool _is_pool1d + ( + vsi_nn_node_t * self, + vsi_nn_tensor_t ** inputs + ) +{ + /* + support pool1d from version 1.1.31 + */ + if (vsi_nn_compareVersion(self->graph, 1, 1, 31) == -1) + { + return FALSE; + } + else + { + if ( 3 == inputs[0]->attr.dim_num ) + { + return TRUE; + } + else + { + return FALSE; + } + } +} + static vsi_status op_compute ( vsi_nn_node_t * self, @@ -44,26 +71,53 @@ static vsi_status op_compute { vsi_status status; vx_nn_pooling_params_ext_t params; + vsi_nn_tensor_t * tmp_inputs[1] = {NULL}; + vsi_nn_tensor_t * tmp_outputs[1] = {NULL}; + vsi_nn_pool_lcl_data *local = self->nn_param.pool.local; + status = VSI_FAILURE; memset( ¶ms, 0, sizeof( params ) ); - params.base.pool_type = self->nn_param.pool.type; - params.base.pool_size_x = self->nn_param.pool.ksize[0]; - params.base.pool_size_y = self->nn_param.pool.ksize[1]; - params.base.pool_pad_x_left = self->nn_param.pool.pad[0]; - params.base.pool_pad_x_right = self->nn_param.pool.pad[1]; - params.base.pool_pad_y_top = self->nn_param.pool.pad[2]; - params.base.pool_pad_y_bottom = self->nn_param.pool.pad[3]; - params.base.rounding = self->vx_param.down_scale_size_rounding; - params.stride_x = self->nn_param.pool.stride[0]; - params.stride_y = self->nn_param.pool.stride[1]; + if(_is_pool1d(self, inputs)) + { + // pool1d + tmp_inputs[0] = local->reshaped_input; + tmp_outputs[0] = local->reshaped_output; + + params.base.pool_type = self->nn_param.pool.type; + params.base.pool_size_x = self->nn_param.pool.ksize[0]; + params.base.pool_size_y = 1; + params.base.pool_pad_x_left = self->nn_param.pool.pad[0]; + params.base.pool_pad_x_right = self->nn_param.pool.pad[1]; + params.base.pool_pad_y_top = 0; + params.base.pool_pad_y_bottom = 0; + params.base.rounding = self->vx_param.down_scale_size_rounding; + params.stride_x = self->nn_param.pool.stride[0]; + params.stride_y = 1; + } + else + { + tmp_inputs[0] = inputs[0]; + tmp_outputs[0] = outputs[0]; + + params.base.pool_type = self->nn_param.pool.type; + params.base.pool_size_x = self->nn_param.pool.ksize[0]; + params.base.pool_size_y = self->nn_param.pool.ksize[1]; + params.base.pool_pad_x_left = self->nn_param.pool.pad[0]; + params.base.pool_pad_x_right = self->nn_param.pool.pad[1]; + params.base.pool_pad_y_top = self->nn_param.pool.pad[2]; + params.base.pool_pad_y_bottom = self->nn_param.pool.pad[3]; + params.base.rounding = 
self->vx_param.down_scale_size_rounding; + params.stride_x = self->nn_param.pool.stride[0]; + params.stride_y = self->nn_param.pool.stride[1]; + } self->n = vxPoolingLayer2( self->graph->g, - inputs[0]->t, + tmp_inputs[0]->t, (vx_nn_pooling_params_t *)¶ms, sizeof( params ), - outputs[0]->t + tmp_outputs[0]->t ); if( NULL != self->n ) @@ -73,6 +127,65 @@ static vsi_status op_compute return status; } /* op_compute() */ +static vsi_status op_optimize + ( + vsi_nn_node_t * self, + vsi_nn_tensor_t ** inputs, + vsi_nn_tensor_t ** outputs, + vsi_nn_opt_direction_e direction + ) +{ + uint32_t dim = 0; + vsi_nn_pool_lcl_data *local = NULL; + uint32_t shape[VSI_NN_MAX_DIM_NUM]; + char tensor_name[128]; + + dim = inputs[0]->attr.dim_num; + if(FALSE == _is_pool1d(self, inputs)) + { + return VSI_SUCCESS; + } + + VSILOGD("Optimize pool1d %s, uid %u", vsi_nn_OpGetName(self->op), self->uid); + /* + insert a reshape node before and after pool1d + */ + local = self->nn_param.pool.local; + if (VSI_NN_OPTIMIZE_FORWARD == direction) + { + /* reshape 3d input (xcn) --> 4d input (whcn) */ + shape[0] = inputs[0]->attr.size[0];//width + shape[1] = 1;//height + shape[2] = inputs[0]->attr.size[1]; + shape[3] = inputs[0]->attr.size[2]; + dim = 4; + local->reshaped_input = vsi_nn_reshape_tensor(self->graph, inputs[0], shape, dim); + } + else + { + /* reshape 3d output(xcn) --> 4d output(whcn) */ + shape[0] = outputs[0]->attr.size[0];//width + shape[1] = 1;//height + shape[2] = outputs[0]->attr.size[1]; + shape[3] = outputs[0]->attr.size[2]; + dim = 4; + local->reshaped_output = vsi_nn_reshape_tensor(self->graph, outputs[0], shape, dim); + if(local->reshaped_output && local->reshaped_output->t) + { + memset(tensor_name, 0, sizeof(tensor_name)); + snprintf(tensor_name, sizeof(tensor_name), "uid_%u_reshape_out_0", self->uid); + if(vxSetReferenceName((vx_reference)local->reshaped_output->t, tensor_name) == VSI_FAILURE) + { + VSILOGW("Set uid %u pool1d reshaped output name fail", self->uid); + return VSI_FAILURE; + } + } + } + + return VSI_SUCCESS; +} /* op_optimize() */ + + static vsi_bool op_check ( vsi_nn_node_t * self, @@ -119,6 +232,54 @@ static vsi_bool op_check return TRUE; } /* op_check() */ +static vsi_status op_init + ( + vsi_nn_node_t * self + ) +{ + vsi_status status = VSI_SUCCESS; + + self->nn_param.pool.local = + (vsi_nn_pool_lcl_data *)malloc(sizeof(vsi_nn_pool_lcl_data)); + if (NULL == self->nn_param.pool.local) + { + return VX_ERROR_NO_MEMORY; + } + + memset( self->nn_param.pool.local, 0, sizeof(vsi_nn_pool_lcl_data) ); + + self->nn_param.pool.local->reshaped_input = NULL; + self->nn_param.pool.local->reshaped_output = NULL; + + return status; +} /* op_init() */ + +static vsi_status op_deinit + ( + vsi_nn_node_t * self + ) +{ + vsi_nn_pool_param *p = &(self->nn_param.pool); + if(p->local->reshaped_input) + { + vsi_nn_ReleaseTensor(&(p->local->reshaped_input)); + p->local->reshaped_input = NULL; + } + if(p->local->reshaped_output) + { + vsi_nn_ReleaseTensor(&(p->local->reshaped_output)); + p->local->reshaped_output = NULL; + } + if(self->nn_param.pool.local) + { + free(self->nn_param.pool.local); + self->nn_param.pool.local = NULL; + } + vsi_nn_op_common_deinit(self); + + return VSI_SUCCESS; +} /* op_deinit() */ + static vsi_bool op_setup ( vsi_nn_node_t * self, @@ -129,38 +290,69 @@ static vsi_bool op_setup vsi_bool ret; ret = TRUE; - vsi_nn_compute_padding( - inputs[0]->attr.size, - self->nn_param.pool.ksize, - self->nn_param.pool.stride, - NULL, - self->nn_param.pool.pad_type, - self->nn_param.pool.pad - 
); - /* Pooling */ - outputs[0]->attr.size[0] = vsi_nn_ComputeFilterSize - ( - inputs[0]->attr.size[0], - self->nn_param.pool.ksize[0], - &self->nn_param.pool.pad[0], - self->nn_param.pool.stride[0], - 0, - self->nn_param.pool.round_type + if(_is_pool1d(self, inputs)) + { + vsi_nn_compute_padding_conv1d( + inputs[0]->attr.size, + self->nn_param.pool.ksize, + self->nn_param.pool.stride, + NULL, + self->nn_param.pool.pad_type, + self->nn_param.pool.pad ); - outputs[0]->attr.size[1] = vsi_nn_ComputeFilterSize - ( - inputs[0]->attr.size[1], - self->nn_param.pool.ksize[1], - &self->nn_param.pool.pad[2], - self->nn_param.pool.stride[1], - 0, - self->nn_param.pool.round_type + + /* Pooling */ + outputs[0]->attr.size[0] = vsi_nn_ComputeFilterSize + ( + inputs[0]->attr.size[0], + self->nn_param.pool.ksize[0], + &self->nn_param.pool.pad[0], + self->nn_param.pool.stride[0], + 0, + self->nn_param.pool.round_type + ); + + outputs[0]->attr.size[1] = inputs[0]->attr.size[1]; + outputs[0]->attr.size[2] = inputs[0]->attr.size[2]; + } + else + { + vsi_nn_compute_padding( + inputs[0]->attr.size, + self->nn_param.pool.ksize, + self->nn_param.pool.stride, + NULL, + self->nn_param.pool.pad_type, + self->nn_param.pool.pad ); + /* Pooling */ + outputs[0]->attr.size[0] = vsi_nn_ComputeFilterSize + ( + inputs[0]->attr.size[0], + self->nn_param.pool.ksize[0], + &self->nn_param.pool.pad[0], + self->nn_param.pool.stride[0], + 0, + self->nn_param.pool.round_type + ); + + outputs[0]->attr.size[1] = vsi_nn_ComputeFilterSize + ( + inputs[0]->attr.size[1], + self->nn_param.pool.ksize[1], + &self->nn_param.pool.pad[2], + self->nn_param.pool.stride[1], + 0, + self->nn_param.pool.round_type + ); + + outputs[0]->attr.size[2] = inputs[0]->attr.size[2]; + outputs[0]->attr.size[3] = inputs[0]->attr.size[3]; + } + outputs[0]->attr.dim_num = inputs[0]->attr.dim_num; - outputs[0]->attr.size[2] = inputs[0]->attr.size[2]; - outputs[0]->attr.size[3] = inputs[0]->attr.size[3]; if( NULL != outputs[1] ) { outputs[1]->attr.dim_num = outputs[0]->attr.dim_num; @@ -178,12 +370,12 @@ extern "C" { DEF_OP_REG ( /* op_name */ POOL, - /* init */ NULL, + /* init */ op_init, /* compute */ op_compute, - /* deinit */ vsi_nn_op_common_deinit, + /* deinit */ op_deinit, /* check */ op_check, /* setup */ op_setup, - /* optimize */ NULL, + /* optimize */ op_optimize, /* input_num */ 1, /* output_num */ 1 ); diff --git a/src/tim/vx/internal/src/ops/vsi_nn_op_poolwithargmax.c b/src/tim/vx/internal/src/ops/vsi_nn_op_poolwithargmax.c index 6171449..bc8c3de 100644 --- a/src/tim/vx/internal/src/ops/vsi_nn_op_poolwithargmax.c +++ b/src/tim/vx/internal/src/ops/vsi_nn_op_poolwithargmax.c @@ -34,7 +34,7 @@ #include "utils/vsi_nn_util.h" #include "vsi_nn_prv.h" #include "vsi_nn_log.h" -#include "client/vsi_nn_vxkernel.h" +#include "libnnext/vsi_nn_vxkernel.h" #include "kernel/vsi_nn_kernel_eltwise.h" #include "utils/vsi_nn_constraint_check.h" @@ -244,6 +244,7 @@ static vsi_bool op_setup ) { vsi_bool ret = TRUE; + vsi_nn_compute_padding( inputs[0]->attr.size, self->nn_param.pool.ksize, @@ -266,17 +267,6 @@ static vsi_status op_deinit vsi_nn_node_t * self ) { - uint32_t i; - - for (i = 0; i < _VSI_NN_POOLWITHARGMAX_LOCAL_TENSOR_NUM; i++) - { - if (self->nn_param.pool.local.local_tensor[i] != NULL) - { - vxReleaseTensor(&(self->nn_param.pool.local.local_tensor[i])); - self->nn_param.pool.local.local_tensor[i] = NULL; - } - } - vsi_nn_op_common_deinit(self); return VSI_SUCCESS; diff --git a/src/tim/vx/internal/src/ops/vsi_nn_op_post_process.c 
b/src/tim/vx/internal/src/ops/vsi_nn_op_post_process.c index a50c1b3..a198d32 100644 --- a/src/tim/vx/internal/src/ops/vsi_nn_op_post_process.c +++ b/src/tim/vx/internal/src/ops/vsi_nn_op_post_process.c @@ -34,7 +34,7 @@ #include "vsi_nn_ops.h" #include "vsi_nn_tensor.h" #include "vsi_nn_tensor_util.h" -#include "client/vsi_nn_vxkernel.h" +#include "libnnext/vsi_nn_vxkernel.h" #include "utils/vsi_nn_dtype_util.h" #include "vsi_nn_internal_node.h" diff --git a/src/tim/vx/internal/src/ops/vsi_nn_op_pre_process.c b/src/tim/vx/internal/src/ops/vsi_nn_op_pre_process.c index ccb0510..473b900 100644 --- a/src/tim/vx/internal/src/ops/vsi_nn_op_pre_process.c +++ b/src/tim/vx/internal/src/ops/vsi_nn_op_pre_process.c @@ -34,7 +34,7 @@ #include "vsi_nn_ops.h" #include "vsi_nn_tensor.h" #include "vsi_nn_tensor_util.h" -#include "client/vsi_nn_vxkernel.h" +#include "libnnext/vsi_nn_vxkernel.h" #include "vsi_nn_internal_node.h" #include "utils/vsi_nn_util.h" diff --git a/src/tim/vx/internal/src/ops/vsi_nn_op_pre_process_bgra.c b/src/tim/vx/internal/src/ops/vsi_nn_op_pre_process_bgra.c index 4ac9bb1..10b7260 100644 --- a/src/tim/vx/internal/src/ops/vsi_nn_op_pre_process_bgra.c +++ b/src/tim/vx/internal/src/ops/vsi_nn_op_pre_process_bgra.c @@ -34,7 +34,7 @@ #include "vsi_nn_ops.h" #include "vsi_nn_tensor.h" #include "vsi_nn_tensor_util.h" -#include "client/vsi_nn_vxkernel.h" +#include "libnnext/vsi_nn_vxkernel.h" #include "kernel/vsi_nn_kernel.h" #include "utils/vsi_nn_constraint_check.h" @@ -88,6 +88,7 @@ static vsi_bool op_check { BEGIN_IO_TYPE_DECL(PRE_PROCESS_BGRA, 1, 1) IO_TYPE(D_U8|Q_ASYM, D_U8|Q_ASYM) + IO_TYPE(D_U8, D_U8|Q_ASYM) END_IO_TYPE_DECL(PRE_PROCESS_BGRA) if(!VALIDATE_OP_IO_TYPES(PRE_PROCESS_BGRA, self, inputs, self->input.num, outputs, self->output.num)) { char* desc = generate_op_io_types_desc(inputs, diff --git a/src/tim/vx/internal/src/ops/vsi_nn_op_pre_process_gray.c b/src/tim/vx/internal/src/ops/vsi_nn_op_pre_process_gray.c index 80797a2..ebebc54 100644 --- a/src/tim/vx/internal/src/ops/vsi_nn_op_pre_process_gray.c +++ b/src/tim/vx/internal/src/ops/vsi_nn_op_pre_process_gray.c @@ -34,7 +34,7 @@ #include "vsi_nn_ops.h" #include "vsi_nn_tensor.h" #include "vsi_nn_tensor_util.h" -#include "client/vsi_nn_vxkernel.h" +#include "libnnext/vsi_nn_vxkernel.h" #include "kernel/vsi_nn_kernel.h" #include "utils/vsi_nn_constraint_check.h" @@ -87,6 +87,10 @@ static vsi_bool op_check IO_TYPE(D_U8|Q_ASYM, D_I8|Q_DFP) IO_TYPE(D_U8|Q_ASYM, D_I16|Q_DFP) IO_TYPE(D_U8|Q_ASYM, D_F16) + IO_TYPE(D_U8, D_U8|Q_ASYM) + IO_TYPE(D_U8, D_I8|Q_DFP) + IO_TYPE(D_U8, D_I16|Q_DFP) + IO_TYPE(D_U8, D_F16) END_IO_TYPE_DECL(PRE_PROCESS_GRAY) if(!VALIDATE_OP_IO_TYPES(PRE_PROCESS_GRAY, self, inputs, self->input.num, outputs, self->output.num)) { char* desc = generate_op_io_types_desc(inputs, diff --git a/src/tim/vx/internal/src/ops/vsi_nn_op_pre_process_nv12.c b/src/tim/vx/internal/src/ops/vsi_nn_op_pre_process_nv12.c index d754e27..f11ed8e 100644 --- a/src/tim/vx/internal/src/ops/vsi_nn_op_pre_process_nv12.c +++ b/src/tim/vx/internal/src/ops/vsi_nn_op_pre_process_nv12.c @@ -87,6 +87,10 @@ static vsi_bool op_check IO_TYPE(D_U8|Q_ASYM, D_U8|Q_ASYM, D_I8|Q_DFP) IO_TYPE(D_U8|Q_ASYM, D_U8|Q_ASYM, D_I16|Q_DFP) IO_TYPE(D_U8|Q_ASYM, D_U8|Q_ASYM, D_F16) + IO_TYPE(D_U8, D_U8, D_U8|Q_ASYM) + IO_TYPE(D_U8, D_U8, D_I8|Q_DFP) + IO_TYPE(D_U8, D_U8, D_I16|Q_DFP) + IO_TYPE(D_U8, D_U8, D_F16) END_IO_TYPE_DECL(PRE_PROCESS_NV12) if(!VALIDATE_OP_IO_TYPES(PRE_PROCESS_NV12, self, inputs, self->input.num, outputs, self->output.num)) { char* desc = 
generate_op_io_types_desc(inputs, diff --git a/src/tim/vx/internal/src/ops/vsi_nn_op_pre_process_rgb.c b/src/tim/vx/internal/src/ops/vsi_nn_op_pre_process_rgb.c index a31005d..a8ce9be 100644 --- a/src/tim/vx/internal/src/ops/vsi_nn_op_pre_process_rgb.c +++ b/src/tim/vx/internal/src/ops/vsi_nn_op_pre_process_rgb.c @@ -34,7 +34,7 @@ #include "vsi_nn_ops.h" #include "vsi_nn_tensor.h" #include "vsi_nn_tensor_util.h" -#include "client/vsi_nn_vxkernel.h" +#include "libnnext/vsi_nn_vxkernel.h" #include "kernel/vsi_nn_kernel.h" #include "utils/vsi_nn_constraint_check.h" @@ -91,6 +91,10 @@ static vsi_bool op_check IO_TYPE(D_U8|Q_ASYM, D_I8|Q_DFP) IO_TYPE(D_U8|Q_ASYM, D_I16|Q_DFP) IO_TYPE(D_U8|Q_ASYM, D_F16) + IO_TYPE(D_U8, D_U8|Q_ASYM) + IO_TYPE(D_U8, D_I8|Q_DFP) + IO_TYPE(D_U8, D_I16|Q_DFP) + IO_TYPE(D_U8, D_F16) END_IO_TYPE_DECL(PRE_PROCESS_RGB) if(!VALIDATE_OP_IO_TYPES(PRE_PROCESS_RGB, self, inputs, self->input.num, outputs, self->output.num)) { char* desc = generate_op_io_types_desc(inputs, diff --git a/src/tim/vx/internal/src/ops/vsi_nn_op_pre_process_tensor.c b/src/tim/vx/internal/src/ops/vsi_nn_op_pre_process_tensor.c index c1536be..ba50f33 100644 --- a/src/tim/vx/internal/src/ops/vsi_nn_op_pre_process_tensor.c +++ b/src/tim/vx/internal/src/ops/vsi_nn_op_pre_process_tensor.c @@ -34,7 +34,7 @@ #include "vsi_nn_ops.h" #include "vsi_nn_tensor.h" #include "vsi_nn_tensor_util.h" -#include "client/vsi_nn_vxkernel.h" +#include "libnnext/vsi_nn_vxkernel.h" #include "utils/vsi_nn_dtype_util.h" #include "vsi_nn_internal_node.h" diff --git a/src/tim/vx/internal/src/ops/vsi_nn_op_pre_process_yuv420.c b/src/tim/vx/internal/src/ops/vsi_nn_op_pre_process_yuv420.c index 50c2355..4a7eb22 100644 --- a/src/tim/vx/internal/src/ops/vsi_nn_op_pre_process_yuv420.c +++ b/src/tim/vx/internal/src/ops/vsi_nn_op_pre_process_yuv420.c @@ -87,6 +87,10 @@ static vsi_bool op_check IO_TYPE(D_U8|Q_ASYM, D_U8|Q_ASYM, D_U8|Q_ASYM, D_I8|Q_DFP) IO_TYPE(D_U8|Q_ASYM, D_U8|Q_ASYM, D_U8|Q_ASYM, D_I16|Q_DFP) IO_TYPE(D_U8|Q_ASYM, D_U8|Q_ASYM, D_U8|Q_ASYM, D_F16) + IO_TYPE(D_U8, D_U8, D_U8, D_U8|Q_ASYM) + IO_TYPE(D_U8, D_U8, D_U8, D_I8|Q_DFP) + IO_TYPE(D_U8, D_U8, D_U8, D_I16|Q_DFP) + IO_TYPE(D_U8, D_U8, D_U8, D_F16) END_IO_TYPE_DECL(PRE_PROCESS_YUV420) if(!VALIDATE_OP_IO_TYPES(PRE_PROCESS_YUV420, self, inputs, self->input.num, outputs, self->output.num)) { char* desc = generate_op_io_types_desc(inputs, diff --git a/src/tim/vx/internal/src/ops/vsi_nn_op_pre_process_yuv444.c b/src/tim/vx/internal/src/ops/vsi_nn_op_pre_process_yuv444.c index 99a7674..296e245 100644 --- a/src/tim/vx/internal/src/ops/vsi_nn_op_pre_process_yuv444.c +++ b/src/tim/vx/internal/src/ops/vsi_nn_op_pre_process_yuv444.c @@ -87,6 +87,10 @@ static vsi_bool op_check IO_TYPE(D_U8|Q_ASYM, D_U8|Q_ASYM, D_U8|Q_ASYM, D_I8|Q_DFP) IO_TYPE(D_U8|Q_ASYM, D_U8|Q_ASYM, D_U8|Q_ASYM, D_I16|Q_DFP) IO_TYPE(D_U8|Q_ASYM, D_U8|Q_ASYM, D_U8|Q_ASYM, D_F16) + IO_TYPE(D_U8, D_U8, D_U8, D_U8|Q_ASYM) + IO_TYPE(D_U8, D_U8, D_U8, D_I8|Q_DFP) + IO_TYPE(D_U8, D_U8, D_U8, D_I16|Q_DFP) + IO_TYPE(D_U8, D_U8, D_U8, D_F16) END_IO_TYPE_DECL(PRE_PROCESS_YUV444) if(!VALIDATE_OP_IO_TYPES(PRE_PROCESS_YUV444, self, inputs, self->input.num, outputs, self->output.num)) { char* desc = generate_op_io_types_desc(inputs, diff --git a/src/tim/vx/internal/src/ops/vsi_nn_op_prelu.c b/src/tim/vx/internal/src/ops/vsi_nn_op_prelu.c index f51e4b2..07f074e 100644 --- a/src/tim/vx/internal/src/ops/vsi_nn_op_prelu.c +++ b/src/tim/vx/internal/src/ops/vsi_nn_op_prelu.c @@ -132,7 +132,7 @@ static vsi_status _prelu_op_compute 
if (one_rank) { - is_per_channel_alpha = (inputs[1]->attr.dim_num > 2 && alpha_shape == inputs[0]->attr.size[2]); + is_per_channel_alpha = (inputs[1]->attr.dim_num > 2 && alpha_shape == inputs[1]->attr.size[2]); } if (is_per_channel_alpha) diff --git a/src/tim/vx/internal/src/ops/vsi_nn_op_quantized_16bit_lstm.c b/src/tim/vx/internal/src/ops/vsi_nn_op_quantized_16bit_lstm.c index c5bf9d2..4ea879f 100644 --- a/src/tim/vx/internal/src/ops/vsi_nn_op_quantized_16bit_lstm.c +++ b/src/tim/vx/internal/src/ops/vsi_nn_op_quantized_16bit_lstm.c @@ -35,7 +35,7 @@ #include "vsi_nn_ops.h" #include "vsi_nn_tensor.h" #include "vsi_nn_tensor_util.h" -#include "client/vsi_nn_vxkernel.h" +#include "libnnext/vsi_nn_vxkernel.h" #include "vsi_nn_internal_node.h" static vsi_status op_compute diff --git a/src/tim/vx/internal/src/ops/vsi_nn_op_reduce.c b/src/tim/vx/internal/src/ops/vsi_nn_op_reduce.c index 3d01e79..cc07d0a 100644 --- a/src/tim/vx/internal/src/ops/vsi_nn_op_reduce.c +++ b/src/tim/vx/internal/src/ops/vsi_nn_op_reduce.c @@ -33,7 +33,7 @@ #include "utils/vsi_nn_util.h" #include "vsi_nn_prv.h" #include "vsi_nn_log.h" -#include "client/vsi_nn_vxkernel.h" +#include "libnnext/vsi_nn_vxkernel.h" #include "vsi_nn_internal_node.h" #include "kernel/vsi_nn_kernel_gpu_shape_optimize.h" diff --git a/src/tim/vx/internal/src/ops/vsi_nn_op_reduce_internal.c b/src/tim/vx/internal/src/ops/vsi_nn_op_reduce_internal.c index 41cc43b..1bcc83f 100644 --- a/src/tim/vx/internal/src/ops/vsi_nn_op_reduce_internal.c +++ b/src/tim/vx/internal/src/ops/vsi_nn_op_reduce_internal.c @@ -322,6 +322,8 @@ static vsi_bool op_check_reducemax_internal IO_TYPE(D_F16, D_F16) IO_TYPE(D_F32, D_F32) IO_TYPE(D_I32, D_I32) + IO_TYPE(D_I32, D_I16) + IO_TYPE(D_I16, D_I32) IO_TYPE(D_U8|Q_ASYM, D_U8|Q_ASYM) IO_TYPE(D_U8|Q_ASYM, D_F16) IO_TYPE(D_I8|Q_DFP, D_I8|Q_DFP) diff --git a/src/tim/vx/internal/src/ops/vsi_nn_op_reducesum_internal.c b/src/tim/vx/internal/src/ops/vsi_nn_op_reducesum_internal.c index 3749f8a..a77d54e 100644 --- a/src/tim/vx/internal/src/ops/vsi_nn_op_reducesum_internal.c +++ b/src/tim/vx/internal/src/ops/vsi_nn_op_reducesum_internal.c @@ -34,7 +34,7 @@ #include "vsi_nn_ops.h" #include "vsi_nn_tensor.h" #include "vsi_nn_tensor_util.h" -#include "client/vsi_nn_vxkernel.h" +#include "libnnext/vsi_nn_vxkernel.h" #include "kernel/vsi_nn_kernel_gpu_shape_optimize.h" static vsi_status op_compute diff --git a/src/tim/vx/internal/src/ops/vsi_nn_op_relational_ops.c b/src/tim/vx/internal/src/ops/vsi_nn_op_relational_ops.c index 21e9bf3..2f08f5e 100644 --- a/src/tim/vx/internal/src/ops/vsi_nn_op_relational_ops.c +++ b/src/tim/vx/internal/src/ops/vsi_nn_op_relational_ops.c @@ -145,6 +145,21 @@ static vsi_bool op_check IO_TYPE(D_BOOL8, D_BOOL8, D_BOOL8) IO_TYPE(D_F32, D_F32, D_BOOL8) IO_TYPE(D_I32, D_I32, D_BOOL8) + + IO_TYPE(D_F16, D_F16, D_I8) + IO_TYPE(D_F16, D_I16|Q_DFP, D_I8) + IO_TYPE(D_F16, D_I8|Q_DFP, D_I8) + IO_TYPE(D_F16, D_U8|Q_ASYM, D_I8) + IO_TYPE(D_I16|Q_DFP, D_I16|Q_DFP, D_I8) + IO_TYPE(D_I16|Q_DFP, D_F16, D_I8) + IO_TYPE(D_I8|Q_DFP, D_I8|Q_DFP, D_I8) + IO_TYPE(D_I8|Q_DFP, D_F16, D_I8) + IO_TYPE(D_U8|Q_ASYM, D_U8|Q_ASYM, D_I8) + IO_TYPE(D_U8|Q_ASYM, D_F16, D_I8) + IO_TYPE(D_BF16, D_BF16, D_I8) + IO_TYPE(D_BOOL8, D_BOOL8, D_I8) + IO_TYPE(D_F32, D_F32, D_I8) + IO_TYPE(D_I32, D_I32, D_I8) END_IO_TYPE_DECL(RELATIONAL_OPS) if(!VALIDATE_OP_IO_TYPES(RELATIONAL_OPS, self, inputs, self->input.num, outputs, self->output.num)) { char* desc = generate_op_io_types_desc(inputs, diff --git a/src/tim/vx/internal/src/ops/vsi_nn_op_relu_keras.c 
b/src/tim/vx/internal/src/ops/vsi_nn_op_relu_keras.c index 5b312cb..8c40d42 100644 --- a/src/tim/vx/internal/src/ops/vsi_nn_op_relu_keras.c +++ b/src/tim/vx/internal/src/ops/vsi_nn_op_relu_keras.c @@ -34,7 +34,7 @@ #include "vsi_nn_ops.h" #include "vsi_nn_tensor.h" #include "vsi_nn_tensor_util.h" -#include "client/vsi_nn_vxkernel.h" +#include "libnnext/vsi_nn_vxkernel.h" #include "vsi_nn_internal_node.h" #include "utils/vsi_nn_util.h" diff --git a/src/tim/vx/internal/src/ops/vsi_nn_op_relu_keras_internal.c b/src/tim/vx/internal/src/ops/vsi_nn_op_relu_keras_internal.c index 8c2c914..aa5e8f5 100644 --- a/src/tim/vx/internal/src/ops/vsi_nn_op_relu_keras_internal.c +++ b/src/tim/vx/internal/src/ops/vsi_nn_op_relu_keras_internal.c @@ -34,7 +34,7 @@ #include "vsi_nn_ops.h" #include "vsi_nn_tensor.h" #include "vsi_nn_tensor_util.h" -#include "client/vsi_nn_vxkernel.h" +#include "libnnext/vsi_nn_vxkernel.h" #include "kernel/vsi_nn_kernel.h" #include "utils/vsi_nn_util.h" #include "kernel/vsi_nn_kernel_gpu_shape_optimize.h" diff --git a/src/tim/vx/internal/src/ops/vsi_nn_op_repeat.c b/src/tim/vx/internal/src/ops/vsi_nn_op_repeat.c new file mode 100644 index 0000000..3200fe5 --- /dev/null +++ b/src/tim/vx/internal/src/ops/vsi_nn_op_repeat.c @@ -0,0 +1,340 @@ +/**************************************************************************** +* +* Copyright (c) 2020 Vivante Corporation +* +* Permission is hereby granted, free of charge, to any person obtaining a +* copy of this software and associated documentation files (the "Software"), +* to deal in the Software without restriction, including without limitation +* the rights to use, copy, modify, merge, publish, distribute, sublicense, +* and/or sell copies of the Software, and to permit persons to whom the +* Software is furnished to do so, subject to the following conditions: +* +* The above copyright notice and this permission notice shall be included in +* all copies or substantial portions of the Software. +* +* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +* AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING +* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER +* DEALINGS IN THE SOFTWARE. 
+* +*****************************************************************************/ +#include +#include + +#include "vsi_nn_types.h" +#include "vsi_nn_platform.h" +#include "vsi_nn_graph.h" +#include "vsi_nn_node.h" +#include "utils/vsi_nn_math.h" +#include "vsi_nn_ops.h" +#include "vsi_nn_tensor.h" +#include "vsi_nn_tensor_util.h" +#include "vsi_nn_prv.h" +#include "vsi_nn_log.h" +#include "libnnext/vsi_nn_vxkernel.h" +#include "kernel/vsi_nn_kernel.h" +#include "utils/vsi_nn_constraint_check.h" + +#define _INPUT_NUM (2) +#define _OUTPUT_NUM (1) + +static vsi_status _create_local_tensor + ( + vsi_nn_node_t * self, + vsi_nn_tensor_t ** inputs, + vsi_nn_tensor_t ** outputs + ) +{ + int32_t* repeat_host = self->nn_param.repeat.repeat_host; + int32_t axis = self->nn_param.repeat.axis; + vsi_nn_repeat_lcl_data *local = self->nn_param.repeat.local; + uint32_t shape[VSI_NN_MAX_DIM_NUM] = {1, 1, 1, 1}; + uint32_t i = 0; + + if (axis == -1) + { + axis = 0; + for(i = 0; i < inputs[0]->attr.dim_num; i++) + { + shape[0] *= inputs[0]->attr.size[i]; + } + + local->reshaped_input = vsi_nn_reshape_tensor(self->graph, inputs[0], shape, 1); + + shape[0] = 1; + for(i = 0; i < outputs[0]->attr.dim_num; i++) + { + shape[0] *= outputs[0]->attr.size[i]; + } + local->reshaped_output = vsi_nn_reshape_tensor(self->graph, outputs[0], shape, 1); + } + + if (repeat_host) + { + vsi_nn_tensor_attr_t attr; + int32_t len = 0; + + if (self->nn_param.repeat.axis < 0) + { + len = local->reshaped_input->attr.size[0]; + } + else if (axis == 1 || inputs[0]->attr.dim_num == 1) + { + len = inputs[0]->attr.size[0]; + } + else if (axis == 0) + { + len = inputs[0]->attr.size[1]; + } + else if (axis == 2) + { + len = inputs[0]->attr.size[2]; + } + + memset( &attr, 0, sizeof(vsi_nn_tensor_attr_t) ); + attr.dtype.vx_type = VSI_NN_TYPE_INT32; + attr.is_const = FALSE; + attr.vtl = TRUE; + attr.size[0] = len; + attr.size[1] = 1; + attr.dim_num = 2; + + local->repeat_tensor = vsi_nn_CreateTensorFromData(self->graph, (uint8_t*)repeat_host, &attr); + } + + return VSI_SUCCESS; +} + +static vsi_status op_compute + ( + vsi_nn_node_t * self, + vsi_nn_tensor_t ** inputs, + vsi_nn_tensor_t ** outputs + ) +{ + vsi_status status = VSI_FAILURE; + vsi_nn_kernel_param_t * param = NULL; + vsi_nn_kernel_node_t n = NULL; + int32_t axis = self->nn_param.repeat.axis; + vsi_nn_tensor_t * tmp_inputs[2] = {NULL, NULL}; + vsi_nn_tensor_t * tmp_output[1] = {NULL}; + vsi_nn_repeat_lcl_data *local = self->nn_param.repeat.local; + + status = _create_local_tensor(self, inputs, outputs); + if (status != VSI_SUCCESS) + { + VSILOGE("Create local tensor fail"); + return status; + } + + if (local->reshaped_input) + { + tmp_inputs[0] = local->reshaped_input; + tmp_output[0] = local->reshaped_output; + } + else + { + tmp_inputs[0] = inputs[0]; + tmp_output[0] = outputs[0]; + } + + if (local->repeat_tensor) + { + tmp_inputs[1] = local->repeat_tensor; + } + else + { + tmp_inputs[1] = inputs[1]; + } + + param = vsi_nn_kernel_param_create(); + vsi_nn_kernel_param_add_int32( param, "axis", axis ); + n = vsi_nn_kernel_selector( self->graph, "repeat", + tmp_inputs, _INPUT_NUM, tmp_output, _OUTPUT_NUM, param ); + if ( n != NULL ) + { + self->n = (vx_node)n; + status = VSI_SUCCESS; + } + + if (param != NULL) + { + vsi_nn_kernel_param_release( ¶m ); + } + + return status; +} /* op_compute() */ + +static vsi_bool op_check + ( + vsi_nn_node_t * self, + vsi_nn_tensor_t ** inputs, + vsi_nn_tensor_t ** outputs + ) +{ + vsi_nn_repeat_param * p = NULL; + + BEGIN_IO_TYPE_DECL(REPEAT, 2, 1) 
+ IO_TYPE(D_F16, D_I32, D_F16) + IO_TYPE(D_F32, D_I32, D_F32) + IO_TYPE(D_I32, D_I32, D_I32) + IO_TYPE(D_I8, D_I32, D_I8) + IO_TYPE(D_U8, D_I32, D_U8) + IO_TYPE(D_I16, D_I32, D_I16) + IO_TYPE(D_I8|Q_DFP, D_I32, D_I8|Q_DFP) + IO_TYPE(D_U8|Q_ASYM, D_I32, D_U8|Q_ASYM) + IO_TYPE(D_I16|Q_DFP, D_I32, D_I16|Q_DFP) + END_IO_TYPE_DECL(REPEAT) + if (!VALIDATE_OP_IO_TYPES(REPEAT, self, inputs, self->input.num, outputs, self->output.num)) + { + char* desc = generate_op_io_types_desc(inputs, + self->input.num, outputs, self->output.num); + VSILOGE("Inputs/Outputs data type not support: %s", desc); + destroy_op_io_types_desc(desc); + return FALSE; + } + + p = (vsi_nn_repeat_param *)&(self->nn_param.repeat); + if ((p->repeat_host == NULL && p->maxlen < 1) || p->axis > 3) + { + VSILOGE("Unsupported parameters"); + } + return TRUE; +} /* op_check() */ + +static vsi_bool op_setup + ( + vsi_nn_node_t * self, + vsi_nn_tensor_t ** inputs, + vsi_nn_tensor_t ** outputs + ) +{ + vsi_nn_repeat_param * p = NULL; + int32_t i = 0; + int32_t sum = 0; + int32_t axis = 0; + p = (vsi_nn_repeat_param *)&(self->nn_param.repeat); + axis = p->axis; + + if ( VSI_NN_DIM_AUTO == outputs[0]->attr.dim_num ) + { + outputs[0]->attr.dim_num = inputs[0]->attr.dim_num; + for(i = 0; i < (int32_t)inputs[0]->attr.dim_num; i++) + { + outputs[0]->attr.size[i] = inputs[0]->attr.size[i]; + } + + if (p->repeat_host) + { + for(i = 0; i < p->repeat_len; i++) + { + sum += p->repeat_host[i]; + } + } + else + { + sum = p->maxlen; + } + + if (inputs[0]->attr.dim_num == 1 || axis == -1 || axis == 1) + { + outputs[0]->attr.size[0] = sum; + } + else if (axis == 0) + { + outputs[0]->attr.size[1] = sum; + } + else if (axis == 2) + { + outputs[0]->attr.size[2] = sum; + } + else if (axis == 3) + { + outputs[0]->attr.size[3] = sum; + } + } + + return TRUE; +} /* op_setup() */ + +static vsi_status op_init + ( + vsi_nn_node_t * self + ) +{ + vsi_status status = VSI_SUCCESS; + + self->nn_param.repeat.local = + (vsi_nn_repeat_lcl_data *)malloc(sizeof(vsi_nn_repeat_lcl_data)); + if (NULL == self->nn_param.repeat.local) + { + return VX_ERROR_NO_MEMORY; + } + + memset( self->nn_param.repeat.local, 0, sizeof(vsi_nn_repeat_lcl_data) ); + + self->nn_param.repeat.local->reshaped_input = NULL; + self->nn_param.repeat.local->reshaped_output = NULL; + self->nn_param.repeat.local->repeat_tensor = NULL; + self->nn_param.repeat.repeat_host = NULL; + self->nn_param.repeat.repeat_len = 0; + self->nn_param.repeat.axis = -1; + self->nn_param.repeat.maxlen = -1; + + return status; +} /* op_init() */ + +static vsi_status op_deinit + ( + vsi_nn_node_t * self + ) +{ + vsi_nn_repeat_param *p = &(self->nn_param.repeat); + if (p->local->reshaped_input) + { + vsi_nn_ReleaseTensor(&(p->local->reshaped_input)); + p->local->reshaped_input = NULL; + } + if (p->local->reshaped_output) + { + vsi_nn_ReleaseTensor(&(p->local->reshaped_output)); + p->local->reshaped_output = NULL; + } + if (p->local->repeat_tensor) + { + vsi_nn_ReleaseTensor(&(p->local->repeat_tensor)); + p->local->repeat_tensor = NULL; + } + if (self->nn_param.repeat.local) + { + free(self->nn_param.repeat.local); + self->nn_param.repeat.local = NULL; + } + vsi_nn_op_common_deinit(self); + + return VSI_SUCCESS; +} /* op_deinit() */ + +#ifdef __cplusplus +extern "C" { +#endif +/* Registrar */ +DEF_OP_REG + ( + /* op_name */ REPEAT, + /* init */ op_init, + /* compute */ op_compute, + /* deinit */ op_deinit, + /* check */ op_check, + /* setup */ op_setup, + /* optimize */ NULL, + /* input_num */ _INPUT_NUM, + /* output_num */ 
_OUTPUT_NUM + ); +#ifdef __cplusplus +} +#endif + diff --git a/src/tim/vx/internal/src/ops/vsi_nn_op_resize.c b/src/tim/vx/internal/src/ops/vsi_nn_op_resize.c index 51ea588..255388e 100644 --- a/src/tim/vx/internal/src/ops/vsi_nn_op_resize.c +++ b/src/tim/vx/internal/src/ops/vsi_nn_op_resize.c @@ -41,7 +41,7 @@ #include "utils/vsi_nn_util.h" #include "vsi_nn_prv.h" #include "vsi_nn_log.h" -#include "client/vsi_nn_vxkernel.h" +#include "libnnext/vsi_nn_vxkernel.h" #include "vsi_nn_internal_node.h" #define _ARG_NUM (1) diff --git a/src/tim/vx/internal/src/ops/vsi_nn_op_resize_internal.c b/src/tim/vx/internal/src/ops/vsi_nn_op_resize_internal.c index 0a7f893..bd761d0 100644 --- a/src/tim/vx/internal/src/ops/vsi_nn_op_resize_internal.c +++ b/src/tim/vx/internal/src/ops/vsi_nn_op_resize_internal.c @@ -34,7 +34,7 @@ #include "vsi_nn_ops.h" #include "vsi_nn_tensor.h" #include "vsi_nn_tensor_util.h" -#include "client/vsi_nn_vxkernel.h" +#include "libnnext/vsi_nn_vxkernel.h" #include "kernel/vsi_nn_kernel.h" #include "utils/vsi_nn_util.h" #include "utils/vsi_nn_constraint_check.h" diff --git a/src/tim/vx/internal/src/ops/vsi_nn_op_resize_nearest_internal.c b/src/tim/vx/internal/src/ops/vsi_nn_op_resize_nearest_internal.c index a0e0d48..f46b561 100644 --- a/src/tim/vx/internal/src/ops/vsi_nn_op_resize_nearest_internal.c +++ b/src/tim/vx/internal/src/ops/vsi_nn_op_resize_nearest_internal.c @@ -34,7 +34,7 @@ #include "vsi_nn_ops.h" #include "vsi_nn_tensor.h" #include "vsi_nn_tensor_util.h" -#include "client/vsi_nn_vxkernel.h" +#include "libnnext/vsi_nn_vxkernel.h" #include "kernel/vsi_nn_kernel.h" #include "utils/vsi_nn_util.h" #include "utils/vsi_nn_constraint_check.h" diff --git a/src/tim/vx/internal/src/ops/vsi_nn_op_reverse.c b/src/tim/vx/internal/src/ops/vsi_nn_op_reverse.c index 86b9ad3..a77de72 100644 --- a/src/tim/vx/internal/src/ops/vsi_nn_op_reverse.c +++ b/src/tim/vx/internal/src/ops/vsi_nn_op_reverse.c @@ -1,4 +1,3 @@ - /**************************************************************************** * * Copyright (c) 2020 Vivante Corporation @@ -35,221 +34,33 @@ #include "utils/vsi_nn_util.h" #include "vsi_nn_prv.h" #include "vsi_nn_log.h" -#include "client/vsi_nn_vxkernel.h" +#include "utils/vsi_nn_dtype_util.h" #include "utils/vsi_nn_constraint_check.h" -#define _ARG_NUM (1) #define _INPUT_NUM (1) #define _OUTPUT_NUM (1) #define _IO_NUM (_INPUT_NUM + _OUTPUT_NUM) #define _PARAM_NUM (_ARG_NUM + _IO_NUM) -#define USE_OVX_API TRUE - -#if (USE_OVX_API == FALSE) -extern vx_kernel_description_t * vx_kernel_REVERSE_list[]; - -static void _set_inputs_outputs +static vsi_bool _is_same_quant ( - vx_reference * params, vsi_nn_tensor_t ** inputs, vsi_nn_tensor_t ** outputs ) { - uint32_t i; - uint32_t cnt; + vsi_nn_dtype_t *src_dtype = NULL,*dst_dtype = NULL; - /* Set inputs */ - cnt = 0; - for( i = 0; i < _INPUT_NUM; i ++, cnt ++ ) + src_dtype = &inputs[0]->attr.dtype; + dst_dtype = &outputs[0]->attr.dtype; + + if (vsi_nn_DtypeCompare(src_dtype, dst_dtype) == FALSE) { - params[cnt] = (vx_reference)inputs[i]->t; + return FALSE; } - /* Set outputs */ - for( i = 0; i < _OUTPUT_NUM; i ++, cnt ++ ) - { - params[cnt] = (vx_reference)outputs[i]->t; - } -} /* _set_inputs_outputs() */ + return TRUE; +} /* _is_same_quant */ -static vsi_status _create_params - ( - vsi_nn_node_t * node, - vx_reference * params, - uint32_t num - ) -{ - vsi_status status; - vx_context ctx; - vsi_nn_reverse_param * p; - if( 0 == num ) - { - return VSI_SUCCESS; - } - memset( params, 0, sizeof( vx_reference * ) * num ); - p = 
&node->nn_param.reverse; - ctx = vxGetContext( (vx_reference)node->graph->g ); - /* Init parameters */ -#define _SET_PARAM( i, type, arg ) do{ \ - params[i] = (vx_reference)vxCreateScalar( ctx, type, &p->arg ); \ - status = vxGetStatus( params[i] ); \ - if( VSI_SUCCESS != status ) { \ - goto set_param_error; \ - } \ - } while(0) - _SET_PARAM( 0, VX_TYPE_INT32, axis[0] ); -#undef _SET_PARAM -set_param_error: - - return status; -} /* _create_params */ - -static void _release_params - ( - vx_reference * params, - uint32_t num - ) -{ - uint32_t i; - vx_scalar scalar; - for( i = 0; i < num; i ++ ) - { - scalar = (vx_scalar)params[i]; - vxReleaseScalar( &scalar ); - } -} /* _release_params() */ - -static vsi_status cpu_op_compute - ( - vsi_nn_node_t * self, - vsi_nn_tensor_t ** inputs, - vsi_nn_tensor_t ** outputs - ) -{ - vsi_status status = VSI_SUCCESS; - vx_reference params[_PARAM_NUM]; - vx_reference * args; - - args = ¶ms[_IO_NUM]; - - if( NULL == self->n ) - { - return VSI_FAILURE; - } - - /* Set inputs and outputs */ - _set_inputs_outputs( params, inputs, outputs ); - - /* Init parameters. */ - _create_params( self, args, _ARG_NUM ); - - /* Pass parameters to node. */ - status = vsi_nn_ClientNodePassParameters( self->n, params, _PARAM_NUM ); - - _release_params( args, _ARG_NUM ); - - return status; -} - -static vsi_status op_pre_compute - ( - vsi_nn_node_t * self, - vsi_nn_tensor_t ** inputs, - vsi_nn_tensor_t ** outputs, - vsi_nn_kernel_info_t * kernel_info - ) -{ - vsi_nn_type_e in_dataType = inputs[0]->attr.dtype.vx_type; - vsi_nn_type_e out_dataType = outputs[0]->attr.dtype.vx_type; - uint32_t i; - uint32_t changed_num = 1; - - for( i = self->nn_param.reverse.axis[0] + 1; i < inputs[0]->attr.dim_num; i++ ) - { - changed_num *= inputs[0]->attr.size[inputs[0]->attr.dim_num - 1 - i]; - } - - if ((in_dataType != VSI_NN_TYPE_INT16 || out_dataType != VSI_NN_TYPE_INT16) - && self->nn_param.reverse.axis[0] != 0) - { - VSILOGE("tensorReverse shader unsupport format or axis:%d!\n", - self->nn_param.reverse.axis[0]); - return VSI_FAILURE; - } - else if (changed_num >= 65536) - { - VSILOGE("tensorReverse unsupport change num:%d!\n", changed_num); - return VSI_FAILURE; - } - - kernel_info->kernel_index = 1; - - return VSI_SUCCESS; -} - -static void reshape_tensor_shape - ( - vsi_nn_node_t * self, - vsi_nn_tensor_t * input, - vx_reference * params, - uint32_t index - ) -{ - uint32_t i; - int32_t size[4] = {0}; - int32_t size0[4] = {1, 1, 1, 1}; - uint32_t dims = 2; - - for( i = 0; i < input->attr.dim_num; i++ ) - { - size0[i] = input->attr.size[i]; - } - - size[0] = size0[0] * size0[1] * size0[2]; - size[1] = size0[3]; - - self->nn_param.reverse.local.local_tensor[index] = - vxReshapeTensor(input->t, size, dims); - params[index] = (vx_reference)self->nn_param.reverse.local.local_tensor[index]; -} - -static vsi_status vx_op_compute - ( - vsi_nn_node_t * self, - vsi_nn_tensor_t ** inputs, - vsi_nn_tensor_t ** outputs - ) -{ - vsi_status status = VSI_SUCCESS; - vx_reference params[_PARAM_NUM]; - vx_border_t border; - - if( NULL == self->n ) - { - return VSI_FAILURE; - } - - reshape_tensor_shape(self, inputs[0], params, 0); - reshape_tensor_shape(self, outputs[0], params, 1); - - /* Pass parameters to node. 
*/ - status = vsi_nn_ClientNodePassParameters( self->n, params, _PARAM_NUM ); - - border.mode = VX_BORDER_REPLICATE; - border.constant_value.U32 = 0; - status |= vxSetNodeAttribute(self->n, VX_NODE_BORDER, &border, sizeof(border)); - - return status; -} - -static vsi_nn_op_compute_t op_compute_list[] = -{ - cpu_op_compute, - vx_op_compute, - NULL -}; -#endif static vsi_status op_compute ( vsi_nn_node_t * self, @@ -258,45 +69,30 @@ static vsi_status op_compute ) { vsi_status status = VSI_FAILURE; -#if (USE_OVX_API == TRUE) + vx_nn_tensor_reverse_params_t para; vsi_nn_reverse_param * p; int32_t axes[VSI_NN_MAX_DIM_NUM] = {0}; - p = &self->nn_param.reverse; - memcpy(axes, p->axis, sizeof(int32_t) * p->axis_num); - para.axis = axes; - para.numberOfAxis = p->axis_num; - self->n = vxTensorReverse( self->graph->g, inputs[0]->t, ¶, - sizeof(vx_nn_tensor_reverse_params_t), outputs[0]->t ); - if( NULL != self->n ) - { - status = VSI_SUCCESS; - } -#else - vsi_nn_kernel_info_t kernel_info; - memset(&kernel_info, 0x0, sizeof(vsi_nn_kernel_info_t)); - kernel_info.resource_num = 1; - kernel_info.resource_name = (char **)malloc(kernel_info.resource_num * sizeof(char *)); - kernel_info.resource_name[0] = "vsi_nn_kernel_reverse"; - kernel_info.type = vsi_nn_GetVXKernelTypeForShader(); - kernel_info.kernel = vx_kernel_REVERSE_list; - kernel_info.init_index = 1; - op_pre_compute(self, inputs, outputs, &kernel_info); + if ( _is_same_quant(inputs, outputs) ) + { + p = &self->nn_param.reverse; + memcpy(axes, p->axis, sizeof(int32_t) * p->axis_num); + para.axis = axes; + para.numberOfAxis = p->axis_num; + self->n = vxTensorReverse( self->graph->g, inputs[0]->t, ¶, + sizeof(vx_nn_tensor_reverse_params_t), outputs[0]->t ); + if( NULL != self->n ) + { + status = VSI_SUCCESS; + } - self->n = vsi_nn_RegisterClientKernelAndNewNode( - self->graph, &kernel_info); - if (kernel_info.resource_name) free(kernel_info.resource_name); - if( NULL == self->n ) - { - return VSI_FAILURE; + return status; } - if (NULL != op_compute_list[kernel_info.init_index]) + else { - status = op_compute_list[kernel_info.init_index](self, inputs, outputs); + return vsi_nn_internal_compute_node( self ); } -#endif - return status; } /* op_compute() */ @@ -328,6 +124,10 @@ static vsi_bool op_check IO_TYPE(D_F32, D_F32) IO_TYPE(D_F32, D_BF16) IO_TYPE(D_BF16, D_F32) + IO_TYPE(D_F16, D_I32) + IO_TYPE(D_U8|Q_ASYM, D_I32) + IO_TYPE(D_I8|Q_DFP, D_I32) + IO_TYPE(D_I16|Q_DFP, D_I32) /* HW 9.0 */ IO_TYPE(D_BF16, D_BF16) @@ -347,22 +147,72 @@ static vsi_status op_deinit vsi_nn_node_t * self ) { -#if (USE_OVX_API == FALSE) - uint32_t i; - for (i = 0; i < _VSI_NN_REVERSE_LOCAL_TENSOR_NUM; i++) - { - if (self->nn_param.reverse.local.local_tensor[i] != NULL) - { - vxReleaseTensor(&(self->nn_param.reverse.local.local_tensor[i])); - self->nn_param.reverse.local.local_tensor[i] = NULL; - } - } -#endif + vsi_nn_internal_deinit_node_wksp(self); vsi_nn_op_common_deinit(self); return VSI_SUCCESS; } /* op_deinit() */ +static vsi_status op_optimize + ( + vsi_nn_node_t * self, + vsi_nn_tensor_t ** inputs, + vsi_nn_tensor_t ** outputs, + vsi_nn_opt_direction_e direction + ) +{ + if ( _is_same_quant(inputs, outputs) ) + { + return VSI_SUCCESS; + } + else + { + return vsi_nn_internal_optimize_node(self, direction ); + } +} /* op_optimize() */ + +static vsi_bool op_setup + ( + vsi_nn_node_t * self, + vsi_nn_tensor_t ** inputs, + vsi_nn_tensor_t ** outputs + ) +{ + vsi_bool ret = TRUE; + vsi_nn_internal_node_t* curr = NULL; + + vsi_nn_internal_init_node_wksp(self); + + ret = 
vsi_nn_op_common_setup(self, inputs, outputs); + + if ( _is_same_quant(inputs, outputs) == FALSE ) + { + vsi_nn_internal_tensor_t* output_tensor = NULL; + vsi_nn_tensor_attr_t attr; + int32_t size = sizeof( attr.size ); + + memcpy( &attr, &inputs[0]->attr, sizeof( attr ) ); + memcpy( &attr.size, &outputs[0]->attr.size, size ); + attr.vtl = TRUE; + attr.is_const = FALSE; + output_tensor = vsi_nn_internal_new_tensor( self, &attr, 0.0f ); + + curr = vsi_nn_internal_new_node(self, VSI_NN_OP_REVERSE, 0, 0); + curr->inputs[0] = inputs[0]; + curr->outputs[0] = output_tensor->t; + curr->node->nn_param.reverse.axis = self->nn_param.reverse.axis; + curr->node->nn_param.reverse.axis_num = self->nn_param.reverse.axis_num; + vsi_nn_internal_setup_node(self, curr); + + curr = vsi_nn_internal_new_node(self, VSI_NN_OP_DATACONVERT, 0, 0); + curr->inputs[0] = output_tensor->t; + curr->outputs[0] = outputs[0]; + vsi_nn_internal_setup_node(self, curr); + } + + return ret; +} /* op_setup() */ + #ifdef __cplusplus extern "C" { #endif @@ -374,8 +224,8 @@ DEF_OP_REG /* compute */ op_compute, /* deinit */ op_deinit, /* check */ op_check, - /* setup */ vsi_nn_op_common_setup, - /* optimize */ NULL, + /* setup */ op_setup, + /* optimize */ op_optimize, /* input_num */ _INPUT_NUM, /* output_num */ _OUTPUT_NUM ); diff --git a/src/tim/vx/internal/src/ops/vsi_nn_op_rnncell_ovxlib.c b/src/tim/vx/internal/src/ops/vsi_nn_op_rnncell_ovxlib.c index a6fa7b8..721da3b 100644 --- a/src/tim/vx/internal/src/ops/vsi_nn_op_rnncell_ovxlib.c +++ b/src/tim/vx/internal/src/ops/vsi_nn_op_rnncell_ovxlib.c @@ -35,7 +35,7 @@ #include "vsi_nn_ops.h" #include "vsi_nn_tensor.h" #include "vsi_nn_tensor_util.h" -#include "client/vsi_nn_vxkernel.h" +#include "libnnext/vsi_nn_vxkernel.h" #include "vsi_nn_internal_node.h" #include "vsi_nn_rnn_helper.h" diff --git a/src/tim/vx/internal/src/ops/vsi_nn_op_roi_align.c b/src/tim/vx/internal/src/ops/vsi_nn_op_roi_align.c index 472f994..f754a67 100644 --- a/src/tim/vx/internal/src/ops/vsi_nn_op_roi_align.c +++ b/src/tim/vx/internal/src/ops/vsi_nn_op_roi_align.c @@ -35,7 +35,7 @@ #include "vsi_nn_ops.h" #include "vsi_nn_tensor.h" #include "vsi_nn_tensor_util.h" -#include "client/vsi_nn_vxkernel.h" +#include "libnnext/vsi_nn_vxkernel.h" #include "kernel/vsi_nn_kernel.h" #include "vsi_nn_internal_node.h" #include "kernel/vsi_nn_kernel_gpu_shape_optimize.h" diff --git a/src/tim/vx/internal/src/ops/vsi_nn_op_scale.c b/src/tim/vx/internal/src/ops/vsi_nn_op_scale.c index b7c4056..87a7144 100644 --- a/src/tim/vx/internal/src/ops/vsi_nn_op_scale.c +++ b/src/tim/vx/internal/src/ops/vsi_nn_op_scale.c @@ -37,7 +37,7 @@ #include "utils/vsi_nn_util.h" #include "utils/vsi_nn_link_list.h" #include "utils/vsi_nn_dtype_util.h" -#include "client/vsi_nn_vxkernel.h" +#include "libnnext/vsi_nn_vxkernel.h" #define _INPUT_NUM (3) #define _OUTPUT_NUM (1) diff --git a/src/tim/vx/internal/src/ops/vsi_nn_op_sequence_mask.c b/src/tim/vx/internal/src/ops/vsi_nn_op_sequence_mask.c new file mode 100644 index 0000000..500e676 --- /dev/null +++ b/src/tim/vx/internal/src/ops/vsi_nn_op_sequence_mask.c @@ -0,0 +1,176 @@ +/**************************************************************************** +* +* Copyright (c) 2020 Vivante Corporation +* +* Permission is hereby granted, free of charge, to any person obtaining a +* copy of this software and associated documentation files (the "Software"), +* to deal in the Software without restriction, including without limitation +* the rights to use, copy, modify, merge, publish, distribute, 
sublicense, +* and/or sell copies of the Software, and to permit persons to whom the +* Software is furnished to do so, subject to the following conditions: +* +* The above copyright notice and this permission notice shall be included in +* all copies or substantial portions of the Software. +* +* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +* AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING +* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER +* DEALINGS IN THE SOFTWARE. +* +*****************************************************************************/ +#include +#include + +#include "vsi_nn_types.h" +#include "vsi_nn_log.h" +#include "vsi_nn_node.h" +#include "vsi_nn_prv.h" +#include "vsi_nn_ops.h" +#include "vsi_nn_tensor.h" +#include "utils/vsi_nn_util.h" +#include "kernel/vsi_nn_kernel.h" +#include "utils/vsi_nn_constraint_check.h" + +#define _ARG_NUM (1) +#define _INPUT_NUM (2) +#define _OUTPUT_NUM (1) + +static vsi_status op_compute + ( + vsi_nn_node_t * self, + vsi_nn_tensor_t ** inputs, + vsi_nn_tensor_t ** outputs + ) +{ + vsi_status status = VSI_FAILURE; + vsi_nn_kernel_param_t * param = NULL; + vsi_nn_kernel_node_t n = NULL; + int32_t max_len = self->nn_param.sequence_mask.maxlen; + + param =vsi_nn_kernel_param_create(); + + vsi_nn_kernel_param_add_int32( param, "max_len", max_len ); + n = vsi_nn_kernel_selector( self->graph, "sequence_mask", inputs, 2, outputs, 1, param ); + if ( n != NULL ) + { + self->n = (vx_node)n; + status = VSI_SUCCESS; + } + + if (param != NULL) + { + vsi_nn_kernel_param_release( ¶m ); + } + + return status; +} /* op_compute() */ + +static vsi_bool op_check + ( + vsi_nn_node_t * self, + vsi_nn_tensor_t ** inputs, + vsi_nn_tensor_t ** outputs + ) +{ + vsi_nn_sequence_mask_param * p = NULL; + + BEGIN_IO_TYPE_DECL(SEQUENCE_MASK, 2, 1) + IO_TYPE(D_U8|Q_ASYM, D_I32, D_U8|Q_ASYM) + IO_TYPE(D_I8|Q_DFP, D_I32, D_I8|Q_DFP) + IO_TYPE(D_I16|Q_DFP, D_I32, D_I16|Q_DFP) + IO_TYPE(D_U8, D_I32, D_U8) + IO_TYPE(D_I8, D_I32, D_I8) + IO_TYPE(D_I16, D_I32, D_I16) + IO_TYPE(D_F16, D_I32, D_F16) + IO_TYPE(D_I32, D_I32, D_U8|Q_ASYM) + IO_TYPE(D_I32, D_I32, D_BOOL8) + IO_TYPE(D_I16, D_I32, D_BOOL8) + IO_TYPE(D_I16, D_I16, D_BOOL8) + IO_TYPE(D_I8, D_I32, D_BOOL8) + IO_TYPE(D_I8, D_I16, D_BOOL8) + IO_TYPE(D_U8, D_I32, D_BOOL8) + IO_TYPE(D_U8, D_I16, D_BOOL8) + IO_TYPE(D_F16, D_I32, D_BOOL8) + IO_TYPE(D_F16, D_I16, D_BOOL8) + IO_TYPE(D_I32, D_I32, D_I32) + IO_TYPE(D_I32, D_I32, D_I32|Q_DFP) + IO_TYPE(D_I32, D_I32, D_F32) + + IO_TYPE(D_U8|Q_ASYM, D_NONE, D_U8|Q_ASYM) + IO_TYPE(D_I32, D_NONE, D_U8|Q_ASYM) + IO_TYPE(D_I32, D_NONE, D_BOOL8) + END_IO_TYPE_DECL(SEQUENCE_MASK) + if (!VALIDATE_OP_IO_TYPES(SEQUENCE_MASK, self, inputs, self->input.num, outputs, self->output.num)) { + char* desc = generate_op_io_types_desc(inputs, + self->input.num, outputs, self->output.num); + VSILOGE("Inputs/Outputs data type not support: %s", desc); + destroy_op_io_types_desc(desc); + return FALSE; + } + + p = &(self->nn_param.sequence_mask); + if (p->maxlen < 0) + { + VSILOGE("Max length must bigger than 1"); + return FALSE; + } + + return TRUE; +} /* op_check() */ + +static vsi_bool op_setup + ( + vsi_nn_node_t * self, + vsi_nn_tensor_t ** inputs, + vsi_nn_tensor_t ** outputs + ) +{ + /* TODO: Add code to 
comput outputs' shape. */ + uint32_t i = 0; + vsi_nn_sequence_mask_param * p = NULL; + + if ( VSI_NN_DIM_AUTO == outputs[0]->attr.dim_num ) + { + p = &(self->nn_param.sequence_mask); + outputs[0]->attr.dim_num = inputs[0]->attr.dim_num + 1; + outputs[0]->attr.size[0] = p->maxlen; + for (i = 0; i < (uint32_t)inputs[0]->attr.dim_num; i++) + { + outputs[0]->attr.size[i+1] = inputs[0]->attr.size[i]; + } + } + return TRUE; +} /* op_setup() */ + +static vsi_status op_deinit + ( + vsi_nn_node_t * self + ) +{ + vsi_nn_op_common_deinit(self); + + return VSI_SUCCESS; +} /* op_deinit() */ + +#ifdef __cplusplus +extern "C" { +#endif +/* Registrar */ +DEF_OP_REG + ( + /* op_name */ SEQUENCE_MASK, + /* init */ NULL, + /* compute */ op_compute, + /* deinit */ op_deinit, + /* check */ op_check, + /* setup */ op_setup, + /* optimize */ NULL, + /* input_num */ _INPUT_NUM, + /* output_num */ _OUTPUT_NUM + ); +#ifdef __cplusplus +} +#endif diff --git a/src/tim/vx/internal/src/ops/vsi_nn_op_shufflechannel.c b/src/tim/vx/internal/src/ops/vsi_nn_op_shufflechannel.c index b87e1e6..b45f405 100644 --- a/src/tim/vx/internal/src/ops/vsi_nn_op_shufflechannel.c +++ b/src/tim/vx/internal/src/ops/vsi_nn_op_shufflechannel.c @@ -35,7 +35,7 @@ #include "vsi_nn_prv.h" #include "vsi_nn_log.h" #include "vsi_nn_test.h" -#include "client/vsi_nn_vxkernel.h" +#include "libnnext/vsi_nn_vxkernel.h" #include "utils/vsi_nn_constraint_check.h" #define _ARG_NUM (2) diff --git a/src/tim/vx/internal/src/ops/vsi_nn_op_signalframe.c b/src/tim/vx/internal/src/ops/vsi_nn_op_signalframe.c index d51f294..432970b 100644 --- a/src/tim/vx/internal/src/ops/vsi_nn_op_signalframe.c +++ b/src/tim/vx/internal/src/ops/vsi_nn_op_signalframe.c @@ -34,7 +34,7 @@ #include "vsi_nn_tensor_util.h" #include "vsi_nn_prv.h" #include "vsi_nn_log.h" -#include "client/vsi_nn_vxkernel.h" +#include "libnnext/vsi_nn_vxkernel.h" #include "utils/vsi_nn_constraint_check.h" #define _ARG_NUM (5) diff --git a/src/tim/vx/internal/src/ops/vsi_nn_op_slice.c b/src/tim/vx/internal/src/ops/vsi_nn_op_slice.c index ff7ea13..b5ef3e5 100644 --- a/src/tim/vx/internal/src/ops/vsi_nn_op_slice.c +++ b/src/tim/vx/internal/src/ops/vsi_nn_op_slice.c @@ -34,7 +34,7 @@ #include "vsi_nn_tensor_util.h" #include "vsi_nn_prv.h" #include "vsi_nn_log.h" -#include "client/vsi_nn_vxkernel.h" +#include "kernel/vsi_nn_kernel.h" #include "vsi_nn_internal_node.h" #include "utils/vsi_nn_constraint_check.h" @@ -52,7 +52,24 @@ static vsi_status op_compute vsi_nn_tensor_t ** outputs ) { - return vsi_nn_internal_compute_node( self ); + if (self->input.num > 1) + { + vsi_status status = VSI_FAILURE; + + self->n = (vx_node)vsi_nn_kernel_selector( self->graph, "slice", + inputs, 2, outputs, _OUTPUT_NUM, NULL ); + + if( self->n ) + { + status = VSI_SUCCESS; + } + + return status; + } + else + { + return vsi_nn_internal_compute_node( self ); + } } /* op_compute() */ static vsi_bool op_check @@ -64,7 +81,39 @@ static vsi_bool op_check { vsi_bool ret = FALSE; - ret = vsi_nn_OpCheck(VSI_NN_OP_STRIDED_SLICE, self, inputs, outputs); + if (self->input.num > 1) + { + BEGIN_IO_TYPE_DECL(SLICE, 2, 1) + IO_TYPE(D_F16, D_I32, D_F16) + IO_TYPE(D_F16, D_I32, D_I8|Q_DFP) + IO_TYPE(D_F16, D_I32, D_I16|Q_DFP) + IO_TYPE(D_F16, D_I32, D_U8|Q_ASYM) + IO_TYPE(D_I8|Q_DFP, D_I32, D_F16) + IO_TYPE(D_I16|Q_DFP, D_I32, D_F16) + IO_TYPE(D_U8|Q_ASYM, D_I32, D_F16) + IO_TYPE(D_I8|Q_DFP, D_I32, D_I8|Q_DFP) + IO_TYPE(D_I16|Q_DFP, D_I32, D_I16|Q_DFP) + IO_TYPE(D_U8|Q_ASYM, D_I32, D_U8|Q_ASYM) + IO_TYPE(D_F32, D_I32, D_F32) + IO_TYPE(D_I32, 
D_I32, D_I32) + + /* HW 9.0 */ + IO_TYPE(D_BF16, D_I32, D_BF16) + END_IO_TYPE_DECL(SLICE) + if (!VALIDATE_OP_IO_TYPES(SLICE, self, inputs, self->input.num, outputs, self->output.num)) + { + char* desc = generate_op_io_types_desc(inputs, + self->input.num, outputs, self->output.num); + VSILOGE("Inputs/Outputs data type not support: %s", desc); + destroy_op_io_types_desc(desc); + return FALSE; + } + return TRUE; + } + else + { + ret = vsi_nn_OpCheck(VSI_NN_OP_STRIDED_SLICE, self, inputs, outputs); + } return ret; } /* op_check() */ @@ -77,7 +126,14 @@ static vsi_status op_optimize vsi_nn_opt_direction_e direction ) { - return vsi_nn_internal_optimize_node( self, direction ); + if (self->input.num > 1) + { + return VSI_SUCCESS; + } + else + { + return vsi_nn_internal_optimize_node( self, direction ); + } } static vsi_bool op_setup @@ -90,15 +146,16 @@ static vsi_bool op_setup vsi_nn_slice_param * p; vsi_nn_internal_node_t* curr = NULL; uint32_t i; - if(self->nn_param.slice.dims == 0) + + if (self->nn_param.slice.dims == 0) { self->nn_param.slice.dims = inputs[0]->attr.dim_num; } - p = (vsi_nn_slice_param *)&(self->nn_param.slice); vsi_nn_internal_init_node_wksp( self ); + p = (vsi_nn_slice_param *)&(self->nn_param.slice); - if( VSI_NN_DIM_AUTO == outputs[0]->attr.dim_num ) + if ( VSI_NN_DIM_AUTO == outputs[0]->attr.dim_num ) { for(i = 0; i < p->dims; i++) { @@ -107,6 +164,11 @@ static vsi_bool op_setup outputs[0]->attr.dim_num = p->dims; } + if (self->input.num > 1) + { + return TRUE; + } + for (i = 0; i < self->nn_param.slice.dims; i++) { p->lcl_data->begin_dims[i] = self->nn_param.slice.start[i]; @@ -124,6 +186,7 @@ static vsi_bool op_setup curr->node->nn_param.strided_slice.begin_mask = 0; curr->node->nn_param.strided_slice.end_mask = 0; curr->node->nn_param.strided_slice.shrink_axis_mask = 0; + curr->node->nn_param.strided_slice.new_axis_mask = 0; curr->inputs[0] = inputs[0]; curr->outputs[0] = outputs[0]; vsi_nn_internal_setup_node( self, curr ); @@ -143,7 +206,7 @@ static vsi_status op_init p = &(self->nn_param.slice); p->lcl_data = - (vsi_nn_slice_lcl_data *)malloc(sizeof(vsi_nn_slice_lcl_data)); + (vsi_nn_slice_lcl_data *)malloc(sizeof(vsi_nn_slice_lcl_data)); if (NULL == p->lcl_data) { return VX_ERROR_NO_MEMORY; @@ -169,6 +232,7 @@ static vsi_status op_deinit } vsi_nn_internal_deinit_node_wksp( self ); + vsi_nn_op_common_deinit(self); return VSI_SUCCESS; } /* op_deinit() */ @@ -177,20 +241,19 @@ static vsi_status op_deinit #ifdef __cplusplus extern "C" { #endif -/* Registrar */ -DEF_OP_REG - ( - /* op_name */ SLICE, - /* init */ op_init, - /* compute */ op_compute, - /* deinit */ op_deinit, - /* check */ op_check, - /* setup */ op_setup, - /* optimize */ op_optimize, - /* input_num */ 1, - /* output_num */ 1 - ); + /* Registrar */ + DEF_OP_REG + ( + /* op_name */ SLICE, + /* init */ op_init, + /* compute */ op_compute, + /* deinit */ op_deinit, + /* check */ op_check, + /* setup */ op_setup, + /* optimize */ op_optimize, + /* input_num */ 1, + /* output_num */ 1 + ); #ifdef __cplusplus } #endif - diff --git a/src/tim/vx/internal/src/ops/vsi_nn_op_softmax_internal.c b/src/tim/vx/internal/src/ops/vsi_nn_op_softmax_internal.c index 1d7b1b2..4d9d80e 100644 --- a/src/tim/vx/internal/src/ops/vsi_nn_op_softmax_internal.c +++ b/src/tim/vx/internal/src/ops/vsi_nn_op_softmax_internal.c @@ -35,7 +35,7 @@ #include "utils/vsi_nn_util.h" #include "utils/vsi_nn_link_list.h" -#define MAX_SOFTMAX_BATCH 65535 +#define MAX_SOFTMAX_BATCH 65520 static vsi_bool _need_split_softmax ( diff --git 
a/src/tim/vx/internal/src/ops/vsi_nn_op_space2depth.c b/src/tim/vx/internal/src/ops/vsi_nn_op_space2depth.c index 3a2aea3..e0ef8c7 100644 --- a/src/tim/vx/internal/src/ops/vsi_nn_op_space2depth.c +++ b/src/tim/vx/internal/src/ops/vsi_nn_op_space2depth.c @@ -35,7 +35,7 @@ #include "utils/vsi_nn_util.h" #include "utils/vsi_nn_dtype_util.h" #include "utils/vsi_nn_math.h" -#include "client/vsi_nn_vxkernel.h" +#include "libnnext/vsi_nn_vxkernel.h" #include "libnnext/vx_lib_nnext.h" #include "vsi_nn_test.h" #include "utils/vsi_nn_constraint_check.h" diff --git a/src/tim/vx/internal/src/ops/vsi_nn_op_spatial_transformer.c b/src/tim/vx/internal/src/ops/vsi_nn_op_spatial_transformer.c index 86e3e4f..c514fbf 100644 --- a/src/tim/vx/internal/src/ops/vsi_nn_op_spatial_transformer.c +++ b/src/tim/vx/internal/src/ops/vsi_nn_op_spatial_transformer.c @@ -34,7 +34,7 @@ #include "vsi_nn_ops.h" #include "vsi_nn_tensor.h" #include "vsi_nn_tensor_util.h" -#include "client/vsi_nn_vxkernel.h" +#include "libnnext/vsi_nn_vxkernel.h" #include "utils/vsi_nn_dtype_util.h" #define _ARG_NUM (2) diff --git a/src/tim/vx/internal/src/ops/vsi_nn_op_split.c b/src/tim/vx/internal/src/ops/vsi_nn_op_split.c index 7fa6eee..831570f 100644 --- a/src/tim/vx/internal/src/ops/vsi_nn_op_split.c +++ b/src/tim/vx/internal/src/ops/vsi_nn_op_split.c @@ -135,13 +135,31 @@ static vsi_bool op_check { BEGIN_IO_TYPE_DECL(SPLIT, 1, 1) IO_TYPE(D_F16, D_F16) + IO_TYPE(D_F16, D_I8|Q_DFP) + IO_TYPE(D_F16, D_I16|Q_DFP) + IO_TYPE(D_F16, D_U8|Q_ASYM) + IO_TYPE(D_I8|Q_DFP, D_F16) + IO_TYPE(D_I16|Q_DFP, D_F16) + IO_TYPE(D_U8|Q_ASYM, D_F16) IO_TYPE(D_I8|Q_DFP, D_I8|Q_DFP) IO_TYPE(D_I16|Q_DFP, D_I16|Q_DFP) IO_TYPE(D_U8|Q_ASYM, D_U8|Q_ASYM) + IO_TYPE(D_F16, D_I8) + IO_TYPE(D_F16, D_I16) + IO_TYPE(D_F16, D_U8) + IO_TYPE(D_I8, D_F16) + IO_TYPE(D_I16, D_F16) + IO_TYPE(D_U8, D_F16) + IO_TYPE(D_I8, D_I8) + IO_TYPE(D_I16, D_I16) + IO_TYPE(D_U8, D_U8) IO_TYPE(D_F32, D_F32) IO_TYPE(D_F32, D_BF16) IO_TYPE(D_BF16, D_F32) IO_TYPE(D_I32, D_I32) + + /* HW 9.0 */ + IO_TYPE(D_BF16, D_BF16) END_IO_TYPE_DECL(SPLIT) if(!VALIDATE_OP_IO_TYPES(SPLIT, self, inputs, 1, &outputs[i], 1)) { char* desc = generate_op_io_types_desc(inputs, 1, &outputs[i], 1); @@ -197,18 +215,17 @@ static vsi_bool op_setup end[i] = inputs[0]->attr.size[i]; } end[axis] = 0; - for(i = 0; i < num; i++) + for (i = 0; i < num; i++) { - int j; + int32_t j; start[axis] = end[axis]; - if(slices_num == 0) + if (slices_num == 0) end[axis] += average; else end[axis] += slices[i]; - memcpy(&outputs[i]->attr.dtype, &inputs[0]->attr.dtype, sizeof(vsi_nn_dtype_t)); outputs[i]->attr.dim_num = inputs[0]->attr.dim_num; - for(j = 0; j < VSI_NN_MAX_DIM_NUM; j++) + for (j = 0; j < VSI_NN_MAX_DIM_NUM; j++) { outputs[i]->attr.size[j] = inputs[0]->attr.size[j]; } @@ -225,6 +242,7 @@ static vsi_bool op_setup curr->node->nn_param.strided_slice.begin_mask = 0; curr->node->nn_param.strided_slice.end_mask = 0; curr->node->nn_param.strided_slice.shrink_axis_mask = 0; + curr->node->nn_param.strided_slice.new_axis_mask = 0; curr->inputs[0] = inputs[0]; curr->outputs[0] = outputs[i]; vsi_nn_internal_setup_node( self, curr ); diff --git a/src/tim/vx/internal/src/ops/vsi_nn_op_stack.c b/src/tim/vx/internal/src/ops/vsi_nn_op_stack.c index 84e3481..7f47cfa 100644 --- a/src/tim/vx/internal/src/ops/vsi_nn_op_stack.c +++ b/src/tim/vx/internal/src/ops/vsi_nn_op_stack.c @@ -37,7 +37,7 @@ #include "utils/vsi_nn_util.h" #include "utils/vsi_nn_link_list.h" #include "utils/vsi_nn_dtype_util.h" -#include "client/vsi_nn_vxkernel.h" +#include 
"libnnext/vsi_nn_vxkernel.h" #define _ARG_NUM (1) #define _INPUT_NUM VSI_NN_STACK_MAX_INPUTS diff --git a/src/tim/vx/internal/src/ops/vsi_nn_op_strided_slice.c b/src/tim/vx/internal/src/ops/vsi_nn_op_strided_slice.c index fa32a0c..0d84833 100644 --- a/src/tim/vx/internal/src/ops/vsi_nn_op_strided_slice.c +++ b/src/tim/vx/internal/src/ops/vsi_nn_op_strided_slice.c @@ -33,6 +33,7 @@ #include "vsi_nn_ops.h" #include "vsi_nn_tensor.h" #include "vsi_nn_tensor_util.h" +#include "vsi_nn_test.h" #include "utils/vsi_nn_util.h" #include "utils/vsi_nn_dtype_util.h" #include "utils/vsi_nn_constraint_check.h" @@ -106,12 +107,13 @@ static vsi_bool _get_stride_slice_start_stop_stride vsi_nn_tensor_t ** outputs ) { - vx_uint32 i = 0; - vx_int32 int32_value = 0; + int32_t i = 0; + int32_t int32_value = 0; vsi_nn_strided_slice_param *p = &(self->nn_param.strided_slice); int32_t *start = p->lcl2_data->begin_dims; int32_t *stop = p->lcl2_data->end_dims; int32_t *stride = p->lcl2_data->stride_dims; + strided_slice_param* params = &p->lcl2_data->params; for (i = 0; i < VSI_NN_MAX_DIM_NUM; i ++) { @@ -120,36 +122,36 @@ static vsi_bool _get_stride_slice_start_stop_stride stride[i] = 1; } - for (i = 0; i < p->stride_dims_num; ++i) + for (i = 0; i < params->stride_dims_num; ++i) { - stride[i] = p->stride_dims[i]; + stride[i] = params->stride_dims[i]; } - for (i = 0; i < p->begin_dims_num; ++i) + for (i = 0; i < params->begin_dims_num; ++i) { - int32_value = p->begin_dims[i]; + int32_value = params->begin_dims[i]; start[i] = get_slice_axis_value(int32_value, inputs[0]->attr.size[i]); } - for (i = 0; i < p->end_dims_num; ++i) + for (i = 0; i < params->end_dims_num; ++i) { - int32_value = p->end_dims[i]; + int32_value = params->end_dims[i]; stop[i] = get_slice_axis_value(int32_value, inputs[0]->attr.size[i]); } /*if the ith bit of mask is set, the start or stop will be the fullest possible range in that dimension.*/ - for (i = 0; i < inputs[0]->attr.dim_num; i ++) + for (i = 0; i < (int32_t)inputs[0]->attr.dim_num; i ++) { - if (p->begin_mask & (1 << i)) + if (params->begin_mask & (1 << i)) { start[i] = get_slice_mask_start_value(stride[i], inputs[0]->attr.size[i]); } start[i] = vsi_nn_clamp(start[i], 0, (vx_int32)(inputs[0]->attr.size[i] - 1)); - if (p->shrink_axis_mask & (1 << i)) + if (params->shrink_axis_mask & (1 << i)) { stop[i] = start[i] + 1; } @@ -163,7 +165,7 @@ static vsi_bool _get_stride_slice_start_stop_stride } /* reset start stop and stride when output size is 1*/ - for (i = 0; i < outputs[0]->attr.dim_num; i ++) + for (i = 0; i < (int32_t)outputs[0]->attr.dim_num; i ++) { if (outputs[0]->attr.size[i] == 1 && stride[i] < 0) { @@ -174,12 +176,12 @@ static vsi_bool _get_stride_slice_start_stop_stride if (_check_neg_start_end_dims(start, stop, inputs[0]->attr.dim_num)) { - memcpy(start, p->begin_dims, sizeof(int32_t) * p->begin_dims_num); - memcpy(stop, p->end_dims, sizeof(int32_t) * p->end_dims_num); - memcpy(stride, p->stride_dims, sizeof(int32_t) * p->stride_dims_num); - p->lcl2_data->begin_mask = p->begin_mask; - p->lcl2_data->end_mask = p->end_mask; - p->lcl2_data->shrink_axis_mask = p->shrink_axis_mask; + memcpy(start, params->begin_dims, sizeof(int32_t) * params->begin_dims_num); + memcpy(stop, params->end_dims, sizeof(int32_t) * params->end_dims_num); + memcpy(stride, params->stride_dims, sizeof(int32_t) * params->stride_dims_num); + p->lcl2_data->begin_mask = params->begin_mask; + p->lcl2_data->end_mask = params->end_mask; + p->lcl2_data->shrink_axis_mask = params->shrink_axis_mask; } return TRUE; 
@@ -276,6 +278,7 @@ static vsi_status op_compute int32_t *stop_dims = NULL; int32_t *stride_dims = NULL; vsi_nn_strided_slice_lcl_data2 * p = self->nn_param.strided_slice.lcl2_data; + strided_slice_param* params = &p->params; start_dims = p->begin_dims; stop_dims = p->end_dims; @@ -301,12 +304,12 @@ static vsi_status op_compute { uint32_t sizes[VSI_NN_MAX_DIM_NUM] = {1}; uint32_t dims = inputs[0]->attr.dim_num; - int32_t shrink_axis_mask = self->nn_param.strided_slice.shrink_axis_mask; + int32_t shrink_axis_mask = params->shrink_axis_mask; memset(¶m, 0, sizeof(vx_nn_stride_slice_params_t)); memset(&attr, 0, sizeof(attr)); - attr.size[0] = self->nn_param.strided_slice.begin_dims_num; + attr.size[0] = params->begin_dims_num; attr.dim_num = 1; attr.is_const = TRUE; attr.dtype.vx_type = VSI_NN_TYPE_INT32; @@ -325,7 +328,7 @@ static vsi_status op_compute param.begin_dims = REQUIRED_IO(begin_dims_tensor); memset(&attr, 0, sizeof(attr)); - attr.size[0] = self->nn_param.strided_slice.end_dims_num; + attr.size[0] = params->end_dims_num; attr.dim_num = 1; attr.is_const = TRUE; attr.dtype.vx_type = VSI_NN_TYPE_INT32; @@ -344,7 +347,7 @@ static vsi_status op_compute param.end_dims = REQUIRED_IO(end_dims_tensor); memset(&attr, 0, sizeof(attr)); - attr.size[0] = self->nn_param.strided_slice.stride_dims_num; + attr.size[0] = params->stride_dims_num; attr.dim_num = 1; attr.is_const = TRUE; attr.dtype.vx_type = VSI_NN_TYPE_INT32; @@ -461,6 +464,77 @@ static vsi_bool op_check return TRUE; } /* op_check() */ +static vsi_bool _build_strided_slice_params(vsi_nn_strided_slice_param * op_params, int32_t input_dims) +{ + uint32_t i = 0; + int32_t num_add_axis = 0; + int32_t added_ellipsis = 0; + int32_t begin_mask = op_params->begin_mask; + int32_t end_mask = op_params->end_mask; + int32_t shrink_axis_mask = op_params->shrink_axis_mask; + const int32_t *begin_dims = op_params->begin_dims; + const int32_t *end_dims = op_params->end_dims; + const int32_t *stride_dims = op_params->stride_dims; + strided_slice_param *params = &op_params->lcl2_data->params; + + for (i = 0; i < op_params->begin_dims_num; i++) + { + if ( op_params->new_axis_mask & (1 << i)) + { + num_add_axis ++; + } + } + + params->num_add_axis = num_add_axis; + + for (i = 0; i < (uint32_t)(input_dims + num_add_axis); i++) + { + if ( op_params->new_axis_mask & (1 << i) ) + { + continue; + } + else if (i >= op_params->begin_dims_num + added_ellipsis) + { + params->begin_mask |= (1 << params->begin_dims_num); + params->end_mask |= (1 << params->end_dims_num); + params->begin_dims[params->begin_dims_num ++ ] = + 0; + params->end_dims[params->end_dims_num ++] = + 0; + params->stride_dims[params->stride_dims_num ++] = + 1; + } + else + { + int32_t orig_idx = i - added_ellipsis; + + if (begin_mask & (1 << orig_idx)) + { + params->begin_mask |= (1 << params->begin_dims_num); + } + + if (end_mask & (1 << orig_idx)) + { + params->end_mask |= (1 << params->end_dims_num); + } + + if (shrink_axis_mask & (1 << orig_idx)) + { + params->shrink_axis_mask |= (1 << params->begin_dims_num); + } + + params->begin_dims[params->begin_dims_num ++] = + begin_dims[orig_idx]; + params->end_dims[params->end_dims_num ++] = + end_dims[orig_idx]; + params->stride_dims[params->stride_dims_num ++] = + stride_dims[orig_idx]; + } + } + + return TRUE; +} + static vsi_bool op_setup ( vsi_nn_node_t * self, @@ -468,18 +542,26 @@ static vsi_bool op_setup vsi_nn_tensor_t ** outputs ) { - if(self->nn_param.strided_slice.begin_dims_num == 0) + uint32_t i = 0; + vsi_nn_strided_slice_param *p = 
&(self->nn_param.strided_slice); + strided_slice_param *params = &p->lcl2_data->params; + + if ( vsi_nn_compareVersion(self->graph, 1, 1, 32) == -1 + && self->nn_param.strided_slice.begin_dims_num == 0) { self->nn_param.strided_slice.begin_dims_num = inputs[0]->attr.dim_num; self->nn_param.strided_slice.end_dims_num = inputs[0]->attr.dim_num; self->nn_param.strided_slice.stride_dims_num = inputs[0]->attr.dim_num; } + + _build_strided_slice_params(p, inputs[0]->attr.dim_num); + /* TODO: Add code to comput outputs' shape. */ - if( VSI_NN_DIM_AUTO == outputs[0]->attr.dim_num ) + if ( VSI_NN_DIM_AUTO == outputs[0]->attr.dim_num ) { - vsi_nn_strided_slice_param *p = &(self->nn_param.strided_slice); - vx_uint32 i; + int32_t idx = 0; + for (i = 0; i < inputs[0]->attr.dim_num; i++) { vx_int32 begin = 0, end = 1, stride = 1; @@ -487,20 +569,20 @@ static vsi_bool op_setup vx_int32 output_size = 0; vx_int32 j; - begin = get_slice_axis_value(p->begin_dims[i], input_size); - end = get_slice_axis_value(p->end_dims[i], input_size); - stride = p->stride_dims[i]; - if (p->begin_mask & (1 << i)) + begin = get_slice_axis_value(params->begin_dims[i], input_size); + end = get_slice_axis_value(params->end_dims[i], input_size); + stride = params->stride_dims[i]; + if (params->begin_mask & (1 << i)) { begin = get_slice_mask_start_value(stride, input_size); } begin = vsi_nn_clamp(begin, 0, (vx_int32)(input_size - 1)); - if (p->shrink_axis_mask & (1 << i)) + if (params->shrink_axis_mask & (1 << i)) { end = begin + 1; } - if (p->end_mask & (1 << i)) + if (params->end_mask & (1 << i)) { end = get_slice_mask_stop_value(stride, input_size); } @@ -512,11 +594,25 @@ static vsi_bool op_setup outputs[0]->attr.size[i] = output_size; } outputs[0]->attr.dim_num = 0; - for (i = 0; i < inputs[0]->attr.dim_num; i++) + for (idx = 0, i = 0; i < inputs[0]->attr.dim_num + params->num_add_axis; i++) { - if (p->shrink_axis_mask & (1 << i)) continue; + if (p->new_axis_mask & (1 << i)) + { + outputs[0]->attr.size[outputs[0]-> + attr.dim_num] = 1; + + outputs[0]->attr.dim_num++; + continue; + } + else if (params->shrink_axis_mask & (1 << idx)) + { + idx ++; + continue; + } + outputs[0]->attr.size[outputs[0]-> - attr.dim_num] = outputs[0]->attr.size[i]; + attr.dim_num] = outputs[0]->attr.size[idx ++]; + outputs[0]->attr.dim_num++; } } @@ -600,14 +696,16 @@ static vsi_status op_deinit ) { vsi_nn_strided_slice_lcl_data2 * lcl2_data; - + strided_slice_param *params = NULL; if(NULL == self) { return VSI_FAILURE; } lcl2_data = self->nn_param.strided_slice.lcl2_data; - if(self->n) + params = &lcl2_data->params; + + if (self->n) { if( NULL != self && NULL != self->n ) { @@ -616,6 +714,10 @@ static vsi_status op_deinit } } + vsi_nn_safe_free( params->begin_dims ); + vsi_nn_safe_free( params->end_dims ); + vsi_nn_safe_free( params->stride_dims ); + if (lcl2_data->cp_node) { vxReleaseNode( &lcl2_data->cp_node ); @@ -674,42 +776,74 @@ static vsi_status op_init ) { vsi_status status = VSI_SUCCESS; + vsi_nn_strided_slice_lcl_data2 * lcl2_data = NULL; + strided_slice_param* params = NULL; + + if (vsi_nn_compareVersion(self->graph, 1, 1, 32) == -1) + { + self->nn_param.strided_slice.new_axis_mask = 0; + } self->nn_param.strided_slice.lcl2_data = - (vsi_nn_strided_slice_lcl_data2 *)malloc(sizeof(vsi_nn_strided_slice_lcl_data2)); + (vsi_nn_strided_slice_lcl_data2 *)malloc(sizeof(vsi_nn_strided_slice_lcl_data2)); if (NULL == self->nn_param.strided_slice.lcl2_data) { return VX_ERROR_NO_MEMORY; } - memset( self->nn_param.strided_slice.lcl2_data, 0, 
sizeof(vsi_nn_strided_slice_lcl_data2) ); + lcl2_data = self->nn_param.strided_slice.lcl2_data; - self->nn_param.strided_slice.lcl2_data->begin_dims = + memset( lcl2_data, 0, sizeof(vsi_nn_strided_slice_lcl_data2) ); + + params = &lcl2_data->params; + + lcl2_data->begin_dims = (int32_t *)malloc(sizeof(int32_t) * VSI_NN_MAX_DIM_NUM); - if (NULL == self->nn_param.strided_slice.lcl2_data->begin_dims) + if (NULL == lcl2_data->begin_dims) { return VX_ERROR_NO_MEMORY; } - memset(self->nn_param.strided_slice.lcl2_data->begin_dims, 0, - sizeof(int32_t) * VSI_NN_MAX_DIM_NUM); + memset(lcl2_data->begin_dims, 0, sizeof(int32_t) * VSI_NN_MAX_DIM_NUM); - self->nn_param.strided_slice.lcl2_data->end_dims = + params->begin_dims = (int32_t *)malloc(sizeof(int32_t) * VSI_NN_MAX_DIM_NUM); - if (NULL == self->nn_param.strided_slice.lcl2_data->end_dims) + if (NULL == lcl2_data->begin_dims) { return VX_ERROR_NO_MEMORY; } - memset(self->nn_param.strided_slice.lcl2_data->end_dims, 0, - sizeof(int32_t) * VSI_NN_MAX_DIM_NUM); + memset(params->begin_dims, 0, sizeof(int32_t) * VSI_NN_MAX_DIM_NUM); - self->nn_param.strided_slice.lcl2_data->stride_dims = + lcl2_data->end_dims = (int32_t *)malloc(sizeof(int32_t) * VSI_NN_MAX_DIM_NUM); - if (NULL == self->nn_param.strided_slice.lcl2_data->stride_dims) + if (NULL == lcl2_data->end_dims) { return VX_ERROR_NO_MEMORY; } - memset(self->nn_param.strided_slice.lcl2_data->stride_dims, 0, - sizeof(int32_t) * VSI_NN_MAX_DIM_NUM); + memset(lcl2_data->end_dims, 0, sizeof(int32_t) * VSI_NN_MAX_DIM_NUM); + + params->end_dims = + (int32_t *)malloc(sizeof(int32_t) * VSI_NN_MAX_DIM_NUM); + if (NULL == params->end_dims) + { + return VX_ERROR_NO_MEMORY; + } + memset(params->end_dims, 0, sizeof(int32_t) * VSI_NN_MAX_DIM_NUM); + + lcl2_data->stride_dims = + (int32_t *)malloc(sizeof(int32_t) * VSI_NN_MAX_DIM_NUM); + if (NULL == lcl2_data->stride_dims) + { + return VX_ERROR_NO_MEMORY; + } + memset(lcl2_data->stride_dims, 0, sizeof(int32_t) * VSI_NN_MAX_DIM_NUM); + + params->stride_dims = + (int32_t *)malloc(sizeof(int32_t) * VSI_NN_MAX_DIM_NUM); + if (NULL == params->stride_dims) + { + return VX_ERROR_NO_MEMORY; + } + memset(params->stride_dims, 0, sizeof(int32_t) * VSI_NN_MAX_DIM_NUM); return status; } /* op_init() */ diff --git a/src/tim/vx/internal/src/ops/vsi_nn_op_svdf.c b/src/tim/vx/internal/src/ops/vsi_nn_op_svdf.c index 94fa617..642975e 100644 --- a/src/tim/vx/internal/src/ops/vsi_nn_op_svdf.c +++ b/src/tim/vx/internal/src/ops/vsi_nn_op_svdf.c @@ -163,17 +163,17 @@ static vsi_bool op_check ret = FALSE; } - if(ret) + if (ret) { BEGIN_IO_TYPE_DECL(SVDF, 5, 2) - IO_TYPE(D_F16, D_F16, D_F16, D_F16, D_F16, D_F16, D_F16) - IO_TYPE(D_F16, D_F16, D_F16, D_F16, D_F32, D_F16, D_F16) - IO_TYPE(D_F32, D_F16, D_F16, D_F16, D_F32, D_F32, D_F16) - IO_TYPE(D_F32, D_F32, D_F32, D_F32, D_F32, D_F32, D_F32) - IO_TYPE(D_F16, D_F16, D_F16, D_F16, D_F16, D_F16, D_NONE) - IO_TYPE(D_F16, D_F16, D_F16, D_F16, D_F32, D_F16, D_NONE) - IO_TYPE(D_F32, D_F16, D_F16, D_F16, D_F32, D_F32, D_NONE) - IO_TYPE(D_F32, D_F32, D_F32, D_F32, D_F32, D_F32, D_NONE) + IO_TYPE(D_F16, D_F16, D_F16, D_F16, D_F16, D_F16, D_F16) + IO_TYPE(D_F16, D_F16, D_F16, D_F16, D_F32, D_F16, D_F16) + IO_TYPE(D_F32, D_F16, D_F16, D_F16, D_F32, D_F32, D_F16) + IO_TYPE(D_F32, D_F32, D_F32, D_F32, D_F32, D_F32, D_F32) + IO_TYPE(D_F16, D_F16, D_F16, D_F16, D_NONE, D_F16, D_F16) + IO_TYPE(D_F16, D_F16, D_F16, D_F16, D_NONE, D_F32, D_F16) + IO_TYPE(D_F32, D_F16, D_F16, D_F16, D_NONE, D_F32, D_F32) + IO_TYPE(D_F32, D_F32, D_F32, D_F32, D_NONE, D_F32, 
D_F32) END_IO_TYPE_DECL(SVDF) if(!VALIDATE_OP_IO_TYPES(SVDF, self, inputs, self->input.num, outputs, self->output.num)) { char* desc = generate_op_io_types_desc(inputs, diff --git a/src/tim/vx/internal/src/ops/vsi_nn_op_sync_host.c b/src/tim/vx/internal/src/ops/vsi_nn_op_sync_host.c index cc46ab0..957ecd5 100644 --- a/src/tim/vx/internal/src/ops/vsi_nn_op_sync_host.c +++ b/src/tim/vx/internal/src/ops/vsi_nn_op_sync_host.c @@ -25,7 +25,7 @@ #include #include "vsi_nn_pub.h" -#include "client/vsi_nn_vxkernel.h" +#include "libnnext/vsi_nn_vxkernel.h" #define _ARG_NUM (0) #define _INPUT_NUM (1) diff --git a/src/tim/vx/internal/src/ops/vsi_nn_op_tensor_add_mean_stddev_norm.c b/src/tim/vx/internal/src/ops/vsi_nn_op_tensor_add_mean_stddev_norm.c index 0906eab..78f3508 100644 --- a/src/tim/vx/internal/src/ops/vsi_nn_op_tensor_add_mean_stddev_norm.c +++ b/src/tim/vx/internal/src/ops/vsi_nn_op_tensor_add_mean_stddev_norm.c @@ -34,7 +34,7 @@ #include "vsi_nn_ops.h" #include "vsi_nn_tensor.h" #include "vsi_nn_tensor_util.h" -#include "client/vsi_nn_vxkernel.h" +#include "libnnext/vsi_nn_vxkernel.h" #include "kernel/vsi_nn_kernel.h" #include "utils/vsi_nn_util.h" #include "utils/vsi_nn_constraint_check.h" diff --git a/src/tim/vx/internal/src/ops/vsi_nn_op_tensorstackconcat.c b/src/tim/vx/internal/src/ops/vsi_nn_op_tensorstackconcat.c index 39d32a5..f752f1e 100644 --- a/src/tim/vx/internal/src/ops/vsi_nn_op_tensorstackconcat.c +++ b/src/tim/vx/internal/src/ops/vsi_nn_op_tensorstackconcat.c @@ -34,7 +34,7 @@ #include "vsi_nn_tensor_util.h" #include "vsi_nn_prv.h" #include "vsi_nn_log.h" -#include "client/vsi_nn_vxkernel.h" +#include "libnnext/vsi_nn_vxkernel.h" #include "utils/vsi_nn_constraint_check.h" #define _ARG_NUM (0) diff --git a/src/tim/vx/internal/src/ops/vsi_nn_op_topk.c b/src/tim/vx/internal/src/ops/vsi_nn_op_topk.c index 56d056d..09343e7 100644 --- a/src/tim/vx/internal/src/ops/vsi_nn_op_topk.c +++ b/src/tim/vx/internal/src/ops/vsi_nn_op_topk.c @@ -34,158 +34,10 @@ #include "vsi_nn_ops.h" #include "vsi_nn_tensor.h" #include "vsi_nn_tensor_util.h" -#include "client/vsi_nn_vxkernel.h" +#include "kernel/vsi_nn_kernel.h" -#define _ARG_NUM (1) #define _INPUT_NUM (1) #define _OUTPUT_NUM (2) -#define _IO_NUM (_INPUT_NUM + _OUTPUT_NUM) -#define _PARAM_NUM (_ARG_NUM + _IO_NUM) - -extern vx_kernel_description_t * vx_kernel_TOPK_list[]; - -static void _set_inputs_outputs - ( - vx_reference * params, - vsi_nn_tensor_t ** inputs, - vsi_nn_tensor_t ** outputs - ) -{ - uint32_t i; - uint32_t cnt; - - /* Set inputs */ - cnt = 0; - for( i = 0; i < _INPUT_NUM; i ++, cnt ++ ) - { - params[cnt] = (vx_reference)inputs[i]->t; - } - - /* Set outputs */ - for( i = 0; i < _OUTPUT_NUM; i ++, cnt ++ ) - { - params[cnt] = (vx_reference)outputs[i]->t; - } -} /* _set_inputs_outputs() */ - -static vsi_status _create_params - ( - vsi_nn_node_t * node, - vx_reference * params, - uint32_t num - ) -{ - vsi_status status; - vx_context ctx; - vsi_nn_topk_param * p; - if( 0 == num ) - { - return VSI_SUCCESS; - } - memset( params, 0, sizeof( vx_reference * ) * num ); - p = &(node->nn_param.topk); - ctx = vxGetContext( (vx_reference)node->graph->g ); - /* Init parameters */ - #define _SET_PARAM( i, type, arg ) do{ \ - params[i] = (vx_reference)vxCreateScalar( ctx, type, &p->arg ); \ - status = vxGetStatus( params[i] ); \ - if( VSI_SUCCESS != status ) { \ - goto set_param_error; \ - } \ - } while(0) - _SET_PARAM( 0, VX_TYPE_UINT32, k ); - #undef _SET_PARAM -set_param_error: - - return status; -} /* _create_params */ - -static void 
_release_params - ( - vx_reference * params, - uint32_t num - ) -{ - uint32_t i; - vx_scalar scalar; - for( i = 0; i < num; i ++ ) - { - scalar = (vx_scalar)params[i]; - vxReleaseScalar( &scalar ); - } -} /* _release_params() */ - -static vsi_status cpu_op_compute - ( - vsi_nn_node_t * self, - vsi_nn_tensor_t ** inputs, - vsi_nn_tensor_t ** outputs - ) -{ - vsi_status status = VSI_SUCCESS; - vx_reference params[_PARAM_NUM]; - vx_reference * args; - - args = ¶ms[_IO_NUM]; - - if( NULL == self->n ) - { - return VSI_FAILURE; - } - - /* Set inputs and outputs */ - _set_inputs_outputs( params, inputs, outputs ); - - /* Init parameters. */ - _create_params( self, args, _ARG_NUM ); - - /* Pass parameters to node. */ - status = vsi_nn_ClientNodePassParameters( self->n, params, _PARAM_NUM ); - - _release_params( args, _ARG_NUM ); - - return status; -} - -static vsi_status vx_op_compute - ( - vsi_nn_node_t * self, - vsi_nn_tensor_t ** inputs, - vsi_nn_tensor_t ** outputs - ) -{ - vsi_status status = VSI_SUCCESS; - vx_reference params[_PARAM_NUM]; - vx_reference * args; - - args = ¶ms[_IO_NUM]; - - if( NULL == self->n ) - { - return VSI_FAILURE; - } - - /* Set inputs and outputs */ - _set_inputs_outputs( params, inputs, outputs ); - /*TODO: Add code if need to change your parameter*/ - - /* Init parameters. */ - _create_params( self, args, _ARG_NUM ); - - /* Pass parameters to node. */ - status = vsi_nn_ClientNodePassParameters( self->n, params, _PARAM_NUM ); - - _release_params( args, _ARG_NUM ); - - return status; -} - -static vsi_nn_op_compute_t op_compute_list[] = -{ - cpu_op_compute, - vx_op_compute, - NULL -}; static vsi_status op_compute ( @@ -194,46 +46,20 @@ static vsi_status op_compute vsi_nn_tensor_t ** outputs ) { - vsi_status status; - vsi_nn_kernel_info_t kernel_info; - char *path = NULL; + vsi_status status = VSI_FAILURE; + vsi_nn_kernel_param_t * param = NULL; - memset(&kernel_info, 0x0, sizeof(vsi_nn_kernel_info_t)); - status = VSI_FAILURE; - kernel_info.type = VX_KERNEL_TYPE_CPU; - kernel_info.kernel = vx_kernel_TOPK_list; - kernel_info.resource_num = 1; - kernel_info.resource_name = (char **)malloc(kernel_info.resource_num * sizeof(char *)); - kernel_info.resource_name[0] = "vsi_nn_kernel_topk"; - path = getenv("USER_VX_SOURCE_PATH"); - if(path) - vsi_nn_VxResourceSetPath(path); + param = vsi_nn_kernel_param_create(); + vsi_nn_kernel_param_add_int32( param, "top_k", self->nn_param.topk.k ); - if( kernel_info.type == VX_KERNEL_TYPE_VX) + self->n = (vx_node)vsi_nn_kernel_selector( self->graph, "topk", + inputs, _INPUT_NUM, outputs, _OUTPUT_NUM, param ); + + if( self->n ) { - kernel_info.kernel_index = 1; - kernel_info.init_index = 1; - } - else /*kernel_info.type = VX_KERNEL_TYPE_CPU;*/ - { - kernel_info.kernel_index = 0; - kernel_info.init_index = 0; + status = VSI_SUCCESS; } - self->n = vsi_nn_RegisterClientKernelAndNewNode( - self->graph, &kernel_info); - if (kernel_info.resource_name) - { - free(kernel_info.resource_name); - } - if( NULL == self->n ) - { - return VSI_FAILURE; - } - if (NULL != op_compute_list[kernel_info.init_index]) - { - status = op_compute_list[kernel_info.init_index](self, inputs, outputs); - } return status; } /* op_compute() */ @@ -269,6 +95,14 @@ static vsi_bool op_setup { outputs[0]->attr.size[i] = inputs[0]->attr.size[i]; } + } + + if( VSI_NN_DIM_AUTO == outputs[1]->attr.dim_num ) + { + vsi_nn_topk_param * p; + + p = &(self->nn_param.topk); + outputs[1]->attr.dim_num = inputs[0]->attr.dim_num; outputs[1]->attr.size[0] = p->k; for (i = 1; i < 
inputs[0]->attr.dim_num; i++) @@ -276,6 +110,7 @@ static vsi_bool op_setup outputs[1]->attr.size[i] = inputs[0]->attr.size[i]; } } + return TRUE; } /* op_setup() */ diff --git a/src/tim/vx/internal/src/ops/vsi_nn_op_unidirectional_sequence_rnn.c b/src/tim/vx/internal/src/ops/vsi_nn_op_unidirectional_sequence_rnn.c index febe9e3..311e433 100644 --- a/src/tim/vx/internal/src/ops/vsi_nn_op_unidirectional_sequence_rnn.c +++ b/src/tim/vx/internal/src/ops/vsi_nn_op_unidirectional_sequence_rnn.c @@ -35,7 +35,7 @@ #include "vsi_nn_ops.h" #include "vsi_nn_tensor.h" #include "vsi_nn_tensor_util.h" -#include "client/vsi_nn_vxkernel.h" +#include "libnnext/vsi_nn_vxkernel.h" #include "vsi_nn_internal_node.h" #include "vsi_nn_rnn_helper.h" diff --git a/src/tim/vx/internal/src/ops/vsi_nn_op_unstack.c b/src/tim/vx/internal/src/ops/vsi_nn_op_unstack.c index 5717fe3..499cdd7 100644 --- a/src/tim/vx/internal/src/ops/vsi_nn_op_unstack.c +++ b/src/tim/vx/internal/src/ops/vsi_nn_op_unstack.c @@ -34,7 +34,7 @@ #include "vsi_nn_tensor_util.h" #include "vsi_nn_prv.h" #include "vsi_nn_log.h" -#include "client/vsi_nn_vxkernel.h" +#include "libnnext/vsi_nn_vxkernel.h" #include "vsi_nn_internal_node.h" #define _INPUT_NUM (1) diff --git a/src/tim/vx/internal/src/ops/vsi_nn_op_upsample.c b/src/tim/vx/internal/src/ops/vsi_nn_op_upsample.c index 6dd771f..dbe5ff8 100644 --- a/src/tim/vx/internal/src/ops/vsi_nn_op_upsample.c +++ b/src/tim/vx/internal/src/ops/vsi_nn_op_upsample.c @@ -35,7 +35,7 @@ #include "vsi_nn_prv.h" #include "vsi_nn_log.h" #include "ops/vsi_nn_op_upsample.h" -#include "client/vsi_nn_vxkernel.h" +#include "libnnext/vsi_nn_vxkernel.h" #include "kernel/vsi_nn_kernel_eltwise.h" #include "utils/vsi_nn_constraint_check.h" diff --git a/src/tim/vx/internal/src/utils/vsi_nn_code_generator.c b/src/tim/vx/internal/src/utils/vsi_nn_code_generator.c index 16c1bff..8c07eb6 100644 --- a/src/tim/vx/internal/src/utils/vsi_nn_code_generator.c +++ b/src/tim/vx/internal/src/utils/vsi_nn_code_generator.c @@ -419,6 +419,15 @@ static _op_param_gen_t s_op_gen[] = /* INTERP */ NULL, /* RESIZE_1D */ NULL, /* UPSAMPLESCALE */ NULL, + /* GROUPNORM */ NULL, + /* ROUND */ NULL, + /* CEIL */ NULL, + /* SEQUENCE_MASK */ NULL, + /* REPEAT */ NULL, + /* ERF */ NULL, + /* ONE_HOT */ NULL, + /* NMS */ NULL, + /* GROUPED_CONV1D */ NULL, }; _compiler_assert( _cnt_of_array(s_op_gen) == VSI_NN_OP_NUM, vsi_nn_code_generator_c ); diff --git a/src/tim/vx/internal/src/utils/vsi_nn_constraint_check.c b/src/tim/vx/internal/src/utils/vsi_nn_constraint_check.c index 845a790..36060ea 100644 --- a/src/tim/vx/internal/src/utils/vsi_nn_constraint_check.c +++ b/src/tim/vx/internal/src/utils/vsi_nn_constraint_check.c @@ -158,6 +158,8 @@ vsi_bool validate_op_io_types if(self && self->attr.enable_op_constraint_check) { uint32_t i = 0; + int32_t j = 0; + int32_t reg_tensor_num = op_constraint_reg->reg_input_num + op_constraint_reg->reg_output_num; node_io_signature_t* sig = _get_op_signature(inputs, inputs_num, outputs, outputs_num, op_constraint_reg); @@ -167,7 +169,22 @@ vsi_bool validate_op_io_types for(i = 0; i < op_constraint_reg->io_types_item_count; i++) { const uint8_t* curr = ((const uint8_t*)op_constraint_reg->types) \ + op_constraint_reg->io_types_item_size * i; - if(!memcmp(curr, sig->types, op_constraint_reg->io_types_item_size)) { + vsi_nn_type_e *curr_type = (vsi_nn_type_e *)curr; + + for (j = 0; j < reg_tensor_num; j++) + { + vsi_nn_type_e qnt_type = sig->types[j] >> Q_SHIFT; + vsi_nn_type_e data_type = sig->types[j] & ((1 << Q_SHIFT) - 1); + 
vsi_nn_type_e curr_qnt_type = curr_type[j] >> Q_SHIFT; + vsi_nn_type_e curr_data_type = curr_type[j] & ((1 << Q_SHIFT) - 1); + if ( (qnt_type != (vsi_nn_type_e)VSI_NN_QNT_TYPE_NONE && qnt_type != curr_qnt_type) || + data_type != curr_data_type ) + { + break; + } + } + if (j == reg_tensor_num) + { matched = TRUE; break; } diff --git a/src/tim/vx/internal/src/utils/vsi_nn_dtype.c b/src/tim/vx/internal/src/utils/vsi_nn_dtype.c index 75f686c..e80ef51 100644 --- a/src/tim/vx/internal/src/utils/vsi_nn_dtype.c +++ b/src/tim/vx/internal/src/utils/vsi_nn_dtype.c @@ -399,6 +399,9 @@ vsi_bool vsi_nn_dtype_convert_quantize_asymm_to_float case U8: return vsi_nn_dtype_convert_quantize_asymm8_to_float( (const uint8_t *)buffer, size, scale, zero_point, out_buffer ); + case I32: + return vsi_nn_dtype_convert_quantize_symm32_to_float( + (const int *)buffer, size, scale, zero_point, out_buffer ); default: VSILOGE("Don't support convert asymm quant %d to float.", dtype); break; diff --git a/src/tim/vx/internal/src/vsi_nn_graph.c b/src/tim/vx/internal/src/vsi_nn_graph.c index a49d8f8..c94a1ca 100644 --- a/src/tim/vx/internal/src/vsi_nn_graph.c +++ b/src/tim/vx/internal/src/vsi_nn_graph.c @@ -41,7 +41,7 @@ #include "utils/vsi_nn_map.h" #include "vsi_nn_graph_optimization.h" -static vsi_status _set_reference_name +static vsi_status _set_reference_node_name ( vsi_nn_graph_t *graph, vsi_nn_node_t *node @@ -49,10 +49,7 @@ static vsi_status _set_reference_name { #define _NODE_ID_LEN 64 vsi_status status; - vsi_nn_tensor_t *tensor; - uint32_t i; char name[_NODE_ID_LEN]; - if(NULL == node || NULL == graph) { return VSI_FAILURE; @@ -66,6 +63,28 @@ static vsi_status _set_reference_name status = vxSetReferenceName((vx_reference)node->n, name); } TEST_CHECK_STATUS(status, final); + +final: + return status; +} /* _set_reference_node_name() */ + +static vsi_status _set_reference_tensor_name + ( + vsi_nn_graph_t *graph, + vsi_nn_node_t *node + ) +{ +#define _NODE_ID_LEN 64 + vsi_status status; + vsi_nn_tensor_t *tensor; + uint32_t i; + char name[_NODE_ID_LEN]; + if(NULL == node || NULL == graph) + { + return VSI_FAILURE; + } + + status = VSI_SUCCESS; for(i = 0; i < node->output.num; i++) { memset(name, 0, sizeof(char) * _NODE_ID_LEN); @@ -80,7 +99,7 @@ static vsi_status _set_reference_name final: return status; -} /* _set_reference_name() */ +} /* _set_reference_tensor_name() */ static vsi_status _check_swapped_tensors ( @@ -345,6 +364,12 @@ static vsi_status compute_node continue; vsi_nn_TensorReinit( graph, outputs[j] ); } + status = _set_reference_tensor_name(graph, node); + if( VSI_SUCCESS != status ) + { + VSILOGW("Set reference node[%d] %s output tensor name fail", + node_id, vsi_nn_OpGetName(node->op)); + } /* Create vx node */ VSILOGD("Instance node[%d] \"%s\" ...", node_id, vsi_nn_OpGetName(node->op)); @@ -354,7 +379,7 @@ static vsi_status compute_node VSILOGE( "Create node[%d] %s fail", node_id, vsi_nn_OpGetName(node->op)); break; } - status = _set_reference_name(graph, node); + status = _set_reference_node_name(graph, node); if( VSI_SUCCESS != status ) { VSILOGW("Set reference name fail"); @@ -465,6 +490,65 @@ final: return status; } /* setup_node() */ +static vsi_status set_graph_precision + ( + vsi_nn_graph_t * graph, + vsi_nn_node_id_t *node_list + ) +{ + uint32_t i, j; + vsi_status status; + vsi_nn_tensor_t **inputs; + vsi_nn_tensor_t **outputs; + vsi_nn_node_id_t node_id; + vsi_nn_node_t *node; + + status = VSI_SUCCESS; + inputs = allocate_io_buffer(graph); + outputs = allocate_io_buffer(graph); + if(NULL == 
inputs || NULL == outputs) + { + VSILOGE("allocate io buffer fail"); + status = VSI_FAILURE; + goto final; + } + + if(vsi_nn_IsGraphFastMode(graph)) + { + goto final; + } + for( i = 0; i < graph->node_num; i++ ) + { + node_id = node_list[i]; + memset( inputs, 0, graph->max_node_io * sizeof( vsi_nn_tensor_t * ) ); + memset( outputs, 0, graph->max_node_io * sizeof( vsi_nn_tensor_t * ) ); + /* Get inputs, outputs. */ + node = vsi_nn_GetNode( graph, node_id ); + vsi_nn_GetTensors( graph, node->input.tensors, + node->input.num, inputs ); + vsi_nn_GetTensors( graph, node->output.tensors, + node->output.num, outputs ); + + for(j = 0; j < node->input.num; j++) + { + if(inputs[j] != NULL && inputs[j]->attr.dtype.vx_type == VSI_NN_TYPE_FLOAT32) + { + vsi_nn_SetTensorAttr(inputs[j], VSI_NN_TENSOR_ATTR_HIGH_PRECISION); + } + } + for(j = 0; j < node->output.num; j++) + { + if(outputs[j] != NULL && outputs[j]->attr.dtype.vx_type == VSI_NN_TYPE_FLOAT32) + { + vsi_nn_SetTensorAttr(outputs[j], VSI_NN_TENSOR_ATTR_HIGH_PRECISION); + } + } + } +final: + free_io_buffer(inputs); + free_io_buffer(outputs); + return status; +} vsi_nn_graph_t * vsi_nn_CreateGraph ( vsi_nn_context_t ctx, @@ -507,6 +591,7 @@ vsi_nn_graph_t * vsi_nn_CreateGraph graph->rnn_wksp = NULL; graph->node_table = (vsi_nn_map_t *)malloc( sizeof( vsi_nn_map_t ) ); graph->tensor_table = (vsi_nn_map_t *)malloc( sizeof( vsi_nn_map_t ) ); + graph->isAllowFastMode = TRUE; vsi_nn_MapInit( graph->node_table ); vsi_nn_MapInit( graph->tensor_table ); } @@ -532,6 +617,18 @@ void vsi_nn_ReleaseGraph ptr = *graph; if( NULL != graph && NULL != * graph ) { + if( NULL != ptr->nodes ) + { + for( i = 0; i < ptr->node_num; i++ ) + { + vsi_nn_RemoveNode( *graph, (vsi_nn_node_id_t)i ); + } + free( (*graph)->node_table ); + } + if( NULL != ptr->g ) + { + vxReleaseGraph( &ptr->g ); + } if( NULL != ptr->tensors ) { for( i = 0; i < ptr->tensor_num; i++ ) @@ -545,14 +642,6 @@ void vsi_nn_ReleaseGraph { vsi_nn_ReleaseTensor( &ptr->complete_signal.tensor ); } - if( NULL != ptr->nodes ) - { - for( i = 0; i < ptr->node_num; i++ ) - { - vsi_nn_RemoveNode( *graph, (vsi_nn_node_id_t)i ); - } - free( (*graph)->node_table ); - } if( NULL != ptr->input.tensors ) { free( ptr->input.tensors ); @@ -565,10 +654,6 @@ void vsi_nn_ReleaseGraph { vsi_nn_rnn_DeinitWksp( ptr ); } - if( NULL != ptr->g ) - { - vxReleaseGraph( &ptr->g ); - } free( ptr ); *graph = NULL; } @@ -661,6 +746,12 @@ vsi_status vsi_nn_SetupGraph goto final; } + /* Set all of tensor attribute in graph to high precision */ + status = set_graph_precision(graph, nodes_list); + if(VSI_SUCCESS != status) + { + goto final; + } /* Try setup graph complete signal node. */ status = vsi_nn_TrySetupCompleteSignalNode( graph ); TEST_CHECK_STATUS( status, final ); @@ -1369,7 +1460,7 @@ void vsi_nn_DumpGraphNodeOutputsEx #define _SHAPE_BUF_SZ (64) char shape[_SHAPE_BUF_SZ] = { 0 }; char filename[_MAX_TENSOR_NAME_SZ] = { 0 }; - char filename_prefix[_SHAPE_BUF_SZ] = { 0 }; + char filename_prefix[_SHAPE_BUF_SZ + 1] = { 0 }; const char * op_name; uint32_t i; uint32_t o; @@ -1998,3 +2089,29 @@ vsi_status vsi_nn_SetGraphPriority #endif return status; } + +vsi_status vsi_nn_SetGraphFastMode + ( + vsi_nn_graph_t* graph, + vsi_bool fastmode + ) +{ + vsi_status status = VSI_SUCCESS; + if(graph) + { + graph->isAllowFastMode = fastmode; + } + else + { + status = VSI_FAILURE; + } + return status; +} + +vsi_bool vsi_nn_IsGraphFastMode + ( + const vsi_nn_graph_t* graph + ) +{ + return NULL == graph ? 
FALSE : graph->isAllowFastMode; +} diff --git a/src/tim/vx/internal/src/vsi_nn_graph_optimization.c b/src/tim/vx/internal/src/vsi_nn_graph_optimization.c index 1cde801..4cdbd82 100644 --- a/src/tim/vx/internal/src/vsi_nn_graph_optimization.c +++ b/src/tim/vx/internal/src/vsi_nn_graph_optimization.c @@ -807,10 +807,25 @@ vsi_status vsi_nn_OptimizeGraph vsi_bool *dirty ) { - vsi_status status = VSI_FAILURE; + vsi_status status = VSI_SUCCESS; + uint32_t i = 0; + vsi_bool nbg_flag = FALSE; + vsi_nn_node_t* node = NULL; + for(i = 0; i < graph->node_num; i++) + { + node = vsi_nn_GetNode(graph, i); + if(node->op == VSI_NN_OP_NBG) + { + nbg_flag = TRUE; + break; + } + } - status = _graph_optimization_convert_int8_to_uint8(graph, dirty); - TEST_CHECK_STATUS(status, final); + if (!nbg_flag) + { + status = _graph_optimization_convert_int8_to_uint8(graph, dirty); + TEST_CHECK_STATUS(status, final); + } final: return status; diff --git a/src/tim/vx/internal/src/vsi_nn_internal_node.c b/src/tim/vx/internal/src/vsi_nn_internal_node.c index 2ad5bc0..9c4485e 100644 --- a/src/tim/vx/internal/src/vsi_nn_internal_node.c +++ b/src/tim/vx/internal/src/vsi_nn_internal_node.c @@ -429,7 +429,8 @@ void vsi_nn_internal_init_tensor_attr if( dtype->qnt_type == VSI_NN_QNT_TYPE_NONE && ( dtype->vx_type != VSI_NN_TYPE_FLOAT16 && - dtype->vx_type != VSI_NN_TYPE_FLOAT32 ) ) + dtype->vx_type != VSI_NN_TYPE_FLOAT32 && + dtype->vx_type != VSI_NN_TYPE_BFLOAT16 ) ) { attr->dtype.qnt_type = VSI_NN_QNT_TYPE_NONE; attr->dtype.vx_type = VSI_NN_TYPE_FLOAT16; diff --git a/src/tim/vx/internal/src/vsi_nn_node.c b/src/tim/vx/internal/src/vsi_nn_node.c index 0c05bb2..98f05a7 100644 --- a/src/tim/vx/internal/src/vsi_nn_node.c +++ b/src/tim/vx/internal/src/vsi_nn_node.c @@ -166,7 +166,7 @@ void vsi_nn_PrintNode vsi_nn_node_id_t id ) { -#define _MAX_PRINT_BUF_SZ (256) +#define _MAX_PRINT_BUF_SZ (1024) uint32_t i; int count; char buf[_MAX_PRINT_BUF_SZ]; diff --git a/src/tim/vx/internal/src/vsi_nn_node_attr_template.c b/src/tim/vx/internal/src/vsi_nn_node_attr_template.c index 5c5ffd6..6743605 100644 --- a/src/tim/vx/internal/src/vsi_nn_node_attr_template.c +++ b/src/tim/vx/internal/src/vsi_nn_node_attr_template.c @@ -186,6 +186,9 @@ static _node_template s_template[] = /* PRE_PROCESS_NV12 */ NULL, /* SCATTER_ND */ NULL, /* DECONVOLUTION1D */ NULL, + /* GROUPNORM */ NULL, + /* SEQUENCE_MASK */ NULL, + /* REPEAT */ NULL, }; //_compiler_assert( _cnt_of_array(s_template) == VSI_NN_OP_NUM, vsi_nn_node_attr_template_c ); diff --git a/src/tim/vx/internal/src/vsi_nn_pre_post_process.c b/src/tim/vx/internal/src/vsi_nn_pre_post_process.c index c7c49a6..a97bd7f 100644 --- a/src/tim/vx/internal/src/vsi_nn_pre_post_process.c +++ b/src/tim/vx/internal/src/vsi_nn_pre_post_process.c @@ -508,6 +508,7 @@ vsi_status vsi_nn_add_single_postproc_node ) { vsi_nn_node_t* node; + vsi_nn_node_t** consume_nodes = NULL; vsi_nn_process_permute_t* permute = NULL; vsi_nn_tensor_t* org_norm_tensor = NULL; vsi_nn_tensor_attr_t input_attr; @@ -515,8 +516,10 @@ vsi_status vsi_nn_add_single_postproc_node vsi_nn_tensor_id_t postproc_input; vsi_nn_tensor_id_t postproc_output; vsi_nn_postprocess_dtype_convert_t* dtype_convert = NULL; - int32_t i = 0; + uint32_t i = 0; + uint32_t j = 0; int32_t idx = 0; + uint32_t nodes_count = 0; vsi_status status = VSI_SUCCESS; org_norm_tensor = vsi_nn_GetTensor(graph, graph->output.tensors[output_idx]); @@ -561,10 +564,29 @@ vsi_status vsi_nn_add_single_postproc_node postproc_input = vsi_nn_AddTensor(graph, VSI_NN_TENSOR_ID_AUTO, &input_attr, 
NULL); postproc_output = vsi_nn_AddTensor(graph, VSI_NN_TENSOR_ID_AUTO, &output_attr, NULL); + /* Get origin norm tensor comsume nodes and connect its' comsume nodes */ + vsi_nn_get_tensor_consumers(graph, graph->output.tensors[output_idx], NULL, &nodes_count); + if(nodes_count != 0) + { + consume_nodes = (vsi_nn_node_t**)malloc(sizeof(vsi_nn_node_t*)*nodes_count); + vsi_nn_get_tensor_consumers(graph, graph->output.tensors[output_idx], consume_nodes, NULL); + for(i = 0; i < nodes_count; i++) + { + for(j = 0; j < consume_nodes[i]->input.num; j++) + { + if(consume_nodes[i]->input.tensors[j] == graph->output.tensors[output_idx]) + { + consume_nodes[i]->input.tensors[j] = postproc_input; + break; + } + } + } + } + /* Reconnect node tensors */ node->input.tensors[0] = postproc_input; node->output.tensors[0] = postproc_output; - for(i = 0; i < (int32_t)last_node->output.num; i++) + for(i = 0; i < last_node->output.num; i++) { if(last_node->output.tensors[i] == graph->output.tensors[output_idx]) { @@ -574,7 +596,13 @@ vsi_status vsi_nn_add_single_postproc_node } graph->output.tensors[output_idx] = postproc_output; + final: + if(consume_nodes) + { + free(consume_nodes); + consume_nodes = NULL; + } return status; } /* vsi_nn_add_single_postproc_node() */ diff --git a/src/tim/vx/internal/src/vsi_nn_tensor.c b/src/tim/vx/internal/src/vsi_nn_tensor.c index 0af8be5..a7bd3c7 100644 --- a/src/tim/vx/internal/src/vsi_nn_tensor.c +++ b/src/tim/vx/internal/src/vsi_nn_tensor.c @@ -575,20 +575,25 @@ vsi_nn_tensor_t * vsi_nn_CreateTensorWithDefault data = (uint8_t *)malloc( size ); if( data ) { - uint32_t i = 0; + uint32_t i = 0, j = 0; uint32_t elements = size / stride[0]; - vsi_status status = VSI_SUCCESS; + vsi_status status = VSI_FAILURE; - for( i = 0; i < elements; i ++ ) + status = vsi_nn_Float32ToDtype( defualt_value, &data[0], &t->attr.dtype ); + if(stride[0] == 1) { - status = vsi_nn_Float32ToDtype( defualt_value, &data[stride[0] * i], &t->attr.dtype ); - if( VSI_FAILURE == status ) + memset(data, data[0], size); + } + else + { + for( i = 1; i < elements; i ++ ) { - VSILOGE("Convert default_value to dtype fail"); - break; + for(j=0;jattr.dtype ); - for( i = 0; i < elements; i ++ ) + if(stride[0] == 1) { - status = vsi_nn_Float32ToDtype( value, &data[stride[0] * i], &tensor->attr.dtype ); - if( VSI_FAILURE == status ) + memset(data, data[0], size); + } + else + { + for( i = 1; i < elements; i ++ ) { - VSILOGE("Convert value to dtype fail"); - break; + for(j=0;jattr.dtype.vx_type ); for( i = 0; i < tensor->attr.dim_num; i ++ ) {
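/*
 * Editor's note (illustrative sketch, not part of the patch): the new
 * SEQUENCE_MASK op registered above computes out[..., k] = (k < length[...]),
 * which is why its op_setup() places max_len on the innermost axis (size[0])
 * and shifts the input dimensions up by one. A freestanding reference
 * version, assuming int32 lengths and a uint8 mask output (names here are
 * hypothetical, not ovxlib API):
 */
#include <stdint.h>
#include <stdio.h>

static void sequence_mask_ref(const int32_t *lengths, size_t count,
                              int32_t max_len, uint8_t *out /* count * max_len */)
{
    size_t i;
    int32_t k;
    for (i = 0; i < count; i++)
    {
        for (k = 0; k < max_len; k++)
        {
            /* innermost (fastest-varying) axis has size max_len */
            out[i * max_len + k] = (k < lengths[i]) ? 1 : 0;
        }
    }
}

int main(void)
{
    int32_t lengths[3] = {1, 3, 2};
    uint8_t mask[3 * 4];
    size_t i;
    sequence_mask_ref(lengths, 3, 4, mask);
    for (i = 0; i < sizeof(mask); i++)
    {
        printf("%d%c", mask[i], ((i + 1) % 4) ? ' ' : '\n');
    }
    /* prints: 1 0 0 0 / 1 1 1 0 / 1 1 0 0 */
    return 0;
}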
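/*
 * Editor's note (illustrative sketch, not part of the patch): the
 * _build_strided_slice_params()/op_setup() changes above expand the slice
 * spec so that new_axis_mask inserts size-1 output dims while
 * shrink_axis_mask drops dims. A simplified, shape-only model of that
 * bookkeeping; it assumes positive strides and already-clamped begin/end,
 * and omits the begin_mask/end_mask handling the real code performs:
 */
#include <stdio.h>

static int strided_slice_out_shape(const int *begin, const int *end,
                                   const int *stride, int in_dims,
                                   int new_axis_mask, int shrink_axis_mask,
                                   int *out_shape)
{
    int out_dims = 0;
    int spec = 0;   /* index into begin/end/stride (one entry per input dim) */
    int i;
    int total = in_dims;

    for (i = 0; i < in_dims; i++)
    {
        if (new_axis_mask & (1 << i)) total++;   /* one extra output dim each */
    }

    for (i = 0; i < total; i++)
    {
        if (new_axis_mask & (1 << i))
        {
            out_shape[out_dims++] = 1;           /* inserted axis */
        }
        else if (shrink_axis_mask & (1 << spec))
        {
            spec++;                              /* axis removed from output */
        }
        else
        {
            /* ceil((end - begin) / stride) for a positive stride */
            out_shape[out_dims++] =
                (end[spec] - begin[spec] + stride[spec] - 1) / stride[spec];
            spec++;
        }
    }
    return out_dims;
}

int main(void)
{
    /* slice a 4x5 input with stride 2 on dim 0 and a new leading axis */
    int begin[]  = {0, 0};
    int end[]    = {4, 5};
    int stride[] = {2, 1};
    int out_shape[8];
    int i;
    int n = strided_slice_out_shape(begin, end, stride, 2,
                                    0x1 /* new_axis_mask */, 0, out_shape);
    for (i = 0; i < n; i++) printf("%d ", out_shape[i]);   /* 1 2 5 */
    printf("\n");
    return 0;
}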
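/*
 * Editor's note (illustrative sketch, not part of the patch): the
 * validate_op_io_types() change above replaces the byte-exact memcmp of the
 * registered IO signature with a per-slot comparison: data types must match
 * exactly, and the quantization type is only enforced when the actual tensor
 * carries one. A minimal model with hypothetical encodings (the real Q_SHIFT
 * value and the type enums live in the ovxlib headers):
 */
#include <stdbool.h>
#include <stdio.h>

#define Q_SHIFT   8                       /* hypothetical packing layout */
#define QNT_NONE  0
#define PACK(q, d) (((unsigned)(q) << Q_SHIFT) | (unsigned)(d))

static bool slot_matches(unsigned actual, unsigned registered)
{
    unsigned a_q = actual >> Q_SHIFT;
    unsigned a_d = actual & ((1u << Q_SHIFT) - 1);
    unsigned r_q = registered >> Q_SHIFT;
    unsigned r_d = registered & ((1u << Q_SHIFT) - 1);

    if (a_d != r_d) return false;                    /* data type must agree  */
    if (a_q != QNT_NONE && a_q != r_q) return false; /* quant checked if set  */
    return true;
}

int main(void)
{
    enum { D_U8 = 1, D_I32 = 2, QNT_ASYM = 3 };      /* hypothetical values */
    /* an unquantized U8 tensor now satisfies a D_U8|Q_ASYM table entry */
    printf("%d\n", slot_matches(PACK(QNT_NONE, D_U8), PACK(QNT_ASYM, D_U8)));
    /* but a differing data type still fails */
    printf("%d\n", slot_matches(PACK(QNT_NONE, D_I32), PACK(QNT_ASYM, D_U8)));
    return 0;
}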
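/*
 * Editor's note (illustrative sketch, not part of the patch): the tensor
 * fill changes above convert the float default value to the tensor dtype
 * once and then replicate the encoded bytes, taking a memset() fast path for
 * 1-byte elements. The byte replication in isolation (the real code encodes
 * each element via vsi_nn_Float32ToDtype first):
 */
#include <stdint.h>
#include <stdio.h>
#include <string.h>

static void fill_with_element(uint8_t *buf, size_t total_bytes,
                              const uint8_t *elem, size_t elem_bytes)
{
    size_t i;
    if (elem_bytes == 1)
    {
        memset(buf, elem[0], total_bytes);    /* fast path for 8-bit dtypes */
        return;
    }
    for (i = 0; i + elem_bytes <= total_bytes; i += elem_bytes)
    {
        memcpy(buf + i, elem, elem_bytes);    /* copy the encoded element   */
    }
}

int main(void)
{
    uint8_t buf[4 * sizeof(float)];
    float one = 1.0f;
    float v;
    size_t i;
    fill_with_element(buf, sizeof(buf), (const uint8_t *)&one, sizeof(one));
    for (i = 0; i < 4; i++)
    {
        memcpy(&v, buf + i * sizeof(float), sizeof(v));
        printf("%.1f ", v);                   /* 1.0 1.0 1.0 1.0 */
    }
    printf("\n");
    return 0;
}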