Pass vecadd.

2024-01-31 23:54:44 +08:00 · 2024-01-31 23:54:44 +08:00 · 09bc1df7ae
parent fd56811650
commit 09bc1df7ae
12 changed files with 53131 additions and 48 deletions
--- a/.gitignore
+++ b/.gitignore
@ -45,3 +45,6 @@ CMakeCache.txt
 # OS generated files
 .DS_Store
 .DS_Store?
+
+build
+.vscode
--- a/README.md
+++ b/README.md
@ -1,5 +1,15 @@
 # CuPBoP: Cuda for Parallelized and Broad-range Processors

+## Install
+
+```shell
+# Prerequisite: install CUDA 11.5 (manual step, not a shell command)
+git submodule update --init --recursive
+sudo apt install llvm
+sudo apt install clang
+sudo apt install libstdc++-12-dev
+```
+
 ## Introduction

 CuPBoP is a framework which supports executing unmodified CUDA source code
--- a/examples/vecadd/host.s
+++ b/examples/vecadd/host.s
--- a/examples/vecadd/kernel.s
+++ b/examples/vecadd/kernel.s
@ -0,0 +1,102 @@
+	.text
+	.file	"vecadd.cu"
+	.globl	_Z6vecAddPdS_S_i                # -- Begin function _Z6vecAddPdS_S_i
+	.p2align	4, 0x90
+	.type	_Z6vecAddPdS_S_i,@function
+_Z6vecAddPdS_S_i:                       # @_Z6vecAddPdS_S_i -- vecAdd(double*, double*, double*, int); args arrive in rdi, rsi, rdx, ecx
+.L_Z6vecAddPdS_S_i$local:               # local alias; the wrapper below tail-jumps to it
+# %bb.0:                                # %_after_block_sync_1
+	pushq	%rbp                            # set up a frame pointer; the spill slot below is addressed off %rbp
+	movq	%rsp, %rbp
+	pushq	%r15                            # save the callee-saved registers this function uses
+	pushq	%r14
+	pushq	%r13
+	pushq	%r12
+	pushq	%rbx
+	pushq	%rax                            # pushed value is never read back: reserves the 8-byte slot at -48(%rbp) and keeps %rsp 16-byte aligned at the calls
+	movl	%ecx, %ebx                      # ebx = 4th argument (int), the upper bound for the element index
+	movq	%rdx, %r14                      # r14 = 3rd argument, the array that receives the sums
+	movq	%rsi, %r15                      # r15 = 2nd argument, second addend array
+	movq	%rdi, %r12                      # r12 = 1st argument, first addend array
+	data16                                  # data16/rex64 are padding prefixes of the general-dynamic TLS call sequence
+	leaq	block_size@TLSGD(%rip), %rdi
+	data16
+	data16
+	rex64
+	callq	__tls_get_addr@PLT              # rax = address of the thread-local block_size
+	movl	(%rax), %r13d                   # r13d = block_size, the loop trip count
+	testl	%r13d, %r13d
+	je	.LBB0_5                         # block_size == 0: nothing to do
+# %bb.1:                                # %_after_block_sync_0.lr.ph
+	data16
+	leaq	block_index_x@TLSGD(%rip), %rdi
+	data16
+	data16
+	rex64
+	callq	__tls_get_addr@PLT              # rax = address of the thread-local block_index_x
+	movq	%rax, -48(%rbp)                 # 8-byte Spill: &block_index_x must survive the next call, which overwrites %rax
+	data16
+	leaq	block_size_x@TLSGD(%rip), %rdi
+	data16
+	data16
+	rex64
+	callq	__tls_get_addr@PLT              # rax = address of the thread-local block_size_x
+	movl	(%rax), %esi                    # esi = block_size_x
+	movq	-48(%rbp), %rax                 # 8-byte Reload
+	movl	(%rax), %edi                    # edi = block_index_x
+	imull	%esi, %edi                      # edi = block_index_x * block_size_x, unchanged for the whole loop
+	xorl	%ecx, %ecx                      # ecx = 0, loop counter
+	jmp	.LBB0_2
+	.p2align	4, 0x90
+.LBB0_4:                                # %intra_warp_inc
+                                        #   in Loop: Header=BB0_2 Depth=1
+	incl	%ecx
+	cmpl	%r13d, %ecx
+	jae	.LBB0_5                         # unsigned compare: leave once ecx >= block_size
+.LBB0_2:                                # %_after_block_sync_0
+                                        # =>This Inner Loop Header: Depth=1
+	movl	%ecx, %eax
+	cltd                                    # sign-extend eax into edx:eax, as idivl requires
+	idivl	%esi                            # edx = ecx % block_size_x; NOTE(review): faults if block_size_x is 0 while block_size is not -- confirm the runtime rules that out
+	addl	%edi, %edx                      # edx = block_index_x * block_size_x + remainder = element index
+	cmpl	%ebx, %edx
+	jge	.LBB0_4                         # signed compare: skip this element when index >= the int argument
+# %bb.3:                                #   in Loop: Header=BB0_2 Depth=1
+	movslq	%edx, %rax                      # sign-extend the index to 64 bits for addressing
+	movsd	(%r12,%rax,8), %xmm0            # xmm0 = mem[0],zero
+	addsd	(%r15,%rax,8), %xmm0            # add the matching element of the second array (8-byte elements)
+	movsd	%xmm0, (%r14,%rax,8)            # store the sum at the same index of the third array
+	jmp	.LBB0_4
+.LBB0_5:                                # %_after_block_sync_2
+	addq	$8, %rsp                        # drop the slot reserved by pushq %rax
+	popq	%rbx                            # restore callee-saved registers in reverse push order
+	popq	%r12
+	popq	%r13
+	popq	%r14
+	popq	%r15
+	popq	%rbp
+	retq
+.Lfunc_end0:
+	.size	_Z6vecAddPdS_S_i, .Lfunc_end0-_Z6vecAddPdS_S_i
+                                        # -- End function
+	.globl	vecAddPdS_S_i_wrapper           # -- Begin function vecAddPdS_S_i_wrapper
+	.p2align	4, 0x90
+	.type	vecAddPdS_S_i_wrapper,@function
+vecAddPdS_S_i_wrapper:                  # @vecAddPdS_S_i_wrapper -- rdi points to four pointers, one per kernel argument
+# %bb.0:
+	movq	(%rdi), %rax                    # rax = pointer to argument 0
+	movq	8(%rdi), %rcx                   # rcx = pointer to argument 1
+	movq	(%rax), %rax                    # rax = argument 0, parked here while rdi is still needed
+	movq	(%rcx), %rsi                    # rsi = argument 1
+	movq	16(%rdi), %rcx
+	movq	(%rcx), %rdx                    # rdx = argument 2
+	movq	24(%rdi), %rcx
+	movl	(%rcx), %ecx                    # ecx = argument 3, read as a 32-bit value
+	movq	%rax, %rdi                      # rdi = argument 0; the pointer array is no longer needed
+	jmp	.L_Z6vecAddPdS_S_i$local        # TAILCALL
+.Lfunc_end1:
+	.size	vecAddPdS_S_i_wrapper, .Lfunc_end1-vecAddPdS_S_i_wrapper
+                                        # -- End function
+	.ident	"Ubuntu clang version 14.0.0-1ubuntu1.1"
+	.ident	"clang version 3.8.0 (tags/RELEASE_380/final)"
+	.section	".note.GNU-stack","",@progbits
--- a/examples/vecadd/run.sh
+++ b/examples/vecadd/run.sh
@ -0,0 +1,32 @@
+cd "$(dirname "$0")" || exit 1  # work from this script's directory, wherever it is launched from
+export CuPBoP_PATH="$(pwd)/../../"  # repository root; computed after the cd so it no longer depends on the caller's cwd
+export LD_LIBRARY_PATH=../../build/runtime:../../build/runtime/threadPool:$LD_LIBRARY_PATH
+export CUDA_PATH="${CUDA_PATH:-/usr/local/cuda-11.7}"  # honour a CUDA_PATH already exported by the caller
+
+# Compile CUDA source code (both host and kernel) to bitcode files
+clang++ -std=c++11 vecadd.cu \
+      -I../.. --cuda-path="$CUDA_PATH" \
+      --cuda-gpu-arch=sm_50 -L"$CUDA_PATH/lib64" \
+      -lcudart_static -ldl -lrt -pthread -save-temps -v  || true  # NOTE(review): a failing clang++ is ignored on purpose here -- confirm
+# Apply compilation transformations on the kernel bitcode file
+../../build/compilation/kernelTranslator \
+      vecadd-cuda-nvptx64-nvidia-cuda-sm_50.bc kernel.bc
+# Apply compilation transformations on the host bitcode file
+../../build/compilation/hostTranslator \
+      vecadd-host-x86_64-pc-linux-gnu.bc host.bc
+# Generate object files
+llc --relocation-model=pic --filetype=obj  kernel.bc
+llc --relocation-model=pic --filetype=obj  host.bc
+# Also emit assembly listings (kernel.s, host.s) for inspection
+llc kernel.bc --relocation-model=pic --filetype=asm -o kernel.s
+llc host.bc --relocation-model=pic --filetype=asm -o host.s
+
+# Link with runtime libraries and generate the executable file
+gcc -o vecadd -fPIC -no-pie \
+      -I../../runtime/threadPool/include \
+      -L../../build/runtime  \
+      -L../../build/runtime/threadPool \
+      host.o kernel.o \
+      -I../.. -lc -lm -lCPUruntime -lthreadPool -lpthread
+# Execute
+./vecadd
--- a/examples/vecadd/vecadd
+++ b/examples/vecadd/vecadd
--- a/examples/vecadd/vecadd-cuda-nvptx64-nvidia-cuda-sm_50.cui
+++ b/examples/vecadd/vecadd-cuda-nvptx64-nvidia-cuda-sm_50.cui
--- a/examples/vecadd/vecadd-cuda-nvptx64-nvidia-cuda-sm_50.s
+++ b/examples/vecadd/vecadd-cuda-nvptx64-nvidia-cuda-sm_50.s
@ -0,0 +1,73 @@
+//
+// Generated by LLVM NVPTX Back-End
+//
+
+.version 7.5
+.target sm_50
+.address_size 64
+
+	// .globl	_Z6vecAddPdS_S_i
+.global .align 1 .b8 blockIdx[1];
+.global .align 1 .b8 blockDim[1];
+.global .align 1 .b8 threadIdx[1];
+
+.visible .entry _Z6vecAddPdS_S_i(       // kernel entry: three 64-bit pointer parameters and one 32-bit integer
+	.param .u64 _Z6vecAddPdS_S_i_param_0,   // base of the first addend array
+	.param .u64 _Z6vecAddPdS_S_i_param_1,   // base of the second addend array
+	.param .u64 _Z6vecAddPdS_S_i_param_2,   // base of the array that receives the sums
+	.param .u32 _Z6vecAddPdS_S_i_param_3    // upper bound for the element index
+)
+{
+	.local .align 8 .b8 	__local_depot0[32];   // 32-byte local frame: three pointers, the bound, and the index
+	.reg .b64 	%SP;
+	.reg .b64 	%SPL;
+	.reg .pred 	%p<2>;
+	.reg .b32 	%r<9>;
+	.reg .b64 	%rd<18>;
+	.reg .f64 	%fd<4>;
+
+	mov.u64 	%SPL, __local_depot0;           // SPL = local-space address of the frame
+	cvta.local.u64 	%SP, %SPL;                  // SP = generic address of the same frame
+	ld.param.u32 	%r1, [_Z6vecAddPdS_S_i_param_3];
+	ld.param.u64 	%rd3, [_Z6vecAddPdS_S_i_param_2];
+	ld.param.u64 	%rd2, [_Z6vecAddPdS_S_i_param_1];
+	ld.param.u64 	%rd1, [_Z6vecAddPdS_S_i_param_0];
+	cvta.to.global.u64 	%rd4, %rd3;             // each pointer is converted to a global address and back to generic
+	cvta.global.u64 	%rd5, %rd4;
+	cvta.to.global.u64 	%rd6, %rd2;
+	cvta.global.u64 	%rd7, %rd6;
+	cvta.to.global.u64 	%rd8, %rd1;
+	cvta.global.u64 	%rd9, %rd8;
+	st.u64 	[%SP+0], %rd9;                      // frame+0  = param_0 pointer
+	st.u64 	[%SP+8], %rd7;                      // frame+8  = param_1 pointer
+	st.u64 	[%SP+16], %rd5;                     // frame+16 = param_2 pointer
+	st.u32 	[%SP+24], %r1;                      // frame+24 = param_3 bound
+	mov.u32 	%r2, %ctaid.x;
+	mov.u32 	%r3, %ntid.x;
+	mul.lo.s32 	%r4, %r2, %r3;
+	mov.u32 	%r5, %tid.x;
+	add.s32 	%r6, %r4, %r5;                  // r6 = ctaid.x * ntid.x + tid.x, the element index
+	st.u32 	[%SP+28], %r6;                      // frame+28 = element index
+	ld.u32 	%r7, [%SP+28];
+	ld.u32 	%r8, [%SP+24];
+	setp.ge.s32 	%p1, %r7, %r8;              // signed test: index >= bound
+	@%p1 bra 	LBB0_2;                         // out of range: skip the add and return
+	bra.uni 	LBB0_1;
+LBB0_1:
+	ld.u64 	%rd10, [%SP+0];
+	ld.s32 	%rd11, [%SP+28];                    // reload the index sign-extended to 64 bits
+	shl.b64 	%rd12, %rd11, 3;                // byte offset = index * 8 (f64 elements)
+	add.s64 	%rd13, %rd10, %rd12;
+	ld.f64 	%fd1, [%rd13];                      // fd1 = element of the param_0 array
+	ld.u64 	%rd14, [%SP+8];
+	add.s64 	%rd15, %rd14, %rd12;
+	ld.f64 	%fd2, [%rd15];                      // fd2 = element of the param_1 array
+	add.rn.f64 	%fd3, %fd1, %fd2;
+	ld.u64 	%rd16, [%SP+16];
+	add.s64 	%rd17, %rd16, %rd12;
+	st.f64 	[%rd17], %fd3;                      // store the sum into the param_2 array at the same index
+	bra.uni 	LBB0_2;
+LBB0_2:
+	ret;
+
+}
--- a/examples/vecadd/vecadd-host-x86_64-pc-linux-gnu.cui
+++ b/examples/vecadd/vecadd-host-x86_64-pc-linux-gnu.cui
--- a/examples/vecadd/vecadd-host-x86_64-pc-linux-gnu.s
+++ b/examples/vecadd/vecadd-host-x86_64-pc-linux-gnu.s
--- a/examples/vecadd/vecadd.cu-cuda-nvptx64-nvidia-cuda.fatbin
+++ b/examples/vecadd/vecadd.cu-cuda-nvptx64-nvidia-cuda.fatbin
--- a/test/runHeteroMark.sh
+++ b/test/runHeteroMark.sh
@ -19,53 +19,10 @@ cd $TestCase
 clang++ -std=c++11 $HeteroMark_PATH/src/$1/cuda/$1_cuda_benchmark.cu -I$HeteroMark_PATH \
        --cuda-path=$CUDA_PATH  \
        --cuda-gpu-arch=sm_50 -L$CUDA_PATH/lib64 \
-        -lcudart_static -ldl -lrt -pthread -save-temps -v  || true
-export LD_LIBRARY_PATH=$CuPBoP_BUILD_PATH/runtime:$CuPBoP_BUILD_PATH/runtime/threadPool:$LD_LIBRARY_PATH
-export PATH=$CuPBoP_BUILD_PATH/compilation:$PATH
-kernelTranslator $1_cuda_benchmark-cuda-nvptx64-nvidia-cuda-sm_50.bc kernel.bc
-hostTranslator $1_cuda_benchmark-host-x86_64-unknown-linux-gnu.bc host.bc
-llc --relocation-model=pic --filetype=obj  kernel.bc
-llc --relocation-model=pic --filetype=obj  host.bc
-g++ -o $1 -fPIC -no-pie \
-    $HeteroMark_PATH/src/$1/cuda/main.cc host.o kernel.o $HeteroMark_PATH/src/$1/*.cc  $HeteroMark_PATH/src/common/benchmark/*.cc \
-    $HeteroMark_PATH/src/common/command_line_option/*.cc  $HeteroMark_PATH/src/common/time_measurement/*.cc \
-    -L$CuPBoP_BUILD_PATH/runtime   -L$CuPBoP_BUILD_PATH/runtime/threadPool \
-    -I$HeteroMark_PATH -I$CUDA_PATH/include -lpthread -lc -lCPUruntime -lthreadPool
+        -lcudart_static -ldl -lrt -pthread -save-temps

-case $1 in
-  aes)
-    ./$1 -i $DATASET_PATH/aes/1KB.data -k $DATASET_PATH/aes/key.data -q -v
-    ;;
+# clang++ -std=c++11 vecadd.cu \
+#       -I../.. --cuda-path=$CUDA_PATH \
+#       --cuda-gpu-arch=sm_50 -L$CUDA_PATH/lib64 \
+#       -lcudart_static -ldl -lrt -pthread -save-temps -v

-  bs)
-    ./$1 -q -v
-    ;;
-
-  ep)
-    ./$1 -q -v -m 10 -x 64
-    ;;
-
-  fir)
-    ./$1 -q -v
-    ;;
-
-#   ga)
-#     ./$1 -q -i $DATASET_PATH/ga/1024_64.data -v
-#     ;;
-
-  hist)
-    ./$1 -q -v
-    ;;
-
-  kmeans)
-    ./$1 -i $DATASET_PATH/kmeans/100_34.txt -q -v
-    ;;
-
-  pr)
-    ./$1 -i $DATASET_PATH/pr/512.data -q -v
-    ;;
-
-  *)
-    echo -n "unknown"
-    ;;
-esac