From 49adfd026cf03bb3d9c55c63f215f46cb1948f0b Mon Sep 17 00:00:00 2001 From: Ruobing Han Date: Thu, 15 Sep 2022 11:15:21 -0400 Subject: [PATCH] add vecadd example and update README.md --- README.md | 57 +++++++++++------------ examples/vecadd/vecadd.cu | 95 +++++++++++++++++++++++++++++++++++++++ 2 files changed, 122 insertions(+), 30 deletions(-) create mode 100644 examples/vecadd/vecadd.cu diff --git a/README.md b/README.md index 7aac4b8..396fd2c 100644 --- a/README.md +++ b/README.md @@ -5,13 +5,13 @@ CuPBoP is a framework which support executing unmodified CUDA source code on non-NVIDIA devices. Currently, CuPBoP support serveral CPU backends, including x86, AArch64, and RISC-V. -Supporting [Vortex](https://vortex.cc.gatech.edu/) backend is working in progress. +Supporting the RISC-V GPU [Vortex](https://vortex.cc.gatech.edu/) is working in progress. ## Install ### Prerequisites -- Linux +- Linux system - [LLVM 14.0.1](https://github.com/llvm/llvm-project/releases/tag/llvmorg-14.0.1) ### Installation @@ -48,36 +48,33 @@ Supporting [Vortex](https://vortex.cc.gatech.edu/) backend is working in progres make ``` -## Run HIST application in Hetero-mark benchmark +## Run Vector Addition example ```bash -# Clone Hetero-mark benchmark -git clone https://github.com/drcut/SC_evaluate -cd SC_evaluate/Hetero-cox/src/hist -# Compile CUDA source code to LLVM IR -# this may raise error due to absence of CUDA library, just ignore them -clang++ -std=c++11 cuda/hist_cuda_benchmark.cu \\ - -I../.. --cuda-path=$CuPBoP_PATH/cuda-10.1 \\ - --cuda-gpu-arch=sm_50 -L$CuPBoP_PATH/cuda-10.1/lib64 \\ - -lcudart_static -ldl -lrt -pthread -save-temps -v || true -# Translate host/kernel LLVM IR to formats that suitable for CPU -$CuPBoP_PATH/build/compilation/kernelTranslator \\ - hist_cuda_benchmark-cuda-nvptx64-nvidia-cuda-sm_50.bc kernel.bc -$CuPBoP_PATH/build/compilation/hostTranslator \\ - hist_cuda_benchmark-host-x86_64-unknown-linux-gnu.bc host.bc -# generate object files +cd examples/vecadd +# Compile CUDA source code (both host and kernel) to bitcode files +clang++ -std=c++11 vecadd.cu \ + -I../.. --cuda-path=$CuPBoP_PATH/cuda-10.1 \ + --cuda-gpu-arch=sm_50 -L$CuPBoP_PATH/cuda-10.1/lib64 \ + -lcudart_static -ldl -lrt -pthread -save-temps -v || true +# Apply compilation transformations on the kernel bitcode file +$CuPBoP_PATH/build/compilation/kernelTranslator \ + vecadd-cuda-nvptx64-nvidia-cuda-sm_50.bc kernel.bc +# Apply compilation transformations on the host bitcode file +$CuPBoP_PATH/build/compilation/hostTranslator \ + vecadd-host-x86_64-unknown-linux-gnu.bc host.bc +# Generate object files llc --relocation-model=pic --filetype=obj kernel.bc llc --relocation-model=pic --filetype=obj host.bc -# generate CPU executable file -g++ -o hist -fPIC -no-pie \\ --I$CuPBoP_PATH/runtime/threadPool/include \\ --L$CuPBoP_PATH/build/runtime \\ --L$CuPBoP_PATH/build/runtime/threadPool \\ -cuda/main.cc host.o kernel.o *.cc ../common/benchmark/*.cc \\ -../common/command_line_option/*.cc ../common/time_measurement/*.cc \\ --I../.. -lpthread -lc -lx86Runtime -lthreadPool -# execute and verify -./hist -q -v +# Link with runtime libraries and generate the executable file +g++ -o vecadd -fPIC -no-pie \ + -I$CuPBoP_PATH/runtime/threadPool/include \ + -L$CuPBoP_PATH/build/runtime \ + -L$CuPBoP_PATH/build/runtime/threadPool \ + host.o kernel.o \ + -I../.. -lpthread -lc -lx86Runtime -lthreadPool +# Execute +./vecadd ``` ## How to contribute? @@ -87,10 +84,10 @@ Please refer to [Contribution.md](./CONTRIBUTING.md) for more detail. ## Related publications -- "COX: Exposing CUDA Warp-Level Functions to CPUs" +- COX: Exposing CUDA Warp-Level Functions to CPUs ACM Transactions on Architecture and Code Optimization [link](https://dl.acm.org/doi/abs/10.1145/3554736) -- "CuPBoP: CUDA for Parallelized and Broad-range Processors" +- CuPBoP: CUDA for Parallelized and Broad-range Processors arxiv preprint [link](https://arxiv.org/abs/2206.07896) diff --git a/examples/vecadd/vecadd.cu b/examples/vecadd/vecadd.cu new file mode 100644 index 0000000..6371ca4 --- /dev/null +++ b/examples/vecadd/vecadd.cu @@ -0,0 +1,95 @@ +// Get from: https://github.com/olcf/vector_addition_tutorials +#include +#include +#include + +const double epsilon = 1e-6; +// CUDA kernel. Each thread takes care of one element of c +__global__ void vecAdd(double *a, double *b, double *c, int n) +{ + // Get our global thread ID + int id = blockIdx.x*blockDim.x+threadIdx.x; + + // Make sure we do not go out of bounds + if (id < n) + c[id] = a[id] + b[id]; +} + +int main( int argc, char* argv[] ) +{ + //cudaSetDevice(0); + // Size of vectors + int n = 100000; + + // Host input vectors + double *h_a; + double *h_b; + //Host output vector + double *h_c; + + // Device input vectors + double *d_a; + double *d_b; + //Device output vector + double *d_c; + + // Size, in bytes, of each vector + size_t bytes = n*sizeof(double); + + // Allocate memory for each vector on host + h_a = (double*)malloc(bytes); + h_b = (double*)malloc(bytes); + h_c = (double*)malloc(bytes); + + // Allocate memory for each vector on GPU + cudaMalloc(&d_a, bytes); + cudaMalloc(&d_b, bytes); + cudaMalloc(&d_c, bytes); + + int i; + // Initialize vectors on host + for( i = 0; i < n; i++ ) { + h_a[i] = sin(i)*sin(i); + h_b[i] = cos(i)*cos(i); + } + + // Copy host vectors to device + cudaMemcpy( d_a, h_a, bytes, cudaMemcpyHostToDevice); + cudaMemcpy( d_b, h_b, bytes, cudaMemcpyHostToDevice); + + int blockSize, gridSize; + + // Number of threads in each thread block + blockSize = 1024; + + // Number of thread blocks in grid + gridSize = (int)ceil((float)n/blockSize); + + // Execute the kernel + vecAdd<<>>(d_a, d_b, d_c, n); + + // Copy array back to host + cudaMemcpy( h_c, d_c, bytes, cudaMemcpyDeviceToHost ); + + // Sum up vector c and print result divided by n, this should equal 1 within error + double sum = 0; + for(i=0; i