From 49adfd026cf03bb3d9c55c63f215f46cb1948f0b Mon Sep 17 00:00:00 2001
From: Ruobing Han <hanruobing@gatech.edu>
Date: Thu, 15 Sep 2022 11:15:21 -0400
Subject: [PATCH] add vecadd example and update README.md

---
 README.md                 | 57 +++++++++++------------
 examples/vecadd/vecadd.cu | 95 +++++++++++++++++++++++++++++++++++++++
 2 files changed, 122 insertions(+), 30 deletions(-)
 create mode 100644 examples/vecadd/vecadd.cu

diff --git a/README.md b/README.md
index 7aac4b8..396fd2c 100644
--- a/README.md
+++ b/README.md
@@ -5,13 +5,13 @@
 CuPBoP is a framework which support executing unmodified CUDA source code
 on non-NVIDIA devices.
 Currently, CuPBoP support serveral CPU backends, including x86, AArch64, and RISC-V.
-Supporting [Vortex](https://vortex.cc.gatech.edu/) backend is working in progress.
+Supporting the RISC-V GPU [Vortex](https://vortex.cc.gatech.edu/) is working in progress.
 
 ## Install
 
 ### Prerequisites
 
-- Linux
+- Linux system
 - [LLVM 14.0.1](https://github.com/llvm/llvm-project/releases/tag/llvmorg-14.0.1)
 
 ### Installation
@@ -48,36 +48,33 @@ Supporting [Vortex](https://vortex.cc.gatech.edu/) backend is working in progres
    make
    ```
 
-## Run HIST application in Hetero-mark benchmark
+## Run Vector Addition example
 
 ```bash
-# Clone Hetero-mark benchmark
-git clone https://github.com/drcut/SC_evaluate
-cd SC_evaluate/Hetero-cox/src/hist
-# Compile CUDA source code to LLVM IR
-# this may raise error due to absence of CUDA library, just ignore them
-clang++ -std=c++11 cuda/hist_cuda_benchmark.cu \\
-    -I../.. --cuda-path=$CuPBoP_PATH/cuda-10.1 \\
-    --cuda-gpu-arch=sm_50 -L$CuPBoP_PATH/cuda-10.1/lib64 \\
-    -lcudart_static -ldl -lrt -pthread -save-temps -v  || true
-# Translate host/kernel LLVM IR to formats that suitable for CPU
-$CuPBoP_PATH/build/compilation/kernelTranslator \\
-   hist_cuda_benchmark-cuda-nvptx64-nvidia-cuda-sm_50.bc kernel.bc
-$CuPBoP_PATH/build/compilation/hostTranslator \\
-   hist_cuda_benchmark-host-x86_64-unknown-linux-gnu.bc host.bc
-# generate object files
+cd examples/vecadd
+# Compile CUDA source code (both host and kernel) to bitcode files
+clang++ -std=c++11 vecadd.cu \
+      -I../.. --cuda-path=$CuPBoP_PATH/cuda-10.1 \
+      --cuda-gpu-arch=sm_50 -L$CuPBoP_PATH/cuda-10.1/lib64 \
+      -lcudart_static -ldl -lrt -pthread -save-temps -v  || true
+# Apply compilation transformations on the kernel bitcode file
+$CuPBoP_PATH/build/compilation/kernelTranslator \
+      vecadd-cuda-nvptx64-nvidia-cuda-sm_50.bc kernel.bc
+# Apply compilation transformations on the host bitcode file
+$CuPBoP_PATH/build/compilation/hostTranslator \
+      vecadd-host-x86_64-unknown-linux-gnu.bc host.bc
+# Generate object files
 llc --relocation-model=pic --filetype=obj  kernel.bc
 llc --relocation-model=pic --filetype=obj  host.bc
-# generate CPU executable file
-g++ -o hist -fPIC -no-pie \\
--I$CuPBoP_PATH/runtime/threadPool/include \\
--L$CuPBoP_PATH/build/runtime  \\
--L$CuPBoP_PATH/build/runtime/threadPool \\
-cuda/main.cc host.o kernel.o *.cc  ../common/benchmark/*.cc \\
-../common/command_line_option/*.cc  ../common/time_measurement/*.cc \\
--I../.. -lpthread -lc -lx86Runtime -lthreadPool
-# execute and verify
-./hist -q -v
+# Link with runtime libraries and generate the executable file
+g++ -o vecadd -fPIC -no-pie \
+      -I$CuPBoP_PATH/runtime/threadPool/include \
+      -L$CuPBoP_PATH/build/runtime  \
+      -L$CuPBoP_PATH/build/runtime/threadPool \
+      host.o kernel.o \
+      -I../.. -lpthread -lc -lx86Runtime -lthreadPool
+# Execute
+./vecadd
 ```
 
 ## How to contribute?
@@ -87,10 +84,10 @@ Please refer to [Contribution.md](./CONTRIBUTING.md) for more detail.
 
 ## Related publications
 
-- "COX: Exposing CUDA Warp-Level Functions to CPUs"
+- COX: Exposing CUDA Warp-Level Functions to CPUs
 ACM Transactions on Architecture and Code Optimization
 [link](https://dl.acm.org/doi/abs/10.1145/3554736)
-- "CuPBoP: CUDA for Parallelized and Broad-range Processors"
+- CuPBoP: CUDA for Parallelized and Broad-range Processors
 arxiv preprint
 [link](https://arxiv.org/abs/2206.07896)
 
diff --git a/examples/vecadd/vecadd.cu b/examples/vecadd/vecadd.cu
new file mode 100644
index 0000000..6371ca4
--- /dev/null
+++ b/examples/vecadd/vecadd.cu
@@ -0,0 +1,95 @@
+// Get from: https://github.com/olcf/vector_addition_tutorials
+#include <stdio.h>
+#include <stdlib.h>
+#include <math.h>
+
+const double epsilon = 1e-6;
+// CUDA kernel. Each thread takes care of one element of c
+__global__ void vecAdd(double *a, double *b, double *c, int n)
+{
+    // Get our global thread ID
+    int id = blockIdx.x*blockDim.x+threadIdx.x;
+
+    // Make sure we do not go out of bounds
+    if (id < n)
+        c[id] = a[id] + b[id];
+}
+
+int main( int argc, char* argv[] )
+{
+    //cudaSetDevice(0);
+    // Size of vectors
+    int n = 100000;
+
+    // Host input vectors
+    double *h_a;
+    double *h_b;
+    //Host output vector
+    double *h_c;
+
+    // Device input vectors
+    double *d_a;
+    double *d_b;
+    //Device output vector
+    double *d_c;
+
+    // Size, in bytes, of each vector
+    size_t bytes = n*sizeof(double);
+
+    // Allocate memory for each vector on host
+    h_a = (double*)malloc(bytes);
+    h_b = (double*)malloc(bytes);
+    h_c = (double*)malloc(bytes);
+
+    // Allocate memory for each vector on GPU
+    cudaMalloc(&d_a, bytes);
+    cudaMalloc(&d_b, bytes);
+    cudaMalloc(&d_c, bytes);
+
+    int i;
+    // Initialize vectors on host
+    for( i = 0; i < n; i++ ) {
+        h_a[i] = sin(i)*sin(i);
+        h_b[i] = cos(i)*cos(i);
+    }
+
+    // Copy host vectors to device
+    cudaMemcpy( d_a, h_a, bytes, cudaMemcpyHostToDevice);
+    cudaMemcpy( d_b, h_b, bytes, cudaMemcpyHostToDevice);
+
+    int blockSize, gridSize;
+
+    // Number of threads in each thread block
+    blockSize = 1024;
+
+    // Number of thread blocks in grid
+    gridSize = (int)ceil((float)n/blockSize);
+
+    // Execute the kernel
+    vecAdd<<<gridSize, blockSize>>>(d_a, d_b, d_c, n);
+
+    // Copy array back to host
+    cudaMemcpy( h_c, d_c, bytes, cudaMemcpyDeviceToHost );
+
+    // Sum up vector c and print result divided by n, this should equal 1 within error
+    double sum = 0;
+    for(i=0; i<n; i++)
+        sum += h_c[i];
+    sum/=(double)n;
+    if(abs(sum-1.0)<epsilon)
+        printf("PASS\n");
+    else
+        printf("FAIL\n");
+
+    // Release device memory
+    cudaFree(d_a);
+    cudaFree(d_b);
+    cudaFree(d_c);
+
+    // Release host memory
+    free(h_a);
+    free(h_b);
+    free(h_c);
+
+    return 0;
+}