add vecadd example and update README.md
This commit is contained in:
parent
91e94ad3a6
commit
49adfd026c
57
README.md
57
README.md
|
@ -5,13 +5,13 @@
|
||||||
CuPBoP is a framework which supports executing unmodified CUDA source code
|
CuPBoP is a framework which supports executing unmodified CUDA source code
|
||||||
on non-NVIDIA devices.
|
on non-NVIDIA devices.
|
||||||
Currently, CuPBoP supports several CPU backends, including x86, AArch64, and RISC-V.
|
Currently, CuPBoP supports several CPU backends, including x86, AArch64, and RISC-V.
|
||||||
Support for the [Vortex](https://vortex.cc.gatech.edu/) backend is a work in progress.
|
Support for the RISC-V GPU [Vortex](https://vortex.cc.gatech.edu/) is a work in progress.
|
||||||
|
|
||||||
## Install
|
## Install
|
||||||
|
|
||||||
### Prerequisites
|
### Prerequisites
|
||||||
|
|
||||||
- Linux
|
- Linux system
|
||||||
- [LLVM 14.0.1](https://github.com/llvm/llvm-project/releases/tag/llvmorg-14.0.1)
|
- [LLVM 14.0.1](https://github.com/llvm/llvm-project/releases/tag/llvmorg-14.0.1)
|
||||||
|
|
||||||
### Installation
|
### Installation
|
||||||
|
@ -48,36 +48,33 @@ Supporting [Vortex](https://vortex.cc.gatech.edu/) backend is working in progres
|
||||||
make
|
make
|
||||||
```
|
```
|
||||||
|
|
||||||
## Run HIST application in Hetero-mark benchmark
|
## Run Vector Addition example
|
||||||
|
|
||||||
```bash
|
```bash
|
||||||
# Clone Hetero-mark benchmark
|
cd examples/vecadd
|
||||||
git clone https://github.com/drcut/SC_evaluate
|
# Compile CUDA source code (both host and kernel) to bitcode files
|
||||||
cd SC_evaluate/Hetero-cox/src/hist
|
clang++ -std=c++11 vecadd.cu \
|
||||||
# Compile CUDA source code to LLVM IR
|
-I../.. --cuda-path=$CuPBoP_PATH/cuda-10.1 \
|
||||||
# this may raise error due to absence of CUDA library, just ignore them
|
--cuda-gpu-arch=sm_50 -L$CuPBoP_PATH/cuda-10.1/lib64 \
|
||||||
clang++ -std=c++11 cuda/hist_cuda_benchmark.cu \\
|
-lcudart_static -ldl -lrt -pthread -save-temps -v || true
|
||||||
-I../.. --cuda-path=$CuPBoP_PATH/cuda-10.1 \\
|
# Apply compilation transformations on the kernel bitcode file
|
||||||
--cuda-gpu-arch=sm_50 -L$CuPBoP_PATH/cuda-10.1/lib64 \\
|
$CuPBoP_PATH/build/compilation/kernelTranslator \
|
||||||
-lcudart_static -ldl -lrt -pthread -save-temps -v || true
|
vecadd-cuda-nvptx64-nvidia-cuda-sm_50.bc kernel.bc
|
||||||
# Translate host/kernel LLVM IR to formats that suitable for CPU
|
# Apply compilation transformations on the host bitcode file
|
||||||
$CuPBoP_PATH/build/compilation/kernelTranslator \\
|
$CuPBoP_PATH/build/compilation/hostTranslator \
|
||||||
hist_cuda_benchmark-cuda-nvptx64-nvidia-cuda-sm_50.bc kernel.bc
|
vecadd-host-x86_64-unknown-linux-gnu.bc host.bc
|
||||||
$CuPBoP_PATH/build/compilation/hostTranslator \\
|
# Generate object files
|
||||||
hist_cuda_benchmark-host-x86_64-unknown-linux-gnu.bc host.bc
|
|
||||||
# generate object files
|
|
||||||
llc --relocation-model=pic --filetype=obj kernel.bc
|
llc --relocation-model=pic --filetype=obj kernel.bc
|
||||||
llc --relocation-model=pic --filetype=obj host.bc
|
llc --relocation-model=pic --filetype=obj host.bc
|
||||||
# generate CPU executable file
|
# Link with runtime libraries and generate the executable file
|
||||||
g++ -o hist -fPIC -no-pie \\
|
g++ -o vecadd -fPIC -no-pie \
|
||||||
-I$CuPBoP_PATH/runtime/threadPool/include \\
|
-I$CuPBoP_PATH/runtime/threadPool/include \
|
||||||
-L$CuPBoP_PATH/build/runtime \\
|
-L$CuPBoP_PATH/build/runtime \
|
||||||
-L$CuPBoP_PATH/build/runtime/threadPool \\
|
-L$CuPBoP_PATH/build/runtime/threadPool \
|
||||||
cuda/main.cc host.o kernel.o *.cc ../common/benchmark/*.cc \\
|
host.o kernel.o \
|
||||||
../common/command_line_option/*.cc ../common/time_measurement/*.cc \\
|
-I../.. -lpthread -lc -lx86Runtime -lthreadPool
|
||||||
-I../.. -lpthread -lc -lx86Runtime -lthreadPool
|
# Execute
|
||||||
# execute and verify
|
./vecadd
|
||||||
./hist -q -v
|
|
||||||
```
|
```
|
||||||
|
|
||||||
## How to contribute?
|
## How to contribute?
|
||||||
|
@ -87,10 +84,10 @@ Please refer to [Contribution.md](./CONTRIBUTING.md) for more detail.
|
||||||
|
|
||||||
## Related publications
|
## Related publications
|
||||||
|
|
||||||
- "COX: Exposing CUDA Warp-Level Functions to CPUs"
|
- COX: Exposing CUDA Warp-Level Functions to CPUs
|
||||||
ACM Transactions on Architecture and Code Optimization
|
ACM Transactions on Architecture and Code Optimization
|
||||||
[link](https://dl.acm.org/doi/abs/10.1145/3554736)
|
[link](https://dl.acm.org/doi/abs/10.1145/3554736)
|
||||||
- "CuPBoP: CUDA for Parallelized and Broad-range Processors"
|
- CuPBoP: CUDA for Parallelized and Broad-range Processors
|
||||||
arxiv preprint
|
arxiv preprint
|
||||||
[link](https://arxiv.org/abs/2206.07896)
|
[link](https://arxiv.org/abs/2206.07896)
|
||||||
|
|
||||||
|
|
|
@ -0,0 +1,95 @@
|
||||||
|
// Get from: https://github.com/olcf/vector_addition_tutorials
|
||||||
|
#include <stdio.h>
|
||||||
|
#include <stdlib.h>
|
||||||
|
#include <math.h>
|
||||||
|
|
||||||
|
// Tolerance for the final check in main(): the mean of the output vector must be within this distance of 1.0.
const double epsilon = 1e-6;
|
||||||
|
// CUDA kernel. Each thread takes care of one element of c
|
||||||
|
// Element-wise vector addition kernel: c[i] = a[i] + b[i] for 0 <= i < n.
// Each thread handles at most one element, selected by its global index
// along the x dimension; threads whose index is >= n do nothing.
//   a, b : input vectors (read)
//   c    : output vector (written)
//   n    : number of elements to process
__global__ void vecAdd(double *a, double *b, double *c, int n)
{
    // Flattened index of this thread across the whole grid (x dimension only)
    const int idx = threadIdx.x + blockDim.x * blockIdx.x;

    // The grid may be larger than n; surplus threads exit without touching memory
    if (idx >= n)
        return;

    c[idx] = a[idx] + b[idx];
}
|
||||||
|
|
||||||
|
// Returns true when `status` is cudaSuccess; otherwise reports the failing
// call (named by `what`) and its numeric error code on stderr.
static bool cudaOk(cudaError_t status, const char *what)
{
    if (status == cudaSuccess)
        return true;
    fprintf(stderr, "%s failed with CUDA error code %d\n", what, (int)status);
    return false;
}

// Vector addition self-test.
// Fills a[i] = sin(i)^2 and b[i] = cos(i)^2 on the host, adds them with the
// vecAdd kernel, and checks that the mean of the result is 1 within epsilon.
// Prints "PASS" or "FAIL". Returns EXIT_SUCCESS only on PASS; any allocation
// or copy failure, or a failed check, yields EXIT_FAILURE.
int main( int argc, char* argv[] )
{
    (void)argc;
    (void)argv;
    //cudaSetDevice(0);

    // Size of vectors
    const int n = 100000;

    // Size, in bytes, of each vector
    const size_t bytes = n*sizeof(double);

    // Host input vectors (h_a, h_b) and host output vector (h_c)
    double *h_a = (double*)malloc(bytes);
    double *h_b = (double*)malloc(bytes);
    double *h_c = (double*)malloc(bytes);

    // Device input vectors (d_a, d_b) and device output vector (d_c)
    double *d_a = NULL;
    double *d_b = NULL;
    double *d_c = NULL;

    int exitCode = EXIT_FAILURE;

    bool ok = (h_a != NULL) && (h_b != NULL) && (h_c != NULL);
    if (!ok)
        fprintf(stderr, "host memory allocation failed\n");

    // Allocate memory for each vector on the device
    ok = ok
        && cudaOk(cudaMalloc(&d_a, bytes), "cudaMalloc(d_a)")
        && cudaOk(cudaMalloc(&d_b, bytes), "cudaMalloc(d_b)")
        && cudaOk(cudaMalloc(&d_c, bytes), "cudaMalloc(d_c)");

    if (ok) {
        // Initialize vectors on host; a[i] + b[i] is mathematically 1
        for (int i = 0; i < n; i++) {
            h_a[i] = sin((double)i)*sin((double)i);
            h_b[i] = cos((double)i)*cos((double)i);
        }

        // Copy host vectors to device
        ok = cudaOk(cudaMemcpy(d_a, h_a, bytes, cudaMemcpyHostToDevice), "cudaMemcpy(d_a)")
            && cudaOk(cudaMemcpy(d_b, h_b, bytes, cudaMemcpyHostToDevice), "cudaMemcpy(d_b)");
    }

    if (ok) {
        // Number of threads in each thread block
        const int blockSize = 1024;

        // Number of thread blocks in grid: integer ceiling division, so the
        // result does not depend on float rounding of n
        const int gridSize = (n + blockSize - 1) / blockSize;

        // Execute the kernel
        vecAdd<<<gridSize, blockSize>>>(d_a, d_b, d_c, n);

        // Copy array back to host
        ok = cudaOk(cudaMemcpy(h_c, d_c, bytes, cudaMemcpyDeviceToHost), "cudaMemcpy(h_c)");
    }

    if (ok) {
        // Sum up vector c and divide by n, this should equal 1 within error
        double sum = 0;
        for (int i = 0; i < n; i++)
            sum += h_c[i];
        sum /= (double)n;

        // fabs, not abs: an integer abs() would truncate the difference to 0
        // and let any error smaller than 1 pass
        if (fabs(sum - 1.0) < epsilon) {
            printf("PASS\n");
            exitCode = EXIT_SUCCESS;
        } else {
            printf("FAIL\n");
        }
    }

    // Release device memory (only what was actually allocated)
    if (d_a != NULL) cudaFree(d_a);
    if (d_b != NULL) cudaFree(d_b);
    if (d_c != NULL) cudaFree(d_c);

    // Release host memory; free(NULL) is a no-op
    free(h_a);
    free(h_b);
    free(h_c);

    return exitCode;
}
|
Loading…
Reference in New Issue