add static/dynamic shared memory example
This commit is contained in:
		
							parent
							
								
									3d22cc1f36
								
							
						
					
					
						commit
						e5f020d997
					
				|  | @ -52,6 +52,30 @@ jobs: | |||
|           cd ${{ github.workspace }}/SC_evaluate/Hetero-cox | ||||
|           wget https://www.dropbox.com/s/ie2hcxw9lfoghg8/data.tar.gz?dl=1 | ||||
|           tar -xzf 'data.tar.gz?dl=1' | ||||
|       - name: Execute the static shared memory demo | ||||
|         run: | | ||||
|           cd ${{ github.workspace }}/examples/sharedMemory | ||||
|           clang++ -std=c++11 reverse.cu -I../.. --cuda-path=${{ github.workspace }}/cuda-10.1  --cuda-gpu-arch=sm_50 -L${{ github.workspace }}/cuda-10.1/lib64 -lcudart_static -ldl -lrt -pthread -save-temps -v  || true | ||||
|           export LD_LIBRARY_PATH=${{ github.workspace }}/build/runtime:${{ github.workspace }}/build/runtime/threadPool:$LD_LIBRARY_PATH | ||||
|           export PATH=${{ github.workspace }}/build/compilation:$PATH | ||||
|           kernelTranslator reverse-cuda-nvptx64-nvidia-cuda-sm_50.bc kernel.bc | ||||
|           hostTranslator reverse_cuda_benchmark-host-x86_64-unknown-linux-gnu.bc host.bc | ||||
|           llc --relocation-model=pic --filetype=obj  kernel.bc | ||||
|           llc --relocation-model=pic --filetype=obj  host.bc | ||||
|           g++ -o reverse -fPIC -no-pie -L${{ github.workspace }}/build/runtime   -L${{ github.workspace }}/build/runtime/threadPool   cuda/main.cc host.o kernel.o *.cc  ../common/benchmark/*.cc ../common/command_line_option/*.cc  ../common/time_measurement/*.cc -I../.. -lpthread -lc -lx86Runtime -lthreadPool | ||||
|           ./reverse | ||||
|       - name: Execute the dynamic shared memory demo | ||||
|         run: | | ||||
|           cd ${{ github.workspace }}/examples/dynamicSharedMemory | ||||
|           clang++ -std=c++11 reverse.cu -I../.. --cuda-path=${{ github.workspace }}/cuda-10.1  --cuda-gpu-arch=sm_50 -L${{ github.workspace }}/cuda-10.1/lib64 -lcudart_static -ldl -lrt -pthread -save-temps -v  || true | ||||
|           export LD_LIBRARY_PATH=${{ github.workspace }}/build/runtime:${{ github.workspace }}/build/runtime/threadPool:$LD_LIBRARY_PATH | ||||
|           export PATH=${{ github.workspace }}/build/compilation:$PATH | ||||
|           kernelTranslator reverse-cuda-nvptx64-nvidia-cuda-sm_50.bc kernel.bc | ||||
|           hostTranslator reverse_cuda_benchmark-host-x86_64-unknown-linux-gnu.bc host.bc | ||||
|           llc --relocation-model=pic --filetype=obj  kernel.bc | ||||
|           llc --relocation-model=pic --filetype=obj  host.bc | ||||
|           g++ -o reverse -fPIC -no-pie -L${{ github.workspace }}/build/runtime   -L${{ github.workspace }}/build/runtime/threadPool   cuda/main.cc host.o kernel.o *.cc  ../common/benchmark/*.cc ../common/command_line_option/*.cc  ../common/time_measurement/*.cc -I../.. -lpthread -lc -lx86Runtime -lthreadPool | ||||
|           ./reverse | ||||
|       - name: Execute the AES example | ||||
|         run: | | ||||
|           cd ${{ github.workspace }}/SC_evaluate/Hetero-cox/src/aes | ||||
|  |  | |||
|  | @ -0,0 +1,40 @@ | |||
| // Get from: https://developer.nvidia.com/blog/using-shared-memory-cuda-cc/ | ||||
| #include <stdio.h> | ||||
| #include <stdlib.h> | ||||
| 
 | ||||
| __global__ void dynamicReverse(int *d, int n) | ||||
| { | ||||
|   extern __shared__ int s[]; | ||||
|   int t = threadIdx.x; | ||||
|   int tr = n-t-1; | ||||
|   s[t] = d[t]; | ||||
|   __syncthreads(); | ||||
|   d[t] = s[tr]; | ||||
| } | ||||
| 
 | ||||
| int main() | ||||
| { | ||||
|   const int n = 64; | ||||
|   int a[n], r[n], d[n]; | ||||
| 
 | ||||
|   for (int i = 0; i < n; i++) { | ||||
|     a[i] = i; | ||||
|     r[i] = n-i-1; | ||||
|     d[i] = 0; | ||||
|   } | ||||
| 
 | ||||
|   int *d_d; | ||||
|   cudaMalloc(&d_d, n * sizeof(int)); | ||||
| 
 | ||||
|   // run version with static shared memory | ||||
|   cudaMemcpy(d_d, a, n*sizeof(int), cudaMemcpyHostToDevice); | ||||
|   dynamicReverse<<<1,n,n*sizeof(int)>>>(d_d, n); | ||||
|   cudaMemcpy(d, d_d, n*sizeof(int), cudaMemcpyDeviceToHost); | ||||
|   for (int i = 0; i < n; i++) | ||||
|     if (d[i] != r[i]) { | ||||
|       printf("Error: d[%d]!=r[%d] (%d, %d)n", i, i, d[i], r[i]); | ||||
|       exit(1); | ||||
|     } | ||||
|     printf("PASS\n"); | ||||
|     return 0; | ||||
| } | ||||
|  | @ -33,7 +33,7 @@ int main() | |||
|   for (int i = 0; i < n; i++) | ||||
|     if (d[i] != r[i]) { | ||||
|       printf("Error: d[%d]!=r[%d] (%d, %d)n", i, i, d[i], r[i]); | ||||
|       return 1; | ||||
|       exit(1); | ||||
|     } | ||||
|     printf("PASS\n"); | ||||
|     return 0; | ||||
|  |  | |||
		Loading…
	
		Reference in New Issue