add static/dynamic shared memory example
This commit is contained in:
		
							parent
							
								
									3d22cc1f36
								
							
						
					
					
						commit
						e5f020d997
					
				|  | @ -52,6 +52,30 @@ jobs: | ||||||
|           cd ${{ github.workspace }}/SC_evaluate/Hetero-cox |           cd ${{ github.workspace }}/SC_evaluate/Hetero-cox | ||||||
|           wget https://www.dropbox.com/s/ie2hcxw9lfoghg8/data.tar.gz?dl=1 |           wget https://www.dropbox.com/s/ie2hcxw9lfoghg8/data.tar.gz?dl=1 | ||||||
|           tar -xzf 'data.tar.gz?dl=1' |           tar -xzf 'data.tar.gz?dl=1' | ||||||
|  |       - name: Execute the static shared memory demo | ||||||
|  |         run: | | ||||||
|  |           cd ${{ github.workspace }}/examples/sharedMemory | ||||||
|  |           clang++ -std=c++11 reverse.cu -I../.. --cuda-path=${{ github.workspace }}/cuda-10.1  --cuda-gpu-arch=sm_50 -L${{ github.workspace }}/cuda-10.1/lib64 -lcudart_static -ldl -lrt -pthread -save-temps -v  || true | ||||||
|  |           export LD_LIBRARY_PATH=${{ github.workspace }}/build/runtime:${{ github.workspace }}/build/runtime/threadPool:$LD_LIBRARY_PATH | ||||||
|  |           export PATH=${{ github.workspace }}/build/compilation:$PATH | ||||||
|  |           kernelTranslator reverse-cuda-nvptx64-nvidia-cuda-sm_50.bc kernel.bc | ||||||
|  |           hostTranslator reverse_cuda_benchmark-host-x86_64-unknown-linux-gnu.bc host.bc | ||||||
|  |           llc --relocation-model=pic --filetype=obj  kernel.bc | ||||||
|  |           llc --relocation-model=pic --filetype=obj  host.bc | ||||||
|  |           g++ -o reverse -fPIC -no-pie -L${{ github.workspace }}/build/runtime   -L${{ github.workspace }}/build/runtime/threadPool   cuda/main.cc host.o kernel.o *.cc  ../common/benchmark/*.cc ../common/command_line_option/*.cc  ../common/time_measurement/*.cc -I../.. -lpthread -lc -lx86Runtime -lthreadPool | ||||||
|  |           ./reverse | ||||||
|  |       - name: Execute the dynamic shared memory demo | ||||||
|  |         run: | | ||||||
|  |           cd ${{ github.workspace }}/examples/dynamicSharedMemory | ||||||
|  |           clang++ -std=c++11 reverse.cu -I../.. --cuda-path=${{ github.workspace }}/cuda-10.1  --cuda-gpu-arch=sm_50 -L${{ github.workspace }}/cuda-10.1/lib64 -lcudart_static -ldl -lrt -pthread -save-temps -v  || true | ||||||
|  |           export LD_LIBRARY_PATH=${{ github.workspace }}/build/runtime:${{ github.workspace }}/build/runtime/threadPool:$LD_LIBRARY_PATH | ||||||
|  |           export PATH=${{ github.workspace }}/build/compilation:$PATH | ||||||
|  |           kernelTranslator reverse-cuda-nvptx64-nvidia-cuda-sm_50.bc kernel.bc | ||||||
|  |           hostTranslator reverse_cuda_benchmark-host-x86_64-unknown-linux-gnu.bc host.bc | ||||||
|  |           llc --relocation-model=pic --filetype=obj  kernel.bc | ||||||
|  |           llc --relocation-model=pic --filetype=obj  host.bc | ||||||
|  |           g++ -o reverse -fPIC -no-pie -L${{ github.workspace }}/build/runtime   -L${{ github.workspace }}/build/runtime/threadPool   cuda/main.cc host.o kernel.o *.cc  ../common/benchmark/*.cc ../common/command_line_option/*.cc  ../common/time_measurement/*.cc -I../.. -lpthread -lc -lx86Runtime -lthreadPool | ||||||
|  |           ./reverse | ||||||
|       - name: Execute the AES example |       - name: Execute the AES example | ||||||
|         run: | |         run: | | ||||||
|           cd ${{ github.workspace }}/SC_evaluate/Hetero-cox/src/aes |           cd ${{ github.workspace }}/SC_evaluate/Hetero-cox/src/aes | ||||||
|  |  | ||||||
|  | @ -0,0 +1,40 @@ | ||||||
|  | // Adapted from: https://developer.nvidia.com/blog/using-shared-memory-cuda-cc/ | ||||||
|  | #include <stdio.h> | ||||||
|  | #include <stdlib.h> | ||||||
|  | 
 | ||||||
|  | __global__ void dynamicReverse(int *d, int n) | ||||||
|  | { | ||||||
|  |   extern __shared__ int s[]; | ||||||
|  |   int t = threadIdx.x; | ||||||
|  |   int tr = n-t-1; | ||||||
|  |   s[t] = d[t]; | ||||||
|  |   __syncthreads(); | ||||||
|  |   d[t] = s[tr]; | ||||||
|  | } | ||||||
|  | 
 | ||||||
|  | int main() | ||||||
|  | { | ||||||
|  |   const int n = 64; | ||||||
|  |   int a[n], r[n], d[n]; | ||||||
|  | 
 | ||||||
|  |   for (int i = 0; i < n; i++) { | ||||||
|  |     a[i] = i; | ||||||
|  |     r[i] = n-i-1; | ||||||
|  |     d[i] = 0; | ||||||
|  |   } | ||||||
|  | 
 | ||||||
|  |   int *d_d; | ||||||
|  |   cudaMalloc(&d_d, n * sizeof(int)); | ||||||
|  | 
 | ||||||
|  |   // run version with dynamic shared memory | ||||||
|  |   cudaMemcpy(d_d, a, n*sizeof(int), cudaMemcpyHostToDevice); | ||||||
|  |   dynamicReverse<<<1,n,n*sizeof(int)>>>(d_d, n); | ||||||
|  |   cudaMemcpy(d, d_d, n*sizeof(int), cudaMemcpyDeviceToHost); | ||||||
|  |   for (int i = 0; i < n; i++) | ||||||
|  |     if (d[i] != r[i]) { | ||||||
|  |       printf("Error: d[%d]!=r[%d] (%d, %d)n", i, i, d[i], r[i]); | ||||||
|  |       exit(1); | ||||||
|  |     } | ||||||
|  |     printf("PASS\n"); | ||||||
|  |     return 0; | ||||||
|  | } | ||||||
|  | @ -33,7 +33,7 @@ int main() | ||||||
|   for (int i = 0; i < n; i++) |   for (int i = 0; i < n; i++) | ||||||
|     if (d[i] != r[i]) { |     if (d[i] != r[i]) { | ||||||
|       printf("Error: d[%d]!=r[%d] (%d, %d)n", i, i, d[i], r[i]); |       printf("Error: d[%d]!=r[%d] (%d, %d)n", i, i, d[i], r[i]); | ||||||
|       return 1; |       exit(1); | ||||||
|     } |     } | ||||||
|     printf("PASS\n"); |     printf("PASS\n"); | ||||||
|     return 0; |     return 0; | ||||||
|  |  | ||||||
		Loading…
	
		Reference in New Issue