/*----------------------------------------------------------- ** gaussian.cu -- The program is to solve a linear system Ax = b ** by using Gaussian Elimination. The algorithm on page 101 ** ("Foundations of Parallel Programming") is used. ** The sequential version is gaussian.c. This parallel ** implementation converts three independent for() loops ** into three Fans. Use the data file ge_3.dat to verify ** the correction of the output. ** ** Written by Andreas Kura, 02/15/95 ** Modified by Chong-wei Xu, 04/20/95 ** Modified by Chris Gregg for CUDA, 07/20/2009 **----------------------------------------------------------- */ #include "cuda_runtime.h" #include #include #include #include #include #ifdef TIMING #include "timing.h" #endif #ifdef RD_WG_SIZE_0_0 #define MAXBLOCKSIZE RD_WG_SIZE_0_0 #elif defined(RD_WG_SIZE_0) #define MAXBLOCKSIZE RD_WG_SIZE_0 #elif defined(RD_WG_SIZE) #define MAXBLOCKSIZE RD_WG_SIZE #else #define MAXBLOCKSIZE 512 #endif // 2D defines. Go from specific to general #ifdef RD_WG_SIZE_1_0 #define BLOCK_SIZE_XY RD_WG_SIZE_1_0 #elif defined(RD_WG_SIZE_1) #define BLOCK_SIZE_XY RD_WG_SIZE_1 #elif defined(RD_WG_SIZE) #define BLOCK_SIZE_XY RD_WG_SIZE #else #define BLOCK_SIZE_XY 1 #endif #ifdef TIMING struct timeval tv; struct timeval tv_total_start, tv_total_end; struct timeval tv_h2d_start, tv_h2d_end; struct timeval tv_d2h_start, tv_d2h_end; struct timeval tv_kernel_start, tv_kernel_end; struct timeval tv_mem_alloc_start, tv_mem_alloc_end; struct timeval tv_close_start, tv_close_end; float init_time = 0, mem_alloc_time = 0, h2d_time = 0, kernel_time = 0, d2h_time = 0, close_time = 0, total_time = 0; #endif int Size; float *a, *b, *finalVec; float *m; FILE *fp; void InitProblemOnce(char *filename); void InitPerRun(); void ForwardSub(); void BackSub(); __global__ void Fan1(float *m, float *a, int Size, int t); __global__ void Fan2(float *m, float *a, float *b, int Size, int j1, int t); void InitMat(float *ary, int nrow, int ncol); void InitAry(float *ary, int ary_size); void PrintMat(float *ary, int nrow, int ncolumn); void PrintAry(float *ary, int ary_size); void PrintDeviceProperties(); void checkCUDAError(const char *msg); unsigned int totalKernelTime = 0; // create both matrix and right hand side, Ke Wang 2013/08/12 11:51:06 void create_matrix(float *m, int size) { int i, j; float lamda = -0.01; float coe[2 * size - 1]; float coe_i = 0.0; for (i = 0; i < size; i++) { coe_i = 10 * exp(lamda * i); j = size - 1 + i; coe[j] = coe_i; j = size - 1 - i; coe[j] = coe_i; } for (i = 0; i < size; i++) { for (j = 0; j < size; j++) { m[i * size + j] = coe[size - 1 - i + j]; } } } int main(int argc, char *argv[]) { printf("WG size of kernel 1 = %d, WG size of kernel 2= %d X %d\n", MAXBLOCKSIZE, BLOCK_SIZE_XY, BLOCK_SIZE_XY); int verbose = 1; int i, j; char flag; if (argc < 2) { printf("Usage: gaussian -f filename / -s size [-q]\n\n"); printf("-q (quiet) suppresses printing the matrix and result values.\n"); printf("-f (filename) path of input file\n"); printf( "-s (size) size of matrix. Create matrix and rhs in this program \n"); printf( "The first line of the file contains the dimension of the matrix, n."); printf("The second line of the file is a newline.\n"); printf("The next n lines contain n tab separated values for the matrix."); printf("The next line of the file is a newline.\n"); printf("The next line of the file is a 1xn vector with tab separated " "values.\n"); printf("The next line of the file is a newline. (optional)\n"); printf("The final line of the file is the pre-computed solution. " "(optional)\n"); printf("Example: matrix4.txt:\n"); printf("4\n"); printf("\n"); printf("-0.6 -0.5 0.7 0.3\n"); printf("-0.3 -0.9 0.3 0.7\n"); printf("-0.4 -0.5 -0.3 -0.8\n"); printf("0.0 -0.1 0.2 0.9\n"); printf("\n"); printf("-0.85 -0.68 0.24 -0.53\n"); printf("\n"); printf("0.7 0.0 -0.4 -0.5\n"); exit(0); } cudaSetDevice(0); PrintDeviceProperties(); // char filename[100]; // sprintf(filename,"matrices/matrix%d.txt",size); for (i = 1; i < argc; i++) { if (argv[i][0] == '-') { // flag flag = argv[i][1]; switch (flag) { case 's': // platform i++; Size = atoi(argv[i]); printf("Create matrix internally in parse, size = %d \n", Size); a = (float *)malloc(Size * Size * sizeof(float)); create_matrix(a, Size); b = (float *)malloc(Size * sizeof(float)); for (j = 0; j < Size; j++) b[j] = 1.0; m = (float *)malloc(Size * Size * sizeof(float)); break; case 'f': // platform i++; printf("Read file from %s \n", argv[i]); InitProblemOnce(argv[i]); break; case 'q': // quiet verbose = 1; break; } } } // InitProblemOnce(filename); InitPerRun(); // begin timing struct timeval time_start; gettimeofday(&time_start, NULL); // run kernels ForwardSub(); // end timing struct timeval time_end; gettimeofday(&time_end, NULL); unsigned int time_total = (time_end.tv_sec * 1000000 + time_end.tv_usec) - (time_start.tv_sec * 1000000 + time_start.tv_usec); if (verbose) { printf("Matrix m is: \n"); PrintMat(m, Size, Size); printf("Matrix a is: \n"); PrintMat(a, Size, Size); printf("Array b is: \n"); PrintAry(b, Size); } BackSub(); if (verbose) { printf("The final solution is: \n"); PrintAry(finalVec, Size); } printf("\nTime total (including memory transfers)\t%f sec\n", time_total * 1e-6); printf("Time for CUDA kernels:\t%f sec\n", totalKernelTime * 1e-6); /*printf("%d,%d\n",size,time_total); fprintf(stderr,"%d,%d\n",size,time_total);*/ free(m); free(a); free(b); #ifdef TIMING printf("Exec: %f\n", kernel_time); #endif } /*------------------------------------------------------ ** PrintDeviceProperties **----------------------------------------------------- */ void PrintDeviceProperties() { cudaDeviceProp deviceProp; int nDevCount = 0; cudaGetDeviceCount(&nDevCount); printf("Total Device found: %d", nDevCount); for (int nDeviceIdx = 0; nDeviceIdx < nDevCount; ++nDeviceIdx) { memset(&deviceProp, 0, sizeof(deviceProp)); if (cudaSuccess == cudaGetDeviceProperties(&deviceProp, nDeviceIdx)) { printf("\nDevice Name \t\t - %s ", deviceProp.name); printf("\n**************************************"); printf("\nTotal Global Memory\t\t\t - %lu KB", deviceProp.totalGlobalMem / 1024); printf("\nShared memory available per block \t - %lu KB", deviceProp.sharedMemPerBlock / 1024); printf("\nNumber of registers per thread block \t - %d", deviceProp.regsPerBlock); printf("\nWarp size in threads \t\t\t - %d", deviceProp.warpSize); printf("\nMemory Pitch \t\t\t\t - %zu bytes", deviceProp.memPitch); printf("\nMaximum threads per block \t\t - %d", deviceProp.maxThreadsPerBlock); printf("\nMaximum Thread Dimension (block) \t - %d %d %d", deviceProp.maxThreadsDim[0], deviceProp.maxThreadsDim[1], deviceProp.maxThreadsDim[2]); printf("\nMaximum Thread Dimension (grid) \t - %d %d %d", deviceProp.maxGridSize[0], deviceProp.maxGridSize[1], deviceProp.maxGridSize[2]); printf("\nTotal constant memory \t\t\t - %zu bytes", deviceProp.totalConstMem); printf("\nCUDA ver \t\t\t\t - %d.%d", deviceProp.major, deviceProp.minor); printf("\nClock rate \t\t\t\t - %d KHz", deviceProp.clockRate); printf("\nTexture Alignment \t\t\t - %zu bytes", deviceProp.textureAlignment); printf("\nDevice Overlap \t\t\t\t - %s", deviceProp.deviceOverlap ? "Allowed" : "Not Allowed"); printf("\nNumber of Multi processors \t\t - %d\n\n", deviceProp.multiProcessorCount); } else printf("\n%s", cudaGetErrorString(cudaGetLastError())); } } /*------------------------------------------------------ ** InitProblemOnce -- Initialize all of matrices and ** vectors by opening a data file specified by the user. ** ** We used dynamic array *a, *b, and *m to allocate ** the memory storages. **------------------------------------------------------ */ void InitProblemOnce(char *filename) { // char *filename = argv[1]; // printf("Enter the data file name: "); // scanf("%s", filename); printf("The file name is: %s\n", filename); fp = fopen(filename, "r"); fscanf(fp, "%d", &Size); a = (float *)malloc(Size * Size * sizeof(float)); InitMat(a, Size, Size); printf("The input matrix a is:\n"); PrintMat(a, Size, Size); b = (float *)malloc(Size * sizeof(float)); InitAry(b, Size); printf("The input array b is:\n"); PrintAry(b, Size); m = (float *)malloc(Size * Size * sizeof(float)); } /*------------------------------------------------------ ** InitPerRun() -- Initialize the contents of the ** multipier matrix **m **------------------------------------------------------ */ void InitPerRun() { int i; for (i = 0; i < Size * Size; i++) *(m + i) = 0.0; } /*------------------------------------------------------- ** Fan1() -- Calculate multiplier matrix ** Pay attention to the index. Index i give the range ** which starts from 0 to range-1. The real values of ** the index should be adjust and related with the value ** of t which is defined on the ForwardSub(). **------------------------------------------------------- */ __global__ void Fan1(float *m_cuda, float *a_cuda, int Size, int t) { // if(threadIdx.x + blockIdx.x * blockDim.x >= Size-1-t) { // printf("blockIDx.x: %d, threadIdx.x: %d, Size: %d, t:%d, // Size-1-t: %d\n",blockIdx.x,threadIdx.x,Size,t,Size-1-t); // } if (threadIdx.x + blockIdx.x * blockDim.x >= Size - 1 - t) return; *(m_cuda + Size * (blockDim.x * blockIdx.x + threadIdx.x + t + 1) + t) = *(a_cuda + Size * (blockDim.x * blockIdx.x + threadIdx.x + t + 1) + t) / *(a_cuda + Size * t + t); } /*------------------------------------------------------- ** Fan2() -- Modify the matrix A into LUD **------------------------------------------------------- */ __global__ void Fan2(float *m_cuda, float *a_cuda, float *b_cuda, int Size, int j1, int t) { if (threadIdx.x + blockIdx.x * blockDim.x >= Size - 1 - t) return; if (threadIdx.y + blockIdx.y * blockDim.y >= Size - t) return; int xidx = blockIdx.x * blockDim.x + threadIdx.x; int yidx = blockIdx.y * blockDim.y + threadIdx.y; // printf("blockIdx.x: %d, threadIdx.x: %d, blockIdx.y: %d, threadIdx.y: %d, // blockDim.x: %d, blockDim.y: // %d\n",blockIdx.x,threadIdx.x,blockIdx.y,threadIdx.y,blockDim.x,blockDim.y); a_cuda[Size * (xidx + 1 + t) + (yidx + t)] -= m_cuda[Size * (xidx + 1 + t) + t] * a_cuda[Size * t + (yidx + t)]; // a_cuda[xidx+1+t][yidx+t] -= m_cuda[xidx+1+t][t] * a_cuda[t][yidx+t]; if (yidx == 0) { // printf("blockIdx.x:%d,threadIdx.x:%d,blockIdx.y:%d,threadIdx.y:%d,blockDim.x:%d,blockDim.y:%d\n",blockIdx.x,threadIdx.x,blockIdx.y,threadIdx.y,blockDim.x,blockDim.y); // printf("xidx:%d,yidx:%d\n",xidx,yidx); b_cuda[xidx + 1 + t] -= m_cuda[Size * (xidx + 1 + t) + (yidx + t)] * b_cuda[t]; } } /*------------------------------------------------------ ** ForwardSub() -- Forward substitution of Gaussian ** elimination. **------------------------------------------------------ */ void ForwardSub() { int t; float *m_cuda, *a_cuda, *b_cuda; int A = 1; int B = 2; int C = 3; int D = 4; int E = 5; int F = 6; // printf("blockIDx.x: %d, threadIdx.x: %d, Size: %d, t: %d, Size-1-t: %d\n", // A, B, C, D, E); printf("blockIdx.x: %d, threadIdx.x: %d, blockIdx.y: %d, // threadIdx.y: %d, blockDim.x: %d, blockDim.y: %d\n", A , B, C, D, E, F); // allocate memory on GPU cudaMalloc((void **)&m_cuda, Size * Size * sizeof(float)); cudaMalloc((void **)&a_cuda, Size * Size * sizeof(float)); cudaMalloc((void **)&b_cuda, Size * sizeof(float)); // copy memory to GPU cudaMemcpy(m_cuda, m, Size * Size * sizeof(float), cudaMemcpyHostToDevice); cudaMemcpy(a_cuda, a, Size * Size * sizeof(float), cudaMemcpyHostToDevice); cudaMemcpy(b_cuda, b, Size * sizeof(float), cudaMemcpyHostToDevice); int block_size, grid_size; block_size = MAXBLOCKSIZE; grid_size = (Size / block_size) + (!(Size % block_size) ? 0 : 1); printf("1d grid size: %d\n", grid_size); dim3 dimBlock(block_size); dim3 dimGrid(grid_size); // dim3 dimGrid( (N/dimBlock.x) + (!(N%dimBlock.x)?0:1) ); int blockSize2d, gridSize2d; blockSize2d = BLOCK_SIZE_XY; gridSize2d = (Size / blockSize2d) + (!(Size % blockSize2d ? 0 : 1)); dim3 dimBlockXY(blockSize2d, blockSize2d); printf("BlockXY: %d \n", blockSize2d); dim3 dimGridXY(gridSize2d, gridSize2d); #ifdef TIMING gettimeofday(&tv_kernel_start, NULL); #endif printf("first grid size: %d second: %d\n", grid_size, gridSize2d); // begin timing kernels struct timeval time_start; gettimeofday(&time_start, NULL); for (t = 0; t < (Size - 1); t++) { Fan1<<>>(m_cuda, a_cuda, Size, t); cudaDeviceSynchronize(); Fan2<<>>(m_cuda, a_cuda, b_cuda, Size, Size - t, t); cudaDeviceSynchronize(); checkCUDAError("Fan2"); } // end timing kernels struct timeval time_end; gettimeofday(&time_end, NULL); totalKernelTime = (time_end.tv_sec * 1000000 + time_end.tv_usec) - (time_start.tv_sec * 1000000 + time_start.tv_usec); #ifdef TIMING tvsub(&time_end, &tv_kernel_start, &tv); kernel_time += tv.tv_sec * 1000.0 + (float)tv.tv_usec / 1000.0; #endif // copy memory back to CPU cudaMemcpy(m, m_cuda, Size * Size * sizeof(float), cudaMemcpyDeviceToHost); cudaMemcpy(a, a_cuda, Size * Size * sizeof(float), cudaMemcpyDeviceToHost); cudaMemcpy(b, b_cuda, Size * sizeof(float), cudaMemcpyDeviceToHost); cudaFree(m_cuda); cudaFree(a_cuda); cudaFree(b_cuda); } /*------------------------------------------------------ ** BackSub() -- Backward substitution **------------------------------------------------------ */ void BackSub() { // create a new vector to hold the final answer finalVec = (float *)malloc(Size * sizeof(float)); // solve "bottom up" int i, j; for (i = 0; i < Size; i++) { finalVec[Size - i - 1] = b[Size - i - 1]; for (j = 0; j < i; j++) { finalVec[Size - i - 1] -= *(a + Size * (Size - i - 1) + (Size - j - 1)) * finalVec[Size - j - 1]; } finalVec[Size - i - 1] = finalVec[Size - i - 1] / *(a + Size * (Size - i - 1) + (Size - i - 1)); } } void InitMat(float *ary, int nrow, int ncol) { int i, j; for (i = 0; i < nrow; i++) { for (j = 0; j < ncol; j++) { fscanf(fp, "%f", ary + Size * i + j); } } } /*------------------------------------------------------ ** PrintMat() -- Print the contents of the matrix **------------------------------------------------------ */ void PrintMat(float *ary, int nrow, int ncol) { return; int i, j; for (i = 0; i < nrow; i++) { for (j = 0; j < ncol; j++) { printf("%8.2f ", *(ary + Size * i + j)); } printf("\n"); } printf("\n"); } /*------------------------------------------------------ ** InitAry() -- Initialize the array (vector) by reading ** data from the data file **------------------------------------------------------ */ void InitAry(float *ary, int ary_size) { int i; for (i = 0; i < ary_size; i++) { fscanf(fp, "%f", &ary[i]); } } /*------------------------------------------------------ ** PrintAry() -- Print the contents of the array (vector) **------------------------------------------------------ */ void PrintAry(float *ary, int ary_size) { int i; for (i = 0; i < ary_size; i++) { printf("%.2f ", ary[i]); } printf("\n\n"); } void checkCUDAError(const char *msg) { cudaError_t err = cudaGetLastError(); if (cudaSuccess != err) { fprintf(stderr, "Cuda error: %s: %s.\n", msg, cudaGetErrorString(err)); exit(EXIT_FAILURE); } }