CuPBoP/examples/gauss/gaussian.cu

523 lines
16 KiB
Plaintext

/*-----------------------------------------------------------
** gaussian.cu -- The program is to solve a linear system Ax = b
** by using Gaussian Elimination. The algorithm on page 101
** ("Foundations of Parallel Programming") is used.
** The sequential version is gaussian.c. This parallel
** implementation converts three independent for() loops
** into three Fans. Use the data file ge_3.dat to verify
** the correction of the output.
**
** Written by Andreas Kura, 02/15/95
** Modified by Chong-wei Xu, 04/20/95
** Modified by Chris Gregg for CUDA, 07/20/2009
**-----------------------------------------------------------
*/
#include "cuda_runtime.h"
#include <math.h>
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <sys/time.h>
#ifdef TIMING
#include "timing.h"
#endif
#ifdef RD_WG_SIZE_0_0
#define MAXBLOCKSIZE RD_WG_SIZE_0_0
#elif defined(RD_WG_SIZE_0)
#define MAXBLOCKSIZE RD_WG_SIZE_0
#elif defined(RD_WG_SIZE)
#define MAXBLOCKSIZE RD_WG_SIZE
#else
#define MAXBLOCKSIZE 512
#endif
// 2D defines. Go from specific to general
#ifdef RD_WG_SIZE_1_0
#define BLOCK_SIZE_XY RD_WG_SIZE_1_0
#elif defined(RD_WG_SIZE_1)
#define BLOCK_SIZE_XY RD_WG_SIZE_1
#elif defined(RD_WG_SIZE)
#define BLOCK_SIZE_XY RD_WG_SIZE
#else
#define BLOCK_SIZE_XY 1
#endif
#ifdef TIMING
struct timeval tv;
struct timeval tv_total_start, tv_total_end;
struct timeval tv_h2d_start, tv_h2d_end;
struct timeval tv_d2h_start, tv_d2h_end;
struct timeval tv_kernel_start, tv_kernel_end;
struct timeval tv_mem_alloc_start, tv_mem_alloc_end;
struct timeval tv_close_start, tv_close_end;
float init_time = 0, mem_alloc_time = 0, h2d_time = 0, kernel_time = 0,
d2h_time = 0, close_time = 0, total_time = 0;
#endif
int Size;
float *a, *b, *finalVec;
float *m;
FILE *fp;
void InitProblemOnce(char *filename);
void InitPerRun();
void ForwardSub();
void BackSub();
__global__ void Fan1(float *m, float *a, int Size, int t);
__global__ void Fan2(float *m, float *a, float *b, int Size, int j1, int t);
void InitMat(float *ary, int nrow, int ncol);
void InitAry(float *ary, int ary_size);
void PrintMat(float *ary, int nrow, int ncolumn);
void PrintAry(float *ary, int ary_size);
void PrintDeviceProperties();
void checkCUDAError(const char *msg);
unsigned int totalKernelTime = 0;
// create both matrix and right hand side, Ke Wang 2013/08/12 11:51:06
void create_matrix(float *m, int size) {
int i, j;
float lamda = -0.01;
float coe[2 * size - 1];
float coe_i = 0.0;
for (i = 0; i < size; i++) {
coe_i = 10 * exp(lamda * i);
j = size - 1 + i;
coe[j] = coe_i;
j = size - 1 - i;
coe[j] = coe_i;
}
for (i = 0; i < size; i++) {
for (j = 0; j < size; j++) {
m[i * size + j] = coe[size - 1 - i + j];
}
}
}
int main(int argc, char *argv[]) {
printf("WG size of kernel 1 = %d, WG size of kernel 2= %d X %d\n",
MAXBLOCKSIZE, BLOCK_SIZE_XY, BLOCK_SIZE_XY);
int verbose = 1;
int i, j;
char flag;
if (argc < 2) {
printf("Usage: gaussian -f filename / -s size [-q]\n\n");
printf("-q (quiet) suppresses printing the matrix and result values.\n");
printf("-f (filename) path of input file\n");
printf(
"-s (size) size of matrix. Create matrix and rhs in this program \n");
printf(
"The first line of the file contains the dimension of the matrix, n.");
printf("The second line of the file is a newline.\n");
printf("The next n lines contain n tab separated values for the matrix.");
printf("The next line of the file is a newline.\n");
printf("The next line of the file is a 1xn vector with tab separated "
"values.\n");
printf("The next line of the file is a newline. (optional)\n");
printf("The final line of the file is the pre-computed solution. "
"(optional)\n");
printf("Example: matrix4.txt:\n");
printf("4\n");
printf("\n");
printf("-0.6 -0.5 0.7 0.3\n");
printf("-0.3 -0.9 0.3 0.7\n");
printf("-0.4 -0.5 -0.3 -0.8\n");
printf("0.0 -0.1 0.2 0.9\n");
printf("\n");
printf("-0.85 -0.68 0.24 -0.53\n");
printf("\n");
printf("0.7 0.0 -0.4 -0.5\n");
exit(0);
}
cudaSetDevice(0);
PrintDeviceProperties();
// char filename[100];
// sprintf(filename,"matrices/matrix%d.txt",size);
for (i = 1; i < argc; i++) {
if (argv[i][0] == '-') { // flag
flag = argv[i][1];
switch (flag) {
case 's': // platform
i++;
Size = atoi(argv[i]);
printf("Create matrix internally in parse, size = %d \n", Size);
a = (float *)malloc(Size * Size * sizeof(float));
create_matrix(a, Size);
b = (float *)malloc(Size * sizeof(float));
for (j = 0; j < Size; j++)
b[j] = 1.0;
m = (float *)malloc(Size * Size * sizeof(float));
break;
case 'f': // platform
i++;
printf("Read file from %s \n", argv[i]);
InitProblemOnce(argv[i]);
break;
case 'q': // quiet
verbose = 1;
break;
}
}
}
// InitProblemOnce(filename);
InitPerRun();
// begin timing
struct timeval time_start;
gettimeofday(&time_start, NULL);
// run kernels
ForwardSub();
// end timing
struct timeval time_end;
gettimeofday(&time_end, NULL);
unsigned int time_total = (time_end.tv_sec * 1000000 + time_end.tv_usec) -
(time_start.tv_sec * 1000000 + time_start.tv_usec);
if (verbose) {
printf("Matrix m is: \n");
PrintMat(m, Size, Size);
printf("Matrix a is: \n");
PrintMat(a, Size, Size);
printf("Array b is: \n");
PrintAry(b, Size);
}
BackSub();
if (verbose) {
printf("The final solution is: \n");
PrintAry(finalVec, Size);
}
printf("\nTime total (including memory transfers)\t%f sec\n",
time_total * 1e-6);
printf("Time for CUDA kernels:\t%f sec\n", totalKernelTime * 1e-6);
/*printf("%d,%d\n",size,time_total);
fprintf(stderr,"%d,%d\n",size,time_total);*/
free(m);
free(a);
free(b);
#ifdef TIMING
printf("Exec: %f\n", kernel_time);
#endif
}
/*------------------------------------------------------
** PrintDeviceProperties
**-----------------------------------------------------
*/
void PrintDeviceProperties() {
cudaDeviceProp deviceProp;
int nDevCount = 0;
cudaGetDeviceCount(&nDevCount);
printf("Total Device found: %d", nDevCount);
for (int nDeviceIdx = 0; nDeviceIdx < nDevCount; ++nDeviceIdx) {
memset(&deviceProp, 0, sizeof(deviceProp));
if (cudaSuccess == cudaGetDeviceProperties(&deviceProp, nDeviceIdx)) {
printf("\nDevice Name \t\t - %s ", deviceProp.name);
printf("\n**************************************");
printf("\nTotal Global Memory\t\t\t - %lu KB",
deviceProp.totalGlobalMem / 1024);
printf("\nShared memory available per block \t - %lu KB",
deviceProp.sharedMemPerBlock / 1024);
printf("\nNumber of registers per thread block \t - %d",
deviceProp.regsPerBlock);
printf("\nWarp size in threads \t\t\t - %d", deviceProp.warpSize);
printf("\nMemory Pitch \t\t\t\t - %zu bytes", deviceProp.memPitch);
printf("\nMaximum threads per block \t\t - %d",
deviceProp.maxThreadsPerBlock);
printf("\nMaximum Thread Dimension (block) \t - %d %d %d",
deviceProp.maxThreadsDim[0], deviceProp.maxThreadsDim[1],
deviceProp.maxThreadsDim[2]);
printf("\nMaximum Thread Dimension (grid) \t - %d %d %d",
deviceProp.maxGridSize[0], deviceProp.maxGridSize[1],
deviceProp.maxGridSize[2]);
printf("\nTotal constant memory \t\t\t - %zu bytes",
deviceProp.totalConstMem);
printf("\nCUDA ver \t\t\t\t - %d.%d", deviceProp.major, deviceProp.minor);
printf("\nClock rate \t\t\t\t - %d KHz", deviceProp.clockRate);
printf("\nTexture Alignment \t\t\t - %zu bytes",
deviceProp.textureAlignment);
printf("\nDevice Overlap \t\t\t\t - %s",
deviceProp.deviceOverlap ? "Allowed" : "Not Allowed");
printf("\nNumber of Multi processors \t\t - %d\n\n",
deviceProp.multiProcessorCount);
} else
printf("\n%s", cudaGetErrorString(cudaGetLastError()));
}
}
/*------------------------------------------------------
** InitProblemOnce -- Initialize all of matrices and
** vectors by opening a data file specified by the user.
**
** We used dynamic array *a, *b, and *m to allocate
** the memory storages.
**------------------------------------------------------
*/
void InitProblemOnce(char *filename) {
// char *filename = argv[1];
// printf("Enter the data file name: ");
// scanf("%s", filename);
printf("The file name is: %s\n", filename);
fp = fopen(filename, "r");
fscanf(fp, "%d", &Size);
a = (float *)malloc(Size * Size * sizeof(float));
InitMat(a, Size, Size);
printf("The input matrix a is:\n");
PrintMat(a, Size, Size);
b = (float *)malloc(Size * sizeof(float));
InitAry(b, Size);
printf("The input array b is:\n");
PrintAry(b, Size);
m = (float *)malloc(Size * Size * sizeof(float));
}
/*------------------------------------------------------
** InitPerRun() -- Initialize the contents of the
** multipier matrix **m
**------------------------------------------------------
*/
void InitPerRun() {
int i;
for (i = 0; i < Size * Size; i++)
*(m + i) = 0.0;
}
/*-------------------------------------------------------
** Fan1() -- Calculate multiplier matrix
** Pay attention to the index. Index i give the range
** which starts from 0 to range-1. The real values of
** the index should be adjust and related with the value
** of t which is defined on the ForwardSub().
**-------------------------------------------------------
*/
__global__ void Fan1(float *m_cuda, float *a_cuda, int Size, int t) {
// if(threadIdx.x + blockIdx.x * blockDim.x >= Size-1-t) {
// printf("blockIDx.x: %d, threadIdx.x: %d, Size: %d, t:%d,
// Size-1-t: %d\n",blockIdx.x,threadIdx.x,Size,t,Size-1-t);
// }
if (threadIdx.x + blockIdx.x * blockDim.x >= Size - 1 - t)
return;
*(m_cuda + Size * (blockDim.x * blockIdx.x + threadIdx.x + t + 1) + t) =
*(a_cuda + Size * (blockDim.x * blockIdx.x + threadIdx.x + t + 1) + t) /
*(a_cuda + Size * t + t);
}
/*-------------------------------------------------------
** Fan2() -- Modify the matrix A into LUD
**-------------------------------------------------------
*/
__global__ void Fan2(float *m_cuda, float *a_cuda, float *b_cuda, int Size,
int j1, int t) {
if (threadIdx.x + blockIdx.x * blockDim.x >= Size - 1 - t)
return;
if (threadIdx.y + blockIdx.y * blockDim.y >= Size - t)
return;
int xidx = blockIdx.x * blockDim.x + threadIdx.x;
int yidx = blockIdx.y * blockDim.y + threadIdx.y;
// printf("blockIdx.x: %d, threadIdx.x: %d, blockIdx.y: %d, threadIdx.y: %d,
// blockDim.x: %d, blockDim.y:
// %d\n",blockIdx.x,threadIdx.x,blockIdx.y,threadIdx.y,blockDim.x,blockDim.y);
a_cuda[Size * (xidx + 1 + t) + (yidx + t)] -=
m_cuda[Size * (xidx + 1 + t) + t] * a_cuda[Size * t + (yidx + t)];
// a_cuda[xidx+1+t][yidx+t] -= m_cuda[xidx+1+t][t] * a_cuda[t][yidx+t];
if (yidx == 0) {
// printf("blockIdx.x:%d,threadIdx.x:%d,blockIdx.y:%d,threadIdx.y:%d,blockDim.x:%d,blockDim.y:%d\n",blockIdx.x,threadIdx.x,blockIdx.y,threadIdx.y,blockDim.x,blockDim.y);
// printf("xidx:%d,yidx:%d\n",xidx,yidx);
b_cuda[xidx + 1 + t] -=
m_cuda[Size * (xidx + 1 + t) + (yidx + t)] * b_cuda[t];
}
}
/*------------------------------------------------------
** ForwardSub() -- Forward substitution of Gaussian
** elimination.
**------------------------------------------------------
*/
void ForwardSub() {
int t;
float *m_cuda, *a_cuda, *b_cuda;
int A = 1;
int B = 2;
int C = 3;
int D = 4;
int E = 5;
int F = 6;
// printf("blockIDx.x: %d, threadIdx.x: %d, Size: %d, t: %d, Size-1-t: %d\n",
// A, B, C, D, E); printf("blockIdx.x: %d, threadIdx.x: %d, blockIdx.y: %d,
// threadIdx.y: %d, blockDim.x: %d, blockDim.y: %d\n", A , B, C, D, E, F);
// allocate memory on GPU
cudaMalloc((void **)&m_cuda, Size * Size * sizeof(float));
cudaMalloc((void **)&a_cuda, Size * Size * sizeof(float));
cudaMalloc((void **)&b_cuda, Size * sizeof(float));
// copy memory to GPU
cudaMemcpy(m_cuda, m, Size * Size * sizeof(float), cudaMemcpyHostToDevice);
cudaMemcpy(a_cuda, a, Size * Size * sizeof(float), cudaMemcpyHostToDevice);
cudaMemcpy(b_cuda, b, Size * sizeof(float), cudaMemcpyHostToDevice);
int block_size, grid_size;
block_size = MAXBLOCKSIZE;
grid_size = (Size / block_size) + (!(Size % block_size) ? 0 : 1);
printf("1d grid size: %d\n", grid_size);
dim3 dimBlock(block_size);
dim3 dimGrid(grid_size);
// dim3 dimGrid( (N/dimBlock.x) + (!(N%dimBlock.x)?0:1) );
int blockSize2d, gridSize2d;
blockSize2d = BLOCK_SIZE_XY;
gridSize2d = (Size / blockSize2d) + (!(Size % blockSize2d ? 0 : 1));
dim3 dimBlockXY(blockSize2d, blockSize2d);
printf("BlockXY: %d \n", blockSize2d);
dim3 dimGridXY(gridSize2d, gridSize2d);
#ifdef TIMING
gettimeofday(&tv_kernel_start, NULL);
#endif
printf("first grid size: %d second: %d\n", grid_size, gridSize2d);
// begin timing kernels
struct timeval time_start;
gettimeofday(&time_start, NULL);
for (t = 0; t < (Size - 1); t++) {
Fan1<<<dimGrid, dimBlock>>>(m_cuda, a_cuda, Size, t);
cudaDeviceSynchronize();
Fan2<<<dimGridXY, dimBlockXY>>>(m_cuda, a_cuda, b_cuda, Size, Size - t, t);
cudaDeviceSynchronize();
checkCUDAError("Fan2");
}
// end timing kernels
struct timeval time_end;
gettimeofday(&time_end, NULL);
totalKernelTime = (time_end.tv_sec * 1000000 + time_end.tv_usec) -
(time_start.tv_sec * 1000000 + time_start.tv_usec);
#ifdef TIMING
tvsub(&time_end, &tv_kernel_start, &tv);
kernel_time += tv.tv_sec * 1000.0 + (float)tv.tv_usec / 1000.0;
#endif
// copy memory back to CPU
cudaMemcpy(m, m_cuda, Size * Size * sizeof(float), cudaMemcpyDeviceToHost);
cudaMemcpy(a, a_cuda, Size * Size * sizeof(float), cudaMemcpyDeviceToHost);
cudaMemcpy(b, b_cuda, Size * sizeof(float), cudaMemcpyDeviceToHost);
cudaFree(m_cuda);
cudaFree(a_cuda);
cudaFree(b_cuda);
}
/*------------------------------------------------------
** BackSub() -- Backward substitution
**------------------------------------------------------
*/
void BackSub() {
// create a new vector to hold the final answer
finalVec = (float *)malloc(Size * sizeof(float));
// solve "bottom up"
int i, j;
for (i = 0; i < Size; i++) {
finalVec[Size - i - 1] = b[Size - i - 1];
for (j = 0; j < i; j++) {
finalVec[Size - i - 1] -= *(a + Size * (Size - i - 1) + (Size - j - 1)) *
finalVec[Size - j - 1];
}
finalVec[Size - i - 1] =
finalVec[Size - i - 1] / *(a + Size * (Size - i - 1) + (Size - i - 1));
}
}
void InitMat(float *ary, int nrow, int ncol) {
int i, j;
for (i = 0; i < nrow; i++) {
for (j = 0; j < ncol; j++) {
fscanf(fp, "%f", ary + Size * i + j);
}
}
}
/*------------------------------------------------------
** PrintMat() -- Print the contents of the matrix
**------------------------------------------------------
*/
void PrintMat(float *ary, int nrow, int ncol) {
return;
int i, j;
for (i = 0; i < nrow; i++) {
for (j = 0; j < ncol; j++) {
printf("%8.2f ", *(ary + Size * i + j));
}
printf("\n");
}
printf("\n");
}
/*------------------------------------------------------
** InitAry() -- Initialize the array (vector) by reading
** data from the data file
**------------------------------------------------------
*/
void InitAry(float *ary, int ary_size) {
int i;
for (i = 0; i < ary_size; i++) {
fscanf(fp, "%f", &ary[i]);
}
}
/*------------------------------------------------------
** PrintAry() -- Print the contents of the array (vector)
**------------------------------------------------------
*/
void PrintAry(float *ary, int ary_size) {
int i;
for (i = 0; i < ary_size; i++) {
printf("%.2f ", ary[i]);
}
printf("\n\n");
}
void checkCUDAError(const char *msg) {
cudaError_t err = cudaGetLastError();
if (cudaSuccess != err) {
fprintf(stderr, "Cuda error: %s: %s.\n", msg, cudaGetErrorString(err));
exit(EXIT_FAILURE);
}
}