523 lines
16 KiB
Plaintext
523 lines
16 KiB
Plaintext
|
/*-----------------------------------------------------------
|
||
|
** gaussian.cu -- The program is to solve a linear system Ax = b
|
||
|
** by using Gaussian Elimination. The algorithm on page 101
|
||
|
** ("Foundations of Parallel Programming") is used.
|
||
|
** The sequential version is gaussian.c. This parallel
|
||
|
** implementation converts three independent for() loops
|
||
|
** into three Fans. Use the data file ge_3.dat to verify
|
||
|
** the correction of the output.
|
||
|
**
|
||
|
** Written by Andreas Kura, 02/15/95
|
||
|
** Modified by Chong-wei Xu, 04/20/95
|
||
|
** Modified by Chris Gregg for CUDA, 07/20/2009
|
||
|
**-----------------------------------------------------------
|
||
|
*/
|
||
|
#include "cuda_runtime.h"
|
||
|
#include <math.h>
|
||
|
#include <stdio.h>
|
||
|
#include <stdlib.h>
|
||
|
#include <string.h>
|
||
|
#include <sys/time.h>
|
||
|
|
||
|
#ifdef TIMING
|
||
|
#include "timing.h"
|
||
|
#endif
|
||
|
|
||
|
#ifdef RD_WG_SIZE_0_0
|
||
|
#define MAXBLOCKSIZE RD_WG_SIZE_0_0
|
||
|
#elif defined(RD_WG_SIZE_0)
|
||
|
#define MAXBLOCKSIZE RD_WG_SIZE_0
|
||
|
#elif defined(RD_WG_SIZE)
|
||
|
#define MAXBLOCKSIZE RD_WG_SIZE
|
||
|
#else
|
||
|
#define MAXBLOCKSIZE 512
|
||
|
#endif
|
||
|
|
||
|
// 2D defines. Go from specific to general
|
||
|
#ifdef RD_WG_SIZE_1_0
|
||
|
#define BLOCK_SIZE_XY RD_WG_SIZE_1_0
|
||
|
#elif defined(RD_WG_SIZE_1)
|
||
|
#define BLOCK_SIZE_XY RD_WG_SIZE_1
|
||
|
#elif defined(RD_WG_SIZE)
|
||
|
#define BLOCK_SIZE_XY RD_WG_SIZE
|
||
|
#else
|
||
|
#define BLOCK_SIZE_XY 1
|
||
|
#endif
|
||
|
|
||
|
#ifdef TIMING
|
||
|
struct timeval tv;
|
||
|
struct timeval tv_total_start, tv_total_end;
|
||
|
struct timeval tv_h2d_start, tv_h2d_end;
|
||
|
struct timeval tv_d2h_start, tv_d2h_end;
|
||
|
struct timeval tv_kernel_start, tv_kernel_end;
|
||
|
struct timeval tv_mem_alloc_start, tv_mem_alloc_end;
|
||
|
struct timeval tv_close_start, tv_close_end;
|
||
|
float init_time = 0, mem_alloc_time = 0, h2d_time = 0, kernel_time = 0,
|
||
|
d2h_time = 0, close_time = 0, total_time = 0;
|
||
|
#endif
|
||
|
|
||
|
int Size;
|
||
|
float *a, *b, *finalVec;
|
||
|
float *m;
|
||
|
|
||
|
FILE *fp;
|
||
|
|
||
|
void InitProblemOnce(char *filename);
|
||
|
void InitPerRun();
|
||
|
void ForwardSub();
|
||
|
void BackSub();
|
||
|
__global__ void Fan1(float *m, float *a, int Size, int t);
|
||
|
__global__ void Fan2(float *m, float *a, float *b, int Size, int j1, int t);
|
||
|
void InitMat(float *ary, int nrow, int ncol);
|
||
|
void InitAry(float *ary, int ary_size);
|
||
|
void PrintMat(float *ary, int nrow, int ncolumn);
|
||
|
void PrintAry(float *ary, int ary_size);
|
||
|
void PrintDeviceProperties();
|
||
|
void checkCUDAError(const char *msg);
|
||
|
|
||
|
unsigned int totalKernelTime = 0;
|
||
|
|
||
|
// create both matrix and right hand side, Ke Wang 2013/08/12 11:51:06
|
||
|
void create_matrix(float *m, int size) {
|
||
|
int i, j;
|
||
|
float lamda = -0.01;
|
||
|
float coe[2 * size - 1];
|
||
|
float coe_i = 0.0;
|
||
|
|
||
|
for (i = 0; i < size; i++) {
|
||
|
coe_i = 10 * exp(lamda * i);
|
||
|
j = size - 1 + i;
|
||
|
coe[j] = coe_i;
|
||
|
j = size - 1 - i;
|
||
|
coe[j] = coe_i;
|
||
|
}
|
||
|
|
||
|
for (i = 0; i < size; i++) {
|
||
|
for (j = 0; j < size; j++) {
|
||
|
m[i * size + j] = coe[size - 1 - i + j];
|
||
|
}
|
||
|
}
|
||
|
}
|
||
|
|
||
|
int main(int argc, char *argv[]) {
|
||
|
printf("WG size of kernel 1 = %d, WG size of kernel 2= %d X %d\n",
|
||
|
MAXBLOCKSIZE, BLOCK_SIZE_XY, BLOCK_SIZE_XY);
|
||
|
int verbose = 1;
|
||
|
int i, j;
|
||
|
char flag;
|
||
|
if (argc < 2) {
|
||
|
printf("Usage: gaussian -f filename / -s size [-q]\n\n");
|
||
|
printf("-q (quiet) suppresses printing the matrix and result values.\n");
|
||
|
printf("-f (filename) path of input file\n");
|
||
|
printf(
|
||
|
"-s (size) size of matrix. Create matrix and rhs in this program \n");
|
||
|
printf(
|
||
|
"The first line of the file contains the dimension of the matrix, n.");
|
||
|
printf("The second line of the file is a newline.\n");
|
||
|
printf("The next n lines contain n tab separated values for the matrix.");
|
||
|
printf("The next line of the file is a newline.\n");
|
||
|
printf("The next line of the file is a 1xn vector with tab separated "
|
||
|
"values.\n");
|
||
|
printf("The next line of the file is a newline. (optional)\n");
|
||
|
printf("The final line of the file is the pre-computed solution. "
|
||
|
"(optional)\n");
|
||
|
printf("Example: matrix4.txt:\n");
|
||
|
printf("4\n");
|
||
|
printf("\n");
|
||
|
printf("-0.6 -0.5 0.7 0.3\n");
|
||
|
printf("-0.3 -0.9 0.3 0.7\n");
|
||
|
printf("-0.4 -0.5 -0.3 -0.8\n");
|
||
|
printf("0.0 -0.1 0.2 0.9\n");
|
||
|
printf("\n");
|
||
|
printf("-0.85 -0.68 0.24 -0.53\n");
|
||
|
printf("\n");
|
||
|
printf("0.7 0.0 -0.4 -0.5\n");
|
||
|
exit(0);
|
||
|
}
|
||
|
|
||
|
cudaSetDevice(0);
|
||
|
|
||
|
PrintDeviceProperties();
|
||
|
// char filename[100];
|
||
|
// sprintf(filename,"matrices/matrix%d.txt",size);
|
||
|
|
||
|
for (i = 1; i < argc; i++) {
|
||
|
if (argv[i][0] == '-') { // flag
|
||
|
flag = argv[i][1];
|
||
|
switch (flag) {
|
||
|
case 's': // platform
|
||
|
i++;
|
||
|
Size = atoi(argv[i]);
|
||
|
printf("Create matrix internally in parse, size = %d \n", Size);
|
||
|
|
||
|
a = (float *)malloc(Size * Size * sizeof(float));
|
||
|
create_matrix(a, Size);
|
||
|
|
||
|
b = (float *)malloc(Size * sizeof(float));
|
||
|
for (j = 0; j < Size; j++)
|
||
|
b[j] = 1.0;
|
||
|
|
||
|
m = (float *)malloc(Size * Size * sizeof(float));
|
||
|
break;
|
||
|
case 'f': // platform
|
||
|
i++;
|
||
|
printf("Read file from %s \n", argv[i]);
|
||
|
InitProblemOnce(argv[i]);
|
||
|
break;
|
||
|
case 'q': // quiet
|
||
|
verbose = 1;
|
||
|
break;
|
||
|
}
|
||
|
}
|
||
|
}
|
||
|
|
||
|
// InitProblemOnce(filename);
|
||
|
|
||
|
InitPerRun();
|
||
|
// begin timing
|
||
|
struct timeval time_start;
|
||
|
gettimeofday(&time_start, NULL);
|
||
|
|
||
|
// run kernels
|
||
|
ForwardSub();
|
||
|
|
||
|
// end timing
|
||
|
struct timeval time_end;
|
||
|
gettimeofday(&time_end, NULL);
|
||
|
unsigned int time_total = (time_end.tv_sec * 1000000 + time_end.tv_usec) -
|
||
|
(time_start.tv_sec * 1000000 + time_start.tv_usec);
|
||
|
|
||
|
if (verbose) {
|
||
|
printf("Matrix m is: \n");
|
||
|
PrintMat(m, Size, Size);
|
||
|
|
||
|
printf("Matrix a is: \n");
|
||
|
PrintMat(a, Size, Size);
|
||
|
|
||
|
printf("Array b is: \n");
|
||
|
PrintAry(b, Size);
|
||
|
}
|
||
|
BackSub();
|
||
|
if (verbose) {
|
||
|
printf("The final solution is: \n");
|
||
|
PrintAry(finalVec, Size);
|
||
|
}
|
||
|
printf("\nTime total (including memory transfers)\t%f sec\n",
|
||
|
time_total * 1e-6);
|
||
|
printf("Time for CUDA kernels:\t%f sec\n", totalKernelTime * 1e-6);
|
||
|
|
||
|
/*printf("%d,%d\n",size,time_total);
|
||
|
fprintf(stderr,"%d,%d\n",size,time_total);*/
|
||
|
|
||
|
free(m);
|
||
|
free(a);
|
||
|
free(b);
|
||
|
|
||
|
#ifdef TIMING
|
||
|
printf("Exec: %f\n", kernel_time);
|
||
|
#endif
|
||
|
}
|
||
|
/*------------------------------------------------------
|
||
|
** PrintDeviceProperties
|
||
|
**-----------------------------------------------------
|
||
|
*/
|
||
|
void PrintDeviceProperties() {
|
||
|
cudaDeviceProp deviceProp;
|
||
|
int nDevCount = 0;
|
||
|
|
||
|
cudaGetDeviceCount(&nDevCount);
|
||
|
printf("Total Device found: %d", nDevCount);
|
||
|
for (int nDeviceIdx = 0; nDeviceIdx < nDevCount; ++nDeviceIdx) {
|
||
|
memset(&deviceProp, 0, sizeof(deviceProp));
|
||
|
if (cudaSuccess == cudaGetDeviceProperties(&deviceProp, nDeviceIdx)) {
|
||
|
printf("\nDevice Name \t\t - %s ", deviceProp.name);
|
||
|
printf("\n**************************************");
|
||
|
printf("\nTotal Global Memory\t\t\t - %lu KB",
|
||
|
deviceProp.totalGlobalMem / 1024);
|
||
|
printf("\nShared memory available per block \t - %lu KB",
|
||
|
deviceProp.sharedMemPerBlock / 1024);
|
||
|
printf("\nNumber of registers per thread block \t - %d",
|
||
|
deviceProp.regsPerBlock);
|
||
|
printf("\nWarp size in threads \t\t\t - %d", deviceProp.warpSize);
|
||
|
printf("\nMemory Pitch \t\t\t\t - %zu bytes", deviceProp.memPitch);
|
||
|
printf("\nMaximum threads per block \t\t - %d",
|
||
|
deviceProp.maxThreadsPerBlock);
|
||
|
printf("\nMaximum Thread Dimension (block) \t - %d %d %d",
|
||
|
deviceProp.maxThreadsDim[0], deviceProp.maxThreadsDim[1],
|
||
|
deviceProp.maxThreadsDim[2]);
|
||
|
printf("\nMaximum Thread Dimension (grid) \t - %d %d %d",
|
||
|
deviceProp.maxGridSize[0], deviceProp.maxGridSize[1],
|
||
|
deviceProp.maxGridSize[2]);
|
||
|
printf("\nTotal constant memory \t\t\t - %zu bytes",
|
||
|
deviceProp.totalConstMem);
|
||
|
printf("\nCUDA ver \t\t\t\t - %d.%d", deviceProp.major, deviceProp.minor);
|
||
|
printf("\nClock rate \t\t\t\t - %d KHz", deviceProp.clockRate);
|
||
|
printf("\nTexture Alignment \t\t\t - %zu bytes",
|
||
|
deviceProp.textureAlignment);
|
||
|
printf("\nDevice Overlap \t\t\t\t - %s",
|
||
|
deviceProp.deviceOverlap ? "Allowed" : "Not Allowed");
|
||
|
printf("\nNumber of Multi processors \t\t - %d\n\n",
|
||
|
deviceProp.multiProcessorCount);
|
||
|
} else
|
||
|
printf("\n%s", cudaGetErrorString(cudaGetLastError()));
|
||
|
}
|
||
|
}
|
||
|
|
||
|
/*------------------------------------------------------
|
||
|
** InitProblemOnce -- Initialize all of matrices and
|
||
|
** vectors by opening a data file specified by the user.
|
||
|
**
|
||
|
** We used dynamic array *a, *b, and *m to allocate
|
||
|
** the memory storages.
|
||
|
**------------------------------------------------------
|
||
|
*/
|
||
|
void InitProblemOnce(char *filename) {
|
||
|
// char *filename = argv[1];
|
||
|
|
||
|
// printf("Enter the data file name: ");
|
||
|
// scanf("%s", filename);
|
||
|
printf("The file name is: %s\n", filename);
|
||
|
|
||
|
fp = fopen(filename, "r");
|
||
|
|
||
|
fscanf(fp, "%d", &Size);
|
||
|
|
||
|
a = (float *)malloc(Size * Size * sizeof(float));
|
||
|
|
||
|
InitMat(a, Size, Size);
|
||
|
printf("The input matrix a is:\n");
|
||
|
PrintMat(a, Size, Size);
|
||
|
b = (float *)malloc(Size * sizeof(float));
|
||
|
|
||
|
InitAry(b, Size);
|
||
|
printf("The input array b is:\n");
|
||
|
PrintAry(b, Size);
|
||
|
|
||
|
m = (float *)malloc(Size * Size * sizeof(float));
|
||
|
}
|
||
|
|
||
|
/*------------------------------------------------------
|
||
|
** InitPerRun() -- Initialize the contents of the
|
||
|
** multipier matrix **m
|
||
|
**------------------------------------------------------
|
||
|
*/
|
||
|
void InitPerRun() {
|
||
|
int i;
|
||
|
for (i = 0; i < Size * Size; i++)
|
||
|
*(m + i) = 0.0;
|
||
|
}
|
||
|
|
||
|
/*-------------------------------------------------------
|
||
|
** Fan1() -- Calculate multiplier matrix
|
||
|
** Pay attention to the index. Index i give the range
|
||
|
** which starts from 0 to range-1. The real values of
|
||
|
** the index should be adjust and related with the value
|
||
|
** of t which is defined on the ForwardSub().
|
||
|
**-------------------------------------------------------
|
||
|
*/
|
||
|
__global__ void Fan1(float *m_cuda, float *a_cuda, int Size, int t) {
|
||
|
// if(threadIdx.x + blockIdx.x * blockDim.x >= Size-1-t) {
|
||
|
// printf("blockIDx.x: %d, threadIdx.x: %d, Size: %d, t:%d,
|
||
|
// Size-1-t: %d\n",blockIdx.x,threadIdx.x,Size,t,Size-1-t);
|
||
|
// }
|
||
|
|
||
|
if (threadIdx.x + blockIdx.x * blockDim.x >= Size - 1 - t)
|
||
|
return;
|
||
|
*(m_cuda + Size * (blockDim.x * blockIdx.x + threadIdx.x + t + 1) + t) =
|
||
|
*(a_cuda + Size * (blockDim.x * blockIdx.x + threadIdx.x + t + 1) + t) /
|
||
|
*(a_cuda + Size * t + t);
|
||
|
}
|
||
|
|
||
|
/*-------------------------------------------------------
|
||
|
** Fan2() -- Modify the matrix A into LUD
|
||
|
**-------------------------------------------------------
|
||
|
*/
|
||
|
|
||
|
__global__ void Fan2(float *m_cuda, float *a_cuda, float *b_cuda, int Size,
|
||
|
int j1, int t) {
|
||
|
if (threadIdx.x + blockIdx.x * blockDim.x >= Size - 1 - t)
|
||
|
return;
|
||
|
if (threadIdx.y + blockIdx.y * blockDim.y >= Size - t)
|
||
|
return;
|
||
|
|
||
|
int xidx = blockIdx.x * blockDim.x + threadIdx.x;
|
||
|
int yidx = blockIdx.y * blockDim.y + threadIdx.y;
|
||
|
// printf("blockIdx.x: %d, threadIdx.x: %d, blockIdx.y: %d, threadIdx.y: %d,
|
||
|
// blockDim.x: %d, blockDim.y:
|
||
|
// %d\n",blockIdx.x,threadIdx.x,blockIdx.y,threadIdx.y,blockDim.x,blockDim.y);
|
||
|
|
||
|
a_cuda[Size * (xidx + 1 + t) + (yidx + t)] -=
|
||
|
m_cuda[Size * (xidx + 1 + t) + t] * a_cuda[Size * t + (yidx + t)];
|
||
|
// a_cuda[xidx+1+t][yidx+t] -= m_cuda[xidx+1+t][t] * a_cuda[t][yidx+t];
|
||
|
if (yidx == 0) {
|
||
|
// printf("blockIdx.x:%d,threadIdx.x:%d,blockIdx.y:%d,threadIdx.y:%d,blockDim.x:%d,blockDim.y:%d\n",blockIdx.x,threadIdx.x,blockIdx.y,threadIdx.y,blockDim.x,blockDim.y);
|
||
|
// printf("xidx:%d,yidx:%d\n",xidx,yidx);
|
||
|
b_cuda[xidx + 1 + t] -=
|
||
|
m_cuda[Size * (xidx + 1 + t) + (yidx + t)] * b_cuda[t];
|
||
|
}
|
||
|
}
|
||
|
|
||
|
/*------------------------------------------------------
|
||
|
** ForwardSub() -- Forward substitution of Gaussian
|
||
|
** elimination.
|
||
|
**------------------------------------------------------
|
||
|
*/
|
||
|
void ForwardSub() {
|
||
|
int t;
|
||
|
float *m_cuda, *a_cuda, *b_cuda;
|
||
|
|
||
|
int A = 1;
|
||
|
int B = 2;
|
||
|
int C = 3;
|
||
|
int D = 4;
|
||
|
int E = 5;
|
||
|
int F = 6;
|
||
|
// printf("blockIDx.x: %d, threadIdx.x: %d, Size: %d, t: %d, Size-1-t: %d\n",
|
||
|
// A, B, C, D, E); printf("blockIdx.x: %d, threadIdx.x: %d, blockIdx.y: %d,
|
||
|
// threadIdx.y: %d, blockDim.x: %d, blockDim.y: %d\n", A , B, C, D, E, F);
|
||
|
|
||
|
// allocate memory on GPU
|
||
|
cudaMalloc((void **)&m_cuda, Size * Size * sizeof(float));
|
||
|
|
||
|
cudaMalloc((void **)&a_cuda, Size * Size * sizeof(float));
|
||
|
|
||
|
cudaMalloc((void **)&b_cuda, Size * sizeof(float));
|
||
|
|
||
|
// copy memory to GPU
|
||
|
cudaMemcpy(m_cuda, m, Size * Size * sizeof(float), cudaMemcpyHostToDevice);
|
||
|
cudaMemcpy(a_cuda, a, Size * Size * sizeof(float), cudaMemcpyHostToDevice);
|
||
|
cudaMemcpy(b_cuda, b, Size * sizeof(float), cudaMemcpyHostToDevice);
|
||
|
|
||
|
int block_size, grid_size;
|
||
|
|
||
|
block_size = MAXBLOCKSIZE;
|
||
|
grid_size = (Size / block_size) + (!(Size % block_size) ? 0 : 1);
|
||
|
printf("1d grid size: %d\n", grid_size);
|
||
|
|
||
|
dim3 dimBlock(block_size);
|
||
|
dim3 dimGrid(grid_size);
|
||
|
// dim3 dimGrid( (N/dimBlock.x) + (!(N%dimBlock.x)?0:1) );
|
||
|
|
||
|
int blockSize2d, gridSize2d;
|
||
|
blockSize2d = BLOCK_SIZE_XY;
|
||
|
gridSize2d = (Size / blockSize2d) + (!(Size % blockSize2d ? 0 : 1));
|
||
|
|
||
|
dim3 dimBlockXY(blockSize2d, blockSize2d);
|
||
|
|
||
|
printf("BlockXY: %d \n", blockSize2d);
|
||
|
dim3 dimGridXY(gridSize2d, gridSize2d);
|
||
|
|
||
|
#ifdef TIMING
|
||
|
gettimeofday(&tv_kernel_start, NULL);
|
||
|
#endif
|
||
|
printf("first grid size: %d second: %d\n", grid_size, gridSize2d);
|
||
|
// begin timing kernels
|
||
|
struct timeval time_start;
|
||
|
gettimeofday(&time_start, NULL);
|
||
|
for (t = 0; t < (Size - 1); t++) {
|
||
|
Fan1<<<dimGrid, dimBlock>>>(m_cuda, a_cuda, Size, t);
|
||
|
cudaDeviceSynchronize();
|
||
|
Fan2<<<dimGridXY, dimBlockXY>>>(m_cuda, a_cuda, b_cuda, Size, Size - t, t);
|
||
|
cudaDeviceSynchronize();
|
||
|
checkCUDAError("Fan2");
|
||
|
}
|
||
|
// end timing kernels
|
||
|
struct timeval time_end;
|
||
|
gettimeofday(&time_end, NULL);
|
||
|
totalKernelTime = (time_end.tv_sec * 1000000 + time_end.tv_usec) -
|
||
|
(time_start.tv_sec * 1000000 + time_start.tv_usec);
|
||
|
|
||
|
#ifdef TIMING
|
||
|
tvsub(&time_end, &tv_kernel_start, &tv);
|
||
|
kernel_time += tv.tv_sec * 1000.0 + (float)tv.tv_usec / 1000.0;
|
||
|
#endif
|
||
|
|
||
|
// copy memory back to CPU
|
||
|
cudaMemcpy(m, m_cuda, Size * Size * sizeof(float), cudaMemcpyDeviceToHost);
|
||
|
cudaMemcpy(a, a_cuda, Size * Size * sizeof(float), cudaMemcpyDeviceToHost);
|
||
|
cudaMemcpy(b, b_cuda, Size * sizeof(float), cudaMemcpyDeviceToHost);
|
||
|
cudaFree(m_cuda);
|
||
|
cudaFree(a_cuda);
|
||
|
cudaFree(b_cuda);
|
||
|
}
|
||
|
|
||
|
/*------------------------------------------------------
|
||
|
** BackSub() -- Backward substitution
|
||
|
**------------------------------------------------------
|
||
|
*/
|
||
|
|
||
|
void BackSub() {
|
||
|
// create a new vector to hold the final answer
|
||
|
finalVec = (float *)malloc(Size * sizeof(float));
|
||
|
// solve "bottom up"
|
||
|
int i, j;
|
||
|
for (i = 0; i < Size; i++) {
|
||
|
finalVec[Size - i - 1] = b[Size - i - 1];
|
||
|
for (j = 0; j < i; j++) {
|
||
|
finalVec[Size - i - 1] -= *(a + Size * (Size - i - 1) + (Size - j - 1)) *
|
||
|
finalVec[Size - j - 1];
|
||
|
}
|
||
|
finalVec[Size - i - 1] =
|
||
|
finalVec[Size - i - 1] / *(a + Size * (Size - i - 1) + (Size - i - 1));
|
||
|
}
|
||
|
}
|
||
|
|
||
|
void InitMat(float *ary, int nrow, int ncol) {
|
||
|
int i, j;
|
||
|
|
||
|
for (i = 0; i < nrow; i++) {
|
||
|
for (j = 0; j < ncol; j++) {
|
||
|
fscanf(fp, "%f", ary + Size * i + j);
|
||
|
}
|
||
|
}
|
||
|
}
|
||
|
|
||
|
/*------------------------------------------------------
|
||
|
** PrintMat() -- Print the contents of the matrix
|
||
|
**------------------------------------------------------
|
||
|
*/
|
||
|
void PrintMat(float *ary, int nrow, int ncol) {
|
||
|
return;
|
||
|
int i, j;
|
||
|
|
||
|
for (i = 0; i < nrow; i++) {
|
||
|
for (j = 0; j < ncol; j++) {
|
||
|
printf("%8.2f ", *(ary + Size * i + j));
|
||
|
}
|
||
|
printf("\n");
|
||
|
}
|
||
|
printf("\n");
|
||
|
}
|
||
|
|
||
|
/*------------------------------------------------------
|
||
|
** InitAry() -- Initialize the array (vector) by reading
|
||
|
** data from the data file
|
||
|
**------------------------------------------------------
|
||
|
*/
|
||
|
void InitAry(float *ary, int ary_size) {
|
||
|
int i;
|
||
|
|
||
|
for (i = 0; i < ary_size; i++) {
|
||
|
fscanf(fp, "%f", &ary[i]);
|
||
|
}
|
||
|
}
|
||
|
|
||
|
/*------------------------------------------------------
|
||
|
** PrintAry() -- Print the contents of the array (vector)
|
||
|
**------------------------------------------------------
|
||
|
*/
|
||
|
void PrintAry(float *ary, int ary_size) {
|
||
|
int i;
|
||
|
for (i = 0; i < ary_size; i++) {
|
||
|
printf("%.2f ", ary[i]);
|
||
|
}
|
||
|
printf("\n\n");
|
||
|
}
|
||
|
void checkCUDAError(const char *msg) {
|
||
|
cudaError_t err = cudaGetLastError();
|
||
|
if (cudaSuccess != err) {
|
||
|
fprintf(stderr, "Cuda error: %s: %s.\n", msg, cudaGetErrorString(err));
|
||
|
exit(EXIT_FAILURE);
|
||
|
}
|
||
|
}
|