239 lines
6.5 KiB
Plaintext
239 lines
6.5 KiB
Plaintext
|
#include <assert.h>
|
||
|
#include <stdio.h>
|
||
|
#include <stdlib.h>
|
||
|
#include <time.h>
|
||
|
|
||
|
#ifdef TIMING
|
||
|
#include "timing.h"
|
||
|
|
||
|
struct timeval tv;
|
||
|
struct timeval tv_total_start, tv_total_end;
|
||
|
struct timeval tv_h2d_start, tv_h2d_end;
|
||
|
struct timeval tv_d2h_start, tv_d2h_end;
|
||
|
struct timeval tv_kernel_start, tv_kernel_end;
|
||
|
struct timeval tv_mem_alloc_start, tv_mem_alloc_end;
|
||
|
struct timeval tv_close_start, tv_close_end;
|
||
|
float init_time = 0, mem_alloc_time = 0, h2d_time = 0, kernel_time = 0,
|
||
|
d2h_time = 0, close_time = 0, total_time = 0;
|
||
|
#endif
|
||
|
|
||
|
#define BLOCK_SIZE 256
|
||
|
#define STR_SIZE 256
|
||
|
#define DEVICE 0
|
||
|
#define HALO \
|
||
|
1 // halo width along one direction when advancing to the next iteration
|
||
|
|
||
|
//#define BENCH_PRINT
|
||
|
|
||
|
void run(int argc, char **argv);
|
||
|
|
||
|
int rows, cols;
|
||
|
int *data;
|
||
|
int **wall;
|
||
|
int *result;
|
||
|
#define M_SEED 9
|
||
|
int pyramid_height;
|
||
|
|
||
|
void init(int argc, char **argv) {
|
||
|
if (argc == 4) {
|
||
|
cols = atoi(argv[1]);
|
||
|
rows = atoi(argv[2]);
|
||
|
pyramid_height = atoi(argv[3]);
|
||
|
} else {
|
||
|
printf("Usage: dynproc row_len col_len pyramid_height\n");
|
||
|
exit(0);
|
||
|
}
|
||
|
data = new int[rows * cols];
|
||
|
wall = new int *[rows];
|
||
|
for (int n = 0; n < rows; n++)
|
||
|
wall[n] = data + cols * n;
|
||
|
result = new int[cols];
|
||
|
|
||
|
int seed = M_SEED;
|
||
|
srand(seed);
|
||
|
|
||
|
for (int i = 0; i < rows; i++) {
|
||
|
for (int j = 0; j < cols; j++) {
|
||
|
wall[i][j] = rand() % 10;
|
||
|
}
|
||
|
}
|
||
|
#ifdef BENCH_PRINT
|
||
|
for (int i = 0; i < rows; i++) {
|
||
|
for (int j = 0; j < cols; j++) {
|
||
|
printf("%d ", wall[i][j]);
|
||
|
}
|
||
|
printf("\n");
|
||
|
}
|
||
|
#endif
|
||
|
}
|
||
|
|
||
|
void fatal(char *s) { fprintf(stderr, "error: %s\n", s); }
|
||
|
|
||
|
#define IN_RANGE(x, min, max) ((x) >= (min) && (x) <= (max))
|
||
|
#define CLAMP_RANGE(x, min, max) x = (x < (min)) ? min : ((x > (max)) ? max : x)
|
||
|
#define MIN(a, b) ((a) <= (b) ? (a) : (b))
|
||
|
|
||
|
__global__ void dynproc_kernel(int iteration, int *gpuWall, int *gpuSrc,
|
||
|
int *gpuResults, int cols, int rows,
|
||
|
int startStep, int border) {
|
||
|
|
||
|
__shared__ int prev[BLOCK_SIZE];
|
||
|
__shared__ int result[BLOCK_SIZE];
|
||
|
|
||
|
int bx = blockIdx.x;
|
||
|
int tx = threadIdx.x;
|
||
|
|
||
|
// each block finally computes result for a small block
|
||
|
// after N iterations.
|
||
|
// it is the non-overlapping small blocks that cover
|
||
|
// all the input data
|
||
|
|
||
|
// calculate the small block size
|
||
|
int small_block_cols = BLOCK_SIZE - iteration * HALO * 2;
|
||
|
|
||
|
// calculate the boundary for the block according to
|
||
|
// the boundary of its small block
|
||
|
int blkX = small_block_cols * bx - border;
|
||
|
int blkXmax = blkX + BLOCK_SIZE - 1;
|
||
|
|
||
|
// calculate the global thread coordination
|
||
|
int xidx = blkX + tx;
|
||
|
|
||
|
// effective range within this block that falls within
|
||
|
// the valid range of the input data
|
||
|
// used to rule out computation outside the boundary.
|
||
|
int validXmin = (blkX < 0) ? -blkX : 0;
|
||
|
int validXmax = (blkXmax > cols - 1) ? BLOCK_SIZE - 1 - (blkXmax - cols + 1)
|
||
|
: BLOCK_SIZE - 1;
|
||
|
|
||
|
int W = tx - 1;
|
||
|
int E = tx + 1;
|
||
|
|
||
|
W = (W < validXmin) ? validXmin : W;
|
||
|
E = (E > validXmax) ? validXmax : E;
|
||
|
|
||
|
bool isValid = IN_RANGE(tx, validXmin, validXmax);
|
||
|
|
||
|
if (IN_RANGE(xidx, 0, cols - 1)) {
|
||
|
prev[tx] = gpuSrc[xidx];
|
||
|
}
|
||
|
__syncthreads(); // [Ronny] Added sync to avoid race on prev Aug. 14 2012
|
||
|
bool computed;
|
||
|
for (int i = 0; i < iteration; i++) {
|
||
|
computed = false;
|
||
|
if (IN_RANGE(tx, i + 1, BLOCK_SIZE - i - 2) && isValid) {
|
||
|
computed = true;
|
||
|
int left = prev[W];
|
||
|
int up = prev[tx];
|
||
|
int right = prev[E];
|
||
|
int shortest = MIN(left, up);
|
||
|
shortest = MIN(shortest, right);
|
||
|
int index = cols * (startStep + i) + xidx;
|
||
|
result[tx] = shortest + gpuWall[index];
|
||
|
}
|
||
|
__syncthreads();
|
||
|
if (i == iteration - 1)
|
||
|
break;
|
||
|
if (computed) // Assign the computation range
|
||
|
prev[tx] = result[tx];
|
||
|
__syncthreads(); // [Ronny] Added sync to avoid race on prev Aug. 14 2012
|
||
|
}
|
||
|
|
||
|
// update the global memory
|
||
|
// after the last iteration, only threads coordinated within the
|
||
|
// small block perform the calculation and switch on ``computed''
|
||
|
if (computed) {
|
||
|
gpuResults[xidx] = result[tx];
|
||
|
}
|
||
|
}
|
||
|
|
||
|
/*
|
||
|
compute N time steps
|
||
|
*/
|
||
|
int calc_path(int *gpuWall, int *gpuResult[2], int rows, int cols,
|
||
|
int pyramid_height, int blockCols, int borderCols) {
|
||
|
dim3 dimBlock(BLOCK_SIZE);
|
||
|
dim3 dimGrid(blockCols);
|
||
|
|
||
|
int src = 1, dst = 0;
|
||
|
for (int t = 0; t < rows - 1; t += pyramid_height) {
|
||
|
int temp = src;
|
||
|
src = dst;
|
||
|
dst = temp;
|
||
|
dynproc_kernel<<<dimGrid, dimBlock>>>(
|
||
|
MIN(pyramid_height, rows - t - 1), gpuWall, gpuResult[src],
|
||
|
gpuResult[dst], cols, rows, t, borderCols);
|
||
|
|
||
|
// for the measurement fairness
|
||
|
cudaDeviceSynchronize();
|
||
|
}
|
||
|
return dst;
|
||
|
}
|
||
|
|
||
|
int main(int argc, char **argv) {
|
||
|
cudaSetDevice(0);
|
||
|
|
||
|
run(argc, argv);
|
||
|
|
||
|
return EXIT_SUCCESS;
|
||
|
}
|
||
|
|
||
|
void run(int argc, char **argv) {
|
||
|
init(argc, argv);
|
||
|
|
||
|
/* --------------- pyramid parameters --------------- */
|
||
|
int borderCols = (pyramid_height)*HALO;
|
||
|
int smallBlockCol = BLOCK_SIZE - (pyramid_height)*HALO * 2;
|
||
|
int blockCols = cols / smallBlockCol + ((cols % smallBlockCol == 0) ? 0 : 1);
|
||
|
|
||
|
printf("pyramidHeight: %d\ngridSize: [%d]\nborder:[%d]\nblockSize: "
|
||
|
"%d\nblockGrid:[%d]\ntargetBlock:[%d]\n",
|
||
|
pyramid_height, cols, borderCols, BLOCK_SIZE, blockCols,
|
||
|
smallBlockCol);
|
||
|
|
||
|
int *gpuWall, *gpuResult[2];
|
||
|
int size = rows * cols;
|
||
|
|
||
|
cudaMalloc((void **)&gpuResult[0], sizeof(int) * cols);
|
||
|
cudaMalloc((void **)&gpuResult[1], sizeof(int) * cols);
|
||
|
cudaMemcpy(gpuResult[0], data, sizeof(int) * cols, cudaMemcpyHostToDevice);
|
||
|
cudaMalloc((void **)&gpuWall, sizeof(int) * (size - cols));
|
||
|
cudaMemcpy(gpuWall, data + cols, sizeof(int) * (size - cols),
|
||
|
cudaMemcpyHostToDevice);
|
||
|
|
||
|
#ifdef TIMING
|
||
|
gettimeofday(&tv_kernel_start, NULL);
|
||
|
#endif
|
||
|
|
||
|
int final_ret = calc_path(gpuWall, gpuResult, rows, cols, pyramid_height,
|
||
|
blockCols, borderCols);
|
||
|
|
||
|
#ifdef TIMING
|
||
|
gettimeofday(&tv_kernel_end, NULL);
|
||
|
tvsub(&tv_kernel_end, &tv_kernel_start, &tv);
|
||
|
kernel_time += tv.tv_sec * 1000.0 + (float)tv.tv_usec / 1000.0;
|
||
|
#endif
|
||
|
|
||
|
cudaMemcpy(result, gpuResult[final_ret], sizeof(int) * cols,
|
||
|
cudaMemcpyDeviceToHost);
|
||
|
|
||
|
for (int i = 0; i < cols; i++)
|
||
|
printf("%d ", data[i]);
|
||
|
printf("\n");
|
||
|
for (int i = 0; i < cols; i++)
|
||
|
printf("%d ", result[i]);
|
||
|
printf("\n");
|
||
|
|
||
|
cudaFree(gpuWall);
|
||
|
cudaFree(gpuResult[0]);
|
||
|
cudaFree(gpuResult[1]);
|
||
|
|
||
|
delete[] data;
|
||
|
delete[] wall;
|
||
|
delete[] result;
|
||
|
|
||
|
#ifdef TIMING
|
||
|
printf("Exec: %f\n", kernel_time);
|
||
|
#endif
|
||
|
}
|