CuPBoP/examples/srad_v2/srad_kernel.cu

258 lines
6.1 KiB
Plaintext

#include "srad.h"
#include <stdio.h>
__global__ void
srad_cuda_1(
float *E_C,
float *W_C,
float *N_C,
float *S_C,
float * J_cuda,
float * C_cuda,
int cols,
int rows,
float q0sqr
)
{
//block id
int bx = blockIdx.x;
int by = blockIdx.y;
//thread id
int tx = threadIdx.x;
int ty = threadIdx.y;
//indices
int index = cols * BLOCK_SIZE * by + BLOCK_SIZE * bx + cols * ty + tx;
int index_n = cols * BLOCK_SIZE * by + BLOCK_SIZE * bx + tx - cols;
int index_s = cols * BLOCK_SIZE * by + BLOCK_SIZE * bx + cols * BLOCK_SIZE + tx;
int index_w = cols * BLOCK_SIZE * by + BLOCK_SIZE * bx + cols * ty - 1;
int index_e = cols * BLOCK_SIZE * by + BLOCK_SIZE * bx + cols * ty + BLOCK_SIZE;
float n, w, e, s, jc, g2, l, num, den, qsqr, c;
//shared memory allocation
__shared__ float temp[BLOCK_SIZE][BLOCK_SIZE];
__shared__ float temp_result[BLOCK_SIZE][BLOCK_SIZE];
__shared__ float north[BLOCK_SIZE][BLOCK_SIZE];
__shared__ float south[BLOCK_SIZE][BLOCK_SIZE];
__shared__ float east[BLOCK_SIZE][BLOCK_SIZE];
__shared__ float west[BLOCK_SIZE][BLOCK_SIZE];
//load data to shared memory
north[ty][tx] = J_cuda[index_n];
south[ty][tx] = J_cuda[index_s];
if ( by == 0 ){
north[ty][tx] = J_cuda[BLOCK_SIZE * bx + tx];
}
else if ( by == gridDim.y - 1 ){
south[ty][tx] = J_cuda[cols * BLOCK_SIZE * (gridDim.y - 1) + BLOCK_SIZE * bx + cols * ( BLOCK_SIZE - 1 ) + tx];
}
__syncthreads();
west[ty][tx] = J_cuda[index_w];
east[ty][tx] = J_cuda[index_e];
if ( bx == 0 ){
west[ty][tx] = J_cuda[cols * BLOCK_SIZE * by + cols * ty];
}
else if ( bx == gridDim.x - 1 ){
east[ty][tx] = J_cuda[cols * BLOCK_SIZE * by + BLOCK_SIZE * ( gridDim.x - 1) + cols * ty + BLOCK_SIZE-1];
}
__syncthreads();
temp[ty][tx] = J_cuda[index];
__syncthreads();
jc = temp[ty][tx];
if ( ty == 0 && tx == 0 ){ //nw
n = north[ty][tx] - jc;
s = temp[ty+1][tx] - jc;
w = west[ty][tx] - jc;
e = temp[ty][tx+1] - jc;
}
else if ( ty == 0 && tx == BLOCK_SIZE-1 ){ //ne
n = north[ty][tx] - jc;
s = temp[ty+1][tx] - jc;
w = temp[ty][tx-1] - jc;
e = east[ty][tx] - jc;
}
else if ( ty == BLOCK_SIZE -1 && tx == BLOCK_SIZE - 1){ //se
n = temp[ty-1][tx] - jc;
s = south[ty][tx] - jc;
w = temp[ty][tx-1] - jc;
e = east[ty][tx] - jc;
}
else if ( ty == BLOCK_SIZE -1 && tx == 0 ){//sw
n = temp[ty-1][tx] - jc;
s = south[ty][tx] - jc;
w = west[ty][tx] - jc;
e = temp[ty][tx+1] - jc;
}
else if ( ty == 0 ){ //n
n = north[ty][tx] - jc;
s = temp[ty+1][tx] - jc;
w = temp[ty][tx-1] - jc;
e = temp[ty][tx+1] - jc;
}
else if ( tx == BLOCK_SIZE -1 ){ //e
n = temp[ty-1][tx] - jc;
s = temp[ty+1][tx] - jc;
w = temp[ty][tx-1] - jc;
e = east[ty][tx] - jc;
}
else if ( ty == BLOCK_SIZE -1){ //s
n = temp[ty-1][tx] - jc;
s = south[ty][tx] - jc;
w = temp[ty][tx-1] - jc;
e = temp[ty][tx+1] - jc;
}
else if ( tx == 0 ){ //w
n = temp[ty-1][tx] - jc;
s = temp[ty+1][tx] - jc;
w = west[ty][tx] - jc;
e = temp[ty][tx+1] - jc;
}
else{ //the data elements which are not on the borders
n = temp[ty-1][tx] - jc;
s = temp[ty+1][tx] - jc;
w = temp[ty][tx-1] - jc;
e = temp[ty][tx+1] - jc;
}
g2 = ( n * n + s * s + w * w + e * e ) / (jc * jc);
l = ( n + s + w + e ) / jc;
num = (0.5*g2) - ((1.0/16.0)*(l*l)) ;
den = 1 + (.25*l);
qsqr = num/(den*den);
// diffusion coefficent (equ 33)
den = (qsqr-q0sqr) / (q0sqr * (1+q0sqr)) ;
c = 1.0 / (1.0+den) ;
// saturate diffusion coefficent
if (c < 0){temp_result[ty][tx] = 0;}
else if (c > 1) {temp_result[ty][tx] = 1;}
else {temp_result[ty][tx] = c;}
__syncthreads();
C_cuda[index] = temp_result[ty][tx];
E_C[index] = e;
W_C[index] = w;
S_C[index] = s;
N_C[index] = n;
}
__global__ void
srad_cuda_2(
float *E_C,
float *W_C,
float *N_C,
float *S_C,
float * J_cuda,
float * C_cuda,
int cols,
int rows,
float lambda,
float q0sqr
)
{
//block id
int bx = blockIdx.x;
int by = blockIdx.y;
//thread id
int tx = threadIdx.x;
int ty = threadIdx.y;
//indices
int index = cols * BLOCK_SIZE * by + BLOCK_SIZE * bx + cols * ty + tx;
int index_s = cols * BLOCK_SIZE * by + BLOCK_SIZE * bx + cols * BLOCK_SIZE + tx;
int index_e = cols * BLOCK_SIZE * by + BLOCK_SIZE * bx + cols * ty + BLOCK_SIZE;
float cc, cn, cs, ce, cw, d_sum;
//shared memory allocation
__shared__ float south_c[BLOCK_SIZE][BLOCK_SIZE];
__shared__ float east_c[BLOCK_SIZE][BLOCK_SIZE];
__shared__ float c_cuda_temp[BLOCK_SIZE][BLOCK_SIZE];
__shared__ float c_cuda_result[BLOCK_SIZE][BLOCK_SIZE];
__shared__ float temp[BLOCK_SIZE][BLOCK_SIZE];
//load data to shared memory
temp[ty][tx] = J_cuda[index];
__syncthreads();
south_c[ty][tx] = C_cuda[index_s];
if ( by == gridDim.y - 1 ){
south_c[ty][tx] = C_cuda[cols * BLOCK_SIZE * (gridDim.y - 1) + BLOCK_SIZE * bx + cols * ( BLOCK_SIZE - 1 ) + tx];
}
__syncthreads();
east_c[ty][tx] = C_cuda[index_e];
if ( bx == gridDim.x - 1 ){
east_c[ty][tx] = C_cuda[cols * BLOCK_SIZE * by + BLOCK_SIZE * ( gridDim.x - 1) + cols * ty + BLOCK_SIZE-1];
}
__syncthreads();
c_cuda_temp[ty][tx] = C_cuda[index];
__syncthreads();
cc = c_cuda_temp[ty][tx];
if ( ty == BLOCK_SIZE -1 && tx == BLOCK_SIZE - 1){ //se
cn = cc;
cs = south_c[ty][tx];
cw = cc;
ce = east_c[ty][tx];
}
else if ( tx == BLOCK_SIZE -1 ){ //e
cn = cc;
cs = c_cuda_temp[ty+1][tx];
cw = cc;
ce = east_c[ty][tx];
}
else if ( ty == BLOCK_SIZE -1){ //s
cn = cc;
cs = south_c[ty][tx];
cw = cc;
ce = c_cuda_temp[ty][tx+1];
}
else{ //the data elements which are not on the borders
cn = cc;
cs = c_cuda_temp[ty+1][tx];
cw = cc;
ce = c_cuda_temp[ty][tx+1];
}
// divergence (equ 58)
d_sum = cn * N_C[index] + cs * S_C[index] + cw * W_C[index] + ce * E_C[index];
// image update (equ 61)
c_cuda_result[ty][tx] = temp[ty][tx] + 0.25 * lambda * d_sum;
__syncthreads();
J_cuda[index] = c_cuda_result[ty][tx];
}