258 lines
6.1 KiB
Plaintext
258 lines
6.1 KiB
Plaintext
#include "srad.h"
|
|
#include <stdio.h>
|
|
|
|
__global__ void
|
|
srad_cuda_1(
|
|
float *E_C,
|
|
float *W_C,
|
|
float *N_C,
|
|
float *S_C,
|
|
float * J_cuda,
|
|
float * C_cuda,
|
|
int cols,
|
|
int rows,
|
|
float q0sqr
|
|
)
|
|
{
|
|
|
|
//block id
|
|
int bx = blockIdx.x;
|
|
int by = blockIdx.y;
|
|
|
|
//thread id
|
|
int tx = threadIdx.x;
|
|
int ty = threadIdx.y;
|
|
|
|
//indices
|
|
int index = cols * BLOCK_SIZE * by + BLOCK_SIZE * bx + cols * ty + tx;
|
|
int index_n = cols * BLOCK_SIZE * by + BLOCK_SIZE * bx + tx - cols;
|
|
int index_s = cols * BLOCK_SIZE * by + BLOCK_SIZE * bx + cols * BLOCK_SIZE + tx;
|
|
int index_w = cols * BLOCK_SIZE * by + BLOCK_SIZE * bx + cols * ty - 1;
|
|
int index_e = cols * BLOCK_SIZE * by + BLOCK_SIZE * bx + cols * ty + BLOCK_SIZE;
|
|
|
|
float n, w, e, s, jc, g2, l, num, den, qsqr, c;
|
|
|
|
//shared memory allocation
|
|
__shared__ float temp[BLOCK_SIZE][BLOCK_SIZE];
|
|
__shared__ float temp_result[BLOCK_SIZE][BLOCK_SIZE];
|
|
|
|
__shared__ float north[BLOCK_SIZE][BLOCK_SIZE];
|
|
__shared__ float south[BLOCK_SIZE][BLOCK_SIZE];
|
|
__shared__ float east[BLOCK_SIZE][BLOCK_SIZE];
|
|
__shared__ float west[BLOCK_SIZE][BLOCK_SIZE];
|
|
|
|
//load data to shared memory
|
|
north[ty][tx] = J_cuda[index_n];
|
|
south[ty][tx] = J_cuda[index_s];
|
|
if ( by == 0 ){
|
|
north[ty][tx] = J_cuda[BLOCK_SIZE * bx + tx];
|
|
}
|
|
else if ( by == gridDim.y - 1 ){
|
|
south[ty][tx] = J_cuda[cols * BLOCK_SIZE * (gridDim.y - 1) + BLOCK_SIZE * bx + cols * ( BLOCK_SIZE - 1 ) + tx];
|
|
}
|
|
__syncthreads();
|
|
|
|
west[ty][tx] = J_cuda[index_w];
|
|
east[ty][tx] = J_cuda[index_e];
|
|
|
|
if ( bx == 0 ){
|
|
west[ty][tx] = J_cuda[cols * BLOCK_SIZE * by + cols * ty];
|
|
}
|
|
else if ( bx == gridDim.x - 1 ){
|
|
east[ty][tx] = J_cuda[cols * BLOCK_SIZE * by + BLOCK_SIZE * ( gridDim.x - 1) + cols * ty + BLOCK_SIZE-1];
|
|
}
|
|
|
|
__syncthreads();
|
|
|
|
|
|
|
|
temp[ty][tx] = J_cuda[index];
|
|
|
|
__syncthreads();
|
|
|
|
jc = temp[ty][tx];
|
|
|
|
if ( ty == 0 && tx == 0 ){ //nw
|
|
n = north[ty][tx] - jc;
|
|
s = temp[ty+1][tx] - jc;
|
|
w = west[ty][tx] - jc;
|
|
e = temp[ty][tx+1] - jc;
|
|
}
|
|
else if ( ty == 0 && tx == BLOCK_SIZE-1 ){ //ne
|
|
n = north[ty][tx] - jc;
|
|
s = temp[ty+1][tx] - jc;
|
|
w = temp[ty][tx-1] - jc;
|
|
e = east[ty][tx] - jc;
|
|
}
|
|
else if ( ty == BLOCK_SIZE -1 && tx == BLOCK_SIZE - 1){ //se
|
|
n = temp[ty-1][tx] - jc;
|
|
s = south[ty][tx] - jc;
|
|
w = temp[ty][tx-1] - jc;
|
|
e = east[ty][tx] - jc;
|
|
}
|
|
else if ( ty == BLOCK_SIZE -1 && tx == 0 ){//sw
|
|
n = temp[ty-1][tx] - jc;
|
|
s = south[ty][tx] - jc;
|
|
w = west[ty][tx] - jc;
|
|
e = temp[ty][tx+1] - jc;
|
|
}
|
|
|
|
else if ( ty == 0 ){ //n
|
|
n = north[ty][tx] - jc;
|
|
s = temp[ty+1][tx] - jc;
|
|
w = temp[ty][tx-1] - jc;
|
|
e = temp[ty][tx+1] - jc;
|
|
}
|
|
else if ( tx == BLOCK_SIZE -1 ){ //e
|
|
n = temp[ty-1][tx] - jc;
|
|
s = temp[ty+1][tx] - jc;
|
|
w = temp[ty][tx-1] - jc;
|
|
e = east[ty][tx] - jc;
|
|
}
|
|
else if ( ty == BLOCK_SIZE -1){ //s
|
|
n = temp[ty-1][tx] - jc;
|
|
s = south[ty][tx] - jc;
|
|
w = temp[ty][tx-1] - jc;
|
|
e = temp[ty][tx+1] - jc;
|
|
}
|
|
else if ( tx == 0 ){ //w
|
|
n = temp[ty-1][tx] - jc;
|
|
s = temp[ty+1][tx] - jc;
|
|
w = west[ty][tx] - jc;
|
|
e = temp[ty][tx+1] - jc;
|
|
}
|
|
else{ //the data elements which are not on the borders
|
|
n = temp[ty-1][tx] - jc;
|
|
s = temp[ty+1][tx] - jc;
|
|
w = temp[ty][tx-1] - jc;
|
|
e = temp[ty][tx+1] - jc;
|
|
}
|
|
|
|
|
|
g2 = ( n * n + s * s + w * w + e * e ) / (jc * jc);
|
|
|
|
l = ( n + s + w + e ) / jc;
|
|
|
|
num = (0.5*g2) - ((1.0/16.0)*(l*l)) ;
|
|
den = 1 + (.25*l);
|
|
qsqr = num/(den*den);
|
|
|
|
// diffusion coefficent (equ 33)
|
|
den = (qsqr-q0sqr) / (q0sqr * (1+q0sqr)) ;
|
|
c = 1.0 / (1.0+den) ;
|
|
|
|
// saturate diffusion coefficent
|
|
if (c < 0){temp_result[ty][tx] = 0;}
|
|
else if (c > 1) {temp_result[ty][tx] = 1;}
|
|
else {temp_result[ty][tx] = c;}
|
|
|
|
__syncthreads();
|
|
|
|
C_cuda[index] = temp_result[ty][tx];
|
|
E_C[index] = e;
|
|
W_C[index] = w;
|
|
S_C[index] = s;
|
|
N_C[index] = n;
|
|
|
|
}
|
|
|
|
__global__ void
|
|
srad_cuda_2(
|
|
float *E_C,
|
|
float *W_C,
|
|
float *N_C,
|
|
float *S_C,
|
|
float * J_cuda,
|
|
float * C_cuda,
|
|
int cols,
|
|
int rows,
|
|
float lambda,
|
|
float q0sqr
|
|
)
|
|
{
|
|
//block id
|
|
int bx = blockIdx.x;
|
|
int by = blockIdx.y;
|
|
|
|
//thread id
|
|
int tx = threadIdx.x;
|
|
int ty = threadIdx.y;
|
|
|
|
//indices
|
|
int index = cols * BLOCK_SIZE * by + BLOCK_SIZE * bx + cols * ty + tx;
|
|
int index_s = cols * BLOCK_SIZE * by + BLOCK_SIZE * bx + cols * BLOCK_SIZE + tx;
|
|
int index_e = cols * BLOCK_SIZE * by + BLOCK_SIZE * bx + cols * ty + BLOCK_SIZE;
|
|
float cc, cn, cs, ce, cw, d_sum;
|
|
|
|
//shared memory allocation
|
|
__shared__ float south_c[BLOCK_SIZE][BLOCK_SIZE];
|
|
__shared__ float east_c[BLOCK_SIZE][BLOCK_SIZE];
|
|
|
|
__shared__ float c_cuda_temp[BLOCK_SIZE][BLOCK_SIZE];
|
|
__shared__ float c_cuda_result[BLOCK_SIZE][BLOCK_SIZE];
|
|
__shared__ float temp[BLOCK_SIZE][BLOCK_SIZE];
|
|
|
|
//load data to shared memory
|
|
temp[ty][tx] = J_cuda[index];
|
|
|
|
__syncthreads();
|
|
|
|
south_c[ty][tx] = C_cuda[index_s];
|
|
|
|
if ( by == gridDim.y - 1 ){
|
|
south_c[ty][tx] = C_cuda[cols * BLOCK_SIZE * (gridDim.y - 1) + BLOCK_SIZE * bx + cols * ( BLOCK_SIZE - 1 ) + tx];
|
|
}
|
|
__syncthreads();
|
|
|
|
|
|
east_c[ty][tx] = C_cuda[index_e];
|
|
|
|
if ( bx == gridDim.x - 1 ){
|
|
east_c[ty][tx] = C_cuda[cols * BLOCK_SIZE * by + BLOCK_SIZE * ( gridDim.x - 1) + cols * ty + BLOCK_SIZE-1];
|
|
}
|
|
|
|
__syncthreads();
|
|
|
|
c_cuda_temp[ty][tx] = C_cuda[index];
|
|
|
|
__syncthreads();
|
|
|
|
cc = c_cuda_temp[ty][tx];
|
|
|
|
if ( ty == BLOCK_SIZE -1 && tx == BLOCK_SIZE - 1){ //se
|
|
cn = cc;
|
|
cs = south_c[ty][tx];
|
|
cw = cc;
|
|
ce = east_c[ty][tx];
|
|
}
|
|
else if ( tx == BLOCK_SIZE -1 ){ //e
|
|
cn = cc;
|
|
cs = c_cuda_temp[ty+1][tx];
|
|
cw = cc;
|
|
ce = east_c[ty][tx];
|
|
}
|
|
else if ( ty == BLOCK_SIZE -1){ //s
|
|
cn = cc;
|
|
cs = south_c[ty][tx];
|
|
cw = cc;
|
|
ce = c_cuda_temp[ty][tx+1];
|
|
}
|
|
else{ //the data elements which are not on the borders
|
|
cn = cc;
|
|
cs = c_cuda_temp[ty+1][tx];
|
|
cw = cc;
|
|
ce = c_cuda_temp[ty][tx+1];
|
|
}
|
|
|
|
// divergence (equ 58)
|
|
d_sum = cn * N_C[index] + cs * S_C[index] + cw * W_C[index] + ce * E_C[index];
|
|
|
|
// image update (equ 61)
|
|
c_cuda_result[ty][tx] = temp[ty][tx] + 0.25 * lambda * d_sum;
|
|
|
|
__syncthreads();
|
|
|
|
J_cuda[index] = c_cuda_result[ty][tx];
|
|
|
|
}
|