CuPBoP/examples/srad_v2/srad_kernel.cu

#include "srad.h"
#include <stdio.h>

__global__ void
srad_cuda_1(
		  float *E_C,
		  float *W_C,
		  float *N_C,
		  float *S_C,
		  float * J_cuda,
		  float * C_cuda,
		  int cols,
		  int rows,
		  float q0sqr
)
{

  //block id
  int bx = blockIdx.x;
  int by = blockIdx.y;

  //thread id
  int tx = threadIdx.x;
  int ty = threadIdx.y;

  //indices
  int index   = cols * BLOCK_SIZE * by + BLOCK_SIZE * bx + cols * ty + tx;
  int index_n = cols * BLOCK_SIZE * by + BLOCK_SIZE * bx + tx - cols;
  int index_s = cols * BLOCK_SIZE * by + BLOCK_SIZE * bx + cols * BLOCK_SIZE + tx;
  int index_w = cols * BLOCK_SIZE * by + BLOCK_SIZE * bx + cols * ty - 1;
  int index_e = cols * BLOCK_SIZE * by + BLOCK_SIZE * bx + cols * ty + BLOCK_SIZE;

  float n, w, e, s, jc, g2, l, num, den, qsqr, c;

  //shared memory allocation
  __shared__ float temp[BLOCK_SIZE][BLOCK_SIZE];
  __shared__ float temp_result[BLOCK_SIZE][BLOCK_SIZE];

  __shared__ float north[BLOCK_SIZE][BLOCK_SIZE];
  __shared__ float south[BLOCK_SIZE][BLOCK_SIZE];
  __shared__ float  east[BLOCK_SIZE][BLOCK_SIZE];
  __shared__ float  west[BLOCK_SIZE][BLOCK_SIZE];

  //load data to shared memory
  north[ty][tx] = J_cuda[index_n];
  south[ty][tx] = J_cuda[index_s];
  if ( by == 0 ){
  north[ty][tx] = J_cuda[BLOCK_SIZE * bx + tx];
  }
  else if ( by == gridDim.y - 1 ){
  south[ty][tx] = J_cuda[cols * BLOCK_SIZE * (gridDim.y - 1) + BLOCK_SIZE * bx + cols * ( BLOCK_SIZE - 1 ) + tx];
  }
   __syncthreads();

  west[ty][tx] = J_cuda[index_w];
  east[ty][tx] = J_cuda[index_e];

  if ( bx == 0 ){
  west[ty][tx] = J_cuda[cols * BLOCK_SIZE * by + cols * ty];
  }
  else if ( bx == gridDim.x - 1 ){
  east[ty][tx] = J_cuda[cols * BLOCK_SIZE * by + BLOCK_SIZE * ( gridDim.x - 1) + cols * ty + BLOCK_SIZE-1];
  }

  __syncthreads();


  temp[ty][tx]      = J_cuda[index];

  __syncthreads();

   jc = temp[ty][tx];

   if ( ty == 0 && tx == 0 ){ //nw
	n  = north[ty][tx] - jc;
    s  = temp[ty+1][tx] - jc;
    w  = west[ty][tx]  - jc;
    e  = temp[ty][tx+1] - jc;
   }
   else if ( ty == 0 && tx == BLOCK_SIZE-1 ){ //ne
	n  = north[ty][tx] - jc;
    s  = temp[ty+1][tx] - jc;
    w  = temp[ty][tx-1] - jc;
    e  = east[ty][tx] - jc;
   }
   else if ( ty == BLOCK_SIZE -1 && tx == BLOCK_SIZE - 1){ //se
	n  = temp[ty-1][tx] - jc;
    s  = south[ty][tx] - jc;
    w  = temp[ty][tx-1] - jc;
    e  = east[ty][tx]  - jc;
   }
   else if ( ty == BLOCK_SIZE -1 && tx == 0 ){//sw
	n  = temp[ty-1][tx] - jc;
    s  = south[ty][tx] - jc;
    w  = west[ty][tx]  - jc;
    e  = temp[ty][tx+1] - jc;
   }

   else if ( ty == 0 ){ //n
	n  = north[ty][tx] - jc;
    s  = temp[ty+1][tx] - jc;
    w  = temp[ty][tx-1] - jc;
    e  = temp[ty][tx+1] - jc;
   }
   else if ( tx == BLOCK_SIZE -1 ){ //e
	n  = temp[ty-1][tx] - jc;
    s  = temp[ty+1][tx] - jc;
    w  = temp[ty][tx-1] - jc;
    e  = east[ty][tx] - jc;
   }
   else if ( ty == BLOCK_SIZE -1){ //s
	n  = temp[ty-1][tx] - jc;
    s  = south[ty][tx] - jc;
    w  = temp[ty][tx-1] - jc;
    e  = temp[ty][tx+1] - jc;
   }
   else if ( tx == 0 ){ //w
	n  = temp[ty-1][tx] - jc;
    s  = temp[ty+1][tx] - jc;
    w  = west[ty][tx] - jc;
    e  = temp[ty][tx+1] - jc;
   }
   else{  //the data elements which are not on the borders
	n  = temp[ty-1][tx] - jc;
    s  = temp[ty+1][tx] - jc;
    w  = temp[ty][tx-1] - jc;
    e  = temp[ty][tx+1] - jc;
   }


    g2 = ( n * n + s * s + w * w + e * e ) / (jc * jc);

    l = ( n + s + w + e ) / jc;

	num  = (0.5*g2) - ((1.0/16.0)*(l*l)) ;
	den  = 1 + (.25*l);
	qsqr = num/(den*den);

	// diffusion coefficent (equ 33)
	den = (qsqr-q0sqr) / (q0sqr * (1+q0sqr)) ;
	c = 1.0 / (1.0+den) ;

    // saturate diffusion coefficent
	if (c < 0){temp_result[ty][tx] = 0;}
	else if (c > 1) {temp_result[ty][tx] = 1;}
	else {temp_result[ty][tx] = c;}

    __syncthreads();

    C_cuda[index] = temp_result[ty][tx];
	E_C[index] = e;
	W_C[index] = w;
	S_C[index] = s;
	N_C[index] = n;

}

__global__ void
srad_cuda_2(
		  float *E_C,
		  float *W_C,
		  float *N_C,
		  float *S_C,
		  float * J_cuda,
		  float * C_cuda,
		  int cols,
		  int rows,
		  float lambda,
		  float q0sqr
)
{
	//block id
	int bx = blockIdx.x;
    int by = blockIdx.y;

	//thread id
    int tx = threadIdx.x;
    int ty = threadIdx.y;

	//indices
    int index   = cols * BLOCK_SIZE * by + BLOCK_SIZE * bx + cols * ty + tx;
	int index_s = cols * BLOCK_SIZE * by + BLOCK_SIZE * bx + cols * BLOCK_SIZE + tx;
    int index_e = cols * BLOCK_SIZE * by + BLOCK_SIZE * bx + cols * ty + BLOCK_SIZE;
	float cc, cn, cs, ce, cw, d_sum;

	//shared memory allocation
	__shared__ float south_c[BLOCK_SIZE][BLOCK_SIZE];
    __shared__ float  east_c[BLOCK_SIZE][BLOCK_SIZE];

    __shared__ float c_cuda_temp[BLOCK_SIZE][BLOCK_SIZE];
    __shared__ float c_cuda_result[BLOCK_SIZE][BLOCK_SIZE];
    __shared__ float temp[BLOCK_SIZE][BLOCK_SIZE];

    //load data to shared memory
	temp[ty][tx]      = J_cuda[index];

    __syncthreads();

	south_c[ty][tx] = C_cuda[index_s];

	if ( by == gridDim.y - 1 ){
	south_c[ty][tx] = C_cuda[cols * BLOCK_SIZE * (gridDim.y - 1) + BLOCK_SIZE * bx + cols * ( BLOCK_SIZE - 1 ) + tx];
	}
	__syncthreads();


	east_c[ty][tx] = C_cuda[index_e];

	if ( bx == gridDim.x - 1 ){
	east_c[ty][tx] = C_cuda[cols * BLOCK_SIZE * by + BLOCK_SIZE * ( gridDim.x - 1) + cols * ty + BLOCK_SIZE-1];
	}

    __syncthreads();

    c_cuda_temp[ty][tx]      = C_cuda[index];

    __syncthreads();

	cc = c_cuda_temp[ty][tx];

   if ( ty == BLOCK_SIZE -1 && tx == BLOCK_SIZE - 1){ //se
	cn  = cc;
    cs  = south_c[ty][tx];
    cw  = cc;
    ce  = east_c[ty][tx];
   }
   else if ( tx == BLOCK_SIZE -1 ){ //e
	cn  = cc;
    cs  = c_cuda_temp[ty+1][tx];
    cw  = cc;
    ce  = east_c[ty][tx];
   }
   else if ( ty == BLOCK_SIZE -1){ //s
	cn  = cc;
    cs  = south_c[ty][tx];
    cw  = cc;
    ce  = c_cuda_temp[ty][tx+1];
   }
   else{ //the data elements which are not on the borders
	cn  = cc;
    cs  = c_cuda_temp[ty+1][tx];
    cw  = cc;
    ce  = c_cuda_temp[ty][tx+1];
   }

   // divergence (equ 58)
   d_sum = cn * N_C[index] + cs * S_C[index] + cw * W_C[index] + ce * E_C[index];

   // image update (equ 61)
   c_cuda_result[ty][tx] = temp[ty][tx] + 0.25 * lambda * d_sum;

   __syncthreads();

   J_cuda[index] = c_cuda_result[ty][tx];

}
add codebase for TACO submission 2022-05-04 20:59:38 +08:00			`#include "srad.h"`
			`#include <stdio.h>`

			`__global__ void`
			`srad_cuda_1(`
			`float *E_C,`
			`float *W_C,`
			`float *N_C,`
			`float *S_C,`
			`float * J_cuda,`
			`float * C_cuda,`
			`int cols,`
			`int rows,`
			`float q0sqr`
			`)`
			`{`

			`//block id`
			`int bx = blockIdx.x;`
			`int by = blockIdx.y;`

			`//thread id`
			`int tx = threadIdx.x;`
			`int ty = threadIdx.y;`

			`//indices`
			`int index = cols * BLOCK_SIZE * by + BLOCK_SIZE * bx + cols * ty + tx;`
			`int index_n = cols * BLOCK_SIZE * by + BLOCK_SIZE * bx + tx - cols;`
			`int index_s = cols * BLOCK_SIZE * by + BLOCK_SIZE * bx + cols * BLOCK_SIZE + tx;`
			`int index_w = cols * BLOCK_SIZE * by + BLOCK_SIZE * bx + cols * ty - 1;`
			`int index_e = cols * BLOCK_SIZE * by + BLOCK_SIZE * bx + cols * ty + BLOCK_SIZE;`

			`float n, w, e, s, jc, g2, l, num, den, qsqr, c;`

			`//shared memory allocation`
			`__shared__ float temp[BLOCK_SIZE][BLOCK_SIZE];`
			`__shared__ float temp_result[BLOCK_SIZE][BLOCK_SIZE];`

			`__shared__ float north[BLOCK_SIZE][BLOCK_SIZE];`
			`__shared__ float south[BLOCK_SIZE][BLOCK_SIZE];`
			`__shared__ float east[BLOCK_SIZE][BLOCK_SIZE];`
			`__shared__ float west[BLOCK_SIZE][BLOCK_SIZE];`

			`//load data to shared memory`
			`north[ty][tx] = J_cuda[index_n];`
			`south[ty][tx] = J_cuda[index_s];`
			`if ( by == 0 ){`
			`north[ty][tx] = J_cuda[BLOCK_SIZE * bx + tx];`
			`}`
			`else if ( by == gridDim.y - 1 ){`
			`south[ty][tx] = J_cuda[cols * BLOCK_SIZE * (gridDim.y - 1) + BLOCK_SIZE * bx + cols * ( BLOCK_SIZE - 1 ) + tx];`
			`}`
			`__syncthreads();`

			`west[ty][tx] = J_cuda[index_w];`
			`east[ty][tx] = J_cuda[index_e];`

			`if ( bx == 0 ){`
			`west[ty][tx] = J_cuda[cols * BLOCK_SIZE * by + cols * ty];`
			`}`
			`else if ( bx == gridDim.x - 1 ){`
			`east[ty][tx] = J_cuda[cols * BLOCK_SIZE * by + BLOCK_SIZE * ( gridDim.x - 1) + cols * ty + BLOCK_SIZE-1];`
			`}`

			`__syncthreads();`



			`temp[ty][tx] = J_cuda[index];`

			`__syncthreads();`

			`jc = temp[ty][tx];`

			`if ( ty == 0 && tx == 0 ){ //nw`
			`n = north[ty][tx] - jc;`
			`s = temp[ty+1][tx] - jc;`
			`w = west[ty][tx] - jc;`
			`e = temp[ty][tx+1] - jc;`
			`}`
			`else if ( ty == 0 && tx == BLOCK_SIZE-1 ){ //ne`
			`n = north[ty][tx] - jc;`
			`s = temp[ty+1][tx] - jc;`
			`w = temp[ty][tx-1] - jc;`
			`e = east[ty][tx] - jc;`
			`}`
			`else if ( ty == BLOCK_SIZE -1 && tx == BLOCK_SIZE - 1){ //se`
			`n = temp[ty-1][tx] - jc;`
			`s = south[ty][tx] - jc;`
			`w = temp[ty][tx-1] - jc;`
			`e = east[ty][tx] - jc;`
			`}`
			`else if ( ty == BLOCK_SIZE -1 && tx == 0 ){//sw`
			`n = temp[ty-1][tx] - jc;`
			`s = south[ty][tx] - jc;`
			`w = west[ty][tx] - jc;`
			`e = temp[ty][tx+1] - jc;`
			`}`

			`else if ( ty == 0 ){ //n`
			`n = north[ty][tx] - jc;`
			`s = temp[ty+1][tx] - jc;`
			`w = temp[ty][tx-1] - jc;`
			`e = temp[ty][tx+1] - jc;`
			`}`
			`else if ( tx == BLOCK_SIZE -1 ){ //e`
			`n = temp[ty-1][tx] - jc;`
			`s = temp[ty+1][tx] - jc;`
			`w = temp[ty][tx-1] - jc;`
			`e = east[ty][tx] - jc;`
			`}`
			`else if ( ty == BLOCK_SIZE -1){ //s`
			`n = temp[ty-1][tx] - jc;`
			`s = south[ty][tx] - jc;`
			`w = temp[ty][tx-1] - jc;`
			`e = temp[ty][tx+1] - jc;`
			`}`
			`else if ( tx == 0 ){ //w`
			`n = temp[ty-1][tx] - jc;`
			`s = temp[ty+1][tx] - jc;`
			`w = west[ty][tx] - jc;`
			`e = temp[ty][tx+1] - jc;`
			`}`
			`else{ //the data elements which are not on the borders`
			`n = temp[ty-1][tx] - jc;`
			`s = temp[ty+1][tx] - jc;`
			`w = temp[ty][tx-1] - jc;`
			`e = temp[ty][tx+1] - jc;`
			`}`


			`g2 = ( n * n + s * s + w * w + e * e ) / (jc * jc);`

			`l = ( n + s + w + e ) / jc;`

			`num = (0.5g2) - ((1.0/16.0)(l*l)) ;`
			`den = 1 + (.25*l);`
			`qsqr = num/(den*den);`

			`// diffusion coefficent (equ 33)`
			`den = (qsqr-q0sqr) / (q0sqr * (1+q0sqr)) ;`
			`c = 1.0 / (1.0+den) ;`

			`// saturate diffusion coefficent`
			`if (c < 0){temp_result[ty][tx] = 0;}`
			`else if (c > 1) {temp_result[ty][tx] = 1;}`
			`else {temp_result[ty][tx] = c;}`

			`__syncthreads();`

			`C_cuda[index] = temp_result[ty][tx];`
			`E_C[index] = e;`
			`W_C[index] = w;`
			`S_C[index] = s;`
			`N_C[index] = n;`

			`}`

			`__global__ void`
			`srad_cuda_2(`
			`float *E_C,`
			`float *W_C,`
			`float *N_C,`
			`float *S_C,`
			`float * J_cuda,`
			`float * C_cuda,`
			`int cols,`
			`int rows,`
			`float lambda,`
			`float q0sqr`
			`)`
			`{`
			`//block id`
			`int bx = blockIdx.x;`
			`int by = blockIdx.y;`

			`//thread id`
			`int tx = threadIdx.x;`
			`int ty = threadIdx.y;`

			`//indices`
			`int index = cols * BLOCK_SIZE * by + BLOCK_SIZE * bx + cols * ty + tx;`
			`int index_s = cols * BLOCK_SIZE * by + BLOCK_SIZE * bx + cols * BLOCK_SIZE + tx;`
			`int index_e = cols * BLOCK_SIZE * by + BLOCK_SIZE * bx + cols * ty + BLOCK_SIZE;`
			`float cc, cn, cs, ce, cw, d_sum;`

			`//shared memory allocation`
			`__shared__ float south_c[BLOCK_SIZE][BLOCK_SIZE];`
			`__shared__ float east_c[BLOCK_SIZE][BLOCK_SIZE];`

			`__shared__ float c_cuda_temp[BLOCK_SIZE][BLOCK_SIZE];`
			`__shared__ float c_cuda_result[BLOCK_SIZE][BLOCK_SIZE];`
			`__shared__ float temp[BLOCK_SIZE][BLOCK_SIZE];`

			`//load data to shared memory`
			`temp[ty][tx] = J_cuda[index];`

			`__syncthreads();`

			`south_c[ty][tx] = C_cuda[index_s];`

			`if ( by == gridDim.y - 1 ){`
			`south_c[ty][tx] = C_cuda[cols * BLOCK_SIZE * (gridDim.y - 1) + BLOCK_SIZE * bx + cols * ( BLOCK_SIZE - 1 ) + tx];`
			`}`
			`__syncthreads();`


			`east_c[ty][tx] = C_cuda[index_e];`

			`if ( bx == gridDim.x - 1 ){`
			`east_c[ty][tx] = C_cuda[cols * BLOCK_SIZE * by + BLOCK_SIZE * ( gridDim.x - 1) + cols * ty + BLOCK_SIZE-1];`
			`}`

			`__syncthreads();`

			`c_cuda_temp[ty][tx] = C_cuda[index];`

			`__syncthreads();`

			`cc = c_cuda_temp[ty][tx];`

			`if ( ty == BLOCK_SIZE -1 && tx == BLOCK_SIZE - 1){ //se`
			`cn = cc;`
			`cs = south_c[ty][tx];`
			`cw = cc;`
			`ce = east_c[ty][tx];`
			`}`
			`else if ( tx == BLOCK_SIZE -1 ){ //e`
			`cn = cc;`
			`cs = c_cuda_temp[ty+1][tx];`
			`cw = cc;`
			`ce = east_c[ty][tx];`
			`}`
			`else if ( ty == BLOCK_SIZE -1){ //s`
			`cn = cc;`
			`cs = south_c[ty][tx];`
			`cw = cc;`
			`ce = c_cuda_temp[ty][tx+1];`
			`}`
			`else{ //the data elements which are not on the borders`
			`cn = cc;`
			`cs = c_cuda_temp[ty+1][tx];`
			`cw = cc;`
			`ce = c_cuda_temp[ty][tx+1];`
			`}`

			`// divergence (equ 58)`
			`d_sum = cn * N_C[index] + cs * S_C[index] + cw * W_C[index] + ce * E_C[index];`

			`// image update (equ 61)`
			`c_cuda_result[ty][tx] = temp[ty][tx] + 0.25 * lambda * d_sum;`

			`__syncthreads();`

			`J_cuda[index] = c_cuda_result[ty][tx];`

			`}`