// CuPBoP/examples/nw/needle_kernel.cu
// Needleman-Wunsch score-matrix CUDA kernels
// (source snapshot retrieved 2022-05-04 20:59:38 +08:00)
#include "needle.h"
#include <stdio.h>
#define SDATA(index) CUT_BANK_CHECKER(sdata, index)
// Return the largest of three ints. Used by the NW recurrence to pick the
// best of the diagonal (match/mismatch), west (gap), and north (gap) scores.
__device__ int maximum_device(int a, int b, int c) {
  int ab = (a <= b) ? b : a;  // max(a, b)
  return (ab <= c) ? c : ab;  // max(max(a, b), c)
}
/*
 * Fills one anti-diagonal of BLOCK_SIZE x BLOCK_SIZE tiles of the
 * Needleman-Wunsch score matrix during the top-left (growing) phase.
 *
 * Grid/block layout (NOTE(review): inferred from the indexing below, confirm
 * at the launch site): 1-D grid with one block per tile on tile-diagonal `i`,
 * and blockDim.x == BLOCK_SIZE (tx indexes one column of the tile).
 *
 * referrence  - substitution-score matrix, row-major, `cols` ints per row
 * matrix_cuda - alignment score matrix, updated in place, `cols` ints per row
 * cols        - row stride (in elements) of both matrices
 * penalty     - gap penalty subtracted for west/north moves
 * i           - index of the tile anti-diagonal being processed
 * block_width - unused in this kernel (kept for a signature parallel to
 *               needle_cuda_shared_2)
 */
__global__ void needle_cuda_shared_1(int *referrence, int *matrix_cuda,
                                     int cols, int penalty, int i,
                                     int block_width) {
  int bx = blockIdx.x;
  int tx = threadIdx.x;

  // Tile coordinates: b_index_x + b_index_y == i - 1, so blocks 0..i-1 cover
  // exactly the tiles of anti-diagonal i in the matrix's upper-left triangle.
  int b_index_x = bx;
  int b_index_y = i - 1 - bx;

  // Global offsets for this thread's column of the tile:
  //   index    - first interior cell of column tx (one row below / one col
  //              right of the tile origin, hence the "+ (cols + 1)")
  //   index_n  - north halo cell above column tx
  //   index_w  - west halo cell of the tile's first interior row (stepped
  //              down by "cols * tx" when loaded below)
  //   index_nw - the tile's north-west corner halo cell
  int index =
      cols * BLOCK_SIZE * b_index_y + BLOCK_SIZE * b_index_x + tx + (cols + 1);
  int index_n =
      cols * BLOCK_SIZE * b_index_y + BLOCK_SIZE * b_index_x + tx + (1);
  int index_w = cols * BLOCK_SIZE * b_index_y + BLOCK_SIZE * b_index_x + (cols);
  int index_nw = cols * BLOCK_SIZE * b_index_y + BLOCK_SIZE * b_index_x;

  // temp caches the tile plus its north row and west column halo
  // (hence the +1 in both dimensions); ref caches the substitution
  // scores for the tile interior.
  __shared__ int temp[BLOCK_SIZE + 1][BLOCK_SIZE + 1];
  __shared__ int ref[BLOCK_SIZE][BLOCK_SIZE];

  // Thread 0 loads the NW corner halo cell temp[0][0].
  if (tx == 0)
    temp[tx][0] = matrix_cuda[index_nw];

  // Each thread loads one column of substitution scores.
  for (int ty = 0; ty < BLOCK_SIZE; ty++)
    ref[ty][tx] = referrence[index + cols * ty];

  __syncthreads();

  // Thread tx loads the west halo cell of tile row tx ...
  temp[tx + 1][0] = matrix_cuda[index_w + cols * tx];

  __syncthreads();

  // ... and the north halo cell of tile column tx.
  temp[0][tx + 1] = matrix_cuda[index_n];

  __syncthreads();

  // Wavefront sweep 1: in-tile anti-diagonals m = 0..BLOCK_SIZE-1, growing
  // from the top-left corner. Threads 0..m are active; each computes one
  // cell (t_index_x + t_index_y == m + 2). The barrier sits outside the
  // divergent `if`, so every thread in the block reaches it.
  for (int m = 0; m < BLOCK_SIZE; m++) {
    if (tx <= m) {
      int t_index_x = tx + 1;
      int t_index_y = m - tx + 1;
      // NW recurrence: best of diagonal + substitution score, west - gap
      // penalty, north - gap penalty. Diagonal neighbors were finalized by
      // the previous iteration's barrier.
      temp[t_index_y][t_index_x] =
          maximum_device(temp[t_index_y - 1][t_index_x - 1] +
                             ref[t_index_y - 1][t_index_x - 1],
                         temp[t_index_y][t_index_x - 1] - penalty,
                         temp[t_index_y - 1][t_index_x] - penalty);
    }
    __syncthreads();
  }

  // Wavefront sweep 2: remaining anti-diagonals, shrinking toward the
  // bottom-right corner of the tile.
  for (int m = BLOCK_SIZE - 2; m >= 0; m--) {
    if (tx <= m) {
      int t_index_x = tx + BLOCK_SIZE - m;
      int t_index_y = BLOCK_SIZE - tx;
      temp[t_index_y][t_index_x] =
          maximum_device(temp[t_index_y - 1][t_index_x - 1] +
                             ref[t_index_y - 1][t_index_x - 1],
                         temp[t_index_y][t_index_x - 1] - penalty,
                         temp[t_index_y - 1][t_index_x] - penalty);
    }
    __syncthreads();
  }

  // Write the computed tile interior (halo excluded) back to global memory,
  // one column per thread.
  for (int ty = 0; ty < BLOCK_SIZE; ty++)
    matrix_cuda[index + ty * cols] = temp[ty + 1][tx + 1];
}
/*
 * Fills one anti-diagonal of BLOCK_SIZE x BLOCK_SIZE tiles of the
 * Needleman-Wunsch score matrix during the bottom-right (shrinking) phase.
 * Same per-tile wavefront recurrence as needle_cuda_shared_1; only the tile
 * coordinates differ.
 *
 * Grid/block layout (NOTE(review): inferred from the indexing below, confirm
 * at the launch site): 1-D grid with one block per tile on the diagonal
 * selected by `i`, and blockDim.x == BLOCK_SIZE.
 *
 * referrence  - substitution-score matrix, row-major, `cols` ints per row
 * matrix_cuda - alignment score matrix, updated in place, `cols` ints per row
 * cols        - row stride (in elements) of both matrices
 * penalty     - gap penalty subtracted for west/north moves
 * i           - diagonal counter for this phase (decreasing diagonal length)
 * block_width - matrix width in tiles; presumably cols/BLOCK_SIZE rounded to
 *               the tiled region — verify against the caller
 */
__global__ void needle_cuda_shared_2(int *referrence, int *matrix_cuda,
                                     int cols, int penalty, int i,
                                     int block_width) {
  int bx = blockIdx.x;
  int tx = threadIdx.x;

  // Tile coordinates: b_index_x + b_index_y == 2 * block_width - i - 1, a
  // constant per launch, so blocks cover one tile anti-diagonal in the
  // matrix's lower-right triangle.
  int b_index_x = bx + block_width - i;
  int b_index_y = block_width - bx - 1;

  // Global offsets for this thread's column of the tile (same layout as in
  // needle_cuda_shared_1): interior start, north halo, west halo, NW corner.
  int index =
      cols * BLOCK_SIZE * b_index_y + BLOCK_SIZE * b_index_x + tx + (cols + 1);
  int index_n =
      cols * BLOCK_SIZE * b_index_y + BLOCK_SIZE * b_index_x + tx + (1);
  int index_w = cols * BLOCK_SIZE * b_index_y + BLOCK_SIZE * b_index_x + (cols);
  int index_nw = cols * BLOCK_SIZE * b_index_y + BLOCK_SIZE * b_index_x;

  // temp caches the tile plus north/west halo; ref caches the tile's
  // substitution scores.
  __shared__ int temp[BLOCK_SIZE + 1][BLOCK_SIZE + 1];
  __shared__ int ref[BLOCK_SIZE][BLOCK_SIZE];

  // Each thread loads one column of substitution scores.
  for (int ty = 0; ty < BLOCK_SIZE; ty++)
    ref[ty][tx] = referrence[index + cols * ty];

  __syncthreads();

  // Thread 0 loads the NW corner halo; every thread loads one west halo
  // cell. No barrier needed between the two stores: they touch disjoint
  // temp cells (temp[0][0] vs. temp[tx+1][0]).
  if (tx == 0)
    temp[tx][0] = matrix_cuda[index_nw];
  temp[tx + 1][0] = matrix_cuda[index_w + cols * tx];

  __syncthreads();

  // Each thread loads one north halo cell.
  temp[0][tx + 1] = matrix_cuda[index_n];

  __syncthreads();

  // Wavefront sweep 1: in-tile anti-diagonals growing from the top-left
  // corner; threads 0..m active. Barrier is outside the divergent `if`,
  // so all threads reach it.
  for (int m = 0; m < BLOCK_SIZE; m++) {
    if (tx <= m) {
      int t_index_x = tx + 1;
      int t_index_y = m - tx + 1;
      // NW recurrence: best of diagonal + substitution score, west - gap
      // penalty, north - gap penalty.
      temp[t_index_y][t_index_x] =
          maximum_device(temp[t_index_y - 1][t_index_x - 1] +
                             ref[t_index_y - 1][t_index_x - 1],
                         temp[t_index_y][t_index_x - 1] - penalty,
                         temp[t_index_y - 1][t_index_x] - penalty);
    }
    __syncthreads();
  }

  // Wavefront sweep 2: anti-diagonals shrinking toward the bottom-right
  // corner of the tile.
  for (int m = BLOCK_SIZE - 2; m >= 0; m--) {
    if (tx <= m) {
      int t_index_x = tx + BLOCK_SIZE - m;
      int t_index_y = BLOCK_SIZE - tx;
      temp[t_index_y][t_index_x] =
          maximum_device(temp[t_index_y - 1][t_index_x - 1] +
                             ref[t_index_y - 1][t_index_x - 1],
                         temp[t_index_y][t_index_x - 1] - penalty,
                         temp[t_index_y - 1][t_index_x] - penalty);
    }
    __syncthreads();
  }

  // Write the computed tile interior (halo excluded) back to global memory,
  // one column per thread.
  for (int ty = 0; ty < BLOCK_SIZE; ty++)
    matrix_cuda[index + ty * cols] = temp[ty + 1][tx + 1];
}