// CuPBoP/examples/nw/needle_kernel.cu

#include "needle.h"
#include <stdio.h>
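// Shared-memory bank-conflict checker macro, presumably kept from the original
// Rodinia/cutil sources; it is not referenced anywhere in this file.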
#define SDATA(index) CUT_BANK_CHECKER(sdata, index)
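
// Return the maximum of three integers; used below to pick the best of the
// three Needleman-Wunsch predecessors (diagonal, west, north).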
__device__ int maximum_device(int a, int b, int c) {
  int k;
  if (a <= b)
    k = b;
  else
    k = a;
  if (k <= c)
    return (c);
  else
    return (k);
}
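
// First-half kernel: each block fills one BLOCK_SIZE x BLOCK_SIZE tile of the
// alignment score matrix. blockIdx.x is mapped onto the anti-diagonal of tiles
// with b_index_x + b_index_y == i - 1, i.e. the upper-left half of the tiled
// matrix while i grows.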
__global__ void needle_cuda_shared_1(int *referrence, int *matrix_cuda,
                                     int cols, int penalty, int i,
                                     int block_width) {
  int bx = blockIdx.x;
  int tx = threadIdx.x;

  // Tile coordinates of this block on the current anti-diagonal.
  int b_index_x = bx;
  int b_index_y = i - 1 - bx;

  // Global index of the first interior cell of this thread's column in the
  // tile, plus the indices of the tile's north, west, and north-west boundary
  // cells in matrix_cuda.
  int index =
      cols * BLOCK_SIZE * b_index_y + BLOCK_SIZE * b_index_x + tx + (cols + 1);
  int index_n =
      cols * BLOCK_SIZE * b_index_y + BLOCK_SIZE * b_index_x + tx + (1);
  int index_w = cols * BLOCK_SIZE * b_index_y + BLOCK_SIZE * b_index_x + (cols);
  int index_nw = cols * BLOCK_SIZE * b_index_y + BLOCK_SIZE * b_index_x;

  // temp holds the score tile including its boundary row and column;
  // ref caches this tile's reference (substitution) scores.
  __shared__ int temp[BLOCK_SIZE + 1][BLOCK_SIZE + 1];
  __shared__ int ref[BLOCK_SIZE][BLOCK_SIZE];
  // Stage data into shared memory: the north-west corner (thread 0 only), this
  // thread's column of reference scores, then the tile's west column and north
  // row of previously computed scores.
  if (tx == 0)
    temp[tx][0] = matrix_cuda[index_nw];
  for (int ty = 0; ty < BLOCK_SIZE; ty++)
    ref[ty][tx] = referrence[index + cols * ty];
  __syncthreads();

  temp[tx + 1][0] = matrix_cuda[index_w + cols * tx];
  __syncthreads();

  temp[0][tx + 1] = matrix_cuda[index_n];
  __syncthreads();
  // Forward wavefront: on step m, threads 0..m fill the m-th anti-diagonal of
  // the tile from its diagonal, west, and north neighbours.
  for (int m = 0; m < BLOCK_SIZE; m++) {
    if (tx <= m) {
      int t_index_x = tx + 1;
      int t_index_y = m - tx + 1;
      temp[t_index_y][t_index_x] =
          maximum_device(temp[t_index_y - 1][t_index_x - 1] +
                             ref[t_index_y - 1][t_index_x - 1],
                         temp[t_index_y][t_index_x - 1] - penalty,
                         temp[t_index_y - 1][t_index_x] - penalty);
    }
    __syncthreads();
  }
  // Backward wavefront: finish the remaining anti-diagonals in the lower-right
  // part of the tile as the number of active threads shrinks again.
  for (int m = BLOCK_SIZE - 2; m >= 0; m--) {
    if (tx <= m) {
      int t_index_x = tx + BLOCK_SIZE - m;
      int t_index_y = BLOCK_SIZE - tx;
      temp[t_index_y][t_index_x] =
          maximum_device(temp[t_index_y - 1][t_index_x - 1] +
                             ref[t_index_y - 1][t_index_x - 1],
                         temp[t_index_y][t_index_x - 1] - penalty,
                         temp[t_index_y - 1][t_index_x] - penalty);
    }
    __syncthreads();
  }
  // Write the finished interior of temp back to the global score matrix.
  for (int ty = 0; ty < BLOCK_SIZE; ty++)
    matrix_cuda[index + ty * cols] = temp[ty + 1][tx + 1];
}
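
// Second-half kernel: counterpart of needle_cuda_shared_1 for the lower-right
// half of the tiled score matrix. Only the mapping from blockIdx.x to tile
// coordinates differs; the per-tile computation is identical.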
__global__ void needle_cuda_shared_2(int *referrence, int *matrix_cuda,
                                     int cols, int penalty, int i,
                                     int block_width) {
  int bx = blockIdx.x;
  int tx = threadIdx.x;

  // Tile coordinates reflected into the lower-right half of the matrix.
  int b_index_x = bx + block_width - i;
  int b_index_y = block_width - bx - 1;

  // Same per-tile global indices as in needle_cuda_shared_1.
  int index =
      cols * BLOCK_SIZE * b_index_y + BLOCK_SIZE * b_index_x + tx + (cols + 1);
  int index_n =
      cols * BLOCK_SIZE * b_index_y + BLOCK_SIZE * b_index_x + tx + (1);
  int index_w = cols * BLOCK_SIZE * b_index_y + BLOCK_SIZE * b_index_x + (cols);
  int index_nw = cols * BLOCK_SIZE * b_index_y + BLOCK_SIZE * b_index_x;

  __shared__ int temp[BLOCK_SIZE + 1][BLOCK_SIZE + 1];
  __shared__ int ref[BLOCK_SIZE][BLOCK_SIZE];
  // Stage the reference scores and tile boundaries into shared memory, as in
  // needle_cuda_shared_1 (only the load order differs).
  for (int ty = 0; ty < BLOCK_SIZE; ty++)
    ref[ty][tx] = referrence[index + cols * ty];
  __syncthreads();

  if (tx == 0)
    temp[tx][0] = matrix_cuda[index_nw];
  temp[tx + 1][0] = matrix_cuda[index_w + cols * tx];
  __syncthreads();

  temp[0][tx + 1] = matrix_cuda[index_n];
  __syncthreads();
  // Forward and backward wavefronts over the tile, identical to the sweep in
  // needle_cuda_shared_1.
  for (int m = 0; m < BLOCK_SIZE; m++) {
    if (tx <= m) {
      int t_index_x = tx + 1;
      int t_index_y = m - tx + 1;
      temp[t_index_y][t_index_x] =
          maximum_device(temp[t_index_y - 1][t_index_x - 1] +
                             ref[t_index_y - 1][t_index_x - 1],
                         temp[t_index_y][t_index_x - 1] - penalty,
                         temp[t_index_y - 1][t_index_x] - penalty);
    }
    __syncthreads();
  }
  for (int m = BLOCK_SIZE - 2; m >= 0; m--) {
    if (tx <= m) {
      int t_index_x = tx + BLOCK_SIZE - m;
      int t_index_y = BLOCK_SIZE - tx;
      temp[t_index_y][t_index_x] =
          maximum_device(temp[t_index_y - 1][t_index_x - 1] +
                             ref[t_index_y - 1][t_index_x - 1],
                         temp[t_index_y][t_index_x - 1] - penalty,
                         temp[t_index_y - 1][t_index_x] - penalty);
    }
    __syncthreads();
  }
  // Write the finished tile back to the global score matrix.
  for (int ty = 0; ty < BLOCK_SIZE; ty++)
    matrix_cuda[index + ty * cols] = temp[ty + 1][tx + 1];
}
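
// Minimal sketch of how a host driver typically launches these kernels,
// assumed from the usual Rodinia NW setup rather than taken from this repo;
// referrence_cuda, matrix_cuda, and max_cols are placeholder names for the
// device buffers and the padded matrix width:
//
//   dim3 dimBlock(BLOCK_SIZE, 1);
//   for (int i = 1; i <= block_width; i++)        // growing anti-diagonals
//     needle_cuda_shared_1<<<dim3(i, 1), dimBlock>>>(
//         referrence_cuda, matrix_cuda, max_cols, penalty, i, block_width);
//   for (int i = block_width - 1; i >= 1; i--)    // shrinking anti-diagonals
//     needle_cuda_shared_2<<<dim3(i, 1), dimBlock>>>(
//         referrence_cuda, matrix_cuda, max_cols, penalty, i, block_width);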