#include "needle.h" #include #define SDATA(index) CUT_BANK_CHECKER(sdata, index) __device__ int maximum_device(int a, int b, int c) { int k; if (a <= b) k = b; else k = a; if (k <= c) return (c); else return (k); } __global__ void needle_cuda_shared_1(int *referrence, int *matrix_cuda, int cols, int penalty, int i, int block_width) { int bx = blockIdx.x; int tx = threadIdx.x; int b_index_x = bx; int b_index_y = i - 1 - bx; int index = cols * BLOCK_SIZE * b_index_y + BLOCK_SIZE * b_index_x + tx + (cols + 1); int index_n = cols * BLOCK_SIZE * b_index_y + BLOCK_SIZE * b_index_x + tx + (1); int index_w = cols * BLOCK_SIZE * b_index_y + BLOCK_SIZE * b_index_x + (cols); int index_nw = cols * BLOCK_SIZE * b_index_y + BLOCK_SIZE * b_index_x; __shared__ int temp[BLOCK_SIZE + 1][BLOCK_SIZE + 1]; __shared__ int ref[BLOCK_SIZE][BLOCK_SIZE]; if (tx == 0) temp[tx][0] = matrix_cuda[index_nw]; for (int ty = 0; ty < BLOCK_SIZE; ty++) ref[ty][tx] = referrence[index + cols * ty]; __syncthreads(); temp[tx + 1][0] = matrix_cuda[index_w + cols * tx]; __syncthreads(); temp[0][tx + 1] = matrix_cuda[index_n]; __syncthreads(); for (int m = 0; m < BLOCK_SIZE; m++) { if (tx <= m) { int t_index_x = tx + 1; int t_index_y = m - tx + 1; temp[t_index_y][t_index_x] = maximum_device(temp[t_index_y - 1][t_index_x - 1] + ref[t_index_y - 1][t_index_x - 1], temp[t_index_y][t_index_x - 1] - penalty, temp[t_index_y - 1][t_index_x] - penalty); } __syncthreads(); } for (int m = BLOCK_SIZE - 2; m >= 0; m--) { if (tx <= m) { int t_index_x = tx + BLOCK_SIZE - m; int t_index_y = BLOCK_SIZE - tx; temp[t_index_y][t_index_x] = maximum_device(temp[t_index_y - 1][t_index_x - 1] + ref[t_index_y - 1][t_index_x - 1], temp[t_index_y][t_index_x - 1] - penalty, temp[t_index_y - 1][t_index_x] - penalty); } __syncthreads(); } for (int ty = 0; ty < BLOCK_SIZE; ty++) matrix_cuda[index + ty * cols] = temp[ty + 1][tx + 1]; } __global__ void needle_cuda_shared_2(int *referrence, int *matrix_cuda, int cols, int penalty, int i, int block_width) { int bx = blockIdx.x; int tx = threadIdx.x; int b_index_x = bx + block_width - i; int b_index_y = block_width - bx - 1; int index = cols * BLOCK_SIZE * b_index_y + BLOCK_SIZE * b_index_x + tx + (cols + 1); int index_n = cols * BLOCK_SIZE * b_index_y + BLOCK_SIZE * b_index_x + tx + (1); int index_w = cols * BLOCK_SIZE * b_index_y + BLOCK_SIZE * b_index_x + (cols); int index_nw = cols * BLOCK_SIZE * b_index_y + BLOCK_SIZE * b_index_x; __shared__ int temp[BLOCK_SIZE + 1][BLOCK_SIZE + 1]; __shared__ int ref[BLOCK_SIZE][BLOCK_SIZE]; for (int ty = 0; ty < BLOCK_SIZE; ty++) ref[ty][tx] = referrence[index + cols * ty]; __syncthreads(); if (tx == 0) temp[tx][0] = matrix_cuda[index_nw]; temp[tx + 1][0] = matrix_cuda[index_w + cols * tx]; __syncthreads(); temp[0][tx + 1] = matrix_cuda[index_n]; __syncthreads(); for (int m = 0; m < BLOCK_SIZE; m++) { if (tx <= m) { int t_index_x = tx + 1; int t_index_y = m - tx + 1; temp[t_index_y][t_index_x] = maximum_device(temp[t_index_y - 1][t_index_x - 1] + ref[t_index_y - 1][t_index_x - 1], temp[t_index_y][t_index_x - 1] - penalty, temp[t_index_y - 1][t_index_x] - penalty); } __syncthreads(); } for (int m = BLOCK_SIZE - 2; m >= 0; m--) { if (tx <= m) { int t_index_x = tx + BLOCK_SIZE - m; int t_index_y = BLOCK_SIZE - tx; temp[t_index_y][t_index_x] = maximum_device(temp[t_index_y - 1][t_index_x - 1] + ref[t_index_y - 1][t_index_x - 1], temp[t_index_y][t_index_x - 1] - penalty, temp[t_index_y - 1][t_index_x] - penalty); } __syncthreads(); } for (int ty = 0; ty < BLOCK_SIZE; ty++) matrix_cuda[index + ty * cols] = temp[ty + 1][tx + 1]; }