// Listing metadata (not part of the program): 166 lines, 4.3 KiB, plaintext.
#include "needle.h"
|
|
#include <stdio.h>
|
|
|
|
// Shorthand that expands to CUT_BANK_CHECKER(sdata, index).
// NOTE(review): CUT_BANK_CHECKER and `sdata` are not defined in the visible
// source, and nothing visible here uses SDATA — confirm before relying on it.
#define SDATA(index) CUT_BANK_CHECKER(sdata, index)
|
|
|
|
// Returns the largest of the three integers a, b and c.
//
// Device-side helper: it is used by the kernels in this file to pick the best
// of the three candidate scores for a cell.
__device__ int maximum_device(int a, int b, int c) {
  // Larger of the first two values; b is chosen when they are equal.
  const int larger_ab = (b < a) ? a : b;

  // Compare against the third value; c is chosen when they are equal.
  return (c < larger_ab) ? larger_ab : c;
}
|
|
// Fills one BLOCK_SIZE x BLOCK_SIZE tile of the score matrix per thread block.
//
// Each block selects its tile from blockIdx.x and `i`:
//   tile column = blockIdx.x, tile row = i - 1 - blockIdx.x.
// The tile's north-west corner, west column and north row are read from
// matrix_cuda into shared memory, the interior is computed one anti-diagonal
// at a time, and the finished interior is written back to matrix_cuda.
//
// Parameters:
//   referrence  - per-cell values added to the diagonal predecessor's score;
//                 read only, indexed with the same layout as matrix_cuda.
//   matrix_cuda - score matrix with row stride `cols`; read for the tile
//                 borders and overwritten for the tile interior.
//   cols        - row stride (in elements) of both matrices.
//   penalty     - value subtracted from the west and north predecessors.
//   i           - selects which tiles this launch processes (see formula
//                 above); presumably the 1-based index of the current block
//                 anti-diagonal — confirm against the host launch loop.
//   block_width - not read by this kernel.
//
// Launch layout: thread index threadIdx.x is used directly as a row/column
// index into BLOCK_SIZE-sized shared arrays, so this assumes
// blockDim.x == BLOCK_SIZE — TODO confirm at the launch site.
// Shared memory: (BLOCK_SIZE + 1)^2 + BLOCK_SIZE^2 ints, statically allocated.
__global__ void needle_cuda_shared_1(int *referrence, int *matrix_cuda,
                                     int cols, int penalty, int i,
                                     int block_width) {
  const int block_id = blockIdx.x;
  const int lane = threadIdx.x;

  // Tile coordinates (in units of tiles) handled by this block.
  const int tile_col = block_id;
  const int tile_row = i - 1 - block_id;

  // Flat offset of the tile's north-west border cell, and the offsets derived
  // from it: this thread's first interior cell, its north border cell, and the
  // first west border cell.
  const int tile_origin = cols * BLOCK_SIZE * tile_row + BLOCK_SIZE * tile_col;
  const int interior = tile_origin + lane + (cols + 1);
  const int north = tile_origin + lane + (1);
  const int west = tile_origin + (cols);
  const int north_west = tile_origin;

  // score holds the tile plus its one-cell north/west border; sub holds the
  // matching values from referrence for the interior cells.
  __shared__ int score[BLOCK_SIZE + 1][BLOCK_SIZE + 1];
  __shared__ int sub[BLOCK_SIZE][BLOCK_SIZE];

  // A single thread loads the corner cell.
  if (lane == 0) {
    score[0][0] = matrix_cuda[north_west];
  }

  // Each thread loads one column of referrence values for the tile interior.
  for (int row = 0; row < BLOCK_SIZE; ++row) {
    sub[row][lane] = referrence[interior + cols * row];
  }

  __syncthreads();

  // West border: thread `lane` loads the cell left of interior row `lane`.
  score[lane + 1][0] = matrix_cuda[west + cols * lane];

  __syncthreads();

  // North border: thread `lane` loads the cell above interior column `lane`.
  score[0][lane + 1] = matrix_cuda[north];

  __syncthreads();

  // Upper-left half: anti-diagonals grow from 1 cell to BLOCK_SIZE cells.
  // Thread `lane` owns column lane + 1 on every anti-diagonal it takes part in.
  for (int diag = 0; diag < BLOCK_SIZE; ++diag) {
    if (lane <= diag) {
      const int x = lane + 1;
      const int y = diag - lane + 1;

      const int from_diag = score[y - 1][x - 1] + sub[y - 1][x - 1];
      const int from_west = score[y][x - 1] - penalty;
      const int from_north = score[y - 1][x] - penalty;

      score[y][x] = maximum_device(from_diag, from_west, from_north);
    }

    // Every thread reaches the barrier so the next anti-diagonal sees this one.
    __syncthreads();
  }

  // Lower-right half: anti-diagonals shrink from BLOCK_SIZE - 1 cells to 1.
  // Thread `lane` owns row BLOCK_SIZE - lane on each of them.
  for (int diag = BLOCK_SIZE - 2; diag >= 0; --diag) {
    if (lane <= diag) {
      const int x = lane + BLOCK_SIZE - diag;
      const int y = BLOCK_SIZE - lane;

      const int from_diag = score[y - 1][x - 1] + sub[y - 1][x - 1];
      const int from_west = score[y][x - 1] - penalty;
      const int from_north = score[y - 1][x] - penalty;

      score[y][x] = maximum_device(from_diag, from_west, from_north);
    }

    __syncthreads();
  }

  // Write the interior back; each thread stores one column of the tile.
  for (int row = 0; row < BLOCK_SIZE; ++row) {
    matrix_cuda[interior + row * cols] = score[row + 1][lane + 1];
  }
}
|
|
|
|
// Fills one BLOCK_SIZE x BLOCK_SIZE tile of the score matrix per thread block.
//
// Each block selects its tile from blockIdx.x, `i` and `block_width`:
//   tile column = blockIdx.x + block_width - i,
//   tile row    = block_width - blockIdx.x - 1.
// The tile's north-west corner, west column and north row are read from
// matrix_cuda into shared memory, the interior is computed one anti-diagonal
// at a time, and the finished interior is written back to matrix_cuda.
//
// Parameters:
//   referrence  - per-cell values added to the diagonal predecessor's score;
//                 read only, indexed with the same layout as matrix_cuda.
//   matrix_cuda - score matrix with row stride `cols`; read for the tile
//                 borders and overwritten for the tile interior.
//   cols        - row stride (in elements) of both matrices.
//   penalty     - value subtracted from the west and north predecessors.
//   i           - together with block_width, selects which tiles this launch
//                 processes (see formulas above); presumably the number of
//                 tiles on the current anti-diagonal — confirm with the host.
//   block_width - presumably the number of tiles per matrix side — confirm
//                 against the host launch code.
//
// Launch layout: thread index threadIdx.x is used directly as a row/column
// index into BLOCK_SIZE-sized shared arrays, so this assumes
// blockDim.x == BLOCK_SIZE — TODO confirm at the launch site.
// Shared memory: (BLOCK_SIZE + 1)^2 + BLOCK_SIZE^2 ints, statically allocated.
__global__ void needle_cuda_shared_2(int *referrence, int *matrix_cuda,
                                     int cols, int penalty, int i,
                                     int block_width) {
  const int block_id = blockIdx.x;
  const int lane = threadIdx.x;

  // Tile coordinates (in units of tiles) handled by this block.
  const int tile_col = block_id + block_width - i;
  const int tile_row = block_width - block_id - 1;

  // Flat offset of the tile's north-west border cell, and the offsets derived
  // from it: this thread's first interior cell, its north border cell, and the
  // first west border cell.
  const int tile_origin = cols * BLOCK_SIZE * tile_row + BLOCK_SIZE * tile_col;
  const int interior = tile_origin + lane + (cols + 1);
  const int north = tile_origin + lane + (1);
  const int west = tile_origin + (cols);
  const int north_west = tile_origin;

  // score holds the tile plus its one-cell north/west border; sub holds the
  // matching values from referrence for the interior cells.
  __shared__ int score[BLOCK_SIZE + 1][BLOCK_SIZE + 1];
  __shared__ int sub[BLOCK_SIZE][BLOCK_SIZE];

  // Each thread loads one column of referrence values for the tile interior.
  for (int row = 0; row < BLOCK_SIZE; ++row) {
    sub[row][lane] = referrence[interior + cols * row];
  }

  __syncthreads();

  // A single thread loads the corner cell.
  if (lane == 0) {
    score[0][0] = matrix_cuda[north_west];
  }

  // West border: thread `lane` loads the cell left of interior row `lane`.
  score[lane + 1][0] = matrix_cuda[west + cols * lane];

  __syncthreads();

  // North border: thread `lane` loads the cell above interior column `lane`.
  score[0][lane + 1] = matrix_cuda[north];

  __syncthreads();

  // Upper-left half: anti-diagonals grow from 1 cell to BLOCK_SIZE cells.
  // Thread `lane` owns column lane + 1 on every anti-diagonal it takes part in.
  for (int diag = 0; diag < BLOCK_SIZE; ++diag) {
    if (lane <= diag) {
      const int x = lane + 1;
      const int y = diag - lane + 1;

      const int from_diag = score[y - 1][x - 1] + sub[y - 1][x - 1];
      const int from_west = score[y][x - 1] - penalty;
      const int from_north = score[y - 1][x] - penalty;

      score[y][x] = maximum_device(from_diag, from_west, from_north);
    }

    // Every thread reaches the barrier so the next anti-diagonal sees this one.
    __syncthreads();
  }

  // Lower-right half: anti-diagonals shrink from BLOCK_SIZE - 1 cells to 1.
  // Thread `lane` owns row BLOCK_SIZE - lane on each of them.
  for (int diag = BLOCK_SIZE - 2; diag >= 0; --diag) {
    if (lane <= diag) {
      const int x = lane + BLOCK_SIZE - diag;
      const int y = BLOCK_SIZE - lane;

      const int from_diag = score[y - 1][x - 1] + sub[y - 1][x - 1];
      const int from_west = score[y][x - 1] - penalty;
      const int from_north = score[y - 1][x] - penalty;

      score[y][x] = maximum_device(from_diag, from_west, from_north);
    }

    __syncthreads();
  }

  // Write the interior back; each thread stores one column of the tile.
  for (int row = 0; row < BLOCK_SIZE; ++row) {
    matrix_cuda[interior + row * cols] = score[row + 1][lane + 1];
  }
}
|