302 lines
9.8 KiB
Plaintext
302 lines
9.8 KiB
Plaintext
#define LIMIT -999
|
|
#include "needle.h"
|
|
#include <cuda.h>
|
|
#include <math.h>
|
|
#include <stdio.h>
|
|
#include <stdlib.h>
|
|
#include <string.h>
|
|
#include <sys/time.h>
|
|
|
|
// includes, kernels
|
|
#include "needle_kernel.cu"
|
|
|
|
#ifdef TIMING
|
|
#include "timing.h"
|
|
|
|
struct timeval tv;
|
|
struct timeval tv_total_start, tv_total_end;
|
|
struct timeval tv_h2d_start, tv_h2d_end;
|
|
struct timeval tv_d2h_start, tv_d2h_end;
|
|
struct timeval tv_kernel_start, tv_kernel_end;
|
|
struct timeval tv_mem_alloc_start, tv_mem_alloc_end;
|
|
struct timeval tv_close_start, tv_close_end;
|
|
float init_time = 0, mem_alloc_time = 0, h2d_time = 0, kernel_time = 0,
|
|
d2h_time = 0, close_time = 0, total_time = 0;
|
|
#endif
|
|
|
|
////////////////////////////////////////////////////////////////////////////////
|
|
// declaration, forward
|
|
void runTest(int argc, char **argv);
|
|
|
|
int maximum(int a, int b, int c) {
|
|
|
|
int k;
|
|
if (a <= b)
|
|
k = b;
|
|
else
|
|
k = a;
|
|
|
|
if (k <= c)
|
|
return (c);
|
|
else
|
|
return (k);
|
|
}
|
|
|
|
int blosum62[24][24] = {{4, -1, -2, -2, 0, -1, -1, 0, -2, -1, -1, -1,
|
|
-1, -2, -1, 1, 0, -3, -2, 0, -2, -1, 0, -4},
|
|
{-1, 5, 0, -2, -3, 1, 0, -2, 0, -3, -2, 2,
|
|
-1, -3, -2, -1, -1, -3, -2, -3, -1, 0, -1, -4},
|
|
{-2, 0, 6, 1, -3, 0, 0, 0, 1, -3, -3, 0,
|
|
-2, -3, -2, 1, 0, -4, -2, -3, 3, 0, -1, -4},
|
|
{-2, -2, 1, 6, -3, 0, 2, -1, -1, -3, -4, -1,
|
|
-3, -3, -1, 0, -1, -4, -3, -3, 4, 1, -1, -4},
|
|
{0, -3, -3, -3, 9, -3, -4, -3, -3, -1, -1, -3,
|
|
-1, -2, -3, -1, -1, -2, -2, -1, -3, -3, -2, -4},
|
|
{-1, 1, 0, 0, -3, 5, 2, -2, 0, -3, -2, 1,
|
|
0, -3, -1, 0, -1, -2, -1, -2, 0, 3, -1, -4},
|
|
{-1, 0, 0, 2, -4, 2, 5, -2, 0, -3, -3, 1,
|
|
-2, -3, -1, 0, -1, -3, -2, -2, 1, 4, -1, -4},
|
|
{0, -2, 0, -1, -3, -2, -2, 6, -2, -4, -4, -2,
|
|
-3, -3, -2, 0, -2, -2, -3, -3, -1, -2, -1, -4},
|
|
{-2, 0, 1, -1, -3, 0, 0, -2, 8, -3, -3, -1,
|
|
-2, -1, -2, -1, -2, -2, 2, -3, 0, 0, -1, -4},
|
|
{-1, -3, -3, -3, -1, -3, -3, -4, -3, 4, 2, -3,
|
|
1, 0, -3, -2, -1, -3, -1, 3, -3, -3, -1, -4},
|
|
{-1, -2, -3, -4, -1, -2, -3, -4, -3, 2, 4, -2,
|
|
2, 0, -3, -2, -1, -2, -1, 1, -4, -3, -1, -4},
|
|
{-1, 2, 0, -1, -3, 1, 1, -2, -1, -3, -2, 5,
|
|
-1, -3, -1, 0, -1, -3, -2, -2, 0, 1, -1, -4},
|
|
{-1, -1, -2, -3, -1, 0, -2, -3, -2, 1, 2, -1,
|
|
5, 0, -2, -1, -1, -1, -1, 1, -3, -1, -1, -4},
|
|
{-2, -3, -3, -3, -2, -3, -3, -3, -1, 0, 0, -3,
|
|
0, 6, -4, -2, -2, 1, 3, -1, -3, -3, -1, -4},
|
|
{-1, -2, -2, -1, -3, -1, -1, -2, -2, -3, -3, -1,
|
|
-2, -4, 7, -1, -1, -4, -3, -2, -2, -1, -2, -4},
|
|
{1, -1, 1, 0, -1, 0, 0, 0, -1, -2, -2, 0,
|
|
-1, -2, -1, 4, 1, -3, -2, -2, 0, 0, 0, -4},
|
|
{0, -1, 0, -1, -1, -1, -1, -2, -2, -1, -1, -1,
|
|
-1, -2, -1, 1, 5, -2, -2, 0, -1, -1, 0, -4},
|
|
{-3, -3, -4, -4, -2, -2, -3, -2, -2, -3, -2, -3,
|
|
-1, 1, -4, -3, -2, 11, 2, -3, -4, -3, -2, -4},
|
|
{-2, -2, -2, -3, -2, -1, -2, -3, 2, -1, -1, -2,
|
|
-1, 3, -3, -2, -2, 2, 7, -1, -3, -2, -1, -4},
|
|
{0, -3, -3, -3, -1, -2, -2, -3, -3, 3, 1, -2,
|
|
1, -1, -2, -2, 0, -3, -1, 4, -3, -2, -1, -4},
|
|
{-2, -1, 3, 4, -3, 0, 1, -1, 0, -3, -4, 0,
|
|
-3, -3, -2, 0, -1, -4, -3, -3, 4, 1, -1, -4},
|
|
{-1, 0, 0, 1, -3, 3, 4, -2, 0, -3, -3, 1,
|
|
-1, -3, -1, 0, -1, -3, -2, -2, 1, 4, -1, -4},
|
|
{0, -1, -1, -1, -2, -1, -1, -1, -1, -1, -1, -1,
|
|
-1, -1, -2, 0, 0, -2, -1, -1, -1, -1, -1, -4},
|
|
{-4, -4, -4, -4, -4, -4, -4, -4, -4, -4, -4, -4,
|
|
-4, -4, -4, -4, -4, -4, -4, -4, -4, -4, -4, 1}};
|
|
|
|
double gettime() {
|
|
struct timeval t;
|
|
gettimeofday(&t, NULL);
|
|
return t.tv_sec + t.tv_usec * 1e-6;
|
|
}
|
|
|
|
////////////////////////////////////////////////////////////////////////////////
|
|
// Program main
|
|
////////////////////////////////////////////////////////////////////////////////
|
|
int main(int argc, char **argv) {
|
|
cudaSetDevice(0);
|
|
printf("WG size of kernel = %d \n", BLOCK_SIZE);
|
|
|
|
runTest(argc, argv);
|
|
|
|
return EXIT_SUCCESS;
|
|
}
|
|
|
|
void usage(int argc, char **argv) {
|
|
fprintf(stderr, "Usage: %s <max_rows/max_cols> <penalty> \n", argv[0]);
|
|
fprintf(stderr, "\t<dimension> - x and y dimensions\n");
|
|
fprintf(stderr, "\t<penalty> - penalty(positive integer)\n");
|
|
exit(1);
|
|
}
|
|
|
|
void runTest(int argc, char **argv) {
|
|
int max_rows, max_cols, penalty;
|
|
int *input_itemsets, *output_itemsets, *referrence;
|
|
int *matrix_cuda, *referrence_cuda;
|
|
int size;
|
|
|
|
// the lengths of the two sequences should be able to divided by 16.
|
|
// And at current stage max_rows needs to equal max_cols
|
|
if (argc == 3) {
|
|
max_rows = atoi(argv[1]);
|
|
max_cols = atoi(argv[1]);
|
|
penalty = atoi(argv[2]);
|
|
} else {
|
|
usage(argc, argv);
|
|
}
|
|
|
|
if (atoi(argv[1]) % 16 != 0) {
|
|
fprintf(stderr, "The dimension values must be a multiple of 16\n");
|
|
exit(1);
|
|
}
|
|
|
|
max_rows = max_rows + 1;
|
|
max_cols = max_cols + 1;
|
|
referrence = (int *)malloc(max_rows * max_cols * sizeof(int));
|
|
input_itemsets = (int *)malloc(max_rows * max_cols * sizeof(int));
|
|
output_itemsets = (int *)malloc(max_rows * max_cols * sizeof(int));
|
|
|
|
if (!input_itemsets)
|
|
fprintf(stderr, "error: can not allocate memory");
|
|
|
|
srand(7);
|
|
|
|
for (int i = 0; i < max_cols; i++) {
|
|
for (int j = 0; j < max_rows; j++) {
|
|
input_itemsets[i * max_cols + j] = 0;
|
|
}
|
|
}
|
|
|
|
printf("Start Needleman-Wunsch\n");
|
|
|
|
for (int i = 1; i < max_rows; i++) { // please define your own sequence.
|
|
input_itemsets[i * max_cols] = rand() % 10 + 1;
|
|
}
|
|
for (int j = 1; j < max_cols; j++) { // please define your own sequence.
|
|
input_itemsets[j] = rand() % 10 + 1;
|
|
}
|
|
|
|
for (int i = 1; i < max_cols; i++) {
|
|
for (int j = 1; j < max_rows; j++) {
|
|
referrence[i * max_cols + j] =
|
|
blosum62[input_itemsets[i * max_cols]][input_itemsets[j]];
|
|
}
|
|
}
|
|
|
|
for (int i = 1; i < max_rows; i++)
|
|
input_itemsets[i * max_cols] = -i * penalty;
|
|
for (int j = 1; j < max_cols; j++)
|
|
input_itemsets[j] = -j * penalty;
|
|
|
|
size = max_cols * max_rows;
|
|
cudaMalloc((void **)&referrence_cuda, sizeof(int) * size);
|
|
cudaMalloc((void **)&matrix_cuda, sizeof(int) * size);
|
|
|
|
cudaMemcpy(referrence_cuda, referrence, sizeof(int) * size,
|
|
cudaMemcpyHostToDevice);
|
|
cudaMemcpy(matrix_cuda, input_itemsets, sizeof(int) * size,
|
|
cudaMemcpyHostToDevice);
|
|
cudaDeviceSynchronize();
|
|
|
|
dim3 dimGrid;
|
|
dim3 dimBlock(BLOCK_SIZE, 1);
|
|
int block_width = (max_cols - 1) / BLOCK_SIZE;
|
|
|
|
#ifdef TIMING
|
|
gettimeofday(&tv_kernel_start, NULL);
|
|
#endif
|
|
|
|
printf("Processing top-left matrix\n");
|
|
// process top-left matrix
|
|
for (int i = 1; i <= block_width; i++) {
|
|
dimGrid.x = i;
|
|
dimGrid.y = 1;
|
|
needle_cuda_shared_1<<<dimGrid, dimBlock>>>(
|
|
referrence_cuda, matrix_cuda, max_cols, penalty, i, block_width);
|
|
cudaDeviceSynchronize();
|
|
}
|
|
cudaMemcpy(output_itemsets, matrix_cuda, sizeof(int) * size,
|
|
cudaMemcpyDeviceToHost);
|
|
cudaDeviceSynchronize();
|
|
|
|
printf("Processing bottom-right matrix\n");
|
|
// process bottom-right matrix
|
|
for (int i = block_width - 1; i >= 1; i--) {
|
|
dimGrid.x = i;
|
|
dimGrid.y = 1;
|
|
needle_cuda_shared_2<<<dimGrid, dimBlock>>>(
|
|
referrence_cuda, matrix_cuda, max_cols, penalty, i, block_width);
|
|
cudaDeviceSynchronize();
|
|
}
|
|
|
|
#ifdef TIMING
|
|
gettimeofday(&tv_kernel_end, NULL);
|
|
tvsub(&tv_kernel_end, &tv_kernel_start, &tv);
|
|
kernel_time += tv.tv_sec * 1000.0 + (float)tv.tv_usec / 1000.0;
|
|
#endif
|
|
|
|
cudaMemcpy(output_itemsets, matrix_cuda, sizeof(int) * size,
|
|
cudaMemcpyDeviceToHost);
|
|
|
|
//#define TRACEBACK
|
|
|
|
FILE *fpo = fopen("result.txt", "w");
|
|
fprintf(fpo, "print traceback value GPU:\n");
|
|
|
|
for (int i = max_rows - 2, j = max_rows - 2; i >= 0, j >= 0;) {
|
|
int nw, n, w, traceback;
|
|
if (i == max_rows - 2 && j == max_rows - 2)
|
|
fprintf(fpo, "%d ",
|
|
output_itemsets[i * max_cols + j]); // print the first element
|
|
if (i == 0 && j == 0)
|
|
break;
|
|
if (i > 0 && j > 0) {
|
|
nw = output_itemsets[(i - 1) * max_cols + j - 1];
|
|
w = output_itemsets[i * max_cols + j - 1];
|
|
n = output_itemsets[(i - 1) * max_cols + j];
|
|
} else if (i == 0) {
|
|
nw = n = LIMIT;
|
|
w = output_itemsets[i * max_cols + j - 1];
|
|
} else if (j == 0) {
|
|
nw = w = LIMIT;
|
|
n = output_itemsets[(i - 1) * max_cols + j];
|
|
} else {
|
|
}
|
|
|
|
// traceback = maximum(nw, w, n);
|
|
int new_nw, new_w, new_n;
|
|
new_nw = nw + referrence[i * max_cols + j];
|
|
new_w = w - penalty;
|
|
new_n = n - penalty;
|
|
|
|
traceback = maximum(new_nw, new_w, new_n);
|
|
if (traceback == new_nw)
|
|
traceback = nw;
|
|
if (traceback == new_w)
|
|
traceback = w;
|
|
if (traceback == new_n)
|
|
traceback = n;
|
|
|
|
fprintf(fpo, "%d ", traceback);
|
|
|
|
if (traceback == nw) {
|
|
i--;
|
|
j--;
|
|
continue;
|
|
}
|
|
|
|
else if (traceback == w) {
|
|
j--;
|
|
continue;
|
|
}
|
|
|
|
else if (traceback == n) {
|
|
i--;
|
|
continue;
|
|
}
|
|
|
|
else
|
|
;
|
|
}
|
|
|
|
fclose(fpo);
|
|
|
|
cudaFree(referrence_cuda);
|
|
cudaFree(matrix_cuda);
|
|
|
|
free(referrence);
|
|
free(input_itemsets);
|
|
free(output_itemsets);
|
|
|
|
#ifdef TIMING
|
|
printf("Exec: %f\n", kernel_time);
|
|
#endif
|
|
}
|