#define LIMIT -999 #include "needle.h" #include #include #include #include #include #include // includes, kernels #include "needle_kernel.cu" #ifdef TIMING #include "timing.h" struct timeval tv; struct timeval tv_total_start, tv_total_end; struct timeval tv_h2d_start, tv_h2d_end; struct timeval tv_d2h_start, tv_d2h_end; struct timeval tv_kernel_start, tv_kernel_end; struct timeval tv_mem_alloc_start, tv_mem_alloc_end; struct timeval tv_close_start, tv_close_end; float init_time = 0, mem_alloc_time = 0, h2d_time = 0, kernel_time = 0, d2h_time = 0, close_time = 0, total_time = 0; #endif //////////////////////////////////////////////////////////////////////////////// // declaration, forward void runTest(int argc, char **argv); int maximum(int a, int b, int c) { int k; if (a <= b) k = b; else k = a; if (k <= c) return (c); else return (k); } int blosum62[24][24] = {{4, -1, -2, -2, 0, -1, -1, 0, -2, -1, -1, -1, -1, -2, -1, 1, 0, -3, -2, 0, -2, -1, 0, -4}, {-1, 5, 0, -2, -3, 1, 0, -2, 0, -3, -2, 2, -1, -3, -2, -1, -1, -3, -2, -3, -1, 0, -1, -4}, {-2, 0, 6, 1, -3, 0, 0, 0, 1, -3, -3, 0, -2, -3, -2, 1, 0, -4, -2, -3, 3, 0, -1, -4}, {-2, -2, 1, 6, -3, 0, 2, -1, -1, -3, -4, -1, -3, -3, -1, 0, -1, -4, -3, -3, 4, 1, -1, -4}, {0, -3, -3, -3, 9, -3, -4, -3, -3, -1, -1, -3, -1, -2, -3, -1, -1, -2, -2, -1, -3, -3, -2, -4}, {-1, 1, 0, 0, -3, 5, 2, -2, 0, -3, -2, 1, 0, -3, -1, 0, -1, -2, -1, -2, 0, 3, -1, -4}, {-1, 0, 0, 2, -4, 2, 5, -2, 0, -3, -3, 1, -2, -3, -1, 0, -1, -3, -2, -2, 1, 4, -1, -4}, {0, -2, 0, -1, -3, -2, -2, 6, -2, -4, -4, -2, -3, -3, -2, 0, -2, -2, -3, -3, -1, -2, -1, -4}, {-2, 0, 1, -1, -3, 0, 0, -2, 8, -3, -3, -1, -2, -1, -2, -1, -2, -2, 2, -3, 0, 0, -1, -4}, {-1, -3, -3, -3, -1, -3, -3, -4, -3, 4, 2, -3, 1, 0, -3, -2, -1, -3, -1, 3, -3, -3, -1, -4}, {-1, -2, -3, -4, -1, -2, -3, -4, -3, 2, 4, -2, 2, 0, -3, -2, -1, -2, -1, 1, -4, -3, -1, -4}, {-1, 2, 0, -1, -3, 1, 1, -2, -1, -3, -2, 5, -1, -3, -1, 0, -1, -3, -2, -2, 0, 1, -1, -4}, {-1, -1, -2, -3, -1, 0, -2, -3, -2, 1, 2, -1, 5, 0, -2, -1, -1, -1, -1, 1, -3, -1, -1, -4}, {-2, -3, -3, -3, -2, -3, -3, -3, -1, 0, 0, -3, 0, 6, -4, -2, -2, 1, 3, -1, -3, -3, -1, -4}, {-1, -2, -2, -1, -3, -1, -1, -2, -2, -3, -3, -1, -2, -4, 7, -1, -1, -4, -3, -2, -2, -1, -2, -4}, {1, -1, 1, 0, -1, 0, 0, 0, -1, -2, -2, 0, -1, -2, -1, 4, 1, -3, -2, -2, 0, 0, 0, -4}, {0, -1, 0, -1, -1, -1, -1, -2, -2, -1, -1, -1, -1, -2, -1, 1, 5, -2, -2, 0, -1, -1, 0, -4}, {-3, -3, -4, -4, -2, -2, -3, -2, -2, -3, -2, -3, -1, 1, -4, -3, -2, 11, 2, -3, -4, -3, -2, -4}, {-2, -2, -2, -3, -2, -1, -2, -3, 2, -1, -1, -2, -1, 3, -3, -2, -2, 2, 7, -1, -3, -2, -1, -4}, {0, -3, -3, -3, -1, -2, -2, -3, -3, 3, 1, -2, 1, -1, -2, -2, 0, -3, -1, 4, -3, -2, -1, -4}, {-2, -1, 3, 4, -3, 0, 1, -1, 0, -3, -4, 0, -3, -3, -2, 0, -1, -4, -3, -3, 4, 1, -1, -4}, {-1, 0, 0, 1, -3, 3, 4, -2, 0, -3, -3, 1, -1, -3, -1, 0, -1, -3, -2, -2, 1, 4, -1, -4}, {0, -1, -1, -1, -2, -1, -1, -1, -1, -1, -1, -1, -1, -1, -2, 0, 0, -2, -1, -1, -1, -1, -1, -4}, {-4, -4, -4, -4, -4, -4, -4, -4, -4, -4, -4, -4, -4, -4, -4, -4, -4, -4, -4, -4, -4, -4, -4, 1}}; double gettime() { struct timeval t; gettimeofday(&t, NULL); return t.tv_sec + t.tv_usec * 1e-6; } //////////////////////////////////////////////////////////////////////////////// // Program main //////////////////////////////////////////////////////////////////////////////// int main(int argc, char **argv) { cudaSetDevice(0); printf("WG size of kernel = %d \n", BLOCK_SIZE); runTest(argc, argv); return EXIT_SUCCESS; } void usage(int argc, char **argv) { fprintf(stderr, "Usage: %s \n", argv[0]); fprintf(stderr, "\t - x and y dimensions\n"); fprintf(stderr, "\t - penalty(positive integer)\n"); exit(1); } void runTest(int argc, char **argv) { int max_rows, max_cols, penalty; int *input_itemsets, *output_itemsets, *referrence; int *matrix_cuda, *referrence_cuda; int size; // the lengths of the two sequences should be able to divided by 16. // And at current stage max_rows needs to equal max_cols if (argc == 3) { max_rows = atoi(argv[1]); max_cols = atoi(argv[1]); penalty = atoi(argv[2]); } else { usage(argc, argv); } if (atoi(argv[1]) % 16 != 0) { fprintf(stderr, "The dimension values must be a multiple of 16\n"); exit(1); } max_rows = max_rows + 1; max_cols = max_cols + 1; referrence = (int *)malloc(max_rows * max_cols * sizeof(int)); input_itemsets = (int *)malloc(max_rows * max_cols * sizeof(int)); output_itemsets = (int *)malloc(max_rows * max_cols * sizeof(int)); if (!input_itemsets) fprintf(stderr, "error: can not allocate memory"); srand(7); for (int i = 0; i < max_cols; i++) { for (int j = 0; j < max_rows; j++) { input_itemsets[i * max_cols + j] = 0; } } printf("Start Needleman-Wunsch\n"); for (int i = 1; i < max_rows; i++) { // please define your own sequence. input_itemsets[i * max_cols] = rand() % 10 + 1; } for (int j = 1; j < max_cols; j++) { // please define your own sequence. input_itemsets[j] = rand() % 10 + 1; } for (int i = 1; i < max_cols; i++) { for (int j = 1; j < max_rows; j++) { referrence[i * max_cols + j] = blosum62[input_itemsets[i * max_cols]][input_itemsets[j]]; } } for (int i = 1; i < max_rows; i++) input_itemsets[i * max_cols] = -i * penalty; for (int j = 1; j < max_cols; j++) input_itemsets[j] = -j * penalty; size = max_cols * max_rows; cudaMalloc((void **)&referrence_cuda, sizeof(int) * size); cudaMalloc((void **)&matrix_cuda, sizeof(int) * size); cudaMemcpy(referrence_cuda, referrence, sizeof(int) * size, cudaMemcpyHostToDevice); cudaMemcpy(matrix_cuda, input_itemsets, sizeof(int) * size, cudaMemcpyHostToDevice); cudaDeviceSynchronize(); dim3 dimGrid; dim3 dimBlock(BLOCK_SIZE, 1); int block_width = (max_cols - 1) / BLOCK_SIZE; #ifdef TIMING gettimeofday(&tv_kernel_start, NULL); #endif printf("Processing top-left matrix\n"); // process top-left matrix for (int i = 1; i <= block_width; i++) { dimGrid.x = i; dimGrid.y = 1; needle_cuda_shared_1<<>>( referrence_cuda, matrix_cuda, max_cols, penalty, i, block_width); cudaDeviceSynchronize(); } cudaMemcpy(output_itemsets, matrix_cuda, sizeof(int) * size, cudaMemcpyDeviceToHost); cudaDeviceSynchronize(); printf("Processing bottom-right matrix\n"); // process bottom-right matrix for (int i = block_width - 1; i >= 1; i--) { dimGrid.x = i; dimGrid.y = 1; needle_cuda_shared_2<<>>( referrence_cuda, matrix_cuda, max_cols, penalty, i, block_width); cudaDeviceSynchronize(); } #ifdef TIMING gettimeofday(&tv_kernel_end, NULL); tvsub(&tv_kernel_end, &tv_kernel_start, &tv); kernel_time += tv.tv_sec * 1000.0 + (float)tv.tv_usec / 1000.0; #endif cudaMemcpy(output_itemsets, matrix_cuda, sizeof(int) * size, cudaMemcpyDeviceToHost); //#define TRACEBACK FILE *fpo = fopen("result.txt", "w"); fprintf(fpo, "print traceback value GPU:\n"); for (int i = max_rows - 2, j = max_rows - 2; i >= 0, j >= 0;) { int nw, n, w, traceback; if (i == max_rows - 2 && j == max_rows - 2) fprintf(fpo, "%d ", output_itemsets[i * max_cols + j]); // print the first element if (i == 0 && j == 0) break; if (i > 0 && j > 0) { nw = output_itemsets[(i - 1) * max_cols + j - 1]; w = output_itemsets[i * max_cols + j - 1]; n = output_itemsets[(i - 1) * max_cols + j]; } else if (i == 0) { nw = n = LIMIT; w = output_itemsets[i * max_cols + j - 1]; } else if (j == 0) { nw = w = LIMIT; n = output_itemsets[(i - 1) * max_cols + j]; } else { } // traceback = maximum(nw, w, n); int new_nw, new_w, new_n; new_nw = nw + referrence[i * max_cols + j]; new_w = w - penalty; new_n = n - penalty; traceback = maximum(new_nw, new_w, new_n); if (traceback == new_nw) traceback = nw; if (traceback == new_w) traceback = w; if (traceback == new_n) traceback = n; fprintf(fpo, "%d ", traceback); if (traceback == nw) { i--; j--; continue; } else if (traceback == w) { j--; continue; } else if (traceback == n) { i--; continue; } else ; } fclose(fpo); cudaFree(referrence_cuda); cudaFree(matrix_cuda); free(referrence); free(input_itemsets); free(output_itemsets); #ifdef TIMING printf("Exec: %f\n", kernel_time); #endif }