// NOTE(review): the header names on the five original #include lines were lost
// before this file reached review (only bare `#include` tokens survived). The
// list below is reconstructed from the identifiers used in this file -- please
// confirm it against the upstream copy.
#include <math.h>
#include <stdio.h>
#include <stdlib.h>
#include <string.h>

#include <cuda.h>
#include <cuda_runtime.h>

#define MAX_THREADS_PER_BLOCK 512

// Evaluates a CUDA runtime call; on any result other than cudaSuccess prints
// the location and the runtime's error string to stderr and terminates the
// process with a failure status.
#define BFS_CUDA_CHECK(call)                                                 \
  do {                                                                       \
    cudaError_t bfs_err_ = (call);                                           \
    if (bfs_err_ != cudaSuccess) {                                           \
      fprintf(stderr, "CUDA error %s:%d: %s\n", __FILE__, __LINE__,          \
              cudaGetErrorString(bfs_err_));                                 \
      exit(EXIT_FAILURE);                                                    \
    }                                                                        \
  } while (0)

int no_of_nodes;     // node count, read from the first integer of the input
int edge_list_size;  // edge count, read from the input after the node table
FILE *fp;            // handle of the graph file while it is being parsed

// Structure to hold a node information.
// Both fields are filled verbatim from the per-node "%d %d" pair in the input
// file; presumably `starting` is the node's first index into the edge list and
// `no_of_edges` its out-degree -- confirm against kernel.cu.
struct Node {
  int starting;
  int no_of_edges;
};

// The kernels are textually included, so they see Node and
// MAX_THREADS_PER_BLOCK exactly as defined above.
#include "kernel.cu"
#include "kernel2.cu"

void BFSGraph(int argc, char **argv);

// Reads one decimal integer from `in` into `*out`.
// Terminates the process with a diagnostic naming `what` if the read fails.
static void bfsReadInt(FILE *in, int *out, const char *what) {
  if (fscanf(in, "%d", out) != 1) {
    fprintf(stderr, "Malformed graph file: could not read %s\n", what);
    exit(EXIT_FAILURE);
  }
}

// malloc wrapper that never returns NULL: a zero-byte request is rounded up to
// one byte, and an allocation failure terminates the process.
static void *bfsHostAlloc(size_t bytes) {
  void *p = malloc(bytes ? bytes : 1);
  if (!p) {
    fprintf(stderr, "Out of host memory (%lu bytes requested)\n",
            (unsigned long)bytes);
    exit(EXIT_FAILURE);
  }
  return p;
}

////////////////////////////////////////////////////////////////////////////////
// Main Program
////////////////////////////////////////////////////////////////////////////////
int main(int argc, char **argv) {
  BFS_CUDA_CHECK(cudaSetDevice(0));
  no_of_nodes = 0;
  edge_list_size = 0;
  BFSGraph(argc, argv);
  return 0;
}

// Prints the command-line synopsis to stderr. `argc` is unused.
void Usage(int argc, char **argv) {
  (void)argc;
  // The original literal was "Usage: %s \n"; the argument placeholder appears
  // to have been stripped, so it is restored here.
  fprintf(stderr, "Usage: %s <input_file>\n", argv[0]);
}

////////////////////////////////////////////////////////////////////////////////
// Apply BFS on a Graph using CUDA
//
// Expects exactly one command-line argument: the path of a text graph file
// laid out as
//   <node count>
//   <starting> <no_of_edges>      (one pair per node)
//   <source node>
//   <edge count>
//   <id> <cost>                   (one pair per edge)
// Writes one "<index>) cost:<value>" line per node to result.txt, where the
// value is whatever the kernels leave in the cost array (initialised to -1 for
// every node except the source, which starts at 0).
//
// Terminates the process with a failure status on malformed input, allocation
// failure, or any CUDA error.
////////////////////////////////////////////////////////////////////////////////
void BFSGraph(int argc, char **argv) {
  if (argc != 2) {
    Usage(argc, argv);
    exit(0);  // exit status kept from the original for script compatibility
  }
  char *input_f = argv[1];

  printf("Reading File\n");
  // Read in Graph from a file
  fp = fopen(input_f, "r");
  if (!fp) {
    // Message and plain return (so main exits with 0) kept from the original.
    printf("Error Reading graph file\n");
    return;
  }

  int source = 0;
  bfsReadInt(fp, &no_of_nodes, "node count");
  if (no_of_nodes <= 0) {
    // An empty graph would make the source-node writes below run off a
    // zero-length array and would request a zero-thread launch.
    fprintf(stderr, "Malformed graph file: node count must be positive (got %d)\n",
            no_of_nodes);
    exit(EXIT_FAILURE);
  }

  // Make execution Parameters according to the number of nodes
  // Distribute threads across multiple Blocks if necessary
  int num_of_blocks = 1;
  int num_of_threads_per_block = no_of_nodes;
  if (no_of_nodes > MAX_THREADS_PER_BLOCK) {
    // Integer ceiling division; same result as ceil() on the double quotient.
    num_of_blocks =
        (no_of_nodes + MAX_THREADS_PER_BLOCK - 1) / MAX_THREADS_PER_BLOCK;
    num_of_threads_per_block = MAX_THREADS_PER_BLOCK;
  }

  const size_t node_bytes = sizeof(Node) * (size_t)no_of_nodes;
  const size_t flag_bytes = sizeof(bool) * (size_t)no_of_nodes;
  const size_t cost_bytes = sizeof(int) * (size_t)no_of_nodes;

  // allocate host memory
  Node *h_graph_nodes = (Node *)bfsHostAlloc(node_bytes);
  bool *h_graph_mask = (bool *)bfsHostAlloc(flag_bytes);
  bool *h_updating_graph_mask = (bool *)bfsHostAlloc(flag_bytes);
  bool *h_graph_visited = (bool *)bfsHostAlloc(flag_bytes);

  // initialize the memory
  for (int i = 0; i < no_of_nodes; i++) {
    int start, edgeno;
    if (fscanf(fp, "%d %d", &start, &edgeno) != 2) {
      fprintf(stderr, "Malformed graph file: could not read node %d\n", i);
      exit(EXIT_FAILURE);
    }
    h_graph_nodes[i].starting = start;
    h_graph_nodes[i].no_of_edges = edgeno;
    h_graph_mask[i] = false;
    h_updating_graph_mask[i] = false;
    h_graph_visited[i] = false;
  }

  // read the source node from the file
  bfsReadInt(fp, &source, "source node");
  // The value from the file is consumed but deliberately overridden: the
  // traversal always starts at node 0 (behaviour kept from the original).
  source = 0;

  // set the source node as true in the mask
  h_graph_mask[source] = true;
  h_graph_visited[source] = true;

  bfsReadInt(fp, &edge_list_size, "edge count");
  if (edge_list_size < 0) {
    fprintf(stderr, "Malformed graph file: negative edge count (%d)\n",
            edge_list_size);
    exit(EXIT_FAILURE);
  }
  const size_t edge_bytes = sizeof(int) * (size_t)edge_list_size;
  // Allocate at least one element so an edgeless graph never depends on how
  // zero-byte allocations behave; only `edge_bytes` are ever copied.
  const size_t edge_alloc_bytes = edge_bytes ? edge_bytes : sizeof(int);

  int *h_graph_edges = (int *)bfsHostAlloc(edge_alloc_bytes);
  for (int i = 0; i < edge_list_size; i++) {
    int id, cost;
    bfsReadInt(fp, &id, "edge destination");
    bfsReadInt(fp, &cost, "edge cost");  // parsed to advance the stream; unused
    h_graph_edges[i] = id;
  }

  fclose(fp);
  fp = NULL;

  printf("Read File\n");

  // Copy the Node list to device memory
  Node *d_graph_nodes;
  BFS_CUDA_CHECK(cudaMalloc((void **)&d_graph_nodes, node_bytes));
  BFS_CUDA_CHECK(cudaMemcpy(d_graph_nodes, h_graph_nodes, node_bytes,
                            cudaMemcpyHostToDevice));

  // Copy the Edge List to device Memory
  int *d_graph_edges;
  BFS_CUDA_CHECK(cudaMalloc((void **)&d_graph_edges, edge_alloc_bytes));
  BFS_CUDA_CHECK(cudaMemcpy(d_graph_edges, h_graph_edges, edge_bytes,
                            cudaMemcpyHostToDevice));

  // Copy the Mask to device memory
  bool *d_graph_mask;
  BFS_CUDA_CHECK(cudaMalloc((void **)&d_graph_mask, flag_bytes));
  BFS_CUDA_CHECK(cudaMemcpy(d_graph_mask, h_graph_mask, flag_bytes,
                            cudaMemcpyHostToDevice));

  bool *d_updating_graph_mask;
  BFS_CUDA_CHECK(cudaMalloc((void **)&d_updating_graph_mask, flag_bytes));
  BFS_CUDA_CHECK(cudaMemcpy(d_updating_graph_mask, h_updating_graph_mask,
                            flag_bytes, cudaMemcpyHostToDevice));

  // Copy the Visited nodes array to device memory
  bool *d_graph_visited;
  BFS_CUDA_CHECK(cudaMalloc((void **)&d_graph_visited, flag_bytes));
  BFS_CUDA_CHECK(cudaMemcpy(d_graph_visited, h_graph_visited, flag_bytes,
                            cudaMemcpyHostToDevice));

  // allocate mem for the result on host side
  int *h_cost = (int *)bfsHostAlloc(cost_bytes);
  for (int i = 0; i < no_of_nodes; i++) h_cost[i] = -1;
  h_cost[source] = 0;

  // allocate device memory for result
  int *d_cost;
  BFS_CUDA_CHECK(cudaMalloc((void **)&d_cost, cost_bytes));
  BFS_CUDA_CHECK(cudaMemcpy(d_cost, h_cost, cost_bytes, cudaMemcpyHostToDevice));

  // make a bool to check if the execution is over
  bool *d_over;
  BFS_CUDA_CHECK(cudaMalloc((void **)&d_over, sizeof(bool)));

  printf("Copied Everything to GPU memory\n");

  // setup execution parameters
  dim3 grid(num_of_blocks, 1, 1);
  dim3 threads(num_of_threads_per_block, 1, 1);

  int k = 0;
  printf("Start traversing the tree\n");
  bool stop;
  // Call the kernels until a pass leaves the device-side flag false.
  // NOTE(review): the launch configurations were stripped from the original
  // text (`Kernel<<>>`); `<<<grid, threads, 0>>>` is reconstructed from the
  // `grid` / `threads` variables built just above -- confirm against upstream.
  do {
    // if no thread changes this value then the loop stops
    stop = false;
    BFS_CUDA_CHECK(
        cudaMemcpy(d_over, &stop, sizeof(bool), cudaMemcpyHostToDevice));

    Kernel<<<grid, threads, 0>>>(d_graph_nodes, d_graph_edges, d_graph_mask,
                                 d_updating_graph_mask, d_graph_visited, d_cost,
                                 no_of_nodes);
    BFS_CUDA_CHECK(cudaGetLastError());       // launch-configuration errors
    BFS_CUDA_CHECK(cudaDeviceSynchronize());  // errors raised during execution

    Kernel2<<<grid, threads, 0>>>(d_graph_mask, d_updating_graph_mask,
                                  d_graph_visited, d_over, no_of_nodes);
    BFS_CUDA_CHECK(cudaGetLastError());
    BFS_CUDA_CHECK(cudaDeviceSynchronize());

    BFS_CUDA_CHECK(
        cudaMemcpy(&stop, d_over, sizeof(bool), cudaMemcpyDeviceToHost));
    k++;
  } while (stop);

  printf("Kernel Executed %d times\n", k);

  // copy result from device to host
  BFS_CUDA_CHECK(cudaMemcpy(h_cost, d_cost, cost_bytes, cudaMemcpyDeviceToHost));

  // Store the result into a file
  FILE *fpo = fopen("result.txt", "w");
  if (!fpo) {
    perror("result.txt");
    exit(EXIT_FAILURE);
  }
  for (int i = 0; i < no_of_nodes; i++)
    fprintf(fpo, "%d) cost:%d\n", i, h_cost[i]);
  fclose(fpo);
  printf("Result stored in result.txt\n");

  // cleanup memory
  free(h_graph_nodes);
  free(h_graph_edges);
  free(h_graph_mask);
  free(h_updating_graph_mask);
  free(h_graph_visited);
  free(h_cost);
  BFS_CUDA_CHECK(cudaFree(d_graph_nodes));
  BFS_CUDA_CHECK(cudaFree(d_graph_edges));
  BFS_CUDA_CHECK(cudaFree(d_graph_mask));
  BFS_CUDA_CHECK(cudaFree(d_updating_graph_mask));
  BFS_CUDA_CHECK(cudaFree(d_graph_visited));
  BFS_CUDA_CHECK(cudaFree(d_cost));
  BFS_CUDA_CHECK(cudaFree(d_over));  // was never released in the original
}