// CuPBoP/examples/bfs/bfs.cu
//
// (Listing metadata from the source viewer: 214 lines, 6.1 KiB, Plaintext)

#include <cuda.h>
#include <math.h>
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
// Upper bound on the number of threads per block; BFSGraph uses it to decide
// how many blocks to launch for a given node count.
#define MAX_THREADS_PER_BLOCK 512
// Number of graph nodes. Reset to 0 in main and then read from the input file
// in BFSGraph.
int no_of_nodes;
// Number of entries in the edge list. Reset to 0 in main and then read from
// the input file in BFSGraph.
int edge_list_size;
// Handle of the input graph file; opened and closed inside BFSGraph.
FILE *fp;
// Structure to hold a node information.
// Each node is filled from one "<start> <edgeno>" pair of the input file
// (see BFSGraph).
struct Node {
// First value of the pair. Presumably the offset of this node's first entry
// in the edge array -- the kernels that consume it are not visible here.
int starting;
// Second value of the pair. Presumably the number of consecutive edge-array
// entries that belong to this node -- TODO confirm against kernel.cu.
int no_of_edges;
};
// The kernel sources are textually included rather than linked, so they see
// the macro and the Node type declared above. BFSGraph launches Kernel and
// Kernel2, which are presumably defined in these two files (not visible here).
#include "kernel.cu"
#include "kernel2.cu"
// Forward declaration: BFSGraph is defined after main.
void BFSGraph(int argc, char **argv);
////////////////////////////////////////////////////////////////////////////////
// Main Program
////////////////////////////////////////////////////////////////////////////////
// Program entry point: selects CUDA device 0, resets the global graph sizes
// and runs the BFS driver.
//
// argc, argv: forwarded unchanged to BFSGraph, which expects exactly one
//             argument (the input graph file).
// Returns 0 when BFSGraph returns, EXIT_FAILURE if device 0 cannot be selected.
int main(int argc, char **argv) {
  // A failed device selection would otherwise only show up later as
  // unrelated-looking errors from every subsequent CUDA call.
  cudaError_t status = cudaSetDevice(0);
  if (status != cudaSuccess) {
    fprintf(stderr, "cudaSetDevice(0) failed: %s\n",
            cudaGetErrorString(status));
    return EXIT_FAILURE;
  }

  no_of_nodes = 0;
  edge_list_size = 0;
  BFSGraph(argc, argv);
  return 0;
}
// Prints the command-line synopsis to stderr.
//
// argc, argv: the arguments as received by main; only argv[0] (the program
//             name) is used.
// argv[0] may legally be NULL when argc == 0, and passing NULL to "%s" is
// undefined behaviour, so a generic program name is substituted in that case.
void Usage(int argc, char **argv) {
  const char *program =
      (argc > 0 && argv != NULL && argv[0] != NULL) ? argv[0] : "bfs";
  fprintf(stderr, "Usage: %s <input_file>\n", program);
}
// Terminates the program with a diagnostic when a CUDA runtime call failed.
// `what` names the operation for the error message.
static void bfsCheckCuda(cudaError_t status, const char *what) {
  if (status != cudaSuccess) {
    fprintf(stderr, "CUDA error (%s): %s\n", what, cudaGetErrorString(status));
    exit(EXIT_FAILURE);
  }
}

// Allocates count * elem_size bytes of host memory, terminating the program
// if the allocation fails. `what` names the buffer for the error message.
static void *bfsAlloc(size_t count, size_t elem_size, const char *what) {
  size_t bytes = count * elem_size;
  // malloc(0) may legally return NULL; request one byte so that an empty
  // buffer is not mistaken for an allocation failure.
  void *ptr = malloc(bytes ? bytes : 1);
  if (ptr == NULL) {
    fprintf(stderr, "Out of host memory allocating %s (%zu bytes)\n", what,
            bytes);
    exit(EXIT_FAILURE);
  }
  return ptr;
}

// Reads one decimal integer from `in`, terminating the program if the file is
// truncated or the next token is not a number. `what` names the field.
static int bfsReadInt(FILE *in, const char *what) {
  int value = 0;
  if (fscanf(in, "%d", &value) != 1) {
    fprintf(stderr, "Malformed graph file: could not read %s\n", what);
    exit(EXIT_FAILURE);
  }
  return value;
}

////////////////////////////////////////////////////////////////////////////////
// Apply BFS on a Graph using CUDA
//
// Reads a graph from the file named by argv[1], runs Kernel/Kernel2 on the
// device until the termination flag stays false, and writes one
// "<node>) cost:<value>" line per node to result.txt.
//
// Input file layout, as consumed below (whitespace-separated integers):
//   <no_of_nodes>
//   no_of_nodes pairs   "<start> <edgeno>"
//   <source>            (read, then ignored: the traversal starts at node 0)
//   <edge_list_size>
//   edge_list_size pairs "<id> <cost>"  (only <id> is kept)
//
// argc, argv: command-line arguments; exactly one argument is required,
//             otherwise the usage text is printed and the process exits with
//             status 0 (as the original program did).
// If the input file cannot be opened a message is printed and the function
// returns. Malformed input, allocation failures and CUDA errors terminate the
// process with EXIT_FAILURE.
////////////////////////////////////////////////////////////////////////////////
void BFSGraph(int argc, char **argv) {
  if (argc != 2) {
    Usage(argc, argv);
    exit(0);
  }
  const char *input_f = argv[1];

  printf("Reading File\n");
  // Read in the graph from a file
  fp = fopen(input_f, "r");
  if (!fp) {
    printf("Error Reading graph file\n");
    return;
  }

  no_of_nodes = bfsReadInt(fp, "node count");
  if (no_of_nodes <= 0) {
    // A non-positive count would make the source-node writes below go out of
    // bounds and would request a zero-sized kernel launch.
    fprintf(stderr, "Invalid node count %d in graph file\n", no_of_nodes);
    exit(EXIT_FAILURE);
  }
  const size_t node_count = (size_t)no_of_nodes;

  // Execution parameters: a single block while the nodes fit into
  // MAX_THREADS_PER_BLOCK threads, otherwise ceil(nodes / MAX) full blocks.
  const int num_of_threads_per_block =
      no_of_nodes < MAX_THREADS_PER_BLOCK ? no_of_nodes : MAX_THREADS_PER_BLOCK;
  const int num_of_blocks = (no_of_nodes - 1) / num_of_threads_per_block + 1;

  // Allocate host memory
  Node *h_graph_nodes =
      (Node *)bfsAlloc(node_count, sizeof(Node), "host node array");
  bool *h_graph_mask =
      (bool *)bfsAlloc(node_count, sizeof(bool), "host frontier mask");
  bool *h_updating_graph_mask =
      (bool *)bfsAlloc(node_count, sizeof(bool), "host updating mask");
  bool *h_graph_visited =
      (bool *)bfsAlloc(node_count, sizeof(bool), "host visited array");

  // Initialize the per-node data
  for (int i = 0; i < no_of_nodes; i++) {
    h_graph_nodes[i].starting = bfsReadInt(fp, "node start index");
    h_graph_nodes[i].no_of_edges = bfsReadInt(fp, "node edge count");
    h_graph_mask[i] = false;
    h_updating_graph_mask[i] = false;
    h_graph_visited[i] = false;
  }

  // The source node stored in the file is consumed to advance the stream, but
  // the traversal always starts from node 0.
  (void)bfsReadInt(fp, "source node");
  const int source = 0;

  // Set the source node as true in the mask
  h_graph_mask[source] = true;
  h_graph_visited[source] = true;

  edge_list_size = bfsReadInt(fp, "edge list size");
  if (edge_list_size < 0) {
    fprintf(stderr, "Invalid edge list size %d in graph file\n",
            edge_list_size);
    exit(EXIT_FAILURE);
  }
  const size_t edge_count = (size_t)edge_list_size;

  int *h_graph_edges =
      (int *)bfsAlloc(edge_count, sizeof(int), "host edge list");
  for (int i = 0; i < edge_list_size; i++) {
    h_graph_edges[i] = bfsReadInt(fp, "edge target");
    // The per-edge cost is present in the file but not used.
    (void)bfsReadInt(fp, "edge cost");
  }

  fclose(fp);
  fp = NULL;  // do not leave a dangling handle in the global
  printf("Read File\n");

  // Copy the Node list to device memory
  Node *d_graph_nodes;
  bfsCheckCuda(cudaMalloc((void **)&d_graph_nodes, sizeof(Node) * node_count),
               "cudaMalloc d_graph_nodes");
  bfsCheckCuda(cudaMemcpy(d_graph_nodes, h_graph_nodes,
                          sizeof(Node) * node_count, cudaMemcpyHostToDevice),
               "cudaMemcpy d_graph_nodes");

  // Copy the Edge List to device memory
  int *d_graph_edges;
  bfsCheckCuda(cudaMalloc((void **)&d_graph_edges, sizeof(int) * edge_count),
               "cudaMalloc d_graph_edges");
  bfsCheckCuda(cudaMemcpy(d_graph_edges, h_graph_edges,
                          sizeof(int) * edge_count, cudaMemcpyHostToDevice),
               "cudaMemcpy d_graph_edges");

  // Copy the Mask to device memory
  bool *d_graph_mask;
  bfsCheckCuda(cudaMalloc((void **)&d_graph_mask, sizeof(bool) * node_count),
               "cudaMalloc d_graph_mask");
  bfsCheckCuda(cudaMemcpy(d_graph_mask, h_graph_mask,
                          sizeof(bool) * node_count, cudaMemcpyHostToDevice),
               "cudaMemcpy d_graph_mask");

  bool *d_updating_graph_mask;
  bfsCheckCuda(
      cudaMalloc((void **)&d_updating_graph_mask, sizeof(bool) * node_count),
      "cudaMalloc d_updating_graph_mask");
  bfsCheckCuda(cudaMemcpy(d_updating_graph_mask, h_updating_graph_mask,
                          sizeof(bool) * node_count, cudaMemcpyHostToDevice),
               "cudaMemcpy d_updating_graph_mask");

  // Copy the Visited nodes array to device memory
  bool *d_graph_visited;
  bfsCheckCuda(
      cudaMalloc((void **)&d_graph_visited, sizeof(bool) * node_count),
      "cudaMalloc d_graph_visited");
  bfsCheckCuda(cudaMemcpy(d_graph_visited, h_graph_visited,
                          sizeof(bool) * node_count, cudaMemcpyHostToDevice),
               "cudaMemcpy d_graph_visited");

  // Allocate memory for the result on the host side: -1 everywhere except
  // the source node, which starts at cost 0.
  int *h_cost = (int *)bfsAlloc(node_count, sizeof(int), "host cost array");
  for (int i = 0; i < no_of_nodes; i++)
    h_cost[i] = -1;
  h_cost[source] = 0;

  // Allocate device memory for the result
  int *d_cost;
  bfsCheckCuda(cudaMalloc((void **)&d_cost, sizeof(int) * node_count),
               "cudaMalloc d_cost");
  bfsCheckCuda(cudaMemcpy(d_cost, h_cost, sizeof(int) * node_count,
                          cudaMemcpyHostToDevice),
               "cudaMemcpy d_cost");

  // Device-side flag used to decide whether another iteration is needed
  bool *d_over;
  bfsCheckCuda(cudaMalloc((void **)&d_over, sizeof(bool)),
               "cudaMalloc d_over");

  printf("Copied Everything to GPU memory\n");

  // Set up execution parameters
  dim3 grid(num_of_blocks, 1, 1);
  dim3 threads(num_of_threads_per_block, 1, 1);

  int k = 0;
  printf("Start traversing the tree\n");
  bool stop;
  // Run the kernel pair until the flag read back from the device stays false
  do {
    // If no thread changes this value then the loop stops
    stop = false;
    bfsCheckCuda(
        cudaMemcpy(d_over, &stop, sizeof(bool), cudaMemcpyHostToDevice),
        "cudaMemcpy d_over (reset)");

    Kernel<<<grid, threads, 0>>>(d_graph_nodes, d_graph_edges, d_graph_mask,
                                 d_updating_graph_mask, d_graph_visited, d_cost,
                                 no_of_nodes);
    // Launch-configuration errors are reported by cudaGetLastError; faults
    // during execution are reported by the synchronizing call.
    bfsCheckCuda(cudaGetLastError(), "Kernel launch");
    bfsCheckCuda(cudaDeviceSynchronize(), "Kernel execution");

    Kernel2<<<grid, threads, 0>>>(d_graph_mask, d_updating_graph_mask,
                                  d_graph_visited, d_over, no_of_nodes);
    bfsCheckCuda(cudaGetLastError(), "Kernel2 launch");
    bfsCheckCuda(cudaDeviceSynchronize(), "Kernel2 execution");

    bfsCheckCuda(
        cudaMemcpy(&stop, d_over, sizeof(bool), cudaMemcpyDeviceToHost),
        "cudaMemcpy d_over (read back)");
    k++;
  } while (stop);

  printf("Kernel Executed %d times\n", k);

  // Copy the result from device to host
  bfsCheckCuda(cudaMemcpy(h_cost, d_cost, sizeof(int) * node_count,
                          cudaMemcpyDeviceToHost),
               "cudaMemcpy h_cost");

  // Store the result into a file
  FILE *fpo = fopen("result.txt", "w");
  if (fpo) {
    for (int i = 0; i < no_of_nodes; i++)
      fprintf(fpo, "%d) cost:%d\n", i, h_cost[i]);
    fclose(fpo);
    printf("Result stored in result.txt\n");
  } else {
    // Keep going so that the buffers below are still released.
    fprintf(stderr, "Error: could not open result.txt for writing\n");
  }

  // Clean up host memory
  free(h_graph_nodes);
  free(h_graph_edges);
  free(h_graph_mask);
  free(h_updating_graph_mask);
  free(h_graph_visited);
  free(h_cost);

  // Clean up device memory (including the termination flag)
  bfsCheckCuda(cudaFree(d_graph_nodes), "cudaFree d_graph_nodes");
  bfsCheckCuda(cudaFree(d_graph_edges), "cudaFree d_graph_edges");
  bfsCheckCuda(cudaFree(d_graph_mask), "cudaFree d_graph_mask");
  bfsCheckCuda(cudaFree(d_updating_graph_mask),
               "cudaFree d_updating_graph_mask");
  bfsCheckCuda(cudaFree(d_graph_visited), "cudaFree d_graph_visited");
  bfsCheckCuda(cudaFree(d_cost), "cudaFree d_cost");
  bfsCheckCuda(cudaFree(d_over), "cudaFree d_over");
}