#include <cuda.h>
#include <math.h>
#include <stdio.h>
#include <stdlib.h>
#include <string.h>

#define MAX_THREADS_PER_BLOCK 512

int no_of_nodes;
int edge_list_size;
FILE *fp;

// Structure to hold a node's information: 'starting' is the offset of the
// node's first edge in the edge list, 'no_of_edges' is how many edges it has
struct Node {
  int starting;
  int no_of_edges;
};

#include "kernel.cu"
#include "kernel2.cu"
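
// kernel.cu / kernel2.cu provide the two device kernels driven by the BFS
// loop below (their definitions live in those files): Kernel() expands the
// current frontier mask into the updating mask and records costs, while
// Kernel2() promotes the updating mask into the new frontier, marks those
// nodes visited, and raises *d_over whenever the frontier is still growing.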

void BFSGraph(int argc, char **argv);

////////////////////////////////////////////////////////////////////////////////
// Main Program
////////////////////////////////////////////////////////////////////////////////
int main(int argc, char **argv) {
  cudaSetDevice(0);
  no_of_nodes = 0;
  edge_list_size = 0;
  BFSGraph(argc, argv);
}

void Usage(int argc, char **argv) {
  fprintf(stderr, "Usage: %s <input_file>\n", argv[0]);
}
////////////////////////////////////////////////////////////////////////////////
// Apply BFS on a Graph using CUDA
////////////////////////////////////////////////////////////////////////////////
void BFSGraph(int argc, char **argv) {

  char *input_f;
  if (argc != 2) {
    Usage(argc, argv);
    exit(0);
  }

  input_f = argv[1];
  printf("Reading File\n");
  // Read in Graph from a file
  fp = fopen(input_f, "r");
  if (!fp) {
    printf("Error Reading graph file\n");
    return;
  }

  int source = 0;

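  // The input file is plain text laid out to match the fscanf calls below:
  // the node count; one "<starting> <no_of_edges>" pair per node; a source
  // node id; the edge-list size; then one "<dest_id> <cost>" pair per edge.
  // The per-edge cost is read but never used, since BFS assigns unit weight
  // to every edge.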
  fscanf(fp, "%d", &no_of_nodes);

  int num_of_blocks = 1;
  int num_of_threads_per_block = no_of_nodes;

  // Make execution parameters according to the number of nodes
  // Distribute threads across multiple blocks if necessary
  if (no_of_nodes > MAX_THREADS_PER_BLOCK) {
    num_of_blocks = (int)ceil(no_of_nodes / (double)MAX_THREADS_PER_BLOCK);
    num_of_threads_per_block = MAX_THREADS_PER_BLOCK;
  }

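  // Host-side working arrays: h_graph_mask marks the current BFS frontier,
  // h_updating_graph_mask collects the nodes discovered during the current
  // iteration, h_graph_visited records nodes already reached, and h_cost
  // (allocated further down) holds each node's distance from the source,
  // -1 until it is reached.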
  // allocate host memory
  Node *h_graph_nodes = (Node *)malloc(sizeof(Node) * no_of_nodes);
  bool *h_graph_mask = (bool *)malloc(sizeof(bool) * no_of_nodes);
  bool *h_updating_graph_mask = (bool *)malloc(sizeof(bool) * no_of_nodes);
  bool *h_graph_visited = (bool *)malloc(sizeof(bool) * no_of_nodes);

  int start, edgeno;
  // initialize the memory
  for (int i = 0; i < no_of_nodes; i++) {
    fscanf(fp, "%d %d", &start, &edgeno);
    h_graph_nodes[i].starting = start;
    h_graph_nodes[i].no_of_edges = edgeno;
    h_graph_mask[i] = false;
    h_updating_graph_mask[i] = false;
    h_graph_visited[i] = false;
  }

  // read the source node from the file
  fscanf(fp, "%d", &source);
  source = 0; // the source read from the file is overridden; node 0 is used

  // set the source node as true in the mask
  h_graph_mask[source] = true;
  h_graph_visited[source] = true;

  fscanf(fp, "%d", &edge_list_size);

  int id, cost;
  int *h_graph_edges = (int *)malloc(sizeof(int) * edge_list_size);
  for (int i = 0; i < edge_list_size; i++) {
    fscanf(fp, "%d", &id);
    fscanf(fp, "%d", &cost);
    h_graph_edges[i] = id;
  }

  if (fp)
    fclose(fp);

  printf("Read File\n");

  // Copy the Node list to device memory
  Node *d_graph_nodes;
  cudaMalloc((void **)&d_graph_nodes, sizeof(Node) * no_of_nodes);
  cudaMemcpy(d_graph_nodes, h_graph_nodes, sizeof(Node) * no_of_nodes,
             cudaMemcpyHostToDevice);

  // Copy the Edge List to device memory
  int *d_graph_edges;
  cudaMalloc((void **)&d_graph_edges, sizeof(int) * edge_list_size);
  cudaMemcpy(d_graph_edges, h_graph_edges, sizeof(int) * edge_list_size,
             cudaMemcpyHostToDevice);

  // Copy the Mask to device memory
  bool *d_graph_mask;
  cudaMalloc((void **)&d_graph_mask, sizeof(bool) * no_of_nodes);
  cudaMemcpy(d_graph_mask, h_graph_mask, sizeof(bool) * no_of_nodes,
             cudaMemcpyHostToDevice);

  bool *d_updating_graph_mask;
  cudaMalloc((void **)&d_updating_graph_mask, sizeof(bool) * no_of_nodes);
  cudaMemcpy(d_updating_graph_mask, h_updating_graph_mask,
             sizeof(bool) * no_of_nodes, cudaMemcpyHostToDevice);

  // Copy the Visited nodes array to device memory
  bool *d_graph_visited;
  cudaMalloc((void **)&d_graph_visited, sizeof(bool) * no_of_nodes);
  cudaMemcpy(d_graph_visited, h_graph_visited, sizeof(bool) * no_of_nodes,
             cudaMemcpyHostToDevice);

  // allocate mem for the result on host side
  int *h_cost = (int *)malloc(sizeof(int) * no_of_nodes);
  for (int i = 0; i < no_of_nodes; i++)
    h_cost[i] = -1;
  h_cost[source] = 0;

  // allocate device memory for result
  int *d_cost;
  cudaMalloc((void **)&d_cost, sizeof(int) * no_of_nodes);
  cudaMemcpy(d_cost, h_cost, sizeof(int) * no_of_nodes, cudaMemcpyHostToDevice);

  // make a bool to check if the execution is over
  bool *d_over;
  cudaMalloc((void **)&d_over, sizeof(bool));

  printf("Copied Everything to GPU memory\n");

  // setup execution parameters
  dim3 grid(num_of_blocks, 1, 1);
  dim3 threads(num_of_threads_per_block, 1, 1);

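  // Note: num_of_blocks * num_of_threads_per_block can exceed no_of_nodes
  // because the node count is rounded up to whole blocks; the kernels are
  // passed no_of_nodes so that out-of-range thread ids can be ignored.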
  int k = 0;
  printf("Start traversing the tree\n");
  bool stop;
  // Call the kernels until every element of the frontier mask is false
  do {
    // if no thread changes this value then the loop stops
    stop = false;
    cudaMemcpy(d_over, &stop, sizeof(bool), cudaMemcpyHostToDevice);

    Kernel<<<grid, threads, 0>>>(d_graph_nodes, d_graph_edges, d_graph_mask,
                                 d_updating_graph_mask, d_graph_visited, d_cost,
                                 no_of_nodes);
    cudaDeviceSynchronize();
    // check if kernel execution generated an error
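    // one possible check, using cudaGetLastError():
    cudaError_t err1 = cudaGetLastError();
    if (err1 != cudaSuccess)
      fprintf(stderr, "Kernel failed: %s\n", cudaGetErrorString(err1));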

    Kernel2<<<grid, threads, 0>>>(d_graph_mask, d_updating_graph_mask,
                                  d_graph_visited, d_over, no_of_nodes);
    cudaDeviceSynchronize();
    // check if kernel execution generated an error
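    cudaError_t err2 = cudaGetLastError();
    if (err2 != cudaSuccess)
      fprintf(stderr, "Kernel2 failed: %s\n", cudaGetErrorString(err2));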

    cudaMemcpy(&stop, d_over, sizeof(bool), cudaMemcpyDeviceToHost);

    k++;
  } while (stop);

  printf("Kernel Executed %d times\n", k);

  // copy result from device to host
  cudaMemcpy(h_cost, d_cost, sizeof(int) * no_of_nodes, cudaMemcpyDeviceToHost);

  // Store the result into a file
  FILE *fpo = fopen("result.txt", "w");
  for (int i = 0; i < no_of_nodes; i++)
    fprintf(fpo, "%d) cost:%d\n", i, h_cost[i]);
  fclose(fpo);
  printf("Result stored in result.txt\n");

  // cleanup memory
  free(h_graph_nodes);
  free(h_graph_edges);
  free(h_graph_mask);
  free(h_updating_graph_mask);
  free(h_graph_visited);
  free(h_cost);

  cudaFree(d_graph_nodes);
  cudaFree(d_graph_edges);
  cudaFree(d_graph_mask);
  cudaFree(d_updating_graph_mask);
  cudaFree(d_graph_visited);
  cudaFree(d_cost);
  cudaFree(d_over);
}