#include <cuda.h>
#include <math.h>
#include <stdio.h>
#include <stdlib.h>
#include <string.h>

// Upper bound on the number of threads per block used by BFSGraph when it
// builds the kernel launch configuration; graphs with more nodes than this
// are spread over several blocks of exactly this many threads.
#define MAX_THREADS_PER_BLOCK 512

// Number of graph nodes; reset to 0 in main and read from the input file in
// BFSGraph.
int no_of_nodes;
// Number of entries in the edge list; reset to 0 in main and read from the
// input file in BFSGraph.
int edge_list_size;
// Handle of the graph input file; opened and closed inside BFSGraph.
FILE *fp;

// Per-node record. BFSGraph fills one Node per graph node from a pair of
// integers in the input file and copies the whole array to the device.
struct Node {
  // First integer of the pair; presumably the index of this node's first
  // entry in the edge list -- confirm against kernel.cu.
  int starting;
  // Second integer of the pair; presumably the number of consecutive edge-list
  // entries that belong to this node -- confirm against kernel.cu.
  int no_of_edges;
};

#include "kernel.cu"
#include "kernel2.cu"

// Forward declaration so that main can call BFSGraph before its definition.
// Receives the unmodified command-line arguments of the program.
void BFSGraph(int argc, char **argv);

////////////////////////////////////////////////////////////////////////////////
// Main Program
////////////////////////////////////////////////////////////////////////////////
/**
 * Program entry point.
 *
 * Selects CUDA device 0, resets the global graph counters and hands the
 * command line over to BFSGraph, which performs all remaining work.
 *
 * @param argc number of command-line arguments
 * @param argv command-line arguments; forwarded unchanged to BFSGraph
 * @return EXIT_FAILURE when device 0 cannot be selected, EXIT_SUCCESS when
 *         BFSGraph returns
 */
int main(int argc, char **argv) {
  // Report a missing or unusable device here instead of letting a later,
  // unrelated CUDA call fail with a confusing error.
  cudaError_t status = cudaSetDevice(0);
  if (status != cudaSuccess) {
    fprintf(stderr, "cudaSetDevice(0) failed: %s\n",
            cudaGetErrorString(status));
    return EXIT_FAILURE;
  }

  no_of_nodes = 0;
  edge_list_size = 0;
  BFSGraph(argc, argv);
  return EXIT_SUCCESS;
}

// Writes the one-line command synopsis to stderr, using argv[0] as the
// program name. argc is accepted for signature compatibility and is not read.
void Usage(int argc, char **argv) {
  const char *program_name = argv[0];
  fprintf(stderr, "Usage: %s <input_file>\n", program_name);
}
////////////////////////////////////////////////////////////////////////////////
// Apply BFS on a Graph using CUDA
////////////////////////////////////////////////////////////////////////////////
// Terminates the program with a diagnostic when a CUDA runtime call did not
// return cudaSuccess. `what` names the failing operation in the message.
static void bfsCheckCuda(cudaError_t status, const char *what) {
  if (status != cudaSuccess) {
    fprintf(stderr, "CUDA error (%s): %s\n", what, cudaGetErrorString(status));
    exit(EXIT_FAILURE);
  }
}

// Prints `message` to stderr and terminates the program with a failure code.
static void bfsFail(const char *message) {
  fprintf(stderr, "%s\n", message);
  exit(EXIT_FAILURE);
}

// Allocates count * elem_size bytes of host memory and terminates on overflow
// or allocation failure. Always returns a non-NULL pointer.
static void *bfsHostAlloc(size_t count, size_t elem_size) {
  if (elem_size != 0 && count > (size_t)-1 / elem_size)
    bfsFail("Host allocation size overflows size_t");
  size_t bytes = count * elem_size;
  // malloc(0) may legally return NULL; request one byte so that NULL always
  // means failure.
  void *block = malloc(bytes ? bytes : 1);
  if (!block)
    bfsFail("Out of host memory");
  return block;
}

// Reads one decimal integer from `in` and terminates with a message naming
// `what` when the file is truncated or malformed.
static int bfsReadInt(FILE *in, const char *what) {
  int value = 0;
  if (fscanf(in, "%d", &value) != 1) {
    fprintf(stderr, "Malformed graph file: could not read %s\n", what);
    exit(EXIT_FAILURE);
  }
  return value;
}

// Allocates a device buffer and fills it with `bytes` bytes from `host`.
// Terminates on any CUDA error. Returns the device pointer.
static void *bfsUpload(const void *host, size_t bytes, const char *what) {
  void *device = NULL;
  // Never request a zero-sized device allocation (empty edge list).
  bfsCheckCuda(cudaMalloc(&device, bytes ? bytes : 1), what);
  bfsCheckCuda(cudaMemcpy(device, host, bytes, cudaMemcpyHostToDevice), what);
  return device;
}

/**
 * Reads a graph from the file named on the command line, runs the BFS kernels
 * until the device-side "continue" flag stays false, and writes the per-node
 * cost array to result.txt.
 *
 * Input file layout, as consumed below (whitespace-separated integers):
 *   node count
 *   one (starting, no_of_edges) pair per node
 *   source node id   -- read but ignored; the traversal always starts at 0
 *   edge count
 *   one (id, cost) pair per edge -- only id is kept
 *
 * Any usage error, I/O error, malformed input, allocation failure or CUDA
 * error prints a diagnostic to stderr and terminates the process with
 * EXIT_FAILURE.
 *
 * @param argc must be 2 (program name plus input file)
 * @param argv argv[1] is the path of the graph file
 */
void BFSGraph(int argc, char **argv) {
  if (argc != 2) {
    Usage(argc, argv);
    exit(EXIT_FAILURE);
  }

  const char *input_f = argv[1];
  printf("Reading File\n");
  // Read in Graph from a file
  fp = fopen(input_f, "r");
  if (!fp)
    bfsFail("Error Reading graph file");

  no_of_nodes = bfsReadInt(fp, "node count");
  // Node 0 is used as the source below, so at least one node must exist.
  if (no_of_nodes <= 0)
    bfsFail("Malformed graph file: node count must be positive");

  // Make execution parameters according to the number of nodes: a single
  // block with one thread per node, or several full blocks when the graph is
  // larger than one block.
  int num_of_blocks = 1;
  int num_of_threads_per_block = no_of_nodes;
  if (no_of_nodes > MAX_THREADS_PER_BLOCK) {
    // Integer ceil-div written so that it cannot overflow for large counts.
    num_of_blocks = (no_of_nodes - 1) / MAX_THREADS_PER_BLOCK + 1;
    num_of_threads_per_block = MAX_THREADS_PER_BLOCK;
  }

  // allocate host memory
  const size_t node_count = (size_t)no_of_nodes;
  Node *h_graph_nodes = (Node *)bfsHostAlloc(node_count, sizeof(Node));
  bool *h_graph_mask = (bool *)bfsHostAlloc(node_count, sizeof(bool));
  bool *h_updating_graph_mask = (bool *)bfsHostAlloc(node_count, sizeof(bool));
  bool *h_graph_visited = (bool *)bfsHostAlloc(node_count, sizeof(bool));

  // initialize the memory
  for (int i = 0; i < no_of_nodes; i++) {
    h_graph_nodes[i].starting = bfsReadInt(fp, "node edge offset");
    h_graph_nodes[i].no_of_edges = bfsReadInt(fp, "node edge count");
    h_graph_mask[i] = false;
    h_updating_graph_mask[i] = false;
    h_graph_visited[i] = false;
  }

  // The file carries a source node id, but the traversal has always started
  // from node 0; the value is consumed only to advance the stream.
  (void)bfsReadInt(fp, "source node");
  const int source = 0;

  // set the source node as true in the mask
  h_graph_mask[source] = true;
  h_graph_visited[source] = true;

  edge_list_size = bfsReadInt(fp, "edge count");
  if (edge_list_size < 0)
    bfsFail("Malformed graph file: edge count must not be negative");

  const size_t edge_count = (size_t)edge_list_size;
  int *h_graph_edges = (int *)bfsHostAlloc(edge_count, sizeof(int));
  for (int i = 0; i < edge_list_size; i++) {
    h_graph_edges[i] = bfsReadInt(fp, "edge destination");
    // The per-edge cost is part of the file format but is not used.
    (void)bfsReadInt(fp, "edge cost");
  }

  fclose(fp);
  fp = NULL;

  printf("Read File\n");

  // Copy the node list, edge list, masks and visited array to device memory
  Node *d_graph_nodes = (Node *)bfsUpload(
      h_graph_nodes, sizeof(Node) * node_count, "upload graph nodes");
  int *d_graph_edges = (int *)bfsUpload(
      h_graph_edges, sizeof(int) * edge_count, "upload graph edges");
  bool *d_graph_mask = (bool *)bfsUpload(
      h_graph_mask, sizeof(bool) * node_count, "upload graph mask");
  bool *d_updating_graph_mask =
      (bool *)bfsUpload(h_updating_graph_mask, sizeof(bool) * node_count,
                        "upload updating graph mask");
  bool *d_graph_visited = (bool *)bfsUpload(
      h_graph_visited, sizeof(bool) * node_count, "upload visited array");

  // allocate mem for the result on host side: -1 everywhere except the source
  int *h_cost = (int *)bfsHostAlloc(node_count, sizeof(int));
  for (int i = 0; i < no_of_nodes; i++)
    h_cost[i] = -1;
  h_cost[source] = 0;

  // allocate device memory for result
  int *d_cost =
      (int *)bfsUpload(h_cost, sizeof(int) * node_count, "upload cost array");

  // device-side flag used to decide whether another iteration is needed
  bool *d_over = NULL;
  bfsCheckCuda(cudaMalloc((void **)&d_over, sizeof(bool)),
               "allocate stop flag");

  printf("Copied Everything to GPU memory\n");

  // setup execution parameters
  dim3 grid(num_of_blocks, 1, 1);
  dim3 threads(num_of_threads_per_block, 1, 1);

  int k = 0;
  printf("Start traversing the tree\n");
  bool stop;
  // Call the kernels until the flag read back from the device stays false
  do {
    // if no thread changes this value then the loop stops
    stop = false;
    bfsCheckCuda(
        cudaMemcpy(d_over, &stop, sizeof(bool), cudaMemcpyHostToDevice),
        "reset stop flag");

    Kernel<<<grid, threads, 0>>>(d_graph_nodes, d_graph_edges, d_graph_mask,
                                 d_updating_graph_mask, d_graph_visited, d_cost,
                                 no_of_nodes);
    // Launch-configuration errors are only visible through cudaGetLastError;
    // faults during execution surface at the synchronization point.
    bfsCheckCuda(cudaGetLastError(), "Kernel launch");
    bfsCheckCuda(cudaDeviceSynchronize(), "Kernel execution");

    Kernel2<<<grid, threads, 0>>>(d_graph_mask, d_updating_graph_mask,
                                  d_graph_visited, d_over, no_of_nodes);
    bfsCheckCuda(cudaGetLastError(), "Kernel2 launch");
    bfsCheckCuda(cudaDeviceSynchronize(), "Kernel2 execution");

    bfsCheckCuda(
        cudaMemcpy(&stop, d_over, sizeof(bool), cudaMemcpyDeviceToHost),
        "read stop flag");

    k++;
  } while (stop);

  printf("Kernel Executed %d times\n", k);

  // copy result from device to host
  bfsCheckCuda(cudaMemcpy(h_cost, d_cost, sizeof(int) * node_count,
                          cudaMemcpyDeviceToHost),
               "download cost array");

  // Store the result into a file
  FILE *fpo = fopen("result.txt", "w");
  if (!fpo)
    bfsFail("Error opening result.txt for writing");
  for (int i = 0; i < no_of_nodes; i++)
    fprintf(fpo, "%d) cost:%d\n", i, h_cost[i]);
  // fclose flushes buffered output, so a failure here means lost results.
  if (fclose(fpo) != 0)
    bfsFail("Error writing result.txt");
  printf("Result stored in result.txt\n");

  // cleanup memory
  free(h_graph_nodes);
  free(h_graph_edges);
  free(h_graph_mask);
  free(h_updating_graph_mask);
  free(h_graph_visited);
  free(h_cost);

  bfsCheckCuda(cudaFree(d_graph_nodes), "free graph nodes");
  bfsCheckCuda(cudaFree(d_graph_edges), "free graph edges");
  bfsCheckCuda(cudaFree(d_graph_mask), "free graph mask");
  bfsCheckCuda(cudaFree(d_updating_graph_mask), "free updating graph mask");
  bfsCheckCuda(cudaFree(d_graph_visited), "free visited array");
  bfsCheckCuda(cudaFree(d_cost), "free cost array");
  bfsCheckCuda(cudaFree(d_over), "free stop flag");
}