fix bug in CI

2022-06-20 23:03:01 -04:00 · 2022-06-20 23:03:01 -04:00 · c1045d8140
parent 2618bd21a7
commit c1045d8140
4 changed files with 59 additions and 117 deletions
--- a/.github/workflows/build.yml
+++ b/.github/workflows/build.yml
@ -37,7 +37,7 @@ jobs:
        run: |
          cd ${{ github.workspace }}
          cd runtime/threadPool
-          git clone git@github.com:cameron314/concurrentqueue.git
+          git clone https://github.com/cameron314/concurrentqueue.git
      - name: Build project
        run: |
          mkdir build
--- a/runtime/threadPool/CMakeLists.txt
+++ b/runtime/threadPool/CMakeLists.txt
@ -17,4 +17,4 @@ include_directories(./concurrentqueue)

 file(GLOB proj_SOURCES "src/x86/*.cpp")
 add_library(${LIB_NAME} SHARED ${proj_SOURCES})
- #include "blockingconcurrentqueue.h"
+# include "blockingconcurrentqueue.h"
--- a/runtime/threadPool/include/x86/structures.h
+++ b/runtime/threadPool/include/x86/structures.h
@ -1,18 +1,16 @@
 #ifndef C_STRUCTURES_H
 #define C_STRUCTURES_H

+#include "blockingconcurrentqueue.h"
 #include "cuda_runtime.h"
 #include "pthread.h"
-#include "blockingconcurrentqueue.h"

-typedef struct device
-{
+typedef struct device {
  int max_compute_units;
  int device_id;
 } cu_device;

-typedef struct c_thread
-{
+typedef struct c_thread {
  pthread_t thread;
  unsigned long executed_commands;
  unsigned index;
@ -21,8 +19,7 @@ typedef struct c_thread
 } cu_ptd;

 // kernel information
-typedef struct kernel
-{
+typedef struct kernel {

  void *(*start_routine)(void *);

@ -46,15 +43,16 @@ typedef struct kernel
  int startBlockId;
  int endBlockId;

-  kernel(const kernel &obj) : start_routine(obj.start_routine), args(obj.args),
-                              shared_mem(obj.shared_mem), blockSize(obj.blockSize),
-                              gridDim(obj.gridDim), blockDim(obj.blockDim), totalBlocks(obj.totalBlocks) {}
+  kernel(const kernel &obj)
+      : start_routine(obj.start_routine), args(obj.args),
+        shared_mem(obj.shared_mem), blockSize(obj.blockSize),
+        gridDim(obj.gridDim), blockDim(obj.blockDim),
+        totalBlocks(obj.totalBlocks) {}
 } cu_kernel;

 using kernel_queue = moodycamel::BlockingConcurrentQueue<kernel *>;

-typedef struct scheduler_pool
-{
+typedef struct scheduler_pool {

  struct c_thread *thread_pool;

@ -77,8 +75,7 @@ typedef struct scheduler_pool

 } cu_pool;

-typedef struct command
-{
+typedef struct command {

  struct kernel *ker;

@ -87,16 +84,14 @@ typedef struct command

 } cu_command;

-typedef struct argument
-{
+typedef struct argument {
  // size of the argument to allocation
  size_t size;
  void *value;
  unsigned int index;
 } cu_argument;

-typedef struct input_arg
-{
+typedef struct input_arg {
  // real values for the input
  char *p;
  struct argument *argus[];
@ -105,16 +100,14 @@ typedef struct input_arg
  // so that we can parse the arguments p
 } cu_input;

-enum StreamType
-{
+enum StreamType {
  DEFAULT,
  LOW,
  HIGH,
  EXT,
 };

-struct cStreamDataInternal
-{
+struct cStreamDataInternal {
  /*
      status of the stream (run , wait)
      Run: Stream will asynchronously assign the kernel assign with this stream
@ -129,8 +122,7 @@ struct cStreamDataInternal
  unsigned int count; // number of task left in the stream
 };

-typedef struct streamData
-{
+typedef struct streamData {

  // execution status of current event monitor
  struct cStreamDataInternal ev;
@ -143,8 +135,7 @@ typedef struct streamData

 } cstreamData;

-typedef struct asyncKernel
-{
+typedef struct asyncKernel {
  unsigned int numBlocks;
  unsigned int numThreads;
  struct event *evt;
@ -157,20 +148,17 @@ typedef struct asyncKernel

 // command queue of command nodes

-typedef struct kernel_arg_array
-{
+typedef struct kernel_arg_array {
  size_t size;
  unsigned int index;
 } karg_arr;

-typedef struct kernel_image_arg
-{
+typedef struct kernel_image_arg {
  size_t size;
  unsigned int index;
 } k_arg;

-typedef struct callParams
-{
+typedef struct callParams {
  dim3 gridDim;
  dim3 blockDim;
  size_t shareMem;
--- a/runtime/threadPool/src/x86/api.cpp
+++ b/runtime/threadPool/src/x86/api.cpp
@ -1,4 +1,5 @@
 #include "api.h"
+#include "blockingconcurrentqueue.h"
 #include "def.h"
 #include "macros.h"
 #include "structures.h"
@ -6,7 +7,6 @@
 #include <stdio.h>
 #include <stdlib.h>
 #include <thread>
-#include "blockingconcurrentqueue.h"

 /*

@ -17,8 +17,7 @@
 Initialize the device
 */
 int device_max_compute_units = 1;
-int init_device()
-{
+int init_device() {
  cu_device *device = (cu_device *)calloc(1, sizeof(cu_device));
  if (device == NULL)
    return C_ERROR_MEMALLOC;
@ -39,8 +38,7 @@ int init_device()
 // Create Kernel
 static int kernelIds = 0;
 cu_kernel *create_kernel(const void *func, dim3 gridDim, dim3 blockDim,
-                         void **args, size_t sharedMem, cudaStream_t stream)
-{
+                         void **args, size_t sharedMem, cudaStream_t stream) {
  cu_kernel *ker = (cu_kernel *)calloc(1, sizeof(cu_kernel));

  // set the function pointer
@ -77,54 +75,24 @@ __thread int block_index_z = 0;
 __thread int thread_memory_size = 0;
 __thread int *dynamic_shared_memory = NULL;
 __thread int warp_shfl[32] = {
-    0,
-    0,
-    0,
-    0,
-    0,
-    0,
-    0,
-    0,
-    0,
-    0,
-    0,
-    0,
-    0,
-    0,
-    0,
-    0,
-    0,
-    0,
-    0,
-    0,
-    0,
-    0,
-    0,
-    0,
-    0,
-    0,
-    0,
-    0,
-    0,
-    0,
-    0,
-    0,
+    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
+    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
 };

 /*
    Enqueue Kernel (k) to the scheduler kernelQueue
 */
-int schedulerEnqueueKernel(cu_kernel *k)
-{
-  int totalBlocks = k->totalBlocks; // calculate gpu_block_to_execute_per_cpu_thread
+int schedulerEnqueueKernel(cu_kernel *k) {
+  int totalBlocks =
+      k->totalBlocks; // calculate gpu_block_to_execute_per_cpu_thread
  int gpuBlockToExecutePerCpuThread =
-      (totalBlocks + device_max_compute_units - 1) /
-      device_max_compute_units;
-  for (int startBlockIdx = 0; startBlockIdx < totalBlocks; startBlockIdx += gpuBlockToExecutePerCpuThread)
-  {
+      (totalBlocks + device_max_compute_units - 1) / device_max_compute_units;
+  for (int startBlockIdx = 0; startBlockIdx < totalBlocks;
+       startBlockIdx += gpuBlockToExecutePerCpuThread) {
    cu_kernel *p = new cu_kernel(*k);
    p->startBlockId = startBlockIdx;
-    p->endBlockId = std::min(startBlockIdx + gpuBlockToExecutePerCpuThread - 1, totalBlocks - 1);
+    p->endBlockId = std::min(startBlockIdx + gpuBlockToExecutePerCpuThread - 1,
+                             totalBlocks - 1);
    scheduler->kernelQueue->enqueue(p);
  }

@ -136,18 +104,14 @@ int schedulerEnqueueKernel(cu_kernel *k)
 /*
  Kernel Launch with numBlocks and numThreadsPerBlock
 */
-int cuLaunchKernel(cu_kernel **k)
-{
+int cuLaunchKernel(cu_kernel **k) {
  // Calculate Block Size N/numBlocks
  cu_kernel *ker = *k;
  int status = C_RUN;
  // stream == 0 add to the kernelQueue
-  if (ker->stream == 0)
-  {
+  if (ker->stream == 0) {
    schedulerEnqueueKernel(ker);
-  }
-  else
-  {
+  } else {
    printf("MultiStream no implemente\n");
    exit(1);
  }
@ -157,19 +121,17 @@ int cuLaunchKernel(cu_kernel **k)
 /*
    Thread Gets Work
 */
-int get_work(c_thread *th)
-{
+int get_work(c_thread *th) {
  int dynamic_shared_mem_size = 0;
  dim3 gridDim;
  dim3 blockDim;
-  while (true)
-  {
+  while (true) {
    // try to get a task from the queue
    cu_kernel *k;
    th->busy = false;
-    bool getTask = scheduler->kernelQueue->wait_dequeue_timed(k, std::chrono::milliseconds(5));
-    if (getTask)
-    {
+    bool getTask = scheduler->kernelQueue->wait_dequeue_timed(
+        k, std::chrono::milliseconds(5));
+    if (getTask) {
      th->busy = true;
      // set runtime configuration
      gridDim = k->gridDim;
@ -185,8 +147,8 @@ int get_work(c_thread *th)
      if (dynamic_shared_mem_size > 0)
        dynamic_shared_memory = (int *)malloc(dynamic_shared_mem_size);
      // execute GPU blocks
-      for (block_index = k->startBlockId; block_index < k->endBlockId + 1; block_index++)
-      {
+      for (block_index = k->startBlockId; block_index < k->endBlockId + 1;
+           block_index++) {
        int tmp = block_index;
        block_index_x = tmp / (grid_size_y * grid_size_z);
        tmp = tmp % (grid_size_y * grid_size_z);
@ -197,8 +159,7 @@ int get_work(c_thread *th)
      }
    }
    // if cannot get tasks, check whether programs stop
-    else if (scheduler->threadpool_shutdown_requested)
-    {
+    else if (scheduler->threadpool_shutdown_requested) {
      th->busy = false;
      return true; // thread exit
    }
@ -206,8 +167,7 @@ int get_work(c_thread *th)
  return 0;
 }

-void *driver_thread(void *p)
-{
+void *driver_thread(void *p) {
  struct c_thread *td = (struct c_thread *)p;
  int is_exit = 0;
  td->exit = false;
@ -216,14 +176,11 @@ void *driver_thread(void *p)
  is_exit = get_work(td);

  // exit the routine
-  if (is_exit)
-  {
+  if (is_exit) {
    td->exit = true;
    // pthread_exit
    pthread_exit(NULL);
-  }
-  else
-  {
+  } else {
    printf("driver thread stop incorrectly\n");
    exit(1);
  }
@ -232,8 +189,7 @@ void *driver_thread(void *p)
 /*
 Initialize the scheduler
 */
-int scheduler_init(cu_device device)
-{
+int scheduler_init(cu_device device) {
  scheduler = (cu_pool *)calloc(1, sizeof(cu_pool));
  scheduler->num_worker_threads = device.max_compute_units;
  scheduler->num_kernel_queued = 0;
@ -243,8 +199,7 @@ int scheduler_init(cu_device device)
  scheduler->kernelQueue = new kernel_queue;

  scheduler->idle_threads = 0;
-  for (int i = 0; i < scheduler->num_worker_threads; i++)
-  {
+  for (int i = 0; i < scheduler->num_worker_threads; i++) {
    scheduler->thread_pool[i].index = i;
    pthread_create(&scheduler->thread_pool[i].thread, NULL, driver_thread,
                   (void *)&scheduler->thread_pool[i]);
@ -253,8 +208,7 @@ int scheduler_init(cu_device device)
  return C_SUCCESS;
 }

-void scheduler_uninit()
-{
+void scheduler_uninit() {
  printf("Scheduler Unitit no Implemente\n");
  exit(1);
 }
@ -271,17 +225,17 @@ void scheduler_uninit()
  Sense Like Barrier
  Counting Barrier basically
 */
-void cuSynchronizeBarrier()
-{
-  while(1) {
+void cuSynchronizeBarrier() {
+  while (1) {
    // sync is complete, only if queue size == 0 and none of
    // driver threads are busy
-    if(scheduler->kernelQueue->size_approx() == 0) {
+    if (scheduler->kernelQueue->size_approx() == 0) {
      bool none_busy = true;
      for (int i = 0; i < scheduler->num_worker_threads; i++) {
-        none_busy&=(!scheduler->thread_pool[i].busy);
+        none_busy &= (!scheduler->thread_pool[i].busy);
      }
-      if(none_busy) break;
+      if (none_busy)
+        break;
    }
  }
 }