diff --git a/.github/workflows/build.yml b/.github/workflows/build.yml index 44de99a..a7c1491 100644 --- a/.github/workflows/build.yml +++ b/.github/workflows/build.yml @@ -155,41 +155,3 @@ jobs: llc --relocation-model=pic --filetype=obj host.bc g++ -o lavaMD -fPIC -no-pie -I${{ github.workspace }}/runtime/threadPool/include -L${{ github.workspace }}/build/runtime -L${{ github.workspace }}/build/runtime/threadPool main.c host.o kernel.o util/timer/timer.c util/num/num.c -lpthread -lc -lx86Runtime -lthreadPool -pthread ./lavaMD -boxes1d 10 - - name: Execute the dwt2d example - run: | - cd ${{ github.workspace }}/SC_evaluate/rodinia-cox/dwt2d - clang++ -I. -I/include -fno-strict-aliasing dwt_cuda/fdwt53.cu dwt_cuda/fdwt97.cu dwt_cuda/common.cu dwt_cuda/rdwt97.cu dwt_cuda/rdwt53.cu components.cu dwt.cu main.cu -c --cuda-path=${{ github.workspace }}/cuda-10.1 --cuda-gpu-arch=sm_61 -L${{ github.workspace }}/cuda-10.1/lib64 -lcudart_static -ldl -lrt -pthread -save-temps -v || true - export LD_LIBRARY_PATH=${{ github.workspace }}/build/runtime:${{ github.workspace }}/build/runtime/threadPool:$LD_LIBRARY_PATH - export PATH=${{ github.workspace }}/build/compilation:$PATH - kernelTranslator common-cuda-nvptx64-nvidia-cuda-sm_61.bc common.bc - kernelTranslator components-cuda-nvptx64-nvidia-cuda-sm_61.bc components.bc - kernelTranslator fdwt53-cuda-nvptx64-nvidia-cuda-sm_61.bc fdwt53.bc - kernelTranslator dwt-cuda-nvptx64-nvidia-cuda-sm_61.bc dwt.bc - kernelTranslator fdwt97-cuda-nvptx64-nvidia-cuda-sm_61.bc fdwt97.bc - kernelTranslator rdwt97-cuda-nvptx64-nvidia-cuda-sm_61.bc rdwt97.bc - kernelTranslator rdwt53-cuda-nvptx64-nvidia-cuda-sm_61.bc rdwt53.bc - hostTranslator main-host-x86_64-unknown-linux-gnu.bc host.bc - hostTranslator common-host-x86_64-unknown-linux-gnu.bc common_host.bc - hostTranslator components-host-x86_64-unknown-linux-gnu.bc components_host.bc - hostTranslator dwt-host-x86_64-unknown-linux-gnu.bc dwt_host.bc - hostTranslator 
fdwt53-host-x86_64-unknown-linux-gnu.bc fdwt53_host.bc - hostTranslator fdwt97-host-x86_64-unknown-linux-gnu.bc fdwt97_host.bc - hostTranslator rdwt53-host-x86_64-unknown-linux-gnu.bc rdwt53_host.bc - hostTranslator rdwt97-host-x86_64-unknown-linux-gnu.bc rdwt97_host.bc - llc --relocation-model=pic --filetype=obj common.bc - llc --relocation-model=pic --filetype=obj components.bc - llc --relocation-model=pic --filetype=obj fdwt53.bc - llc --relocation-model=pic --filetype=obj dwt.bc - llc --relocation-model=pic --filetype=obj host.bc - llc --relocation-model=pic --filetype=obj common_host.bc - llc --relocation-model=pic --filetype=obj components_host.bc - llc --relocation-model=pic --filetype=obj fdwt53_host.bc - llc --relocation-model=pic --filetype=obj dwt_host.bc - llc --relocation-model=pic --filetype=obj fdwt97_host.bc - llc --relocation-model=pic --filetype=obj rdwt97_host.bc - llc --relocation-model=pic --filetype=obj rdwt53_host.bc - llc --relocation-model=pic --filetype=obj fdwt97.bc - llc --relocation-model=pic --filetype=obj rdwt97.bc - llc --relocation-model=pic --filetype=obj rdwt53.bc - g++ -o dwt2d -fPIC -no-pie -I${{ github.workspace }}/runtime/threadPool/include -L${{ github.workspace }}/build/runtime -L${{ github.workspace }}/build/runtime/threadPool common.o components.o dwt.o fdwt53.o fdwt97.o rdwt97.o rdwt53.o host.o common_host.o components_host.o dwt_host.o fdwt53_host.o fdwt97_host.o rdwt97_host.o rdwt53_host.o -lpthread -lc -lx86Runtime -lthreadPool -pthread - ./dwt2d 192.bmp -d 192x192 -f -5 -l 3 diff --git a/runtime/src/x86/cudaRuntimeImpl.cpp b/runtime/src/x86/cudaRuntimeImpl.cpp index fd7c3d9..b36761c 100644 --- a/runtime/src/x86/cudaRuntimeImpl.cpp +++ b/runtime/src/x86/cudaRuntimeImpl.cpp @@ -118,17 +118,17 @@ static int stream_counter = 1; */ cudaError_t cudaStreamCreate(cudaStream_t *pStream) { - printf("No Implement\n"); + printf("cudaStreamCreate no Implement\n"); exit(1); } cudaError_t cudaStreamDestroy(cudaStream_t stream) { - 
printf("No Implement\n"); + printf("cudaStreamDestroy No Implement\n"); exit(1); } cudaError_t cudaStreamSynchronize(cudaStream_t stream) { - printf("No Implement\n"); + printf("cudaStreamSynchronize No Implement\n"); exit(1); } diff --git a/runtime/threadPool/include/x86/structures.h b/runtime/threadPool/include/x86/structures.h index 9362eef..e1b1111 100644 --- a/runtime/threadPool/include/x86/structures.h +++ b/runtime/threadPool/include/x86/structures.h @@ -17,6 +17,7 @@ typedef struct c_thread unsigned long executed_commands; unsigned index; bool exit; + bool busy; } cu_ptd; // kernel information diff --git a/runtime/threadPool/src/x86/api.cpp b/runtime/threadPool/src/x86/api.cpp index be1ab8a..b7539ed 100644 --- a/runtime/threadPool/src/x86/api.cpp +++ b/runtime/threadPool/src/x86/api.cpp @@ -24,7 +24,6 @@ int init_device() return C_ERROR_MEMALLOC; device->max_compute_units = std::thread::hardware_concurrency(); - device->max_compute_units = 4; std::cout << device->max_compute_units << " concurrent threads are supported.\n"; device_max_compute_units = device->max_compute_units; @@ -129,8 +128,8 @@ int schedulerEnqueueKernel(cu_kernel *k) scheduler->kernelQueue->enqueue(p); } - printf("total: %d execute per cpu: %d\n", totalBlocks, - gpuBlockToExecutePerCpuThread); + // printf("total: %d execute per cpu: %d\n", totalBlocks, + // gpuBlockToExecutePerCpuThread); return C_SUCCESS; } @@ -139,12 +138,6 @@ int schedulerEnqueueKernel(cu_kernel *k) */ int cuLaunchKernel(cu_kernel **k) { - if (!scheduler) - { - init_device(); - } - std::cout << "launch\n" - << std::flush; // Calculate Block Size N/numBlocks cu_kernel *ker = *k; int status = C_RUN; @@ -173,9 +166,11 @@ int get_work(c_thread *th) { // try to get a task from the queue cu_kernel *k; + th->busy = false; bool getTask = scheduler->kernelQueue->wait_dequeue_timed(k, std::chrono::milliseconds(5)); if (getTask) { + th->busy = true; // set runtime configuration gridDim = k->gridDim; blockDim = k->blockDim; @@ 
-190,7 +185,6 @@ int get_work(c_thread *th) if (dynamic_shared_mem_size > 0) dynamic_shared_memory = (int *)malloc(dynamic_shared_mem_size); // execute GPU blocks - printf("exec: from: %d to : %d\n",k->startBlockId, k->endBlockId); for (block_index = k->startBlockId; block_index < k->endBlockId + 1; block_index++) { int tmp = block_index; @@ -201,11 +195,11 @@ int get_work(c_thread *th) block_index_z = tmp; k->start_routine(k->args); } - printf("done: from: %d to : %d\n",k->startBlockId, k->endBlockId); } // if cannot get tasks, check whether programs stop else if (scheduler->threadpool_shutdown_requested) { + th->busy = false; return true; // thread exit } } @@ -217,17 +211,15 @@ void *driver_thread(void *p) struct c_thread *td = (struct c_thread *)p; int is_exit = 0; td->exit = false; + td->busy = false; // get work - printf("before getwork\n"); is_exit = get_work(td); - printf("after getwork\n"); // exit the routine if (is_exit) { td->exit = true; // pthread_exit - printf("pthread exit\n"); pthread_exit(NULL); } else @@ -263,7 +255,7 @@ int scheduler_init(cu_device device) void scheduler_uninit() { - printf("No Implemente\n"); + printf("Scheduler Uninit no Implement\n"); exit(1); } @@ -281,7 +273,15 @@ void scheduler_uninit() */ void cuSynchronizeBarrier() { - while (scheduler->kernelQueue->size_approx() > 0) - ; - printf("size: %d\n",scheduler->kernelQueue->size_approx()); + while(1) { + // sync is complete only if the queue is empty and none of + // the driver threads are busy + if(scheduler->kernelQueue->size_approx() == 0) { + bool none_busy = true; + for (int i = 0; i < scheduler->num_worker_threads; i++) { + none_busy&=(!scheduler->thread_pool[i].busy); + } + if(none_busy) break; + } + } }