diff --git a/.github/workflows/build.yml b/.github/workflows/build.yml index 44de99a..a7c1491 100644 --- a/.github/workflows/build.yml +++ b/.github/workflows/build.yml @@ -155,41 +155,3 @@ jobs: llc --relocation-model=pic --filetype=obj host.bc g++ -o lavaMD -fPIC -no-pie -I${{ github.workspace }}/runtime/threadPool/include -L${{ github.workspace }}/build/runtime -L${{ github.workspace }}/build/runtime/threadPool main.c host.o kernel.o util/timer/timer.c util/num/num.c -lpthread -lc -lx86Runtime -lthreadPool -pthread ./lavaMD -boxes1d 10 - - name: Execute the dwt2d example - run: | - cd ${{ github.workspace }}/SC_evaluate/rodinia-cox/dwt2d - clang++ -I. -I/include -fno-strict-aliasing dwt_cuda/fdwt53.cu dwt_cuda/fdwt97.cu dwt_cuda/common.cu dwt_cuda/rdwt97.cu dwt_cuda/rdwt53.cu components.cu dwt.cu main.cu -c --cuda-path=${{ github.workspace }}/cuda-10.1 --cuda-gpu-arch=sm_61 -L${{ github.workspace }}/cuda-10.1/lib64 -lcudart_static -ldl -lrt -pthread -save-temps -v || true - export LD_LIBRARY_PATH=${{ github.workspace }}/build/runtime:${{ github.workspace }}/build/runtime/threadPool:$LD_LIBRARY_PATH - export PATH=${{ github.workspace }}/build/compilation:$PATH - kernelTranslator common-cuda-nvptx64-nvidia-cuda-sm_61.bc common.bc - kernelTranslator components-cuda-nvptx64-nvidia-cuda-sm_61.bc components.bc - kernelTranslator fdwt53-cuda-nvptx64-nvidia-cuda-sm_61.bc fdwt53.bc - kernelTranslator dwt-cuda-nvptx64-nvidia-cuda-sm_61.bc dwt.bc - kernelTranslator fdwt97-cuda-nvptx64-nvidia-cuda-sm_61.bc fdwt97.bc - kernelTranslator rdwt97-cuda-nvptx64-nvidia-cuda-sm_61.bc rdwt97.bc - kernelTranslator rdwt53-cuda-nvptx64-nvidia-cuda-sm_61.bc rdwt53.bc - hostTranslator main-host-x86_64-unknown-linux-gnu.bc host.bc - hostTranslator common-host-x86_64-unknown-linux-gnu.bc common_host.bc - hostTranslator components-host-x86_64-unknown-linux-gnu.bc components_host.bc - hostTranslator dwt-host-x86_64-unknown-linux-gnu.bc dwt_host.bc - hostTranslator 
fdwt53-host-x86_64-unknown-linux-gnu.bc fdwt53_host.bc - hostTranslator fdwt97-host-x86_64-unknown-linux-gnu.bc fdwt97_host.bc - hostTranslator rdwt53-host-x86_64-unknown-linux-gnu.bc rdwt53_host.bc - hostTranslator rdwt97-host-x86_64-unknown-linux-gnu.bc rdwt97_host.bc - llc --relocation-model=pic --filetype=obj common.bc - llc --relocation-model=pic --filetype=obj components.bc - llc --relocation-model=pic --filetype=obj fdwt53.bc - llc --relocation-model=pic --filetype=obj dwt.bc - llc --relocation-model=pic --filetype=obj host.bc - llc --relocation-model=pic --filetype=obj common_host.bc - llc --relocation-model=pic --filetype=obj components_host.bc - llc --relocation-model=pic --filetype=obj fdwt53_host.bc - llc --relocation-model=pic --filetype=obj dwt_host.bc - llc --relocation-model=pic --filetype=obj fdwt97_host.bc - llc --relocation-model=pic --filetype=obj rdwt97_host.bc - llc --relocation-model=pic --filetype=obj rdwt53_host.bc - llc --relocation-model=pic --filetype=obj fdwt97.bc - llc --relocation-model=pic --filetype=obj rdwt97.bc - llc --relocation-model=pic --filetype=obj rdwt53.bc - g++ -o dwt2d -fPIC -no-pie -I${{ github.workspace }}/runtime/threadPool/include -L${{ github.workspace }}/build/runtime -L${{ github.workspace }}/build/runtime/threadPool common.o components.o dwt.o fdwt53.o fdwt97.o rdwt97.o rdwt53.o host.o common_host.o components_host.o dwt_host.o fdwt53_host.o fdwt97_host.o rdwt97_host.o rdwt53_host.o -lpthread -lc -lx86Runtime -lthreadPool -pthread - ./dwt2d 192.bmp -d 192x192 -f -5 -l 3 diff --git a/runtime/src/x86/cudaRuntimeImpl.cpp b/runtime/src/x86/cudaRuntimeImpl.cpp index fd7c3d9..b36761c 100644 --- a/runtime/src/x86/cudaRuntimeImpl.cpp +++ b/runtime/src/x86/cudaRuntimeImpl.cpp @@ -118,17 +118,17 @@ static int stream_counter = 1; */ cudaError_t cudaStreamCreate(cudaStream_t *pStream) { - printf("No Implement\n"); + printf("cudaStreamCreate no Implement\n"); exit(1); } cudaError_t cudaStreamDestroy(cudaStream_t stream) { - 
printf("No Implement\n"); + printf("cudaStreamDestroy No Implement\n"); exit(1); } cudaError_t cudaStreamSynchronize(cudaStream_t stream) { - printf("No Implement\n"); + printf("cudaStreamSynchronize No Implement\n"); exit(1); } diff --git a/runtime/threadPool/include/x86/structures.h b/runtime/threadPool/include/x86/structures.h index 9362eef..e1b1111 100644 --- a/runtime/threadPool/include/x86/structures.h +++ b/runtime/threadPool/include/x86/structures.h @@ -17,6 +17,7 @@ typedef struct c_thread unsigned long executed_commands; unsigned index; bool exit; + bool busy; } cu_ptd; // kernel information diff --git a/runtime/threadPool/src/x86/api.cpp b/runtime/threadPool/src/x86/api.cpp index be1ab8a..b7539ed 100644 --- a/runtime/threadPool/src/x86/api.cpp +++ b/runtime/threadPool/src/x86/api.cpp @@ -24,7 +24,6 @@ int init_device() return C_ERROR_MEMALLOC; device->max_compute_units = std::thread::hardware_concurrency(); - device->max_compute_units = 4; std::cout << device->max_compute_units << " concurrent threads are supported.\n"; device_max_compute_units = device->max_compute_units; @@ -129,8 +128,8 @@ int schedulerEnqueueKernel(cu_kernel *k) scheduler->kernelQueue->enqueue(p); } - printf("total: %d execute per cpu: %d\n", totalBlocks, - gpuBlockToExecutePerCpuThread); + // printf("total: %d execute per cpu: %d\n", totalBlocks, + // gpuBlockToExecutePerCpuThread); return C_SUCCESS; } @@ -139,12 +138,6 @@ int schedulerEnqueueKernel(cu_kernel *k) */ int cuLaunchKernel(cu_kernel **k) { - if (!scheduler) - { - init_device(); - } - std::cout << "launch\n" - << std::flush; // Calculate Block Size N/numBlocks cu_kernel *ker = *k; int status = C_RUN; @@ -173,9 +166,11 @@ int get_work(c_thread *th) { // try to get a task from the queue cu_kernel *k; + th->busy = false; bool getTask = scheduler->kernelQueue->wait_dequeue_timed(k, std::chrono::milliseconds(5)); if (getTask) { + th->busy = true; // set runtime configuration gridDim = k->gridDim; blockDim = k->blockDim; @@ 
-190,7 +185,6 @@ int get_work(c_thread *th) if (dynamic_shared_mem_size > 0) dynamic_shared_memory = (int *)malloc(dynamic_shared_mem_size); // execute GPU blocks - printf("exec: from: %d to : %d\n",k->startBlockId, k->endBlockId); for (block_index = k->startBlockId; block_index < k->endBlockId + 1; block_index++) { int tmp = block_index; @@ -201,11 +195,11 @@ int get_work(c_thread *th) block_index_z = tmp; k->start_routine(k->args); } - printf("done: from: %d to : %d\n",k->startBlockId, k->endBlockId); } // if cannot get tasks, check whether programs stop else if (scheduler->threadpool_shutdown_requested) { + th->busy = false; return true; // thread exit } } @@ -217,17 +211,15 @@ void *driver_thread(void *p) struct c_thread *td = (struct c_thread *)p; int is_exit = 0; td->exit = false; + td->busy = false; // get work - printf("before getwork\n"); is_exit = get_work(td); - printf("after getwork\n"); // exit the routine if (is_exit) { td->exit = true; // pthread_exit - printf("pthread exit\n"); pthread_exit(NULL); } else @@ -263,7 +255,7 @@ int scheduler_init(cu_device device) void scheduler_uninit() { - printf("No Implemente\n"); + printf("Scheduler Uninit no Implement\n"); exit(1); } @@ -281,7 +273,15 @@ void scheduler_uninit() */ void cuSynchronizeBarrier() { - while (scheduler->kernelQueue->size_approx() > 0) - ; - printf("size: %d\n",scheduler->kernelQueue->size_approx()); + while(1) { + // sync is complete only if the queue is empty and none of + // the driver threads are busy + if(scheduler->kernelQueue->size_approx() == 0) { + bool none_busy = true; + for (int i = 0; i < scheduler->num_worker_threads; i++) { + none_busy&=(!scheduler->thread_pool[i].busy); + } + if(none_busy) break; + } + } }