#include "api.h" #include "blockingconcurrentqueue.h" #include "debug.hpp" #include "def.h" #include "macros.h" #include "structures.h" #include #include #include #include /* Initialize the device */ int device_max_compute_units = 1; bool device_initilized = false; int init_device() { if (device_initilized) return C_SUCCESS; device_initilized = true; cu_device *device = (cu_device *)calloc(1, sizeof(cu_device)); if (device == NULL) return C_ERROR_MEMALLOC; device->max_compute_units = std::thread::hardware_concurrency(); DEBUG_INFO("%d concurrent threads are supported.\n", device->max_compute_units); device_max_compute_units = device->max_compute_units; // initialize scheduler return scheduler_init(*device); } // Create Kernel static int kernelIds = 0; cu_kernel *create_kernel(const void *func, dim3 gridDim, dim3 blockDim, void **args, size_t sharedMem, cudaStream_t stream) { cu_kernel *ker = (cu_kernel *)calloc(1, sizeof(cu_kernel)); // set the function pointer ker->start_routine = (void *(*)(void *))func; ker->args = args; ker->gridDim = gridDim; ker->blockDim = blockDim; ker->shared_mem = sharedMem; ker->stream = stream; ker->totalBlocks = gridDim.x * gridDim.y * gridDim.z; ker->blockSize = blockDim.x * blockDim.y * blockDim.z; return ker; } // scheduler static cu_pool *scheduler; __thread int block_size = 0; __thread int block_size_x = 0; __thread int block_size_y = 0; __thread int block_size_z = 0; __thread int grid_size_x = 0; __thread int grid_size_y = 0; __thread int grid_size_z = 0; __thread int block_index = 0; __thread int block_index_x = 0; __thread int block_index_y = 0; __thread int block_index_z = 0; __thread int thread_memory_size = 0; __thread int *dynamic_shared_memory = NULL; __thread int warp_shfl[32] = { 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, }; /* Enqueue Kernel (k) to the scheduler kernelQueue */ int TaskToExecute; int schedulerEnqueueKernel(cu_kernel *k) { int totalBlocks = k->totalBlocks; // calculate gpu_block_to_execute_per_cpu_thread int gpuBlockToExecutePerCpuThread = (totalBlocks + device_max_compute_units - 1) / device_max_compute_units; TaskToExecute = (totalBlocks + gpuBlockToExecutePerCpuThread - 1) / gpuBlockToExecutePerCpuThread; for (int startBlockIdx = 0; startBlockIdx < totalBlocks; startBlockIdx += gpuBlockToExecutePerCpuThread) { cu_kernel *p = new cu_kernel(*k); p->startBlockId = startBlockIdx; p->endBlockId = std::min(startBlockIdx + gpuBlockToExecutePerCpuThread - 1, totalBlocks - 1); scheduler->kernelQueue->enqueue(p); } return C_SUCCESS; } /* Kernel Launch with numBlocks and numThreadsPerBlock */ int cuLaunchKernel(cu_kernel **k) { if (!device_initilized) { init_device(); } // Calculate Block Size N/numBlocks cu_kernel *ker = *k; int status = C_RUN; // set complete to false, this variable is used for sync for (int i = 0; i < scheduler->num_worker_threads; i++) { scheduler->thread_pool[i].completeTask = 0; } schedulerEnqueueKernel(ker); return 0; } /* Thread Gets Work */ int get_work(c_thread *th) { int dynamic_shared_mem_size = 0; dim3 gridDim; dim3 blockDim; while (true) { // try to get a task from the queue cu_kernel *k; th->busy = scheduler->kernelQueue->wait_dequeue_timed( k, std::chrono::milliseconds(5)); if (th->busy) { // set runtime configuration gridDim = k->gridDim; blockDim = k->blockDim; dynamic_shared_mem_size = k->shared_mem; block_size = k->blockSize; block_size_x = blockDim.x; block_size_y = blockDim.y; block_size_z = blockDim.z; grid_size_x = gridDim.x; grid_size_y = 
      grid_size_z = gridDim.z;
      if (dynamic_shared_mem_size > 0)
        dynamic_shared_memory = (int *)malloc(dynamic_shared_mem_size);

      // execute the GPU blocks assigned to this task
      for (block_index = k->startBlockId; block_index < k->endBlockId + 1;
           block_index++) {
        // decompose the linear block index into (x, y, z) coordinates,
        // with x varying slowest and z varying fastest
        int tmp = block_index;
        block_index_x = tmp / (grid_size_y * grid_size_z);
        tmp = tmp % (grid_size_y * grid_size_z);
        block_index_y = tmp / (grid_size_z);
        tmp = tmp % (grid_size_z);
        block_index_z = tmp;
        k->start_routine(k->args);
      }
      th->completeTask++;
    }
    // if no task could be dequeued, check whether the program is stopping
    if (scheduler->threadpool_shutdown_requested) {
      return true; // thread exit
    }
  }
  return 0;
}

void *driver_thread(void *p) {
  struct c_thread *td = (struct c_thread *)p;
  int is_exit = 0;
  td->exit = false;
  td->busy = false;

  // get work
  is_exit = get_work(td);

  // exit the routine
  if (is_exit) {
    td->exit = true;
    pthread_exit(NULL);
  } else {
    assert(0 && "driver thread stopped incorrectly\n");
  }
  return NULL;
}

/* Initialize the scheduler */
int scheduler_init(cu_device device) {
  scheduler = (cu_pool *)calloc(1, sizeof(cu_pool));
  scheduler->num_worker_threads = device.max_compute_units;
  scheduler->num_kernel_queued = 0;
  scheduler->thread_pool = (struct c_thread *)calloc(
      scheduler->num_worker_threads, sizeof(c_thread));
  scheduler->kernelQueue = new kernel_queue;
  scheduler->idle_threads = 0;
  for (int i = 0; i < scheduler->num_worker_threads; i++) {
    scheduler->thread_pool[i].index = i;
    pthread_create(&scheduler->thread_pool[i].thread, NULL, driver_thread,
                   (void *)&scheduler->thread_pool[i]);
  }
  return C_SUCCESS;
}

void scheduler_uninit() {
  assert(0 && "scheduler_uninit is not implemented\n");
}

/* Barrier for Kernel Launch */
void cuSynchronizeBarrier() {
  if (!device_initilized) {
    init_device();
  }
  while (1) {
    // after the compilation transformation, each kernel launch is followed by
    // a corresponding synchronization
    if (scheduler->kernelQueue->size_approx() == 0) {
      int completeBlock = 0;
      for (int i = 0; i < scheduler->num_worker_threads; i++) {
        completeBlock += (scheduler->thread_pool[i].completeTask);
      }
      // all enqueued tasks have been executed
      if (completeBlock == TaskToExecute)
        break;
    }
  }
}
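
#ifdef EXAMPLE_USAGE
// Illustrative sketch only (guarded out of the build, not part of the
// runtime): how a host program translated to this API might create, launch,
// and synchronize a kernel. The kernel body, the argument packing, and the
// EXAMPLE_USAGE guard are hypothetical; real call sites are emitted by the
// compiler transformation, and this assumes cudaStream_t is a pointer type
// and dim3 is a plain struct with x/y/z fields.
static void *example_kernel(void *args) {
  // unpack the single pointer argument (an int array)
  int *data = *(int **)((void **)args)[0];
  // each GPU block records its linear block index, which the worker thread
  // publishes through the thread-local variable block_index
  data[block_index] = block_index;
  return NULL;
}

static void example_launch() {
  dim3 grid, block;
  grid.x = 4;  grid.y = 1;  grid.z = 1;  // 4 blocks
  block.x = 1; block.y = 1; block.z = 1; // 1 thread per block

  int *data = (int *)calloc(grid.x, sizeof(int));
  void *params[] = {&data};

  cu_kernel *k = create_kernel((const void *)example_kernel, grid, block,
                               params, /*sharedMem=*/0, /*stream=*/NULL);
  cuLaunchKernel(&k);     // split the grid into block ranges and enqueue them
  cuSynchronizeBarrier(); // spin until every enqueued task has completed
  free(data);
}
#endif // EXAMPLE_USAGE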