diff --git a/runtime/include/x86/cudaRuntimeImpl.h b/runtime/include/x86/cudaRuntimeImpl.h
index 206d686..c2925dd 100644
--- a/runtime/include/x86/cudaRuntimeImpl.h
+++ b/runtime/include/x86/cudaRuntimeImpl.h
@@ -22,6 +22,7 @@ cudaError_t cudaSetDevice(int device);
 cudaError_t cudaStreamCopyAttributes(cudaStream_t dst, cudaStream_t src);
 cudaError_t cudaStreamCreate(cudaStream_t *pStream);
 cudaError_t cudaStreamDestroy(cudaStream_t stream);
+cudaError_t cudaStreamQuery(cudaStream_t stream);
 cudaError_t cudaStreamSynchronize(cudaStream_t stream);
 }
 #endif
diff --git a/runtime/src/x86/cudaRuntimeImpl.cpp b/runtime/src/x86/cudaRuntimeImpl.cpp
index 8648034..8f60525 100644
--- a/runtime/src/x86/cudaRuntimeImpl.cpp
+++ b/runtime/src/x86/cudaRuntimeImpl.cpp
@@ -107,21 +107,18 @@ cudaError_t cudaStreamCopyAttributes(cudaStream_t dst, cudaStream_t src) {
 
 static int stream_counter = 1;
 /*
-  cudaStream_t is a Opaque Structure
-  Overwrites cudaStream_t into custom cstreamData structure
-  (does hardware uses the cudaStream_t stream)
+  From our evaluation, the CPU backend gains little benefit
+  from multiple streams, so we only use a single stream.
 */
-cudaError_t cudaStreamCreate(cudaStream_t *pStream) {
-  assert(0 && "cudaStreamCreate no Implement\n");
-}
+cudaError_t cudaStreamCreate(cudaStream_t *pStream) { return cudaSuccess; }
 
-cudaError_t cudaStreamDestroy(cudaStream_t stream) {
-  assert(0 && "cudaStreamDestroy No Implement\n");
-}
+cudaError_t cudaStreamDestroy(cudaStream_t stream) { return cudaSuccess; }
 
-cudaError_t cudaStreamSynchronize(cudaStream_t stream) {
-  assert(0 && "cudaStreamSynchronize No Implement\n");
-}
+// Every kernel launch is followed by a sync, so the stream is
+// always idle and this can always report success.
+cudaError_t cudaStreamQuery(cudaStream_t stream) { return cudaSuccess; }
+
+cudaError_t cudaStreamSynchronize(cudaStream_t stream) { return cudaSuccess; }
 
 cudaError_t cudaGetDeviceCount(int *count) {
   // dummy value
diff --git a/runtime/threadPool/src/x86/api.cpp b/runtime/threadPool/src/x86/api.cpp
index 18347d6..4a319fc 100644
--- a/runtime/threadPool/src/x86/api.cpp
+++ b/runtime/threadPool/src/x86/api.cpp
@@ -16,7 +16,7 @@ int device_max_compute_units = 1;
 bool device_initilized = false;
 int init_device() {
   if (device_initilized)
-    return 0;
+    return C_SUCCESS;
   device_initilized = true;
   cu_device *device = (cu_device *)calloc(1, sizeof(cu_device));
   if (device == NULL)
@@ -28,11 +28,7 @@ int init_device() {
   device_max_compute_units = device->max_compute_units;
 
   // initialize scheduler
-  int ret = scheduler_init(*device);
-  if (ret != C_SUCCESS)
-    return ret;
-
-  return C_SUCCESS;
+  return scheduler_init(*device);
 }
 
 // Create Kernel
@@ -84,7 +80,8 @@ int schedulerEnqueueKernel(cu_kernel *k) {
       k->totalBlocks; // calculate gpu_block_to_execute_per_cpu_thread
   int gpuBlockToExecutePerCpuThread =
       (totalBlocks + device_max_compute_units - 1) / device_max_compute_units;
-  TaskToExecute = 0;
+  TaskToExecute = (totalBlocks + gpuBlockToExecutePerCpuThread - 1) /
+                  gpuBlockToExecutePerCpuThread;
   for (int startBlockIdx = 0; startBlockIdx < totalBlocks;
        startBlockIdx += gpuBlockToExecutePerCpuThread) {
     cu_kernel *p = new cu_kernel(*k);
@@ -92,7 +89,6 @@ int schedulerEnqueueKernel(cu_kernel *k) {
     p->endBlockId = std::min(startBlockIdx + gpuBlockToExecutePerCpuThread - 1,
                              totalBlocks - 1);
     scheduler->kernelQueue->enqueue(p);
-    TaskToExecute++;
   }
   return C_SUCCESS;
 }
@@ -107,16 +103,12 @@ int cuLaunchKernel(cu_kernel **k) {
   // Calculate Block Size N/numBlocks
   cu_kernel *ker = *k;
   int status = C_RUN;
-  // stream == 0 add to the kernelQueue
-  if (ker->stream == 0) {
-    // set complete to false, this variable is used for sync
-    for (int i = 0; i < scheduler->num_worker_threads; i++) {
-      scheduler->thread_pool[i].completeTask = 0;
-    }
-    schedulerEnqueueKernel(ker);
-  } else {
-    assert(0 && "MultiStream no implemente\n");
+  // set complete to false, this variable is used for sync
+  for (int i = 0; i < scheduler->num_worker_threads; i++) {
+    scheduler->thread_pool[i].completeTask = 0;
   }
+  schedulerEnqueueKernel(ker);
+
   return 0;
 }
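
For reference, here is a minimal standalone sketch (not part of the diff; the harness and its loop bounds are made up) of the block-partitioning arithmetic that `schedulerEnqueueKernel` now uses. It checks that the closed-form ceiling division assigned to `TaskToExecute` always matches the number of chunks the enqueue loop produces, which is what justifies hoisting the count out of the loop:

```cpp
#include <cassert>

int main() {
  // Sweep a range of grid sizes and worker counts (bounds are arbitrary).
  for (int totalBlocks = 1; totalBlocks <= 1024; ++totalBlocks) {
    for (int units = 1; units <= 64; ++units) {
      // Blocks per CPU worker thread, rounded up (as in the diff).
      int chunk = (totalBlocks + units - 1) / units;
      // Closed-form task count now assigned to TaskToExecute.
      int tasks = (totalBlocks + chunk - 1) / chunk;
      // Count the chunks exactly the way the enqueue loop walks them.
      int loops = 0;
      for (int start = 0; start < totalBlocks; start += chunk)
        ++loops;
      assert(tasks == loops);
    }
  }
  return 0;
}
```

Setting `TaskToExecute` before the first `enqueue` also means a worker that starts draining the queue immediately sees the final count rather than a value still being incremented, which appears to be the point of removing `TaskToExecute++` from the loop body.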
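A small host-side illustration of what the new stream stubs permit (again not part of the diff; the include path is a guess at this repo's layout). Code written against the CUDA stream API now runs on the CPU backend as no-ops instead of tripping the old asserts, and since every launch is followed by a sync, each call can honestly report `cudaSuccess`:

```cpp
#include <cassert>
#include "x86/cudaRuntimeImpl.h" // assumed include path for this repo

int main() {
  cudaStream_t stream;
  assert(cudaStreamCreate(&stream) == cudaSuccess);     // no-op: single stream
  // ... launch kernels; launches are synchronous on this backend ...
  assert(cudaStreamQuery(stream) == cudaSuccess);       // stream is always idle
  assert(cudaStreamSynchronize(stream) == cudaSuccess); // nothing left pending
  assert(cudaStreamDestroy(stream) == cudaSuccess);     // nothing to tear down
  return 0;
}
```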