implement multistream APIs for CPU backend

This commit is contained in:
Ruobing Han 2022-09-19 10:41:40 -04:00
parent ca089c4274
commit f712c30b09
3 changed files with 19 additions and 29 deletions

View File

@ -22,6 +22,7 @@ cudaError_t cudaSetDevice(int device);
cudaError_t cudaStreamCopyAttributes(cudaStream_t dst, cudaStream_t src); cudaError_t cudaStreamCopyAttributes(cudaStream_t dst, cudaStream_t src);
cudaError_t cudaStreamCreate(cudaStream_t *pStream); cudaError_t cudaStreamCreate(cudaStream_t *pStream);
cudaError_t cudaStreamDestroy(cudaStream_t stream); cudaError_t cudaStreamDestroy(cudaStream_t stream);
cudaError_t cudaStreamQuery(cudaStream_t stream);
cudaError_t cudaStreamSynchronize(cudaStream_t stream); cudaError_t cudaStreamSynchronize(cudaStream_t stream);
} }
#endif #endif

View File

@ -107,21 +107,18 @@ cudaError_t cudaStreamCopyAttributes(cudaStream_t dst, cudaStream_t src) {
static int stream_counter = 1; static int stream_counter = 1;
/* /*
cudaStream_t is a Opaque Structure From our evaluation, CPU backend can gain little benefit
Overwrites cudaStream_t into custom cstreamData structure from multi stream. Thus, we only use single stream
(does hardware uses the cudaStream_t stream)
*/ */
cudaError_t cudaStreamCreate(cudaStream_t *pStream) { cudaError_t cudaStreamCreate(cudaStream_t *pStream) { return cudaSuccess; }
assert(0 && "cudaStreamCreate no Implement\n");
}
cudaError_t cudaStreamDestroy(cudaStream_t stream) { cudaError_t cudaStreamDestroy(cudaStream_t stream) { return cudaSuccess; }
assert(0 && "cudaStreamDestroy No Implement\n");
}
cudaError_t cudaStreamSynchronize(cudaStream_t stream) { // All kernel launch will following a sync, thus, this should
assert(0 && "cudaStreamSynchronize No Implement\n"); // always be true
} cudaError_t cudaStreamQuery(cudaStream_t stream) { return cudaSuccess; }
cudaError_t cudaStreamSynchronize(cudaStream_t stream) { return cudaSuccess; }
cudaError_t cudaGetDeviceCount(int *count) { cudaError_t cudaGetDeviceCount(int *count) {
// dummy value // dummy value

View File

@ -16,7 +16,7 @@ int device_max_compute_units = 1;
bool device_initilized = false; bool device_initilized = false;
int init_device() { int init_device() {
if (device_initilized) if (device_initilized)
return 0; return C_SUCCESS;
device_initilized = true; device_initilized = true;
cu_device *device = (cu_device *)calloc(1, sizeof(cu_device)); cu_device *device = (cu_device *)calloc(1, sizeof(cu_device));
if (device == NULL) if (device == NULL)
@ -28,11 +28,7 @@ int init_device() {
device_max_compute_units = device->max_compute_units; device_max_compute_units = device->max_compute_units;
// initialize scheduler // initialize scheduler
int ret = scheduler_init(*device); return scheduler_init(*device);
if (ret != C_SUCCESS)
return ret;
return C_SUCCESS;
} }
// Create Kernel // Create Kernel
@ -84,7 +80,8 @@ int schedulerEnqueueKernel(cu_kernel *k) {
k->totalBlocks; // calculate gpu_block_to_execute_per_cpu_thread k->totalBlocks; // calculate gpu_block_to_execute_per_cpu_thread
int gpuBlockToExecutePerCpuThread = int gpuBlockToExecutePerCpuThread =
(totalBlocks + device_max_compute_units - 1) / device_max_compute_units; (totalBlocks + device_max_compute_units - 1) / device_max_compute_units;
TaskToExecute = 0; TaskToExecute = (totalBlocks + gpuBlockToExecutePerCpuThread - 1) /
gpuBlockToExecutePerCpuThread;
for (int startBlockIdx = 0; startBlockIdx < totalBlocks; for (int startBlockIdx = 0; startBlockIdx < totalBlocks;
startBlockIdx += gpuBlockToExecutePerCpuThread) { startBlockIdx += gpuBlockToExecutePerCpuThread) {
cu_kernel *p = new cu_kernel(*k); cu_kernel *p = new cu_kernel(*k);
@ -92,7 +89,6 @@ int schedulerEnqueueKernel(cu_kernel *k) {
p->endBlockId = std::min(startBlockIdx + gpuBlockToExecutePerCpuThread - 1, p->endBlockId = std::min(startBlockIdx + gpuBlockToExecutePerCpuThread - 1,
totalBlocks - 1); totalBlocks - 1);
scheduler->kernelQueue->enqueue(p); scheduler->kernelQueue->enqueue(p);
TaskToExecute++;
} }
return C_SUCCESS; return C_SUCCESS;
} }
@ -107,16 +103,12 @@ int cuLaunchKernel(cu_kernel **k) {
// Calculate Block Size N/numBlocks // Calculate Block Size N/numBlocks
cu_kernel *ker = *k; cu_kernel *ker = *k;
int status = C_RUN; int status = C_RUN;
// stream == 0 add to the kernelQueue // set complete to false, this variable is used for sync
if (ker->stream == 0) { for (int i = 0; i < scheduler->num_worker_threads; i++) {
// set complete to false, this variable is used for sync scheduler->thread_pool[i].completeTask = 0;
for (int i = 0; i < scheduler->num_worker_threads; i++) {
scheduler->thread_pool[i].completeTask = 0;
}
schedulerEnqueueKernel(ker);
} else {
assert(0 && "MultiStream no implemente\n");
} }
schedulerEnqueueKernel(ker);
return 0; return 0;
} }