implement multistream APIs for CPU backend
This commit is contained in:
parent
ca089c4274
commit
f712c30b09
|
@ -22,6 +22,7 @@ cudaError_t cudaSetDevice(int device);
|
|||
cudaError_t cudaStreamCopyAttributes(cudaStream_t dst, cudaStream_t src);
|
||||
cudaError_t cudaStreamCreate(cudaStream_t *pStream);
|
||||
cudaError_t cudaStreamDestroy(cudaStream_t stream);
|
||||
cudaError_t cudaStreamQuery(cudaStream_t stream);
|
||||
cudaError_t cudaStreamSynchronize(cudaStream_t stream);
|
||||
}
|
||||
#endif
|
||||
|
|
|
@ -107,21 +107,18 @@ cudaError_t cudaStreamCopyAttributes(cudaStream_t dst, cudaStream_t src) {
|
|||
|
||||
static int stream_counter = 1;
|
||||
/*
|
||||
cudaStream_t is an opaque structure
|
||||
Overwrites cudaStream_t into custom cstreamData structure
|
||||
(does the hardware use the cudaStream_t stream?)
|
||||
From our evaluation, the CPU backend can gain little benefit
|
||||
from multiple streams. Thus, we only use a single stream
|
||||
*/
|
||||
cudaError_t cudaStreamCreate(cudaStream_t *pStream) {
|
||||
assert(0 && "cudaStreamCreate no Implement\n");
|
||||
}
|
||||
// Creates a stream handle for the CPU backend.
//
// No per-stream state is allocated here. The handle stored through pStream is
// the null (default-stream) value, so the caller never reads an uninitialized
// handle after a successful call.
// NOTE(review): assumes cudaStream_t is a pointer or integer handle type, so
// that assigning 0 is valid -- confirm against the project's declaration.
//
// pStream: out-parameter receiving the handle; left untouched when null.
// Returns cudaSuccess in every case.
cudaError_t cudaStreamCreate(cudaStream_t *pStream) {
  if (pStream) {
    *pStream = 0;
  }
  return cudaSuccess;
}
|
||||
|
||||
cudaError_t cudaStreamDestroy(cudaStream_t stream) {
|
||||
assert(0 && "cudaStreamDestroy No Implement\n");
|
||||
}
|
||||
// Destroys a stream handle. This implementation performs no work on the
// handle and unconditionally reports cudaSuccess.
cudaError_t cudaStreamDestroy(cudaStream_t stream) {
  (void)stream;  // the handle is intentionally unused
  return cudaSuccess;
}
|
||||
|
||||
cudaError_t cudaStreamSynchronize(cudaStream_t stream) {
|
||||
assert(0 && "cudaStreamSynchronize No Implement\n");
|
||||
}
|
||||
// Every kernel launch is followed by a sync; thus, this should
|
||||
// always be true
|
||||
// Queries a stream's completion status. This implementation does not inspect
// the handle and unconditionally reports cudaSuccess.
cudaError_t cudaStreamQuery(cudaStream_t stream) {
  (void)stream;  // the handle is intentionally unused
  return cudaSuccess;
}
|
||||
|
||||
// Waits for a stream's work to finish. This implementation does not block or
// inspect the handle; it unconditionally reports cudaSuccess.
cudaError_t cudaStreamSynchronize(cudaStream_t stream) {
  (void)stream;  // the handle is intentionally unused
  return cudaSuccess;
}
|
||||
|
||||
cudaError_t cudaGetDeviceCount(int *count) {
|
||||
// dummy value
|
||||
|
|
|
@ -16,7 +16,7 @@ int device_max_compute_units = 1;
|
|||
bool device_initilized = false;
|
||||
int init_device() {
|
||||
if (device_initilized)
|
||||
return 0;
|
||||
return C_SUCCESS;
|
||||
device_initilized = true;
|
||||
cu_device *device = (cu_device *)calloc(1, sizeof(cu_device));
|
||||
if (device == NULL)
|
||||
|
@ -28,11 +28,7 @@ int init_device() {
|
|||
device_max_compute_units = device->max_compute_units;
|
||||
|
||||
// initialize scheduler
|
||||
int ret = scheduler_init(*device);
|
||||
if (ret != C_SUCCESS)
|
||||
return ret;
|
||||
|
||||
return C_SUCCESS;
|
||||
return scheduler_init(*device);
|
||||
}
|
||||
|
||||
// Create Kernel
|
||||
|
@ -84,7 +80,8 @@ int schedulerEnqueueKernel(cu_kernel *k) {
|
|||
k->totalBlocks; // calculate gpu_block_to_execute_per_cpu_thread
|
||||
int gpuBlockToExecutePerCpuThread =
|
||||
(totalBlocks + device_max_compute_units - 1) / device_max_compute_units;
|
||||
TaskToExecute = 0;
|
||||
TaskToExecute = (totalBlocks + gpuBlockToExecutePerCpuThread - 1) /
|
||||
gpuBlockToExecutePerCpuThread;
|
||||
for (int startBlockIdx = 0; startBlockIdx < totalBlocks;
|
||||
startBlockIdx += gpuBlockToExecutePerCpuThread) {
|
||||
cu_kernel *p = new cu_kernel(*k);
|
||||
|
@ -92,7 +89,6 @@ int schedulerEnqueueKernel(cu_kernel *k) {
|
|||
p->endBlockId = std::min(startBlockIdx + gpuBlockToExecutePerCpuThread - 1,
|
||||
totalBlocks - 1);
|
||||
scheduler->kernelQueue->enqueue(p);
|
||||
TaskToExecute++;
|
||||
}
|
||||
return C_SUCCESS;
|
||||
}
|
||||
|
@ -107,16 +103,12 @@ int cuLaunchKernel(cu_kernel **k) {
|
|||
// Calculate Block Size N/numBlocks
|
||||
cu_kernel *ker = *k;
|
||||
int status = C_RUN;
|
||||
// stream == 0 add to the kernelQueue
|
||||
if (ker->stream == 0) {
|
||||
// set complete to false, this variable is used for sync
|
||||
for (int i = 0; i < scheduler->num_worker_threads; i++) {
|
||||
scheduler->thread_pool[i].completeTask = 0;
|
||||
}
|
||||
schedulerEnqueueKernel(ker);
|
||||
} else {
|
||||
assert(0 && "MultiStream no implemente\n");
|
||||
}
|
||||
|
||||
return 0;
|
||||
}
|
||||
|
||||
|
|
Loading…
Reference in New Issue