implement multistream APIs for CPU backend
This commit is contained in:
parent
ca089c4274
commit
f712c30b09
|
@ -22,6 +22,7 @@ cudaError_t cudaSetDevice(int device);
|
||||||
cudaError_t cudaStreamCopyAttributes(cudaStream_t dst, cudaStream_t src);
|
cudaError_t cudaStreamCopyAttributes(cudaStream_t dst, cudaStream_t src);
|
||||||
cudaError_t cudaStreamCreate(cudaStream_t *pStream);
|
cudaError_t cudaStreamCreate(cudaStream_t *pStream);
|
||||||
cudaError_t cudaStreamDestroy(cudaStream_t stream);
|
cudaError_t cudaStreamDestroy(cudaStream_t stream);
|
||||||
|
cudaError_t cudaStreamQuery(cudaStream_t stream);
|
||||||
cudaError_t cudaStreamSynchronize(cudaStream_t stream);
|
cudaError_t cudaStreamSynchronize(cudaStream_t stream);
|
||||||
}
|
}
|
||||||
#endif
|
#endif
|
||||||
|
|
|
@ -107,21 +107,18 @@ cudaError_t cudaStreamCopyAttributes(cudaStream_t dst, cudaStream_t src) {
|
||||||
|
|
||||||
static int stream_counter = 1;
|
static int stream_counter = 1;
|
||||||
/*
|
/*
|
||||||
cudaStream_t is a Opaque Structure
|
From our evaluation, CPU backend can gain little benefit
|
||||||
Overwrites cudaStream_t into custom cstreamData structure
|
from multi stream. Thus, we only use single stream
|
||||||
(does hardware uses the cudaStream_t stream)
|
|
||||||
*/
|
*/
|
||||||
cudaError_t cudaStreamCreate(cudaStream_t *pStream) {
|
cudaError_t cudaStreamCreate(cudaStream_t *pStream) { return cudaSuccess; }
|
||||||
assert(0 && "cudaStreamCreate no Implement\n");
|
|
||||||
}
|
|
||||||
|
|
||||||
cudaError_t cudaStreamDestroy(cudaStream_t stream) {
|
cudaError_t cudaStreamDestroy(cudaStream_t stream) { return cudaSuccess; }
|
||||||
assert(0 && "cudaStreamDestroy No Implement\n");
|
|
||||||
}
|
|
||||||
|
|
||||||
cudaError_t cudaStreamSynchronize(cudaStream_t stream) {
|
// All kernel launch will following a sync, thus, this should
|
||||||
assert(0 && "cudaStreamSynchronize No Implement\n");
|
// always be true
|
||||||
}
|
cudaError_t cudaStreamQuery(cudaStream_t stream) { return cudaSuccess; }
|
||||||
|
|
||||||
|
cudaError_t cudaStreamSynchronize(cudaStream_t stream) { return cudaSuccess; }
|
||||||
|
|
||||||
cudaError_t cudaGetDeviceCount(int *count) {
|
cudaError_t cudaGetDeviceCount(int *count) {
|
||||||
// dummy value
|
// dummy value
|
||||||
|
|
|
@ -16,7 +16,7 @@ int device_max_compute_units = 1;
|
||||||
bool device_initilized = false;
|
bool device_initilized = false;
|
||||||
int init_device() {
|
int init_device() {
|
||||||
if (device_initilized)
|
if (device_initilized)
|
||||||
return 0;
|
return C_SUCCESS;
|
||||||
device_initilized = true;
|
device_initilized = true;
|
||||||
cu_device *device = (cu_device *)calloc(1, sizeof(cu_device));
|
cu_device *device = (cu_device *)calloc(1, sizeof(cu_device));
|
||||||
if (device == NULL)
|
if (device == NULL)
|
||||||
|
@ -28,11 +28,7 @@ int init_device() {
|
||||||
device_max_compute_units = device->max_compute_units;
|
device_max_compute_units = device->max_compute_units;
|
||||||
|
|
||||||
// initialize scheduler
|
// initialize scheduler
|
||||||
int ret = scheduler_init(*device);
|
return scheduler_init(*device);
|
||||||
if (ret != C_SUCCESS)
|
|
||||||
return ret;
|
|
||||||
|
|
||||||
return C_SUCCESS;
|
|
||||||
}
|
}
|
||||||
|
|
||||||
// Create Kernel
|
// Create Kernel
|
||||||
|
@ -84,7 +80,8 @@ int schedulerEnqueueKernel(cu_kernel *k) {
|
||||||
k->totalBlocks; // calculate gpu_block_to_execute_per_cpu_thread
|
k->totalBlocks; // calculate gpu_block_to_execute_per_cpu_thread
|
||||||
int gpuBlockToExecutePerCpuThread =
|
int gpuBlockToExecutePerCpuThread =
|
||||||
(totalBlocks + device_max_compute_units - 1) / device_max_compute_units;
|
(totalBlocks + device_max_compute_units - 1) / device_max_compute_units;
|
||||||
TaskToExecute = 0;
|
TaskToExecute = (totalBlocks + gpuBlockToExecutePerCpuThread - 1) /
|
||||||
|
gpuBlockToExecutePerCpuThread;
|
||||||
for (int startBlockIdx = 0; startBlockIdx < totalBlocks;
|
for (int startBlockIdx = 0; startBlockIdx < totalBlocks;
|
||||||
startBlockIdx += gpuBlockToExecutePerCpuThread) {
|
startBlockIdx += gpuBlockToExecutePerCpuThread) {
|
||||||
cu_kernel *p = new cu_kernel(*k);
|
cu_kernel *p = new cu_kernel(*k);
|
||||||
|
@ -92,7 +89,6 @@ int schedulerEnqueueKernel(cu_kernel *k) {
|
||||||
p->endBlockId = std::min(startBlockIdx + gpuBlockToExecutePerCpuThread - 1,
|
p->endBlockId = std::min(startBlockIdx + gpuBlockToExecutePerCpuThread - 1,
|
||||||
totalBlocks - 1);
|
totalBlocks - 1);
|
||||||
scheduler->kernelQueue->enqueue(p);
|
scheduler->kernelQueue->enqueue(p);
|
||||||
TaskToExecute++;
|
|
||||||
}
|
}
|
||||||
return C_SUCCESS;
|
return C_SUCCESS;
|
||||||
}
|
}
|
||||||
|
@ -107,16 +103,12 @@ int cuLaunchKernel(cu_kernel **k) {
|
||||||
// Calculate Block Size N/numBlocks
|
// Calculate Block Size N/numBlocks
|
||||||
cu_kernel *ker = *k;
|
cu_kernel *ker = *k;
|
||||||
int status = C_RUN;
|
int status = C_RUN;
|
||||||
// stream == 0 add to the kernelQueue
|
|
||||||
if (ker->stream == 0) {
|
|
||||||
// set complete to false, this variable is used for sync
|
// set complete to false, this variable is used for sync
|
||||||
for (int i = 0; i < scheduler->num_worker_threads; i++) {
|
for (int i = 0; i < scheduler->num_worker_threads; i++) {
|
||||||
scheduler->thread_pool[i].completeTask = 0;
|
scheduler->thread_pool[i].completeTask = 0;
|
||||||
}
|
}
|
||||||
schedulerEnqueueKernel(ker);
|
schedulerEnqueueKernel(ker);
|
||||||
} else {
|
|
||||||
assert(0 && "MultiStream no implemente\n");
|
|
||||||
}
|
|
||||||
return 0;
|
return 0;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
Loading…
Reference in New Issue