implement multistream APIs for CPU backend
parent ca089c4274
commit f712c30b09
@@ -22,6 +22,7 @@ cudaError_t cudaSetDevice(int device);
 cudaError_t cudaStreamCopyAttributes(cudaStream_t dst, cudaStream_t src);
 cudaError_t cudaStreamCreate(cudaStream_t *pStream);
 cudaError_t cudaStreamDestroy(cudaStream_t stream);
+cudaError_t cudaStreamQuery(cudaStream_t stream);
 cudaError_t cudaStreamSynchronize(cudaStream_t stream);
 }
 #endif
@@ -107,21 +107,18 @@ cudaError_t cudaStreamCopyAttributes(cudaStream_t dst, cudaStream_t src) {
 
 static int stream_counter = 1;
 /*
-  cudaStream_t is a Opaque Structure
-  Overwrites cudaStream_t into custom cstreamData structure
-  (does hardware uses the cudaStream_t stream)
+From our evaluation, the CPU backend gains little benefit
+from multiple streams, so we only use a single stream.
 */
-cudaError_t cudaStreamCreate(cudaStream_t *pStream) {
-  assert(0 && "cudaStreamCreate no Implement\n");
-}
+cudaError_t cudaStreamCreate(cudaStream_t *pStream) { return cudaSuccess; }
 
-cudaError_t cudaStreamDestroy(cudaStream_t stream) {
-  assert(0 && "cudaStreamDestroy No Implement\n");
-}
+cudaError_t cudaStreamDestroy(cudaStream_t stream) { return cudaSuccess; }
 
-cudaError_t cudaStreamSynchronize(cudaStream_t stream) {
-  assert(0 && "cudaStreamSynchronize No Implement\n");
-}
+// Every kernel launch is followed by a sync, so a stream is
+// always idle by the time it is queried.
+cudaError_t cudaStreamQuery(cudaStream_t stream) { return cudaSuccess; }
+
+cudaError_t cudaStreamSynchronize(cudaStream_t stream) { return cudaSuccess; }
 
 cudaError_t cudaGetDeviceCount(int *count) {
   // dummy value
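Because every stream entry point above is a no-op that reports cudaSuccess, caller code written against multiple streams runs unchanged on the CPU backend. A minimal caller-side sketch, assuming the declarations from the header hunk are in scope (the helper run_two_streams and its body are illustrative, not part of this patch):

#include <cassert>

void run_two_streams() {  // hypothetical helper, not in this repo
  cudaStream_t s0, s1;
  assert(cudaStreamCreate(&s0) == cudaSuccess);  // no-op on CPU backend
  assert(cudaStreamCreate(&s1) == cudaSuccess);

  // ... launch kernels "on" s0 and s1; the backend serializes them,
  // and each launch is followed by a sync ...

  assert(cudaStreamQuery(s0) == cudaSuccess);        // always idle
  assert(cudaStreamSynchronize(s1) == cudaSuccess);  // also a no-op

  assert(cudaStreamDestroy(s0) == cudaSuccess);
  assert(cudaStreamDestroy(s1) == cudaSuccess);
}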
@@ -16,7 +16,7 @@ int device_max_compute_units = 1;
 bool device_initilized = false;
 int init_device() {
   if (device_initilized)
-    return 0;
+    return C_SUCCESS;
   device_initilized = true;
   cu_device *device = (cu_device *)calloc(1, sizeof(cu_device));
   if (device == NULL)
@@ -28,11 +28,7 @@ int init_device() {
   device_max_compute_units = device->max_compute_units;
 
   // initialize scheduler
-  int ret = scheduler_init(*device);
-  if (ret != C_SUCCESS)
-    return ret;
-
-  return C_SUCCESS;
+  return scheduler_init(*device);
 }
 
 // Create Kernel
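The simplified init_device now forwards scheduler_init's status directly, and a repeat call short-circuits through the device_initilized guard. A small sketch of the resulting calling convention (the call site launch_setup is hypothetical):

// Hypothetical call site: init_device() is safe to call more than once
// and returns C_SUCCESS, or scheduler_init's error code, either way.
int launch_setup() {
  int ret = init_device();  // first call: allocates device, inits scheduler
  if (ret != C_SUCCESS)
    return ret;             // propagate scheduler_init failures unchanged
  return init_device();     // second call: guard returns C_SUCCESS early
}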
@@ -84,7 +80,8 @@ int schedulerEnqueueKernel(cu_kernel *k) {
       k->totalBlocks; // calculate gpu_block_to_execute_per_cpu_thread
   int gpuBlockToExecutePerCpuThread =
       (totalBlocks + device_max_compute_units - 1) / device_max_compute_units;
-  TaskToExecute = 0;
+  TaskToExecute = (totalBlocks + gpuBlockToExecutePerCpuThread - 1) /
+                  gpuBlockToExecutePerCpuThread;
   for (int startBlockIdx = 0; startBlockIdx < totalBlocks;
        startBlockIdx += gpuBlockToExecutePerCpuThread) {
     cu_kernel *p = new cu_kernel(*k);
@@ -92,7 +89,6 @@ int schedulerEnqueueKernel(cu_kernel *k) {
     p->endBlockId = std::min(startBlockIdx + gpuBlockToExecutePerCpuThread - 1,
                              totalBlocks - 1);
     scheduler->kernelQueue->enqueue(p);
-    TaskToExecute++;
   }
   return C_SUCCESS;
 }
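TaskToExecute is now computed up front with the same ceiling division used for the chunk size, instead of being incremented once per enqueued chunk; both formulations count the chunks the loop produces. A standalone sketch of the arithmetic (variable names mirror the patch; the concrete values are made up for illustration):

#include <algorithm>
#include <cstdio>

int main() {
  int totalBlocks = 10;              // example: a 10-block grid
  int device_max_compute_units = 4;  // example: 4 worker threads

  // ceil(10 / 4) = 3 blocks handled per CPU thread
  int gpuBlockToExecutePerCpuThread =
      (totalBlocks + device_max_compute_units - 1) / device_max_compute_units;

  // ceil(10 / 3) = 4, exactly the number of chunks enqueued below
  int TaskToExecute = (totalBlocks + gpuBlockToExecutePerCpuThread - 1) /
                      gpuBlockToExecutePerCpuThread;

  for (int startBlockIdx = 0; startBlockIdx < totalBlocks;
       startBlockIdx += gpuBlockToExecutePerCpuThread) {
    int endBlockId = std::min(
        startBlockIdx + gpuBlockToExecutePerCpuThread - 1, totalBlocks - 1);
    std::printf("task: blocks [%d, %d]\n", startBlockIdx, endBlockId);
  }
  std::printf("TaskToExecute = %d\n", TaskToExecute);  // prints 4
  return 0;
}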
@@ -107,16 +103,12 @@ int cuLaunchKernel(cu_kernel **k) {
   // Calculate Block Size N/numBlocks
   cu_kernel *ker = *k;
   int status = C_RUN;
-  // stream == 0 add to the kernelQueue
-  if (ker->stream == 0) {
-    // set complete to false, this variable is used for sync
-    for (int i = 0; i < scheduler->num_worker_threads; i++) {
-      scheduler->thread_pool[i].completeTask = 0;
-    }
-    schedulerEnqueueKernel(ker);
-  } else {
-    assert(0 && "MultiStream no implemente\n");
+  // reset completeTask; this counter is used for sync
+  for (int i = 0; i < scheduler->num_worker_threads; i++) {
+    scheduler->thread_pool[i].completeTask = 0;
   }
+  schedulerEnqueueKernel(ker);
 
   return 0;
 }
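With streams collapsed to a single queue, every launch funnels through schedulerEnqueueKernel after zeroing each worker's completeTask counter. The diff does not show the wait side of that handshake, so the following is only a plausible sketch of how completeTask and TaskToExecute pair up; the Worker type and waitAllTasks loop are assumptions, not this repo's code:

#include <atomic>
#include <vector>

// Assumed shape: each worker thread counts the chunks it has finished.
struct Worker {
  std::atomic<int> completeTask{0};
};

// Hypothetical host-side wait: the launch is complete once the per-worker
// counters sum to TaskToExecute, the chunk count computed at enqueue time.
void waitAllTasks(std::vector<Worker> &pool, int TaskToExecute) {
  int done = 0;
  do {
    done = 0;
    for (auto &w : pool)
      done += w.completeTask.load(std::memory_order_acquire);
  } while (done < TaskToExecute);
}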