diff --git a/runtime/CMakeLists.txt b/runtime/CMakeLists.txt index ac46896..4c4e8e7 100644 --- a/runtime/CMakeLists.txt +++ b/runtime/CMakeLists.txt @@ -10,6 +10,7 @@ set(CMAKE_VERBOSE_MAKEFILE ON) add_subdirectory(threadPool) # compile x86 runtime library +include_directories(../common) include_directories(./include/) include_directories(./include/x86) include_directories(./threadPool/include/) diff --git a/runtime/include/x86/cudaKernelImpl.h b/runtime/include/x86/cudaKernelImpl.h index fbccbe9..e65d0c1 100644 --- a/runtime/include/x86/cudaKernelImpl.h +++ b/runtime/include/x86/cudaKernelImpl.h @@ -1,6 +1,5 @@ #ifndef __RUNTIME_IMPL__ #define __KERNEL_IMPL__ -#include "cudaStatus.h" #include "structures.h" #include extern "C" { diff --git a/runtime/include/x86/cudaRuntimeImpl.h b/runtime/include/x86/cudaRuntimeImpl.h index 1823206..206d686 100644 --- a/runtime/include/x86/cudaRuntimeImpl.h +++ b/runtime/include/x86/cudaRuntimeImpl.h @@ -1,6 +1,5 @@ #ifndef __RUNTIME_IMPL__ #define __RUNTIME_IMPL__ -#include "cudaStatus.h" #include "structures.h" #include extern "C" { diff --git a/runtime/include/x86/cudaStatus.h b/runtime/include/x86/cudaStatus.h deleted file mode 100644 index 79b5b19..0000000 --- a/runtime/include/x86/cudaStatus.h +++ /dev/null @@ -1,18 +0,0 @@ -#ifndef __RUNTIME_STATUS__ -#define __RUNTIME_STATUS__ -#include - -// enum cudaError_t { -// CudaSuccess = 0, -// CudaErrorInvalidValue = 1, -// CudaErrorInvalidMemoryAllocation = 2, -// }; - -// enum cudaMemcpyKind { -// cudaMemcpyHostToHost = 0, -// cudaMemcpyHostToDevice = 1, -// cudaMemcpyDeviceToHost = 2, -// cudaMemcpyDeviceToDevice = 3, -// cudaMemcpyDefault = 4, -// }; -#endif diff --git a/runtime/src/vortex/cudaKernelImpl.cpp b/runtime/src/vortex/cudaKernelImpl.cpp deleted file mode 100644 index 56803c4..0000000 --- a/runtime/src/vortex/cudaKernelImpl.cpp +++ /dev/null @@ -1,19 +0,0 @@ -#include "cudaKernelImpl.h" -#include -double __nv_exp(double base) { return exp(base); } -double __nv_sqrt(double v) { return sqrt(v); } -float __nv_sqrtf(float v) { return sqrt(v); } -float __nv_powif(float base, int exp) { return pow(base, exp); } -float __nv_logf(float v) { return logf(v); } -float __nv_expf(float v) { return expf(v); } -float __nv_log10f(float v) { return log10f(v); } -float __nv_fast_log2f(float v) { return log2f(v); } -double __nv_powi(double base, int exp) { return pow(base, exp); } -float __nv_powf(float base, float exp) { return pow(base, exp); } -float __nv_fast_powf(float base, float exp) { return pow(base, exp); } -float __nv_fmodf(float x, float y) { return fmod(x, y); } -int __nv_isnanf(float v) { return isnan(v); } -int __nv_isinff(float v) { return isinf(v); } -float __nv_fabsf(float v) { return abs(v); } -int __nvvm_mul24_i(int a, int b) { return a * b; } -double _ZL3expd(double base) { return exp(base); } diff --git a/runtime/src/vortex/cudaRuntimeImpl.cpp b/runtime/src/vortex/cudaRuntimeImpl.cpp deleted file mode 100644 index fb2012a..0000000 --- a/runtime/src/vortex/cudaRuntimeImpl.cpp +++ /dev/null @@ -1,251 +0,0 @@ -#include "cudaRuntimeImpl.h" -#include "api.h" -#include "cuda_runtime.h" -#include "def.h" -#include "macros.h" -#include "structures.h" -#include -#include -#include -#include -#include -cudaError_t cudaGetDevice(int *devPtr) { *devPtr = 0; } -const char *cudaGetErrorName(cudaError_t error) { return "SUCCESS\n"; } -cudaError_t cudaDeviceReset(void) { scheduler_uninit(); } -cudaError_t cudaDeviceSynchronize(void) { cuSynchronizeBarrier(); } -cudaError_t cudaThreadSynchronize(void) { cuSynchronizeBarrier(); } -cudaError_t cudaFree(void *devPtr) { free(devPtr); } -cudaError_t cudaFreeHost(void *devPtr) { free(devPtr); } - -cudaError_t cudaLaunchKernel(const void *func, dim3 gridDim, dim3 blockDim, - void **args, size_t sharedMem, - cudaStream_t stream) { - // if scheduler is null init device - // printf( - // "cudaLaunchKernel : Grid: x:%d y:%d z:%d Block: %d, %d, %d ShMem:%lu\n - // ", gridDim.x, gridDim.y, gridDim.z, blockDim.x, blockDim.y, blockDim.z, - // sharedMem); - - cu_kernel *ker = - create_kernel(func, gridDim, blockDim, args, sharedMem, stream); - - int lstatus = cuLaunchKernel(&ker); - - // std::cout << "ret cudaLKernel" << std::endl; -} -cudaError_t cudaMalloc(void **devPtr, size_t size) { - *devPtr = malloc(size); - if (devPtr == NULL) - return cudaErrorMemoryAllocation; - return cudaSuccess; -} -cudaError_t cudaMallocHost(void **devPtr, size_t size) { - *devPtr = malloc(size); - if (devPtr == NULL) - return cudaErrorMemoryAllocation; - return cudaSuccess; -} -cudaError_t cudaMemset(void *devPtr, int value, size_t count) { - memset(devPtr, value, count); - return cudaSuccess; -} -cudaError_t cudaMemcpy(void *dst, const void *src, size_t count, - cudaMemcpyKind kind) { - if (kind == cudaMemcpyHostToHost) { - memcpy(dst, src, count); - } else if (kind == cudaMemcpyDeviceToHost) { - // how does the code know which device accessing the memory - memcpy(dst, src, count); - } else if (kind == cudaMemcpyHostToDevice) { - // how does the code know which device accessing the memory - memcpy(dst, src, count); - } else if (kind == cudaMemcpyDeviceToHost) { - // how does the code know which device accessing the memory - memcpy(dst, src, count); - } else if (kind == cudaMemcpyDeviceToDevice) { - - memcpy(dst, src, count); - } else if (kind == cudaMemcpyDefault) { - memcpy(dst, src, count); - } - return cudaSuccess; -} - -cudaError_t cudaMemcpyToSymbol_host(void *dst, const void *src, size_t count, - size_t offset, cudaMemcpyKind kind) { - assert(offset == 0 && "DO not support offset !=0\n"); - memcpy(dst, src + offset, count); - return cudaSuccess; -} - -cudaError_t cudaSetDevice(int device) { - // error checking - // std::cout << "cudaSetDevice Called" << std::endl; - init_device(); - // std::cout << "cudaSetDevice Ret" << std::endl; -} - -cudaError_t cudaStreamCopyAttributes(cudaStream_t dst, cudaStream_t src) { - cstreamData *dst_stream = (cstreamData *)dst; - cstreamData *src_stream = (cstreamData *)src; - - if (dst_stream == NULL || src_stream == NULL) { - return cudaErrorInvalidValue; // 1 - } - - dst_stream->stream_priority = src_stream->stream_priority; - dst_stream->stream_flags = src_stream->stream_flags; - - return cudaSuccess; // 0 -} - -static int stream_counter = 1; -/* - cudaStream_t is a Opaque Structure - - Overwrites cudaStream_t into custom cstreamData structure - (does hardware uses the cudaStream_t stream) - -*/ -cudaError_t cudaStreamCreate(cudaStream_t *pStream) { - cstreamData *s = (cstreamData *)calloc(1, sizeof(cstreamData)); - if (s == NULL) - return cudaErrorMemoryAllocation; - s->ev.status = C_RUN; - s->id = stream_counter; - stream_counter++; - s->stream_priority = DEFAULT; - create_KernelQueue(&(s->kernelQueue)); - - INIT_LOCK(s->stream_lock); - *pStream = (cudaStream_t)(s); - - return cudaSuccess; -} - -cudaError_t cudaStreamDestroy(cudaStream_t stream) { - cstreamData *s = (cstreamData *)(stream); - - free(s->kernelQueue); - - DESTROY_LOCK(s->stream_lock); - - free(s); - - return cudaSuccess; -} - -cudaError_t cudaStreamSynchronize(cudaStream_t stream) { - cstreamData *e = ((cstreamData *)(stream)); - MUTEX_LOCK(e->stream_lock); - - e->ev.status = C_SYNCHRONIZE; - e->ev.numKernelsToWait = e->kernelQueue->waiting_count; - MUTEX_UNLOCK(e->stream_lock); -} - -cudaError_t cudaGetDeviceCount(int *count) { - // dummy value - *count = 1; -} - -cudaError_t cudaGetDeviceProperties(cudaDeviceProp *deviceProp, int device) { - - // dummy values - if (device == 0) { - strcpy(deviceProp->name, "pthread"); - deviceProp->totalGlobalMem = 0; - deviceProp->sharedMemPerBlock = 0; - deviceProp->regsPerBlock = 0; - deviceProp->warpSize = 0; - deviceProp->memPitch = 0; - deviceProp->maxThreadsPerBlock = 0; - deviceProp->maxThreadsDim[0] = 1; - deviceProp->maxThreadsDim[1] = 1; - deviceProp->maxThreadsDim[2] = 1; - - deviceProp->maxGridSize[0] = 1; - deviceProp->maxGridSize[1] = 1; - deviceProp->maxGridSize[2] = 1; - - deviceProp->totalConstMem = 0; - deviceProp->major = 0; - deviceProp->minor = 0; - deviceProp->clockRate = 0; - deviceProp->textureAlignment = 0; - deviceProp->deviceOverlap = false; - deviceProp->multiProcessorCount = 0; - } - return cudaSuccess; -} - -static cudaError_t lastError = cudaSuccess; -const char *cudaGetErrorString(cudaError_t error) { - if (error == cudaSuccess) { - return "Cuda Get Error Success"; - } -} - -cudaError_t cudaGetLastError(void) { return lastError; } - -static callParams callParamTemp; - -/* - Internal Cuda Library Functions -*/ -extern "C" { - -extern cudaError_t CUDARTAPI __cudaPopCallConfiguration(dim3 *gridDim, - dim3 *blockDim, - size_t *sharedMem, - void **stream) { - // printf("__cudaPopCallConfiguration: Grid: x:%d y:%d z:%d Block: %d, %d, %d - // ShMem: %lu\n", - // gridDim->x, gridDim->y, gridDim->z, blockDim->x, blockDim->y, blockDim->z, - // *sharedMem); - - *gridDim = callParamTemp.gridDim; - *blockDim = callParamTemp.blockDim; - *sharedMem = callParamTemp.shareMem; - *stream = callParamTemp.stream; - - // printf("__cudaPopCallConfiguration After : Grid: x:%d y:%d z:%d Block: %d, - // %d, %d ShMem: %lu\n", gridDim->x, gridDim->y, gridDim->z, blockDim->x, - // blockDim->y, blockDim->z, *sharedMem); - - // exit(1); - - return cudaSuccess; -} - -extern __host__ __device__ unsigned CUDARTAPI __cudaPushCallConfiguration( - dim3 gridDim, dim3 blockDim, size_t sharedMem = 0, void *stream = 0) { - - // printf("__cudaPushCallConfiguration Grid: x:%d y:%d z:%d Block: %d, %d, %d - // " - // "ShMem: %lu\n ", - // gridDim.x, gridDim.y, gridDim.z, blockDim.x, blockDim.y, blockDim.z, - // sharedMem); - - // memory checks allocations - callParamTemp.gridDim = gridDim; - - // std::cout << "assign gridDim" << std::endl; - - callParamTemp.blockDim = blockDim; - // std::cout << "assign blockDim" << std::endl; - callParamTemp.shareMem = sharedMem; - // std::cout << "assign shareMem" << std::endl; - (callParamTemp.stream) = stream; - - // printf("__cudaPushCallConfiguration After Grid: x:%d y:%d z:%d Block: %d, - // %d, %d ShMem: %lu\n", - // gridDim.x, gridDim.y, gridDim.z, blockDim.x, blockDim.y, blockDim.z, - // sharedMem); - - // return 0 continues the Pop - return cudaSuccess; - - // return ne 0 skips the Pop -} -} diff --git a/runtime/src/x86/cudaRuntimeImpl.cpp b/runtime/src/x86/cudaRuntimeImpl.cpp index b36761c..8648034 100644 --- a/runtime/src/x86/cudaRuntimeImpl.cpp +++ b/runtime/src/x86/cudaRuntimeImpl.cpp @@ -1,6 +1,7 @@ #include "cudaRuntimeImpl.h" #include "api.h" #include "cuda_runtime.h" +#include "debug.hpp" #include "def.h" #include "macros.h" #include "structures.h" @@ -38,11 +39,10 @@ cudaError_t cudaFreeHost(void *devPtr) { cudaError_t cudaLaunchKernel(const void *func, dim3 gridDim, dim3 blockDim, void **args, size_t sharedMem, cudaStream_t stream) { - // if scheduler is null init device - // printf( - // "cudaLaunchKernel : Grid: x:%d y:%d z:%d Block: %d, %d, %d ShMem:%lu\n - // ", gridDim.x, gridDim.y, gridDim.z, blockDim.x, blockDim.y, blockDim.z, - // sharedMem); + DEBUG_INFO( + "cudaLaunchKernel : Grid: x:%d y:%d z:%d Block: %d, %d, %d ShMem: %d\n", + gridDim.x, gridDim.y, gridDim.z, blockDim.x, blockDim.y, blockDim.z, + sharedMem); cu_kernel *ker = create_kernel(func, gridDim, blockDim, args, sharedMem, stream); @@ -66,16 +66,12 @@ cudaError_t cudaMemcpy(void *dst, const void *src, size_t count, if (kind == cudaMemcpyHostToHost) { memcpy(dst, src, count); } else if (kind == cudaMemcpyDeviceToHost) { - // how does the code know which device accessing the memory memcpy(dst, src, count); } else if (kind == cudaMemcpyHostToDevice) { - // how does the code know which device accessing the memory memcpy(dst, src, count); } else if (kind == cudaMemcpyDeviceToHost) { - // how does the code know which device accessing the memory memcpy(dst, src, count); } else if (kind == cudaMemcpyDeviceToDevice) { - memcpy(dst, dst, count); } else if (kind == cudaMemcpyDefault) { memcpy(dst, src, count); @@ -100,36 +96,31 @@ cudaError_t cudaStreamCopyAttributes(cudaStream_t dst, cudaStream_t src) { cstreamData *src_stream = (cstreamData *)src; if (dst_stream == NULL || src_stream == NULL) { - return cudaErrorInvalidValue; // 1 + return cudaErrorInvalidValue; } dst_stream->stream_priority = src_stream->stream_priority; dst_stream->stream_flags = src_stream->stream_flags; - return cudaSuccess; // 0 + return cudaSuccess; } static int stream_counter = 1; /* cudaStream_t is a Opaque Structure - Overwrites cudaStream_t into custom cstreamData structure (does hardware uses the cudaStream_t stream) - */ cudaError_t cudaStreamCreate(cudaStream_t *pStream) { - printf("cudaStreamCreate no Implement\n"); - exit(1); + assert(0 && "cudaStreamCreate no Implement\n"); } cudaError_t cudaStreamDestroy(cudaStream_t stream) { - printf("cudaStreamDestroy No Implement\n"); - exit(1); + assert(0 && "cudaStreamDestroy No Implement\n"); } cudaError_t cudaStreamSynchronize(cudaStream_t stream) { - printf("cudaStreamSynchronize No Implement\n"); - exit(1); + assert(0 && "cudaStreamSynchronize No Implement\n"); } cudaError_t cudaGetDeviceCount(int *count) { @@ -139,7 +130,6 @@ cudaError_t cudaGetDeviceCount(int *count) { } cudaError_t cudaGetDeviceProperties(cudaDeviceProp *deviceProp, int device) { - // dummy values if (device == 0) { strcpy(deviceProp->name, "pthread"); @@ -188,53 +178,31 @@ extern cudaError_t CUDARTAPI __cudaPopCallConfiguration(dim3 *gridDim, dim3 *blockDim, size_t *sharedMem, void **stream) { - // printf("__cudaPopCallConfiguration: Grid: x:%d y:%d z:%d Block: %d, %d, %d - // ShMem: %lu\n", - // gridDim->x, gridDim->y, gridDim->z, blockDim->x, blockDim->y, blockDim->z, - // *sharedMem); + DEBUG_INFO("__cudaPopCallConfiguration: Grid: x:%d y:%d z:%d Block: %d, %d, " + "%d ShMem: %lu\n", + gridDim->x, gridDim->y, gridDim->z, blockDim->x, blockDim->y, + blockDim->z, *sharedMem); *gridDim = callParamTemp.gridDim; *blockDim = callParamTemp.blockDim; *sharedMem = callParamTemp.shareMem; *stream = callParamTemp.stream; - // printf("__cudaPopCallConfiguration After : Grid: x:%d y:%d z:%d Block: %d, - // %d, %d ShMem: %lu\n", gridDim->x, gridDim->y, gridDim->z, blockDim->x, - // blockDim->y, blockDim->z, *sharedMem); - - // exit(1); - return cudaSuccess; } extern __host__ __device__ unsigned CUDARTAPI __cudaPushCallConfiguration( dim3 gridDim, dim3 blockDim, size_t sharedMem = 0, void *stream = 0) { - - // printf("__cudaPushCallConfiguration Grid: x:%d y:%d z:%d Block: %d, %d, %d - // " - // "ShMem: %lu\n ", - // gridDim.x, gridDim.y, gridDim.z, blockDim.x, blockDim.y, blockDim.z, - // sharedMem); + DEBUG_INFO("__cudaPushCallConfiguration: Grid: x:%d y:%d z:%d Block: %d, %d, " + "%d ShMem: %lu\n", + gridDim.x, gridDim.y, gridDim.z, blockDim.x, blockDim.y, + blockDim.z, sharedMem); // memory checks allocations callParamTemp.gridDim = gridDim; - - // std::cout << "assign gridDim" << std::endl; - callParamTemp.blockDim = blockDim; - // std::cout << "assign blockDim" << std::endl; callParamTemp.shareMem = sharedMem; - // std::cout << "assign shareMem" << std::endl; (callParamTemp.stream) = stream; - - // printf("__cudaPushCallConfiguration After Grid: x:%d y:%d z:%d Block: %d, - // %d, %d ShMem: %lu\n", - // gridDim.x, gridDim.y, gridDim.z, blockDim.x, blockDim.y, blockDim.z, - // sharedMem); - - // return 0 continues the Pop return cudaSuccess; - - // return ne 0 skips the Pop } } diff --git a/runtime/threadPool/CMakeLists.txt b/runtime/threadPool/CMakeLists.txt index f36d45a..5d53566 100644 --- a/runtime/threadPool/CMakeLists.txt +++ b/runtime/threadPool/CMakeLists.txt @@ -11,6 +11,7 @@ set(LIB_NAME threadPool) set(CMAKE_CXX_STANDARD 14) set(CMAKE_BUILD_TYPE Debug) +include_directories(../../common) include_directories(./include) include_directories(./include/x86) include_directories(../../external/moodycamel) diff --git a/runtime/threadPool/src/x86/api.cpp b/runtime/threadPool/src/x86/api.cpp index ad34b36..18347d6 100644 --- a/runtime/threadPool/src/x86/api.cpp +++ b/runtime/threadPool/src/x86/api.cpp @@ -1,5 +1,6 @@ #include "api.h" #include "blockingconcurrentqueue.h" +#include "debug.hpp" #include "def.h" #include "macros.h" #include "structures.h" @@ -22,8 +23,8 @@ int init_device() { return C_ERROR_MEMALLOC; device->max_compute_units = std::thread::hardware_concurrency(); - std::cout << device->max_compute_units - << " concurrent threads are supported.\n"; + DEBUG_INFO("%d concurrent threads are supported.\n", + device->max_compute_units); device_max_compute_units = device->max_compute_units; // initialize scheduler @@ -46,13 +47,9 @@ cu_kernel *create_kernel(const void *func, dim3 gridDim, dim3 blockDim, ker->gridDim = gridDim; ker->blockDim = blockDim; - ker->shared_mem = sharedMem; - ker->stream = stream; - ker->totalBlocks = gridDim.x * gridDim.y * gridDim.z; - ker->blockSize = blockDim.x * blockDim.y * blockDim.z; return ker; } @@ -97,9 +94,6 @@ int schedulerEnqueueKernel(cu_kernel *k) { scheduler->kernelQueue->enqueue(p); TaskToExecute++; } - - // printf("total: %d execute per cpu: %d\n", totalBlocks, - // gpuBlockToExecutePerCpuThread); return C_SUCCESS; } @@ -121,8 +115,7 @@ int cuLaunchKernel(cu_kernel **k) { } schedulerEnqueueKernel(ker); } else { - printf("MultiStream no implemente\n"); - exit(1); + assert(0 && "MultiStream no implemente\n"); } return 0; } @@ -185,11 +178,9 @@ void *driver_thread(void *p) { // exit the routine if (is_exit) { td->exit = true; - // pthread_exit pthread_exit(NULL); } else { - printf("driver thread stop incorrectly\n"); - exit(1); + assert(0 && "driver thread stop incorrectly\n"); } } @@ -215,29 +206,17 @@ int scheduler_init(cu_device device) { return C_SUCCESS; } -void scheduler_uninit() { - printf("Scheduler Unitit no Implemente\n"); - exit(1); -} +void scheduler_uninit() { assert(0 && "Scheduler Unitit no Implemente\n"); } /* Barrier for Kernel Launch - - During kernel launch, increment the number of work items required to finish - Each kernel will point to the same event - - During Running Command, decrement the event.work_item count - when count is 0, all work items for this kernel launch is finish - - Sense Like Barrier - Counting Barrier basically */ void cuSynchronizeBarrier() { if (!device_initilized) { init_device(); } while (1) { - // (TODO): currently, we assume each kernel launch has a + // after compilation transformation, each kernel launch has a // following sync if (scheduler->kernelQueue->size_approx() == 0) { int completeBlock = 0;