diff --git a/compilation/CMakeLists.txt b/compilation/CMakeLists.txt index 868f9c4..e7fec26 100644 --- a/compilation/CMakeLists.txt +++ b/compilation/CMakeLists.txt @@ -7,14 +7,14 @@ project( set(CMAKE_VERBOSE_MAKEFILE ON) # compile kernel translator -include_directories(./KernelTranslation/include) +include_directories(./KernelTranslation/include/x86) add_subdirectory(KernelTranslation) add_executable(kernelTranslator KernelTranslation.cpp) target_link_libraries(kernelTranslator spmd2mpmd ${GCC_COVERAGE_LINK_FLAGS}) # compile host translator -include_directories(./HostTranslation/include) +include_directories(./HostTranslation/include/x86) add_subdirectory(HostTranslation) add_executable(hostTranslator HostTranslation.cpp) diff --git a/compilation/HostTranslation/CMakeLists.txt b/compilation/HostTranslation/CMakeLists.txt index 2e97dd6..c219077 100644 --- a/compilation/HostTranslation/CMakeLists.txt +++ b/compilation/HostTranslation/CMakeLists.txt @@ -12,10 +12,10 @@ set(LIB_NAME cudaRuntime2cpuRuntime) set(CMAKE_CXX_STANDARD 14) set(CMAKE_BUILD_TYPE Debug) -include_directories(./include) +include_directories(./include/x86) -file(GLOB proj_HEADERS "include/*.h") -file(GLOB proj_SOURCES "lib/*.cpp") +file(GLOB proj_HEADERS "include/x86/*.h") +file(GLOB proj_SOURCES "src/x86/*.cpp") # Add core library. add_library(${LIB_NAME} SHARED ${proj_HEADERS} ${proj_SOURCES}) diff --git a/compilation/HostTranslation/include/RemoveCudaBuiltin.h b/compilation/HostTranslation/include/x86/RemoveCudaBuiltin.h similarity index 100% rename from compilation/HostTranslation/include/RemoveCudaBuiltin.h rename to compilation/HostTranslation/include/x86/RemoveCudaBuiltin.h diff --git a/compilation/HostTranslation/include/ReplaceConstantMemory.h b/compilation/HostTranslation/include/x86/ReplaceConstantMemory.h similarity index 100% rename from compilation/HostTranslation/include/ReplaceConstantMemory.h rename to compilation/HostTranslation/include/x86/ReplaceConstantMemory.h diff --git a/compilation/HostTranslation/include/ReplaceCudaBuiltin.h b/compilation/HostTranslation/include/x86/ReplaceCudaBuiltin.h similarity index 100% rename from compilation/HostTranslation/include/ReplaceCudaBuiltin.h rename to compilation/HostTranslation/include/x86/ReplaceCudaBuiltin.h diff --git a/compilation/HostTranslation/include/ReplaceKernelArgs.h b/compilation/HostTranslation/include/x86/ReplaceKernelArgs.h similarity index 100% rename from compilation/HostTranslation/include/ReplaceKernelArgs.h rename to compilation/HostTranslation/include/x86/ReplaceKernelArgs.h diff --git a/compilation/HostTranslation/lib/GenerateHostStub.cpp b/compilation/HostTranslation/src/x86/GenerateHostStub.cpp similarity index 100% rename from compilation/HostTranslation/lib/GenerateHostStub.cpp rename to compilation/HostTranslation/src/x86/GenerateHostStub.cpp diff --git a/compilation/HostTranslation/lib/InitializeDevice.cpp b/compilation/HostTranslation/src/x86/InitializeDevice.cpp similarity index 100% rename from compilation/HostTranslation/lib/InitializeDevice.cpp rename to compilation/HostTranslation/src/x86/InitializeDevice.cpp diff --git a/compilation/HostTranslation/lib/RemoveCudaBuiltin.cpp b/compilation/HostTranslation/src/x86/RemoveCudaBuiltin.cpp similarity index 100% rename from compilation/HostTranslation/lib/RemoveCudaBuiltin.cpp rename to compilation/HostTranslation/src/x86/RemoveCudaBuiltin.cpp diff --git a/compilation/HostTranslation/lib/ReplaceConstantMemory.cpp b/compilation/HostTranslation/src/x86/ReplaceConstantMemory.cpp similarity index 100% rename from compilation/HostTranslation/lib/ReplaceConstantMemory.cpp rename to compilation/HostTranslation/src/x86/ReplaceConstantMemory.cpp diff --git a/compilation/HostTranslation/lib/ReplaceCudaBuiltin.cpp b/compilation/HostTranslation/src/x86/ReplaceCudaBuiltin.cpp similarity index 100% rename from compilation/HostTranslation/lib/ReplaceCudaBuiltin.cpp rename to compilation/HostTranslation/src/x86/ReplaceCudaBuiltin.cpp diff --git a/compilation/HostTranslation/lib/ReplaceKernelArgs.cpp b/compilation/HostTranslation/src/x86/ReplaceKernelArgs.cpp similarity index 100% rename from compilation/HostTranslation/lib/ReplaceKernelArgs.cpp rename to compilation/HostTranslation/src/x86/ReplaceKernelArgs.cpp diff --git a/compilation/KernelTranslation/CMakeLists.txt b/compilation/KernelTranslation/CMakeLists.txt index 7ec3898..25968d6 100644 --- a/compilation/KernelTranslation/CMakeLists.txt +++ b/compilation/KernelTranslation/CMakeLists.txt @@ -12,10 +12,10 @@ set(LIB_NAME spmd2mpmd) set(CMAKE_CXX_STANDARD 14) set(CMAKE_BUILD_TYPE Debug) -include_directories(./include) +include_directories(./include/x86) -file(GLOB proj_HEADERS "include/*.h") -file(GLOB proj_SOURCES "lib/*.cpp") +file(GLOB proj_HEADERS "include/x86/*.h") +file(GLOB proj_SOURCES "src/x86/*.cpp") # Add core library. add_library(${LIB_NAME} SHARED ${proj_HEADERS} ${proj_SOURCES}) diff --git a/compilation/KernelTranslation/include/generate_x86_format.h b/compilation/KernelTranslation/include/x86/generate_x86_format.h similarity index 100% rename from compilation/KernelTranslation/include/generate_x86_format.h rename to compilation/KernelTranslation/include/x86/generate_x86_format.h diff --git a/compilation/KernelTranslation/include/handle_sync.h b/compilation/KernelTranslation/include/x86/handle_sync.h similarity index 100% rename from compilation/KernelTranslation/include/handle_sync.h rename to compilation/KernelTranslation/include/x86/handle_sync.h diff --git a/compilation/KernelTranslation/include/init.h b/compilation/KernelTranslation/include/x86/init.h similarity index 100% rename from compilation/KernelTranslation/include/init.h rename to compilation/KernelTranslation/include/x86/init.h diff --git a/compilation/KernelTranslation/include/insert_sync.h b/compilation/KernelTranslation/include/x86/insert_sync.h similarity index 100% rename from compilation/KernelTranslation/include/insert_sync.h rename to compilation/KernelTranslation/include/x86/insert_sync.h diff --git a/compilation/KernelTranslation/include/insert_warp_loop.h b/compilation/KernelTranslation/include/x86/insert_warp_loop.h similarity index 100% rename from compilation/KernelTranslation/include/insert_warp_loop.h rename to compilation/KernelTranslation/include/x86/insert_warp_loop.h diff --git a/compilation/KernelTranslation/include/memory_hierarchy.h b/compilation/KernelTranslation/include/x86/memory_hierarchy.h similarity index 100% rename from compilation/KernelTranslation/include/memory_hierarchy.h rename to compilation/KernelTranslation/include/x86/memory_hierarchy.h diff --git a/compilation/KernelTranslation/include/performance.h b/compilation/KernelTranslation/include/x86/performance.h similarity index 100% rename from compilation/KernelTranslation/include/performance.h rename to compilation/KernelTranslation/include/x86/performance.h diff --git a/compilation/KernelTranslation/include/tool.h b/compilation/KernelTranslation/include/x86/tool.h similarity index 100% rename from compilation/KernelTranslation/include/tool.h rename to compilation/KernelTranslation/include/x86/tool.h diff --git a/compilation/KernelTranslation/include/warp_func.h b/compilation/KernelTranslation/include/x86/warp_func.h similarity index 100% rename from compilation/KernelTranslation/include/warp_func.h rename to compilation/KernelTranslation/include/x86/warp_func.h diff --git a/compilation/KernelTranslation/lib/generate_x86_format.cpp b/compilation/KernelTranslation/src/x86/generate_x86_format.cpp similarity index 100% rename from compilation/KernelTranslation/lib/generate_x86_format.cpp rename to compilation/KernelTranslation/src/x86/generate_x86_format.cpp diff --git a/compilation/KernelTranslation/lib/handle_sync.cpp b/compilation/KernelTranslation/src/x86/handle_sync.cpp similarity index 100% rename from compilation/KernelTranslation/lib/handle_sync.cpp rename to compilation/KernelTranslation/src/x86/handle_sync.cpp diff --git a/compilation/KernelTranslation/lib/init.cpp b/compilation/KernelTranslation/src/x86/init.cpp similarity index 100% rename from compilation/KernelTranslation/lib/init.cpp rename to compilation/KernelTranslation/src/x86/init.cpp diff --git a/compilation/KernelTranslation/lib/insert_sync.cpp b/compilation/KernelTranslation/src/x86/insert_sync.cpp similarity index 100% rename from compilation/KernelTranslation/lib/insert_sync.cpp rename to compilation/KernelTranslation/src/x86/insert_sync.cpp diff --git a/compilation/KernelTranslation/lib/insert_warp_loop.cpp b/compilation/KernelTranslation/src/x86/insert_warp_loop.cpp similarity index 100% rename from compilation/KernelTranslation/lib/insert_warp_loop.cpp rename to compilation/KernelTranslation/src/x86/insert_warp_loop.cpp diff --git a/compilation/KernelTranslation/lib/memory_hierarchy.cpp b/compilation/KernelTranslation/src/x86/memory_hierarchy.cpp similarity index 100% rename from compilation/KernelTranslation/lib/memory_hierarchy.cpp rename to compilation/KernelTranslation/src/x86/memory_hierarchy.cpp diff --git a/compilation/KernelTranslation/lib/performance.cpp b/compilation/KernelTranslation/src/x86/performance.cpp similarity index 100% rename from compilation/KernelTranslation/lib/performance.cpp rename to compilation/KernelTranslation/src/x86/performance.cpp diff --git a/compilation/KernelTranslation/lib/tool.cpp b/compilation/KernelTranslation/src/x86/tool.cpp similarity index 100% rename from compilation/KernelTranslation/lib/tool.cpp rename to compilation/KernelTranslation/src/x86/tool.cpp diff --git a/compilation/KernelTranslation/lib/warp_func.cpp b/compilation/KernelTranslation/src/x86/warp_func.cpp similarity index 100% rename from compilation/KernelTranslation/lib/warp_func.cpp rename to compilation/KernelTranslation/src/x86/warp_func.cpp diff --git a/runtime/CMakeLists.txt b/runtime/CMakeLists.txt index 31591ae..6385824 100644 --- a/runtime/CMakeLists.txt +++ b/runtime/CMakeLists.txt @@ -10,7 +10,7 @@ set(CMAKE_VERBOSE_MAKEFILE ON) add_subdirectory(threadPool) # compile x86 runtime library -include_directories(./include) -include_directories(./threadPool/include) -file(GLOB proj_SOURCES "lib/*.cpp") +include_directories(./include/x86) +include_directories(./threadPool/include/x86) +file(GLOB proj_SOURCES "src/vortex/*.cpp") add_library(${LIB_NAME} SHARED ${proj_SOURCES}) diff --git a/runtime/include/cudaKernelImpl.h b/runtime/include/x86/cudaKernelImpl.h similarity index 100% rename from runtime/include/cudaKernelImpl.h rename to runtime/include/x86/cudaKernelImpl.h diff --git a/runtime/include/cudaRuntimeImpl.h b/runtime/include/x86/cudaRuntimeImpl.h similarity index 100% rename from runtime/include/cudaRuntimeImpl.h rename to runtime/include/x86/cudaRuntimeImpl.h diff --git a/runtime/include/cudaStatus.h b/runtime/include/x86/cudaStatus.h similarity index 100% rename from runtime/include/cudaStatus.h rename to runtime/include/x86/cudaStatus.h diff --git a/runtime/lib/cudaKernelImpl.cpp b/runtime/src/vortex/cudaKernelImpl.cpp similarity index 100% rename from runtime/lib/cudaKernelImpl.cpp rename to runtime/src/vortex/cudaKernelImpl.cpp diff --git a/runtime/lib/cudaRuntimeImpl.cpp b/runtime/src/vortex/cudaRuntimeImpl.cpp similarity index 100% rename from runtime/lib/cudaRuntimeImpl.cpp rename to runtime/src/vortex/cudaRuntimeImpl.cpp diff --git a/runtime/src/x86/cudaKernelImpl.cpp b/runtime/src/x86/cudaKernelImpl.cpp new file mode 100644 index 0000000..56803c4 --- /dev/null +++ b/runtime/src/x86/cudaKernelImpl.cpp @@ -0,0 +1,19 @@ +#include "cudaKernelImpl.h" +#include +double __nv_exp(double base) { return exp(base); } +double __nv_sqrt(double v) { return sqrt(v); } +float __nv_sqrtf(float v) { return sqrt(v); } +float __nv_powif(float base, int exp) { return pow(base, exp); } +float __nv_logf(float v) { return logf(v); } +float __nv_expf(float v) { return expf(v); } +float __nv_log10f(float v) { return log10f(v); } +float __nv_fast_log2f(float v) { return log2f(v); } +double __nv_powi(double base, int exp) { return pow(base, exp); } +float __nv_powf(float base, float exp) { return pow(base, exp); } +float __nv_fast_powf(float base, float exp) { return pow(base, exp); } +float __nv_fmodf(float x, float y) { return fmod(x, y); } +int __nv_isnanf(float v) { return isnan(v); } +int __nv_isinff(float v) { return isinf(v); } +float __nv_fabsf(float v) { return abs(v); } +int __nvvm_mul24_i(int a, int b) { return a * b; } +double _ZL3expd(double base) { return exp(base); } diff --git a/runtime/src/x86/cudaRuntimeImpl.cpp b/runtime/src/x86/cudaRuntimeImpl.cpp new file mode 100644 index 0000000..d15dae1 --- /dev/null +++ b/runtime/src/x86/cudaRuntimeImpl.cpp @@ -0,0 +1,245 @@ +#include "cudaRuntimeImpl.h" +#include "api.h" +#include "cuda_runtime.h" +#include "def.h" +#include "macros.h" +#include "structures.h" +#include +#include +#include +#include +#include +cudaError_t cudaGetDevice(int *devPtr) { *devPtr = 0; } +const char *cudaGetErrorName(cudaError_t error) { return "SUCCESS\n"; } +cudaError_t cudaDeviceReset(void) { scheduler_uninit(); } +cudaError_t cudaDeviceSynchronize(void) { cuSynchronizeBarrier(); } +cudaError_t cudaThreadSynchronize(void) { cuSynchronizeBarrier(); } +cudaError_t cudaFree(void *devPtr) { free(devPtr); } +cudaError_t cudaFreeHost(void *devPtr) { free(devPtr); } + +cudaError_t cudaLaunchKernel(const void *func, dim3 gridDim, dim3 blockDim, + void **args, size_t sharedMem, + cudaStream_t stream) { + // if scheduler is null init device + // printf( + // "cudaLaunchKernel : Grid: x:%d y:%d z:%d Block: %d, %d, %d ShMem:%lu\n + // ", gridDim.x, gridDim.y, gridDim.z, blockDim.x, blockDim.y, blockDim.z, + // sharedMem); + + cu_kernel *ker = + create_kernel(func, gridDim, blockDim, args, sharedMem, stream); + + int lstatus = cuLaunchKernel(&ker); + + // std::cout << "ret cudaLKernel" << std::endl; +} +cudaError_t cudaMalloc(void **devPtr, size_t size) { + *devPtr = malloc(size); + if (devPtr == NULL) + return cudaErrorMemoryAllocation; + return cudaSuccess; +} +cudaError_t cudaMemset(void *devPtr, int value, size_t count) { + memset(devPtr, value, count); + return cudaSuccess; +} +cudaError_t cudaMemcpy(void *dst, const void *src, size_t count, + cudaMemcpyKind kind) { + if (kind == cudaMemcpyHostToHost) { + memcpy(dst, src, count); + } else if (kind == cudaMemcpyDeviceToHost) { + // how does the code know which device accessing the memory + memcpy(dst, src, count); + } else if (kind == cudaMemcpyHostToDevice) { + // how does the code know which device accessing the memory + memcpy(dst, src, count); + } else if (kind == cudaMemcpyDeviceToHost) { + // how does the code know which device accessing the memory + memcpy(dst, src, count); + } else if (kind == cudaMemcpyDeviceToDevice) { + + memcpy(dst, dst, count); + } else if (kind == cudaMemcpyDefault) { + memcpy(dst, src, count); + } + return cudaSuccess; +} + +cudaError_t cudaMemcpyToSymbol_host(void *dst, const void *src, size_t count, + size_t offset, cudaMemcpyKind kind) { + assert(offset == 0 && "DO not support offset !=0\n"); + memcpy(dst, src + offset, count); + return cudaSuccess; +} + +cudaError_t cudaSetDevice(int device) { + // error checking + // std::cout << "cudaSetDevice Called" << std::endl; + init_device(); + // std::cout << "cudaSetDevice Ret" << std::endl; +} + +cudaError_t cudaStreamCopyAttributes(cudaStream_t dst, cudaStream_t src) { + cstreamData *dst_stream = (cstreamData *)dst; + cstreamData *src_stream = (cstreamData *)src; + + if (dst_stream == NULL || src_stream == NULL) { + return cudaErrorInvalidValue; // 1 + } + + dst_stream->stream_priority = src_stream->stream_priority; + dst_stream->stream_flags = src_stream->stream_flags; + + return cudaSuccess; // 0 +} + +static int stream_counter = 1; +/* + cudaStream_t is a Opaque Structure + + Overwrites cudaStream_t into custom cstreamData structure + (does hardware uses the cudaStream_t stream) + +*/ +cudaError_t cudaStreamCreate(cudaStream_t *pStream) { + cstreamData *s = (cstreamData *)calloc(1, sizeof(cstreamData)); + if (s == NULL) + return cudaErrorMemoryAllocation; + s->ev.status = C_RUN; + s->id = stream_counter; + stream_counter++; + s->stream_priority = DEFAULT; + create_KernelQueue(&(s->kernelQueue)); + + INIT_LOCK(s->stream_lock); + *pStream = (cudaStream_t)(s); + + return cudaSuccess; +} + +cudaError_t cudaStreamDestroy(cudaStream_t stream) { + cstreamData *s = (cstreamData *)(stream); + + free(s->kernelQueue); + + DESTROY_LOCK(s->stream_lock); + + free(s); + + return cudaSuccess; +} + +cudaError_t cudaStreamSynchronize(cudaStream_t stream) { + cstreamData *e = ((cstreamData *)(stream)); + MUTEX_LOCK(e->stream_lock); + + e->ev.status = C_SYNCHRONIZE; + e->ev.numKernelsToWait = e->kernelQueue->waiting_count; + MUTEX_UNLOCK(e->stream_lock); +} + +cudaError_t cudaGetDeviceCount(int *count) { + // dummy value + *count = 1; +} + +cudaError_t cudaGetDeviceProperties(cudaDeviceProp *deviceProp, int device) { + + // dummy values + if (device == 0) { + strcpy(deviceProp->name, "pthread"); + deviceProp->totalGlobalMem = 0; + deviceProp->sharedMemPerBlock = 0; + deviceProp->regsPerBlock = 0; + deviceProp->warpSize = 0; + deviceProp->memPitch = 0; + deviceProp->maxThreadsPerBlock = 0; + deviceProp->maxThreadsDim[0] = 1; + deviceProp->maxThreadsDim[1] = 1; + deviceProp->maxThreadsDim[2] = 1; + + deviceProp->maxGridSize[0] = 1; + deviceProp->maxGridSize[1] = 1; + deviceProp->maxGridSize[2] = 1; + + deviceProp->totalConstMem = 0; + deviceProp->major = 0; + deviceProp->minor = 0; + deviceProp->clockRate = 0; + deviceProp->textureAlignment = 0; + deviceProp->deviceOverlap = false; + deviceProp->multiProcessorCount = 0; + } + return cudaSuccess; +} + +static cudaError_t lastError = cudaSuccess; +const char *cudaGetErrorString(cudaError_t error) { + if (error == cudaSuccess) { + return "Cuda Get Error Success"; + } +} + +cudaError_t cudaGetLastError(void) { return lastError; } + +static callParams callParamTemp; + +/* + Internal Cuda Library Functions +*/ +extern "C" { + +extern cudaError_t CUDARTAPI __cudaPopCallConfiguration(dim3 *gridDim, + dim3 *blockDim, + size_t *sharedMem, + void **stream) { + // printf("__cudaPopCallConfiguration: Grid: x:%d y:%d z:%d Block: %d, %d, %d + // ShMem: %lu\n", + // gridDim->x, gridDim->y, gridDim->z, blockDim->x, blockDim->y, blockDim->z, + // *sharedMem); + + *gridDim = callParamTemp.gridDim; + *blockDim = callParamTemp.blockDim; + *sharedMem = callParamTemp.shareMem; + *stream = callParamTemp.stream; + + // printf("__cudaPopCallConfiguration After : Grid: x:%d y:%d z:%d Block: %d, + // %d, %d ShMem: %lu\n", gridDim->x, gridDim->y, gridDim->z, blockDim->x, + // blockDim->y, blockDim->z, *sharedMem); + + // exit(1); + + return cudaSuccess; +} + +extern __host__ __device__ unsigned CUDARTAPI __cudaPushCallConfiguration( + dim3 gridDim, dim3 blockDim, size_t sharedMem = 0, void *stream = 0) { + + // printf("__cudaPushCallConfiguration Grid: x:%d y:%d z:%d Block: %d, %d, %d + // " + // "ShMem: %lu\n ", + // gridDim.x, gridDim.y, gridDim.z, blockDim.x, blockDim.y, blockDim.z, + // sharedMem); + + // memory checks allocations + callParamTemp.gridDim = gridDim; + + // std::cout << "assign gridDim" << std::endl; + + callParamTemp.blockDim = blockDim; + // std::cout << "assign blockDim" << std::endl; + callParamTemp.shareMem = sharedMem; + // std::cout << "assign shareMem" << std::endl; + (callParamTemp.stream) = stream; + + // printf("__cudaPushCallConfiguration After Grid: x:%d y:%d z:%d Block: %d, + // %d, %d ShMem: %lu\n", + // gridDim.x, gridDim.y, gridDim.z, blockDim.x, blockDim.y, blockDim.z, + // sharedMem); + + // return 0 continues the Pop + return cudaSuccess; + + // return ne 0 skips the Pop +} +} diff --git a/runtime/threadPool/CMakeLists.txt b/runtime/threadPool/CMakeLists.txt index ed599bf..2ad0b79 100644 --- a/runtime/threadPool/CMakeLists.txt +++ b/runtime/threadPool/CMakeLists.txt @@ -11,7 +11,7 @@ set(LIB_NAME threadPool) set(CMAKE_CXX_STANDARD 14) set(CMAKE_BUILD_TYPE Debug) -include_directories(./include) +include_directories(./include/x86) -file(GLOB proj_SOURCES "lib/*.cpp") +file(GLOB proj_SOURCES "src/vortex/*.cpp") add_library(${LIB_NAME} SHARED ${proj_SOURCES}) diff --git a/runtime/threadPool/include/api.h b/runtime/threadPool/include/x86/api.h similarity index 100% rename from runtime/threadPool/include/api.h rename to runtime/threadPool/include/x86/api.h diff --git a/runtime/threadPool/include/def.h b/runtime/threadPool/include/x86/def.h similarity index 100% rename from runtime/threadPool/include/def.h rename to runtime/threadPool/include/x86/def.h diff --git a/runtime/threadPool/include/macros.h b/runtime/threadPool/include/x86/macros.h similarity index 100% rename from runtime/threadPool/include/macros.h rename to runtime/threadPool/include/x86/macros.h diff --git a/runtime/threadPool/include/structures.h b/runtime/threadPool/include/x86/structures.h similarity index 100% rename from runtime/threadPool/include/structures.h rename to runtime/threadPool/include/x86/structures.h diff --git a/runtime/threadPool/lib/api.cpp b/runtime/threadPool/src/x86/api.cpp similarity index 100% rename from runtime/threadPool/lib/api.cpp rename to runtime/threadPool/src/x86/api.cpp