update runtime and threadPool with debug tools

Ruobing Han 2022-09-15 18:43:14 -04:00
parent f2a4f7fe64
commit 3875e179b4
9 changed files with 27 additions and 368 deletions

View File

@@ -10,6 +10,7 @@ set(CMAKE_VERBOSE_MAKEFILE ON)
 add_subdirectory(threadPool)
 # compile x86 runtime library
+include_directories(../common)
 include_directories(./include/)
 include_directories(./include/x86)
 include_directories(./threadPool/include/)
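The new ../common include path is what lets the runtime and threadPool sources below pull in debug.hpp. That header itself is not part of this diff; a minimal sketch of a DEBUG_INFO macro consistent with its printf-style call sites (hypothetical — only the usage is known from this commit):

// Hypothetical sketch of common/debug.hpp; the real header is not shown
// in this commit, only its printf-style usage in the hunks below.
#ifndef __DEBUG_HPP__
#define __DEBUG_HPP__
#include <cstdio>
#ifdef DEBUG
// forward a printf-style format string and its arguments to stderr
#define DEBUG_INFO(...) fprintf(stderr, __VA_ARGS__)
#else
#define DEBUG_INFO(...) ((void)0)
#endif
#endif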

View File

@@ -1,6 +1,5 @@
 #ifndef __RUNTIME_IMPL__
 #define __KERNEL_IMPL__
-#include "cudaStatus.h"
 #include "structures.h"
 #include <stdint.h>
 extern "C" {

View File

@@ -1,6 +1,5 @@
 #ifndef __RUNTIME_IMPL__
 #define __RUNTIME_IMPL__
-#include "cudaStatus.h"
 #include "structures.h"
 #include <stdint.h>
 extern "C" {

View File

@@ -1,18 +0,0 @@
-#ifndef __RUNTIME_STATUS__
-#define __RUNTIME_STATUS__
-#include <stdio.h>
-// enum cudaError_t {
-//   CudaSuccess = 0,
-//   CudaErrorInvalidValue = 1,
-//   CudaErrorInvalidMemoryAllocation = 2,
-// };
-// enum cudaMemcpyKind {
-//   cudaMemcpyHostToHost = 0,
-//   cudaMemcpyHostToDevice = 1,
-//   cudaMemcpyDeviceToHost = 2,
-//   cudaMemcpyDeviceToDevice = 3,
-//   cudaMemcpyDefault = 4,
-// };
-#endif
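The deleted header carried only commented-out duplicates of enums the runtime already takes from its cuda_runtime.h replacement (the commented copies even used non-standard casing such as CudaSuccess). For reference, the standard CUDA values the implementation relies on elsewhere — cudaErrorMemoryAllocation in cudaMalloc, the cudaMemcpyKind cases in cudaMemcpy — are, as defined by the CUDA runtime API:

// Standard CUDA enum values assumed by the runtime implementation.
enum cudaError_t {
  cudaSuccess = 0,
  cudaErrorInvalidValue = 1,
  cudaErrorMemoryAllocation = 2,
};
enum cudaMemcpyKind {
  cudaMemcpyHostToHost = 0,
  cudaMemcpyHostToDevice = 1,
  cudaMemcpyDeviceToHost = 2,
  cudaMemcpyDeviceToDevice = 3,
  cudaMemcpyDefault = 4,
};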

View File

@@ -1,19 +0,0 @@
-#include "cudaKernelImpl.h"
-#include <math.h>
-double __nv_exp(double base) { return exp(base); }
-double __nv_sqrt(double v) { return sqrt(v); }
-float __nv_sqrtf(float v) { return sqrt(v); }
-float __nv_powif(float base, int exp) { return pow(base, exp); }
-float __nv_logf(float v) { return logf(v); }
-float __nv_expf(float v) { return expf(v); }
-float __nv_log10f(float v) { return log10f(v); }
-float __nv_fast_log2f(float v) { return log2f(v); }
-double __nv_powi(double base, int exp) { return pow(base, exp); }
-float __nv_powf(float base, float exp) { return pow(base, exp); }
-float __nv_fast_powf(float base, float exp) { return pow(base, exp); }
-float __nv_fmodf(float x, float y) { return fmod(x, y); }
-int __nv_isnanf(float v) { return isnan(v); }
-int __nv_isinff(float v) { return isinf(v); }
-float __nv_fabsf(float v) { return abs(v); }
-int __nvvm_mul24_i(int a, int b) { return a * b; }
-double _ZL3expd(double base) { return exp(base); }
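These deleted wrappers routed NVIDIA libdevice intrinsics to libm, but several float variants silently went through the double routines (__nv_sqrtf, __nv_powf, __nv_fmodf), and __nv_fabsf called abs, which can resolve to the integer overload and truncate. If such wrappers were ever reinstated, a float-precise sketch (assuming the same intrinsic names) would look like:

// Sketch of float-precise wrappers; the deleted versions promoted through
// double or risked the integer abs() overload.
#include <math.h>
extern "C" {
float __nv_sqrtf(float v) { return sqrtf(v); }
float __nv_powf(float base, float exp) { return powf(base, exp); }
float __nv_fmodf(float x, float y) { return fmodf(x, y); }
float __nv_fabsf(float v) { return fabsf(v); }
}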

View File

@@ -1,251 +0,0 @@
-#include "cudaRuntimeImpl.h"
-#include "api.h"
-#include "cuda_runtime.h"
-#include "def.h"
-#include "macros.h"
-#include "structures.h"
-#include <iostream>
-#include <stdint.h>
-#include <stdio.h>
-#include <stdlib.h>
-#include <string.h>
-cudaError_t cudaGetDevice(int *devPtr) { *devPtr = 0; }
-const char *cudaGetErrorName(cudaError_t error) { return "SUCCESS\n"; }
-cudaError_t cudaDeviceReset(void) { scheduler_uninit(); }
-cudaError_t cudaDeviceSynchronize(void) { cuSynchronizeBarrier(); }
-cudaError_t cudaThreadSynchronize(void) { cuSynchronizeBarrier(); }
-cudaError_t cudaFree(void *devPtr) { free(devPtr); }
-cudaError_t cudaFreeHost(void *devPtr) { free(devPtr); }
-cudaError_t cudaLaunchKernel(const void *func, dim3 gridDim, dim3 blockDim,
-                             void **args, size_t sharedMem,
-                             cudaStream_t stream) {
-  // if scheduler is null init device
-  // printf(
-  //     "cudaLaunchKernel : Grid: x:%d y:%d z:%d Block: %d, %d, %d ShMem:%lu\n
-  //     ", gridDim.x, gridDim.y, gridDim.z, blockDim.x, blockDim.y, blockDim.z,
-  //     sharedMem);
-  cu_kernel *ker =
-      create_kernel(func, gridDim, blockDim, args, sharedMem, stream);
-  int lstatus = cuLaunchKernel(&ker);
-  // std::cout << "ret cudaLKernel" << std::endl;
-}
-cudaError_t cudaMalloc(void **devPtr, size_t size) {
-  *devPtr = malloc(size);
-  if (devPtr == NULL)
-    return cudaErrorMemoryAllocation;
-  return cudaSuccess;
-}
-cudaError_t cudaMallocHost(void **devPtr, size_t size) {
-  *devPtr = malloc(size);
-  if (devPtr == NULL)
-    return cudaErrorMemoryAllocation;
-  return cudaSuccess;
-}
-cudaError_t cudaMemset(void *devPtr, int value, size_t count) {
-  memset(devPtr, value, count);
-  return cudaSuccess;
-}
-cudaError_t cudaMemcpy(void *dst, const void *src, size_t count,
-                       cudaMemcpyKind kind) {
-  if (kind == cudaMemcpyHostToHost) {
-    memcpy(dst, src, count);
-  } else if (kind == cudaMemcpyDeviceToHost) {
-    // how does the code know which device accessing the memory
-    memcpy(dst, src, count);
-  } else if (kind == cudaMemcpyHostToDevice) {
-    // how does the code know which device accessing the memory
-    memcpy(dst, src, count);
-  } else if (kind == cudaMemcpyDeviceToHost) {
-    // how does the code know which device accessing the memory
-    memcpy(dst, src, count);
-  } else if (kind == cudaMemcpyDeviceToDevice) {
-    memcpy(dst, src, count);
-  } else if (kind == cudaMemcpyDefault) {
-    memcpy(dst, src, count);
-  }
-  return cudaSuccess;
-}
-cudaError_t cudaMemcpyToSymbol_host(void *dst, const void *src, size_t count,
-                                    size_t offset, cudaMemcpyKind kind) {
-  assert(offset == 0 && "DO not support offset !=0\n");
-  memcpy(dst, src + offset, count);
-  return cudaSuccess;
-}
-cudaError_t cudaSetDevice(int device) {
-  // error checking
-  // std::cout << "cudaSetDevice Called" << std::endl;
-  init_device();
-  // std::cout << "cudaSetDevice Ret" << std::endl;
-}
-cudaError_t cudaStreamCopyAttributes(cudaStream_t dst, cudaStream_t src) {
-  cstreamData *dst_stream = (cstreamData *)dst;
-  cstreamData *src_stream = (cstreamData *)src;
-  if (dst_stream == NULL || src_stream == NULL) {
-    return cudaErrorInvalidValue; // 1
-  }
-  dst_stream->stream_priority = src_stream->stream_priority;
-  dst_stream->stream_flags = src_stream->stream_flags;
-  return cudaSuccess; // 0
-}
-static int stream_counter = 1;
-/*
-  cudaStream_t is a Opaque Structure
-  Overwrites cudaStream_t into custom cstreamData structure
-  (does hardware uses the cudaStream_t stream)
-*/
-cudaError_t cudaStreamCreate(cudaStream_t *pStream) {
-  cstreamData *s = (cstreamData *)calloc(1, sizeof(cstreamData));
-  if (s == NULL)
-    return cudaErrorMemoryAllocation;
-  s->ev.status = C_RUN;
-  s->id = stream_counter;
-  stream_counter++;
-  s->stream_priority = DEFAULT;
-  create_KernelQueue(&(s->kernelQueue));
-  INIT_LOCK(s->stream_lock);
-  *pStream = (cudaStream_t)(s);
-  return cudaSuccess;
-}
-cudaError_t cudaStreamDestroy(cudaStream_t stream) {
-  cstreamData *s = (cstreamData *)(stream);
-  free(s->kernelQueue);
-  DESTROY_LOCK(s->stream_lock);
-  free(s);
-  return cudaSuccess;
-}
-cudaError_t cudaStreamSynchronize(cudaStream_t stream) {
-  cstreamData *e = ((cstreamData *)(stream));
-  MUTEX_LOCK(e->stream_lock);
-  e->ev.status = C_SYNCHRONIZE;
-  e->ev.numKernelsToWait = e->kernelQueue->waiting_count;
-  MUTEX_UNLOCK(e->stream_lock);
-}
-cudaError_t cudaGetDeviceCount(int *count) {
-  // dummy value
-  *count = 1;
-}
-cudaError_t cudaGetDeviceProperties(cudaDeviceProp *deviceProp, int device) {
-  // dummy values
-  if (device == 0) {
-    strcpy(deviceProp->name, "pthread");
-    deviceProp->totalGlobalMem = 0;
-    deviceProp->sharedMemPerBlock = 0;
-    deviceProp->regsPerBlock = 0;
-    deviceProp->warpSize = 0;
-    deviceProp->memPitch = 0;
-    deviceProp->maxThreadsPerBlock = 0;
-    deviceProp->maxThreadsDim[0] = 1;
-    deviceProp->maxThreadsDim[1] = 1;
-    deviceProp->maxThreadsDim[2] = 1;
-    deviceProp->maxGridSize[0] = 1;
-    deviceProp->maxGridSize[1] = 1;
-    deviceProp->maxGridSize[2] = 1;
-    deviceProp->totalConstMem = 0;
-    deviceProp->major = 0;
-    deviceProp->minor = 0;
-    deviceProp->clockRate = 0;
-    deviceProp->textureAlignment = 0;
-    deviceProp->deviceOverlap = false;
-    deviceProp->multiProcessorCount = 0;
-  }
-  return cudaSuccess;
-}
-static cudaError_t lastError = cudaSuccess;
-const char *cudaGetErrorString(cudaError_t error) {
-  if (error == cudaSuccess) {
-    return "Cuda Get Error Success";
-  }
-}
-cudaError_t cudaGetLastError(void) { return lastError; }
-static callParams callParamTemp;
-/*
-  Internal Cuda Library Functions
-*/
-extern "C" {
-extern cudaError_t CUDARTAPI __cudaPopCallConfiguration(dim3 *gridDim,
-                                                        dim3 *blockDim,
-                                                        size_t *sharedMem,
-                                                        void **stream) {
-  // printf("__cudaPopCallConfiguration: Grid: x:%d y:%d z:%d Block: %d, %d, %d
-  // ShMem: %lu\n",
-  // gridDim->x, gridDim->y, gridDim->z, blockDim->x, blockDim->y, blockDim->z,
-  // *sharedMem);
-  *gridDim = callParamTemp.gridDim;
-  *blockDim = callParamTemp.blockDim;
-  *sharedMem = callParamTemp.shareMem;
-  *stream = callParamTemp.stream;
-  // printf("__cudaPopCallConfiguration After : Grid: x:%d y:%d z:%d Block: %d,
-  // %d, %d ShMem: %lu\n", gridDim->x, gridDim->y, gridDim->z, blockDim->x,
-  // blockDim->y, blockDim->z, *sharedMem);
-  // exit(1);
-  return cudaSuccess;
-}
-extern __host__ __device__ unsigned CUDARTAPI __cudaPushCallConfiguration(
-    dim3 gridDim, dim3 blockDim, size_t sharedMem = 0, void *stream = 0) {
-  // printf("__cudaPushCallConfiguration Grid: x:%d y:%d z:%d Block: %d, %d, %d
-  // "
-  // "ShMem: %lu\n ",
-  // gridDim.x, gridDim.y, gridDim.z, blockDim.x, blockDim.y, blockDim.z,
-  // sharedMem);
-  // memory checks allocations
-  callParamTemp.gridDim = gridDim;
-  // std::cout << "assign gridDim" << std::endl;
-  callParamTemp.blockDim = blockDim;
-  // std::cout << "assign blockDim" << std::endl;
-  callParamTemp.shareMem = sharedMem;
-  // std::cout << "assign shareMem" << std::endl;
-  (callParamTemp.stream) = stream;
-  // printf("__cudaPushCallConfiguration After Grid: x:%d y:%d z:%d Block: %d,
-  // %d, %d ShMem: %lu\n",
-  // gridDim.x, gridDim.y, gridDim.z, blockDim.x, blockDim.y, blockDim.z,
-  // sharedMem);
-  // return 0 continues the Pop
-  return cudaSuccess;
-  // return ne 0 skips the Pop
-}
-}
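This 251-line file appears to be an older duplicate of the x86 runtime: the commit deletes the copy and instruments the surviving cudaRuntimeImpl.cpp (next file) instead. Two quirks are visible in the listings: the cudaMemcpyDeviceToHost branch appears twice, and the surviving file's cudaMemcpyDeviceToDevice branch copies dst onto itself (memcpy(dst, dst, count)), where src is presumably intended. Since "device" memory in this backend is ordinary malloc() memory in the host address space, every copy kind reduces to one memcpy; a compact sketch (illustration only, assuming the project's cuda_runtime.h types — the repository keeps the explicit if/else chain):

// Compact equivalent of cudaMemcpy for a host-only backend where device
// pointers are plain malloc() memory: direction is irrelevant in a single
// address space, so every kind is the same host memcpy.
#include <cstring>
cudaError_t cudaMemcpy(void *dst, const void *src, size_t count,
                       cudaMemcpyKind kind) {
  if (dst == NULL || src == NULL)
    return cudaErrorInvalidValue;
  memcpy(dst, src, count);
  return cudaSuccess;
}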

View File

@@ -1,6 +1,7 @@
 #include "cudaRuntimeImpl.h"
 #include "api.h"
 #include "cuda_runtime.h"
+#include "debug.hpp"
 #include "def.h"
 #include "macros.h"
 #include "structures.h"
@@ -38,11 +39,10 @@ cudaError_t cudaFreeHost(void *devPtr) {
 cudaError_t cudaLaunchKernel(const void *func, dim3 gridDim, dim3 blockDim,
                              void **args, size_t sharedMem,
                              cudaStream_t stream) {
-  // if scheduler is null init device
-  // printf(
-  //     "cudaLaunchKernel : Grid: x:%d y:%d z:%d Block: %d, %d, %d ShMem:%lu\n
-  //     ", gridDim.x, gridDim.y, gridDim.z, blockDim.x, blockDim.y, blockDim.z,
-  //     sharedMem);
+  DEBUG_INFO(
+      "cudaLaunchKernel : Grid: x:%d y:%d z:%d Block: %d, %d, %d ShMem: %d\n",
+      gridDim.x, gridDim.y, gridDim.z, blockDim.x, blockDim.y, blockDim.z,
+      sharedMem);
   cu_kernel *ker =
       create_kernel(func, gridDim, blockDim, args, sharedMem, stream);
@@ -66,16 +66,12 @@ cudaError_t cudaMemcpy(void *dst, const void *src, size_t count,
   if (kind == cudaMemcpyHostToHost) {
     memcpy(dst, src, count);
   } else if (kind == cudaMemcpyDeviceToHost) {
-    // how does the code know which device accessing the memory
     memcpy(dst, src, count);
   } else if (kind == cudaMemcpyHostToDevice) {
-    // how does the code know which device accessing the memory
     memcpy(dst, src, count);
   } else if (kind == cudaMemcpyDeviceToHost) {
-    // how does the code know which device accessing the memory
     memcpy(dst, src, count);
   } else if (kind == cudaMemcpyDeviceToDevice) {
     memcpy(dst, dst, count);
   } else if (kind == cudaMemcpyDefault) {
     memcpy(dst, src, count);
@@ -100,36 +96,31 @@ cudaError_t cudaStreamCopyAttributes(cudaStream_t dst, cudaStream_t src) {
   cstreamData *src_stream = (cstreamData *)src;
   if (dst_stream == NULL || src_stream == NULL) {
-    return cudaErrorInvalidValue; // 1
+    return cudaErrorInvalidValue;
   }
   dst_stream->stream_priority = src_stream->stream_priority;
   dst_stream->stream_flags = src_stream->stream_flags;
-  return cudaSuccess; // 0
+  return cudaSuccess;
 }
 static int stream_counter = 1;
 /*
   cudaStream_t is a Opaque Structure
   Overwrites cudaStream_t into custom cstreamData structure
   (does hardware uses the cudaStream_t stream)
 */
 cudaError_t cudaStreamCreate(cudaStream_t *pStream) {
-  printf("cudaStreamCreate no Implement\n");
-  exit(1);
+  assert(0 && "cudaStreamCreate no Implement\n");
 }
 cudaError_t cudaStreamDestroy(cudaStream_t stream) {
-  printf("cudaStreamDestroy No Implement\n");
-  exit(1);
+  assert(0 && "cudaStreamDestroy No Implement\n");
 }
 cudaError_t cudaStreamSynchronize(cudaStream_t stream) {
-  printf("cudaStreamSynchronize No Implement\n");
-  exit(1);
+  assert(0 && "cudaStreamSynchronize No Implement\n");
 }
 cudaError_t cudaGetDeviceCount(int *count) {
@@ -139,7 +130,6 @@ cudaError_t cudaGetDeviceCount(int *count) {
 }
 cudaError_t cudaGetDeviceProperties(cudaDeviceProp *deviceProp, int device) {
   // dummy values
   if (device == 0) {
     strcpy(deviceProp->name, "pthread");
@@ -188,53 +178,31 @@ extern cudaError_t CUDARTAPI __cudaPopCallConfiguration(dim3 *gridDim,
                                                         dim3 *blockDim,
                                                         size_t *sharedMem,
                                                         void **stream) {
-  // printf("__cudaPopCallConfiguration: Grid: x:%d y:%d z:%d Block: %d, %d, %d
-  // ShMem: %lu\n",
-  // gridDim->x, gridDim->y, gridDim->z, blockDim->x, blockDim->y, blockDim->z,
-  // *sharedMem);
+  DEBUG_INFO("__cudaPopCallConfiguration: Grid: x:%d y:%d z:%d Block: %d, %d, "
+             "%d ShMem: %lu\n",
+             gridDim->x, gridDim->y, gridDim->z, blockDim->x, blockDim->y,
+             blockDim->z, *sharedMem);
   *gridDim = callParamTemp.gridDim;
   *blockDim = callParamTemp.blockDim;
   *sharedMem = callParamTemp.shareMem;
   *stream = callParamTemp.stream;
-  // printf("__cudaPopCallConfiguration After : Grid: x:%d y:%d z:%d Block: %d,
-  // %d, %d ShMem: %lu\n", gridDim->x, gridDim->y, gridDim->z, blockDim->x,
-  // blockDim->y, blockDim->z, *sharedMem);
-  // exit(1);
   return cudaSuccess;
 }
 extern __host__ __device__ unsigned CUDARTAPI __cudaPushCallConfiguration(
     dim3 gridDim, dim3 blockDim, size_t sharedMem = 0, void *stream = 0) {
-  // printf("__cudaPushCallConfiguration Grid: x:%d y:%d z:%d Block: %d, %d, %d
-  // "
-  // "ShMem: %lu\n ",
-  // gridDim.x, gridDim.y, gridDim.z, blockDim.x, blockDim.y, blockDim.z,
-  // sharedMem);
+  DEBUG_INFO("__cudaPushCallConfiguration: Grid: x:%d y:%d z:%d Block: %d, %d, "
+             "%d ShMem: %lu\n",
+             gridDim.x, gridDim.y, gridDim.z, blockDim.x, blockDim.y,
+             blockDim.z, sharedMem);
   // memory checks allocations
   callParamTemp.gridDim = gridDim;
-  // std::cout << "assign gridDim" << std::endl;
   callParamTemp.blockDim = blockDim;
-  // std::cout << "assign blockDim" << std::endl;
   callParamTemp.shareMem = sharedMem;
-  // std::cout << "assign shareMem" << std::endl;
   (callParamTemp.stream) = stream;
-  // printf("__cudaPushCallConfiguration After Grid: x:%d y:%d z:%d Block: %d,
-  // %d, %d ShMem: %lu\n",
-  // gridDim.x, gridDim.y, gridDim.z, blockDim.x, blockDim.y, blockDim.z,
-  // sharedMem);
-  // return 0 continues the Pop
   return cudaSuccess;
-  // return ne 0 skips the Pop
 }
 }
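__cudaPushCallConfiguration and __cudaPopCallConfiguration are the hooks the CUDA compiler emits around a <<<...>>> launch: the call site pushes grid/block/shared-memory/stream, then the kernel's generated host stub pops that configuration and forwards it to cudaLaunchKernel. Because the configuration is parked in the single static callParamTemp, this implementation assumes launches are neither nested nor issued concurrently from multiple host threads. A sketch of the stub shape, with hypothetical names (real compiler output differs in detail and relies on the project's cuda_runtime.h declarations):

// Illustrative shape of the code generated for
//   vecAdd<<<grid, block>>>(a, b, c);
void __device_stub_vecAdd(float *a, float *b, float *c) {
  dim3 gridDim, blockDim;
  size_t sharedMem;
  void *stream;
  void *args[] = {&a, &b, &c};
  // retrieve what the call site pushed, then launch
  __cudaPopCallConfiguration(&gridDim, &blockDim, &sharedMem, &stream);
  cudaLaunchKernel((const void *)__device_stub_vecAdd, gridDim, blockDim, args,
                   sharedMem, (cudaStream_t)stream);
}
// emitted at the <<<...>>> call site; a zero return proceeds to the stub:
//   if (__cudaPushCallConfiguration(grid, block) == 0)
//     __device_stub_vecAdd(a, b, c);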

View File

@@ -11,6 +11,7 @@ set(LIB_NAME threadPool)
 set(CMAKE_CXX_STANDARD 14)
 set(CMAKE_BUILD_TYPE Debug)
+include_directories(../../common)
 include_directories(./include)
 include_directories(./include/x86)
 include_directories(../../external/moodycamel)

View File

@@ -1,5 +1,6 @@
 #include "api.h"
 #include "blockingconcurrentqueue.h"
+#include "debug.hpp"
 #include "def.h"
 #include "macros.h"
 #include "structures.h"
@@ -22,8 +23,8 @@ int init_device() {
     return C_ERROR_MEMALLOC;
   device->max_compute_units = std::thread::hardware_concurrency();
-  std::cout << device->max_compute_units
-            << " concurrent threads are supported.\n";
+  DEBUG_INFO("%d concurrent threads are supported.\n",
+             device->max_compute_units);
   device_max_compute_units = device->max_compute_units;
   // initialize scheduler
@@ -46,13 +47,9 @@ cu_kernel *create_kernel(const void *func, dim3 gridDim, dim3 blockDim,
   ker->gridDim = gridDim;
   ker->blockDim = blockDim;
   ker->shared_mem = sharedMem;
   ker->stream = stream;
   ker->totalBlocks = gridDim.x * gridDim.y * gridDim.z;
   ker->blockSize = blockDim.x * blockDim.y * blockDim.z;
   return ker;
 }
@@ -97,9 +94,6 @@ int schedulerEnqueueKernel(cu_kernel *k) {
     scheduler->kernelQueue->enqueue(p);
     TaskToExecute++;
   }
-  // printf("total: %d execute per cpu: %d\n", totalBlocks,
-  //        gpuBlockToExecutePerCpuThread);
   return C_SUCCESS;
 }
@@ -121,8 +115,7 @@ int cuLaunchKernel(cu_kernel **k) {
     }
     schedulerEnqueueKernel(ker);
   } else {
-    printf("MultiStream no implemente\n");
-    exit(1);
+    assert(0 && "MultiStream no implemente\n");
   }
   return 0;
 }
@@ -185,11 +178,9 @@ void *driver_thread(void *p) {
     // exit the routine
     if (is_exit) {
       td->exit = true;
-      // pthread_exit
      pthread_exit(NULL);
    } else {
-      printf("driver thread stop incorrectly\n");
-      exit(1);
+      assert(0 && "driver thread stop incorrectly\n");
    }
  }
@@ -215,29 +206,17 @@ int scheduler_init(cu_device device) {
   return C_SUCCESS;
 }
-void scheduler_uninit() {
-  printf("Scheduler Unitit no Implemente\n");
-  exit(1);
-}
+void scheduler_uninit() { assert(0 && "Scheduler Unitit no Implemente\n"); }
 /*
   Barrier for Kernel Launch
-  During kernel launch, increment the number of work items required to finish
-  Each kernel will point to the same event
-  During Running Command, decrement the event.work_item count
-  when count is 0, all work items for this kernel launch is finish
-  Sense Like Barrier
-  Counting Barrier basically
 */
 void cuSynchronizeBarrier() {
   if (!device_initilized) {
     init_device();
   }
   while (1) {
-    // (TODO): currently, we assume each kernel launch has a
+    // after compilation transformation, each kernel launch has a
     // following sync
     if (scheduler->kernelQueue->size_approx() == 0) {
       int completeBlock = 0;
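Taken together, these hunks show the thread pool's execution model: create_kernel captures a launch, schedulerEnqueueKernel fans the grid's blocks out as tasks on a shared queue (TaskToExecute counting them), driver_thread workers drain the queue, and cuSynchronizeBarrier spins until everything finishes — the counting barrier the trimmed comment described. A self-contained toy reduction of that scheme (names are illustrative, not the repository's API):

// Toy reduction of the scheme above: the blocks of one kernel launch become
// queue items; a counting barrier waits for the count to drain to zero.
#include <atomic>
#include <cstdio>
#include <functional>
#include <mutex>
#include <queue>
#include <thread>

std::queue<std::function<void()>> g_queue;
std::mutex g_m;
std::atomic<int> g_pending{0};

void worker() {
  for (;;) { // runs for the life of the process, like driver_thread
    std::function<void()> task;
    {
      std::lock_guard<std::mutex> lk(g_m);
      if (g_queue.empty())
        continue; // busy-wait, mirroring the while(1) barrier loop
      task = std::move(g_queue.front());
      g_queue.pop();
    }
    task();
    g_pending.fetch_sub(1); // the "counting barrier" decrement
  }
}

void launch(int totalBlocks, std::function<void(int)> blockFn) {
  std::lock_guard<std::mutex> lk(g_m);
  g_pending.fetch_add(totalBlocks); // increment before tasks become visible
  for (int b = 0; b < totalBlocks; ++b)
    g_queue.push([=] { blockFn(b); });
}

void synchronizeBarrier() {
  while (g_pending.load() != 0)
    std::this_thread::yield();
}

int main() {
  unsigned n = std::thread::hardware_concurrency();
  for (unsigned i = 0; i < n; ++i)
    std::thread(worker).detach(); // workers die with the process
  launch(8, [](int b) { printf("block %d\n", b); });
  synchronizeBarrier(); // returns once all 8 block-tasks have completed
  return 0;
}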