update runtime and threadPool with debug tools
This commit is contained in:
parent
f2a4f7fe64
commit
3875e179b4
|
@ -10,6 +10,7 @@ set(CMAKE_VERBOSE_MAKEFILE ON)
|
||||||
add_subdirectory(threadPool)
|
add_subdirectory(threadPool)
|
||||||
|
|
||||||
# compile x86 runtime library
|
# compile x86 runtime library
|
||||||
|
include_directories(../common)
|
||||||
include_directories(./include/)
|
include_directories(./include/)
|
||||||
include_directories(./include/x86)
|
include_directories(./include/x86)
|
||||||
include_directories(./threadPool/include/)
|
include_directories(./threadPool/include/)
|
||||||
|
|
|
@ -1,6 +1,5 @@
|
||||||
#ifndef __RUNTIME_IMPL__
|
#ifndef __RUNTIME_IMPL__
|
||||||
#define __KERNEL_IMPL__
|
#define __KERNEL_IMPL__
|
||||||
#include "cudaStatus.h"
|
|
||||||
#include "structures.h"
|
#include "structures.h"
|
||||||
#include <stdint.h>
|
#include <stdint.h>
|
||||||
extern "C" {
|
extern "C" {
|
||||||
|
|
|
@ -1,6 +1,5 @@
|
||||||
#ifndef __RUNTIME_IMPL__
|
#ifndef __RUNTIME_IMPL__
|
||||||
#define __RUNTIME_IMPL__
|
#define __RUNTIME_IMPL__
|
||||||
#include "cudaStatus.h"
|
|
||||||
#include "structures.h"
|
#include "structures.h"
|
||||||
#include <stdint.h>
|
#include <stdint.h>
|
||||||
extern "C" {
|
extern "C" {
|
||||||
|
|
|
@ -1,18 +0,0 @@
|
||||||
#ifndef __RUNTIME_STATUS__
|
|
||||||
#define __RUNTIME_STATUS__
|
|
||||||
#include <stdio.h>
|
|
||||||
|
|
||||||
// enum cudaError_t {
|
|
||||||
// CudaSuccess = 0,
|
|
||||||
// CudaErrorInvalidValue = 1,
|
|
||||||
// CudaErrorInvalidMemoryAllocation = 2,
|
|
||||||
// };
|
|
||||||
|
|
||||||
// enum cudaMemcpyKind {
|
|
||||||
// cudaMemcpyHostToHost = 0,
|
|
||||||
// cudaMemcpyHostToDevice = 1,
|
|
||||||
// cudaMemcpyDeviceToHost = 2,
|
|
||||||
// cudaMemcpyDeviceToDevice = 3,
|
|
||||||
// cudaMemcpyDefault = 4,
|
|
||||||
// };
|
|
||||||
#endif
|
|
|
@ -1,19 +0,0 @@
|
||||||
#include "cudaKernelImpl.h"
|
|
||||||
#include <math.h>
|
|
||||||
double __nv_exp(double base) { return exp(base); }
|
|
||||||
double __nv_sqrt(double v) { return sqrt(v); }
|
|
||||||
float __nv_sqrtf(float v) { return sqrt(v); }
|
|
||||||
float __nv_powif(float base, int exp) { return pow(base, exp); }
|
|
||||||
float __nv_logf(float v) { return logf(v); }
|
|
||||||
float __nv_expf(float v) { return expf(v); }
|
|
||||||
float __nv_log10f(float v) { return log10f(v); }
|
|
||||||
float __nv_fast_log2f(float v) { return log2f(v); }
|
|
||||||
double __nv_powi(double base, int exp) { return pow(base, exp); }
|
|
||||||
float __nv_powf(float base, float exp) { return pow(base, exp); }
|
|
||||||
float __nv_fast_powf(float base, float exp) { return pow(base, exp); }
|
|
||||||
float __nv_fmodf(float x, float y) { return fmod(x, y); }
|
|
||||||
int __nv_isnanf(float v) { return isnan(v); }
|
|
||||||
int __nv_isinff(float v) { return isinf(v); }
|
|
||||||
float __nv_fabsf(float v) { return abs(v); }
|
|
||||||
int __nvvm_mul24_i(int a, int b) { return a * b; }
|
|
||||||
double _ZL3expd(double base) { return exp(base); }
|
|
|
@ -1,251 +0,0 @@
|
||||||
#include "cudaRuntimeImpl.h"
|
|
||||||
#include "api.h"
|
|
||||||
#include "cuda_runtime.h"
|
|
||||||
#include "def.h"
|
|
||||||
#include "macros.h"
|
|
||||||
#include "structures.h"
|
|
||||||
#include <iostream>
|
|
||||||
#include <stdint.h>
|
|
||||||
#include <stdio.h>
|
|
||||||
#include <stdlib.h>
|
|
||||||
#include <string.h>
|
|
||||||
cudaError_t cudaGetDevice(int *devPtr) { *devPtr = 0; }
|
|
||||||
const char *cudaGetErrorName(cudaError_t error) { return "SUCCESS\n"; }
|
|
||||||
cudaError_t cudaDeviceReset(void) { scheduler_uninit(); }
|
|
||||||
cudaError_t cudaDeviceSynchronize(void) { cuSynchronizeBarrier(); }
|
|
||||||
cudaError_t cudaThreadSynchronize(void) { cuSynchronizeBarrier(); }
|
|
||||||
cudaError_t cudaFree(void *devPtr) { free(devPtr); }
|
|
||||||
cudaError_t cudaFreeHost(void *devPtr) { free(devPtr); }
|
|
||||||
|
|
||||||
cudaError_t cudaLaunchKernel(const void *func, dim3 gridDim, dim3 blockDim,
|
|
||||||
void **args, size_t sharedMem,
|
|
||||||
cudaStream_t stream) {
|
|
||||||
// if scheduler is null init device
|
|
||||||
// printf(
|
|
||||||
// "cudaLaunchKernel : Grid: x:%d y:%d z:%d Block: %d, %d, %d ShMem:%lu\n
|
|
||||||
// ", gridDim.x, gridDim.y, gridDim.z, blockDim.x, blockDim.y, blockDim.z,
|
|
||||||
// sharedMem);
|
|
||||||
|
|
||||||
cu_kernel *ker =
|
|
||||||
create_kernel(func, gridDim, blockDim, args, sharedMem, stream);
|
|
||||||
|
|
||||||
int lstatus = cuLaunchKernel(&ker);
|
|
||||||
|
|
||||||
// std::cout << "ret cudaLKernel" << std::endl;
|
|
||||||
}
|
|
||||||
cudaError_t cudaMalloc(void **devPtr, size_t size) {
|
|
||||||
*devPtr = malloc(size);
|
|
||||||
if (devPtr == NULL)
|
|
||||||
return cudaErrorMemoryAllocation;
|
|
||||||
return cudaSuccess;
|
|
||||||
}
|
|
||||||
cudaError_t cudaMallocHost(void **devPtr, size_t size) {
|
|
||||||
*devPtr = malloc(size);
|
|
||||||
if (devPtr == NULL)
|
|
||||||
return cudaErrorMemoryAllocation;
|
|
||||||
return cudaSuccess;
|
|
||||||
}
|
|
||||||
cudaError_t cudaMemset(void *devPtr, int value, size_t count) {
|
|
||||||
memset(devPtr, value, count);
|
|
||||||
return cudaSuccess;
|
|
||||||
}
|
|
||||||
cudaError_t cudaMemcpy(void *dst, const void *src, size_t count,
|
|
||||||
cudaMemcpyKind kind) {
|
|
||||||
if (kind == cudaMemcpyHostToHost) {
|
|
||||||
memcpy(dst, src, count);
|
|
||||||
} else if (kind == cudaMemcpyDeviceToHost) {
|
|
||||||
// how does the code know which device accessing the memory
|
|
||||||
memcpy(dst, src, count);
|
|
||||||
} else if (kind == cudaMemcpyHostToDevice) {
|
|
||||||
// how does the code know which device accessing the memory
|
|
||||||
memcpy(dst, src, count);
|
|
||||||
} else if (kind == cudaMemcpyDeviceToHost) {
|
|
||||||
// how does the code know which device accessing the memory
|
|
||||||
memcpy(dst, src, count);
|
|
||||||
} else if (kind == cudaMemcpyDeviceToDevice) {
|
|
||||||
|
|
||||||
memcpy(dst, src, count);
|
|
||||||
} else if (kind == cudaMemcpyDefault) {
|
|
||||||
memcpy(dst, src, count);
|
|
||||||
}
|
|
||||||
return cudaSuccess;
|
|
||||||
}
|
|
||||||
|
|
||||||
cudaError_t cudaMemcpyToSymbol_host(void *dst, const void *src, size_t count,
|
|
||||||
size_t offset, cudaMemcpyKind kind) {
|
|
||||||
assert(offset == 0 && "DO not support offset !=0\n");
|
|
||||||
memcpy(dst, src + offset, count);
|
|
||||||
return cudaSuccess;
|
|
||||||
}
|
|
||||||
|
|
||||||
cudaError_t cudaSetDevice(int device) {
|
|
||||||
// error checking
|
|
||||||
// std::cout << "cudaSetDevice Called" << std::endl;
|
|
||||||
init_device();
|
|
||||||
// std::cout << "cudaSetDevice Ret" << std::endl;
|
|
||||||
}
|
|
||||||
|
|
||||||
cudaError_t cudaStreamCopyAttributes(cudaStream_t dst, cudaStream_t src) {
|
|
||||||
cstreamData *dst_stream = (cstreamData *)dst;
|
|
||||||
cstreamData *src_stream = (cstreamData *)src;
|
|
||||||
|
|
||||||
if (dst_stream == NULL || src_stream == NULL) {
|
|
||||||
return cudaErrorInvalidValue; // 1
|
|
||||||
}
|
|
||||||
|
|
||||||
dst_stream->stream_priority = src_stream->stream_priority;
|
|
||||||
dst_stream->stream_flags = src_stream->stream_flags;
|
|
||||||
|
|
||||||
return cudaSuccess; // 0
|
|
||||||
}
|
|
||||||
|
|
||||||
static int stream_counter = 1;
|
|
||||||
/*
|
|
||||||
cudaStream_t is a Opaque Structure
|
|
||||||
|
|
||||||
Overwrites cudaStream_t into custom cstreamData structure
|
|
||||||
(does hardware uses the cudaStream_t stream)
|
|
||||||
|
|
||||||
*/
|
|
||||||
cudaError_t cudaStreamCreate(cudaStream_t *pStream) {
|
|
||||||
cstreamData *s = (cstreamData *)calloc(1, sizeof(cstreamData));
|
|
||||||
if (s == NULL)
|
|
||||||
return cudaErrorMemoryAllocation;
|
|
||||||
s->ev.status = C_RUN;
|
|
||||||
s->id = stream_counter;
|
|
||||||
stream_counter++;
|
|
||||||
s->stream_priority = DEFAULT;
|
|
||||||
create_KernelQueue(&(s->kernelQueue));
|
|
||||||
|
|
||||||
INIT_LOCK(s->stream_lock);
|
|
||||||
*pStream = (cudaStream_t)(s);
|
|
||||||
|
|
||||||
return cudaSuccess;
|
|
||||||
}
|
|
||||||
|
|
||||||
cudaError_t cudaStreamDestroy(cudaStream_t stream) {
|
|
||||||
cstreamData *s = (cstreamData *)(stream);
|
|
||||||
|
|
||||||
free(s->kernelQueue);
|
|
||||||
|
|
||||||
DESTROY_LOCK(s->stream_lock);
|
|
||||||
|
|
||||||
free(s);
|
|
||||||
|
|
||||||
return cudaSuccess;
|
|
||||||
}
|
|
||||||
|
|
||||||
cudaError_t cudaStreamSynchronize(cudaStream_t stream) {
|
|
||||||
cstreamData *e = ((cstreamData *)(stream));
|
|
||||||
MUTEX_LOCK(e->stream_lock);
|
|
||||||
|
|
||||||
e->ev.status = C_SYNCHRONIZE;
|
|
||||||
e->ev.numKernelsToWait = e->kernelQueue->waiting_count;
|
|
||||||
MUTEX_UNLOCK(e->stream_lock);
|
|
||||||
}
|
|
||||||
|
|
||||||
cudaError_t cudaGetDeviceCount(int *count) {
|
|
||||||
// dummy value
|
|
||||||
*count = 1;
|
|
||||||
}
|
|
||||||
|
|
||||||
cudaError_t cudaGetDeviceProperties(cudaDeviceProp *deviceProp, int device) {
|
|
||||||
|
|
||||||
// dummy values
|
|
||||||
if (device == 0) {
|
|
||||||
strcpy(deviceProp->name, "pthread");
|
|
||||||
deviceProp->totalGlobalMem = 0;
|
|
||||||
deviceProp->sharedMemPerBlock = 0;
|
|
||||||
deviceProp->regsPerBlock = 0;
|
|
||||||
deviceProp->warpSize = 0;
|
|
||||||
deviceProp->memPitch = 0;
|
|
||||||
deviceProp->maxThreadsPerBlock = 0;
|
|
||||||
deviceProp->maxThreadsDim[0] = 1;
|
|
||||||
deviceProp->maxThreadsDim[1] = 1;
|
|
||||||
deviceProp->maxThreadsDim[2] = 1;
|
|
||||||
|
|
||||||
deviceProp->maxGridSize[0] = 1;
|
|
||||||
deviceProp->maxGridSize[1] = 1;
|
|
||||||
deviceProp->maxGridSize[2] = 1;
|
|
||||||
|
|
||||||
deviceProp->totalConstMem = 0;
|
|
||||||
deviceProp->major = 0;
|
|
||||||
deviceProp->minor = 0;
|
|
||||||
deviceProp->clockRate = 0;
|
|
||||||
deviceProp->textureAlignment = 0;
|
|
||||||
deviceProp->deviceOverlap = false;
|
|
||||||
deviceProp->multiProcessorCount = 0;
|
|
||||||
}
|
|
||||||
return cudaSuccess;
|
|
||||||
}
|
|
||||||
|
|
||||||
static cudaError_t lastError = cudaSuccess;
|
|
||||||
const char *cudaGetErrorString(cudaError_t error) {
|
|
||||||
if (error == cudaSuccess) {
|
|
||||||
return "Cuda Get Error Success";
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
cudaError_t cudaGetLastError(void) { return lastError; }
|
|
||||||
|
|
||||||
static callParams callParamTemp;
|
|
||||||
|
|
||||||
/*
|
|
||||||
Internal Cuda Library Functions
|
|
||||||
*/
|
|
||||||
extern "C" {
|
|
||||||
|
|
||||||
extern cudaError_t CUDARTAPI __cudaPopCallConfiguration(dim3 *gridDim,
|
|
||||||
dim3 *blockDim,
|
|
||||||
size_t *sharedMem,
|
|
||||||
void **stream) {
|
|
||||||
// printf("__cudaPopCallConfiguration: Grid: x:%d y:%d z:%d Block: %d, %d, %d
|
|
||||||
// ShMem: %lu\n",
|
|
||||||
// gridDim->x, gridDim->y, gridDim->z, blockDim->x, blockDim->y, blockDim->z,
|
|
||||||
// *sharedMem);
|
|
||||||
|
|
||||||
*gridDim = callParamTemp.gridDim;
|
|
||||||
*blockDim = callParamTemp.blockDim;
|
|
||||||
*sharedMem = callParamTemp.shareMem;
|
|
||||||
*stream = callParamTemp.stream;
|
|
||||||
|
|
||||||
// printf("__cudaPopCallConfiguration After : Grid: x:%d y:%d z:%d Block: %d,
|
|
||||||
// %d, %d ShMem: %lu\n", gridDim->x, gridDim->y, gridDim->z, blockDim->x,
|
|
||||||
// blockDim->y, blockDim->z, *sharedMem);
|
|
||||||
|
|
||||||
// exit(1);
|
|
||||||
|
|
||||||
return cudaSuccess;
|
|
||||||
}
|
|
||||||
|
|
||||||
extern __host__ __device__ unsigned CUDARTAPI __cudaPushCallConfiguration(
|
|
||||||
dim3 gridDim, dim3 blockDim, size_t sharedMem = 0, void *stream = 0) {
|
|
||||||
|
|
||||||
// printf("__cudaPushCallConfiguration Grid: x:%d y:%d z:%d Block: %d, %d, %d
|
|
||||||
// "
|
|
||||||
// "ShMem: %lu\n ",
|
|
||||||
// gridDim.x, gridDim.y, gridDim.z, blockDim.x, blockDim.y, blockDim.z,
|
|
||||||
// sharedMem);
|
|
||||||
|
|
||||||
// memory checks allocations
|
|
||||||
callParamTemp.gridDim = gridDim;
|
|
||||||
|
|
||||||
// std::cout << "assign gridDim" << std::endl;
|
|
||||||
|
|
||||||
callParamTemp.blockDim = blockDim;
|
|
||||||
// std::cout << "assign blockDim" << std::endl;
|
|
||||||
callParamTemp.shareMem = sharedMem;
|
|
||||||
// std::cout << "assign shareMem" << std::endl;
|
|
||||||
(callParamTemp.stream) = stream;
|
|
||||||
|
|
||||||
// printf("__cudaPushCallConfiguration After Grid: x:%d y:%d z:%d Block: %d,
|
|
||||||
// %d, %d ShMem: %lu\n",
|
|
||||||
// gridDim.x, gridDim.y, gridDim.z, blockDim.x, blockDim.y, blockDim.z,
|
|
||||||
// sharedMem);
|
|
||||||
|
|
||||||
// return 0 continues the Pop
|
|
||||||
return cudaSuccess;
|
|
||||||
|
|
||||||
// return ne 0 skips the Pop
|
|
||||||
}
|
|
||||||
}
|
|
|
@ -1,6 +1,7 @@
|
||||||
#include "cudaRuntimeImpl.h"
|
#include "cudaRuntimeImpl.h"
|
||||||
#include "api.h"
|
#include "api.h"
|
||||||
#include "cuda_runtime.h"
|
#include "cuda_runtime.h"
|
||||||
|
#include "debug.hpp"
|
||||||
#include "def.h"
|
#include "def.h"
|
||||||
#include "macros.h"
|
#include "macros.h"
|
||||||
#include "structures.h"
|
#include "structures.h"
|
||||||
|
@ -38,11 +39,10 @@ cudaError_t cudaFreeHost(void *devPtr) {
|
||||||
cudaError_t cudaLaunchKernel(const void *func, dim3 gridDim, dim3 blockDim,
|
cudaError_t cudaLaunchKernel(const void *func, dim3 gridDim, dim3 blockDim,
|
||||||
void **args, size_t sharedMem,
|
void **args, size_t sharedMem,
|
||||||
cudaStream_t stream) {
|
cudaStream_t stream) {
|
||||||
// if scheduler is null init device
|
DEBUG_INFO(
|
||||||
// printf(
|
"cudaLaunchKernel : Grid: x:%d y:%d z:%d Block: %d, %d, %d ShMem: %d\n",
|
||||||
// "cudaLaunchKernel : Grid: x:%d y:%d z:%d Block: %d, %d, %d ShMem:%lu\n
|
gridDim.x, gridDim.y, gridDim.z, blockDim.x, blockDim.y, blockDim.z,
|
||||||
// ", gridDim.x, gridDim.y, gridDim.z, blockDim.x, blockDim.y, blockDim.z,
|
sharedMem);
|
||||||
// sharedMem);
|
|
||||||
|
|
||||||
cu_kernel *ker =
|
cu_kernel *ker =
|
||||||
create_kernel(func, gridDim, blockDim, args, sharedMem, stream);
|
create_kernel(func, gridDim, blockDim, args, sharedMem, stream);
|
||||||
|
@ -66,16 +66,12 @@ cudaError_t cudaMemcpy(void *dst, const void *src, size_t count,
|
||||||
if (kind == cudaMemcpyHostToHost) {
|
if (kind == cudaMemcpyHostToHost) {
|
||||||
memcpy(dst, src, count);
|
memcpy(dst, src, count);
|
||||||
} else if (kind == cudaMemcpyDeviceToHost) {
|
} else if (kind == cudaMemcpyDeviceToHost) {
|
||||||
// how does the code know which device accessing the memory
|
|
||||||
memcpy(dst, src, count);
|
memcpy(dst, src, count);
|
||||||
} else if (kind == cudaMemcpyHostToDevice) {
|
} else if (kind == cudaMemcpyHostToDevice) {
|
||||||
// how does the code know which device accessing the memory
|
|
||||||
memcpy(dst, src, count);
|
memcpy(dst, src, count);
|
||||||
} else if (kind == cudaMemcpyDeviceToHost) {
|
} else if (kind == cudaMemcpyDeviceToHost) {
|
||||||
// how does the code know which device accessing the memory
|
|
||||||
memcpy(dst, src, count);
|
memcpy(dst, src, count);
|
||||||
} else if (kind == cudaMemcpyDeviceToDevice) {
|
} else if (kind == cudaMemcpyDeviceToDevice) {
|
||||||
|
|
||||||
memcpy(dst, dst, count);
|
memcpy(dst, dst, count);
|
||||||
} else if (kind == cudaMemcpyDefault) {
|
} else if (kind == cudaMemcpyDefault) {
|
||||||
memcpy(dst, src, count);
|
memcpy(dst, src, count);
|
||||||
|
@ -100,36 +96,31 @@ cudaError_t cudaStreamCopyAttributes(cudaStream_t dst, cudaStream_t src) {
|
||||||
cstreamData *src_stream = (cstreamData *)src;
|
cstreamData *src_stream = (cstreamData *)src;
|
||||||
|
|
||||||
if (dst_stream == NULL || src_stream == NULL) {
|
if (dst_stream == NULL || src_stream == NULL) {
|
||||||
return cudaErrorInvalidValue; // 1
|
return cudaErrorInvalidValue;
|
||||||
}
|
}
|
||||||
|
|
||||||
dst_stream->stream_priority = src_stream->stream_priority;
|
dst_stream->stream_priority = src_stream->stream_priority;
|
||||||
dst_stream->stream_flags = src_stream->stream_flags;
|
dst_stream->stream_flags = src_stream->stream_flags;
|
||||||
|
|
||||||
return cudaSuccess; // 0
|
return cudaSuccess;
|
||||||
}
|
}
|
||||||
|
|
||||||
static int stream_counter = 1;
|
static int stream_counter = 1;
|
||||||
/*
|
/*
|
||||||
cudaStream_t is a Opaque Structure
|
cudaStream_t is a Opaque Structure
|
||||||
|
|
||||||
Overwrites cudaStream_t into custom cstreamData structure
|
Overwrites cudaStream_t into custom cstreamData structure
|
||||||
(does hardware uses the cudaStream_t stream)
|
(does hardware uses the cudaStream_t stream)
|
||||||
|
|
||||||
*/
|
*/
|
||||||
cudaError_t cudaStreamCreate(cudaStream_t *pStream) {
|
cudaError_t cudaStreamCreate(cudaStream_t *pStream) {
|
||||||
printf("cudaStreamCreate no Implement\n");
|
assert(0 && "cudaStreamCreate no Implement\n");
|
||||||
exit(1);
|
|
||||||
}
|
}
|
||||||
|
|
||||||
cudaError_t cudaStreamDestroy(cudaStream_t stream) {
|
cudaError_t cudaStreamDestroy(cudaStream_t stream) {
|
||||||
printf("cudaStreamDestroy No Implement\n");
|
assert(0 && "cudaStreamDestroy No Implement\n");
|
||||||
exit(1);
|
|
||||||
}
|
}
|
||||||
|
|
||||||
cudaError_t cudaStreamSynchronize(cudaStream_t stream) {
|
cudaError_t cudaStreamSynchronize(cudaStream_t stream) {
|
||||||
printf("cudaStreamSynchronize No Implement\n");
|
assert(0 && "cudaStreamSynchronize No Implement\n");
|
||||||
exit(1);
|
|
||||||
}
|
}
|
||||||
|
|
||||||
cudaError_t cudaGetDeviceCount(int *count) {
|
cudaError_t cudaGetDeviceCount(int *count) {
|
||||||
|
@ -139,7 +130,6 @@ cudaError_t cudaGetDeviceCount(int *count) {
|
||||||
}
|
}
|
||||||
|
|
||||||
cudaError_t cudaGetDeviceProperties(cudaDeviceProp *deviceProp, int device) {
|
cudaError_t cudaGetDeviceProperties(cudaDeviceProp *deviceProp, int device) {
|
||||||
|
|
||||||
// dummy values
|
// dummy values
|
||||||
if (device == 0) {
|
if (device == 0) {
|
||||||
strcpy(deviceProp->name, "pthread");
|
strcpy(deviceProp->name, "pthread");
|
||||||
|
@ -188,53 +178,31 @@ extern cudaError_t CUDARTAPI __cudaPopCallConfiguration(dim3 *gridDim,
|
||||||
dim3 *blockDim,
|
dim3 *blockDim,
|
||||||
size_t *sharedMem,
|
size_t *sharedMem,
|
||||||
void **stream) {
|
void **stream) {
|
||||||
// printf("__cudaPopCallConfiguration: Grid: x:%d y:%d z:%d Block: %d, %d, %d
|
DEBUG_INFO("__cudaPopCallConfiguration: Grid: x:%d y:%d z:%d Block: %d, %d, "
|
||||||
// ShMem: %lu\n",
|
"%d ShMem: %lu\n",
|
||||||
// gridDim->x, gridDim->y, gridDim->z, blockDim->x, blockDim->y, blockDim->z,
|
gridDim->x, gridDim->y, gridDim->z, blockDim->x, blockDim->y,
|
||||||
// *sharedMem);
|
blockDim->z, *sharedMem);
|
||||||
|
|
||||||
*gridDim = callParamTemp.gridDim;
|
*gridDim = callParamTemp.gridDim;
|
||||||
*blockDim = callParamTemp.blockDim;
|
*blockDim = callParamTemp.blockDim;
|
||||||
*sharedMem = callParamTemp.shareMem;
|
*sharedMem = callParamTemp.shareMem;
|
||||||
*stream = callParamTemp.stream;
|
*stream = callParamTemp.stream;
|
||||||
|
|
||||||
// printf("__cudaPopCallConfiguration After : Grid: x:%d y:%d z:%d Block: %d,
|
|
||||||
// %d, %d ShMem: %lu\n", gridDim->x, gridDim->y, gridDim->z, blockDim->x,
|
|
||||||
// blockDim->y, blockDim->z, *sharedMem);
|
|
||||||
|
|
||||||
// exit(1);
|
|
||||||
|
|
||||||
return cudaSuccess;
|
return cudaSuccess;
|
||||||
}
|
}
|
||||||
|
|
||||||
extern __host__ __device__ unsigned CUDARTAPI __cudaPushCallConfiguration(
|
extern __host__ __device__ unsigned CUDARTAPI __cudaPushCallConfiguration(
|
||||||
dim3 gridDim, dim3 blockDim, size_t sharedMem = 0, void *stream = 0) {
|
dim3 gridDim, dim3 blockDim, size_t sharedMem = 0, void *stream = 0) {
|
||||||
|
DEBUG_INFO("__cudaPushCallConfiguration: Grid: x:%d y:%d z:%d Block: %d, %d, "
|
||||||
// printf("__cudaPushCallConfiguration Grid: x:%d y:%d z:%d Block: %d, %d, %d
|
"%d ShMem: %lu\n",
|
||||||
// "
|
gridDim.x, gridDim.y, gridDim.z, blockDim.x, blockDim.y,
|
||||||
// "ShMem: %lu\n ",
|
blockDim.z, sharedMem);
|
||||||
// gridDim.x, gridDim.y, gridDim.z, blockDim.x, blockDim.y, blockDim.z,
|
|
||||||
// sharedMem);
|
|
||||||
|
|
||||||
// memory checks allocations
|
// memory checks allocations
|
||||||
callParamTemp.gridDim = gridDim;
|
callParamTemp.gridDim = gridDim;
|
||||||
|
|
||||||
// std::cout << "assign gridDim" << std::endl;
|
|
||||||
|
|
||||||
callParamTemp.blockDim = blockDim;
|
callParamTemp.blockDim = blockDim;
|
||||||
// std::cout << "assign blockDim" << std::endl;
|
|
||||||
callParamTemp.shareMem = sharedMem;
|
callParamTemp.shareMem = sharedMem;
|
||||||
// std::cout << "assign shareMem" << std::endl;
|
|
||||||
(callParamTemp.stream) = stream;
|
(callParamTemp.stream) = stream;
|
||||||
|
|
||||||
// printf("__cudaPushCallConfiguration After Grid: x:%d y:%d z:%d Block: %d,
|
|
||||||
// %d, %d ShMem: %lu\n",
|
|
||||||
// gridDim.x, gridDim.y, gridDim.z, blockDim.x, blockDim.y, blockDim.z,
|
|
||||||
// sharedMem);
|
|
||||||
|
|
||||||
// return 0 continues the Pop
|
|
||||||
return cudaSuccess;
|
return cudaSuccess;
|
||||||
|
|
||||||
// return ne 0 skips the Pop
|
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
|
@ -11,6 +11,7 @@ set(LIB_NAME threadPool)
|
||||||
|
|
||||||
set(CMAKE_CXX_STANDARD 14)
|
set(CMAKE_CXX_STANDARD 14)
|
||||||
set(CMAKE_BUILD_TYPE Debug)
|
set(CMAKE_BUILD_TYPE Debug)
|
||||||
|
include_directories(../../common)
|
||||||
include_directories(./include)
|
include_directories(./include)
|
||||||
include_directories(./include/x86)
|
include_directories(./include/x86)
|
||||||
include_directories(../../external/moodycamel)
|
include_directories(../../external/moodycamel)
|
||||||
|
|
|
@ -1,5 +1,6 @@
|
||||||
#include "api.h"
|
#include "api.h"
|
||||||
#include "blockingconcurrentqueue.h"
|
#include "blockingconcurrentqueue.h"
|
||||||
|
#include "debug.hpp"
|
||||||
#include "def.h"
|
#include "def.h"
|
||||||
#include "macros.h"
|
#include "macros.h"
|
||||||
#include "structures.h"
|
#include "structures.h"
|
||||||
|
@ -22,8 +23,8 @@ int init_device() {
|
||||||
return C_ERROR_MEMALLOC;
|
return C_ERROR_MEMALLOC;
|
||||||
|
|
||||||
device->max_compute_units = std::thread::hardware_concurrency();
|
device->max_compute_units = std::thread::hardware_concurrency();
|
||||||
std::cout << device->max_compute_units
|
DEBUG_INFO("%d concurrent threads are supported.\n",
|
||||||
<< " concurrent threads are supported.\n";
|
device->max_compute_units);
|
||||||
device_max_compute_units = device->max_compute_units;
|
device_max_compute_units = device->max_compute_units;
|
||||||
|
|
||||||
// initialize scheduler
|
// initialize scheduler
|
||||||
|
@ -46,13 +47,9 @@ cu_kernel *create_kernel(const void *func, dim3 gridDim, dim3 blockDim,
|
||||||
|
|
||||||
ker->gridDim = gridDim;
|
ker->gridDim = gridDim;
|
||||||
ker->blockDim = blockDim;
|
ker->blockDim = blockDim;
|
||||||
|
|
||||||
ker->shared_mem = sharedMem;
|
ker->shared_mem = sharedMem;
|
||||||
|
|
||||||
ker->stream = stream;
|
ker->stream = stream;
|
||||||
|
|
||||||
ker->totalBlocks = gridDim.x * gridDim.y * gridDim.z;
|
ker->totalBlocks = gridDim.x * gridDim.y * gridDim.z;
|
||||||
|
|
||||||
ker->blockSize = blockDim.x * blockDim.y * blockDim.z;
|
ker->blockSize = blockDim.x * blockDim.y * blockDim.z;
|
||||||
return ker;
|
return ker;
|
||||||
}
|
}
|
||||||
|
@ -97,9 +94,6 @@ int schedulerEnqueueKernel(cu_kernel *k) {
|
||||||
scheduler->kernelQueue->enqueue(p);
|
scheduler->kernelQueue->enqueue(p);
|
||||||
TaskToExecute++;
|
TaskToExecute++;
|
||||||
}
|
}
|
||||||
|
|
||||||
// printf("total: %d execute per cpu: %d\n", totalBlocks,
|
|
||||||
// gpuBlockToExecutePerCpuThread);
|
|
||||||
return C_SUCCESS;
|
return C_SUCCESS;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
@ -121,8 +115,7 @@ int cuLaunchKernel(cu_kernel **k) {
|
||||||
}
|
}
|
||||||
schedulerEnqueueKernel(ker);
|
schedulerEnqueueKernel(ker);
|
||||||
} else {
|
} else {
|
||||||
printf("MultiStream no implemente\n");
|
assert(0 && "MultiStream no implemente\n");
|
||||||
exit(1);
|
|
||||||
}
|
}
|
||||||
return 0;
|
return 0;
|
||||||
}
|
}
|
||||||
|
@ -185,11 +178,9 @@ void *driver_thread(void *p) {
|
||||||
// exit the routine
|
// exit the routine
|
||||||
if (is_exit) {
|
if (is_exit) {
|
||||||
td->exit = true;
|
td->exit = true;
|
||||||
// pthread_exit
|
|
||||||
pthread_exit(NULL);
|
pthread_exit(NULL);
|
||||||
} else {
|
} else {
|
||||||
printf("driver thread stop incorrectly\n");
|
assert(0 && "driver thread stop incorrectly\n");
|
||||||
exit(1);
|
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
@ -215,29 +206,17 @@ int scheduler_init(cu_device device) {
|
||||||
return C_SUCCESS;
|
return C_SUCCESS;
|
||||||
}
|
}
|
||||||
|
|
||||||
void scheduler_uninit() {
|
void scheduler_uninit() { assert(0 && "Scheduler Unitit no Implemente\n"); }
|
||||||
printf("Scheduler Unitit no Implemente\n");
|
|
||||||
exit(1);
|
|
||||||
}
|
|
||||||
|
|
||||||
/*
|
/*
|
||||||
Barrier for Kernel Launch
|
Barrier for Kernel Launch
|
||||||
|
|
||||||
During kernel launch, increment the number of work items required to finish
|
|
||||||
Each kernel will point to the same event
|
|
||||||
|
|
||||||
During Running Command, decrement the event.work_item count
|
|
||||||
when count is 0, all work items for this kernel launch is finish
|
|
||||||
|
|
||||||
Sense Like Barrier
|
|
||||||
Counting Barrier basically
|
|
||||||
*/
|
*/
|
||||||
void cuSynchronizeBarrier() {
|
void cuSynchronizeBarrier() {
|
||||||
if (!device_initilized) {
|
if (!device_initilized) {
|
||||||
init_device();
|
init_device();
|
||||||
}
|
}
|
||||||
while (1) {
|
while (1) {
|
||||||
// (TODO): currently, we assume each kernel launch has a
|
// after compilation transformation, each kernel launch has a
|
||||||
// following sync
|
// following sync
|
||||||
if (scheduler->kernelQueue->size_approx() == 0) {
|
if (scheduler->kernelQueue->size_approx() == 0) {
|
||||||
int completeBlock = 0;
|
int completeBlock = 0;
|
||||||
|
|
Loading…
Reference in New Issue