use lock-free queue
This commit is contained in:
parent
cbf4cd90d8
commit
db585083bb
|
@ -155,41 +155,3 @@ jobs:
|
||||||
llc --relocation-model=pic --filetype=obj host.bc
|
llc --relocation-model=pic --filetype=obj host.bc
|
||||||
g++ -o lavaMD -fPIC -no-pie -I${{ github.workspace }}/runtime/threadPool/include -L${{ github.workspace }}/build/runtime -L${{ github.workspace }}/build/runtime/threadPool main.c host.o kernel.o util/timer/timer.c util/num/num.c -lpthread -lc -lx86Runtime -lthreadPool -pthread
|
g++ -o lavaMD -fPIC -no-pie -I${{ github.workspace }}/runtime/threadPool/include -L${{ github.workspace }}/build/runtime -L${{ github.workspace }}/build/runtime/threadPool main.c host.o kernel.o util/timer/timer.c util/num/num.c -lpthread -lc -lx86Runtime -lthreadPool -pthread
|
||||||
./lavaMD -boxes1d 10
|
./lavaMD -boxes1d 10
|
||||||
- name: Execute the dwt2d example
|
|
||||||
run: |
|
|
||||||
cd ${{ github.workspace }}/SC_evaluate/rodinia-cox/dwt2d
|
|
||||||
clang++ -I. -I/include -fno-strict-aliasing dwt_cuda/fdwt53.cu dwt_cuda/fdwt97.cu dwt_cuda/common.cu dwt_cuda/rdwt97.cu dwt_cuda/rdwt53.cu components.cu dwt.cu main.cu -c --cuda-path=${{ github.workspace }}/cuda-10.1 --cuda-gpu-arch=sm_61 -L${{ github.workspace }}/cuda-10.1/lib64 -lcudart_static -ldl -lrt -pthread -save-temps -v || true
|
|
||||||
export LD_LIBRARY_PATH=${{ github.workspace }}/build/runtime:${{ github.workspace }}/build/runtime/threadPool:$LD_LIBRARY_PATH
|
|
||||||
export PATH=${{ github.workspace }}/build/compilation:$PATH
|
|
||||||
kernelTranslator common-cuda-nvptx64-nvidia-cuda-sm_61.bc common.bc
|
|
||||||
kernelTranslator components-cuda-nvptx64-nvidia-cuda-sm_61.bc components.bc
|
|
||||||
kernelTranslator fdwt53-cuda-nvptx64-nvidia-cuda-sm_61.bc fdwt53.bc
|
|
||||||
kernelTranslator dwt-cuda-nvptx64-nvidia-cuda-sm_61.bc dwt.bc
|
|
||||||
kernelTranslator fdwt97-cuda-nvptx64-nvidia-cuda-sm_61.bc fdwt97.bc
|
|
||||||
kernelTranslator rdwt97-cuda-nvptx64-nvidia-cuda-sm_61.bc rdwt97.bc
|
|
||||||
kernelTranslator rdwt53-cuda-nvptx64-nvidia-cuda-sm_61.bc rdwt53.bc
|
|
||||||
hostTranslator main-host-x86_64-unknown-linux-gnu.bc host.bc
|
|
||||||
hostTranslator common-host-x86_64-unknown-linux-gnu.bc common_host.bc
|
|
||||||
hostTranslator components-host-x86_64-unknown-linux-gnu.bc components_host.bc
|
|
||||||
hostTranslator dwt-host-x86_64-unknown-linux-gnu.bc dwt_host.bc
|
|
||||||
hostTranslator fdwt53-host-x86_64-unknown-linux-gnu.bc fdwt53_host.bc
|
|
||||||
hostTranslator fdwt97-host-x86_64-unknown-linux-gnu.bc fdwt97_host.bc
|
|
||||||
hostTranslator rdwt53-host-x86_64-unknown-linux-gnu.bc rdwt53_host.bc
|
|
||||||
hostTranslator rdwt97-host-x86_64-unknown-linux-gnu.bc rdwt97_host.bc
|
|
||||||
llc --relocation-model=pic --filetype=obj common.bc
|
|
||||||
llc --relocation-model=pic --filetype=obj components.bc
|
|
||||||
llc --relocation-model=pic --filetype=obj fdwt53.bc
|
|
||||||
llc --relocation-model=pic --filetype=obj dwt.bc
|
|
||||||
llc --relocation-model=pic --filetype=obj host.bc
|
|
||||||
llc --relocation-model=pic --filetype=obj common_host.bc
|
|
||||||
llc --relocation-model=pic --filetype=obj components_host.bc
|
|
||||||
llc --relocation-model=pic --filetype=obj fdwt53_host.bc
|
|
||||||
llc --relocation-model=pic --filetype=obj dwt_host.bc
|
|
||||||
llc --relocation-model=pic --filetype=obj fdwt97_host.bc
|
|
||||||
llc --relocation-model=pic --filetype=obj rdwt97_host.bc
|
|
||||||
llc --relocation-model=pic --filetype=obj rdwt53_host.bc
|
|
||||||
llc --relocation-model=pic --filetype=obj fdwt97.bc
|
|
||||||
llc --relocation-model=pic --filetype=obj rdwt97.bc
|
|
||||||
llc --relocation-model=pic --filetype=obj rdwt53.bc
|
|
||||||
g++ -o dwt2d -fPIC -no-pie -I${{ github.workspace }}/runtime/threadPool/include -L${{ github.workspace }}/build/runtime -L${{ github.workspace }}/build/runtime/threadPool common.o components.o dwt.o fdwt53.o fdwt97.o rdwt97.o rdwt53.o host.o common_host.o components_host.o dwt_host.o fdwt53_host.o fdwt97_host.o rdwt97_host.o rdwt53_host.o -lpthread -lc -lx86Runtime -lthreadPool -pthread
|
|
||||||
./dwt2d 192.bmp -d 192x192 -f -5 -l 3
|
|
||||||
|
|
|
@ -118,17 +118,17 @@ static int stream_counter = 1;
|
||||||
|
|
||||||
*/
|
*/
|
||||||
cudaError_t cudaStreamCreate(cudaStream_t *pStream) {
|
cudaError_t cudaStreamCreate(cudaStream_t *pStream) {
|
||||||
printf("No Implement\n");
|
printf("cudaStreamCreate no Implement\n");
|
||||||
exit(1);
|
exit(1);
|
||||||
}
|
}
|
||||||
|
|
||||||
cudaError_t cudaStreamDestroy(cudaStream_t stream) {
|
cudaError_t cudaStreamDestroy(cudaStream_t stream) {
|
||||||
printf("No Implement\n");
|
printf("cudaStreamDestroy No Implement\n");
|
||||||
exit(1);
|
exit(1);
|
||||||
}
|
}
|
||||||
|
|
||||||
cudaError_t cudaStreamSynchronize(cudaStream_t stream) {
|
cudaError_t cudaStreamSynchronize(cudaStream_t stream) {
|
||||||
printf("No Implement\n");
|
printf("cudaStreamSynchronize No Implement\n");
|
||||||
exit(1);
|
exit(1);
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
|
@ -17,6 +17,7 @@ typedef struct c_thread
|
||||||
unsigned long executed_commands;
|
unsigned long executed_commands;
|
||||||
unsigned index;
|
unsigned index;
|
||||||
bool exit;
|
bool exit;
|
||||||
|
bool busy;
|
||||||
} cu_ptd;
|
} cu_ptd;
|
||||||
|
|
||||||
// kernel information
|
// kernel information
|
||||||
|
|
|
@ -24,7 +24,6 @@ int init_device()
|
||||||
return C_ERROR_MEMALLOC;
|
return C_ERROR_MEMALLOC;
|
||||||
|
|
||||||
device->max_compute_units = std::thread::hardware_concurrency();
|
device->max_compute_units = std::thread::hardware_concurrency();
|
||||||
device->max_compute_units = 4;
|
|
||||||
std::cout << device->max_compute_units
|
std::cout << device->max_compute_units
|
||||||
<< " concurrent threads are supported.\n";
|
<< " concurrent threads are supported.\n";
|
||||||
device_max_compute_units = device->max_compute_units;
|
device_max_compute_units = device->max_compute_units;
|
||||||
|
@ -129,8 +128,8 @@ int schedulerEnqueueKernel(cu_kernel *k)
|
||||||
scheduler->kernelQueue->enqueue(p);
|
scheduler->kernelQueue->enqueue(p);
|
||||||
}
|
}
|
||||||
|
|
||||||
printf("total: %d execute per cpu: %d\n", totalBlocks,
|
// printf("total: %d execute per cpu: %d\n", totalBlocks,
|
||||||
gpuBlockToExecutePerCpuThread);
|
// gpuBlockToExecutePerCpuThread);
|
||||||
return C_SUCCESS;
|
return C_SUCCESS;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
@ -139,12 +138,6 @@ int schedulerEnqueueKernel(cu_kernel *k)
|
||||||
*/
|
*/
|
||||||
int cuLaunchKernel(cu_kernel **k)
|
int cuLaunchKernel(cu_kernel **k)
|
||||||
{
|
{
|
||||||
if (!scheduler)
|
|
||||||
{
|
|
||||||
init_device();
|
|
||||||
}
|
|
||||||
std::cout << "launch\n"
|
|
||||||
<< std::flush;
|
|
||||||
// Calculate Block Size N/numBlocks
|
// Calculate Block Size N/numBlocks
|
||||||
cu_kernel *ker = *k;
|
cu_kernel *ker = *k;
|
||||||
int status = C_RUN;
|
int status = C_RUN;
|
||||||
|
@ -173,9 +166,11 @@ int get_work(c_thread *th)
|
||||||
{
|
{
|
||||||
// try to get a task from the queue
|
// try to get a task from the queue
|
||||||
cu_kernel *k;
|
cu_kernel *k;
|
||||||
|
th->busy = false;
|
||||||
bool getTask = scheduler->kernelQueue->wait_dequeue_timed(k, std::chrono::milliseconds(5));
|
bool getTask = scheduler->kernelQueue->wait_dequeue_timed(k, std::chrono::milliseconds(5));
|
||||||
if (getTask)
|
if (getTask)
|
||||||
{
|
{
|
||||||
|
th->busy = true;
|
||||||
// set runtime configuration
|
// set runtime configuration
|
||||||
gridDim = k->gridDim;
|
gridDim = k->gridDim;
|
||||||
blockDim = k->blockDim;
|
blockDim = k->blockDim;
|
||||||
|
@ -190,7 +185,6 @@ int get_work(c_thread *th)
|
||||||
if (dynamic_shared_mem_size > 0)
|
if (dynamic_shared_mem_size > 0)
|
||||||
dynamic_shared_memory = (int *)malloc(dynamic_shared_mem_size);
|
dynamic_shared_memory = (int *)malloc(dynamic_shared_mem_size);
|
||||||
// execute GPU blocks
|
// execute GPU blocks
|
||||||
printf("exec: from: %d to : %d\n",k->startBlockId, k->endBlockId);
|
|
||||||
for (block_index = k->startBlockId; block_index < k->endBlockId + 1; block_index++)
|
for (block_index = k->startBlockId; block_index < k->endBlockId + 1; block_index++)
|
||||||
{
|
{
|
||||||
int tmp = block_index;
|
int tmp = block_index;
|
||||||
|
@ -201,11 +195,11 @@ int get_work(c_thread *th)
|
||||||
block_index_z = tmp;
|
block_index_z = tmp;
|
||||||
k->start_routine(k->args);
|
k->start_routine(k->args);
|
||||||
}
|
}
|
||||||
printf("done: from: %d to : %d\n",k->startBlockId, k->endBlockId);
|
|
||||||
}
|
}
|
||||||
// if cannot get tasks, check whether programs stop
|
// if cannot get tasks, check whether programs stop
|
||||||
else if (scheduler->threadpool_shutdown_requested)
|
else if (scheduler->threadpool_shutdown_requested)
|
||||||
{
|
{
|
||||||
|
th->busy = false;
|
||||||
return true; // thread exit
|
return true; // thread exit
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
@ -217,17 +211,15 @@ void *driver_thread(void *p)
|
||||||
struct c_thread *td = (struct c_thread *)p;
|
struct c_thread *td = (struct c_thread *)p;
|
||||||
int is_exit = 0;
|
int is_exit = 0;
|
||||||
td->exit = false;
|
td->exit = false;
|
||||||
|
td->busy = false;
|
||||||
// get work
|
// get work
|
||||||
printf("before getwork\n");
|
|
||||||
is_exit = get_work(td);
|
is_exit = get_work(td);
|
||||||
printf("after getwork\n");
|
|
||||||
|
|
||||||
// exit the routine
|
// exit the routine
|
||||||
if (is_exit)
|
if (is_exit)
|
||||||
{
|
{
|
||||||
td->exit = true;
|
td->exit = true;
|
||||||
// pthread_exit
|
// pthread_exit
|
||||||
printf("pthread exit\n");
|
|
||||||
pthread_exit(NULL);
|
pthread_exit(NULL);
|
||||||
}
|
}
|
||||||
else
|
else
|
||||||
|
@ -263,7 +255,7 @@ int scheduler_init(cu_device device)
|
||||||
|
|
||||||
void scheduler_uninit()
|
void scheduler_uninit()
|
||||||
{
|
{
|
||||||
printf("No Implemente\n");
|
printf("Scheduler Unitit no Implemente\n");
|
||||||
exit(1);
|
exit(1);
|
||||||
}
|
}
|
||||||
|
|
||||||
|
@ -281,7 +273,15 @@ void scheduler_uninit()
|
||||||
*/
|
*/
|
||||||
void cuSynchronizeBarrier()
|
void cuSynchronizeBarrier()
|
||||||
{
|
{
|
||||||
while (scheduler->kernelQueue->size_approx() > 0)
|
while(1) {
|
||||||
;
|
// sync is complete, only if queue size == 0 and none of
|
||||||
printf("size: %d\n",scheduler->kernelQueue->size_approx());
|
// driver threads are busy
|
||||||
|
if(scheduler->kernelQueue->size_approx() == 0) {
|
||||||
|
bool none_busy = true;
|
||||||
|
for (int i = 0; i < scheduler->num_worker_threads; i++) {
|
||||||
|
none_busy&=(!scheduler->thread_pool[i].busy);
|
||||||
|
}
|
||||||
|
if(none_busy) break;
|
||||||
|
}
|
||||||
|
}
|
||||||
}
|
}
|
||||||
|
|
Loading…
Reference in New Issue