CuPBoP/examples/btree/kernel/kernel_gpu_cuda_wrapper_2.cu

#ifdef __cplusplus
extern "C" {
#endif

//========================================================================================================================================================================================================200
//	INCLUDE
//========================================================================================================================================================================================================200

//======================================================================================================================================================150
//	COMMON
//======================================================================================================================================================150

#include "../common.h"									// (in the main program folder)	needed to recognized input parameters

//======================================================================================================================================================150
//	UTILITIES
//======================================================================================================================================================150

#include "../util/cuda/cuda.h"							// (in library path specified to compiler)	needed by for device functions
#include "../util/timer/timer.h"						// (in library path specified to compiler)	needed by timer

//======================================================================================================================================================150
//	KERNEL
//======================================================================================================================================================150

#include "./kernel_gpu_cuda_2.cu"						// (in the current directory)	GPU kernel, cannot include with header file because of complications with passing of constant memory variables

//======================================================================================================================================================150
//	HEADER
//======================================================================================================================================================150

#include "./kernel_gpu_cuda_wrapper_2.h"				// (in the current directory)

//========================================================================================================================================================================================================200
//	FUNCTION
//========================================================================================================================================================================================================200

//======================================================================================================================================================150
//	TIMING REPORT HELPER (file-local)
//======================================================================================================================================================150

// Prints one row of the timing table: the elapsed time of a stage in seconds
// and its share of the whole run in percent, followed by the stage label.
//	label	text printed after the numbers
//	begin	get_time() reading taken when the stage started
//	finish	get_time() reading taken when the stage ended
//	total	time spent in the whole wrapper (last reading minus first reading)
// NOTE(review): readings are divided by 1000000 and reported as seconds, so
// get_time() presumably returns microseconds -- confirm against util/timer.
static void
wrapper_2_print_stage(	const char *label,
						long long begin,
						long long finish,
						long long total)
{
	// double keeps the 12 printed decimals meaningful (float carries only ~7 significant digits)
	double seconds = (double) (finish - begin) / 1000000;

	// guard the share computation so a zero-length run cannot divide by zero
	double percent = 0.0;
	if(total != 0){
		percent = (double) (finish - begin) / (double) total * 100;
	}

	// "%%" prints a literal percent sign; the bare "% :" used previously is an invalid conversion specification
	printf("%15.12f s, %15.12f %% : %s\n", seconds, percent, label);
}

//======================================================================================================================================================150
//	WRAPPER
//======================================================================================================================================================150

// Host-side driver for the findRangeK kernel: allocates device buffers, copies
// the inputs to the device, launches the kernel with one block per query,
// copies the two result arrays back, frees the device buffers and prints a
// per-stage timing table.
//	knodes			host node array; knodes_mem bytes of it are copied to the device
//	knodes_elem		forwarded unchanged to findRangeK
//	knodes_mem		size of the knodes array in bytes
//	order			requested threads per block (capped at 1024)
//	maxheight		forwarded unchanged to findRangeK
//	count			number of blocks launched and element count of every array below
//	currKnode, offset, lastKnode, offset_2
//					host arrays of count longs, copied to the device before the launch
//	start, end		host arrays of count ints, copied to the device before the launch
//	recstart, reclength
//					host arrays of count ints; copied to the device before the launch
//					and overwritten with the device contents after the kernel finishes
// Errors are reported through checkCUDAError() after every CUDA call that can fail.
void
kernel_gpu_cuda_wrapper_2(	knode *knodes,
							long knodes_elem,
							long knodes_mem,

							int order,
							long maxheight,
							int count,

							long *currKnode,
							long *offset,
							long *lastKnode,
							long *offset_2,
							int *start,
							int *end,
							int *recstart,
							int *reclength)
{

	//======================================================================================================================================================150
	//	CPU VARIABLES
	//======================================================================================================================================================150

	// timer readings taken at the boundary of every stage
	long long time0;
	long long time1;
	long long time2;
	long long time3;
	long long time4;
	long long time5;
	long long time6;

	time0 = get_time();

	//======================================================================================================================================================150
	//	GPU SETUP
	//======================================================================================================================================================150

	//====================================================================================================100
	//	INITIAL DRIVER OVERHEAD
	//====================================================================================================100

	// first runtime call, so the driver start-up cost is charged to this stage
	// (cudaThreadSynchronize is deprecated in favour of cudaDeviceSynchronize; kept to match the rest of the project)
	cudaThreadSynchronize();

	//====================================================================================================100
	//	EXECUTION PARAMETERS
	//====================================================================================================100

	// one block per query, one thread per key slot, capped at 1024 threads per block
	int numBlocks = count;
	int threadsPerBlock = order < 1024 ? order : 1024;

	// byte sizes of the per-query arrays (same value as count*sizeof(T))
	size_t longBytes = (size_t) count * sizeof(long);
	size_t intBytes = (size_t) count * sizeof(int);

	printf("# of blocks = %d, # of threads/block = %d (ensure that device can handle)\n", numBlocks, threadsPerBlock);

	time1 = get_time();

	//======================================================================================================================================================150
	//	GPU MEMORY				MALLOC
	//======================================================================================================================================================150

	//====================================================================================================100
	//	DEVICE IN
	//====================================================================================================100

	knode *knodesD;
	cudaMalloc((void**)&knodesD, knodes_mem);
	checkCUDAError("cudaMalloc knodesD");

	long *currKnodeD;
	cudaMalloc((void**)&currKnodeD, longBytes);
	checkCUDAError("cudaMalloc currKnodeD");

	long *offsetD;
	cudaMalloc((void**)&offsetD, longBytes);
	checkCUDAError("cudaMalloc offsetD");

	long *lastKnodeD;
	cudaMalloc((void**)&lastKnodeD, longBytes);
	checkCUDAError("cudaMalloc lastKnodeD");

	long *offset_2D;
	cudaMalloc((void**)&offset_2D, longBytes);
	checkCUDAError("cudaMalloc offset_2D");

	int *startD;
	cudaMalloc((void**)&startD, intBytes);
	checkCUDAError("cudaMalloc startD");

	int *endD;
	cudaMalloc((void**)&endD, intBytes);
	checkCUDAError("cudaMalloc endD");

	//====================================================================================================100
	//	DEVICE IN/OUT
	//====================================================================================================100

	int *ansDStart;
	cudaMalloc((void**)&ansDStart, intBytes);
	checkCUDAError("cudaMalloc ansDStart");

	int *ansDLength;
	cudaMalloc((void**)&ansDLength, intBytes);
	checkCUDAError("cudaMalloc ansDLength");

	time2 = get_time();

	//======================================================================================================================================================150
	//	GPU MEMORY			COPY
	//======================================================================================================================================================150

	//====================================================================================================100
	//	DEVICE IN
	//====================================================================================================100

	cudaMemcpy(knodesD, knodes, knodes_mem, cudaMemcpyHostToDevice);
	checkCUDAError("cudaMemcpy knodesD");

	cudaMemcpy(currKnodeD, currKnode, longBytes, cudaMemcpyHostToDevice);
	checkCUDAError("cudaMemcpy currKnodeD");

	cudaMemcpy(offsetD, offset, longBytes, cudaMemcpyHostToDevice);
	checkCUDAError("cudaMemcpy offsetD");

	cudaMemcpy(lastKnodeD, lastKnode, longBytes, cudaMemcpyHostToDevice);
	checkCUDAError("cudaMemcpy lastKnodeD");

	cudaMemcpy(offset_2D, offset_2, longBytes, cudaMemcpyHostToDevice);
	checkCUDAError("cudaMemcpy offset_2D");

	cudaMemcpy(startD, start, intBytes, cudaMemcpyHostToDevice);
	checkCUDAError("cudaMemcpy startD");

	cudaMemcpy(endD, end, intBytes, cudaMemcpyHostToDevice);
	checkCUDAError("cudaMemcpy endD");

	//====================================================================================================100
	//	DEVICE IN/OUT
	//====================================================================================================100

	// the result buffers are seeded with the caller's current contents
	cudaMemcpy(ansDStart, recstart, intBytes, cudaMemcpyHostToDevice);
	checkCUDAError("cudaMemcpy ansDStart");

	cudaMemcpy(ansDLength, reclength, intBytes, cudaMemcpyHostToDevice);
	checkCUDAError("cudaMemcpy ansDLength");

	time3 = get_time();

	//======================================================================================================================================================150
	//	KERNEL
	//======================================================================================================================================================150

	// [GPU] findRangeK kernel
	findRangeK<<<numBlocks, threadsPerBlock>>>(	maxheight,
												knodesD,
												knodes_elem,

												currKnodeD,
												offsetD,
												lastKnodeD,
												offset_2D,
												startD,
												endD,
												ansDStart,
												ansDLength);
	// wait for the kernel so that its run time lands in this stage and its errors surface here
	cudaThreadSynchronize();
	checkCUDAError("findRangeK");

	time4 = get_time();

	//======================================================================================================================================================150
	//	GPU MEMORY			COPY (CONTD.)
	//======================================================================================================================================================150

	//====================================================================================================100
	//	DEVICE IN/OUT
	//====================================================================================================100

	cudaMemcpy(recstart, ansDStart, intBytes, cudaMemcpyDeviceToHost);
	checkCUDAError("cudaMemcpy recstart");

	cudaMemcpy(reclength, ansDLength, intBytes, cudaMemcpyDeviceToHost);
	checkCUDAError("cudaMemcpy reclength");

	time5 = get_time();

	//======================================================================================================================================================150
	//	GPU MEMORY DEALLOCATION
	//======================================================================================================================================================150

	cudaFree(knodesD);

	cudaFree(currKnodeD);
	cudaFree(offsetD);
	cudaFree(lastKnodeD);
	cudaFree(offset_2D);
	cudaFree(startD);
	cudaFree(endD);
	cudaFree(ansDStart);
	cudaFree(ansDLength);

	time6 = get_time();

	//======================================================================================================================================================150
	//	DISPLAY TIMING
	//======================================================================================================================================================150

	long long total = time6 - time0;

	printf("Time spent in different stages of GPU_CUDA KERNEL:\n");

	wrapper_2_print_stage("GPU: SET DEVICE / DRIVER INIT",	time0, time1, total);
	wrapper_2_print_stage("GPU MEM: ALO",					time1, time2, total);
	wrapper_2_print_stage("GPU MEM: COPY IN",				time2, time3, total);

	wrapper_2_print_stage("GPU: KERNEL",					time3, time4, total);

	wrapper_2_print_stage("GPU MEM: COPY OUT",				time4, time5, total);
	wrapper_2_print_stage("GPU MEM: FRE",					time5, time6, total);

	printf("Total time:\n");
	printf("%.12f s\n", (double) total / 1000000);

}

//========================================================================================================================================================================================================200
//	END
//========================================================================================================================================================================================================200

#ifdef __cplusplus
}
#endif
add codebase for TACO submission 2022-05-04 20:59:38 +08:00			`#ifdef __cplusplus`
			`extern "C" {`
			`#endif`

			`//========================================================================================================================================================================================================200`
			`// INCLUDE`
			`//========================================================================================================================================================================================================200`

			`//======================================================================================================================================================150`
			`// COMMON`
			`//======================================================================================================================================================150`

			`#include "../common.h" // (in the main program folder) needed to recognized input parameters`

			`//======================================================================================================================================================150`
			`// UTILITIES`
			`//======================================================================================================================================================150`

			`#include "../util/cuda/cuda.h" // (in library path specified to compiler) needed by for device functions`
			`#include "../util/timer/timer.h" // (in library path specified to compiler) needed by timer`

			`//======================================================================================================================================================150`
			`// KERNEL`
			`//======================================================================================================================================================150`

			`#include "./kernel_gpu_cuda_2.cu" // (in the current directory) GPU kernel, cannot include with header file because of complications with passing of constant memory variables`

			`//======================================================================================================================================================150`
			`// HEADER`
			`//======================================================================================================================================================150`

			`#include "./kernel_gpu_cuda_wrapper_2.h" // (in the current directory)`

			`//========================================================================================================================================================================================================200`
			`// FUNCTION`
			`//========================================================================================================================================================================================================200`

			`void`
			`kernel_gpu_cuda_wrapper_2( knode *knodes,`
			`long knodes_elem,`
			`long knodes_mem,`

			`int order,`
			`long maxheight,`
			`int count,`

			`long *currKnode,`
			`long *offset,`
			`long *lastKnode,`
			`long *offset_2,`
			`int *start,`
			`int *end,`
			`int *recstart,`
			`int *reclength)`
			`{`

			`//======================================================================================================================================================150`
			`// CPU VARIABLES`
			`//======================================================================================================================================================150`

			`// timer`
			`long long time0;`
			`long long time1;`
			`long long time2;`
			`long long time3;`
			`long long time4;`
			`long long time5;`
			`long long time6;`

			`time0 = get_time();`

			`//======================================================================================================================================================150`
			`// GPU SETUP`
			`//======================================================================================================================================================150`

			`//====================================================================================================100`
			`// INITIAL DRIVER OVERHEAD`
			`//====================================================================================================100`

			`cudaThreadSynchronize();`

			`//====================================================================================================100`
			`// EXECUTION PARAMETERS`
			`//====================================================================================================100`

			`int numBlocks;`
			`numBlocks = count;`
			`int threadsPerBlock;`
			`threadsPerBlock = order < 1024 ? order : 1024;`

			`printf("# of blocks = %d, # of threads/block = %d (ensure that device can handle)\n", numBlocks, threadsPerBlock);`

			`time1 = get_time();`

			`//======================================================================================================================================================150`
			`// GPU MEMORY MALLOC`
			`//======================================================================================================================================================150`

			`//====================================================================================================100`
			`// DEVICE IN`
			`//====================================================================================================100`

			`//==================================================50`
			`// knodesD`
			`//==================================================50`

			`knode *knodesD;`
			`cudaMalloc((void**)&knodesD, knodes_mem);`
			`checkCUDAError("cudaMalloc recordsD");`

			`//==================================================50`
			`// currKnodeD`
			`//==================================================50`

			`long *currKnodeD;`
			`cudaMalloc((void**)&currKnodeD, count*sizeof(long));`
			`checkCUDAError("cudaMalloc currKnodeD");`

			`//==================================================50`
			`// offsetD`
			`//==================================================50`

			`long *offsetD;`
			`cudaMalloc((void**)&offsetD, count*sizeof(long));`
			`checkCUDAError("cudaMalloc offsetD");`

			`//==================================================50`
			`// lastKnodeD`
			`//==================================================50`

			`long *lastKnodeD;`
			`cudaMalloc((void**)&lastKnodeD, count*sizeof(long));`
			`checkCUDAError("cudaMalloc lastKnodeD");`

			`//==================================================50`
			`// offset_2D`
			`//==================================================50`

			`long *offset_2D;`
			`cudaMalloc((void**)&offset_2D, count*sizeof(long));`
			`checkCUDAError("cudaMalloc offset_2D");`

			`//==================================================50`
			`// startD`
			`//==================================================50`

			`int *startD;`
			`cudaMalloc((void**)&startD, count*sizeof(int));`
			`checkCUDAError("cudaMalloc startD");`

			`//==================================================50`
			`// endD`
			`//==================================================50`

			`int *endD;`
			`cudaMalloc((void**)&endD, count*sizeof(int));`
			`checkCUDAError("cudaMalloc endD");`

			`//====================================================================================================100`
			`// DEVICE IN/OUT`
			`//====================================================================================================100`

			`//==================================================50`
			`// ansDStart`
			`//==================================================50`

			`int *ansDStart;`
			`cudaMalloc((void**)&ansDStart, count*sizeof(int));`
			`checkCUDAError("cudaMalloc ansDStart");`

			`//==================================================50`
			`// ansDLength`
			`//==================================================50`

			`int *ansDLength;`
			`cudaMalloc((void**)&ansDLength, count*sizeof(int));`
			`checkCUDAError("cudaMalloc ansDLength");`

			`time2 = get_time();`

			`//======================================================================================================================================================150`
			`// GPU MEMORY COPY`
			`//======================================================================================================================================================150`

			`//====================================================================================================100`
			`// DEVICE IN`
			`//====================================================================================================100`

			`//==================================================50`
			`// knodesD`
			`//==================================================50`

			`cudaMemcpy(knodesD, knodes, knodes_mem, cudaMemcpyHostToDevice);`
			`checkCUDAError("cudaMalloc cudaMemcpy memD");`

			`//==================================================50`
			`// currKnodeD`
			`//==================================================50`

			`cudaMemcpy(currKnodeD, currKnode, count*sizeof(long), cudaMemcpyHostToDevice);`
			`checkCUDAError("cudaMalloc cudaMemcpy currKnodeD");`

			`//==================================================50`
			`// offsetD`
			`//==================================================50`

			`cudaMemcpy(offsetD, offset, count*sizeof(long), cudaMemcpyHostToDevice);`
			`checkCUDAError("cudaMalloc cudaMemcpy offsetD");`

			`//==================================================50`
			`// lastKnodeD`
			`//==================================================50`

			`cudaMemcpy(lastKnodeD, lastKnode, count*sizeof(long), cudaMemcpyHostToDevice);`
			`checkCUDAError("cudaMalloc cudaMemcpy lastKnodeD");`

			`//==================================================50`
			`// offset_2D`
			`//==================================================50`

			`cudaMemcpy(offset_2D, offset_2, count*sizeof(long), cudaMemcpyHostToDevice);`
			`checkCUDAError("cudaMalloc cudaMemcpy offset_2D");`

			`//==================================================50`
			`// startD`
			`//==================================================50`

			`cudaMemcpy(startD, start, count*sizeof(int), cudaMemcpyHostToDevice);`
			`checkCUDAError("cudaMemcpy startD");`

			`//==================================================50`
			`// endD`
			`//==================================================50`

			`cudaMemcpy(endD, end, count*sizeof(int), cudaMemcpyHostToDevice);`
			`checkCUDAError("cudaMemcpy endD");`

			`//====================================================================================================100`
			`// DEVICE IN/OUT`
			`//====================================================================================================100`

			`//==================================================50`
			`// ansDStart`
			`//==================================================50`

			`cudaMemcpy(ansDStart, recstart, count*sizeof(int), cudaMemcpyHostToDevice);`
			`checkCUDAError("cudaMemcpy ansDStart");`

			`//==================================================50`
			`// ansDLength`
			`//==================================================50`

			`cudaMemcpy(ansDLength, reclength, count*sizeof(int), cudaMemcpyHostToDevice);`
			`checkCUDAError("cudaMemcpy ansDLength");`

			`time3 = get_time();`

			`//======================================================================================================================================================150`
			`// KERNEL`
			`//======================================================================================================================================================150`

			`// [GPU] findRangeK kernel`
			`findRangeK<<<numBlocks, threadsPerBlock>>>( maxheight,`
			`knodesD,`
			`knodes_elem,`

			`currKnodeD,`
			`offsetD,`
			`lastKnodeD,`
			`offset_2D,`
			`startD,`
			`endD,`
			`ansDStart,`
			`ansDLength);`
			`cudaThreadSynchronize();`
			`checkCUDAError("findRangeK");`

			`time4 = get_time();`

			`//======================================================================================================================================================150`
			`// GPU MEMORY COPY (CONTD.)`
			`//======================================================================================================================================================150`

			`//====================================================================================================100`
			`// DEVICE IN/OUT`
			`//====================================================================================================100`

			`//==================================================50`
			`// ansDStart`
			`//==================================================50`

			`cudaMemcpy(recstart, ansDStart, count*sizeof(int), cudaMemcpyDeviceToHost);`
			`checkCUDAError("cudaMemcpy ansDStart");`

			`//==================================================50`
			`// ansDLength`
			`//==================================================50`

			`cudaMemcpy(reclength, ansDLength, count*sizeof(int), cudaMemcpyDeviceToHost);`
			`checkCUDAError("cudaMemcpy ansDLength");`

			`time5 = get_time();`

			`//======================================================================================================================================================150`
			`// GPU MEMORY DEALLOCATION`
			`//======================================================================================================================================================150`

			`cudaFree(knodesD);`

			`cudaFree(currKnodeD);`
			`cudaFree(offsetD);`
			`cudaFree(lastKnodeD);`
			`cudaFree(offset_2D);`
			`cudaFree(startD);`
			`cudaFree(endD);`
			`cudaFree(ansDStart);`
			`cudaFree(ansDLength);`

			`time6 = get_time();`

			`//======================================================================================================================================================150`
			`// DISPLAY TIMING`
			`//======================================================================================================================================================150`

			`printf("Time spent in different stages of GPU_CUDA KERNEL:\n");`

			`printf("%15.12f s, %15.12f % : GPU: SET DEVICE / DRIVER INIT\n", (float) (time1-time0) / 1000000, (float) (time1-time0) / (float) (time6-time0) * 100);`
			`printf("%15.12f s, %15.12f % : GPU MEM: ALO\n", (float) (time2-time1) / 1000000, (float) (time2-time1) / (float) (time6-time0) * 100);`
			`printf("%15.12f s, %15.12f % : GPU MEM: COPY IN\n", (float) (time3-time2) / 1000000, (float) (time3-time2) / (float) (time6-time0) * 100);`

			`printf("%15.12f s, %15.12f % : GPU: KERNEL\n", (float) (time4-time3) / 1000000, (float) (time4-time3) / (float) (time6-time0) * 100);`

			`printf("%15.12f s, %15.12f % : GPU MEM: COPY OUT\n", (float) (time5-time4) / 1000000, (float) (time5-time4) / (float) (time6-time0) * 100);`
			`printf("%15.12f s, %15.12f % : GPU MEM: FRE\n", (float) (time6-time5) / 1000000, (float) (time6-time5) / (float) (time6-time0) * 100);`

			`printf("Total time:\n");`
			`printf("%.12f s\n", (float) (time6-time0) / 1000000);`

			`}`

			`//========================================================================================================================================================================================================200`
			`// END`
			`//========================================================================================================================================================================================================200`

			`#ifdef __cplusplus`
			`}`
			`#endif`