diff --git a/examples/microbench/cudamemcpy_test.cc b/examples/microbench/cudamemcpy_test.cc
new file mode 100644
index 0000000..b723921
--- /dev/null
+++ b/examples/microbench/cudamemcpy_test.cc
@@ -0,0 +1,65 @@
+#include <math.h>
+#include <stdio.h>
+#include <stdlib.h>
+
+// Exits with a message on stderr when a CUDA runtime call did not succeed.
+static void checkCuda(cudaError_t err, const char *what)
+{
+  if (err != cudaSuccess) {
+    fprintf(stderr, "%s failed: %s\n", what, cudaGetErrorString(err));
+    exit(EXIT_FAILURE);
+  }
+}
+
+// Computes y[i] = a*x[i] + y[i] for every i < n (one element per thread).
+// Not launched in this benchmark; see the disabled launch in main().
+__global__
+void saxpy(int n, float a, float *x, float *y)
+{
+  int i = blockIdx.x*blockDim.x + threadIdx.x;
+  if (i < n) y[i] = a*x[i] + y[i];
+}
+
+// Copies two 1M-float arrays to the device, copies y back, and reports how
+// far the returned y is from the value that was uploaded.
+int main(void)
+{
+  int N = 1<<20;
+  size_t bytes = N*sizeof(float);
+  float *x, *y, *d_x, *d_y;
+  x = (float*)malloc(bytes);
+  y = (float*)malloc(bytes);
+  if (x == NULL || y == NULL) {
+    fprintf(stderr, "host allocation failed\n");
+    return EXIT_FAILURE;
+  }
+
+  checkCuda(cudaMalloc(&d_x, bytes), "cudaMalloc d_x");
+  checkCuda(cudaMalloc(&d_y, bytes), "cudaMalloc d_y");
+
+  for (int i = 0; i < N; i++) {
+    x[i] = 1.0f;
+    y[i] = 2.0f;
+  }
+
+  checkCuda(cudaMemcpy(d_x, x, bytes, cudaMemcpyHostToDevice), "copy x to device");
+  checkCuda(cudaMemcpy(d_y, y, bytes, cudaMemcpyHostToDevice), "copy y to device");
+
+  // The SAXPY launch is disabled here, so only the copies run. If it is
+  // re-enabled, the expected value checked below becomes 4.0f.
+  // saxpy<<<(N+255)/256, 256>>>(N, 2.0f, d_x, d_y);
+
+  checkCuda(cudaMemcpy(y, d_y, bytes, cudaMemcpyDeviceToHost), "copy y to host");
+
+  // No kernel modified d_y, so y must come back as the 2.0f that was uploaded.
+  float maxError = 0.0f;
+  for (int i = 0; i < N; i++)
+    maxError = fmaxf(maxError, fabsf(y[i]-2.0f));
+  printf("Max error: %f\n", maxError);
+
+  checkCuda(cudaFree(d_x), "cudaFree d_x");
+  checkCuda(cudaFree(d_y), "cudaFree d_y");
+  free(x);
+  free(y);
+  return 0;
+}
\ No newline at end of file
diff --git a/examples/microbench/dummy_kernel.cc b/examples/microbench/dummy_kernel.cc
new file mode 100644
index 0000000..4bb63e9
--- /dev/null
+++ b/examples/microbench/dummy_kernel.cc
@@ -0,0 +1,64 @@
+#include <math.h>
+#include <stdio.h>
+#include <stdlib.h>
+
+// Exits with a message on stderr when a CUDA runtime call did not succeed.
+static void checkCuda(cudaError_t err, const char *what)
+{
+  if (err != cudaSuccess) {
+    fprintf(stderr, "%s failed: %s\n", what, cudaGetErrorString(err));
+    exit(EXIT_FAILURE);
+  }
+}
+
+// Prints the block and thread index of the calling thread; touches no memory.
+__global__
+void saxpy(void)
+{
+  printf("block_id:%d thread_id:%d \n", (int)blockIdx.x, (int)threadIdx.x);
+}
+
+// Uploads two 1M-float arrays, launches the kernel with a single thread,
+// copies y back, and reports how far it is from the uploaded value.
+int main(void)
+{
+  int N = 1<<20;
+  size_t bytes = N*sizeof(float);
+  float *x, *y, *d_x, *d_y;
+  x = (float*)malloc(bytes);
+  y = (float*)malloc(bytes);
+  if (x == NULL || y == NULL) {
+    fprintf(stderr, "host allocation failed\n");
+    return EXIT_FAILURE;
+  }
+
+  checkCuda(cudaMalloc(&d_x, bytes), "cudaMalloc d_x");
+  checkCuda(cudaMalloc(&d_y, bytes), "cudaMalloc d_y");
+
+  for (int i = 0; i < N; i++) {
+    x[i] = 1.0f;
+    y[i] = 2.0f;
+  }
+
+  checkCuda(cudaMemcpy(d_x, x, bytes, cudaMemcpyHostToDevice), "copy x to device");
+  checkCuda(cudaMemcpy(d_y, y, bytes, cudaMemcpyHostToDevice), "copy y to device");
+
+  // One block of one thread. The launch needs both a grid and a block size
+  // plus a call argument list, even when the kernel takes no parameters.
+  saxpy<<<1, 1>>>();
+  checkCuda(cudaGetLastError(), "saxpy launch");
+
+  checkCuda(cudaMemcpy(y, d_y, bytes, cudaMemcpyDeviceToHost), "copy y to host");
+
+  // The kernel never writes d_y, so y must come back as the uploaded 2.0f.
+  float maxError = 0.0f;
+  for (int i = 0; i < N; i++)
+    maxError = fmaxf(maxError, fabsf(y[i]-2.0f));
+  printf("Max error: %f\n", maxError);
+
+  checkCuda(cudaFree(d_x), "cudaFree d_x");
+  checkCuda(cudaFree(d_y), "cudaFree d_y");
+  free(x);
+  free(y);
+  return 0;
+}
\ No newline at end of file
diff --git a/examples/microbench/kerne_arg.cc b/examples/microbench/kerne_arg.cc
new file mode 100644
index 0000000..6a8626f
--- /dev/null
+++ b/examples/microbench/kerne_arg.cc
@@ -0,0 +1,64 @@
+#include <math.h>
+#include <stdio.h>
+#include <stdlib.h>
+
+// Exits with a message on stderr when a CUDA runtime call did not succeed.
+static void checkCuda(cudaError_t err, const char *what)
+{
+  if (err != cudaSuccess) {
+    fprintf(stderr, "%s failed: %s\n", what, cudaGetErrorString(err));
+    exit(EXIT_FAILURE);
+  }
+}
+
+// Prints the integer argument it was launched with; touches no memory.
+__global__
+void saxpy(int N)
+{
+  printf("hello!: %d\n", N);
+}
+
+// Uploads two 1M-float arrays, launches the kernel with a single thread,
+// copies y back, and reports how far it is from the uploaded value.
+int main(void)
+{
+  int N = 1<<20;
+  size_t bytes = N*sizeof(float);
+  float *x, *y, *d_x, *d_y;
+  x = (float*)malloc(bytes);
+  y = (float*)malloc(bytes);
+  if (x == NULL || y == NULL) {
+    fprintf(stderr, "host allocation failed\n");
+    return EXIT_FAILURE;
+  }
+
+  checkCuda(cudaMalloc(&d_x, bytes), "cudaMalloc d_x");
+  checkCuda(cudaMalloc(&d_y, bytes), "cudaMalloc d_y");
+
+  for (int i = 0; i < N; i++) {
+    x[i] = 1.0f;
+    y[i] = 2.0f;
+  }
+
+  checkCuda(cudaMemcpy(d_x, x, bytes, cudaMemcpyHostToDevice), "copy x to device");
+  checkCuda(cudaMemcpy(d_y, y, bytes, cudaMemcpyHostToDevice), "copy y to device");
+
+  // One block of one thread; N is passed by value as the kernel argument.
+  // The launch needs both a grid and a block size.
+  saxpy<<<1, 1>>>(N);
+  checkCuda(cudaGetLastError(), "saxpy launch");
+
+  checkCuda(cudaMemcpy(y, d_y, bytes, cudaMemcpyDeviceToHost), "copy y to host");
+
+  // The kernel never writes d_y, so y must come back as the uploaded 2.0f.
+  float maxError = 0.0f;
+  for (int i = 0; i < N; i++)
+    maxError = fmaxf(maxError, fabsf(y[i]-2.0f));
+  printf("Max error: %f\n", maxError);
+
+  checkCuda(cudaFree(d_x), "cudaFree d_x");
+  checkCuda(cudaFree(d_y), "cudaFree d_y");
+  free(x);
+  free(y);
+  return 0;
+}
\ No newline at end of file
diff --git a/examples/microbench/one_thread_kernel.cc b/examples/microbench/one_thread_kernel.cc
new file mode 100644
index 0000000..73736b2
--- /dev/null
+++ b/examples/microbench/one_thread_kernel.cc
@@ -0,0 +1,64 @@
+#include <math.h>
+#include <stdio.h>
+#include <stdlib.h>
+
+// Exits with a message on stderr when a CUDA runtime call did not succeed.
+static void checkCuda(cudaError_t err, const char *what)
+{
+  if (err != cudaSuccess) {
+    fprintf(stderr, "%s failed: %s\n", what, cudaGetErrorString(err));
+    exit(EXIT_FAILURE);
+  }
+}
+
+// Prints a fixed greeting; takes no arguments and touches no memory.
+__global__
+void saxpy(void)
+{
+  printf("hello!\n");
+}
+
+// Uploads two 1M-float arrays, launches the kernel with a single thread,
+// copies y back, and reports how far it is from the uploaded value.
+int main(void)
+{
+  int N = 1<<20;
+  size_t bytes = N*sizeof(float);
+  float *x, *y, *d_x, *d_y;
+  x = (float*)malloc(bytes);
+  y = (float*)malloc(bytes);
+  if (x == NULL || y == NULL) {
+    fprintf(stderr, "host allocation failed\n");
+    return EXIT_FAILURE;
+  }
+
+  checkCuda(cudaMalloc(&d_x, bytes), "cudaMalloc d_x");
+  checkCuda(cudaMalloc(&d_y, bytes), "cudaMalloc d_y");
+
+  for (int i = 0; i < N; i++) {
+    x[i] = 1.0f;
+    y[i] = 2.0f;
+  }
+
+  checkCuda(cudaMemcpy(d_x, x, bytes, cudaMemcpyHostToDevice), "copy x to device");
+  checkCuda(cudaMemcpy(d_y, y, bytes, cudaMemcpyHostToDevice), "copy y to device");
+
+  // One block of one thread. The launch needs both a grid and a block size
+  // plus a call argument list, even when the kernel takes no parameters.
+  saxpy<<<1, 1>>>();
+  checkCuda(cudaGetLastError(), "saxpy launch");
+
+  checkCuda(cudaMemcpy(y, d_y, bytes, cudaMemcpyDeviceToHost), "copy y to host");
+
+  // The kernel never writes d_y, so y must come back as the uploaded 2.0f.
+  float maxError = 0.0f;
+  for (int i = 0; i < N; i++)
+    maxError = fmaxf(maxError, fabsf(y[i]-2.0f));
+  printf("Max error: %f\n", maxError);
+
+  checkCuda(cudaFree(d_x), "cudaFree d_x");
+  checkCuda(cudaFree(d_y), "cudaFree d_y");
+  free(x);
+  free(y);
+  return 0;
+}
\ No newline at end of file