// CuPBoP/examples/vecadd/vecadd.cu

// Source: https://github.com/olcf/vector_addition_tutorials
#include <stdio.h>
#include <stdlib.h>
#include <math.h>

// Absolute tolerance for the host-side check that the mean of the result vector equals 1.0
const double epsilon = 1e-6;
// Element-wise vector addition kernel: c[k] = a[k] + b[k] for every k in [0, n).
// Only the x dimension of the grid and block is used for indexing, and each
// thread writes at most one element of c.
__global__ void vecAdd(double *a, double *b, double *c, int n)
{
    // Flat index of this thread across the whole grid
    const int gid = threadIdx.x + blockDim.x*blockIdx.x;

    // Threads that fall past the end of the vectors have nothing to do
    if (gid >= n)
        return;

    c[gid] = a[gid] + b[gid];
}

// Reports a failed CUDA runtime call on stderr.
// Returns true when `status` signals an error, false on cudaSuccess.
static bool cudaFailed(cudaError_t status, const char *what)
{
    if (status == cudaSuccess)
        return false;
    fprintf(stderr, "CUDA error in %s: %s\n", what, cudaGetErrorString(status));
    return true;
}

// Adds two vectors of n doubles on the device and verifies the result on the host.
//
// The inputs are sin^2(i) and cos^2(i), so every element of the sum should be 1
// and the mean of the output vector should equal 1 within `epsilon`.
// Prints "PASS" or "FAIL" on stdout. Returns 0 when every allocation and CUDA
// call succeeded (whatever the numerical verdict), EXIT_FAILURE otherwise.
int main( int argc, char* argv[] )
{
    // Command-line arguments are not used
    (void)argc;
    (void)argv;

    //cudaSetDevice(0);
    // Size of vectors
    const int n = 100000;

    // Size, in bytes, of each vector
    const size_t bytes = n*sizeof(double);

    // Number of threads in each thread block
    const int blockSize = 1024;

    // Number of thread blocks in grid. Integer ceiling division: exact for any n,
    // unlike a float division followed by ceil().
    const int gridSize = (n + blockSize - 1)/blockSize;

    // Host input vectors (h_a, h_b) and host output vector (h_c)
    double *h_a = (double*)malloc(bytes);
    double *h_b = (double*)malloc(bytes);
    double *h_c = (double*)malloc(bytes);

    // Device input vectors (d_a, d_b) and device output vector (d_c).
    // Initialised to NULL so the cleanup below is safe on every path.
    double *d_a = NULL;
    double *d_b = NULL;
    double *d_c = NULL;

    // `ok` stays true only while every allocation and CUDA call has succeeded;
    // once it turns false, all remaining steps except cleanup are skipped.
    bool ok = (h_a != NULL) && (h_b != NULL) && (h_c != NULL);
    if (!ok)
        fprintf(stderr, "Host allocation of %zu bytes per vector failed\n", bytes);

    // Allocate memory for each vector on GPU
    ok = ok && !cudaFailed(cudaMalloc(&d_a, bytes), "cudaMalloc(d_a)");
    ok = ok && !cudaFailed(cudaMalloc(&d_b, bytes), "cudaMalloc(d_b)");
    ok = ok && !cudaFailed(cudaMalloc(&d_c, bytes), "cudaMalloc(d_c)");

    // Initialize vectors on host
    if (ok) {
        for (int i = 0; i < n; i++) {
            h_a[i] = sin(i)*sin(i);
            h_b[i] = cos(i)*cos(i);
        }
    }

    // Copy host vectors to device
    ok = ok && !cudaFailed(cudaMemcpy(d_a, h_a, bytes, cudaMemcpyHostToDevice),
                           "cudaMemcpy(d_a <- h_a)");
    ok = ok && !cudaFailed(cudaMemcpy(d_b, h_b, bytes, cudaMemcpyHostToDevice),
                           "cudaMemcpy(d_b <- h_b)");

    // Execute the kernel
    if (ok) {
        vecAdd<<<gridSize, blockSize>>>(d_a, d_b, d_c, n);
        // A launch does not return a status; launch errors are read back here.
        ok = !cudaFailed(cudaGetLastError(), "vecAdd launch");
    }

    // Copy array back to host. This blocking copy also reports errors raised
    // while the kernel was executing.
    ok = ok && !cudaFailed(cudaMemcpy(h_c, d_c, bytes, cudaMemcpyDeviceToHost),
                           "cudaMemcpy(h_c <- d_c)");

    // Sum up vector c and compare the mean with 1, which it should equal within error
    bool pass = false;
    if (ok) {
        double sum = 0;
        for (int i = 0; i < n; i++)
            sum += h_c[i];
        sum /= (double)n;
        // fabs, not abs: abs may bind to the integer overload and truncate the
        // difference to 0, which would report PASS for any error below 1.0.
        pass = fabs(sum - 1.0) < epsilon;
    }

    if (pass)
        printf("PASS\n");
    else
        printf("FAIL\n");

    // Release device memory (cudaFree on a NULL pointer performs no operation)
    cudaFree(d_a);
    cudaFree(d_b);
    cudaFree(d_c);

    // Release host memory (free on a NULL pointer performs no operation)
    free(h_a);
    free(h_b);
    free(h_c);

    return ok ? 0 : EXIT_FAILURE;
}
add vecadd example and update README.md 2022-09-15 23:15:21 +08:00			`// Get from: https://github.com/olcf/vector_addition_tutorials`
			`#include <stdio.h>`
			`#include <stdlib.h>`
			`#include <math.h>`

			`const double epsilon = 1e-6;`
			`// CUDA kernel. Each thread takes care of one element of c`
			`__global__ void vecAdd(double *a, double *b, double *c, int n)`
			`{`
			`// Get our global thread ID`
			`int id = blockIdx.x*blockDim.x+threadIdx.x;`

			`// Make sure we do not go out of bounds`
			`if (id < n)`
			`c[id] = a[id] + b[id];`
			`}`

			`int main( int argc, char* argv[] )`
			`{`
			`//cudaSetDevice(0);`
			`// Size of vectors`
			`int n = 100000;`

			`// Host input vectors`
			`double *h_a;`
			`double *h_b;`
			`//Host output vector`
			`double *h_c;`

			`// Device input vectors`
			`double *d_a;`
			`double *d_b;`
			`//Device output vector`
			`double *d_c;`

			`// Size, in bytes, of each vector`
			`size_t bytes = n*sizeof(double);`

			`// Allocate memory for each vector on host`
			`h_a = (double*)malloc(bytes);`
			`h_b = (double*)malloc(bytes);`
			`h_c = (double*)malloc(bytes);`

			`// Allocate memory for each vector on GPU`
			`cudaMalloc(&d_a, bytes);`
			`cudaMalloc(&d_b, bytes);`
			`cudaMalloc(&d_c, bytes);`

			`int i;`
			`// Initialize vectors on host`
			`for( i = 0; i < n; i++ ) {`
			`h_a[i] = sin(i)*sin(i);`
			`h_b[i] = cos(i)*cos(i);`
			`}`

			`// Copy host vectors to device`
			`cudaMemcpy( d_a, h_a, bytes, cudaMemcpyHostToDevice);`
			`cudaMemcpy( d_b, h_b, bytes, cudaMemcpyHostToDevice);`

			`int blockSize, gridSize;`

			`// Number of threads in each thread block`
			`blockSize = 1024;`

			`// Number of thread blocks in grid`
			`gridSize = (int)ceil((float)n/blockSize);`

			`// Execute the kernel`
			`vecAdd<<<gridSize, blockSize>>>(d_a, d_b, d_c, n);`

			`// Copy array back to host`
			`cudaMemcpy( h_c, d_c, bytes, cudaMemcpyDeviceToHost );`

			`// Sum up vector c and print result divided by n, this should equal 1 within error`
			`double sum = 0;`
			`for(i=0; i<n; i++)`
			`sum += h_c[i];`
			`sum/=(double)n;`
			`if(abs(sum-1.0)<epsilon)`
			`printf("PASS\n");`
			`else`
			`printf("FAIL\n");`

			`// Release device memory`
			`cudaFree(d_a);`
			`cudaFree(d_b);`
			`cudaFree(d_c);`

			`// Release host memory`
			`free(h_a);`
			`free(h_b);`
			`free(h_c);`

			`return 0;`
			`}`