#include __global__ void saxpy(void) { printf("hello!\n"); } int main(void) { int N = 1 << 20; float *x, *y, *d_x, *d_y; x = (float *)malloc(N * sizeof(float)); y = (float *)malloc(N * sizeof(float)); cudaMalloc(&d_x, N * sizeof(float)); cudaMalloc(&d_y, N * sizeof(float)); for (int i = 0; i < N; i++) { x[i] = 1.0f; y[i] = 2.0f; } cudaMemcpy(d_x, x, N * sizeof(float), cudaMemcpyHostToDevice); cudaMemcpy(d_y, y, N * sizeof(float), cudaMemcpyHostToDevice); // Perform SAXPY on 1M elements saxpy<<<(1, 1)>>>; cudaMemcpy(y, d_y, N * sizeof(float), cudaMemcpyDeviceToHost); float maxError = 0.0f; for (int i = 0; i < N; i++) maxError = max(maxError, abs(y[i] - 4.0f)); printf("Max error: %f\n", maxError); cudaFree(d_x); cudaFree(d_y); free(x); free(y); }