/*
 * Copyright 1993-2010 NVIDIA Corporation. All rights reserved.
 *
 * NVIDIA Corporation and its licensors retain all intellectual property and
 * proprietary rights in and to this software and related documentation. Any
 * use, reproduction, disclosure, or distribution of this software and related
 * documentation without an express license agreement from NVIDIA Corporation is
 * strictly prohibited.
 *
 * Please refer to the applicable NVIDIA end user license agreement (EULA)
 * associated with this source code for terms and conditions that govern
 * your use of this NVIDIA software.
 *
 */
#include <iostream>
#include <stdio.h>
// CHECK wraps a CUDA runtime call and aborts with file/line context on error.
// The do { } while (0) wrapper makes the expansion a single statement, so
// `if (cond) CHECK(x); else ...` parses correctly (a bare `{ ... };` would
// terminate the `if` and orphan the `else`).
#define CHECK(ans) \
  do { gpuAssert((ans), __FILE__, __LINE__); } while (0)

// Report a failed CUDA runtime call and optionally terminate the process.
//
// code  - status returned by the CUDA runtime API call
// file  - call-site source file (from __FILE__)
// line  - call-site line number (from __LINE__)
// abort - when true (default), exit the process with the error code
inline void gpuAssert(cudaError_t code, const char *file, int line,
                      bool abort = true) {
  if (code != cudaSuccess) {
    fprintf(stderr, "GPUassert: %s %s %d\n", cudaGetErrorString(code), file,
            line);
    if (abort)
      exit(code);
  }
}

using namespace std;

#define SIZE (100 * 1024 * 1024)

// 256-bin byte histogram using per-block shared-memory privatization.
//
// Launch requirements:
//   - blockDim.x == 256 (one thread per bin for the init and merge phases)
//   - histo points to 256 unsigned ints, zero-initialized by the caller
//
// Each block accumulates a private histogram in shared memory (cheap
// intra-block atomics), then merges it into the global histogram with one
// atomicAdd per bin per block.
__global__ void histo_kernel(unsigned char *buffer, long size,
                             unsigned int *histo) {

  __shared__ unsigned int temp[256];

  // Thread t zeroes bin t of the block-private histogram.
  temp[threadIdx.x] = 0;
  __syncthreads();

  // Grid-stride loop over the input bytes. Indices are `long` to match the
  // `size` parameter: an `int` index would overflow for size > 2^31-1.
  long stride = (long)blockDim.x * gridDim.x;
  for (long i = (long)blockIdx.x * blockDim.x + threadIdx.x; i < size;
       i += stride) {
    atomicAdd(&temp[buffer[i]], 1);
  }

  // All block-private counts must be final before the merge below reads them.
  __syncthreads();
  // Thread t folds this block's count for bin t into the global histogram.
  atomicAdd(&(histo[threadIdx.x]), temp[threadIdx.x]);
}
// Build a 256-bin byte-frequency histogram of a file's contents on the GPU.
//
// file    - path to the input file
// freq    - out: array of 256 unsigned ints receiving the byte frequencies
// memSize - number of bytes to read and histogram
// source  - host staging buffer of at least memSize bytes
//
// Returns 0 on success; exits the process on a fatal CUDA or open() error.
// NOTE(review): the chunking below assumes memSize is a multiple of 32 so
// 32 equal chunks cover the buffer exactly — confirm at the call site.
int runHisto(char *file, unsigned int *freq, unsigned int memSize,
             unsigned int *source) {

  FILE *f = fopen(file, "rb");
  if (!f) {
    perror(file);
    exit(1);
  }
  size_t result = fread(source, 1, memSize, f);
  if (result != memSize)
    fputs("Cannot read input file", stderr);

  fclose(f);

  // Reinterpret the staging buffer as raw bytes; all chunk offsets below are
  // byte offsets into this pointer.
  unsigned char *buffer = (unsigned char *)source;

  int blocks = 2;

  // Process the data as 32 equal chunks, two chunks per loop iteration
  // (one per device buffer).
  int partSize = memSize / 32;

  unsigned char *dev_buffer0;
  unsigned char *dev_buffer1;
  unsigned int *dev_histo;
  CHECK(cudaMalloc((void **)&dev_buffer0, partSize));
  CHECK(cudaMalloc((void **)&dev_buffer1, partSize));
  CHECK(cudaMalloc((void **)&dev_histo, 256 * sizeof(int)));
  CHECK(cudaMemset(dev_histo, 0, 256 * sizeof(int)));

  // Offsets are in BYTES, since buffer is an unsigned char*. (The original
  // advanced by counts of unsigned-int ELEMENTS — partialNum — which made
  // consecutive chunks overlap and left the tail of the file unprocessed.)
  for (unsigned int off = 0; off < memSize; off += 2 * (unsigned int)partSize) {
    CHECK(cudaMemcpy(dev_buffer0, buffer + off, partSize,
                     cudaMemcpyHostToDevice));
    CHECK(cudaMemcpy(dev_buffer1, buffer + off + partSize, partSize,
                     cudaMemcpyHostToDevice));

    // kernel launch - 2x the number of mps gave best timing
    histo_kernel<<<blocks * 2, 256>>>(dev_buffer0, partSize, dev_histo);
    CHECK(cudaGetLastError());
    histo_kernel<<<blocks * 2, 256>>>(dev_buffer1, partSize, dev_histo);
    CHECK(cudaGetLastError());
    // No explicit cudaDeviceSynchronize needed: the blocking cudaMemcpy at
    // the top of the next iteration (and the device-to-host copy below) is
    // ordered after these kernels on the default stream.
  }
  // Blocking copy: waits for all queued kernels, then retrieves the result.
  CHECK(cudaMemcpy(freq, dev_histo, 256 * sizeof(int),
                   cudaMemcpyDeviceToHost));

  CHECK(cudaFree(dev_histo));
  CHECK(cudaFree(dev_buffer0));
  CHECK(cudaFree(dev_buffer1));
  return 0;
}