// CuPBoP/examples/streamcluster/streamcluster_cuda.cu
/***********************************************
  streamcluster_cuda.cu
  : parallelized code of streamcluster
  - original code from PARSEC Benchmark Suite
  - parallelization with CUDA API has been applied by
    Shawn Sang-Ha Lee - sl4ge@virginia.edu
    University of Virginia
    Department of Electrical and Computer Engineering
    Department of Computer Science
***********************************************/
#include "streamcluster_header.h"
using namespace std;
// AUTO-ERROR CHECK FOR ALL CUDA FUNCTIONS
#define CUDA_SAFE_CALL(call)                                                  \
  do {                                                                        \
    cudaError_t err = call;                                                   \
    if (cudaSuccess != err) {                                                 \
      fprintf(stderr, "Cuda error in file '%s' in line %i : %s.\n", __FILE__, \
              __LINE__, cudaGetErrorString(err));                             \
      exit(EXIT_FAILURE);                                                     \
    }                                                                         \
  } while (0)
#define THREADS_PER_BLOCK 512
#define MAXBLOCKS 65536
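// NOTE (derived from the launch math in pgain below): with 512 threads per
// block and at most 65536 blocks per grid dimension, a single-row grid covers
// up to 512 * 65536 = 33,554,432 points; beyond that the launch spills into
// the y-dimension of the grid.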
// host memory
float *work_mem_h; // per-point cost/lower segments (+1 segment for gl_lower)
float *coord_h;    // point coordinates, stored dimension-major
// device memory
float *work_mem_d;
float *coord_d;
int *center_table_d;
bool *switch_membership_d;
Point *p;            // device copy of the point array
static int iter = 0; // counter for the total number of iterations
//=======================================
// Squared Euclidean Distance
// (no sqrt, consistent with the host-side dist())
//=======================================
__device__ float d_dist(int p1, int p2, int num, int dim, float *coord_d) {
  float retval = 0.0f;
  for (int i = 0; i < dim; i++) {
    // coord_d is dimension-major: coord_d[i * num + j] holds dimension i of
    // point j, so consecutive threads read consecutive addresses
    float tmp = coord_d[(i * num) + p1] - coord_d[(i * num) + p2];
    retval += tmp * tmp;
  }
  return retval;
}
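// For reference, the same computation on the host over one point's row-major
// coordinate array would look like this (a minimal sketch; 'a' and 'b' are
// hypothetical per-point coordinate pointers, not the dimension-major layout
// used on the device):
//
//   float h_dist_sq(const float *a, const float *b, int dim) {
//     float d = 0.0f;
//     for (int i = 0; i < dim; i++) {
//       float t = a[i] - b[i];
//       d += t * t;
//     }
//     return d;
//   }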
//=======================================
// Kernel - Compute Cost
//=======================================
__global__ void kernel_compute_cost(int num, int dim, long x, Point *p, int K,
                                    int stride, float *coord_d,
                                    float *work_mem_d, int *center_table_d,
                                    bool *switch_membership_d) {
  // block ID and global thread ID
  const int bid = blockIdx.x + gridDim.x * blockIdx.y;
  const int tid = blockDim.x * bid + threadIdx.x;
  if (tid < num) {
    // each thread owns one 'stride'-sized segment of work_mem_d
    float *lower = &work_mem_d[tid * stride];
    // cost between this point and point[x]: squared euclidean distance
    // multiplied by weight
    float x_cost = d_dist(tid, x, num, dim, coord_d) * p[tid].weight;
    // if the computed cost is less than the original (it saves), mark the
    // point for reassignment and accumulate the saving in slot K
    if (x_cost < p[tid].cost) {
      switch_membership_d[tid] = 1;
      lower[K] += x_cost - p[tid].cost;
    }
    // if the computed cost is larger, credit the difference to the point's
    // current center
    else {
      lower[center_table_d[p[tid].assign]] += p[tid].cost - x_cost;
    }
  }
}
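// Layout of work_mem_d / work_mem_h (stride = K + 1 floats per segment):
//
//   segment t (one per thread/point): [ lower[0] ... lower[K-1] | slot K ]
//   segment nThread (one extra, host side): reused as gl_lower scratch
//
// For a point assigned to center c, lower[c] holds the (non-positive) delta
// cost - x_cost of forcibly reassigning it to x if c were closed; slot K
// holds the saving x_cost - cost for points that switch voluntarily.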
//=======================================
// Allocate Device Memory
//=======================================
void allocDevMem(int num, int dim) {
  CUDA_SAFE_CALL(cudaMalloc((void **)&center_table_d, num * sizeof(int)));
  CUDA_SAFE_CALL(cudaMalloc((void **)&switch_membership_d, num * sizeof(bool)));
  CUDA_SAFE_CALL(cudaMalloc((void **)&p, num * sizeof(Point)));
  CUDA_SAFE_CALL(cudaMalloc((void **)&coord_d, num * dim * sizeof(float)));
}
//=======================================
// Allocate Host Memory
//=======================================
void allocHostMem(int num, int dim) {
  coord_h = (float *)malloc(num * dim * sizeof(float));
}
//=======================================
// Free Device Memory
//=======================================
void freeDevMem() {
  CUDA_SAFE_CALL(cudaFree(center_table_d));
  CUDA_SAFE_CALL(cudaFree(switch_membership_d));
  CUDA_SAFE_CALL(cudaFree(p));
  CUDA_SAFE_CALL(cudaFree(coord_d));
}
//=======================================
// Free Host Memory
//=======================================
void freeHostMem() { free(coord_h); }
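// Note: coord_d, center_table_d, switch_membership_d, and p (plus coord_h on
// the host) are allocated once on the first pgain() call and persist across
// iterations; only work_mem_h / work_mem_d are reallocated on every call.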
//=======================================
// pgain Entry - CUDA SETUP + CUDA CALL
//=======================================
float pgain(long x, Points *points, float z, long int *numcenters, int kmax,
            bool *is_center, int *center_table, bool *switch_membership,
            bool isCoordChanged, double *serial_t, double *cpu_to_gpu_t,
            double *gpu_to_cpu_t, double *alloc_t, double *kernel_t,
            double *free_t) {
#ifdef CUDATIME
  float tmp_t;
  cudaEvent_t start, stop;
  cudaEventCreate(&start);
  cudaEventCreate(&stop);
  cudaEventRecord(start, 0);
#endif
  cudaError_t error;
  int stride = *numcenters + 1; // size of each work_mem segment
  int K = *numcenters;          // number of centers
  int num = points->num;        // number of points
  int dim = points->dim;        // number of dimensions
  int nThread = num;            // number of threads == number of data points
  //=========================================
  // ALLOCATE HOST MEMORY + DATA PREPARATION
  //=========================================
  // one 'stride'-sized segment per point/thread, plus one extra for gl_lower
  work_mem_h = (float *)malloc(stride * (nThread + 1) * sizeof(float));
  // Only on the first iteration
  if (iter == 0) {
    allocHostMem(num, dim);
  }
  // build center-index table: maps a point index to a dense center index
  int count = 0;
  for (int i = 0; i < num; i++) {
    if (is_center[i]) {
      center_table[i] = count++;
    }
  }
  // Extract 'coord' into a dimension-major array for coalesced device reads
  // Only if first iteration OR coord has changed
  if (isCoordChanged || iter == 0) {
    for (int i = 0; i < dim; i++) {
      for (int j = 0; j < num; j++) {
        coord_h[(num * i) + j] = points->p[j].coord[i];
      }
    }
  }
#ifdef CUDATIME
  cudaEventRecord(stop, 0);
  cudaEventSynchronize(stop);
  cudaEventElapsedTime(&tmp_t, start, stop);
  *serial_t += (double)tmp_t;
  cudaEventRecord(start, 0);
#endif
  //=======================================
  // ALLOCATE GPU MEMORY
  //=======================================
  CUDA_SAFE_CALL(
      cudaMalloc((void **)&work_mem_d, stride * (nThread + 1) * sizeof(float)));
  // Only on the first iteration
  if (iter == 0) {
    allocDevMem(num, dim);
  }
#ifdef CUDATIME
  cudaEventRecord(stop, 0);
  cudaEventSynchronize(stop);
  cudaEventElapsedTime(&tmp_t, start, stop);
  *alloc_t += (double)tmp_t;
  cudaEventRecord(start, 0);
#endif
  //=======================================
  // CPU-TO-GPU MEMORY COPY
  //=======================================
  // Only if first iteration OR coord has changed
  if (isCoordChanged || iter == 0) {
    CUDA_SAFE_CALL(cudaMemcpy(coord_d, coord_h, num * dim * sizeof(float),
                              cudaMemcpyHostToDevice));
  }
  CUDA_SAFE_CALL(cudaMemcpy(center_table_d, center_table, num * sizeof(int),
                            cudaMemcpyHostToDevice));
  CUDA_SAFE_CALL(
      cudaMemcpy(p, points->p, num * sizeof(Point), cudaMemcpyHostToDevice));
  // clear the per-point outputs before the kernel accumulates into them
  CUDA_SAFE_CALL(
      cudaMemset((void *)switch_membership_d, 0, num * sizeof(bool)));
  CUDA_SAFE_CALL(cudaMemset((void *)work_mem_d, 0,
                            stride * (nThread + 1) * sizeof(float)));
#ifdef CUDATIME
  cudaEventRecord(stop, 0);
  cudaEventSynchronize(stop);
  cudaEventElapsedTime(&tmp_t, start, stop);
  *cpu_to_gpu_t += (double)tmp_t;
  cudaEventRecord(start, 0);
#endif
  //=======================================
  // KERNEL: CALCULATE COST
  //=======================================
  // Determine the number of thread blocks in the x- and y-dimensions
  int num_blocks =
      (int)((float)(num + THREADS_PER_BLOCK - 1) / (float)THREADS_PER_BLOCK);
  int num_blocks_y =
      (int)((float)(num_blocks + MAXBLOCKS - 1) / (float)MAXBLOCKS);
  int num_blocks_x =
      (int)((float)(num_blocks + num_blocks_y - 1) / (float)num_blocks_y);
  dim3 grid_size(num_blocks_x, num_blocks_y, 1);
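  // Example (derived from the expressions above): num = 100000 points gives
  // num_blocks = ceil(100000 / 512) = 196, num_blocks_y = ceil(196 / 65536)
  // = 1, num_blocks_x = 196, i.e. a 196 x 1 grid of 512-thread blocks.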
  kernel_compute_cost<<<grid_size, THREADS_PER_BLOCK>>>(
      num,                // in: # of data
      dim,                // in: dimension of point coordinates
      x,                  // in: point to open a center at
      p,                  // in: data point array
      K,                  // in: number of centers
      stride,             // in: size of each work_mem segment
      coord_d,            // in: array of point coordinates
      work_mem_d,         // out: cost and lower field array
      center_table_d,     // in: center index table
      switch_membership_d // out: changes in membership
  );
  cudaDeviceSynchronize(); // cudaThreadSynchronize() is deprecated
  // error check
  error = cudaGetLastError();
  if (error != cudaSuccess) {
    printf("kernel error: %s\n", cudaGetErrorString(error));
    exit(EXIT_FAILURE);
  }
#ifdef CUDATIME
  cudaEventRecord(stop, 0);
  cudaEventSynchronize(stop);
  cudaEventElapsedTime(&tmp_t, start, stop);
  *kernel_t += (double)tmp_t;
  cudaEventRecord(start, 0);
#endif
  //=======================================
  // GPU-TO-CPU MEMORY COPY
  //=======================================
  CUDA_SAFE_CALL(cudaMemcpy(work_mem_h, work_mem_d,
                            stride * (nThread + 1) * sizeof(float),
                            cudaMemcpyDeviceToHost));
  CUDA_SAFE_CALL(cudaMemcpy(switch_membership, switch_membership_d,
                            num * sizeof(bool), cudaMemcpyDeviceToHost));
#ifdef CUDATIME
  cudaEventRecord(stop, 0);
  cudaEventSynchronize(stop);
  cudaEventElapsedTime(&tmp_t, start, stop);
  *gpu_to_cpu_t += (double)tmp_t;
  cudaEventRecord(start, 0);
#endif
  //=======================================
  // CPU (SERIAL) WORK
  //=======================================
  int number_of_centers_to_close = 0;
  float gl_cost_of_opening_x = z;
  // the extra work_mem segment doubles as per-center close/keep totals
  float *gl_lower = &work_mem_h[stride * nThread];
  // compute the number of centers to close if we are to open a center at x
  for (int i = 0; i < num; i++) {
    if (is_center[i]) {
      // closing center i saves its facility cost z but forces its members
      // onto x; low > 0 means closing i is a net win
      float low = z;
      for (int j = 0; j < num; j++) {
        low += work_mem_h[j * stride + center_table[i]];
      }
      gl_lower[center_table[i]] = low;
      if (low > 0) {
        ++number_of_centers_to_close;
        work_mem_h[i * stride + K] -= low;
      }
    }
    gl_cost_of_opening_x += work_mem_h[i * stride + K];
  }
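  // At this point gl_cost_of_opening_x = z (the facility cost of x) + all
  // voluntary switch savings + all net close/keep adjustments; a negative
  // value means opening a center at x lowers the total solution cost.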
  // if opening a center at x saves cost (i.e. the cost delta is negative),
  // commit the change; otherwise, do nothing
  if (gl_cost_of_opening_x < 0) {
    // reassign every point that switches voluntarily or whose center closes
    for (int i = 0; i < num; i++) {
      bool close_center = gl_lower[center_table[points->p[i].assign]] > 0;
      if (switch_membership[i] || close_center) {
        points->p[i].cost =
            dist(points->p[i], points->p[x], dim) * points->p[i].weight;
        points->p[i].assign = x;
      }
    }
    // retire the centers whose closing was profitable
    for (int i = 0; i < num; i++) {
      if (is_center[i] && gl_lower[center_table[i]] > 0) {
        is_center[i] = false;
      }
    }
    if (x >= 0 && x < num) {
      is_center[x] = true;
    }
    *numcenters = *numcenters + 1 - number_of_centers_to_close;
  } else {
    gl_cost_of_opening_x = 0;
  }
  //=======================================
  // DEALLOCATE HOST MEMORY
  //=======================================
  free(work_mem_h);
#ifdef CUDATIME
  cudaEventRecord(stop, 0);
  cudaEventSynchronize(stop);
  cudaEventElapsedTime(&tmp_t, start, stop);
  *serial_t += (double)tmp_t;
  cudaEventRecord(start, 0);
#endif
  //=======================================
  // DEALLOCATE GPU MEMORY
  //=======================================
  CUDA_SAFE_CALL(cudaFree(work_mem_d));
#ifdef CUDATIME
  cudaEventRecord(stop, 0);
  cudaEventSynchronize(stop);
  cudaEventElapsedTime(&tmp_t, start, stop);
  *free_t += (double)tmp_t;
#endif
  iter++;
  return -gl_cost_of_opening_x;
}
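// Example call site (a minimal sketch; the variable names are hypothetical
// stand-ins for the CPU driver's state, not a verbatim excerpt):
//
//   double serial_t = 0, c2g = 0, g2c = 0, alloc_t = 0, kern_t = 0, free_t = 0;
//   long k = kcenter; // current number of open centers
//   float gain = pgain(x, &points, z, &k, kmax, is_center, center_table,
//                      switch_membership, isCoordChanged, &serial_t, &c2g,
//                      &g2c, &alloc_t, &kern_t, &free_t);
//   // gain > 0 means the reassignment was committed and total cost dropped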