remove useless examples

Ruobing Han 2022-09-15 11:31:58 -04:00
parent 49adfd026c
commit 9152feb24f
140 changed files with 0 additions and 67741 deletions


@@ -1,454 +0,0 @@
#include "backprop.h"
#include <fcntl.h>
#include <math.h>
#include <stdio.h>
#include <stdlib.h>
#include <unistd.h>
//#define OPEN
#define ABS(x) (((x) > 0.0) ? (x) : (-(x)))
#define fastcopy(to, from, len) \
{ \
register char *_to, *_from; \
register int _i, _l; \
_to = (char *)(to); \
_from = (char *)(from); \
_l = (len); \
for (_i = 0; _i < _l; _i++) \
*_to++ = *_from++; \
}
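/* Note: fastcopy copies len bytes one at a time; it is equivalent to
   memcpy(to, from, len) from <string.h>. */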
/*** Return random number between 0.0 and 1.0 ***/
float drnd() { return ((float)rand() / (float)BIGRND); }
/*** Return random number between -1.0 and 1.0 ***/
float dpn1() { return ((drnd() * 2.0) - 1.0); }
/*** The squashing function. Currently, it's a sigmoid. ***/
float squash(float x) {
float m;
// x = -x;
// m = 1 + x + x*x/2 + x*x*x/6 + x*x*x*x/24 + x*x*x*x*x/120;
// return(1.0 / (1.0 + m));
return (1.0 / (1.0 + exp(-x)));
}
/*** Allocate 1d array of floats ***/
float *alloc_1d_dbl(int n) {
float *new;
new = (float *)malloc((unsigned)(n * sizeof(float)));
if (new == NULL) {
printf("ALLOC_1D_DBL: Couldn't allocate array of floats\n");
return (NULL);
}
return (new);
}
/*** Allocate 2d array of floats ***/
float **alloc_2d_dbl(int m, int n) {
int i;
float **new;
new = (float **)malloc((unsigned)(m * sizeof(float *)));
if (new == NULL) {
printf("ALLOC_2D_DBL: Couldn't allocate array of dbl ptrs\n");
return (NULL);
}
for (i = 0; i < m; i++) {
new[i] = alloc_1d_dbl(n);
}
return (new);
}
void bpnn_randomize_weights(float **w, int m, int n) {
int i, j;
for (i = 0; i <= m; i++) {
for (j = 0; j <= n; j++) {
w[i][j] = (float)rand() / RAND_MAX;
// w[i][j] = dpn1();
}
}
}
void bpnn_randomize_row(float *w, int m) {
int i;
for (i = 0; i <= m; i++) {
// w[i] = (float) rand()/RAND_MAX;
w[i] = 0.1;
}
}
void bpnn_zero_weights(float **w, int m, int n) {
int i, j;
for (i = 0; i <= m; i++) {
for (j = 0; j <= n; j++) {
w[i][j] = 0.0;
}
}
}
void bpnn_initialize(int seed) {
printf("Random number generator seed: %d\n", seed);
srand(seed);
}
BPNN *bpnn_internal_create(int n_in, int n_hidden, int n_out) {
BPNN *newnet;
newnet = (BPNN *)malloc(sizeof(BPNN));
if (newnet == NULL) {
printf("BPNN_CREATE: Couldn't allocate neural network\n");
return (NULL);
}
newnet->input_n = n_in;
newnet->hidden_n = n_hidden;
newnet->output_n = n_out;
newnet->input_units = alloc_1d_dbl(n_in + 1);
newnet->hidden_units = alloc_1d_dbl(n_hidden + 1);
newnet->output_units = alloc_1d_dbl(n_out + 1);
newnet->hidden_delta = alloc_1d_dbl(n_hidden + 1);
newnet->output_delta = alloc_1d_dbl(n_out + 1);
newnet->target = alloc_1d_dbl(n_out + 1);
newnet->input_weights = alloc_2d_dbl(n_in + 1, n_hidden + 1);
newnet->hidden_weights = alloc_2d_dbl(n_hidden + 1, n_out + 1);
newnet->input_prev_weights = alloc_2d_dbl(n_in + 1, n_hidden + 1);
newnet->hidden_prev_weights = alloc_2d_dbl(n_hidden + 1, n_out + 1);
return (newnet);
}
void bpnn_free(BPNN *net) {
int n1, n2, i;
n1 = net->input_n;
n2 = net->hidden_n;
free((char *)net->input_units);
free((char *)net->hidden_units);
free((char *)net->output_units);
free((char *)net->hidden_delta);
free((char *)net->output_delta);
free((char *)net->target);
for (i = 0; i <= n1; i++) {
free((char *)net->input_weights[i]);
free((char *)net->input_prev_weights[i]);
}
free((char *)net->input_weights);
free((char *)net->input_prev_weights);
for (i = 0; i <= n2; i++) {
free((char *)net->hidden_weights[i]);
free((char *)net->hidden_prev_weights[i]);
}
free((char *)net->hidden_weights);
free((char *)net->hidden_prev_weights);
free((char *)net);
}
/*** Creates a new fully-connected network from scratch,
with the given numbers of input, hidden, and output units.
Threshold units are automatically included. All weights are
randomly initialized.
Space is also allocated for temporary storage (momentum weights,
error computations, etc).
***/
BPNN *bpnn_create(int n_in, int n_hidden, int n_out) {
BPNN *newnet;
newnet = bpnn_internal_create(n_in, n_hidden, n_out);
#ifdef INITZERO
bpnn_zero_weights(newnet->input_weights, n_in, n_hidden);
#else
bpnn_randomize_weights(newnet->input_weights, n_in, n_hidden);
#endif
bpnn_randomize_weights(newnet->hidden_weights, n_hidden, n_out);
bpnn_zero_weights(newnet->input_prev_weights, n_in, n_hidden);
bpnn_zero_weights(newnet->hidden_prev_weights, n_hidden, n_out);
bpnn_randomize_row(newnet->target, n_out);
return (newnet);
}
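/* Illustrative usage sketch (editor's addition, not part of the original
   benchmark): build a small network, run one training pass, and free it.
   The function name example_train_once and the layer sizes are arbitrary. */
void example_train_once() {
BPNN *net;
float out_err, hid_err;
int i;
net = bpnn_create(16, 16, 1); /* 16 inputs, 16 hidden units, 1 output */
for (i = 1; i <= 16; i++) {   /* unit 0 is the threshold/bias unit */
net->input_units[i] = drnd();
}
net->target[1] = 0.5;
bpnn_train(net, &out_err, &hid_err); /* one feed-forward + backprop step */
printf("output err %f, hidden err %f\n", out_err, hid_err);
bpnn_free(net);
}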
void bpnn_layerforward(float *l1, float *l2, float **conn, int n1, int n2) {
float sum;
int j, k;
/*** Set up thresholding unit ***/
l1[0] = 1.0;
#ifdef OPEN
omp_set_num_threads(NUM_THREAD);
#pragma omp parallel for shared(conn, n1, n2, l1) private(k, j) reduction(+: sum) schedule(static)
#endif
/*** For each unit in second layer ***/
for (j = 1; j <= n2; j++) {
/*** Compute weighted sum of its inputs ***/
sum = 0.0;
for (k = 0; k <= n1; k++) {
sum += conn[k][j] * l1[k];
}
l2[j] = squash(sum);
}
}
// extern "C"
void bpnn_output_error(float *delta, float *target, float *output, int nj, float *err) {
int j;
float o, t, errsum;
errsum = 0.0;
for (j = 1; j <= nj; j++) {
o = output[j];
t = target[j];
delta[j] = o * (1.0 - o) * (t - o);
errsum += ABS(delta[j]);
}
*err = errsum;
}
void bpnn_hidden_error(float *delta_h, int nh, float *delta_o, int no, float **who, float *hidden, float *err) {
int j, k;
float h, sum, errsum;
errsum = 0.0;
for (j = 1; j <= nh; j++) {
h = hidden[j];
sum = 0.0;
for (k = 1; k <= no; k++) {
sum += delta_o[k] * who[j][k];
}
delta_h[j] = h * (1.0 - h) * sum;
errsum += ABS(delta_h[j]);
}
*err = errsum;
}
void bpnn_adjust_weights(float *delta, int ndelta, float *ly, int nly, float **w, float **oldw) {
float new_dw;
int k, j;
ly[0] = 1.0;
// eta = 0.3;
// momentum = 0.3;
#ifdef OPEN
omp_set_num_threads(NUM_THREAD);
#pragma omp parallel for shared(oldw, w, delta) private(j, k, new_dw) \
firstprivate(ndelta, nly)
#endif
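/* Gradient step with momentum: each weight changes by ETA * delta[j] * ly[k]
   plus MOMENTUM times its previous update, which is stored in oldw. */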
for (j = 1; j <= ndelta; j++) {
for (k = 0; k <= nly; k++) {
new_dw = ((ETA * delta[j] * ly[k]) + (MOMENTUM * oldw[k][j]));
w[k][j] += new_dw;
oldw[k][j] = new_dw;
}
}
}
void bpnn_feedforward(BPNN *net) {
int in, hid, out;
in = net->input_n;
hid = net->hidden_n;
out = net->output_n;
/*** Feed forward input activations. ***/
bpnn_layerforward(net->input_units, net->hidden_units, net->input_weights, in,
hid);
bpnn_layerforward(net->hidden_units, net->output_units, net->hidden_weights,
hid, out);
}
void bpnn_train(BPNN *net, float *eo, float *eh) {
int in, hid, out;
float out_err, hid_err;
in = net->input_n;
hid = net->hidden_n;
out = net->output_n;
/*** Feed forward input activations. ***/
bpnn_layerforward(net->input_units, net->hidden_units, net->input_weights, in,
hid);
bpnn_layerforward(net->hidden_units, net->output_units, net->hidden_weights,
hid, out);
/*** Compute error on output and hidden units. ***/
bpnn_output_error(net->output_delta, net->target, net->output_units, out,
&out_err);
bpnn_hidden_error(net->hidden_delta, hid, net->output_delta, out,
net->hidden_weights, net->hidden_units, &hid_err);
*eo = out_err;
*eh = hid_err;
/*** Adjust input and hidden weights. ***/
bpnn_adjust_weights(net->output_delta, out, net->hidden_units, hid,
net->hidden_weights, net->hidden_prev_weights);
bpnn_adjust_weights(net->hidden_delta, hid, net->input_units, in,
net->input_weights, net->input_prev_weights);
}
void bpnn_save(BPNN *net, char *filename) {
int n1, n2, n3, i, j, memcnt;
float dvalue, **w;
char *mem;
FILE *pFile;
pFile = fopen(filename, "w+");
if (pFile == NULL) {
printf("BPNN_SAVE: Cannot create '%s'\n", filename);
return;
}
n1 = net->input_n;
n2 = net->hidden_n;
n3 = net->output_n;
printf("Saving %dx%dx%d network to '%s'\n", n1, n2, n3, filename);
// fflush(stdout);
// write(fd, (char *) &n1, sizeof(int));
// write(fd, (char *) &n2, sizeof(int));
// write(fd, (char *) &n3, sizeof(int));
fwrite(&n1, sizeof(int), 1, pFile);
fwrite(&n2, sizeof(int), 1, pFile);
fwrite(&n3, sizeof(int), 1, pFile);
memcnt = 0;
w = net->input_weights;
mem = (char *)malloc((unsigned)((n1 + 1) * (n2 + 1) * sizeof(float)));
for (i = 0; i <= n1; i++) {
for (j = 0; j <= n2; j++) {
dvalue = w[i][j];
fastcopy(&mem[memcnt], &dvalue, sizeof(float));
memcnt += sizeof(float);
}
}
// write(fd, mem, (n1+1) * (n2+1) * sizeof(float));
fwrite(mem, sizeof(float), (n1 + 1) * (n2 + 1), pFile);
free(mem);
memcnt = 0;
w = net->hidden_weights;
mem = (char *)malloc((unsigned)((n2 + 1) * (n3 + 1) * sizeof(float)));
for (i = 0; i <= n2; i++) {
for (j = 0; j <= n3; j++) {
dvalue = w[i][j];
fastcopy(&mem[memcnt], &dvalue, sizeof(float));
memcnt += sizeof(float);
}
}
// write(fd, mem, (n2+1) * (n3+1) * sizeof(float));
fwrite(mem, sizeof(float), (n2 + 1) * (n3 + 1), pFile);
free(mem);
fclose(pFile);
return;
}
BPNN *bpnn_read(char *filename) {
char *mem;
BPNN *new;
int fd, n1, n2, n3, i, j, memcnt;
if ((fd = open(filename, 0, 0644)) == -1) {
return (NULL);
}
printf("Reading '%s'\n", filename); // fflush(stdout);
read(fd, (char *)&n1, sizeof(int));
read(fd, (char *)&n2, sizeof(int));
read(fd, (char *)&n3, sizeof(int));
new = bpnn_internal_create(n1, n2, n3);
printf("'%s' contains a %dx%dx%d network\n", filename, n1, n2, n3);
printf("Reading input weights..."); // fflush(stdout);
memcnt = 0;
mem = (char *)malloc((unsigned)((n1 + 1) * (n2 + 1) * sizeof(float)));
read(fd, mem, (n1 + 1) * (n2 + 1) * sizeof(float));
for (i = 0; i <= n1; i++) {
for (j = 0; j <= n2; j++) {
fastcopy(&(new->input_weights[i][j]), &mem[memcnt], sizeof(float));
memcnt += sizeof(float);
}
}
free(mem);
printf("Done\nReading hidden weights..."); // fflush(stdout);
memcnt = 0;
mem = (char *)malloc((unsigned)((n2 + 1) * (n3 + 1) * sizeof(float)));
read(fd, mem, (n2 + 1) * (n3 + 1) * sizeof(float));
for (i = 0; i <= n2; i++) {
for (j = 0; j <= n3; j++) {
fastcopy(&(new->hidden_weights[i][j]), &mem[memcnt], sizeof(float));
memcnt += sizeof(float);
}
}
free(mem);
close(fd);
printf("Done\n"); // fflush(stdout);
bpnn_zero_weights(new->input_prev_weights, n1, n2);
bpnn_zero_weights(new->hidden_prev_weights, n2, n3);
return (new);
}


@@ -1,50 +0,0 @@
#ifndef _BACKPROP_H_
#define _BACKPROP_H_
#define BIGRND 0x7fffffff
#define GPU
#define THREADS 256
#define WIDTH 16 // shared memory width
#define HEIGHT 16 // shared memory height
#define ETA 0.3 // eta value
#define MOMENTUM 0.3 // momentum value
#define NUM_THREAD 4 // OpenMP threads
typedef struct {
int input_n; /* number of input units */
int hidden_n; /* number of hidden units */
int output_n; /* number of output units */
float *input_units; /* the input units */
float *hidden_units; /* the hidden units */
float *output_units; /* the output units */
float *hidden_delta; /* storage for hidden unit error */
float *output_delta; /* storage for output unit error */
float *target; /* storage for target vector */
float **input_weights; /* weights from input to hidden layer */
float **hidden_weights; /* weights from hidden to output layer */
/*** The next two are for momentum ***/
float **input_prev_weights; /* previous change on input to hidden wgt */
float **hidden_prev_weights; /* previous change on hidden to output wgt */
} BPNN;
/*** User-level functions ***/
void bpnn_initialize(int seed);
BPNN *bpnn_create(int n_in, int n_hidden, int n_out);
void bpnn_free(BPNN *net);
void bpnn_train(BPNN *net, float *eo, float *eh);
void bpnn_feedforward(BPNN *net);
void bpnn_save(BPNN *net, char *filename);
BPNN *bpnn_read(char *filename);
#endif


@@ -1,615 +0,0 @@
; ModuleID = 'backprop_cuda-cuda-nvptx64-nvidia-cuda-sm_61.bc'
source_filename = "backprop_cuda.cu"
target datalayout = "e-i64:64-i128:128-v16:16-v32:32-n16:32:64"
target triple = "nvptx64-nvidia-cuda"
%struct.__cuda_builtin_blockIdx_t = type { i8 }
%struct.__cuda_builtin_threadIdx_t = type { i8 }
%struct.cudaFuncAttributes = type { i64, i64, i64, i32, i32, i32, i32, i32, i32, i32 }
$_ZN25__cuda_builtin_blockIdx_t17__fetch_builtin_yEv = comdat any
$_ZN26__cuda_builtin_threadIdx_t17__fetch_builtin_xEv = comdat any
$_ZN26__cuda_builtin_threadIdx_t17__fetch_builtin_yEv = comdat any
@blockIdx = extern_weak dso_local addrspace(1) global %struct.__cuda_builtin_blockIdx_t, align 1
@threadIdx = extern_weak dso_local addrspace(1) global %struct.__cuda_builtin_threadIdx_t, align 1
@_ZZ22bpnn_layerforward_CUDAPfS_S_S_iiE10input_node = internal addrspace(3) global [16 x float] undef, align 4
@_ZZ22bpnn_layerforward_CUDAPfS_S_S_iiE13weight_matrix = internal addrspace(3) global [16 x [16 x float]] undef, align 4
@"$str" = private addrspace(1) constant [11 x i8] c"__CUDA_FTZ\00"
; Function Attrs: convergent noinline nounwind optnone
define weak dso_local i32 @cudaMalloc(i8** %p, i64 %s) #0 {
entry:
%p.addr = alloca i8**, align 8
%s.addr = alloca i64, align 8
store i8** %p, i8*** %p.addr, align 8
store i64 %s, i64* %s.addr, align 8
ret i32 999
}
; Function Attrs: convergent noinline nounwind optnone
define weak dso_local i32 @cudaFuncGetAttributes(%struct.cudaFuncAttributes* %p, i8* %c) #0 {
entry:
%p.addr = alloca %struct.cudaFuncAttributes*, align 8
%c.addr = alloca i8*, align 8
store %struct.cudaFuncAttributes* %p, %struct.cudaFuncAttributes** %p.addr, align 8
store i8* %c, i8** %c.addr, align 8
ret i32 999
}
; Function Attrs: convergent noinline nounwind optnone
define weak dso_local i32 @cudaDeviceGetAttribute(i32* %value, i32 %attr, i32 %device) #0 {
entry:
%value.addr = alloca i32*, align 8
%attr.addr = alloca i32, align 4
%device.addr = alloca i32, align 4
store i32* %value, i32** %value.addr, align 8
store i32 %attr, i32* %attr.addr, align 4
store i32 %device, i32* %device.addr, align 4
ret i32 999
}
; Function Attrs: convergent noinline nounwind optnone
define weak dso_local i32 @cudaGetDevice(i32* %device) #0 {
entry:
%device.addr = alloca i32*, align 8
store i32* %device, i32** %device.addr, align 8
ret i32 999
}
; Function Attrs: convergent noinline nounwind optnone
define weak dso_local i32 @cudaOccupancyMaxActiveBlocksPerMultiprocessor(i32* %numBlocks, i8* %func, i32 %blockSize, i64 %dynamicSmemSize) #0 {
entry:
%numBlocks.addr = alloca i32*, align 8
%func.addr = alloca i8*, align 8
%blockSize.addr = alloca i32, align 4
%dynamicSmemSize.addr = alloca i64, align 8
store i32* %numBlocks, i32** %numBlocks.addr, align 8
store i8* %func, i8** %func.addr, align 8
store i32 %blockSize, i32* %blockSize.addr, align 4
store i64 %dynamicSmemSize, i64* %dynamicSmemSize.addr, align 8
ret i32 999
}
; Function Attrs: convergent noinline nounwind optnone
define weak dso_local i32 @cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags(i32* %numBlocks, i8* %func, i32 %blockSize, i64 %dynamicSmemSize, i32 %flags) #0 {
entry:
%numBlocks.addr = alloca i32*, align 8
%func.addr = alloca i8*, align 8
%blockSize.addr = alloca i32, align 4
%dynamicSmemSize.addr = alloca i64, align 8
%flags.addr = alloca i32, align 4
store i32* %numBlocks, i32** %numBlocks.addr, align 8
store i8* %func, i8** %func.addr, align 8
store i32 %blockSize, i32* %blockSize.addr, align 4
store i64 %dynamicSmemSize, i64* %dynamicSmemSize.addr, align 8
store i32 %flags, i32* %flags.addr, align 4
ret i32 999
}
; Function Attrs: convergent noinline nounwind optnone
define dso_local void @_Z22bpnn_layerforward_CUDAPfS_S_S_ii(float* %input_cuda, float* %output_hidden_cuda, float* %input_hidden_cuda, float* %hidden_partial_sum, i32 %in, i32 %hid) #0 {
entry:
%input_cuda.addr = alloca float*, align 8
%output_hidden_cuda.addr = alloca float*, align 8
%input_hidden_cuda.addr = alloca float*, align 8
%hidden_partial_sum.addr = alloca float*, align 8
%in.addr = alloca i32, align 4
%hid.addr = alloca i32, align 4
%by = alloca i32, align 4
%tx = alloca i32, align 4
%ty = alloca i32, align 4
%index = alloca i32, align 4
%index_in = alloca i32, align 4
%i = alloca i32, align 4
%power_two = alloca i32, align 4
store float* %input_cuda, float** %input_cuda.addr, align 8
store float* %output_hidden_cuda, float** %output_hidden_cuda.addr, align 8
store float* %input_hidden_cuda, float** %input_hidden_cuda.addr, align 8
store float* %hidden_partial_sum, float** %hidden_partial_sum.addr, align 8
store i32 %in, i32* %in.addr, align 4
store i32 %hid, i32* %hid.addr, align 4
%call = call i32 @_ZN25__cuda_builtin_blockIdx_t17__fetch_builtin_yEv() #2
store i32 %call, i32* %by, align 4
%call1 = call i32 @_ZN26__cuda_builtin_threadIdx_t17__fetch_builtin_xEv() #2
store i32 %call1, i32* %tx, align 4
%call2 = call i32 @_ZN26__cuda_builtin_threadIdx_t17__fetch_builtin_yEv() #2
store i32 %call2, i32* %ty, align 4
%0 = load i32, i32* %hid.addr, align 4
%add = add nsw i32 %0, 1
%mul = mul nsw i32 %add, 16
%1 = load i32, i32* %by, align 4
%mul3 = mul nsw i32 %mul, %1
%2 = load i32, i32* %hid.addr, align 4
%add4 = add nsw i32 %2, 1
%3 = load i32, i32* %ty, align 4
%mul5 = mul nsw i32 %add4, %3
%add6 = add nsw i32 %mul3, %mul5
%4 = load i32, i32* %tx, align 4
%add7 = add nsw i32 %add6, %4
%add8 = add nsw i32 %add7, 1
%5 = load i32, i32* %hid.addr, align 4
%add9 = add nsw i32 %5, 1
%add10 = add nsw i32 %add8, %add9
store i32 %add10, i32* %index, align 4
%6 = load i32, i32* %by, align 4
%mul11 = mul nsw i32 16, %6
%7 = load i32, i32* %ty, align 4
%add12 = add nsw i32 %mul11, %7
%add13 = add nsw i32 %add12, 1
store i32 %add13, i32* %index_in, align 4
%8 = load i32, i32* %tx, align 4
%cmp = icmp eq i32 %8, 0
br i1 %cmp, label %if.then, label %if.end
if.then: ; preds = %entry
%9 = load float*, float** %input_cuda.addr, align 8
%10 = load i32, i32* %index_in, align 4
%idxprom = sext i32 %10 to i64
%arrayidx = getelementptr inbounds float, float* %9, i64 %idxprom
%11 = load float, float* %arrayidx, align 4
%12 = load i32, i32* %ty, align 4
%idxprom14 = sext i32 %12 to i64
%arrayidx15 = getelementptr inbounds [16 x float], [16 x float]* addrspacecast ([16 x float] addrspace(3)* @_ZZ22bpnn_layerforward_CUDAPfS_S_S_iiE10input_node to [16 x float]*), i64 0, i64 %idxprom14
store float %11, float* %arrayidx15, align 4
br label %if.end
if.end: ; preds = %if.then, %entry
call void @llvm.nvvm.barrier0()
%13 = load float*, float** %input_hidden_cuda.addr, align 8
%14 = load i32, i32* %index, align 4
%idxprom16 = sext i32 %14 to i64
%arrayidx17 = getelementptr inbounds float, float* %13, i64 %idxprom16
%15 = load float, float* %arrayidx17, align 4
%16 = load i32, i32* %ty, align 4
%idxprom18 = sext i32 %16 to i64
%arrayidx19 = getelementptr inbounds [16 x [16 x float]], [16 x [16 x float]]* addrspacecast ([16 x [16 x float]] addrspace(3)* @_ZZ22bpnn_layerforward_CUDAPfS_S_S_iiE13weight_matrix to [16 x [16 x float]]*), i64 0, i64 %idxprom18
%17 = load i32, i32* %tx, align 4
%idxprom20 = sext i32 %17 to i64
%arrayidx21 = getelementptr inbounds [16 x float], [16 x float]* %arrayidx19, i64 0, i64 %idxprom20
store float %15, float* %arrayidx21, align 4
call void @llvm.nvvm.barrier0()
%18 = load i32, i32* %ty, align 4
%idxprom22 = sext i32 %18 to i64
%arrayidx23 = getelementptr inbounds [16 x [16 x float]], [16 x [16 x float]]* addrspacecast ([16 x [16 x float]] addrspace(3)* @_ZZ22bpnn_layerforward_CUDAPfS_S_S_iiE13weight_matrix to [16 x [16 x float]]*), i64 0, i64 %idxprom22
%19 = load i32, i32* %tx, align 4
%idxprom24 = sext i32 %19 to i64
%arrayidx25 = getelementptr inbounds [16 x float], [16 x float]* %arrayidx23, i64 0, i64 %idxprom24
%20 = load float, float* %arrayidx25, align 4
%21 = load i32, i32* %ty, align 4
%idxprom26 = sext i32 %21 to i64
%arrayidx27 = getelementptr inbounds [16 x float], [16 x float]* addrspacecast ([16 x float] addrspace(3)* @_ZZ22bpnn_layerforward_CUDAPfS_S_S_iiE10input_node to [16 x float]*), i64 0, i64 %idxprom26
%22 = load float, float* %arrayidx27, align 4
%mul28 = fmul contract float %20, %22
%23 = load i32, i32* %ty, align 4
%idxprom29 = sext i32 %23 to i64
%arrayidx30 = getelementptr inbounds [16 x [16 x float]], [16 x [16 x float]]* addrspacecast ([16 x [16 x float]] addrspace(3)* @_ZZ22bpnn_layerforward_CUDAPfS_S_S_iiE13weight_matrix to [16 x [16 x float]]*), i64 0, i64 %idxprom29
%24 = load i32, i32* %tx, align 4
%idxprom31 = sext i32 %24 to i64
%arrayidx32 = getelementptr inbounds [16 x float], [16 x float]* %arrayidx30, i64 0, i64 %idxprom31
store float %mul28, float* %arrayidx32, align 4
call void @llvm.nvvm.barrier0()
store i32 1, i32* %i, align 4
br label %for.cond
for.cond: ; preds = %for.inc, %if.end
%25 = load i32, i32* %i, align 4
%conv = sitofp i32 %25 to float
%call33 = call float @_ZL7__log2ff(float 1.600000e+01) #2
%cmp34 = fcmp ole float %conv, %call33
br i1 %cmp34, label %for.body, label %for.end
for.body: ; preds = %for.cond
%26 = load i32, i32* %i, align 4
%conv35 = sitofp i32 %26 to float
%call36 = call float @_ZL6__powfff(float 2.000000e+00, float %conv35) #2
%conv37 = fptosi float %call36 to i32
store i32 %conv37, i32* %power_two, align 4
%27 = load i32, i32* %ty, align 4
%28 = load i32, i32* %power_two, align 4
%rem = srem i32 %27, %28
%cmp38 = icmp eq i32 %rem, 0
br i1 %cmp38, label %if.then39, label %if.end54
if.then39: ; preds = %for.body
%29 = load i32, i32* %ty, align 4
%idxprom40 = sext i32 %29 to i64
%arrayidx41 = getelementptr inbounds [16 x [16 x float]], [16 x [16 x float]]* addrspacecast ([16 x [16 x float]] addrspace(3)* @_ZZ22bpnn_layerforward_CUDAPfS_S_S_iiE13weight_matrix to [16 x [16 x float]]*), i64 0, i64 %idxprom40
%30 = load i32, i32* %tx, align 4
%idxprom42 = sext i32 %30 to i64
%arrayidx43 = getelementptr inbounds [16 x float], [16 x float]* %arrayidx41, i64 0, i64 %idxprom42
%31 = load float, float* %arrayidx43, align 4
%32 = load i32, i32* %ty, align 4
%33 = load i32, i32* %power_two, align 4
%div = sdiv i32 %33, 2
%add44 = add nsw i32 %32, %div
%idxprom45 = sext i32 %add44 to i64
%arrayidx46 = getelementptr inbounds [16 x [16 x float]], [16 x [16 x float]]* addrspacecast ([16 x [16 x float]] addrspace(3)* @_ZZ22bpnn_layerforward_CUDAPfS_S_S_iiE13weight_matrix to [16 x [16 x float]]*), i64 0, i64 %idxprom45
%34 = load i32, i32* %tx, align 4
%idxprom47 = sext i32 %34 to i64
%arrayidx48 = getelementptr inbounds [16 x float], [16 x float]* %arrayidx46, i64 0, i64 %idxprom47
%35 = load float, float* %arrayidx48, align 4
%add49 = fadd contract float %31, %35
%36 = load i32, i32* %ty, align 4
%idxprom50 = sext i32 %36 to i64
%arrayidx51 = getelementptr inbounds [16 x [16 x float]], [16 x [16 x float]]* addrspacecast ([16 x [16 x float]] addrspace(3)* @_ZZ22bpnn_layerforward_CUDAPfS_S_S_iiE13weight_matrix to [16 x [16 x float]]*), i64 0, i64 %idxprom50
%37 = load i32, i32* %tx, align 4
%idxprom52 = sext i32 %37 to i64
%arrayidx53 = getelementptr inbounds [16 x float], [16 x float]* %arrayidx51, i64 0, i64 %idxprom52
store float %add49, float* %arrayidx53, align 4
br label %if.end54
if.end54: ; preds = %if.then39, %for.body
call void @llvm.nvvm.barrier0()
br label %for.inc
for.inc: ; preds = %if.end54
%38 = load i32, i32* %i, align 4
%inc = add nsw i32 %38, 1
store i32 %inc, i32* %i, align 4
br label %for.cond
for.end: ; preds = %for.cond
%39 = load i32, i32* %ty, align 4
%idxprom55 = sext i32 %39 to i64
%arrayidx56 = getelementptr inbounds [16 x [16 x float]], [16 x [16 x float]]* addrspacecast ([16 x [16 x float]] addrspace(3)* @_ZZ22bpnn_layerforward_CUDAPfS_S_S_iiE13weight_matrix to [16 x [16 x float]]*), i64 0, i64 %idxprom55
%40 = load i32, i32* %tx, align 4
%idxprom57 = sext i32 %40 to i64
%arrayidx58 = getelementptr inbounds [16 x float], [16 x float]* %arrayidx56, i64 0, i64 %idxprom57
%41 = load float, float* %arrayidx58, align 4
%42 = load float*, float** %input_hidden_cuda.addr, align 8
%43 = load i32, i32* %index, align 4
%idxprom59 = sext i32 %43 to i64
%arrayidx60 = getelementptr inbounds float, float* %42, i64 %idxprom59
store float %41, float* %arrayidx60, align 4
call void @llvm.nvvm.barrier0()
%44 = load i32, i32* %tx, align 4
%cmp61 = icmp eq i32 %44, 0
br i1 %cmp61, label %if.then62, label %if.end71
if.then62: ; preds = %for.end
%45 = load i32, i32* %tx, align 4
%idxprom63 = sext i32 %45 to i64
%arrayidx64 = getelementptr inbounds [16 x [16 x float]], [16 x [16 x float]]* addrspacecast ([16 x [16 x float]] addrspace(3)* @_ZZ22bpnn_layerforward_CUDAPfS_S_S_iiE13weight_matrix to [16 x [16 x float]]*), i64 0, i64 %idxprom63
%46 = load i32, i32* %ty, align 4
%idxprom65 = sext i32 %46 to i64
%arrayidx66 = getelementptr inbounds [16 x float], [16 x float]* %arrayidx64, i64 0, i64 %idxprom65
%47 = load float, float* %arrayidx66, align 4
%48 = load float*, float** %hidden_partial_sum.addr, align 8
%49 = load i32, i32* %by, align 4
%50 = load i32, i32* %hid.addr, align 4
%mul67 = mul nsw i32 %49, %50
%51 = load i32, i32* %ty, align 4
%add68 = add nsw i32 %mul67, %51
%idxprom69 = sext i32 %add68 to i64
%arrayidx70 = getelementptr inbounds float, float* %48, i64 %idxprom69
store float %47, float* %arrayidx70, align 4
br label %if.end71
if.end71: ; preds = %if.then62, %for.end
ret void
}
; Function Attrs: alwaysinline convergent nounwind
define linkonce_odr dso_local i32 @_ZN25__cuda_builtin_blockIdx_t17__fetch_builtin_yEv() #1 comdat align 2 {
entry:
%0 = call i32 @llvm.nvvm.read.ptx.sreg.ctaid.y()
ret i32 %0
}
; Function Attrs: alwaysinline convergent nounwind
define linkonce_odr dso_local i32 @_ZN26__cuda_builtin_threadIdx_t17__fetch_builtin_xEv() #1 comdat align 2 {
entry:
%0 = call i32 @llvm.nvvm.read.ptx.sreg.tid.x()
ret i32 %0
}
; Function Attrs: alwaysinline convergent nounwind
define linkonce_odr dso_local i32 @_ZN26__cuda_builtin_threadIdx_t17__fetch_builtin_yEv() #1 comdat align 2 {
entry:
%0 = call i32 @llvm.nvvm.read.ptx.sreg.tid.y()
ret i32 %0
}
; Function Attrs: convergent nounwind
declare void @llvm.nvvm.barrier0() #2
; Function Attrs: alwaysinline convergent nounwind
define internal float @_ZL7__log2ff(float %__a) #1 {
entry:
%__a.addr = alloca float, align 4
store float %__a, float* %__a.addr, align 4
%0 = load float, float* %__a.addr, align 4
%call = call float @__nv_fast_log2f(float %0) #2
ret float %call
}
; Function Attrs: alwaysinline convergent nounwind
define internal float @_ZL6__powfff(float %__a, float %__b) #1 {
entry:
%__a.addr = alloca float, align 4
%__b.addr = alloca float, align 4
store float %__a, float* %__a.addr, align 4
store float %__b, float* %__b.addr, align 4
%0 = load float, float* %__a.addr, align 4
%1 = load float, float* %__b.addr, align 4
%call = call float @__nv_fast_powf(float %0, float %1) #2
ret float %call
}
; Function Attrs: convergent noinline nounwind optnone
define dso_local void @_Z24bpnn_adjust_weights_cudaPfiS_iS_S_(float* %delta, i32 %hid, float* %ly, i32 %in, float* %w, float* %oldw) #0 {
entry:
%delta.addr = alloca float*, align 8
%hid.addr = alloca i32, align 4
%ly.addr = alloca float*, align 8
%in.addr = alloca i32, align 4
%w.addr = alloca float*, align 8
%oldw.addr = alloca float*, align 8
%by = alloca i32, align 4
%tx = alloca i32, align 4
%ty = alloca i32, align 4
%index = alloca i32, align 4
%index_y = alloca i32, align 4
%index_x = alloca i32, align 4
store float* %delta, float** %delta.addr, align 8
store i32 %hid, i32* %hid.addr, align 4
store float* %ly, float** %ly.addr, align 8
store i32 %in, i32* %in.addr, align 4
store float* %w, float** %w.addr, align 8
store float* %oldw, float** %oldw.addr, align 8
%call = call i32 @_ZN25__cuda_builtin_blockIdx_t17__fetch_builtin_yEv() #2
store i32 %call, i32* %by, align 4
%call1 = call i32 @_ZN26__cuda_builtin_threadIdx_t17__fetch_builtin_xEv() #2
store i32 %call1, i32* %tx, align 4
%call2 = call i32 @_ZN26__cuda_builtin_threadIdx_t17__fetch_builtin_yEv() #2
store i32 %call2, i32* %ty, align 4
%0 = load i32, i32* %hid.addr, align 4
%add = add nsw i32 %0, 1
%mul = mul nsw i32 %add, 16
%1 = load i32, i32* %by, align 4
%mul3 = mul nsw i32 %mul, %1
%2 = load i32, i32* %hid.addr, align 4
%add4 = add nsw i32 %2, 1
%3 = load i32, i32* %ty, align 4
%mul5 = mul nsw i32 %add4, %3
%add6 = add nsw i32 %mul3, %mul5
%4 = load i32, i32* %tx, align 4
%add7 = add nsw i32 %add6, %4
%add8 = add nsw i32 %add7, 1
%5 = load i32, i32* %hid.addr, align 4
%add9 = add nsw i32 %5, 1
%add10 = add nsw i32 %add8, %add9
store i32 %add10, i32* %index, align 4
%6 = load i32, i32* %by, align 4
%mul11 = mul nsw i32 16, %6
%7 = load i32, i32* %ty, align 4
%add12 = add nsw i32 %mul11, %7
%add13 = add nsw i32 %add12, 1
store i32 %add13, i32* %index_y, align 4
%8 = load i32, i32* %tx, align 4
%add14 = add nsw i32 %8, 1
store i32 %add14, i32* %index_x, align 4
%9 = load float*, float** %delta.addr, align 8
%10 = load i32, i32* %index_x, align 4
%idxprom = sext i32 %10 to i64
%arrayidx = getelementptr inbounds float, float* %9, i64 %idxprom
%11 = load float, float* %arrayidx, align 4
%conv = fpext float %11 to double
%mul15 = fmul contract double 3.000000e-01, %conv
%12 = load float*, float** %ly.addr, align 8
%13 = load i32, i32* %index_y, align 4
%idxprom16 = sext i32 %13 to i64
%arrayidx17 = getelementptr inbounds float, float* %12, i64 %idxprom16
%14 = load float, float* %arrayidx17, align 4
%conv18 = fpext float %14 to double
%mul19 = fmul contract double %mul15, %conv18
%15 = load float*, float** %oldw.addr, align 8
%16 = load i32, i32* %index, align 4
%idxprom20 = sext i32 %16 to i64
%arrayidx21 = getelementptr inbounds float, float* %15, i64 %idxprom20
%17 = load float, float* %arrayidx21, align 4
%conv22 = fpext float %17 to double
%mul23 = fmul contract double 3.000000e-01, %conv22
%add24 = fadd contract double %mul19, %mul23
%18 = load float*, float** %w.addr, align 8
%19 = load i32, i32* %index, align 4
%idxprom25 = sext i32 %19 to i64
%arrayidx26 = getelementptr inbounds float, float* %18, i64 %idxprom25
%20 = load float, float* %arrayidx26, align 4
%conv27 = fpext float %20 to double
%add28 = fadd contract double %conv27, %add24
%conv29 = fptrunc double %add28 to float
store float %conv29, float* %arrayidx26, align 4
%21 = load float*, float** %delta.addr, align 8
%22 = load i32, i32* %index_x, align 4
%idxprom30 = sext i32 %22 to i64
%arrayidx31 = getelementptr inbounds float, float* %21, i64 %idxprom30
%23 = load float, float* %arrayidx31, align 4
%conv32 = fpext float %23 to double
%mul33 = fmul contract double 3.000000e-01, %conv32
%24 = load float*, float** %ly.addr, align 8
%25 = load i32, i32* %index_y, align 4
%idxprom34 = sext i32 %25 to i64
%arrayidx35 = getelementptr inbounds float, float* %24, i64 %idxprom34
%26 = load float, float* %arrayidx35, align 4
%conv36 = fpext float %26 to double
%mul37 = fmul contract double %mul33, %conv36
%27 = load float*, float** %oldw.addr, align 8
%28 = load i32, i32* %index, align 4
%idxprom38 = sext i32 %28 to i64
%arrayidx39 = getelementptr inbounds float, float* %27, i64 %idxprom38
%29 = load float, float* %arrayidx39, align 4
%conv40 = fpext float %29 to double
%mul41 = fmul contract double 3.000000e-01, %conv40
%add42 = fadd contract double %mul37, %mul41
%conv43 = fptrunc double %add42 to float
%30 = load float*, float** %oldw.addr, align 8
%31 = load i32, i32* %index, align 4
%idxprom44 = sext i32 %31 to i64
%arrayidx45 = getelementptr inbounds float, float* %30, i64 %idxprom44
store float %conv43, float* %arrayidx45, align 4
call void @llvm.nvvm.barrier0()
%32 = load i32, i32* %ty, align 4
%cmp = icmp eq i32 %32, 0
br i1 %cmp, label %land.lhs.true, label %if.end
land.lhs.true: ; preds = %entry
%33 = load i32, i32* %by, align 4
%cmp46 = icmp eq i32 %33, 0
br i1 %cmp46, label %if.then, label %if.end
if.then: ; preds = %land.lhs.true
%34 = load float*, float** %delta.addr, align 8
%35 = load i32, i32* %index_x, align 4
%idxprom47 = sext i32 %35 to i64
%arrayidx48 = getelementptr inbounds float, float* %34, i64 %idxprom47
%36 = load float, float* %arrayidx48, align 4
%conv49 = fpext float %36 to double
%mul50 = fmul contract double 3.000000e-01, %conv49
%37 = load float*, float** %oldw.addr, align 8
%38 = load i32, i32* %index_x, align 4
%idxprom51 = sext i32 %38 to i64
%arrayidx52 = getelementptr inbounds float, float* %37, i64 %idxprom51
%39 = load float, float* %arrayidx52, align 4
%conv53 = fpext float %39 to double
%mul54 = fmul contract double 3.000000e-01, %conv53
%add55 = fadd contract double %mul50, %mul54
%40 = load float*, float** %w.addr, align 8
%41 = load i32, i32* %index_x, align 4
%idxprom56 = sext i32 %41 to i64
%arrayidx57 = getelementptr inbounds float, float* %40, i64 %idxprom56
%42 = load float, float* %arrayidx57, align 4
%conv58 = fpext float %42 to double
%add59 = fadd contract double %conv58, %add55
%conv60 = fptrunc double %add59 to float
store float %conv60, float* %arrayidx57, align 4
%43 = load float*, float** %delta.addr, align 8
%44 = load i32, i32* %index_x, align 4
%idxprom61 = sext i32 %44 to i64
%arrayidx62 = getelementptr inbounds float, float* %43, i64 %idxprom61
%45 = load float, float* %arrayidx62, align 4
%conv63 = fpext float %45 to double
%mul64 = fmul contract double 3.000000e-01, %conv63
%46 = load float*, float** %oldw.addr, align 8
%47 = load i32, i32* %index_x, align 4
%idxprom65 = sext i32 %47 to i64
%arrayidx66 = getelementptr inbounds float, float* %46, i64 %idxprom65
%48 = load float, float* %arrayidx66, align 4
%conv67 = fpext float %48 to double
%mul68 = fmul contract double 3.000000e-01, %conv67
%add69 = fadd contract double %mul64, %mul68
%conv70 = fptrunc double %add69 to float
%49 = load float*, float** %oldw.addr, align 8
%50 = load i32, i32* %index_x, align 4
%idxprom71 = sext i32 %50 to i64
%arrayidx72 = getelementptr inbounds float, float* %49, i64 %idxprom71
store float %conv70, float* %arrayidx72, align 4
br label %if.end
if.end: ; preds = %if.then, %land.lhs.true, %entry
ret void
}
; Function Attrs: nounwind readnone
declare i32 @llvm.nvvm.read.ptx.sreg.ctaid.y() #3
; Function Attrs: nounwind readnone
declare i32 @llvm.nvvm.read.ptx.sreg.tid.x() #3
; Function Attrs: nounwind readnone
declare i32 @llvm.nvvm.read.ptx.sreg.tid.y() #3
; Function Attrs: alwaysinline convergent inlinehint nounwind
define internal float @__nv_fast_log2f(float %a) #4 {
%call.i = call i32 @__nvvm_reflect(i8* addrspacecast (i8 addrspace(1)* getelementptr inbounds ([11 x i8], [11 x i8] addrspace(1)* @"$str", i32 0, i32 0) to i8*))
%1 = icmp ne i32 %call.i, 0
br i1 %1, label %2, label %4
2: ; preds = %0
%3 = call float @llvm.nvvm.lg2.approx.ftz.f(float %a)
br label %__nvvm_builtin_log2f.exit
4: ; preds = %0
%5 = call float @llvm.nvvm.lg2.approx.f(float %a)
br label %__nvvm_builtin_log2f.exit
__nvvm_builtin_log2f.exit: ; preds = %4, %2
%retval.0.i = phi float [ %3, %2 ], [ %5, %4 ]
ret float %retval.0.i
}
; Function Attrs: convergent nounwind
declare i32 @__nvvm_reflect(i8*) #5
; Function Attrs: nounwind readnone
declare float @llvm.nvvm.lg2.approx.ftz.f(float) #3
; Function Attrs: nounwind readnone
declare float @llvm.nvvm.lg2.approx.f(float) #3
; Function Attrs: alwaysinline convergent inlinehint nounwind
define internal float @__nv_fast_powf(float %a, float %b) #4 {
%call.i.i = call i32 @__nvvm_reflect(i8* addrspacecast (i8 addrspace(1)* getelementptr inbounds ([11 x i8], [11 x i8] addrspace(1)* @"$str", i32 0, i32 0) to i8*))
%1 = icmp ne i32 %call.i.i, 0
br i1 %1, label %2, label %4
2: ; preds = %0
%3 = call float @llvm.nvvm.lg2.approx.ftz.f(float %a)
br label %__nv_fast_log2f.exit
4: ; preds = %0
%5 = call float @llvm.nvvm.lg2.approx.f(float %a)
br label %__nv_fast_log2f.exit
__nv_fast_log2f.exit: ; preds = %4, %2
%retval.0.i.i = phi float [ %3, %2 ], [ %5, %4 ]
%6 = fmul float %b, %retval.0.i.i
%call.i.i1 = call i32 @__nvvm_reflect(i8* addrspacecast (i8 addrspace(1)* getelementptr inbounds ([11 x i8], [11 x i8] addrspace(1)* @"$str", i32 0, i32 0) to i8*))
%7 = icmp ne i32 %call.i.i1, 0
br i1 %7, label %8, label %10
8: ; preds = %__nv_fast_log2f.exit
%9 = call float @llvm.nvvm.ex2.approx.ftz.f(float %6)
br label %__nv_exp2f.exit
10: ; preds = %__nv_fast_log2f.exit
%11 = call float @llvm.nvvm.ex2.approx.f(float %6)
br label %__nv_exp2f.exit
__nv_exp2f.exit: ; preds = %10, %8
%retval.0.i.i2 = phi float [ %9, %8 ], [ %11, %10 ]
ret float %retval.0.i.i2
}
; Function Attrs: nounwind readnone
declare float @llvm.nvvm.ex2.approx.ftz.f(float) #3
; Function Attrs: nounwind readnone
declare float @llvm.nvvm.ex2.approx.f(float) #3
attributes #0 = { convergent noinline nounwind optnone "correctly-rounded-divide-sqrt-fp-math"="false" "disable-tail-calls"="false" "frame-pointer"="all" "less-precise-fpmad"="false" "min-legal-vector-width"="0" "no-infs-fp-math"="false" "no-jump-tables"="false" "no-nans-fp-math"="false" "no-signed-zeros-fp-math"="false" "no-trapping-math"="false" "stack-protector-buffer-size"="8" "target-cpu"="sm_61" "target-features"="+ptx64,+sm_61" "unsafe-fp-math"="false" "use-soft-float"="false" }
attributes #1 = { alwaysinline convergent nounwind "correctly-rounded-divide-sqrt-fp-math"="false" "disable-tail-calls"="false" "frame-pointer"="all" "less-precise-fpmad"="false" "min-legal-vector-width"="0" "no-infs-fp-math"="false" "no-jump-tables"="false" "no-nans-fp-math"="false" "no-signed-zeros-fp-math"="false" "no-trapping-math"="false" "stack-protector-buffer-size"="8" "target-cpu"="sm_61" "target-features"="+ptx64,+sm_61" "unsafe-fp-math"="false" "use-soft-float"="false" }
attributes #2 = { convergent nounwind }
attributes #3 = { nounwind readnone }
attributes #4 = { alwaysinline convergent inlinehint nounwind "correctly-rounded-divide-sqrt-fp-math"="false" "frame-pointer"="all" "less-precise-fpmad"="false" "no-infs-fp-math"="false" "no-nans-fp-math"="false" "no-signed-zeros-fp-math"="false" "no-trapping-math"="false" "stack-protector-buffer-size"="8" "unsafe-fp-math"="false" "use-soft-float"="false" }
attributes #5 = { convergent nounwind "correctly-rounded-divide-sqrt-fp-math"="false" "frame-pointer"="all" "less-precise-fpmad"="false" "no-infs-fp-math"="false" "no-nans-fp-math"="false" "no-signed-zeros-fp-math"="false" "no-trapping-math"="false" "stack-protector-buffer-size"="8" "unsafe-fp-math"="false" "use-soft-float"="false" }
!llvm.module.flags = !{!0, !1, !2}
!nvvm.annotations = !{!3, !4, !5, !6, !5, !7, !7, !7, !7, !8, !8, !7}
!llvm.ident = !{!9}
!nvvmir.version = !{!10}
!0 = !{i32 2, !"SDK Version", [2 x i32] [i32 10, i32 1]}
!1 = !{i32 1, !"wchar_size", i32 4}
!2 = !{i32 4, !"nvvm-reflect-ftz", i32 0}
!3 = !{void (float*, float*, float*, float*, i32, i32)* @_Z22bpnn_layerforward_CUDAPfS_S_S_ii, !"kernel", i32 1}
!4 = !{void (float*, i32, float*, i32, float*, float*)* @_Z24bpnn_adjust_weights_cudaPfiS_iS_S_, !"kernel", i32 1}
!5 = !{null, !"align", i32 8}
!6 = !{null, !"align", i32 8, !"align", i32 65544, !"align", i32 131080}
!7 = !{null, !"align", i32 16}
!8 = !{null, !"align", i32 16, !"align", i32 65552, !"align", i32 131088}
!9 = !{!"clang version 10.0.1 (https://github.com/llvm/llvm-project.git ef32c611aa214dea855364efd7ba451ec5ec3f74)"}
!10 = !{i32 1, i32 4}

File diff suppressed because one or more lines are too long


@@ -1,195 +0,0 @@
#include <cuda.h>
#include <math.h>
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <sys/time.h>
// includes, kernels
#include "backprop.h"
#include "backprop_cuda_kernel.cu"
////////////////////////////////////////////////////////////////////////////////
extern "C" void bpnn_layerforward(float *l1, float *l2, float **conn, int n1,
int n2);
extern "C" void bpnn_output_error(float *delta, float *target, float *output,
int nj, float *err);
extern "C" void bpnn_hidden_error(float *delta_h, int nh, float *delta_o,
int no, float **who, float *hidden,
float *err);
extern "C" void bpnn_adjust_weights(float *delta, int ndelta, float *ly,
int nly, float **w, float **oldw);
extern "C" int setup(int argc, char **argv);
extern "C" float **alloc_2d_dbl(int m, int n);
extern "C" float squash(float x);
double gettime() {
struct timeval t;
gettimeofday(&t, NULL);
return t.tv_sec + t.tv_usec * 1e-6;
}
unsigned int num_threads = 0;
unsigned int num_blocks = 0;
////////////////////////////////////////////////////////////////////////////////
// Program main
////////////////////////////////////////////////////////////////////////////////
int main(int argc, char **argv) {
cudaSetDevice(0);
setup(argc, argv);
}
extern "C" void bpnn_train_cuda(BPNN *net, float *eo, float *eh) {
int in, hid, out;
float out_err, hid_err;
in = net->input_n;
hid = net->hidden_n;
out = net->output_n;
#ifdef GPU
int m = 0;
float *input_hidden_cuda;
float *input_cuda;
float *output_hidden_cuda;
float *partial_sum;
float *hidden_partial_sum;
float *hidden_delta_cuda;
float *input_prev_weights_cuda;
float sum;
float *input_weights_one_dim;
float *input_weights_prev_one_dim;
num_blocks = in / 16;
dim3 grid(1, num_blocks);
dim3 threads(16, 16);
input_weights_one_dim = (float *)malloc((in + 1) * (hid + 1) * sizeof(float));
input_weights_prev_one_dim =
(float *)malloc((in + 1) * (hid + 1) * sizeof(float));
partial_sum = (float *)malloc(num_blocks * WIDTH * sizeof(float));
// This preprocessing stage flattens the two-dimensional net->input_weights
// and net->input_prev_weights arrays into contiguous buffers so they can be
// copied to the device with a single cudaMemcpy.
for (int k = 0; k <= in; k++) {
for (int j = 0; j <= hid; j++) {
input_weights_one_dim[m] = net->input_weights[k][j];
input_weights_prev_one_dim[m] = net->input_prev_weights[k][j];
m++;
}
}
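// After the loop, input_weights_one_dim[k * (hid + 1) + j] mirrors
// net->input_weights[k][j].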
cudaMalloc((void **)&input_cuda, (in + 1) * sizeof(float));
cudaMalloc((void **)&output_hidden_cuda, (hid + 1) * sizeof(float));
cudaMalloc((void **)&input_hidden_cuda, (in + 1) * (hid + 1) * sizeof(float));
cudaMalloc((void **)&hidden_partial_sum, num_blocks * WIDTH * sizeof(float));
#endif
#ifdef CPU
printf("Performing CPU computation\n");
bpnn_layerforward(net->input_units, net->hidden_units, net->input_weights, in,
hid);
#endif
#ifdef GPU
printf("Performing GPU computation\n");
// printf("in= %d, hid = %d, numblocks = %d\n", in, hid, num_blocks);
cudaMemcpy(input_cuda, net->input_units, (in + 1) * sizeof(float),
cudaMemcpyHostToDevice);
cudaMemcpy(input_hidden_cuda, input_weights_one_dim,
(in + 1) * (hid + 1) * sizeof(float), cudaMemcpyHostToDevice);
bpnn_layerforward_CUDA<<<grid, threads>>>(input_cuda, output_hidden_cuda,
input_hidden_cuda,
hidden_partial_sum, in, hid);
cudaThreadSynchronize();
cudaError_t error = cudaGetLastError();
if (error != cudaSuccess) {
printf("bpnn kernel error: %s\n", cudaGetErrorString(error));
exit(EXIT_FAILURE);
}
cudaMemcpy(partial_sum, hidden_partial_sum,
num_blocks * WIDTH * sizeof(float), cudaMemcpyDeviceToHost);
for (int j = 1; j <= hid; j++) {
sum = 0.0;
for (int k = 0; k < num_blocks; k++) {
sum += partial_sum[k * hid + j - 1];
}
sum += net->input_weights[0][j];
net->hidden_units[j] = float(1.0 / (1.0 + exp(-sum)));
}
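// The kernel reduces within each block of 16 input rows; the loop above sums
// the per-block partial sums, adds the bias weight, and applies the sigmoid,
// completing bpnn_layerforward for the hidden layer on the host.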
#endif
bpnn_layerforward(net->hidden_units, net->output_units, net->hidden_weights,
hid, out);
bpnn_output_error(net->output_delta, net->target, net->output_units, out,
&out_err);
bpnn_hidden_error(net->hidden_delta, hid, net->output_delta, out,
net->hidden_weights, net->hidden_units, &hid_err);
bpnn_adjust_weights(net->output_delta, out, net->hidden_units, hid,
net->hidden_weights, net->hidden_prev_weights);
#ifdef CPU
bpnn_adjust_weights(net->hidden_delta, hid, net->input_units, in,
net->input_weights, net->input_prev_weights);
#endif
#ifdef GPU
cudaMalloc((void **)&hidden_delta_cuda, (hid + 1) * sizeof(float));
cudaMalloc((void **)&input_prev_weights_cuda,
(in + 1) * (hid + 1) * sizeof(float));
cudaMemcpy(hidden_delta_cuda, net->hidden_delta, (hid + 1) * sizeof(float),
cudaMemcpyHostToDevice);
cudaMemcpy(input_prev_weights_cuda, input_weights_prev_one_dim,
(in + 1) * (hid + 1) * sizeof(float), cudaMemcpyHostToDevice);
cudaMemcpy(input_hidden_cuda, input_weights_one_dim,
(in + 1) * (hid + 1) * sizeof(float), cudaMemcpyHostToDevice);
bpnn_adjust_weights_cuda<<<grid, threads>>>(hidden_delta_cuda, hid,
input_cuda, in, input_hidden_cuda,
input_prev_weights_cuda);
cudaMemcpy(net->input_units, input_cuda, (in + 1) * sizeof(float),
cudaMemcpyDeviceToHost);
cudaMemcpy(input_weights_one_dim, input_hidden_cuda,
(in + 1) * (hid + 1) * sizeof(float), cudaMemcpyDeviceToHost);
for (int i = 0; i < (in + 1) * (hid + 1); i++) {
printf("%f ", input_weights_one_dim[i]);
}
printf("\n");
cudaFree(input_cuda);
cudaFree(output_hidden_cuda);
cudaFree(input_hidden_cuda);
cudaFree(hidden_partial_sum);
cudaFree(input_prev_weights_cuda);
cudaFree(hidden_delta_cuda);
free(partial_sum);
free(input_weights_one_dim);
free(input_weights_prev_one_dim);
#endif
}


@@ -1,96 +0,0 @@
#ifndef _BACKPROP_CUDA_KERNEL_H_
#define _BACKPROP_CUDA_KERNEL_H_
#include "backprop.h"
#include "cuda.h"
#include "math.h"
#include <stdio.h>
__global__ void bpnn_layerforward_CUDA(float *input_cuda,
float *output_hidden_cuda,
float *input_hidden_cuda,
float *hidden_partial_sum, int in,
int hid) {
int by = blockIdx.y;
int tx = threadIdx.x;
int ty = threadIdx.y;
int index = (hid + 1) * HEIGHT * by + (hid + 1) * ty + tx + 1 + (hid + 1);
int index_in = HEIGHT * by + ty + 1;
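// index addresses element [HEIGHT * by + ty + 1][tx + 1] of the flattened
// (in + 1) x (hid + 1) input-to-hidden weight array; index_in is the matching
// input unit (unit 0 holds the bias).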
__shared__ float input_node[HEIGHT];
__shared__ float weight_matrix[HEIGHT][WIDTH];
if (tx == 0)
input_node[ty] = input_cuda[index_in];
__syncthreads();
weight_matrix[ty][tx] = input_hidden_cuda[index];
__syncthreads();
weight_matrix[ty][tx] = weight_matrix[ty][tx] * input_node[ty];
__syncthreads();
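// Tree reduction down the rows of weight_matrix: after log2(HEIGHT) rounds,
// row 0 of each column holds that column's partial dot product for this block.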
for (int i = 1; i <= __log2f(HEIGHT); i++) {
int power_two = __powf(2, i);
if (ty % power_two == 0)
weight_matrix[ty][tx] =
weight_matrix[ty][tx] + weight_matrix[ty + power_two / 2][tx];
__syncthreads();
}
//__syncthreads();
input_hidden_cuda[index] = weight_matrix[ty][tx];
/*
for ( unsigned int i = 2 ; i <= HEIGHT ; i *= 2){
unsigned int power_two = i - 1;
if( (ty & power_two) == 0 ) {
weight_matrix[ty][tx] = weight_matrix[ty][tx] +
weight_matrix[ty + power_two/2][tx];
}
}
*/
__syncthreads();
if (tx == 0) {
hidden_partial_sum[by * hid + ty] = weight_matrix[tx][ty];
}
}
__global__ void bpnn_adjust_weights_cuda(float *delta, int hid, float *ly,
int in, float *w, float *oldw) {
int by = blockIdx.y;
int tx = threadIdx.x;
int ty = threadIdx.y;
int index = (hid + 1) * HEIGHT * by + (hid + 1) * ty + tx + 1 + (hid + 1);
int index_y = HEIGHT * by + ty + 1;
int index_x = tx + 1;
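// index = index_y * (hid + 1) + index_x: the flattened position of
// w[index_y][index_x] in the (in + 1) x (hid + 1) weight array.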
// eta = 0.3;
// momentum = 0.3;
w[index] += ((ETA * delta[index_x] * ly[index_y]) + (MOMENTUM * oldw[index]));
oldw[index] =
((ETA * delta[index_x] * ly[index_y]) + (MOMENTUM * oldw[index]));
__syncthreads();
if (ty == 0 && by == 0) {
w[index_x] += ((ETA * delta[index_x]) + (MOMENTUM * oldw[index_x]));
oldw[index_x] = ((ETA * delta[index_x]) + (MOMENTUM * oldw[index_x]));
}
}
#endif


@@ -1,48 +0,0 @@
#include "backprop.h"
#include <math.h>
#include <stdio.h>
#include <stdlib.h>
extern void bpnn_train_cuda(BPNN *net, float *eo, float *eh);
extern void load(BPNN *net);
int layer_size = 0;
void backprop_face() {
BPNN *net;
float out_err, hid_err;
net = bpnn_create(layer_size, 16, 1); // (16 and 1 cannot be changed)
printf("Input layer size : %d\n", layer_size);
load(net);
// entering the training kernel, only one iteration
printf("Starting training kernel\n");
bpnn_train_cuda(net, &out_err, &hid_err);
bpnn_free(net);
printf("Training done\n");
}
int setup(int argc, char *argv[]) {
int seed;
if (argc != 2) {
fprintf(stderr, "usage: backprop <num of input elements>\n");
exit(0);
}
layer_size = atoi(argv[1]);
if (layer_size % 16 != 0) {
fprintf(stderr, "The number of input points must be divided by 16\n");
exit(0);
}
seed = 7;
bpnn_initialize(seed);
backprop_face();
exit(0);
}


@@ -1,22 +0,0 @@
#include "backprop.h"
#include <stdio.h>
#include <stdlib.h>
extern int layer_size;
void load(BPNN *net) {
float *units;
int nr, i, k;
nr = layer_size;
units = net->input_units;
k = 1;
for (i = 0; i < nr; i++) {
units[k] = (float)rand() / RAND_MAX;
k++;
}
}


@@ -1,28 +0,0 @@
#!/bin/bash
set -e
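# Pipeline: compile the C sources to LLVM bitcode, assemble the pre-generated
# kernel/host IR, translate them with kernelTranslator/hostTranslator, lower
# everything to objects with llc, link against the x86 runtime and thread pool,
# then run the demo and compare its output against a known-good value.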
clang -c -emit-llvm backprop.c
clang -c -emit-llvm facetrain.c
clang -c -emit-llvm imagenet.c
llvm-as backprop_cuda-cuda-nvptx64-nvidia-cuda-sm_61.ll
llvm-as backprop_cuda-host-x86_64-unknown-linux-gnu.ll
../../build/compilation/kernelTranslator backprop_cuda-cuda-nvptx64-nvidia-cuda-sm_61.bc kernel.bc
../../build/compilation/hostTranslator backprop_cuda-host-x86_64-unknown-linux-gnu.bc host.bc
llc --relocation-model=pic --filetype=obj kernel.bc
llc --relocation-model=pic --filetype=obj host.bc
llc --relocation-model=pic --filetype=obj backprop.bc
llc --relocation-model=pic --filetype=obj facetrain.bc
llc --relocation-model=pic --filetype=obj imagenet.bc
export LD_LIBRARY_PATH=../../build/runtime:../../build/runtime/threadPool:$LD_LIBRARY_PATH
g++ -Wall -L../../build/runtime -L../../build/runtime/threadPool -o demo \
-fPIC -no-pie host.o kernel.o backprop.o facetrain.o imagenet.o \
-lc -lx86Runtime -lthreadPool -lpthread
./demo 1024 > res.log
if grep -q -e "0.173289 0.259645 0.350836" res.log; then
echo "Pass"
else
echo "Error result"
exit 1
fi


@@ -1,307 +0,0 @@
; ModuleID = 'bfs-cuda-nvptx64-nvidia-cuda-sm_61.bc'
source_filename = "bfs.cu"
target datalayout = "e-i64:64-i128:128-v16:16-v32:32-n16:32:64"
target triple = "nvptx64-nvidia-cuda"
%struct.__cuda_builtin_blockIdx_t = type { i8 }
%struct.__cuda_builtin_threadIdx_t = type { i8 }
%struct.cudaFuncAttributes = type { i64, i64, i64, i32, i32, i32, i32, i32, i32, i32 }
%struct.Node = type { i32, i32 }
$_ZN25__cuda_builtin_blockIdx_t17__fetch_builtin_xEv = comdat any
$_ZN26__cuda_builtin_threadIdx_t17__fetch_builtin_xEv = comdat any
@blockIdx = extern_weak dso_local addrspace(1) global %struct.__cuda_builtin_blockIdx_t, align 1
@threadIdx = extern_weak dso_local addrspace(1) global %struct.__cuda_builtin_threadIdx_t, align 1
; Function Attrs: convergent noinline nounwind optnone
define weak dso_local i32 @cudaMalloc(i8** %p, i64 %s) #0 {
entry:
%p.addr = alloca i8**, align 8
%s.addr = alloca i64, align 8
store i8** %p, i8*** %p.addr, align 8
store i64 %s, i64* %s.addr, align 8
ret i32 999
}
; Function Attrs: convergent noinline nounwind optnone
define weak dso_local i32 @cudaFuncGetAttributes(%struct.cudaFuncAttributes* %p, i8* %c) #0 {
entry:
%p.addr = alloca %struct.cudaFuncAttributes*, align 8
%c.addr = alloca i8*, align 8
store %struct.cudaFuncAttributes* %p, %struct.cudaFuncAttributes** %p.addr, align 8
store i8* %c, i8** %c.addr, align 8
ret i32 999
}
; Function Attrs: convergent noinline nounwind optnone
define weak dso_local i32 @cudaDeviceGetAttribute(i32* %value, i32 %attr, i32 %device) #0 {
entry:
%value.addr = alloca i32*, align 8
%attr.addr = alloca i32, align 4
%device.addr = alloca i32, align 4
store i32* %value, i32** %value.addr, align 8
store i32 %attr, i32* %attr.addr, align 4
store i32 %device, i32* %device.addr, align 4
ret i32 999
}
; Function Attrs: convergent noinline nounwind optnone
define weak dso_local i32 @cudaGetDevice(i32* %device) #0 {
entry:
%device.addr = alloca i32*, align 8
store i32* %device, i32** %device.addr, align 8
ret i32 999
}
; Function Attrs: convergent noinline nounwind optnone
define weak dso_local i32 @cudaOccupancyMaxActiveBlocksPerMultiprocessor(i32* %numBlocks, i8* %func, i32 %blockSize, i64 %dynamicSmemSize) #0 {
entry:
%numBlocks.addr = alloca i32*, align 8
%func.addr = alloca i8*, align 8
%blockSize.addr = alloca i32, align 4
%dynamicSmemSize.addr = alloca i64, align 8
store i32* %numBlocks, i32** %numBlocks.addr, align 8
store i8* %func, i8** %func.addr, align 8
store i32 %blockSize, i32* %blockSize.addr, align 4
store i64 %dynamicSmemSize, i64* %dynamicSmemSize.addr, align 8
ret i32 999
}
; Function Attrs: convergent noinline nounwind optnone
define weak dso_local i32 @cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags(i32* %numBlocks, i8* %func, i32 %blockSize, i64 %dynamicSmemSize, i32 %flags) #0 {
entry:
%numBlocks.addr = alloca i32*, align 8
%func.addr = alloca i8*, align 8
%blockSize.addr = alloca i32, align 4
%dynamicSmemSize.addr = alloca i64, align 8
%flags.addr = alloca i32, align 4
store i32* %numBlocks, i32** %numBlocks.addr, align 8
store i8* %func, i8** %func.addr, align 8
store i32 %blockSize, i32* %blockSize.addr, align 4
store i64 %dynamicSmemSize, i64* %dynamicSmemSize.addr, align 8
store i32 %flags, i32* %flags.addr, align 4
ret i32 999
}
; Function Attrs: convergent noinline nounwind optnone
define dso_local void @_Z6KernelP4NodePiPbS2_S2_S1_i(%struct.Node* %g_graph_nodes, i32* %g_graph_edges, i8* %g_graph_mask, i8* %g_updating_graph_mask, i8* %g_graph_visited, i32* %g_cost, i32 %no_of_nodes) #0 {
entry:
%g_graph_nodes.addr = alloca %struct.Node*, align 8
%g_graph_edges.addr = alloca i32*, align 8
%g_graph_mask.addr = alloca i8*, align 8
%g_updating_graph_mask.addr = alloca i8*, align 8
%g_graph_visited.addr = alloca i8*, align 8
%g_cost.addr = alloca i32*, align 8
%no_of_nodes.addr = alloca i32, align 4
%tid = alloca i32, align 4
%i = alloca i32, align 4
%id = alloca i32, align 4
store %struct.Node* %g_graph_nodes, %struct.Node** %g_graph_nodes.addr, align 8
store i32* %g_graph_edges, i32** %g_graph_edges.addr, align 8
store i8* %g_graph_mask, i8** %g_graph_mask.addr, align 8
store i8* %g_updating_graph_mask, i8** %g_updating_graph_mask.addr, align 8
store i8* %g_graph_visited, i8** %g_graph_visited.addr, align 8
store i32* %g_cost, i32** %g_cost.addr, align 8
store i32 %no_of_nodes, i32* %no_of_nodes.addr, align 4
%call = call i32 @_ZN25__cuda_builtin_blockIdx_t17__fetch_builtin_xEv() #3
%mul = mul i32 %call, 512
%call1 = call i32 @_ZN26__cuda_builtin_threadIdx_t17__fetch_builtin_xEv() #3
%add = add i32 %mul, %call1
store i32 %add, i32* %tid, align 4
%0 = load i32, i32* %tid, align 4
%1 = load i32, i32* %no_of_nodes.addr, align 4
%cmp = icmp slt i32 %0, %1
br i1 %cmp, label %land.lhs.true, label %if.end26
land.lhs.true: ; preds = %entry
%2 = load i8*, i8** %g_graph_mask.addr, align 8
%3 = load i32, i32* %tid, align 4
%idxprom = sext i32 %3 to i64
%arrayidx = getelementptr inbounds i8, i8* %2, i64 %idxprom
%4 = load i8, i8* %arrayidx, align 1
%tobool = trunc i8 %4 to i1
br i1 %tobool, label %if.then, label %if.end26
if.then: ; preds = %land.lhs.true
%5 = load i8*, i8** %g_graph_mask.addr, align 8
%6 = load i32, i32* %tid, align 4
%idxprom2 = sext i32 %6 to i64
%arrayidx3 = getelementptr inbounds i8, i8* %5, i64 %idxprom2
store i8 0, i8* %arrayidx3, align 1
%7 = load %struct.Node*, %struct.Node** %g_graph_nodes.addr, align 8
%8 = load i32, i32* %tid, align 4
%idxprom4 = sext i32 %8 to i64
%arrayidx5 = getelementptr inbounds %struct.Node, %struct.Node* %7, i64 %idxprom4
%starting = getelementptr inbounds %struct.Node, %struct.Node* %arrayidx5, i32 0, i32 0
%9 = load i32, i32* %starting, align 4
store i32 %9, i32* %i, align 4
br label %for.cond
for.cond: ; preds = %for.inc, %if.then
%10 = load i32, i32* %i, align 4
%11 = load %struct.Node*, %struct.Node** %g_graph_nodes.addr, align 8
%12 = load i32, i32* %tid, align 4
%idxprom6 = sext i32 %12 to i64
%arrayidx7 = getelementptr inbounds %struct.Node, %struct.Node* %11, i64 %idxprom6
%no_of_edges = getelementptr inbounds %struct.Node, %struct.Node* %arrayidx7, i32 0, i32 1
%13 = load i32, i32* %no_of_edges, align 4
%14 = load %struct.Node*, %struct.Node** %g_graph_nodes.addr, align 8
%15 = load i32, i32* %tid, align 4
%idxprom8 = sext i32 %15 to i64
%arrayidx9 = getelementptr inbounds %struct.Node, %struct.Node* %14, i64 %idxprom8
%starting10 = getelementptr inbounds %struct.Node, %struct.Node* %arrayidx9, i32 0, i32 0
%16 = load i32, i32* %starting10, align 4
%add11 = add nsw i32 %13, %16
%cmp12 = icmp slt i32 %10, %add11
br i1 %cmp12, label %for.body, label %for.end
for.body: ; preds = %for.cond
%17 = load i32*, i32** %g_graph_edges.addr, align 8
%18 = load i32, i32* %i, align 4
%idxprom13 = sext i32 %18 to i64
%arrayidx14 = getelementptr inbounds i32, i32* %17, i64 %idxprom13
%19 = load i32, i32* %arrayidx14, align 4
store i32 %19, i32* %id, align 4
%20 = load i8*, i8** %g_graph_visited.addr, align 8
%21 = load i32, i32* %id, align 4
%idxprom15 = sext i32 %21 to i64
%arrayidx16 = getelementptr inbounds i8, i8* %20, i64 %idxprom15
%22 = load i8, i8* %arrayidx16, align 1
%tobool17 = trunc i8 %22 to i1
br i1 %tobool17, label %if.end, label %if.then18
if.then18: ; preds = %for.body
%23 = load i32*, i32** %g_cost.addr, align 8
%24 = load i32, i32* %tid, align 4
%idxprom19 = sext i32 %24 to i64
%arrayidx20 = getelementptr inbounds i32, i32* %23, i64 %idxprom19
%25 = load i32, i32* %arrayidx20, align 4
%add21 = add nsw i32 %25, 1
%26 = load i32*, i32** %g_cost.addr, align 8
%27 = load i32, i32* %id, align 4
%idxprom22 = sext i32 %27 to i64
%arrayidx23 = getelementptr inbounds i32, i32* %26, i64 %idxprom22
store i32 %add21, i32* %arrayidx23, align 4
%28 = load i8*, i8** %g_updating_graph_mask.addr, align 8
%29 = load i32, i32* %id, align 4
%idxprom24 = sext i32 %29 to i64
%arrayidx25 = getelementptr inbounds i8, i8* %28, i64 %idxprom24
store i8 1, i8* %arrayidx25, align 1
br label %if.end
if.end: ; preds = %if.then18, %for.body
br label %for.inc
for.inc: ; preds = %if.end
%30 = load i32, i32* %i, align 4
%inc = add nsw i32 %30, 1
store i32 %inc, i32* %i, align 4
br label %for.cond
for.end: ; preds = %for.cond
br label %if.end26
if.end26: ; preds = %for.end, %land.lhs.true, %entry
ret void
}
; Function Attrs: alwaysinline convergent nounwind
define linkonce_odr dso_local i32 @_ZN25__cuda_builtin_blockIdx_t17__fetch_builtin_xEv() #1 comdat align 2 {
entry:
%0 = call i32 @llvm.nvvm.read.ptx.sreg.ctaid.x()
ret i32 %0
}
; Function Attrs: alwaysinline convergent nounwind
define linkonce_odr dso_local i32 @_ZN26__cuda_builtin_threadIdx_t17__fetch_builtin_xEv() #1 comdat align 2 {
entry:
%0 = call i32 @llvm.nvvm.read.ptx.sreg.tid.x()
ret i32 %0
}
; Function Attrs: convergent noinline nounwind optnone
define dso_local void @_Z7Kernel2PbS_S_S_i(i8* %g_graph_mask, i8* %g_updating_graph_mask, i8* %g_graph_visited, i8* %g_over, i32 %no_of_nodes) #0 {
entry:
%g_graph_mask.addr = alloca i8*, align 8
%g_updating_graph_mask.addr = alloca i8*, align 8
%g_graph_visited.addr = alloca i8*, align 8
%g_over.addr = alloca i8*, align 8
%no_of_nodes.addr = alloca i32, align 4
%tid = alloca i32, align 4
store i8* %g_graph_mask, i8** %g_graph_mask.addr, align 8
store i8* %g_updating_graph_mask, i8** %g_updating_graph_mask.addr, align 8
store i8* %g_graph_visited, i8** %g_graph_visited.addr, align 8
store i8* %g_over, i8** %g_over.addr, align 8
store i32 %no_of_nodes, i32* %no_of_nodes.addr, align 4
%call = call i32 @_ZN25__cuda_builtin_blockIdx_t17__fetch_builtin_xEv() #3
%mul = mul i32 %call, 512
%call1 = call i32 @_ZN26__cuda_builtin_threadIdx_t17__fetch_builtin_xEv() #3
%add = add i32 %mul, %call1
store i32 %add, i32* %tid, align 4
%0 = load i32, i32* %tid, align 4
%1 = load i32, i32* %no_of_nodes.addr, align 4
%cmp = icmp slt i32 %0, %1
br i1 %cmp, label %land.lhs.true, label %if.end
land.lhs.true: ; preds = %entry
%2 = load i8*, i8** %g_updating_graph_mask.addr, align 8
%3 = load i32, i32* %tid, align 4
%idxprom = sext i32 %3 to i64
%arrayidx = getelementptr inbounds i8, i8* %2, i64 %idxprom
%4 = load i8, i8* %arrayidx, align 1
%tobool = trunc i8 %4 to i1
br i1 %tobool, label %if.then, label %if.end
if.then: ; preds = %land.lhs.true
%5 = load i8*, i8** %g_graph_mask.addr, align 8
%6 = load i32, i32* %tid, align 4
%idxprom2 = sext i32 %6 to i64
%arrayidx3 = getelementptr inbounds i8, i8* %5, i64 %idxprom2
store i8 1, i8* %arrayidx3, align 1
%7 = load i8*, i8** %g_graph_visited.addr, align 8
%8 = load i32, i32* %tid, align 4
%idxprom4 = sext i32 %8 to i64
%arrayidx5 = getelementptr inbounds i8, i8* %7, i64 %idxprom4
store i8 1, i8* %arrayidx5, align 1
%9 = load i8*, i8** %g_over.addr, align 8
store i8 1, i8* %9, align 1
%10 = load i8*, i8** %g_updating_graph_mask.addr, align 8
%11 = load i32, i32* %tid, align 4
%idxprom6 = sext i32 %11 to i64
%arrayidx7 = getelementptr inbounds i8, i8* %10, i64 %idxprom6
store i8 0, i8* %arrayidx7, align 1
br label %if.end
if.end: ; preds = %if.then, %land.lhs.true, %entry
ret void
}
; Function Attrs: nounwind readnone
declare i32 @llvm.nvvm.read.ptx.sreg.ctaid.x() #2
; Function Attrs: nounwind readnone
declare i32 @llvm.nvvm.read.ptx.sreg.tid.x() #2
attributes #0 = { convergent noinline nounwind optnone "correctly-rounded-divide-sqrt-fp-math"="false" "disable-tail-calls"="false" "frame-pointer"="all" "less-precise-fpmad"="false" "min-legal-vector-width"="0" "no-infs-fp-math"="false" "no-jump-tables"="false" "no-nans-fp-math"="false" "no-signed-zeros-fp-math"="false" "no-trapping-math"="false" "stack-protector-buffer-size"="8" "target-cpu"="sm_61" "target-features"="+ptx64,+sm_61" "unsafe-fp-math"="false" "use-soft-float"="false" }
attributes #1 = { alwaysinline convergent nounwind "correctly-rounded-divide-sqrt-fp-math"="false" "disable-tail-calls"="false" "frame-pointer"="all" "less-precise-fpmad"="false" "min-legal-vector-width"="0" "no-infs-fp-math"="false" "no-jump-tables"="false" "no-nans-fp-math"="false" "no-signed-zeros-fp-math"="false" "no-trapping-math"="false" "stack-protector-buffer-size"="8" "target-cpu"="sm_61" "target-features"="+ptx64,+sm_61" "unsafe-fp-math"="false" "use-soft-float"="false" }
attributes #2 = { nounwind readnone }
attributes #3 = { convergent nounwind }
!llvm.module.flags = !{!0, !1, !2}
!nvvm.annotations = !{!3, !4, !5, !6, !5, !7, !7, !7, !7, !8, !8, !7}
!llvm.ident = !{!9}
!nvvmir.version = !{!10}
!0 = !{i32 2, !"SDK Version", [2 x i32] [i32 10, i32 1]}
!1 = !{i32 1, !"wchar_size", i32 4}
!2 = !{i32 4, !"nvvm-reflect-ftz", i32 0}
!3 = !{void (%struct.Node*, i32*, i8*, i8*, i8*, i32*, i32)* @_Z6KernelP4NodePiPbS2_S2_S1_i, !"kernel", i32 1}
!4 = !{void (i8*, i8*, i8*, i8*, i32)* @_Z7Kernel2PbS_S_S_i, !"kernel", i32 1}
!5 = !{null, !"align", i32 8}
!6 = !{null, !"align", i32 8, !"align", i32 65544, !"align", i32 131080}
!7 = !{null, !"align", i32 16}
!8 = !{null, !"align", i32 16, !"align", i32 65552, !"align", i32 131088}
!9 = !{!"clang version 10.0.1 (https://github.com/llvm/llvm-project.git ef32c611aa214dea855364efd7ba451ec5ec3f74)"}
!10 = !{i32 1, i32 4}

File diff suppressed because one or more lines are too long

View File

@ -1,213 +0,0 @@
#include <cuda.h>
#include <math.h>
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#define MAX_THREADS_PER_BLOCK 512
int no_of_nodes;
int edge_list_size;
FILE *fp;
// Structure to hold node information
struct Node {
int starting;
int no_of_edges;
};
#include "kernel.cu"
#include "kernel2.cu"
void BFSGraph(int argc, char **argv);
////////////////////////////////////////////////////////////////////////////////
// Main Program
////////////////////////////////////////////////////////////////////////////////
int main(int argc, char **argv) {
cudaSetDevice(0);
no_of_nodes = 0;
edge_list_size = 0;
BFSGraph(argc, argv);
}
void Usage(int argc, char **argv) {
fprintf(stderr, "Usage: %s <input_file>\n", argv[0]);
}
////////////////////////////////////////////////////////////////////////////////
// Apply BFS on a Graph using CUDA
////////////////////////////////////////////////////////////////////////////////
void BFSGraph(int argc, char **argv) {
char *input_f;
if (argc != 2) {
Usage(argc, argv);
exit(0);
}
input_f = argv[1];
printf("Reading File\n");
// Read in Graph from a file
fp = fopen(input_f, "r");
if (!fp) {
printf("Error Reading graph file\n");
return;
}
int source = 0;
fscanf(fp, "%d", &no_of_nodes);
int num_of_blocks = 1;
int num_of_threads_per_block = no_of_nodes;
// Make execution Parameters according to the number of nodes
// Distribute threads across multiple Blocks if necessary
if (no_of_nodes > MAX_THREADS_PER_BLOCK) {
num_of_blocks = (int)ceil(no_of_nodes / (double)MAX_THREADS_PER_BLOCK);
num_of_threads_per_block = MAX_THREADS_PER_BLOCK;
}
// allocate host memory
Node *h_graph_nodes = (Node *)malloc(sizeof(Node) * no_of_nodes);
bool *h_graph_mask = (bool *)malloc(sizeof(bool) * no_of_nodes);
bool *h_updating_graph_mask = (bool *)malloc(sizeof(bool) * no_of_nodes);
bool *h_graph_visited = (bool *)malloc(sizeof(bool) * no_of_nodes);
int start, edgeno;
// initialize the memory
for (int i = 0; i < no_of_nodes; i++) {
fscanf(fp, "%d %d", &start, &edgeno);
h_graph_nodes[i].starting = start;
h_graph_nodes[i].no_of_edges = edgeno;
h_graph_mask[i] = false;
h_updating_graph_mask[i] = false;
h_graph_visited[i] = false;
}
// read the source node from the file
fscanf(fp, "%d", &source);
source = 0;
// set the source node as true in the mask
h_graph_mask[source] = true;
h_graph_visited[source] = true;
fscanf(fp, "%d", &edge_list_size);
int id, cost;
int *h_graph_edges = (int *)malloc(sizeof(int) * edge_list_size);
for (int i = 0; i < edge_list_size; i++) {
fscanf(fp, "%d", &id);
fscanf(fp, "%d", &cost);
h_graph_edges[i] = id;
}
if (fp)
fclose(fp);
printf("Read File\n");
// Copy the Node list to device memory
Node *d_graph_nodes;
cudaMalloc((void **)&d_graph_nodes, sizeof(Node) * no_of_nodes);
cudaMemcpy(d_graph_nodes, h_graph_nodes, sizeof(Node) * no_of_nodes,
cudaMemcpyHostToDevice);
// Copy the Edge List to device Memory
int *d_graph_edges;
cudaMalloc((void **)&d_graph_edges, sizeof(int) * edge_list_size);
cudaMemcpy(d_graph_edges, h_graph_edges, sizeof(int) * edge_list_size,
cudaMemcpyHostToDevice);
// Copy the Mask to device memory
bool *d_graph_mask;
cudaMalloc((void **)&d_graph_mask, sizeof(bool) * no_of_nodes);
cudaMemcpy(d_graph_mask, h_graph_mask, sizeof(bool) * no_of_nodes,
cudaMemcpyHostToDevice);
bool *d_updating_graph_mask;
cudaMalloc((void **)&d_updating_graph_mask, sizeof(bool) * no_of_nodes);
cudaMemcpy(d_updating_graph_mask, h_updating_graph_mask,
sizeof(bool) * no_of_nodes, cudaMemcpyHostToDevice);
// Copy the Visited nodes array to device memory
bool *d_graph_visited;
cudaMalloc((void **)&d_graph_visited, sizeof(bool) * no_of_nodes);
cudaMemcpy(d_graph_visited, h_graph_visited, sizeof(bool) * no_of_nodes,
cudaMemcpyHostToDevice);
// allocate mem for the result on host side
int *h_cost = (int *)malloc(sizeof(int) * no_of_nodes);
for (int i = 0; i < no_of_nodes; i++)
h_cost[i] = -1;
h_cost[source] = 0;
// allocate device memory for result
int *d_cost;
cudaMalloc((void **)&d_cost, sizeof(int) * no_of_nodes);
cudaMemcpy(d_cost, h_cost, sizeof(int) * no_of_nodes, cudaMemcpyHostToDevice);
// make a bool to check if the execution is over
bool *d_over;
cudaMalloc((void **)&d_over, sizeof(bool));
printf("Copied Everything to GPU memory\n");
// setup execution parameters
dim3 grid(num_of_blocks, 1, 1);
dim3 threads(num_of_threads_per_block, 1, 1);
int k = 0;
printf("Start traversing the tree\n");
bool stop;
// Level-synchronous BFS: launch the kernels repeatedly until no new node joins the frontier
do {
// if no thread changes this value then the loop stops
stop = false;
cudaMemcpy(d_over, &stop, sizeof(bool), cudaMemcpyHostToDevice);
Kernel<<<grid, threads, 0>>>(d_graph_nodes, d_graph_edges, d_graph_mask,
d_updating_graph_mask, d_graph_visited, d_cost,
no_of_nodes);
cudaDeviceSynchronize();
// check if kernel execution generated an error
Kernel2<<<grid, threads, 0>>>(d_graph_mask, d_updating_graph_mask,
d_graph_visited, d_over, no_of_nodes);
cudaDeviceSynchronize();
// check if kernel execution generated an error
cudaMemcpy(&stop, d_over, sizeof(bool), cudaMemcpyDeviceToHost);
k++;
} while (stop);
printf("Kernel Executed %d times\n", k);
// copy result from device to host
cudaMemcpy(h_cost, d_cost, sizeof(int) * no_of_nodes, cudaMemcpyDeviceToHost);
// Store the result into a file
FILE *fpo = fopen("result.txt", "w");
for (int i = 0; i < no_of_nodes; i++)
fprintf(fpo, "%d) cost:%d\n", i, h_cost[i]);
fclose(fpo);
printf("Result stored in result.txt\n");
// cleanup memory
free(h_graph_nodes);
free(h_graph_edges);
free(h_graph_mask);
free(h_updating_graph_mask);
free(h_graph_visited);
free(h_cost);
cudaFree(d_graph_nodes);
cudaFree(d_graph_edges);
cudaFree(d_graph_mask);
cudaFree(d_updating_graph_mask);
cudaFree(d_graph_visited);
cudaFree(d_cost);
}

View File

@ -1,23 +0,0 @@
#ifndef _KERNEL_H_
#define _KERNEL_H_
__global__ void
Kernel( Node* g_graph_nodes, int* g_graph_edges, bool* g_graph_mask, bool* g_updating_graph_mask, bool *g_graph_visited, int* g_cost, int no_of_nodes)
{
int tid = blockIdx.x*MAX_THREADS_PER_BLOCK + threadIdx.x;
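// One thread per graph node: a thread does work only if its node is inside the
// graph and currently flagged in the frontier mask.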
if( tid<no_of_nodes && g_graph_mask[tid])
{
g_graph_mask[tid]=false;
for(int i=g_graph_nodes[tid].starting; i<(g_graph_nodes[tid].no_of_edges + g_graph_nodes[tid].starting); i++)
{
int id = g_graph_edges[i];
if(!g_graph_visited[id])
{
g_cost[id]=g_cost[tid]+1;
g_updating_graph_mask[id]=true;
}
}
}
}
#endif

View File

@ -1,18 +0,0 @@
#ifndef _KERNEL2_H_
#define _KERNEL2_H_
__global__ void
Kernel2( bool* g_graph_mask, bool *g_updating_graph_mask, bool* g_graph_visited, bool *g_over, int no_of_nodes)
{
int tid = blockIdx.x*MAX_THREADS_PER_BLOCK + threadIdx.x;
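// One thread per node: nodes flagged in the updating mask join the next
// frontier and are marked visited; *g_over tells the host another iteration is needed.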
if( tid<no_of_nodes && g_updating_graph_mask[tid])
{
g_graph_mask[tid]=true;
g_graph_visited[tid]=true;
*g_over=true;
g_updating_graph_mask[tid]=false;
}
}
#endif

View File

@ -1,21 +0,0 @@
#!/bin/bash
set -e
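# Assemble the device and host LLVM IR to bitcode, translate both to CPU
# bitcode with the project's kernel/host translators, compile to objects,
# link against the x86 runtime and thread-pool libraries, then run the
# resulting binary on a sample graph and check the result.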
llvm-as bfs-cuda-nvptx64-nvidia-cuda-sm_61.ll
llvm-as bfs-host-x86_64-unknown-linux-gnu.ll
../../build/compilation/kernelTranslator bfs-cuda-nvptx64-nvidia-cuda-sm_61.bc kernel.bc
../../build/compilation/hostTranslator bfs-host-x86_64-unknown-linux-gnu.bc host.bc
llc --relocation-model=pic --filetype=obj kernel.bc
llc --relocation-model=pic --filetype=obj host.bc
g++ -Wall -L../../build/runtime -L../../build/runtime/threadPool \
-o bfs.out -fPIC -no-pie host.o kernel.o -lc -lx86Runtime -lthreadPool -lpthread
export LD_LIBRARY_PATH=../../build/runtime:../../build/runtime/threadPool:$LD_LIBRARY_PATH
./bfs.out ../../rodinia-data/bfs/graph65536.txt
if grep -q "0) cost:0" result.txt; then
echo "Pass"
else
echo "Error result"
exit 1
fi

View File

@ -1,343 +0,0 @@
// # ifdef __cplusplus
// extern "C" {
// # endif
// #ifndef LIST_H
// # define LIST_H
//===============================================================================================================================================================================================================200
// DEFINE/INCLUDE
//===============================================================================================================================================================================================================200
//======================================================================================================================================================150
// INCLUDE (for some reason these are not recognized when defined in main
// file before this one is included)
//======================================================================================================================================================150
#include <stdbool.h> // (in path known to compiler) needed by true/false, bool
#include <stdint.h> // (in path known to compiler) needed by uint32_t
#include <stdlib.h> // (in path known to compiler) needed by malloc
//======================================================================================================================================================150
// DEFINE
//======================================================================================================================================================150
#define fp float
#define Version "1.5"
#ifdef WINDOWS
#define bool char
#define false 0
#define true 1
#endif
/* #define DEFAULT_ORDER 256 */
#ifdef RD_WG_SIZE_0_0
#define DEFAULT_ORDER RD_WG_SIZE_0_0
#elif defined(RD_WG_SIZE_0)
#define DEFAULT_ORDER RD_WG_SIZE_0
#elif defined(RD_WG_SIZE)
#define DEFAULT_ORDER RD_WG_SIZE
#else
#define DEFAULT_ORDER 256
#endif
/* #ifdef RD_WG_SIZE_1_0 */
/* #define DEFAULT_ORDER_2 RD_WG_SIZE_1_0 */
/* #elif defined(RD_WG_SIZE_1) */
/* #define DEFAULT_ORDER_2 RD_WG_SIZE_1 */
/* #elif defined(RD_WG_SIZE) */
/* #define DEFAULT_ORDER_2 RD_WG_SIZE */
/* #else */
/* #define DEFAULT_ORDER_2 256 */
/* #endif */
/* #define DEFAULT_ORDER 508 */
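/* malloc wrapper: print the failing source location and abort if an
   allocation fails (uses the GCC statement-expression extension) */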
#define malloc(size) \
({ \
void *_tmp; \
\
if (!(_tmp = malloc(size))) { \
fprintf(stderr, "Allocation failed at %s:%d!\n", __FILE__, __LINE__); \
exit(-1); \
} \
\
_tmp; \
})
//======================================================================================================================================================150
// STRUCTURES
//======================================================================================================================================================150
// struct list_item;
typedef struct list_item list_item_t;
typedef struct list_t {
list_item_t *head, *tail;
uint32_t length;
int32_t (*compare)(const void *key, const void *with);
void (*datum_delete)(void *);
} list_t;
typedef list_item_t *list_iterator_t;
typedef list_item_t *list_reverse_iterator_t;
/* Type representing the record
* to which a given key refers.
* In a real B+ tree system, the
* record would hold data (in a database)
* or a file (in an operating system)
* or some other information.
* Users can rewrite this part of the code
* to change the type and content
* of the value field.
*/
typedef struct record {
int value;
} record;
/* Type representing a node in the B+ tree.
* This type is general enough to serve for both
* the leaf and the internal node.
* The heart of the node is the array
* of keys and the array of corresponding
* pointers. The relation between keys
* and pointers differs between leaves and
* internal nodes. In a leaf, the index
* of each key equals the index of its corresponding
* pointer, with a maximum of order - 1 key-pointer
* pairs. The last pointer points to the
* leaf to the right (or NULL in the case
* of the rightmost leaf).
* In an internal node, the first pointer
* refers to lower nodes with keys less than
* the smallest key in the keys array. Then,
* with indices i starting at 0, the pointer
* at i + 1 points to the subtree with keys
* greater than or equal to the key in this
* node at index i.
* The num_keys field is used to keep
* track of the number of valid keys.
* In an internal node, the number of valid
* pointers is always num_keys + 1.
* In a leaf, the number of valid pointers
* to data is always num_keys. The
* last leaf pointer points to the next leaf.
*/
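/* Illustration (hypothetical contents, not taken from the benchmark input):
 * an internal node with num_keys = 2 and keys = {10, 20} has three valid
 * child pointers: pointers[0] -> keys < 10, pointers[1] -> keys in [10, 20),
 * pointers[2] -> keys >= 20.  A leaf with num_keys = 2 pairs keys[0..1]
 * with pointers[0..1] (records) and keeps its last pointer as the link to
 * the next leaf.
 */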
typedef struct node {
void **pointers;
int *keys;
struct node *parent;
bool is_leaf;
int num_keys;
struct node *next; // Used for queue.
} node;
//
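// Flattened variant of node used by the CUDA kernels: child links are stored
// as integer indices into the knode array instead of pointers.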
typedef struct knode {
int location;
int indices[DEFAULT_ORDER + 1];
int keys[DEFAULT_ORDER + 1];
bool is_leaf;
int num_keys;
} knode;
struct list_item {
struct list_item *pred, *next;
void *datum;
};
//===============================================================================================================================================================================================================200
// PROTOTYPES
//===============================================================================================================================================================================================================200
//======================================================================================================================================================150
// Other
//======================================================================================================================================================150
void list_item_init(list_item_t *li, void *datum);
void list_item_delete(list_item_t *li, void (*datum_delete)(void *datum));
void list_insert_item_tail(list_t *l, list_item_t *i);
void list_insert_item_before(list_t *l, list_item_t *next, list_item_t *i);
void list_insert_item_after(list_t *l, list_item_t *pred, list_item_t *i);
void list_insert_item_sorted(list_t *l, list_item_t *i);
//======================================================================================================================================================150
// List operations
//======================================================================================================================================================150
void list_init(list_t *l, int32_t (*compare)(const void *key, const void *with),
void (*datum_delete)(void *datum));
void list_delete(list_t *l);
void list_reset(list_t *l);
void list_insert_head(list_t *l, void *v);
void list_insert_tail(list_t *l, void *v);
void list_insert_before(list_t *l, list_item_t *next, void *v);
void list_insert_after(list_t *l, list_item_t *pred, void *v);
void list_insert_sorted(list_t *l, void *v);
void list_insert_item_head(list_t *l, list_item_t *i);
void list_remove_item(list_t *l, list_item_t *i);
void list_remove_head(list_t *l);
void list_remove_tail(list_t *l);
list_item_t *list_find_item(list_t *l, void *datum);
list_item_t *list_get_head_item(list_t *l);
list_item_t *list_get_tail_item(list_t *l);
void *list_find(list_t *l, void *datum);
void *list_get_head(list_t *l);
void *list_get_tail(list_t *l);
uint32_t list_get_length(list_t *l);
bool list_is_empty(list_t *l);
bool list_not_empty(list_t *l);
void list_visit_items(list_t *l, void (*visitor)(void *v));
void *list_item_get_datum(list_item_t *li);
void list_iterator_init(list_t *l, list_iterator_t *li);
void list_iterator_delete(list_iterator_t *li);
void list_iterator_next(list_iterator_t *li);
void list_iterator_prev(list_iterator_t *li);
void *list_iterator_get_datum(list_iterator_t *li);
bool list_iterator_is_valid(list_iterator_t *li);
void list_reverse_iterator_init(list_t *l, list_iterator_t *li);
void list_reverse_iterator_delete(list_iterator_t *li);
void list_reverse_iterator_next(list_iterator_t *li);
void list_reverse_iterator_prev(list_iterator_t *li);
void *list_reverse_iterator_get_datum(list_iterator_t *li);
bool list_reverse_iterator_is_valid(list_reverse_iterator_t *li);
//======================================================================================================================================================150
// Output and utility
//======================================================================================================================================================150
void *kmalloc(int size);
long transform_to_cuda(node *n,
bool verbose); // returns actual mem used in a long
void usage_1(void);
void usage_2(void);
void enqueue(node *new_node);
node *dequeue(void);
int height(node *root);
int path_to_root(node *root, node *child);
void print_leaves(node *root);
void print_tree(node *root);
node *find_leaf(node *root, int key, bool verbose);
record *find(node *root, int key, bool verbose);
int cut(int length);
//======================================================================================================================================================150
// Insertion
//======================================================================================================================================================150
record *make_record(int value);
node *make_node(void);
node *make_leaf(void);
int get_left_index(node *parent, node *left);
node *insert_into_leaf(node *leaf, int key, record *pointer);
node *insert_into_leaf_after_splitting(node *root, node *leaf, int key,
record *pointer);
node *insert_into_node(node *root, node *parent, int left_index, int key,
node *right);
node *insert_into_node_after_splitting(node *root, node *parent, int left_index,
int key, node *right);
node *insert_into_parent(node *root, node *left, int key, node *right);
node *insert_into_new_root(node *left, int key, node *right);
node *start_new_tree(int key, record *pointer);
node *insert(node *root, int key, int value);
//======================================================================================================================================================150
// Deletion
//======================================================================================================================================================150
int get_neighbor_index(node *n);
node *adjust_root(node *root);
node *coalesce_nodes(node *root, node *n, node *neighbor, int neighbor_index,
int k_prime);
node *redistribute_nodes(node *root, node *n, node *neighbor,
int neighbor_index, int k_prime_index, int k_prime);
node *delete_entry(node *root, node *n, int key, void *pointer);
node *deleteVal(node *root, int key);
//===============================================================================================================================================================================================================200
// HEADER
//===============================================================================================================================================================================================================200
// int main( int argc,
// char *argv []);
//===============================================================================================================================================================================================================200
// END
//===============================================================================================================================================================================================================200
// #endif
// # ifdef __cplusplus
// }
// # endif

View File

@ -1,54 +0,0 @@
//========================================================================================================================================================================================================200
// findK function
//========================================================================================================================================================================================================200
__global__ void
findK( long height,
knode *knodesD,
long knodes_elem,
record *recordsD,
long *currKnodeD,
long *offsetD,
int *keysD,
record *ansD)
{
// private thread IDs
int thid = threadIdx.x;
int bid = blockIdx.x;
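// Mapping (per the launch in the wrapper): one block per query key (bid),
// one thread per key slot of a B+ tree node (thid).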
// process tree levels
int i;
for(i = 0; i < height; i++){
// if value is between the two keys
if((knodesD[currKnodeD[bid]].keys[thid]) <= keysD[bid] && (knodesD[currKnodeD[bid]].keys[thid+1] > keysD[bid])){
// this conditional statement is inserted to avoid a crash caused by a bug in the original code:
// the "offset[bid]" value computed below, which addresses knodes[] in the next iteration, can go out of bounds and cause a segmentation fault
// more specifically, values saved into knodes->indices in the main function can lie outside the knodes array they are meant to address
if(knodesD[offsetD[bid]].indices[thid] < knodes_elem){
offsetD[bid] = knodesD[offsetD[bid]].indices[thid];
}
}
__syncthreads();
// set for next tree level
if(thid==0){
currKnodeD[bid] = offsetD[bid];
}
__syncthreads();
}
// At this point, we have a candidate leaf node which may contain
// the target record. Check each key to hopefully find the record
if(knodesD[currKnodeD[bid]].keys[thid] == keysD[bid]){
ansD[bid].value = recordsD[knodesD[currKnodeD[bid]].indices[thid]].value;
}
}
//========================================================================================================================================================================================================200
// End
//========================================================================================================================================================================================================200

View File

@ -1,70 +0,0 @@
//========================================================================================================================================================================================================200
// findRangeK function
//========================================================================================================================================================================================================200
__global__ void
findRangeK( long height,
knode *knodesD,
long knodes_elem,
long *currKnodeD,
long *offsetD,
long *lastKnodeD,
long *offset_2D,
int *startD,
int *endD,
int *RecstartD,
int *ReclenD)
{
// private thread IDs
int thid = threadIdx.x;
int bid = blockIdx.x;
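// Mapping: one block per range query (bid); each thread examines one key slot
// of the current node while descending toward the start and end keys.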
// process tree levels
int i;
for(i = 0; i < height; i++){
if((knodesD[currKnodeD[bid]].keys[thid] <= startD[bid]) && (knodesD[currKnodeD[bid]].keys[thid+1] > startD[bid])){
// this conditional statement is inserted to avoid a crash caused by a bug in the original code:
// the "offset[bid]" value computed below, which later addresses part of knodes, can go out of bounds and cause a segmentation fault
// more specifically, values saved into knodes->indices in the main function can lie outside the knodes array they are meant to address
if(knodesD[currKnodeD[bid]].indices[thid] < knodes_elem){
offsetD[bid] = knodesD[currKnodeD[bid]].indices[thid];
}
}
if((knodesD[lastKnodeD[bid]].keys[thid] <= endD[bid]) && (knodesD[lastKnodeD[bid]].keys[thid+1] > endD[bid])){
// this conditional statement is inserted to avoid a crash caused by a bug in the original code:
// the "offset_2[bid]" value computed below, which later addresses part of knodes, can go out of bounds and cause a segmentation fault
// more specifically, values saved into knodes->indices in the main function can lie outside the knodes array they are meant to address
if(knodesD[lastKnodeD[bid]].indices[thid] < knodes_elem){
offset_2D[bid] = knodesD[lastKnodeD[bid]].indices[thid];
}
}
__syncthreads();
// set for next tree level
if(thid==0){
currKnodeD[bid] = offsetD[bid];
lastKnodeD[bid] = offset_2D[bid];
}
__syncthreads();
}
// Find the index of the starting record
if(knodesD[currKnodeD[bid]].keys[thid] == startD[bid]){
RecstartD[bid] = knodesD[currKnodeD[bid]].indices[thid];
}
__syncthreads();
// Find the index of the ending record
if(knodesD[lastKnodeD[bid]].keys[thid] == endD[bid]){
ReclenD[bid] = knodesD[lastKnodeD[bid]].indices[thid] - RecstartD[bid]+1;
}
}
//========================================================================================================================================================================================================200
// End
//========================================================================================================================================================================================================200

View File

@ -1,292 +0,0 @@
#ifdef __cplusplus
extern "C" {
#endif
//========================================================================================================================================================================================================200
// DEFINE/INCLUDE
//========================================================================================================================================================================================================200
//======================================================================================================================================================150
// COMMON
//======================================================================================================================================================150
#include "../common.h" // (in main program directory) needed to recognized input variables
//======================================================================================================================================================150
// UTILITIES
//======================================================================================================================================================150
#include "../util/cuda/cuda.h" // (in path specified to compiler) needed by for device functions
#include "../util/timer/timer.h" // (in path specified to compiler) needed by timer
//======================================================================================================================================================150
// KERNEL
//======================================================================================================================================================150
#include "./kernel_gpu_cuda.cu" // (in current directory) GPU kernel, cannot include with header file because of complications with passing of constant memory variables
//======================================================================================================================================================150
// HEADER
//======================================================================================================================================================150
#include "./kernel_gpu_cuda_wrapper.h" // (in current directory)
//========================================================================================================================================================================================================200
// KERNEL_GPU_CUDA_WRAPPER FUNCTION
//========================================================================================================================================================================================================200
void
kernel_gpu_cuda_wrapper(record *records,
long records_mem,
knode *knodes,
long knodes_elem,
long knodes_mem,
int order,
long maxheight,
int count,
long *currKnode,
long *offset,
int *keys,
record *ans)
{
//======================================================================================================================================================150
// CPU VARIABLES
//======================================================================================================================================================150
// timer
long long time0;
long long time1;
long long time2;
long long time3;
long long time4;
long long time5;
long long time6;
time0 = get_time();
//======================================================================================================================================================150
// GPU SETUP
//======================================================================================================================================================150
//====================================================================================================100
// INITIAL DRIVER OVERHEAD
//====================================================================================================100
cudaThreadSynchronize();
//====================================================================================================100
// EXECUTION PARAMETERS
//====================================================================================================100
int numBlocks;
numBlocks = count; // max # of blocks can be 65,535
int threadsPerBlock;
threadsPerBlock = order < 1024 ? order : 1024;
printf("# of blocks = %d, # of threads/block = %d (ensure that device can handle)\n", numBlocks, threadsPerBlock);
time1 = get_time();
//======================================================================================================================================================150
// GPU MEMORY (MALLOC)
//======================================================================================================================================================150
//====================================================================================================100
// DEVICE IN
//====================================================================================================100
//==================================================50
// recordsD
//==================================================50
record *recordsD;
cudaMalloc((void**)&recordsD, records_mem);
checkCUDAError("cudaMalloc recordsD");
//==================================================50
// knodesD
//==================================================50
knode *knodesD;
cudaMalloc((void**)&knodesD, knodes_mem);
checkCUDAError("cudaMalloc recordsD");
//==================================================50
// currKnodeD
//==================================================50
long *currKnodeD;
cudaMalloc((void**)&currKnodeD, count*sizeof(long));
checkCUDAError("cudaMalloc currKnodeD");
//==================================================50
// offsetD
//==================================================50
long *offsetD;
cudaMalloc((void**)&offsetD, count*sizeof(long));
checkCUDAError("cudaMalloc offsetD");
//==================================================50
// keysD
//==================================================50
int *keysD;
cudaMalloc((void**)&keysD, count*sizeof(int));
checkCUDAError("cudaMalloc keysD");
//====================================================================================================100
// DEVICE IN/OUT
//====================================================================================================100
//==================================================50
// ansD
//==================================================50
record *ansD;
cudaMalloc((void**)&ansD, count*sizeof(record));
checkCUDAError("cudaMalloc ansD");
time2 = get_time();
//======================================================================================================================================================150
// GPU MEMORY COPY
//======================================================================================================================================================150
//====================================================================================================100
// GPU MEMORY (MALLOC) COPY IN
//====================================================================================================100
//==================================================50
// recordsD
//==================================================50
cudaMemcpy(recordsD, records, records_mem, cudaMemcpyHostToDevice);
checkCUDAError("cudaMalloc cudaMemcpy memD");
//==================================================50
// knodesD
//==================================================50
cudaMemcpy(knodesD, knodes, knodes_mem, cudaMemcpyHostToDevice);
checkCUDAError("cudaMalloc cudaMemcpy memD");
//==================================================50
// currKnodeD
//==================================================50
cudaMemcpy(currKnodeD, currKnode, count*sizeof(long), cudaMemcpyHostToDevice);
checkCUDAError("cudaMalloc cudaMemcpy currKnodeD");
//==================================================50
// offsetD
//==================================================50
cudaMemcpy(offsetD, offset, count*sizeof(long), cudaMemcpyHostToDevice);
checkCUDAError("cudaMalloc cudaMemcpy offsetD");
//==================================================50
// keysD
//==================================================50
cudaMemcpy(keysD, keys, count*sizeof(int), cudaMemcpyHostToDevice);
checkCUDAError("cudaMalloc cudaMemcpy keysD");
//====================================================================================================100
// DEVICE IN/OUT
//====================================================================================================100
//==================================================50
// ansD
//==================================================50
cudaMemcpy(ansD, ans, count*sizeof(record), cudaMemcpyHostToDevice);
checkCUDAError("cudaMalloc cudaMemcpy ansD");
time3 = get_time();
//======================================================================================================================================================150
// findK kernel
//======================================================================================================================================================150
findK<<<numBlocks, threadsPerBlock>>>( maxheight,
knodesD,
knodes_elem,
recordsD,
currKnodeD,
offsetD,
keysD,
ansD);
cudaThreadSynchronize();
checkCUDAError("findK");
time4 = get_time();
//======================================================================================================================================================150
// GPU MEMORY COPY (CONTD.)
//======================================================================================================================================================150
//====================================================================================================100
// DEVICE IN/OUT
//====================================================================================================100
//==================================================50
// ansD
//==================================================50
cudaMemcpy(ans, ansD, count*sizeof(record), cudaMemcpyDeviceToHost);
checkCUDAError("cudaMemcpy ansD");
time5 = get_time();
//======================================================================================================================================================150
// GPU MEMORY DEALLOCATION
//======================================================================================================================================================150
cudaFree(recordsD);
cudaFree(knodesD);
cudaFree(currKnodeD);
cudaFree(offsetD);
cudaFree(keysD);
cudaFree(ansD);
time6 = get_time();
//======================================================================================================================================================150
// DISPLAY TIMING
//======================================================================================================================================================150
printf("Time spent in different stages of GPU_CUDA KERNEL:\n");
printf("%15.12f s, %15.12f % : GPU: SET DEVICE / DRIVER INIT\n", (float) (time1-time0) / 1000000, (float) (time1-time0) / (float) (time6-time0) * 100);
printf("%15.12f s, %15.12f % : GPU MEM: ALO\n", (float) (time2-time1) / 1000000, (float) (time2-time1) / (float) (time6-time0) * 100);
printf("%15.12f s, %15.12f % : GPU MEM: COPY IN\n", (float) (time3-time2) / 1000000, (float) (time3-time2) / (float) (time6-time0) * 100);
printf("%15.12f s, %15.12f % : GPU: KERNEL\n", (float) (time4-time3) / 1000000, (float) (time4-time3) / (float) (time6-time0) * 100);
printf("%15.12f s, %15.12f % : GPU MEM: COPY OUT\n", (float) (time5-time4) / 1000000, (float) (time5-time4) / (float) (time6-time0) * 100);
printf("%15.12f s, %15.12f % : GPU MEM: FRE\n", (float) (time6-time5) / 1000000, (float) (time6-time5) / (float) (time6-time0) * 100);
printf("Total time:\n");
printf("%.12f s\n", (float) (time6-time0) / 1000000);
//========================================================================================================================================================================================================200
// End
//========================================================================================================================================================================================================200
}
//========================================================================================================================================================================================================200
// END
//========================================================================================================================================================================================================200
#ifdef __cplusplus
}
#endif

View File

@ -1,23 +0,0 @@
#ifdef __cplusplus
extern "C" {
#endif
//========================================================================================================================================================================================================200
// KERNEL_GPU_CUDA_WRAPPER HEADER
//========================================================================================================================================================================================================200
void kernel_gpu_cuda_wrapper(record *records, long records_mem, knode *knodes,
long knodes_elem, long knodes_mem,
int order, long maxheight, int count,
long *currKnode, long *offset, int *keys,
record *ans);
//========================================================================================================================================================================================================200
// End
//========================================================================================================================================================================================================200
#ifdef __cplusplus
}
#endif

View File

@ -1,347 +0,0 @@
#ifdef __cplusplus
extern "C" {
#endif
//========================================================================================================================================================================================================200
// INCLUDE
//========================================================================================================================================================================================================200
//======================================================================================================================================================150
// COMMON
//======================================================================================================================================================150
#include "../common.h" // (in the main program folder) needed to recognized input parameters
//======================================================================================================================================================150
// UTILITIES
//======================================================================================================================================================150
#include "../util/cuda/cuda.h" // (in library path specified to compiler) needed by for device functions
#include "../util/timer/timer.h" // (in library path specified to compiler) needed by timer
//======================================================================================================================================================150
// KERNEL
//======================================================================================================================================================150
#include "./kernel_gpu_cuda_2.cu" // (in the current directory) GPU kernel, cannot include with header file because of complications with passing of constant memory variables
//======================================================================================================================================================150
// HEADER
//======================================================================================================================================================150
#include "./kernel_gpu_cuda_wrapper_2.h" // (in the current directory)
//========================================================================================================================================================================================================200
// FUNCTION
//========================================================================================================================================================================================================200
void
kernel_gpu_cuda_wrapper_2( knode *knodes,
long knodes_elem,
long knodes_mem,
int order,
long maxheight,
int count,
long *currKnode,
long *offset,
long *lastKnode,
long *offset_2,
int *start,
int *end,
int *recstart,
int *reclength)
{
//======================================================================================================================================================150
// CPU VARIABLES
//======================================================================================================================================================150
// timer
long long time0;
long long time1;
long long time2;
long long time3;
long long time4;
long long time5;
long long time6;
time0 = get_time();
//======================================================================================================================================================150
// GPU SETUP
//======================================================================================================================================================150
//====================================================================================================100
// INITIAL DRIVER OVERHEAD
//====================================================================================================100
cudaThreadSynchronize();
//====================================================================================================100
// EXECUTION PARAMETERS
//====================================================================================================100
int numBlocks;
numBlocks = count;
int threadsPerBlock;
threadsPerBlock = order < 1024 ? order : 1024;
printf("# of blocks = %d, # of threads/block = %d (ensure that device can handle)\n", numBlocks, threadsPerBlock);
time1 = get_time();
//======================================================================================================================================================150
// GPU MEMORY MALLOC
//======================================================================================================================================================150
//====================================================================================================100
// DEVICE IN
//====================================================================================================100
//==================================================50
// knodesD
//==================================================50
knode *knodesD;
cudaMalloc((void**)&knodesD, knodes_mem);
checkCUDAError("cudaMalloc recordsD");
//==================================================50
// currKnodeD
//==================================================50
long *currKnodeD;
cudaMalloc((void**)&currKnodeD, count*sizeof(long));
checkCUDAError("cudaMalloc currKnodeD");
//==================================================50
// offsetD
//==================================================50
long *offsetD;
cudaMalloc((void**)&offsetD, count*sizeof(long));
checkCUDAError("cudaMalloc offsetD");
//==================================================50
// lastKnodeD
//==================================================50
long *lastKnodeD;
cudaMalloc((void**)&lastKnodeD, count*sizeof(long));
checkCUDAError("cudaMalloc lastKnodeD");
//==================================================50
// offset_2D
//==================================================50
long *offset_2D;
cudaMalloc((void**)&offset_2D, count*sizeof(long));
checkCUDAError("cudaMalloc offset_2D");
//==================================================50
// startD
//==================================================50
int *startD;
cudaMalloc((void**)&startD, count*sizeof(int));
checkCUDAError("cudaMalloc startD");
//==================================================50
// endD
//==================================================50
int *endD;
cudaMalloc((void**)&endD, count*sizeof(int));
checkCUDAError("cudaMalloc endD");
//====================================================================================================100
// DEVICE IN/OUT
//====================================================================================================100
//==================================================50
// ansDStart
//==================================================50
int *ansDStart;
cudaMalloc((void**)&ansDStart, count*sizeof(int));
checkCUDAError("cudaMalloc ansDStart");
//==================================================50
// ansDLength
//==================================================50
int *ansDLength;
cudaMalloc((void**)&ansDLength, count*sizeof(int));
checkCUDAError("cudaMalloc ansDLength");
time2 = get_time();
//======================================================================================================================================================150
// GPU MEMORY COPY
//======================================================================================================================================================150
//====================================================================================================100
// DEVICE IN
//====================================================================================================100
//==================================================50
// knodesD
//==================================================50
cudaMemcpy(knodesD, knodes, knodes_mem, cudaMemcpyHostToDevice);
checkCUDAError("cudaMalloc cudaMemcpy memD");
//==================================================50
// currKnodeD
//==================================================50
cudaMemcpy(currKnodeD, currKnode, count*sizeof(long), cudaMemcpyHostToDevice);
checkCUDAError("cudaMalloc cudaMemcpy currKnodeD");
//==================================================50
// offsetD
//==================================================50
cudaMemcpy(offsetD, offset, count*sizeof(long), cudaMemcpyHostToDevice);
checkCUDAError("cudaMalloc cudaMemcpy offsetD");
//==================================================50
// lastKnodeD
//==================================================50
cudaMemcpy(lastKnodeD, lastKnode, count*sizeof(long), cudaMemcpyHostToDevice);
checkCUDAError("cudaMalloc cudaMemcpy lastKnodeD");
//==================================================50
// offset_2D
//==================================================50
cudaMemcpy(offset_2D, offset_2, count*sizeof(long), cudaMemcpyHostToDevice);
checkCUDAError("cudaMalloc cudaMemcpy offset_2D");
//==================================================50
// startD
//==================================================50
cudaMemcpy(startD, start, count*sizeof(int), cudaMemcpyHostToDevice);
checkCUDAError("cudaMemcpy startD");
//==================================================50
// endD
//==================================================50
cudaMemcpy(endD, end, count*sizeof(int), cudaMemcpyHostToDevice);
checkCUDAError("cudaMemcpy endD");
//====================================================================================================100
// DEVICE IN/OUT
//====================================================================================================100
//==================================================50
// ansDStart
//==================================================50
cudaMemcpy(ansDStart, recstart, count*sizeof(int), cudaMemcpyHostToDevice);
checkCUDAError("cudaMemcpy ansDStart");
//==================================================50
// ansDLength
//==================================================50
cudaMemcpy(ansDLength, reclength, count*sizeof(int), cudaMemcpyHostToDevice);
checkCUDAError("cudaMemcpy ansDLength");
time3 = get_time();
//======================================================================================================================================================150
// KERNEL
//======================================================================================================================================================150
// [GPU] findRangeK kernel
findRangeK<<<numBlocks, threadsPerBlock>>>( maxheight,
knodesD,
knodes_elem,
currKnodeD,
offsetD,
lastKnodeD,
offset_2D,
startD,
endD,
ansDStart,
ansDLength);
cudaThreadSynchronize();
checkCUDAError("findRangeK");
time4 = get_time();
//======================================================================================================================================================150
// GPU MEMORY COPY (CONTD.)
//======================================================================================================================================================150
//====================================================================================================100
// DEVICE IN/OUT
//====================================================================================================100
//==================================================50
// ansDStart
//==================================================50
cudaMemcpy(recstart, ansDStart, count*sizeof(int), cudaMemcpyDeviceToHost);
checkCUDAError("cudaMemcpy ansDStart");
//==================================================50
// ansDLength
//==================================================50
cudaMemcpy(reclength, ansDLength, count*sizeof(int), cudaMemcpyDeviceToHost);
checkCUDAError("cudaMemcpy ansDLength");
time5 = get_time();
//======================================================================================================================================================150
// GPU MEMORY DEALLOCATION
//======================================================================================================================================================150
cudaFree(knodesD);
cudaFree(currKnodeD);
cudaFree(offsetD);
cudaFree(lastKnodeD);
cudaFree(offset_2D);
cudaFree(startD);
cudaFree(endD);
cudaFree(ansDStart);
cudaFree(ansDLength);
time6 = get_time();
//======================================================================================================================================================150
// DISPLAY TIMING
//======================================================================================================================================================150
printf("Time spent in different stages of GPU_CUDA KERNEL:\n");
printf("%15.12f s, %15.12f % : GPU: SET DEVICE / DRIVER INIT\n", (float) (time1-time0) / 1000000, (float) (time1-time0) / (float) (time6-time0) * 100);
printf("%15.12f s, %15.12f % : GPU MEM: ALO\n", (float) (time2-time1) / 1000000, (float) (time2-time1) / (float) (time6-time0) * 100);
printf("%15.12f s, %15.12f % : GPU MEM: COPY IN\n", (float) (time3-time2) / 1000000, (float) (time3-time2) / (float) (time6-time0) * 100);
printf("%15.12f s, %15.12f % : GPU: KERNEL\n", (float) (time4-time3) / 1000000, (float) (time4-time3) / (float) (time6-time0) * 100);
printf("%15.12f s, %15.12f % : GPU MEM: COPY OUT\n", (float) (time5-time4) / 1000000, (float) (time5-time4) / (float) (time6-time0) * 100);
printf("%15.12f s, %15.12f % : GPU MEM: FRE\n", (float) (time6-time5) / 1000000, (float) (time6-time5) / (float) (time6-time0) * 100);
printf("Total time:\n");
printf("%.12f s\n", (float) (time6-time0) / 1000000);
}
//========================================================================================================================================================================================================200
// END
//========================================================================================================================================================================================================200
#ifdef __cplusplus
}
#endif


@ -1,23 +0,0 @@
#ifdef __cplusplus
extern "C" {
#endif
//========================================================================================================================================================================================================200
// KERNEL_GPU_CUDA_WRAPPER HEADER
//========================================================================================================================================================================================================200
void kernel_gpu_cuda_wrapper_2(knode *knodes, long knodes_elem, long knodes_mem,
int order, long maxheight, int count,
long *currKnode, long *offset, long *lastKnode,
long *offset_2, int *start, int *end,
int *recstart, int *reclength);
//========================================================================================================================================================================================================200
// End
//========================================================================================================================================================================================================200
#ifdef __cplusplus
}
#endif
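For reference, a hedged sketch of how host code might drive this wrapper once the B+ tree and the per-query bounds exist; the driver function, its name, and the allocation pattern are assumptions for illustration only, not taken from the deleted host sources (assumes <stdlib.h> and the benchmark's b+tree headers are included).

// Hypothetical driver; knodes, start, end and the scalar parameters are
// assumed to come from the existing B+ tree construction / command parsing.
void run_range_queries(knode *knodes, long knodes_elem, long knodes_mem,
                       int order, long maxheight, int count,
                       int *start, int *end) {
  long *currKnode = (long *)calloc(count, sizeof(long));
  long *offset = (long *)calloc(count, sizeof(long));
  long *lastKnode = (long *)calloc(count, sizeof(long));
  long *offset_2 = (long *)calloc(count, sizeof(long));
  int *recstart = (int *)malloc(count * sizeof(int));
  int *reclength = (int *)malloc(count * sizeof(int));
  kernel_gpu_cuda_wrapper_2(knodes, knodes_elem, knodes_mem, order, maxheight,
                            count, currKnode, offset, lastKnode, offset_2,
                            start, end, recstart, reclength);
  free(currKnode); free(offset); free(lastKnode); free(offset_2);
  free(recstart); free(reclength);
}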


@ -1,332 +0,0 @@
; ModuleID = 'kernel_gpu_cuda_wrapper-cuda-nvptx64-nvidia-cuda-sm_61.bc'
source_filename = "kernel/kernel_gpu_cuda_wrapper.cu"
target datalayout = "e-i64:64-i128:128-v16:16-v32:32-n16:32:64"
target triple = "nvptx64-nvidia-cuda"
%struct.__cuda_builtin_threadIdx_t = type { i8 }
%struct.__cuda_builtin_blockIdx_t = type { i8 }
%struct.cudaFuncAttributes = type { i64, i64, i64, i32, i32, i32, i32, i32, i32, i32 }
%struct.knode = type { i32, [257 x i32], [257 x i32], i8, i32 }
%struct.record = type { i32 }
$_ZN26__cuda_builtin_threadIdx_t17__fetch_builtin_xEv = comdat any
$_ZN25__cuda_builtin_blockIdx_t17__fetch_builtin_xEv = comdat any
@threadIdx = extern_weak dso_local addrspace(1) global %struct.__cuda_builtin_threadIdx_t, align 1
@blockIdx = extern_weak dso_local addrspace(1) global %struct.__cuda_builtin_blockIdx_t, align 1
; Function Attrs: convergent noinline nounwind optnone
define weak dso_local i32 @cudaMalloc(i8** %p, i64 %s) #0 {
entry:
%p.addr = alloca i8**, align 8
%s.addr = alloca i64, align 8
store i8** %p, i8*** %p.addr, align 8
store i64 %s, i64* %s.addr, align 8
ret i32 999
}
; Function Attrs: convergent noinline nounwind optnone
define weak dso_local i32 @cudaFuncGetAttributes(%struct.cudaFuncAttributes* %p, i8* %c) #0 {
entry:
%p.addr = alloca %struct.cudaFuncAttributes*, align 8
%c.addr = alloca i8*, align 8
store %struct.cudaFuncAttributes* %p, %struct.cudaFuncAttributes** %p.addr, align 8
store i8* %c, i8** %c.addr, align 8
ret i32 999
}
; Function Attrs: convergent noinline nounwind optnone
define weak dso_local i32 @cudaDeviceGetAttribute(i32* %value, i32 %attr, i32 %device) #0 {
entry:
%value.addr = alloca i32*, align 8
%attr.addr = alloca i32, align 4
%device.addr = alloca i32, align 4
store i32* %value, i32** %value.addr, align 8
store i32 %attr, i32* %attr.addr, align 4
store i32 %device, i32* %device.addr, align 4
ret i32 999
}
; Function Attrs: convergent noinline nounwind optnone
define weak dso_local i32 @cudaGetDevice(i32* %device) #0 {
entry:
%device.addr = alloca i32*, align 8
store i32* %device, i32** %device.addr, align 8
ret i32 999
}
; Function Attrs: convergent noinline nounwind optnone
define weak dso_local i32 @cudaOccupancyMaxActiveBlocksPerMultiprocessor(i32* %numBlocks, i8* %func, i32 %blockSize, i64 %dynamicSmemSize) #0 {
entry:
%numBlocks.addr = alloca i32*, align 8
%func.addr = alloca i8*, align 8
%blockSize.addr = alloca i32, align 4
%dynamicSmemSize.addr = alloca i64, align 8
store i32* %numBlocks, i32** %numBlocks.addr, align 8
store i8* %func, i8** %func.addr, align 8
store i32 %blockSize, i32* %blockSize.addr, align 4
store i64 %dynamicSmemSize, i64* %dynamicSmemSize.addr, align 8
ret i32 999
}
; Function Attrs: convergent noinline nounwind optnone
define weak dso_local i32 @cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags(i32* %numBlocks, i8* %func, i32 %blockSize, i64 %dynamicSmemSize, i32 %flags) #0 {
entry:
%numBlocks.addr = alloca i32*, align 8
%func.addr = alloca i8*, align 8
%blockSize.addr = alloca i32, align 4
%dynamicSmemSize.addr = alloca i64, align 8
%flags.addr = alloca i32, align 4
store i32* %numBlocks, i32** %numBlocks.addr, align 8
store i8* %func, i8** %func.addr, align 8
store i32 %blockSize, i32* %blockSize.addr, align 4
store i64 %dynamicSmemSize, i64* %dynamicSmemSize.addr, align 8
store i32 %flags, i32* %flags.addr, align 4
ret i32 999
}
; Function Attrs: convergent noinline nounwind optnone
define dso_local void @findK(i64 %height, %struct.knode* %knodesD, i64 %knodes_elem, %struct.record* %recordsD, i64* %currKnodeD, i64* %offsetD, i32* %keysD, %struct.record* %ansD) #0 {
entry:
%height.addr = alloca i64, align 8
%knodesD.addr = alloca %struct.knode*, align 8
%knodes_elem.addr = alloca i64, align 8
%recordsD.addr = alloca %struct.record*, align 8
%currKnodeD.addr = alloca i64*, align 8
%offsetD.addr = alloca i64*, align 8
%keysD.addr = alloca i32*, align 8
%ansD.addr = alloca %struct.record*, align 8
%thid = alloca i32, align 4
%bid = alloca i32, align 4
%i = alloca i32, align 4
store i64 %height, i64* %height.addr, align 8
store %struct.knode* %knodesD, %struct.knode** %knodesD.addr, align 8
store i64 %knodes_elem, i64* %knodes_elem.addr, align 8
store %struct.record* %recordsD, %struct.record** %recordsD.addr, align 8
store i64* %currKnodeD, i64** %currKnodeD.addr, align 8
store i64* %offsetD, i64** %offsetD.addr, align 8
store i32* %keysD, i32** %keysD.addr, align 8
store %struct.record* %ansD, %struct.record** %ansD.addr, align 8
%call = call i32 @_ZN26__cuda_builtin_threadIdx_t17__fetch_builtin_xEv() #2
store i32 %call, i32* %thid, align 4
%call1 = call i32 @_ZN25__cuda_builtin_blockIdx_t17__fetch_builtin_xEv() #2
store i32 %call1, i32* %bid, align 4
store i32 0, i32* %i, align 4
br label %for.cond
for.cond: ; preds = %for.inc, %entry
%0 = load i32, i32* %i, align 4
%conv = sext i32 %0 to i64
%1 = load i64, i64* %height.addr, align 8
%cmp = icmp slt i64 %conv, %1
br i1 %cmp, label %for.body, label %for.end
for.body: ; preds = %for.cond
%2 = load %struct.knode*, %struct.knode** %knodesD.addr, align 8
%3 = load i64*, i64** %currKnodeD.addr, align 8
%4 = load i32, i32* %bid, align 4
%idxprom = sext i32 %4 to i64
%arrayidx = getelementptr inbounds i64, i64* %3, i64 %idxprom
%5 = load i64, i64* %arrayidx, align 8
%arrayidx2 = getelementptr inbounds %struct.knode, %struct.knode* %2, i64 %5
%keys = getelementptr inbounds %struct.knode, %struct.knode* %arrayidx2, i32 0, i32 2
%6 = load i32, i32* %thid, align 4
%idxprom3 = sext i32 %6 to i64
%arrayidx4 = getelementptr inbounds [257 x i32], [257 x i32]* %keys, i64 0, i64 %idxprom3
%7 = load i32, i32* %arrayidx4, align 4
%8 = load i32*, i32** %keysD.addr, align 8
%9 = load i32, i32* %bid, align 4
%idxprom5 = sext i32 %9 to i64
%arrayidx6 = getelementptr inbounds i32, i32* %8, i64 %idxprom5
%10 = load i32, i32* %arrayidx6, align 4
%cmp7 = icmp sle i32 %7, %10
br i1 %cmp7, label %land.lhs.true, label %if.end34
land.lhs.true: ; preds = %for.body
%11 = load %struct.knode*, %struct.knode** %knodesD.addr, align 8
%12 = load i64*, i64** %currKnodeD.addr, align 8
%13 = load i32, i32* %bid, align 4
%idxprom8 = sext i32 %13 to i64
%arrayidx9 = getelementptr inbounds i64, i64* %12, i64 %idxprom8
%14 = load i64, i64* %arrayidx9, align 8
%arrayidx10 = getelementptr inbounds %struct.knode, %struct.knode* %11, i64 %14
%keys11 = getelementptr inbounds %struct.knode, %struct.knode* %arrayidx10, i32 0, i32 2
%15 = load i32, i32* %thid, align 4
%add = add nsw i32 %15, 1
%idxprom12 = sext i32 %add to i64
%arrayidx13 = getelementptr inbounds [257 x i32], [257 x i32]* %keys11, i64 0, i64 %idxprom12
%16 = load i32, i32* %arrayidx13, align 4
%17 = load i32*, i32** %keysD.addr, align 8
%18 = load i32, i32* %bid, align 4
%idxprom14 = sext i32 %18 to i64
%arrayidx15 = getelementptr inbounds i32, i32* %17, i64 %idxprom14
%19 = load i32, i32* %arrayidx15, align 4
%cmp16 = icmp sgt i32 %16, %19
br i1 %cmp16, label %if.then, label %if.end34
if.then: ; preds = %land.lhs.true
%20 = load %struct.knode*, %struct.knode** %knodesD.addr, align 8
%21 = load i64*, i64** %offsetD.addr, align 8
%22 = load i32, i32* %bid, align 4
%idxprom17 = sext i32 %22 to i64
%arrayidx18 = getelementptr inbounds i64, i64* %21, i64 %idxprom17
%23 = load i64, i64* %arrayidx18, align 8
%arrayidx19 = getelementptr inbounds %struct.knode, %struct.knode* %20, i64 %23
%indices = getelementptr inbounds %struct.knode, %struct.knode* %arrayidx19, i32 0, i32 1
%24 = load i32, i32* %thid, align 4
%idxprom20 = sext i32 %24 to i64
%arrayidx21 = getelementptr inbounds [257 x i32], [257 x i32]* %indices, i64 0, i64 %idxprom20
%25 = load i32, i32* %arrayidx21, align 4
%conv22 = sext i32 %25 to i64
%26 = load i64, i64* %knodes_elem.addr, align 8
%cmp23 = icmp slt i64 %conv22, %26
br i1 %cmp23, label %if.then24, label %if.end
if.then24: ; preds = %if.then
%27 = load %struct.knode*, %struct.knode** %knodesD.addr, align 8
%28 = load i64*, i64** %offsetD.addr, align 8
%29 = load i32, i32* %bid, align 4
%idxprom25 = sext i32 %29 to i64
%arrayidx26 = getelementptr inbounds i64, i64* %28, i64 %idxprom25
%30 = load i64, i64* %arrayidx26, align 8
%arrayidx27 = getelementptr inbounds %struct.knode, %struct.knode* %27, i64 %30
%indices28 = getelementptr inbounds %struct.knode, %struct.knode* %arrayidx27, i32 0, i32 1
%31 = load i32, i32* %thid, align 4
%idxprom29 = sext i32 %31 to i64
%arrayidx30 = getelementptr inbounds [257 x i32], [257 x i32]* %indices28, i64 0, i64 %idxprom29
%32 = load i32, i32* %arrayidx30, align 4
%conv31 = sext i32 %32 to i64
%33 = load i64*, i64** %offsetD.addr, align 8
%34 = load i32, i32* %bid, align 4
%idxprom32 = sext i32 %34 to i64
%arrayidx33 = getelementptr inbounds i64, i64* %33, i64 %idxprom32
store i64 %conv31, i64* %arrayidx33, align 8
br label %if.end
if.end: ; preds = %if.then24, %if.then
br label %if.end34
if.end34: ; preds = %if.end, %land.lhs.true, %for.body
call void @llvm.nvvm.barrier0()
%35 = load i32, i32* %thid, align 4
%cmp35 = icmp eq i32 %35, 0
br i1 %cmp35, label %if.then36, label %if.end41
if.then36: ; preds = %if.end34
%36 = load i64*, i64** %offsetD.addr, align 8
%37 = load i32, i32* %bid, align 4
%idxprom37 = sext i32 %37 to i64
%arrayidx38 = getelementptr inbounds i64, i64* %36, i64 %idxprom37
%38 = load i64, i64* %arrayidx38, align 8
%39 = load i64*, i64** %currKnodeD.addr, align 8
%40 = load i32, i32* %bid, align 4
%idxprom39 = sext i32 %40 to i64
%arrayidx40 = getelementptr inbounds i64, i64* %39, i64 %idxprom39
store i64 %38, i64* %arrayidx40, align 8
br label %if.end41
if.end41: ; preds = %if.then36, %if.end34
call void @llvm.nvvm.barrier0()
br label %for.inc
for.inc: ; preds = %if.end41
%41 = load i32, i32* %i, align 4
%inc = add nsw i32 %41, 1
store i32 %inc, i32* %i, align 4
br label %for.cond
for.end: ; preds = %for.cond
%42 = load %struct.knode*, %struct.knode** %knodesD.addr, align 8
%43 = load i64*, i64** %currKnodeD.addr, align 8
%44 = load i32, i32* %bid, align 4
%idxprom42 = sext i32 %44 to i64
%arrayidx43 = getelementptr inbounds i64, i64* %43, i64 %idxprom42
%45 = load i64, i64* %arrayidx43, align 8
%arrayidx44 = getelementptr inbounds %struct.knode, %struct.knode* %42, i64 %45
%keys45 = getelementptr inbounds %struct.knode, %struct.knode* %arrayidx44, i32 0, i32 2
%46 = load i32, i32* %thid, align 4
%idxprom46 = sext i32 %46 to i64
%arrayidx47 = getelementptr inbounds [257 x i32], [257 x i32]* %keys45, i64 0, i64 %idxprom46
%47 = load i32, i32* %arrayidx47, align 4
%48 = load i32*, i32** %keysD.addr, align 8
%49 = load i32, i32* %bid, align 4
%idxprom48 = sext i32 %49 to i64
%arrayidx49 = getelementptr inbounds i32, i32* %48, i64 %idxprom48
%50 = load i32, i32* %arrayidx49, align 4
%cmp50 = icmp eq i32 %47, %50
br i1 %cmp50, label %if.then51, label %if.end63
if.then51: ; preds = %for.end
%51 = load %struct.record*, %struct.record** %recordsD.addr, align 8
%52 = load %struct.knode*, %struct.knode** %knodesD.addr, align 8
%53 = load i64*, i64** %currKnodeD.addr, align 8
%54 = load i32, i32* %bid, align 4
%idxprom52 = sext i32 %54 to i64
%arrayidx53 = getelementptr inbounds i64, i64* %53, i64 %idxprom52
%55 = load i64, i64* %arrayidx53, align 8
%arrayidx54 = getelementptr inbounds %struct.knode, %struct.knode* %52, i64 %55
%indices55 = getelementptr inbounds %struct.knode, %struct.knode* %arrayidx54, i32 0, i32 1
%56 = load i32, i32* %thid, align 4
%idxprom56 = sext i32 %56 to i64
%arrayidx57 = getelementptr inbounds [257 x i32], [257 x i32]* %indices55, i64 0, i64 %idxprom56
%57 = load i32, i32* %arrayidx57, align 4
%idxprom58 = sext i32 %57 to i64
%arrayidx59 = getelementptr inbounds %struct.record, %struct.record* %51, i64 %idxprom58
%value = getelementptr inbounds %struct.record, %struct.record* %arrayidx59, i32 0, i32 0
%58 = load i32, i32* %value, align 4
%59 = load %struct.record*, %struct.record** %ansD.addr, align 8
%60 = load i32, i32* %bid, align 4
%idxprom60 = sext i32 %60 to i64
%arrayidx61 = getelementptr inbounds %struct.record, %struct.record* %59, i64 %idxprom60
%value62 = getelementptr inbounds %struct.record, %struct.record* %arrayidx61, i32 0, i32 0
store i32 %58, i32* %value62, align 4
br label %if.end63
if.end63: ; preds = %if.then51, %for.end
ret void
}
; Function Attrs: alwaysinline convergent nounwind
define linkonce_odr dso_local i32 @_ZN26__cuda_builtin_threadIdx_t17__fetch_builtin_xEv() #1 comdat align 2 {
entry:
%0 = call i32 @llvm.nvvm.read.ptx.sreg.tid.x()
ret i32 %0
}
; Function Attrs: alwaysinline convergent nounwind
define linkonce_odr dso_local i32 @_ZN25__cuda_builtin_blockIdx_t17__fetch_builtin_xEv() #1 comdat align 2 {
entry:
%0 = call i32 @llvm.nvvm.read.ptx.sreg.ctaid.x()
ret i32 %0
}
; Function Attrs: convergent nounwind
declare void @llvm.nvvm.barrier0() #2
; Function Attrs: nounwind readnone
declare i32 @llvm.nvvm.read.ptx.sreg.tid.x() #3
; Function Attrs: nounwind readnone
declare i32 @llvm.nvvm.read.ptx.sreg.ctaid.x() #3
attributes #0 = { convergent noinline nounwind optnone "correctly-rounded-divide-sqrt-fp-math"="false" "disable-tail-calls"="false" "frame-pointer"="all" "less-precise-fpmad"="false" "min-legal-vector-width"="0" "no-infs-fp-math"="false" "no-jump-tables"="false" "no-nans-fp-math"="false" "no-signed-zeros-fp-math"="false" "no-trapping-math"="false" "stack-protector-buffer-size"="8" "target-cpu"="sm_61" "target-features"="+ptx64,+sm_61" "unsafe-fp-math"="false" "use-soft-float"="false" }
attributes #1 = { alwaysinline convergent nounwind "correctly-rounded-divide-sqrt-fp-math"="false" "disable-tail-calls"="false" "frame-pointer"="all" "less-precise-fpmad"="false" "min-legal-vector-width"="0" "no-infs-fp-math"="false" "no-jump-tables"="false" "no-nans-fp-math"="false" "no-signed-zeros-fp-math"="false" "no-trapping-math"="false" "stack-protector-buffer-size"="8" "target-cpu"="sm_61" "target-features"="+ptx64,+sm_61" "unsafe-fp-math"="false" "use-soft-float"="false" }
attributes #2 = { convergent nounwind }
attributes #3 = { nounwind readnone }
!llvm.module.flags = !{!0, !1, !2}
!nvvm.annotations = !{!3, !4, !5, !4, !6, !6, !6, !6, !7, !7, !6}
!llvm.ident = !{!8}
!nvvmir.version = !{!9}
!0 = !{i32 2, !"SDK Version", [2 x i32] [i32 10, i32 1]}
!1 = !{i32 1, !"wchar_size", i32 4}
!2 = !{i32 4, !"nvvm-reflect-ftz", i32 0}
!3 = !{void (i64, %struct.knode*, i64, %struct.record*, i64*, i64*, i32*, %struct.record*)* @findK, !"kernel", i32 1}
!4 = !{null, !"align", i32 8}
!5 = !{null, !"align", i32 8, !"align", i32 65544, !"align", i32 131080}
!6 = !{null, !"align", i32 16}
!7 = !{null, !"align", i32 16, !"align", i32 65552, !"align", i32 131088}
!8 = !{!"clang version 10.0.1 (https://github.com/llvm/llvm-project.git ef32c611aa214dea855364efd7ba451ec5ec3f74)"}
!9 = !{i32 1, i32 4}

File diff suppressed because one or more lines are too long


@ -1,475 +0,0 @@
; ModuleID = 'kernel_gpu_cuda_wrapper_2-cuda-nvptx64-nvidia-cuda-sm_61.bc'
source_filename = "kernel/kernel_gpu_cuda_wrapper_2.cu"
target datalayout = "e-i64:64-i128:128-v16:16-v32:32-n16:32:64"
target triple = "nvptx64-nvidia-cuda"
%struct.__cuda_builtin_threadIdx_t = type { i8 }
%struct.__cuda_builtin_blockIdx_t = type { i8 }
%struct.cudaFuncAttributes = type { i64, i64, i64, i32, i32, i32, i32, i32, i32, i32 }
%struct.knode = type { i32, [257 x i32], [257 x i32], i8, i32 }
$_ZN26__cuda_builtin_threadIdx_t17__fetch_builtin_xEv = comdat any
$_ZN25__cuda_builtin_blockIdx_t17__fetch_builtin_xEv = comdat any
@threadIdx = extern_weak dso_local addrspace(1) global %struct.__cuda_builtin_threadIdx_t, align 1
@blockIdx = extern_weak dso_local addrspace(1) global %struct.__cuda_builtin_blockIdx_t, align 1
; Function Attrs: convergent noinline nounwind optnone
define weak dso_local i32 @cudaMalloc(i8** %p, i64 %s) #0 {
entry:
%p.addr = alloca i8**, align 8
%s.addr = alloca i64, align 8
store i8** %p, i8*** %p.addr, align 8
store i64 %s, i64* %s.addr, align 8
ret i32 999
}
; Function Attrs: convergent noinline nounwind optnone
define weak dso_local i32 @cudaFuncGetAttributes(%struct.cudaFuncAttributes* %p, i8* %c) #0 {
entry:
%p.addr = alloca %struct.cudaFuncAttributes*, align 8
%c.addr = alloca i8*, align 8
store %struct.cudaFuncAttributes* %p, %struct.cudaFuncAttributes** %p.addr, align 8
store i8* %c, i8** %c.addr, align 8
ret i32 999
}
; Function Attrs: convergent noinline nounwind optnone
define weak dso_local i32 @cudaDeviceGetAttribute(i32* %value, i32 %attr, i32 %device) #0 {
entry:
%value.addr = alloca i32*, align 8
%attr.addr = alloca i32, align 4
%device.addr = alloca i32, align 4
store i32* %value, i32** %value.addr, align 8
store i32 %attr, i32* %attr.addr, align 4
store i32 %device, i32* %device.addr, align 4
ret i32 999
}
; Function Attrs: convergent noinline nounwind optnone
define weak dso_local i32 @cudaGetDevice(i32* %device) #0 {
entry:
%device.addr = alloca i32*, align 8
store i32* %device, i32** %device.addr, align 8
ret i32 999
}
; Function Attrs: convergent noinline nounwind optnone
define weak dso_local i32 @cudaOccupancyMaxActiveBlocksPerMultiprocessor(i32* %numBlocks, i8* %func, i32 %blockSize, i64 %dynamicSmemSize) #0 {
entry:
%numBlocks.addr = alloca i32*, align 8
%func.addr = alloca i8*, align 8
%blockSize.addr = alloca i32, align 4
%dynamicSmemSize.addr = alloca i64, align 8
store i32* %numBlocks, i32** %numBlocks.addr, align 8
store i8* %func, i8** %func.addr, align 8
store i32 %blockSize, i32* %blockSize.addr, align 4
store i64 %dynamicSmemSize, i64* %dynamicSmemSize.addr, align 8
ret i32 999
}
; Function Attrs: convergent noinline nounwind optnone
define weak dso_local i32 @cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags(i32* %numBlocks, i8* %func, i32 %blockSize, i64 %dynamicSmemSize, i32 %flags) #0 {
entry:
%numBlocks.addr = alloca i32*, align 8
%func.addr = alloca i8*, align 8
%blockSize.addr = alloca i32, align 4
%dynamicSmemSize.addr = alloca i64, align 8
%flags.addr = alloca i32, align 4
store i32* %numBlocks, i32** %numBlocks.addr, align 8
store i8* %func, i8** %func.addr, align 8
store i32 %blockSize, i32* %blockSize.addr, align 4
store i64 %dynamicSmemSize, i64* %dynamicSmemSize.addr, align 8
store i32 %flags, i32* %flags.addr, align 4
ret i32 999
}
; Function Attrs: convergent noinline nounwind optnone
define dso_local void @findRangeK(i64 %height, %struct.knode* %knodesD, i64 %knodes_elem, i64* %currKnodeD, i64* %offsetD, i64* %lastKnodeD, i64* %offset_2D, i32* %startD, i32* %endD, i32* %RecstartD, i32* %ReclenD) #0 {
entry:
%height.addr = alloca i64, align 8
%knodesD.addr = alloca %struct.knode*, align 8
%knodes_elem.addr = alloca i64, align 8
%currKnodeD.addr = alloca i64*, align 8
%offsetD.addr = alloca i64*, align 8
%lastKnodeD.addr = alloca i64*, align 8
%offset_2D.addr = alloca i64*, align 8
%startD.addr = alloca i32*, align 8
%endD.addr = alloca i32*, align 8
%RecstartD.addr = alloca i32*, align 8
%ReclenD.addr = alloca i32*, align 8
%thid = alloca i32, align 4
%bid = alloca i32, align 4
%i = alloca i32, align 4
store i64 %height, i64* %height.addr, align 8
store %struct.knode* %knodesD, %struct.knode** %knodesD.addr, align 8
store i64 %knodes_elem, i64* %knodes_elem.addr, align 8
store i64* %currKnodeD, i64** %currKnodeD.addr, align 8
store i64* %offsetD, i64** %offsetD.addr, align 8
store i64* %lastKnodeD, i64** %lastKnodeD.addr, align 8
store i64* %offset_2D, i64** %offset_2D.addr, align 8
store i32* %startD, i32** %startD.addr, align 8
store i32* %endD, i32** %endD.addr, align 8
store i32* %RecstartD, i32** %RecstartD.addr, align 8
store i32* %ReclenD, i32** %ReclenD.addr, align 8
%call = call i32 @_ZN26__cuda_builtin_threadIdx_t17__fetch_builtin_xEv() #2
store i32 %call, i32* %thid, align 4
%call1 = call i32 @_ZN25__cuda_builtin_blockIdx_t17__fetch_builtin_xEv() #2
store i32 %call1, i32* %bid, align 4
store i32 0, i32* %i, align 4
br label %for.cond
for.cond: ; preds = %for.inc, %entry
%0 = load i32, i32* %i, align 4
%conv = sext i32 %0 to i64
%1 = load i64, i64* %height.addr, align 8
%cmp = icmp slt i64 %conv, %1
br i1 %cmp, label %for.body, label %for.end
for.body: ; preds = %for.cond
%2 = load %struct.knode*, %struct.knode** %knodesD.addr, align 8
%3 = load i64*, i64** %currKnodeD.addr, align 8
%4 = load i32, i32* %bid, align 4
%idxprom = sext i32 %4 to i64
%arrayidx = getelementptr inbounds i64, i64* %3, i64 %idxprom
%5 = load i64, i64* %arrayidx, align 8
%arrayidx2 = getelementptr inbounds %struct.knode, %struct.knode* %2, i64 %5
%keys = getelementptr inbounds %struct.knode, %struct.knode* %arrayidx2, i32 0, i32 2
%6 = load i32, i32* %thid, align 4
%idxprom3 = sext i32 %6 to i64
%arrayidx4 = getelementptr inbounds [257 x i32], [257 x i32]* %keys, i64 0, i64 %idxprom3
%7 = load i32, i32* %arrayidx4, align 4
%8 = load i32*, i32** %startD.addr, align 8
%9 = load i32, i32* %bid, align 4
%idxprom5 = sext i32 %9 to i64
%arrayidx6 = getelementptr inbounds i32, i32* %8, i64 %idxprom5
%10 = load i32, i32* %arrayidx6, align 4
%cmp7 = icmp sle i32 %7, %10
br i1 %cmp7, label %land.lhs.true, label %if.end34
land.lhs.true: ; preds = %for.body
%11 = load %struct.knode*, %struct.knode** %knodesD.addr, align 8
%12 = load i64*, i64** %currKnodeD.addr, align 8
%13 = load i32, i32* %bid, align 4
%idxprom8 = sext i32 %13 to i64
%arrayidx9 = getelementptr inbounds i64, i64* %12, i64 %idxprom8
%14 = load i64, i64* %arrayidx9, align 8
%arrayidx10 = getelementptr inbounds %struct.knode, %struct.knode* %11, i64 %14
%keys11 = getelementptr inbounds %struct.knode, %struct.knode* %arrayidx10, i32 0, i32 2
%15 = load i32, i32* %thid, align 4
%add = add nsw i32 %15, 1
%idxprom12 = sext i32 %add to i64
%arrayidx13 = getelementptr inbounds [257 x i32], [257 x i32]* %keys11, i64 0, i64 %idxprom12
%16 = load i32, i32* %arrayidx13, align 4
%17 = load i32*, i32** %startD.addr, align 8
%18 = load i32, i32* %bid, align 4
%idxprom14 = sext i32 %18 to i64
%arrayidx15 = getelementptr inbounds i32, i32* %17, i64 %idxprom14
%19 = load i32, i32* %arrayidx15, align 4
%cmp16 = icmp sgt i32 %16, %19
br i1 %cmp16, label %if.then, label %if.end34
if.then: ; preds = %land.lhs.true
%20 = load %struct.knode*, %struct.knode** %knodesD.addr, align 8
%21 = load i64*, i64** %currKnodeD.addr, align 8
%22 = load i32, i32* %bid, align 4
%idxprom17 = sext i32 %22 to i64
%arrayidx18 = getelementptr inbounds i64, i64* %21, i64 %idxprom17
%23 = load i64, i64* %arrayidx18, align 8
%arrayidx19 = getelementptr inbounds %struct.knode, %struct.knode* %20, i64 %23
%indices = getelementptr inbounds %struct.knode, %struct.knode* %arrayidx19, i32 0, i32 1
%24 = load i32, i32* %thid, align 4
%idxprom20 = sext i32 %24 to i64
%arrayidx21 = getelementptr inbounds [257 x i32], [257 x i32]* %indices, i64 0, i64 %idxprom20
%25 = load i32, i32* %arrayidx21, align 4
%conv22 = sext i32 %25 to i64
%26 = load i64, i64* %knodes_elem.addr, align 8
%cmp23 = icmp slt i64 %conv22, %26
br i1 %cmp23, label %if.then24, label %if.end
if.then24: ; preds = %if.then
%27 = load %struct.knode*, %struct.knode** %knodesD.addr, align 8
%28 = load i64*, i64** %currKnodeD.addr, align 8
%29 = load i32, i32* %bid, align 4
%idxprom25 = sext i32 %29 to i64
%arrayidx26 = getelementptr inbounds i64, i64* %28, i64 %idxprom25
%30 = load i64, i64* %arrayidx26, align 8
%arrayidx27 = getelementptr inbounds %struct.knode, %struct.knode* %27, i64 %30
%indices28 = getelementptr inbounds %struct.knode, %struct.knode* %arrayidx27, i32 0, i32 1
%31 = load i32, i32* %thid, align 4
%idxprom29 = sext i32 %31 to i64
%arrayidx30 = getelementptr inbounds [257 x i32], [257 x i32]* %indices28, i64 0, i64 %idxprom29
%32 = load i32, i32* %arrayidx30, align 4
%conv31 = sext i32 %32 to i64
%33 = load i64*, i64** %offsetD.addr, align 8
%34 = load i32, i32* %bid, align 4
%idxprom32 = sext i32 %34 to i64
%arrayidx33 = getelementptr inbounds i64, i64* %33, i64 %idxprom32
store i64 %conv31, i64* %arrayidx33, align 8
br label %if.end
if.end: ; preds = %if.then24, %if.then
br label %if.end34
if.end34: ; preds = %if.end, %land.lhs.true, %for.body
%35 = load %struct.knode*, %struct.knode** %knodesD.addr, align 8
%36 = load i64*, i64** %lastKnodeD.addr, align 8
%37 = load i32, i32* %bid, align 4
%idxprom35 = sext i32 %37 to i64
%arrayidx36 = getelementptr inbounds i64, i64* %36, i64 %idxprom35
%38 = load i64, i64* %arrayidx36, align 8
%arrayidx37 = getelementptr inbounds %struct.knode, %struct.knode* %35, i64 %38
%keys38 = getelementptr inbounds %struct.knode, %struct.knode* %arrayidx37, i32 0, i32 2
%39 = load i32, i32* %thid, align 4
%idxprom39 = sext i32 %39 to i64
%arrayidx40 = getelementptr inbounds [257 x i32], [257 x i32]* %keys38, i64 0, i64 %idxprom39
%40 = load i32, i32* %arrayidx40, align 4
%41 = load i32*, i32** %endD.addr, align 8
%42 = load i32, i32* %bid, align 4
%idxprom41 = sext i32 %42 to i64
%arrayidx42 = getelementptr inbounds i32, i32* %41, i64 %idxprom41
%43 = load i32, i32* %arrayidx42, align 4
%cmp43 = icmp sle i32 %40, %43
br i1 %cmp43, label %land.lhs.true44, label %if.end75
land.lhs.true44: ; preds = %if.end34
%44 = load %struct.knode*, %struct.knode** %knodesD.addr, align 8
%45 = load i64*, i64** %lastKnodeD.addr, align 8
%46 = load i32, i32* %bid, align 4
%idxprom45 = sext i32 %46 to i64
%arrayidx46 = getelementptr inbounds i64, i64* %45, i64 %idxprom45
%47 = load i64, i64* %arrayidx46, align 8
%arrayidx47 = getelementptr inbounds %struct.knode, %struct.knode* %44, i64 %47
%keys48 = getelementptr inbounds %struct.knode, %struct.knode* %arrayidx47, i32 0, i32 2
%48 = load i32, i32* %thid, align 4
%add49 = add nsw i32 %48, 1
%idxprom50 = sext i32 %add49 to i64
%arrayidx51 = getelementptr inbounds [257 x i32], [257 x i32]* %keys48, i64 0, i64 %idxprom50
%49 = load i32, i32* %arrayidx51, align 4
%50 = load i32*, i32** %endD.addr, align 8
%51 = load i32, i32* %bid, align 4
%idxprom52 = sext i32 %51 to i64
%arrayidx53 = getelementptr inbounds i32, i32* %50, i64 %idxprom52
%52 = load i32, i32* %arrayidx53, align 4
%cmp54 = icmp sgt i32 %49, %52
br i1 %cmp54, label %if.then55, label %if.end75
if.then55: ; preds = %land.lhs.true44
%53 = load %struct.knode*, %struct.knode** %knodesD.addr, align 8
%54 = load i64*, i64** %lastKnodeD.addr, align 8
%55 = load i32, i32* %bid, align 4
%idxprom56 = sext i32 %55 to i64
%arrayidx57 = getelementptr inbounds i64, i64* %54, i64 %idxprom56
%56 = load i64, i64* %arrayidx57, align 8
%arrayidx58 = getelementptr inbounds %struct.knode, %struct.knode* %53, i64 %56
%indices59 = getelementptr inbounds %struct.knode, %struct.knode* %arrayidx58, i32 0, i32 1
%57 = load i32, i32* %thid, align 4
%idxprom60 = sext i32 %57 to i64
%arrayidx61 = getelementptr inbounds [257 x i32], [257 x i32]* %indices59, i64 0, i64 %idxprom60
%58 = load i32, i32* %arrayidx61, align 4
%conv62 = sext i32 %58 to i64
%59 = load i64, i64* %knodes_elem.addr, align 8
%cmp63 = icmp slt i64 %conv62, %59
br i1 %cmp63, label %if.then64, label %if.end74
if.then64: ; preds = %if.then55
%60 = load %struct.knode*, %struct.knode** %knodesD.addr, align 8
%61 = load i64*, i64** %lastKnodeD.addr, align 8
%62 = load i32, i32* %bid, align 4
%idxprom65 = sext i32 %62 to i64
%arrayidx66 = getelementptr inbounds i64, i64* %61, i64 %idxprom65
%63 = load i64, i64* %arrayidx66, align 8
%arrayidx67 = getelementptr inbounds %struct.knode, %struct.knode* %60, i64 %63
%indices68 = getelementptr inbounds %struct.knode, %struct.knode* %arrayidx67, i32 0, i32 1
%64 = load i32, i32* %thid, align 4
%idxprom69 = sext i32 %64 to i64
%arrayidx70 = getelementptr inbounds [257 x i32], [257 x i32]* %indices68, i64 0, i64 %idxprom69
%65 = load i32, i32* %arrayidx70, align 4
%conv71 = sext i32 %65 to i64
%66 = load i64*, i64** %offset_2D.addr, align 8
%67 = load i32, i32* %bid, align 4
%idxprom72 = sext i32 %67 to i64
%arrayidx73 = getelementptr inbounds i64, i64* %66, i64 %idxprom72
store i64 %conv71, i64* %arrayidx73, align 8
br label %if.end74
if.end74: ; preds = %if.then64, %if.then55
br label %if.end75
if.end75: ; preds = %if.end74, %land.lhs.true44, %if.end34
call void @llvm.nvvm.barrier0()
%68 = load i32, i32* %thid, align 4
%cmp76 = icmp eq i32 %68, 0
br i1 %cmp76, label %if.then77, label %if.end86
if.then77: ; preds = %if.end75
%69 = load i64*, i64** %offsetD.addr, align 8
%70 = load i32, i32* %bid, align 4
%idxprom78 = sext i32 %70 to i64
%arrayidx79 = getelementptr inbounds i64, i64* %69, i64 %idxprom78
%71 = load i64, i64* %arrayidx79, align 8
%72 = load i64*, i64** %currKnodeD.addr, align 8
%73 = load i32, i32* %bid, align 4
%idxprom80 = sext i32 %73 to i64
%arrayidx81 = getelementptr inbounds i64, i64* %72, i64 %idxprom80
store i64 %71, i64* %arrayidx81, align 8
%74 = load i64*, i64** %offset_2D.addr, align 8
%75 = load i32, i32* %bid, align 4
%idxprom82 = sext i32 %75 to i64
%arrayidx83 = getelementptr inbounds i64, i64* %74, i64 %idxprom82
%76 = load i64, i64* %arrayidx83, align 8
%77 = load i64*, i64** %lastKnodeD.addr, align 8
%78 = load i32, i32* %bid, align 4
%idxprom84 = sext i32 %78 to i64
%arrayidx85 = getelementptr inbounds i64, i64* %77, i64 %idxprom84
store i64 %76, i64* %arrayidx85, align 8
br label %if.end86
if.end86: ; preds = %if.then77, %if.end75
call void @llvm.nvvm.barrier0()
br label %for.inc
for.inc: ; preds = %if.end86
%79 = load i32, i32* %i, align 4
%inc = add nsw i32 %79, 1
store i32 %inc, i32* %i, align 4
br label %for.cond
for.end: ; preds = %for.cond
%80 = load %struct.knode*, %struct.knode** %knodesD.addr, align 8
%81 = load i64*, i64** %currKnodeD.addr, align 8
%82 = load i32, i32* %bid, align 4
%idxprom87 = sext i32 %82 to i64
%arrayidx88 = getelementptr inbounds i64, i64* %81, i64 %idxprom87
%83 = load i64, i64* %arrayidx88, align 8
%arrayidx89 = getelementptr inbounds %struct.knode, %struct.knode* %80, i64 %83
%keys90 = getelementptr inbounds %struct.knode, %struct.knode* %arrayidx89, i32 0, i32 2
%84 = load i32, i32* %thid, align 4
%idxprom91 = sext i32 %84 to i64
%arrayidx92 = getelementptr inbounds [257 x i32], [257 x i32]* %keys90, i64 0, i64 %idxprom91
%85 = load i32, i32* %arrayidx92, align 4
%86 = load i32*, i32** %startD.addr, align 8
%87 = load i32, i32* %bid, align 4
%idxprom93 = sext i32 %87 to i64
%arrayidx94 = getelementptr inbounds i32, i32* %86, i64 %idxprom93
%88 = load i32, i32* %arrayidx94, align 4
%cmp95 = icmp eq i32 %85, %88
br i1 %cmp95, label %if.then96, label %if.end105
if.then96: ; preds = %for.end
%89 = load %struct.knode*, %struct.knode** %knodesD.addr, align 8
%90 = load i64*, i64** %currKnodeD.addr, align 8
%91 = load i32, i32* %bid, align 4
%idxprom97 = sext i32 %91 to i64
%arrayidx98 = getelementptr inbounds i64, i64* %90, i64 %idxprom97
%92 = load i64, i64* %arrayidx98, align 8
%arrayidx99 = getelementptr inbounds %struct.knode, %struct.knode* %89, i64 %92
%indices100 = getelementptr inbounds %struct.knode, %struct.knode* %arrayidx99, i32 0, i32 1
%93 = load i32, i32* %thid, align 4
%idxprom101 = sext i32 %93 to i64
%arrayidx102 = getelementptr inbounds [257 x i32], [257 x i32]* %indices100, i64 0, i64 %idxprom101
%94 = load i32, i32* %arrayidx102, align 4
%95 = load i32*, i32** %RecstartD.addr, align 8
%96 = load i32, i32* %bid, align 4
%idxprom103 = sext i32 %96 to i64
%arrayidx104 = getelementptr inbounds i32, i32* %95, i64 %idxprom103
store i32 %94, i32* %arrayidx104, align 4
br label %if.end105
if.end105: ; preds = %if.then96, %for.end
call void @llvm.nvvm.barrier0()
%97 = load %struct.knode*, %struct.knode** %knodesD.addr, align 8
%98 = load i64*, i64** %lastKnodeD.addr, align 8
%99 = load i32, i32* %bid, align 4
%idxprom106 = sext i32 %99 to i64
%arrayidx107 = getelementptr inbounds i64, i64* %98, i64 %idxprom106
%100 = load i64, i64* %arrayidx107, align 8
%arrayidx108 = getelementptr inbounds %struct.knode, %struct.knode* %97, i64 %100
%keys109 = getelementptr inbounds %struct.knode, %struct.knode* %arrayidx108, i32 0, i32 2
%101 = load i32, i32* %thid, align 4
%idxprom110 = sext i32 %101 to i64
%arrayidx111 = getelementptr inbounds [257 x i32], [257 x i32]* %keys109, i64 0, i64 %idxprom110
%102 = load i32, i32* %arrayidx111, align 4
%103 = load i32*, i32** %endD.addr, align 8
%104 = load i32, i32* %bid, align 4
%idxprom112 = sext i32 %104 to i64
%arrayidx113 = getelementptr inbounds i32, i32* %103, i64 %idxprom112
%105 = load i32, i32* %arrayidx113, align 4
%cmp114 = icmp eq i32 %102, %105
br i1 %cmp114, label %if.then115, label %if.end127
if.then115: ; preds = %if.end105
%106 = load %struct.knode*, %struct.knode** %knodesD.addr, align 8
%107 = load i64*, i64** %lastKnodeD.addr, align 8
%108 = load i32, i32* %bid, align 4
%idxprom116 = sext i32 %108 to i64
%arrayidx117 = getelementptr inbounds i64, i64* %107, i64 %idxprom116
%109 = load i64, i64* %arrayidx117, align 8
%arrayidx118 = getelementptr inbounds %struct.knode, %struct.knode* %106, i64 %109
%indices119 = getelementptr inbounds %struct.knode, %struct.knode* %arrayidx118, i32 0, i32 1
%110 = load i32, i32* %thid, align 4
%idxprom120 = sext i32 %110 to i64
%arrayidx121 = getelementptr inbounds [257 x i32], [257 x i32]* %indices119, i64 0, i64 %idxprom120
%111 = load i32, i32* %arrayidx121, align 4
%112 = load i32*, i32** %RecstartD.addr, align 8
%113 = load i32, i32* %bid, align 4
%idxprom122 = sext i32 %113 to i64
%arrayidx123 = getelementptr inbounds i32, i32* %112, i64 %idxprom122
%114 = load i32, i32* %arrayidx123, align 4
%sub = sub nsw i32 %111, %114
%add124 = add nsw i32 %sub, 1
%115 = load i32*, i32** %ReclenD.addr, align 8
%116 = load i32, i32* %bid, align 4
%idxprom125 = sext i32 %116 to i64
%arrayidx126 = getelementptr inbounds i32, i32* %115, i64 %idxprom125
store i32 %add124, i32* %arrayidx126, align 4
br label %if.end127
if.end127: ; preds = %if.then115, %if.end105
ret void
}
; Function Attrs: alwaysinline convergent nounwind
define linkonce_odr dso_local i32 @_ZN26__cuda_builtin_threadIdx_t17__fetch_builtin_xEv() #1 comdat align 2 {
entry:
%0 = call i32 @llvm.nvvm.read.ptx.sreg.tid.x()
ret i32 %0
}
; Function Attrs: alwaysinline convergent nounwind
define linkonce_odr dso_local i32 @_ZN25__cuda_builtin_blockIdx_t17__fetch_builtin_xEv() #1 comdat align 2 {
entry:
%0 = call i32 @llvm.nvvm.read.ptx.sreg.ctaid.x()
ret i32 %0
}
; Function Attrs: convergent nounwind
declare void @llvm.nvvm.barrier0() #2
; Function Attrs: nounwind readnone
declare i32 @llvm.nvvm.read.ptx.sreg.tid.x() #3
; Function Attrs: nounwind readnone
declare i32 @llvm.nvvm.read.ptx.sreg.ctaid.x() #3
attributes #0 = { convergent noinline nounwind optnone "correctly-rounded-divide-sqrt-fp-math"="false" "disable-tail-calls"="false" "frame-pointer"="all" "less-precise-fpmad"="false" "min-legal-vector-width"="0" "no-infs-fp-math"="false" "no-jump-tables"="false" "no-nans-fp-math"="false" "no-signed-zeros-fp-math"="false" "no-trapping-math"="false" "stack-protector-buffer-size"="8" "target-cpu"="sm_61" "target-features"="+ptx64,+sm_61" "unsafe-fp-math"="false" "use-soft-float"="false" }
attributes #1 = { alwaysinline convergent nounwind "correctly-rounded-divide-sqrt-fp-math"="false" "disable-tail-calls"="false" "frame-pointer"="all" "less-precise-fpmad"="false" "min-legal-vector-width"="0" "no-infs-fp-math"="false" "no-jump-tables"="false" "no-nans-fp-math"="false" "no-signed-zeros-fp-math"="false" "no-trapping-math"="false" "stack-protector-buffer-size"="8" "target-cpu"="sm_61" "target-features"="+ptx64,+sm_61" "unsafe-fp-math"="false" "use-soft-float"="false" }
attributes #2 = { convergent nounwind }
attributes #3 = { nounwind readnone }
!llvm.module.flags = !{!0, !1, !2}
!nvvm.annotations = !{!3, !4, !5, !4, !6, !6, !6, !6, !7, !7, !6}
!llvm.ident = !{!8}
!nvvmir.version = !{!9}
!0 = !{i32 2, !"SDK Version", [2 x i32] [i32 10, i32 1]}
!1 = !{i32 1, !"wchar_size", i32 4}
!2 = !{i32 4, !"nvvm-reflect-ftz", i32 0}
!3 = !{void (i64, %struct.knode*, i64, i64*, i64*, i64*, i64*, i32*, i32*, i32*, i32*)* @findRangeK, !"kernel", i32 1}
!4 = !{null, !"align", i32 8}
!5 = !{null, !"align", i32 8, !"align", i32 65544, !"align", i32 131080}
!6 = !{null, !"align", i32 16}
!7 = !{null, !"align", i32 16, !"align", i32 65552, !"align", i32 131088}
!8 = !{!"clang version 10.0.1 (https://github.com/llvm/llvm-project.git ef32c611aa214dea855364efd7ba451ec5ec3f74)"}
!9 = !{i32 1, i32 4}

File diff suppressed because one or more lines are too long

File diff suppressed because it is too large


@ -1,40 +0,0 @@
#!/bin/bash
set -e
clang -c -emit-llvm util/timer/timer.c
clang -c -emit-llvm util/num/num.c
#clang -c -emit-llvm util/cuda/cuda.cu --cuda-gpu-arch=sm_61
#clang -c -emit-llvm kernel/kernel_gpu_cuda_wrapper.cu --cuda-gpu-arch=sm_61
#clang++ kernel/kernel_gpu_cuda_wrapper.cu --cuda-path=/usr/local/cuda-10.1 --cuda-gpu-arch=sm_61 -L/usr/local/cuda-10.1/lib64 -lcudart_static -ldl -lrt -pthread -save-temps -v
#clang++ kernel/kernel_gpu_cuda_wrapper_2.cu --cuda-path=/usr/local/cuda-10.1 --cuda-gpu-arch=sm_61 -L/usr/local/cuda-10.1/lib64 -lcudart_static -ldl -lrt -pthread -save-temps -v
clang -c -emit-llvm main.c
llvm-as kernel_gpu_cuda_wrapper-cuda-nvptx64-nvidia-cuda-sm_61.ll
llvm-as kernel_gpu_cuda_wrapper_2-cuda-nvptx64-nvidia-cuda-sm_61.ll
llvm-as kernel_gpu_cuda_wrapper-host-x86_64-unknown-linux-gnu.ll
llvm-as kernel_gpu_cuda_wrapper_2-host-x86_64-unknown-linux-gnu.ll
../../build/compilation/kernelTranslator kernel_gpu_cuda_wrapper-cuda-nvptx64-nvidia-cuda-sm_61.bc kernel1.bc
../../build/compilation/kernelTranslator kernel_gpu_cuda_wrapper_2-cuda-nvptx64-nvidia-cuda-sm_61.bc kernel2.bc
../../build/compilation/hostTranslator kernel_gpu_cuda_wrapper-host-x86_64-unknown-linux-gnu.bc host1.bc
../../build/compilation/hostTranslator kernel_gpu_cuda_wrapper_2-host-x86_64-unknown-linux-gnu.bc host2.bc
llc --relocation-model=pic --filetype=obj main.bc
llc --relocation-model=pic --filetype=obj cuda.bc
llc --relocation-model=pic --filetype=obj num.bc
llc --relocation-model=pic --filetype=obj timer.bc
llc --relocation-model=pic --filetype=obj kernel1.bc
llc --relocation-model=pic --filetype=obj kernel2.bc
llc --relocation-model=pic --filetype=obj host1.bc
llc --relocation-model=pic --filetype=obj host2.bc
export LD_LIBRARY_PATH=../../build/runtime:../../build/runtime/threadPool:$LD_LIBRARY_PATH
g++ -Wall -L../../build/runtime -L../../build/runtime/threadPool -o b+tree.out \
-fPIC -no-pie main.o host1.o host2.o kernel1.o kernel2.o cuda.o num.o timer.o \
-lc -lx86Runtime -lthreadPool -lpthread
./b+tree.out file ../../rodinia-data/b+tree/mil.txt \
command ../../rodinia-data/b+tree/command.txt
if grep -q "0 840187 6001" output.txt; then
echo "Pass"
else
echo "Error result"
exit 1
fi


@ -1,75 +0,0 @@
#ifdef __cplusplus
extern "C" {
#endif
//===============================================================================================================================================================================================================200
// SET_DEVICE CODE
//===============================================================================================================================================================================================================200
//======================================================================================================================================================150
// INCLUDE/DEFINE
//======================================================================================================================================================150
#include "cuda.h" // (in library path specified to compiler)
//======================================================================================================================================================150
// FUNCTIONS
//======================================================================================================================================================150
//====================================================================================================100
// SET DEVICE
//====================================================================================================100
void setdevice(void){
// variables
int num_devices;
int device;
// work
cudaGetDeviceCount(&num_devices);
if (num_devices > 1) {
// variables
int max_multiprocessors;
int max_device;
cudaDeviceProp properties;
// initialize variables
max_multiprocessors = 0;
max_device = 0;
for (device = 0; device < num_devices; device++) {
cudaGetDeviceProperties(&properties, device);
if (max_multiprocessors < properties.multiProcessorCount) {
max_multiprocessors = properties.multiProcessorCount;
max_device = device;
}
}
cudaSetDevice(max_device);
}
}
//====================================================================================================100
// GET LAST ERROR
//====================================================================================================100
void checkCUDAError(const char *msg)
{
cudaError_t err = cudaGetLastError();
if( cudaSuccess != err) {
// fprintf(stderr, "Cuda error: %s: %s.\n", msg, cudaGetErrorString( err) );
printf("Cuda error: %s: %s.\n", msg, cudaGetErrorString( err) );
fflush(NULL);
exit(EXIT_FAILURE);
}
}
//===============================================================================================================================================================================================================200
// END
//===============================================================================================================================================================================================================200
#ifdef __cplusplus
}
#endif


@ -1,37 +0,0 @@
#ifdef __cplusplus
extern "C" {
#endif
//===============================================================================================================================================================================================================200
// SET_DEVICE HEADER
//===============================================================================================================================================================================================================200
//======================================================================================================================================================150
// INCLUDE/DEFINE
//======================================================================================================================================================150
#include <stdio.h> // (in library path known to compiler) needed by printf
//======================================================================================================================================================150
// FUNCTION PROTOTYPES
//======================================================================================================================================================150
//====================================================================================================100
// SET DEVICE
//====================================================================================================100
void setdevice(void);
//====================================================================================================100
// GET LAST ERROR
//====================================================================================================100
void checkCUDAError(const char *msg);
//===============================================================================================================================================================================================================200
// END SET_DEVICE HEADER
//===============================================================================================================================================================================================================200
#ifdef __cplusplus
}
#endif
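A minimal, hypothetical usage sketch of these two helpers; the dummy kernel and launch shape are invented for illustration. The pattern is the one used by the wrappers above: select a device once at startup, then check for errors after each launch.

// Hypothetical example, not part of the benchmark sources.
#include <stdio.h>
#include "cuda.h" // setdevice(), checkCUDAError()

__global__ void dummy_kernel(int *out) { out[threadIdx.x] = threadIdx.x; }

int main(void) {
  setdevice(); // pick the GPU with the most multiprocessors
  int *d_out;
  cudaMalloc((void **)&d_out, 32 * sizeof(int));
  dummy_kernel<<<1, 32>>>(d_out);
  cudaThreadSynchronize();
  checkCUDAError("dummy_kernel"); // prints the CUDA error and exits on failure
  cudaFree(d_out);
  return 0;
}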


@ -1,55 +0,0 @@
#ifdef __cplusplus
extern "C" {
#endif
//===============================================================================================================================================================================================================200
// DESCRIPTION
//===============================================================================================================================================================================================================200
// Returns: 0 if string does not represent integer
// 1 if string represents integer
//===============================================================================================================================================================================================================200
// NUM CODE
//===============================================================================================================================================================================================================200
//======================================================================================================================================================150
// ISINTEGER FUNCTION
//======================================================================================================================================================150
int isInteger(char *str) {
//====================================================================================================100
// make sure it's not empty
//====================================================================================================100
if (*str == '\0') {
return 0;
}
//====================================================================================================100
// if any digit is not a number, return false
//====================================================================================================100
for (; *str != '\0'; str++) {
// digit characters are '0'-'9' (ASCII 48-57); a '.' would also need to be
// accepted if checking for a float
if (*str < '0' || *str > '9') {
return 0;
}
}
//====================================================================================================100
// it got past all my checks so I think it's a number
//====================================================================================================100
return 1;
}
//===============================================================================================================================================================================================================200
// END NUM CODE
//===============================================================================================================================================================================================================200
#ifdef __cplusplus
}
#endif


@ -1,21 +0,0 @@
#ifdef __cplusplus
extern "C" {
#endif
//===============================================================================================================================================================================================================200
// FILE HEADER
//===============================================================================================================================================================================================================200
//======================================================================================================================================================150
// ISINTEGER FUNCTION PROTOTYPE
//======================================================================================================================================================150
int isInteger(char *str);
//===============================================================================================================================================================================================================200
// END FILE HEADER
//===============================================================================================================================================================================================================200
#ifdef __cplusplus
}
#endif
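A small, hypothetical usage sketch (the argument handling is invented, and the header path is assumed to be util/num/num.h): validate a token with isInteger before converting it with atoi, which is presumably how the benchmark's command parsing uses it.

// Hypothetical example, not part of the benchmark sources.
#include <stdio.h>
#include <stdlib.h>
#include "num.h" // int isInteger(char *str); assumed header location

int main(int argc, char *argv[]) {
  // Validate a numeric command-line token before converting it.
  if (argc > 1 && isInteger(argv[1])) {
    printf("parsed %d\n", atoi(argv[1]));
    return 0;
  }
  printf("argument is not an integer\n");
  return 1;
}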


@ -1,36 +0,0 @@
#ifdef __cplusplus
extern "C" {
#endif
//===============================================================================================================================================================================================================200
// TIMER CODE
//===============================================================================================================================================================================================================200
//======================================================================================================================================================150
// INCLUDE/DEFINE
//======================================================================================================================================================150
#include <stdlib.h>
#include <sys/time.h> // needed by gettimeofday() / struct timeval
//======================================================================================================================================================150
// FUNCTIONS
//======================================================================================================================================================150
//====================================================================================================100
// DISPLAY TIME
//====================================================================================================100
// Returns the current system time in microseconds
long long get_time() {
struct timeval tv;
gettimeofday(&tv, NULL);
return (tv.tv_sec * 1000000) + tv.tv_usec;
}
//===============================================================================================================================================================================================================200
// END TIMER CODE
//===============================================================================================================================================================================================================200
#ifdef __cplusplus
}
#endif


@ -1,21 +0,0 @@
#ifdef __cplusplus
extern "C" {
#endif
//===============================================================================================================================================================================================================200
// TIMER HEADER
//===============================================================================================================================================================================================================200
//======================================================================================================================================================150
// FUNCTION PROTOTYPES
//======================================================================================================================================================150
long long get_time();
//===============================================================================================================================================================================================================200
// END TIMER HEADER
//===============================================================================================================================================================================================================200
#ifdef __cplusplus
}
#endif
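A hedged sketch of the timing idiom used by the wrappers above: sample get_time() twice and divide the microsecond difference by 1e6 to report seconds. The measured loop is a placeholder, and the header path is assumed to be util/timer/timer.h.

// Hypothetical example, not part of the benchmark sources.
#include <stdio.h>
#include "timer.h" // long long get_time(); assumed header location

int main(void) {
  long long t0 = get_time();
  volatile double x = 0.0; // placeholder work being timed
  for (int i = 0; i < 1000000; i++) x += i;
  long long t1 = get_time();
  printf("%15.12f s\n", (float)(t1 - t0) / 1000000);
  return 0;
}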


@ -1,662 +0,0 @@
#include <fstream>
#include <helper_cuda.h>
#include <helper_timer.h>
#include <iostream>
/*
* Options
*
*/
#define GAMMA 1.4f
#define iterations 2
// #ifndef block_length
// #define block_length 192
// #endif
#define NDIM 3
#define NNB 4
#define RK 3 // 3rd order RK
#define ff_mach 1.2f
#define deg_angle_of_attack 0.0f
/*
* not options
*/
#ifdef RD_WG_SIZE_0_0
#define BLOCK_SIZE_0 RD_WG_SIZE_0_0
#elif defined(RD_WG_SIZE_0)
#define BLOCK_SIZE_0 RD_WG_SIZE_0
#elif defined(RD_WG_SIZE)
#define BLOCK_SIZE_0 RD_WG_SIZE
#else
#define BLOCK_SIZE_0 192
#endif
#ifdef RD_WG_SIZE_1_0
#define BLOCK_SIZE_1 RD_WG_SIZE_1_0
#elif defined(RD_WG_SIZE_1)
#define BLOCK_SIZE_1 RD_WG_SIZE_1
#elif defined(RD_WG_SIZE)
#define BLOCK_SIZE_1 RD_WG_SIZE
#else
#define BLOCK_SIZE_1 192
#endif
#ifdef RD_WG_SIZE_2_0
#define BLOCK_SIZE_2 RD_WG_SIZE_2_0
#elif defined(RD_WG_SIZE_2)
#define BLOCK_SIZE_2 RD_WG_SIZE_2
#elif defined(RD_WG_SIZE)
#define BLOCK_SIZE_2 RD_WG_SIZE
#else
#define BLOCK_SIZE_2 192
#endif
#ifdef RD_WG_SIZE_3_0
#define BLOCK_SIZE_3 RD_WG_SIZE_3_0
#elif defined(RD_WG_SIZE_3)
#define BLOCK_SIZE_3 RD_WG_SIZE_3
#elif defined(RD_WG_SIZE)
#define BLOCK_SIZE_3 RD_WG_SIZE
#else
#define BLOCK_SIZE_3 192
#endif
#ifdef RD_WG_SIZE_4_0
#define BLOCK_SIZE_4 RD_WG_SIZE_4_0
#elif defined(RD_WG_SIZE_4)
#define BLOCK_SIZE_4 RD_WG_SIZE_4
#elif defined(RD_WG_SIZE)
#define BLOCK_SIZE_4 RD_WG_SIZE
#else
#define BLOCK_SIZE_4 192
#endif
// #if block_length > 128
// #warning "the kernels may fail to launch on some systems if the block length is too large"
// #endif
#define VAR_DENSITY 0
#define VAR_MOMENTUM 1
#define VAR_DENSITY_ENERGY (VAR_MOMENTUM + NDIM)
#define NVAR (VAR_DENSITY_ENERGY + 1)
/*
* Generic functions
*/
template <typename T> T *alloc(int N) {
T *t;
checkCudaErrors(cudaMalloc((void **)&t, sizeof(T) * N));
return t;
}
template <typename T> void dealloc(T *array) {
checkCudaErrors(cudaFree((void *)array));
}
template <typename T> void copy(T *dst, T *src, int N) {
checkCudaErrors(cudaMemcpy((void *)dst, (void *)src, N * sizeof(T),
cudaMemcpyDeviceToDevice));
}
template <typename T> void upload(T *dst, T *src, int N) {
checkCudaErrors(cudaMemcpy((void *)dst, (void *)src, N * sizeof(T),
cudaMemcpyHostToDevice));
}
template <typename T> void download(T *dst, T *src, int N) {
checkCudaErrors(cudaMemcpy((void *)dst, (void *)src, N * sizeof(T),
cudaMemcpyDeviceToHost));
}
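// For reference, a hypothetical round trip with the helpers above (N is an
// assumed size; this comment is illustration only, not part of the solver):
//   float *d_buf = alloc<float>(N); // cudaMalloc of N floats
//   upload(d_buf, h_in, N);         // host -> device copy
//   download(h_out, d_buf, N);      // device -> host copy
//   dealloc(d_buf);                 // cudaFree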
void dump(float *variables, int nel, int nelr) {
float *h_variables = new float[nelr * NVAR];
download(h_variables, variables, nelr * NVAR);
{
std::ofstream file("density");
file << nel << " " << nelr << std::endl;
for (int i = 0; i < nel; i++)
file << h_variables[i + VAR_DENSITY * nelr] << std::endl;
}
{
std::ofstream file("momentum");
file << nel << " " << nelr << std::endl;
for (int i = 0; i < nel; i++) {
for (int j = 0; j != NDIM; j++)
file << h_variables[i + (VAR_MOMENTUM + j) * nelr] << " ";
file << std::endl;
}
}
{
std::ofstream file("density_energy");
file << nel << " " << nelr << std::endl;
for (int i = 0; i < nel; i++)
file << h_variables[i + VAR_DENSITY_ENERGY * nelr] << std::endl;
}
delete[] h_variables;
}
/*
* Element-based Cell-centered FVM solver functions
*/
__constant__ float ff_variable[NVAR];
__constant__ float3 ff_flux_contribution_momentum_x[1];
__constant__ float3 ff_flux_contribution_momentum_y[1];
__constant__ float3 ff_flux_contribution_momentum_z[1];
__constant__ float3 ff_flux_contribution_density_energy[1];
__global__ void cuda_initialize_variables(int nelr, float *variables) {
const int i = (blockDim.x * blockIdx.x + threadIdx.x);
for (int j = 0; j < NVAR; j++)
variables[i + j * nelr] = ff_variable[j];
}
void initialize_variables(int nelr, float *variables) {
dim3 Dg(nelr / BLOCK_SIZE_1), Db(BLOCK_SIZE_1);
cuda_initialize_variables<<<Dg, Db>>>(nelr, variables);
getLastCudaError("initialize_variables failed");
}
__device__ __host__ inline void compute_flux_contribution(
float &density, float3 &momentum, float &density_energy, float &pressure,
float3 &velocity, float3 &fc_momentum_x, float3 &fc_momentum_y,
float3 &fc_momentum_z, float3 &fc_density_energy) {
fc_momentum_x.x = velocity.x * momentum.x + pressure;
fc_momentum_x.y = velocity.x * momentum.y;
fc_momentum_x.z = velocity.x * momentum.z;
fc_momentum_y.x = fc_momentum_x.y;
fc_momentum_y.y = velocity.y * momentum.y + pressure;
fc_momentum_y.z = velocity.y * momentum.z;
fc_momentum_z.x = fc_momentum_x.z;
fc_momentum_z.y = fc_momentum_y.z;
fc_momentum_z.z = velocity.z * momentum.z + pressure;
float de_p = density_energy + pressure;
fc_density_energy.x = velocity.x * de_p;
fc_density_energy.y = velocity.y * de_p;
fc_density_energy.z = velocity.z * de_p;
}
__device__ inline void compute_velocity(float &density, float3 &momentum,
float3 &velocity) {
velocity.x = momentum.x / density;
velocity.y = momentum.y / density;
velocity.z = momentum.z / density;
}
__device__ inline float compute_speed_sqd(float3 &velocity) {
return velocity.x * velocity.x + velocity.y * velocity.y +
velocity.z * velocity.z;
}
__device__ inline float compute_pressure(float &density, float &density_energy,
float &speed_sqd) {
return (float(GAMMA) - float(1.0f)) *
(density_energy - float(0.5f) * density * speed_sqd);
}
__device__ inline float compute_speed_of_sound(float &density,
float &pressure) {
return sqrtf(float(GAMMA) * pressure / density);
}
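// For reference, the two helpers above implement the ideal-gas relations
// p = (GAMMA - 1) * (E - 0.5 * rho * |v|^2) and c = sqrt(GAMMA * p / rho),
// where density_energy holds the total energy E per unit volume and
// momentum holds rho * v.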
__global__ void cuda_compute_step_factor(int nelr, float *variables,
float *areas, float *step_factors) {
const int i = (blockDim.x * blockIdx.x + threadIdx.x);
float density = variables[i + VAR_DENSITY * nelr];
float3 momentum;
momentum.x = variables[i + (VAR_MOMENTUM + 0) * nelr];
momentum.y = variables[i + (VAR_MOMENTUM + 1) * nelr];
momentum.z = variables[i + (VAR_MOMENTUM + 2) * nelr];
float density_energy = variables[i + VAR_DENSITY_ENERGY * nelr];
float3 velocity;
compute_velocity(density, momentum, velocity);
float speed_sqd = compute_speed_sqd(velocity);
float pressure = compute_pressure(density, density_energy, speed_sqd);
float speed_of_sound = compute_speed_of_sound(density, pressure);
// dt = 0.5 * sqrtf(areas[i]) / (||v|| + c), but the time stepping would later
// divide this by the area, so we fold that in and store dt / area directly
step_factors[i] =
float(0.5f) / (sqrtf(areas[i]) * (sqrtf(speed_sqd) + speed_of_sound));
}
void compute_step_factor(int nelr, float *variables, float *areas,
float *step_factors) {
dim3 Dg(nelr / BLOCK_SIZE_2), Db(BLOCK_SIZE_2);
cuda_compute_step_factor<<<Dg, Db>>>(nelr, variables, areas, step_factors);
getLastCudaError("compute_step_factor failed");
}
/*
*
*
*/
__global__ void cuda_compute_flux(int nelr, int *elements_surrounding_elements,
float *normals, float *variables,
float *fluxes) {
const float smoothing_coefficient = float(0.2f);
const int i = (blockDim.x * blockIdx.x + threadIdx.x);
int j, nb;
float3 normal;
float normal_len;
float factor;
float density_i = variables[i + VAR_DENSITY * nelr];
float3 momentum_i;
momentum_i.x = variables[i + (VAR_MOMENTUM + 0) * nelr];
momentum_i.y = variables[i + (VAR_MOMENTUM + 1) * nelr];
momentum_i.z = variables[i + (VAR_MOMENTUM + 2) * nelr];
float density_energy_i = variables[i + VAR_DENSITY_ENERGY * nelr];
float3 velocity_i;
compute_velocity(density_i, momentum_i, velocity_i);
float speed_sqd_i = compute_speed_sqd(velocity_i);
float speed_i = sqrtf(speed_sqd_i);
float pressure_i = compute_pressure(density_i, density_energy_i, speed_sqd_i);
float speed_of_sound_i = compute_speed_of_sound(density_i, pressure_i);
float3 flux_contribution_i_momentum_x, flux_contribution_i_momentum_y,
flux_contribution_i_momentum_z;
float3 flux_contribution_i_density_energy;
compute_flux_contribution(
density_i, momentum_i, density_energy_i, pressure_i, velocity_i,
flux_contribution_i_momentum_x, flux_contribution_i_momentum_y,
flux_contribution_i_momentum_z, flux_contribution_i_density_energy);
float flux_i_density = float(0.0f);
float3 flux_i_momentum;
flux_i_momentum.x = float(0.0f);
flux_i_momentum.y = float(0.0f);
flux_i_momentum.z = float(0.0f);
float flux_i_density_energy = float(0.0f);
float3 velocity_nb;
float density_nb, density_energy_nb;
float3 momentum_nb;
float3 flux_contribution_nb_momentum_x, flux_contribution_nb_momentum_y,
flux_contribution_nb_momentum_z;
float3 flux_contribution_nb_density_energy;
float speed_sqd_nb, speed_of_sound_nb, pressure_nb;
#pragma unroll
for (j = 0; j < NNB; j++) {
nb = elements_surrounding_elements[i + j * nelr];
normal.x = normals[i + (j + 0 * NNB) * nelr];
normal.y = normals[i + (j + 1 * NNB) * nelr];
normal.z = normals[i + (j + 2 * NNB) * nelr];
normal_len =
sqrtf(normal.x * normal.x + normal.y * normal.y + normal.z * normal.z);
if (nb >= 0) // a legitimate neighbor
{
density_nb = variables[nb + VAR_DENSITY * nelr];
momentum_nb.x = variables[nb + (VAR_MOMENTUM + 0) * nelr];
momentum_nb.y = variables[nb + (VAR_MOMENTUM + 1) * nelr];
momentum_nb.z = variables[nb + (VAR_MOMENTUM + 2) * nelr];
density_energy_nb = variables[nb + VAR_DENSITY_ENERGY * nelr];
compute_velocity(density_nb, momentum_nb, velocity_nb);
speed_sqd_nb = compute_speed_sqd(velocity_nb);
pressure_nb =
compute_pressure(density_nb, density_energy_nb, speed_sqd_nb);
speed_of_sound_nb = compute_speed_of_sound(density_nb, pressure_nb);
compute_flux_contribution(
density_nb, momentum_nb, density_energy_nb, pressure_nb, velocity_nb,
flux_contribution_nb_momentum_x, flux_contribution_nb_momentum_y,
flux_contribution_nb_momentum_z, flux_contribution_nb_density_energy);
// artificial viscosity
factor = -normal_len * smoothing_coefficient * float(0.5f) *
(speed_i + sqrtf(speed_sqd_nb) + speed_of_sound_i +
speed_of_sound_nb);
flux_i_density += factor * (density_i - density_nb);
flux_i_density_energy += factor * (density_energy_i - density_energy_nb);
flux_i_momentum.x += factor * (momentum_i.x - momentum_nb.x);
flux_i_momentum.y += factor * (momentum_i.y - momentum_nb.y);
flux_i_momentum.z += factor * (momentum_i.z - momentum_nb.z);
// accumulate cell-centered fluxes
factor = float(0.5f) * normal.x;
flux_i_density += factor * (momentum_nb.x + momentum_i.x);
flux_i_density_energy += factor * (flux_contribution_nb_density_energy.x +
flux_contribution_i_density_energy.x);
flux_i_momentum.x += factor * (flux_contribution_nb_momentum_x.x +
flux_contribution_i_momentum_x.x);
flux_i_momentum.y += factor * (flux_contribution_nb_momentum_y.x +
flux_contribution_i_momentum_y.x);
flux_i_momentum.z += factor * (flux_contribution_nb_momentum_z.x +
flux_contribution_i_momentum_z.x);
factor = float(0.5f) * normal.y;
flux_i_density += factor * (momentum_nb.y + momentum_i.y);
flux_i_density_energy += factor * (flux_contribution_nb_density_energy.y +
flux_contribution_i_density_energy.y);
flux_i_momentum.x += factor * (flux_contribution_nb_momentum_x.y +
flux_contribution_i_momentum_x.y);
flux_i_momentum.y += factor * (flux_contribution_nb_momentum_y.y +
flux_contribution_i_momentum_y.y);
flux_i_momentum.z += factor * (flux_contribution_nb_momentum_z.y +
flux_contribution_i_momentum_z.y);
factor = float(0.5f) * normal.z;
flux_i_density += factor * (momentum_nb.z + momentum_i.z);
flux_i_density_energy += factor * (flux_contribution_nb_density_energy.z +
flux_contribution_i_density_energy.z);
flux_i_momentum.x += factor * (flux_contribution_nb_momentum_x.z +
flux_contribution_i_momentum_x.z);
flux_i_momentum.y += factor * (flux_contribution_nb_momentum_y.z +
flux_contribution_i_momentum_y.z);
flux_i_momentum.z += factor * (flux_contribution_nb_momentum_z.z +
flux_contribution_i_momentum_z.z);
} else if (nb == -1) // a wing boundary
{
flux_i_momentum.x += normal.x * pressure_i;
flux_i_momentum.y += normal.y * pressure_i;
flux_i_momentum.z += normal.z * pressure_i;
} else if (nb == -2) // a far field boundary
{
factor = float(0.5f) * normal.x;
flux_i_density += factor * (ff_variable[VAR_MOMENTUM + 0] + momentum_i.x);
flux_i_density_energy +=
factor * (ff_flux_contribution_density_energy[0].x +
flux_contribution_i_density_energy.x);
flux_i_momentum.x += factor * (ff_flux_contribution_momentum_x[0].x +
flux_contribution_i_momentum_x.x);
flux_i_momentum.y += factor * (ff_flux_contribution_momentum_y[0].x +
flux_contribution_i_momentum_y.x);
flux_i_momentum.z += factor * (ff_flux_contribution_momentum_z[0].x +
flux_contribution_i_momentum_z.x);
factor = float(0.5f) * normal.y;
flux_i_density += factor * (ff_variable[VAR_MOMENTUM + 1] + momentum_i.y);
flux_i_density_energy +=
factor * (ff_flux_contribution_density_energy[0].y +
flux_contribution_i_density_energy.y);
flux_i_momentum.x += factor * (ff_flux_contribution_momentum_x[0].y +
flux_contribution_i_momentum_x.y);
flux_i_momentum.y += factor * (ff_flux_contribution_momentum_y[0].y +
flux_contribution_i_momentum_y.y);
flux_i_momentum.z += factor * (ff_flux_contribution_momentum_z[0].y +
flux_contribution_i_momentum_z.y);
factor = float(0.5f) * normal.z;
flux_i_density += factor * (ff_variable[VAR_MOMENTUM + 2] + momentum_i.z);
flux_i_density_energy +=
factor * (ff_flux_contribution_density_energy[0].z +
flux_contribution_i_density_energy.z);
flux_i_momentum.x += factor * (ff_flux_contribution_momentum_x[0].z +
flux_contribution_i_momentum_x.z);
flux_i_momentum.y += factor * (ff_flux_contribution_momentum_y[0].z +
flux_contribution_i_momentum_y.z);
flux_i_momentum.z += factor * (ff_flux_contribution_momentum_z[0].z +
flux_contribution_i_momentum_z.z);
}
}
fluxes[i + VAR_DENSITY * nelr] = flux_i_density;
fluxes[i + (VAR_MOMENTUM + 0) * nelr] = flux_i_momentum.x;
fluxes[i + (VAR_MOMENTUM + 1) * nelr] = flux_i_momentum.y;
fluxes[i + (VAR_MOMENTUM + 2) * nelr] = flux_i_momentum.z;
fluxes[i + VAR_DENSITY_ENERGY * nelr] = flux_i_density_energy;
}
void compute_flux(int nelr, int *elements_surrounding_elements, float *normals,
float *variables, float *fluxes) {
dim3 Dg(nelr / BLOCK_SIZE_3), Db(BLOCK_SIZE_3);
cuda_compute_flux<<<Dg, Db>>>(nelr, elements_surrounding_elements, normals,
variables, fluxes);
getLastCudaError("compute_flux failed");
}
__global__ void cuda_time_step(int j, int nelr, float *old_variables,
float *variables, float *step_factors,
float *fluxes) {
const int i = (blockDim.x * blockIdx.x + threadIdx.x);
float factor = step_factors[i] / float(RK + 1 - j);
variables[i + VAR_DENSITY * nelr] = old_variables[i + VAR_DENSITY * nelr] +
factor * fluxes[i + VAR_DENSITY * nelr];
variables[i + VAR_DENSITY_ENERGY * nelr] =
old_variables[i + VAR_DENSITY_ENERGY * nelr] +
factor * fluxes[i + VAR_DENSITY_ENERGY * nelr];
variables[i + (VAR_MOMENTUM + 0) * nelr] =
old_variables[i + (VAR_MOMENTUM + 0) * nelr] +
factor * fluxes[i + (VAR_MOMENTUM + 0) * nelr];
variables[i + (VAR_MOMENTUM + 1) * nelr] =
old_variables[i + (VAR_MOMENTUM + 1) * nelr] +
factor * fluxes[i + (VAR_MOMENTUM + 1) * nelr];
variables[i + (VAR_MOMENTUM + 2) * nelr] =
old_variables[i + (VAR_MOMENTUM + 2) * nelr] +
factor * fluxes[i + (VAR_MOMENTUM + 2) * nelr];
}
void time_step(int j, int nelr, float *old_variables, float *variables,
float *step_factors, float *fluxes) {
dim3 Dg(nelr / BLOCK_SIZE_4), Db(BLOCK_SIZE_4);
cuda_time_step<<<Dg, Db>>>(j, nelr, old_variables, variables, step_factors,
fluxes);
getLastCudaError("update failed");
}
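// Stepping note: main() calls time_step() for j = 0 .. RK-1 within each outer
// iteration, so the per-stage factor step_factors[i] / (RK + 1 - j) grows as j
// increases -- an explicit Runge-Kutta style update of the conserved variables
// from old_variables plus the freshly computed fluxes.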
/*
* Main function
*/
int main(int argc, char **argv) {
printf("WG size of kernel:initialize = %d, WG size of "
"kernel:compute_step_factor = %d, WG size of kernel:compute_flux = "
"%d, WG size of kernel:time_step = %d\n",
BLOCK_SIZE_1, BLOCK_SIZE_2, BLOCK_SIZE_3, BLOCK_SIZE_4);
if (argc < 2) {
std::cout << "specify data file name" << std::endl;
return 0;
}
const char *data_file_name = argv[1];
cudaDeviceProp prop;
int dev;
checkCudaErrors(cudaSetDevice(0));
// set far field conditions and load them into constant memory on the gpu
{
float h_ff_variable[NVAR];
const float angle_of_attack =
float(3.1415926535897931 / 180.0f) * float(deg_angle_of_attack);
h_ff_variable[VAR_DENSITY] = float(1.4);
float ff_pressure = float(1.0f);
float ff_speed_of_sound =
sqrt(GAMMA * ff_pressure / h_ff_variable[VAR_DENSITY]);
float ff_speed = float(ff_mach) * ff_speed_of_sound;
float3 ff_velocity;
ff_velocity.x = ff_speed * float(cos((float)angle_of_attack));
ff_velocity.y = ff_speed * float(sin((float)angle_of_attack));
ff_velocity.z = 0.0f;
h_ff_variable[VAR_MOMENTUM + 0] =
h_ff_variable[VAR_DENSITY] * ff_velocity.x;
h_ff_variable[VAR_MOMENTUM + 1] =
h_ff_variable[VAR_DENSITY] * ff_velocity.y;
h_ff_variable[VAR_MOMENTUM + 2] =
h_ff_variable[VAR_DENSITY] * ff_velocity.z;
h_ff_variable[VAR_DENSITY_ENERGY] =
h_ff_variable[VAR_DENSITY] * (float(0.5f) * (ff_speed * ff_speed)) +
(ff_pressure / float(GAMMA - 1.0f));
float3 h_ff_momentum;
h_ff_momentum.x = *(h_ff_variable + VAR_MOMENTUM + 0);
h_ff_momentum.y = *(h_ff_variable + VAR_MOMENTUM + 1);
h_ff_momentum.z = *(h_ff_variable + VAR_MOMENTUM + 2);
float3 h_ff_flux_contribution_momentum_x;
float3 h_ff_flux_contribution_momentum_y;
float3 h_ff_flux_contribution_momentum_z;
float3 h_ff_flux_contribution_density_energy;
compute_flux_contribution(h_ff_variable[VAR_DENSITY], h_ff_momentum,
h_ff_variable[VAR_DENSITY_ENERGY], ff_pressure,
ff_velocity, h_ff_flux_contribution_momentum_x,
h_ff_flux_contribution_momentum_y,
h_ff_flux_contribution_momentum_z,
h_ff_flux_contribution_density_energy);
// copy far field conditions to the gpu
checkCudaErrors(
cudaMemcpyToSymbol(ff_variable, h_ff_variable, NVAR * sizeof(float)));
checkCudaErrors(cudaMemcpyToSymbol(ff_flux_contribution_momentum_x,
&h_ff_flux_contribution_momentum_x,
sizeof(float3)));
checkCudaErrors(cudaMemcpyToSymbol(ff_flux_contribution_momentum_y,
&h_ff_flux_contribution_momentum_y,
sizeof(float3)));
checkCudaErrors(cudaMemcpyToSymbol(ff_flux_contribution_momentum_z,
&h_ff_flux_contribution_momentum_z,
sizeof(float3)));
checkCudaErrors(cudaMemcpyToSymbol(ff_flux_contribution_density_energy,
&h_ff_flux_contribution_density_energy,
sizeof(float3)));
}
int nel;
int nelr;
// read in domain geometry
float *areas;
int *elements_surrounding_elements;
float *normals;
{
std::ifstream file(data_file_name);
file >> nel;
nelr =
BLOCK_SIZE_0 * ((nel / BLOCK_SIZE_0) + std::min(1, nel % BLOCK_SIZE_0));
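// nelr is nel rounded up to the next multiple of BLOCK_SIZE_0 so every kernel
// launch covers whole blocks; e.g. assuming the default BLOCK_SIZE_0 of 192,
// nel = 1000 would give nelr = 1152.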
float *h_areas = new float[nelr];
int *h_elements_surrounding_elements = new int[nelr * NNB];
float *h_normals = new float[nelr * NDIM * NNB];
// read in data
for (int i = 0; i < nel; i++) {
file >> h_areas[i];
for (int j = 0; j < NNB; j++) {
file >> h_elements_surrounding_elements[i + j * nelr];
if (h_elements_surrounding_elements[i + j * nelr] < 0)
h_elements_surrounding_elements[i + j * nelr] = -1;
h_elements_surrounding_elements[i + j * nelr]--; // it's coming in with
// Fortran numbering
for (int k = 0; k < NDIM; k++) {
file >> h_normals[i + (j + k * NNB) * nelr];
h_normals[i + (j + k * NNB) * nelr] =
-h_normals[i + (j + k * NNB) * nelr];
}
}
}
// fill in remaining data
int last = nel - 1;
for (int i = nel; i < nelr; i++) {
h_areas[i] = h_areas[last];
for (int j = 0; j < NNB; j++) {
// duplicate the last element
h_elements_surrounding_elements[i + j * nelr] =
h_elements_surrounding_elements[last + j * nelr];
for (int k = 0; k < NDIM; k++)
h_normals[last + (j + k * NNB) * nelr] =
h_normals[last + (j + k * NNB) * nelr];
}
}
areas = alloc<float>(nelr);
upload<float>(areas, h_areas, nelr);
elements_surrounding_elements = alloc<int>(nelr * NNB);
upload<int>(elements_surrounding_elements, h_elements_surrounding_elements,
nelr * NNB);
normals = alloc<float>(nelr * NDIM * NNB);
upload<float>(normals, h_normals, nelr * NDIM * NNB);
delete[] h_areas;
delete[] h_elements_surrounding_elements;
delete[] h_normals;
}
// Create arrays and set initial conditions
float *variables = alloc<float>(nelr * NVAR);
initialize_variables(nelr, variables);
float *old_variables = alloc<float>(nelr * NVAR);
float *fluxes = alloc<float>(nelr * NVAR);
float *step_factors = alloc<float>(nelr);
// make sure all memory is really allocated before we start timing
initialize_variables(nelr, old_variables);
initialize_variables(nelr, fluxes);
cudaMemset((void *)step_factors, 0, sizeof(float) * nelr);
// make sure CUDA isn't still doing something before we start timing
cudaThreadSynchronize();
// these need to be computed the first time in order to compute time step
std::cout << "Starting..." << std::endl;
StopWatchInterface *timer = 0;
// unsigned int timer = 0;
// CUT_SAFE_CALL( cutCreateTimer( &timer));
// CUT_SAFE_CALL( cutStartTimer( timer));
sdkCreateTimer(&timer);
sdkStartTimer(&timer);
// Begin iterations
for (int i = 0; i < iterations; i++) {
copy<float>(old_variables, variables, nelr * NVAR);
// recompute the time step factors at the start of each iteration
compute_step_factor(nelr, variables, areas, step_factors);
getLastCudaError("compute_step_factor failed");
for (int j = 0; j < RK; j++) {
compute_flux(nelr, elements_surrounding_elements, normals, variables,
fluxes);
getLastCudaError("compute_flux failed");
time_step(j, nelr, old_variables, variables, step_factors, fluxes);
getLastCudaError("time_step failed");
}
}
cudaThreadSynchronize();
// CUT_SAFE_CALL( cutStopTimer(timer) );
sdkStopTimer(&timer);
std::cout << (sdkGetAverageTimerValue(&timer) / 1000.0) / iterations
<< " seconds per iteration" << std::endl;
std::cout << "Saving solution..." << std::endl;
dump(variables, nel, nelr);
std::cout << "Saved solution..." << std::endl;
std::cout << "Cleaning up..." << std::endl;
dealloc<float>(areas);
dealloc<int>(elements_surrounding_elements);
dealloc<float>(normals);
dealloc<float>(variables);
dealloc<float>(old_variables);
dealloc<float>(fluxes);
dealloc<float>(step_factors);
std::cout << "Done..." << std::endl;
return 0;
}

@ -1,15 +0,0 @@
#!/bin/bash
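# Pipeline summary (of the commands below): clang++ compiles euler3d.cu to LLVM
# bitcode (-save-temps), kernelTranslator/hostTranslator rewrite the device and
# host bitcode, llc lowers both to object files, g++ links them against the
# x86Runtime and threadPool libraries, and the result runs on the CFD input.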
clang++ euler3d.cu -I/usr/local/cuda-10.1/samples/common/inc --cuda-path=/usr/local/cuda-10.1 --cuda-gpu-arch=sm_50 -L/usr/local/cuda-10.1/lib64 -lcudart_static -ldl -lrt -pthread -save-temps -v
/home/robinhan/repo/open_source_template/build/compilation/kernelTranslator euler3d-cuda-nvptx64-nvidia-cuda-sm_50.bc kernel.bc
/home/robinhan/repo/open_source_template/build/compilation/hostTranslator euler3d-host-x86_64-unknown-linux-gnu.bc host.bc
llc --relocation-model=pic --filetype=obj kernel.bc
llc --relocation-model=pic --filetype=obj host.bc
g++ -Wall -L/home/robinhan/repo/open_source_template/build/runtime -L/home/robinhan/repo/open_source_template/build/runtime/threadPool -o a.out -fPIC -no-pie host.o kernel.o -lc -lx86Runtime -lthreadPool -lpthread
./a.out ../rodinia-data/cfd/fvcorr.domn.097K
# ./demo 1024
# # # ./demo -f ../../data/matrix3.txt
# # # run -f ../../data/gaussian/matrix3.txt

@ -1,64 +0,0 @@
/*
* Copyright (c) 2009, Jiri Matela
* All rights reserved.
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions are met:
*
* * Redistributions of source code must retain the above copyright
* notice, this list of conditions and the following disclaimer.
* * Redistributions in binary form must reproduce the above copyright
* notice, this list of conditions and the following disclaimer in the
* documentation and/or other materials provided with the distribution.
*
* THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
* AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
* IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
* ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE
* LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
* CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
* SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
* INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
* CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
* ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
* POSSIBILITY OF SUCH DAMAGE.
*/
#ifndef _COMMON_H
#define _COMMON_H
// 24-bit multiplication is faster on G80,
// but we must be sure to multiply integers
// only within [-8M, 8M - 1] range
#define IMUL(a, b) __mul24(a, b)
////cuda timing macros
//#define CTIMERINIT cudaEvent_t cstart, cstop; \
// cudaEventCreate(&cstart); \
// cudaEventCreate(&cstop); \
// float elapsedTime
//#define CTIMERSTART(cstart) cudaEventRecord(cstart,0)
//#define CTIMERSTOP(cstop) cudaEventRecord(cstop,0); \
// cudaEventSynchronize(cstop); \
// cudaEventElapsedTime(&elapsedTime, cstart, cstop)
// divide and round up macro
#define DIVANDRND(a, b) ((((a) % (b)) != 0) ? ((a) / (b) + 1) : ((a) / (b)))
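// e.g. DIVANDRND(10, 4) == 3 and DIVANDRND(8, 4) == 2 (divide, rounding up)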
#define cudaCheckError(msg) \
{ \
cudaError_t err = cudaGetLastError(); \
if (cudaSuccess != err) { \
fprintf(stderr, "%s: %i: %s: %s.\n", __FILE__, __LINE__, msg, \
cudaGetErrorString(err)); \
exit(-1); \
} \
}
#define cudaCheckAsyncError(msg) \
{ \
cudaThreadSynchronize(); \
cudaCheckError(msg); \
}
#endif

@ -1,193 +0,0 @@
/*
* Copyright (c) 2009, Jiri Matela
* All rights reserved.
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions are met:
*
* * Redistributions of source code must retain the above copyright
* notice, this list of conditions and the following disclaimer.
* * Redistributions in binary form must reproduce the above copyright
* notice, this list of conditions and the following disclaimer in the
* documentation and/or other materials provided with the distribution.
*
* THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
* AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
* IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
* ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE
* LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
* CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
* SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
* INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
* CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
* ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
* POSSIBILITY OF SUCH DAMAGE.
*/
#include <unistd.h>
#include <error.h>
#include <stdio.h>
#include <stdlib.h>
#include <errno.h>
#include <assert.h>
#include "components.h"
#include "common.h"
#define THREADS 256
/* Store 3 RGB float components */
__device__ void storeComponents(float *d_r, float *d_g, float *d_b, float r, float g, float b, int pos)
{
d_r[pos] = (r/255.0f) - 0.5f;
d_g[pos] = (g/255.0f) - 0.5f;
d_b[pos] = (b/255.0f) - 0.5f;
}
/* Store 3 RGB integer components */
__device__ void storeComponents(int *d_r, int *d_g, int *d_b, int r, int g, int b, int pos)
{
d_r[pos] = r - 128;
d_g[pos] = g - 128;
d_b[pos] = b - 128;
}
/* Store float component */
__device__ void storeComponent(float *d_c, float c, int pos)
{
d_c[pos] = (c/255.0f) - 0.5f;
}
/* Store integer component */
__device__ void storeComponent(int *d_c, int c, int pos)
{
d_c[pos] = c - 128;
}
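/* Normalization used above, for reference: float components are mapped from
[0, 255] to [-0.5, 0.5] via c/255 - 0.5, and integer components are shifted
to [-128, 127] by subtracting 128. */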
/* Copy img src data into three separated component buffers */
template<typename T>
__global__ void c_CopySrcToComponents(T *d_r, T *d_g, T *d_b,
unsigned char * d_src,
int pixels)
{
int x = threadIdx.x;
int gX = blockDim.x*blockIdx.x;
__shared__ unsigned char sData[THREADS*3];
/* Copy data to shared mem in 4-byte chunks;
no other checks are necessary, since the
d_src buffer is aligned to sharedDataSize */
if ( (x*4) < THREADS*3 ) {
float *s = (float *)d_src;
float *d = (float *)sData;
d[x] = s[((gX*3)>>2) + x];
}
__syncthreads();
T r, g, b;
int offset = x*3;
r = (T)(sData[offset]);
g = (T)(sData[offset+1]);
b = (T)(sData[offset+2]);
int globalOutputPosition = gX + x;
if (globalOutputPosition < pixels) {
storeComponents(d_r, d_g, d_b, r, g, b, globalOutputPosition);
}
}
/* Copy img src data into three separated component buffers */
template<typename T>
__global__ void c_CopySrcToComponent(T *d_c, unsigned char * d_src, int pixels)
{
int x = threadIdx.x;
int gX = blockDim.x*blockIdx.x;
__shared__ unsigned char sData[THREADS];
/* Copy data to shared mem in 4-byte chunks;
no other checks are necessary, since the
d_src buffer is aligned to sharedDataSize */
if ( (x*4) < THREADS) {
float *s = (float *)d_src;
float *d = (float *)sData;
d[x] = s[(gX>>2) + x];
}
__syncthreads();
T c;
c = (T)(sData[x]);
int globalOutputPosition = gX + x;
if (globalOutputPosition < pixels) {
storeComponent(d_c, c, globalOutputPosition);
}
}
/* Separate components of an 8-bit RGB source image */
template<typename T>
void rgbToComponents(T *d_r, T *d_g, T *d_b, unsigned char * src, int width, int height)
{
unsigned char * d_src;
int pixels = width*height;
int alignedSize = DIVANDRND(width*height, THREADS) * THREADS * 3; //aligned to thread block size -- THREADS
/* Alloc d_src buffer */
cudaMalloc((void **)&d_src, alignedSize);
cudaCheckAsyncError("Cuda malloc")
cudaMemset(d_src, 0, alignedSize);
/* Copy data to device */
cudaMemcpy(d_src, src, pixels*3, cudaMemcpyHostToDevice);
cudaCheckError("Copy data to device")
/* Kernel */
dim3 threads(THREADS);
dim3 grid(alignedSize/(THREADS*3));
assert(alignedSize%(THREADS*3) == 0);
c_CopySrcToComponents<<<grid, threads>>>(d_r, d_g, d_b, d_src, pixels);
cudaCheckAsyncError("CopySrcToComponents kernel")
/* Free Memory */
cudaFree(d_src);
cudaCheckAsyncError("Free memory")
}
template void rgbToComponents<float>(float *d_r, float *d_g, float *d_b, unsigned char * src, int width, int height);
template void rgbToComponents<int>(int *d_r, int *d_g, int *d_b, unsigned char * src, int width, int height);
/* Copy 8-bit source image data into a color component of type T */
template<typename T>
void bwToComponent(T *d_c, unsigned char * src, int width, int height)
{
unsigned char * d_src;
int pixels = width*height;
int alignedSize = DIVANDRND(pixels, THREADS) * THREADS; //aligned to thread block size -- THREADS
/* Alloc d_src buffer */
cudaMalloc((void **)&d_src, alignedSize);
cudaCheckAsyncError("Cuda malloc")
cudaMemset(d_src, 0, alignedSize);
/* Copy data to device */
cudaMemcpy(d_src, src, pixels, cudaMemcpyHostToDevice);
cudaCheckError("Copy data to device")
/* Kernel */
dim3 threads(THREADS);
dim3 grid(alignedSize/(THREADS));
assert(alignedSize%(THREADS) == 0);
c_CopySrcToComponent<<<grid, threads>>>(d_c, d_src, pixels);
cudaCheckAsyncError("CopySrcToComponent kernel")
/* Free Memory */
cudaFree(d_src);
cudaCheckAsyncError("Free memory")
}
template void bwToComponent<float>(float *d_c, unsigned char *src, int width, int height);
template void bwToComponent<int>(int *d_c, unsigned char *src, int width, int height);

@ -1,39 +0,0 @@
/*
* Copyright (c) 2009, Jiri Matela
* All rights reserved.
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions are met:
*
* * Redistributions of source code must retain the above copyright
* notice, this list of conditions and the following disclaimer.
* * Redistributions in binary form must reproduce the above copyright
* notice, this list of conditions and the following disclaimer in the
* documentation and/or other materials provided with the distribution.
*
* THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
* AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
* IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
* ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE
* LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
* CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
* SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
* INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
* CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
* ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
* POSSIBILITY OF SUCH DAMAGE.
*/
#ifndef _COMPONENTS_H
#define _COMPONENTS_H
/* Separate components of a source 8-bit RGB image */
template <typename T>
void rgbToComponents(T *d_r, T *d_g, T *d_b, unsigned char *src, int width,
int height);
/* Copy 8-bit source image data into a color component of type T */
template <typename T>
void bwToComponent(T *d_c, unsigned char *src, int width, int height);
#endif

@ -1,385 +0,0 @@
/*
* Copyright (c) 2009, Jiri Matela
* All rights reserved.
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions are met:
*
* * Redistributions of source code must retain the above copyright
* notice, this list of conditions and the following disclaimer.
* * Redistributions in binary form must reproduce the above copyright
* notice, this list of conditions and the following disclaimer in the
* documentation and/or other materials provided with the distribution.
*
* THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
* AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
* IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
* ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE
* LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
* CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
* SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
* INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
* CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
* ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
* POSSIBILITY OF SUCH DAMAGE.
*/
#include <stdio.h>
#include <fcntl.h>
#include <assert.h>
#include <errno.h>
#include <sys/time.h>
#include <unistd.h>
#include <error.h>
#include "dwt_cuda/dwt.h"
#include "dwt_cuda/common.h"
#include "dwt.h"
#include "common.h"
#include <iostream>
#include <fstream>
#include <string.h> // strlen/strcpy used by samplesToChar and the write* helpers
inline void fdwt(float *in, float *out, int width, int height, int levels)
{
printf(" Running fdwt97 Float \n");
dwt_cuda::fdwt97(in, out, width, height, levels);
}
/*
inline void fdwt(float *in, float *out, int width, int height, int levels, float *diffOut)
{
dwt_cuda::fdwt97(in, out, width, height, levels, diffOut);
}
*/
inline void fdwt(int *in, int *out, int width, int height, int levels)
{
printf(" Running fdwt53 Int \n");
dwt_cuda::fdwt53(in, out, width, height, levels);
}
/*
inline void fdwt(int *in, int *out, int width, int height, int levels, int *diffOut)
{
dwt_cuda::fdwt53(in, out, width, height, levels, diffOut);
}
*/
inline void rdwt(float *in, float *out, int width, int height, int levels)
{
printf(" Running rdwt97 Float \n");
dwt_cuda::rdwt97(in, out, width, height, levels);
}
inline void rdwt(int *in, int *out, int width, int height, int levels)
{
printf(" Running rdwt53 Int \n");
dwt_cuda::rdwt53(in, out, width, height, levels);
}
template<typename T>
int nStage2dDWT(T * in, T * out, T * backup, int pixWidth, int pixHeight, int stages, bool forward)
{
printf("\n*** %d stages of 2D forward DWT:\n", stages);
/* create backup of input, because each test iteration overwrites it */
const int size = pixHeight * pixWidth * sizeof(T);
cudaMemcpy(backup, in, size, cudaMemcpyDeviceToDevice);
cudaCheckError("Memcopy device to device");
/* Measure time of individual levels. */
if(forward)
fdwt(in, out, pixWidth, pixHeight, stages);
else
rdwt(in, out, pixWidth, pixHeight, stages);
// Measure overall time of DWT.
/* #ifdef GPU_DWT_TESTING_1
dwt_cuda::CudaDWTTester tester;
for(int i = tester.getNumIterations(); i--; ) {
// Recover input and measure one overall DWT run.
cudaMemcpy(in, backup, size, cudaMemcpyDeviceToDevice);
cudaCheckError("Memcopy device to device");
tester.beginTestIteration();
if(forward)
fdwt(in, out, pixWidth, pixHeight, stages);
else
rdwt(in, out, pixWidth, pixHeight, stages);
tester.endTestIteration();
}
tester.showPerformance(" Overall DWT", pixWidth, pixHeight);
#endif // GPU_DWT_TESTING
cudaCheckAsyncError("DWT Kernel calls");
*/
return 0;
}
template int nStage2dDWT<float>(float*, float*, float*, int, int, int, bool);
template int nStage2dDWT<int>(int*, int*, int*, int, int, int, bool);
/*
template<typename T>
int nStage2dDWT(T * in, T * out, T * backup, int pixWidth, int pixHeight, int stages, bool forward, T * diffOut)
{
printf("*** %d stages of 2D forward DWT:\n", stages);
// create backup of input, because each test iteration overwrites it
const int size = pixHeight * pixWidth * sizeof(T);
cudaMemcpy(backup, in, size, cudaMemcpyDeviceToDevice);
cudaCheckError("Memcopy device to device");
// Measure time of individual levels.
if(forward)
fdwt(in, out, pixWidth, pixHeight, stages, diffOut);
else
rdwt(in, out, pixWidth, pixHeight, stages);
// Measure overall time of DWT.
#ifdef GPU_DWT_TESTING_1
dwt_cuda::CudaDWTTester tester;
for(int i = tester.getNumIterations(); i--; ) {
// Recover input and measure one overall DWT run.
cudaMemcpy(in, backup, size, cudaMemcpyDeviceToDevice);
cudaCheckError("Memcopy device to device");
tester.beginTestIteration();
if(forward)
fdwt(in, out, pixWidth, pixHeight, stages, diffOut);
else
rdwt(in, out, pixWidth, pixHeight, stages);
tester.endTestIteration();
}
tester.showPerformance(" Overall DWT", pixWidth, pixHeight);
#endif // GPU_DWT_TESTING
cudaCheckAsyncError("DWT Kernel calls");
return 0;
}
template int nStage2dDWT<float>(float*, float*, float*, int, int, int, bool, float*);
template int nStage2dDWT<int>(int*, int*, int*, int, int, int, bool, int*);
*/
void samplesToChar(unsigned char * dst, float * src, int samplesNum, const char * filename)
{
int i;
std::ofstream outputFile;
char outfile[strlen(filename)+strlen(".txt")];
strcpy(outfile, filename);
strcpy(outfile+strlen(filename), ".txt");
outputFile.open(outfile);
for(i = 0; i < samplesNum; i++) {
float r = (src[i]+0.5f) * 255;
if (r > 255) r = 255;
if (r < 0) r = 0;
dst[i] = (unsigned char)r;
outputFile << "index: " << i << " val: "<< r <<" \n";
}
outputFile.close();
}
void samplesToChar(unsigned char * dst, int * src, int samplesNum, const char * filename)
{
int i;
std::ofstream outputFile;
char outfile[strlen(filename)+strlen(".txt")];
strcpy(outfile, filename);
strcpy(outfile+strlen(filename), ".txt");
outputFile.open(outfile);
for(i = 0; i < samplesNum; i++) {
int r = src[i]+128;
if (r > 255) r = 255;
if (r < 0) r = 0;
dst[i] = (unsigned char)r;
// added to allow checking the output values
outputFile << "index: " << i << " val: "<< r <<" \n";
}
outputFile.close();
}
///* Write output linear orderd*/
template<typename T>
int writeLinear(T *component_cuda, int pixWidth, int pixHeight,
const char * filename, const char * suffix)
{
unsigned char * result;
T *gpu_output;
int i;
int size;
int samplesNum = pixWidth*pixHeight;
size = samplesNum*sizeof(T);
cudaMallocHost((void **)&gpu_output, size);
cudaCheckError("Malloc host");
memset(gpu_output, 0, size);
result = (unsigned char *)malloc(samplesNum);
cudaMemcpy(gpu_output, component_cuda, size, cudaMemcpyDeviceToHost);
cudaCheckError("Memcopy device to host");
/* T to char */
samplesToChar(result, gpu_output, samplesNum, filename);
/* Write component */
char outfile[strlen(filename)+strlen(suffix)+1]; // +1 for the terminating '\0'
strcpy(outfile, filename);
strcpy(outfile+strlen(filename), suffix);
i = open(outfile, O_CREAT|O_WRONLY, 0644);
if (i == -1) {
error(0,errno,"cannot access %s", outfile);
return -1;
}
printf("\nWriting to %s (%d x %d)\n", outfile, pixWidth, pixHeight);
ssize_t x ;
x = write(i, result, samplesNum);
close(i);
/* Clean up */
cudaFreeHost(gpu_output);
cudaCheckError("Cuda free host memory");
free(result);
if(x == 0) return 1;
return 0;
}
template int writeLinear<float>(float *component_cuda, int pixWidth, int pixHeight, const char * filename, const char * suffix);
template int writeLinear<int>(int *component_cuda, int pixWidth, int pixHeight, const char * filename, const char * suffix);
/* Write output visual ordered */
template<typename T>
int writeNStage2DDWT(T *component_cuda, int pixWidth, int pixHeight,
int stages, const char * filename, const char * suffix)
{
struct band {
int dimX;
int dimY;
};
struct dimensions {
struct band LL;
struct band HL;
struct band LH;
struct band HH;
};
unsigned char * result;
T *src, *dst;
int i,s;
int size;
int offset;
int yOffset;
int samplesNum = pixWidth*pixHeight;
struct dimensions * bandDims;
bandDims = (struct dimensions *)malloc(stages * sizeof(struct dimensions));
bandDims[0].LL.dimX = DIVANDRND(pixWidth,2);
bandDims[0].LL.dimY = DIVANDRND(pixHeight,2);
bandDims[0].HL.dimX = pixWidth - bandDims[0].LL.dimX;
bandDims[0].HL.dimY = bandDims[0].LL.dimY;
bandDims[0].LH.dimX = bandDims[0].LL.dimX;
bandDims[0].LH.dimY = pixHeight - bandDims[0].LL.dimY;
bandDims[0].HH.dimX = bandDims[0].HL.dimX;
bandDims[0].HH.dimY = bandDims[0].LH.dimY;
for (i = 1; i < stages; i++) {
bandDims[i].LL.dimX = DIVANDRND(bandDims[i-1].LL.dimX,2);
bandDims[i].LL.dimY = DIVANDRND(bandDims[i-1].LL.dimY,2);
bandDims[i].HL.dimX = bandDims[i-1].LL.dimX - bandDims[i].LL.dimX;
bandDims[i].HL.dimY = bandDims[i].LL.dimY;
bandDims[i].LH.dimX = bandDims[i].LL.dimX;
bandDims[i].LH.dimY = bandDims[i-1].LL.dimY - bandDims[i].LL.dimY;
bandDims[i].HH.dimX = bandDims[i].HL.dimX;
bandDims[i].HH.dimY = bandDims[i].LH.dimY;
}
#if 0
printf("Original image pixWidth x pixHeight: %d x %d\n", pixWidth, pixHeight);
for (i = 0; i < stages; i++) {
printf("Stage %d: LL: pixWidth x pixHeight: %d x %d\n", i, bandDims[i].LL.dimX, bandDims[i].LL.dimY);
printf("Stage %d: HL: pixWidth x pixHeight: %d x %d\n", i, bandDims[i].HL.dimX, bandDims[i].HL.dimY);
printf("Stage %d: LH: pixWidth x pixHeight: %d x %d\n", i, bandDims[i].LH.dimX, bandDims[i].LH.dimY);
printf("Stage %d: HH: pixWidth x pixHeight: %d x %d\n", i, bandDims[i].HH.dimX, bandDims[i].HH.dimY);
}
#endif
size = samplesNum*sizeof(T);
cudaMallocHost((void **)&src, size);
cudaCheckError("Malloc host");
dst = (T*)malloc(size);
memset(src, 0, size);
memset(dst, 0, size);
result = (unsigned char *)malloc(samplesNum);
cudaMemcpy(src, component_cuda, size, cudaMemcpyDeviceToHost);
cudaCheckError("Memcopy device to host");
// LL Band
size = bandDims[stages-1].LL.dimX * sizeof(T);
for (i = 0; i < bandDims[stages-1].LL.dimY; i++) {
memcpy(dst+i*pixWidth, src+i*bandDims[stages-1].LL.dimX, size);
}
for (s = stages - 1; s >= 0; s--) {
// HL Band
size = bandDims[s].HL.dimX * sizeof(T);
offset = bandDims[s].LL.dimX * bandDims[s].LL.dimY;
for (i = 0; i < bandDims[s].HL.dimY; i++) {
memcpy(dst+i*pixWidth+bandDims[s].LL.dimX,
src+offset+i*bandDims[s].HL.dimX,
size);
}
// LH band
size = bandDims[s].LH.dimX * sizeof(T);
offset += bandDims[s].HL.dimX * bandDims[s].HL.dimY;
yOffset = bandDims[s].LL.dimY;
for (i = 0; i < bandDims[s].LH.dimY; i++) {
memcpy(dst+(yOffset+i)*pixWidth,
src+offset+i*bandDims[s].LH.dimX,
size);
}
//HH band
size = bandDims[s].HH.dimX * sizeof(T);
offset += bandDims[s].LH.dimX * bandDims[s].LH.dimY;
yOffset = bandDims[s].HL.dimY;
for (i = 0; i < bandDims[s].HH.dimY; i++) {
memcpy(dst+(yOffset+i)*pixWidth+bandDims[s].LH.dimX,
src+offset+i*bandDims[s].HH.dimX,
size);
}
}
/* Write component */
samplesToChar(result, dst, samplesNum, filename);
char outfile[strlen(filename)+strlen(suffix)+1]; // +1 for the terminating '\0'
strcpy(outfile, filename);
strcpy(outfile+strlen(filename), suffix);
i = open(outfile, O_CREAT|O_WRONLY, 0644);
if (i == -1) {
error(0,errno,"cannot access %s", outfile);
return -1;
}
printf("\nWriting to %s (%d x %d)\n", outfile, pixWidth, pixHeight);
ssize_t x;
x = write(i, result, samplesNum);
close(i);
cudaFreeHost(src);
cudaCheckError("Cuda free host memory");
free(dst);
free(result);
free(bandDims);
if (x == 0) return 1;
return 0;
}
template int writeNStage2DDWT<float>(float *component_cuda, int pixWidth, int pixHeight, int stages, const char * filename, const char * suffix);
template int writeNStage2DDWT<int>(int *component_cuda, int pixWidth, int pixHeight, int stages, const char * filename, const char * suffix);

@ -1,41 +0,0 @@
/*
* Copyright (c) 2009, Jiri Matela
* All rights reserved.
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions are met:
*
* * Redistributions of source code must retain the above copyright
* notice, this list of conditions and the following disclaimer.
* * Redistributions in binary form must reproduce the above copyright
* notice, this list of conditions and the following disclaimer in the
* documentation and/or other materials provided with the distribution.
*
* THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
* AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
* IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
* ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE
* LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
* CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
* SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
* INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
* CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
* ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
* POSSIBILITY OF SUCH DAMAGE.
*/
#ifndef _DWT_H
#define _DWT_H
template <typename T>
int nStage2dDWT(T *in, T *out, T *backup, int pixWidth, int pixHeight,
int stages, bool forward);
template <typename T>
int writeNStage2DDWT(T *component_cuda, int width, int height, int stages,
const char *filename, const char *suffix);
template <typename T>
int writeLinear(T *component_cuda, int width, int height, const char *filename,
const char *suffix);
#endif

@ -1,35 +0,0 @@
///
/// @file common.cu
/// @author Martin Jirman (207962@mail.muni.cz)
/// @date 2011-01-20 14:37
///
/// Copyright (c) 2011 Martin Jirman
/// All rights reserved.
///
/// Redistribution and use in source and binary forms, with or without
/// modification, are permitted provided that the following conditions are met:
///
/// * Redistributions of source code must retain the above copyright
/// notice, this list of conditions and the following disclaimer.
/// * Redistributions in binary form must reproduce the above copyright
/// notice, this list of conditions and the following disclaimer in the
/// documentation and/or other materials provided with the distribution.
///
/// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
/// AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
/// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
/// ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE
/// LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
/// CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
/// SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
/// INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
/// CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
/// ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
/// POSSIBILITY OF SUCH DAMAGE.
///
#include "common.h"
namespace dwt_cuda {
bool CudaDWTTester::testRunning = false;
}

@ -1,232 +0,0 @@
///
/// @file common.h
/// @author Martin Jirman (207962@mail.muni.cz)
/// @brief Common stuff for all CUDA dwt functions.
/// @date 2011-01-20 14:19
///
/// Copyright (c) 2011 Martin Jirman
/// All rights reserved.
///
/// Redistribution and use in source and binary forms, with or without
/// modification, are permitted provided that the following conditions are met:
///
/// * Redistributions of source code must retain the above copyright
/// notice, this list of conditions and the following disclaimer.
/// * Redistributions in binary form must reproduce the above copyright
/// notice, this list of conditions and the following disclaimer in the
/// documentation and/or other materials provided with the distribution.
///
/// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
/// AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
/// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
/// ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE
/// LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
/// CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
/// SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
/// INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
/// CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
/// ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
/// POSSIBILITY OF SUCH DAMAGE.
///
#ifndef DWT_COMMON_H
#define DWT_COMMON_H
#include <algorithm>
#include <cstdio>
#include <vector>
// compile time minimum macro
#define CTMIN(a, b) (((a) < (b)) ? (a) : (b))
// performance testing macros
#if defined(GPU_DWT_TESTING)
#define PERF_BEGIN \
{ \
dwt_cuda::CudaDWTTester PERF_TESTER; \
for (int PERF_N = PERF_TESTER.getNumIterations(); PERF_N--;) { \
PERF_TESTER.beginTestIteration();
#define PERF_END(PERF_NAME, PERF_W, PERF_H) \
PERF_TESTER.endTestIteration(); \
} \
PERF_TESTER.showPerformance(PERF_NAME, PERF_W, PERF_H); \
}
#else // GPU_DWT_TESTING
#define PERF_BEGIN
#define PERF_END(PERF_NAME, PERF_W, PERF_H)
#endif // GPU_DWT_TESTING
namespace dwt_cuda {
/// Divide and round up.
template <typename T>
__device__ __host__ inline T divRndUp(const T &n, const T &d) {
return (n / d) + ((n % d) ? 1 : 0);
}
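/// Example: divRndUp(10, 4) == 3 and divRndUp(8, 4) == 2.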
// 9/7 forward DWT lifting schema coefficients
const float f97Predict1 = -1.586134342; ///< forward 9/7 predict 1
const float f97Update1 = -0.05298011854; ///< forward 9/7 update 1
const float f97Predict2 = 0.8829110762; ///< forward 9/7 predict 2
const float f97Update2 = 0.4435068522; ///< forward 9/7 update 2
// 9/7 reverse DWT lifting schema coefficients
const float r97update2 = -f97Update2; ///< undo 9/7 update 2
const float r97predict2 = -f97Predict2; ///< undo 9/7 predict 2
const float r97update1 = -f97Update1; ///< undo 9/7 update 1
const float r97Predict1 = -f97Predict1; ///< undo 9/7 predict 1
// FDWT 9/7 scaling coefficients
const float scale97Mul = 1.23017410491400f;
const float scale97Div = 1.0 / scale97Mul;
// 5/3 forward DWT lifting schema coefficients
const float forward53Predict = -0.5f; /// forward 5/3 predict
const float forward53Update = 0.25f; /// forward 5/3 update
// 5/3 reverse DWT lifting schema coefficients
const float reverse53Update = -forward53Update; /// undo 5/3 update
const float reverse53Predict = -forward53Predict; /// undo 5/3 predict
/// Functor which adds scaled sum of neighbors to given central pixel.
struct AddScaledSum {
const float scale; // scale of neighbors
__device__ AddScaledSum(const float scale) : scale(scale) {}
__device__ void operator()(const float p, float &c, const float n) const {
// if(threadIdx.x == 0) {
// printf("scale %f, p %f c %f n %f , result: %f\n", scale, p, c, n,
// scale * (p + n) );
// }
c += scale * (p + n);
}
};
/// Returns index ranging from 0 to num threads, such that first half
/// of threads get even indices and others get odd indices. Each thread
/// gets different index.
/// Example: (for 8 threads) threadIdx.x: 0 1 2 3 4 5 6 7
/// parityIdx: 0 2 4 6 1 3 5 7
/// @tparam THREADS total count of participating threads
/// @return parity-separated index of thread
template <int THREADS> __device__ inline int parityIdx() {
return (threadIdx.x * 2) - (THREADS - 1) * (threadIdx.x / (THREADS / 2));
}
/// size of shared memory
#if defined(__CUDA_ARCH__) && (__CUDA_ARCH__ >= 200)
const int SHM_SIZE = 48 * 1024;
#else
const int SHM_SIZE = 16 * 1024;
#endif
/// Performance and return code tester.
class CudaDWTTester {
private:
static bool testRunning; ///< true if any test is currently running
cudaEvent_t beginEvent; ///< begin CUDA event
cudaEvent_t endEvent; ///< end CUDA event
std::vector<float> times; ///< collected times
const bool disabled; ///< true if this object is disabled
public:
/// Checks CUDA related error.
/// @param status return code to be checked
/// @param message message to be shown if there was an error
/// @return true if there was no error, false otherwise
static bool check(const cudaError_t &status, const char *message) {
#if defined(GPU_DWT_TESTING)
if ((!testRunning) && status != cudaSuccess) {
const char *errorString = cudaGetErrorString(status);
fprintf(stderr, "CUDA ERROR: '%s': %s\n", message, errorString);
fflush(stderr);
return false;
}
#endif // GPU_DWT_TESTING
return true;
}
/// Checks last kernel call for errors.
/// @param message description of the kernel call
/// @return true if there was no error, false otherwise
static bool checkLastKernelCall(const char *message) {
#if defined(GPU_DWT_TESTING)
return testRunning ? true : check(cudaThreadSynchronize(), message);
#else // GPU_DWT_TESTING
return true;
#endif // GPU_DWT_TESTING
}
/// Initializes DWT tester for time measurement
CudaDWTTester() : disabled(testRunning) {}
/// Gets the preferred number of iterations
int getNumIterations() { return disabled ? 1 : 31; }
/// Starts one test iteration.
void beginTestIteration() {
if (!disabled) {
cudaEventCreate(&beginEvent);
cudaEventCreate(&endEvent);
cudaEventRecord(beginEvent, 0);
testRunning = true;
}
}
/// Ends one test iteration.
void endTestIteration() {
if (!disabled) {
float time;
testRunning = false;
cudaEventRecord(endEvent, 0);
cudaEventSynchronize(endEvent);
cudaEventElapsedTime(&time, beginEvent, endEvent);
cudaEventDestroy(beginEvent);
cudaEventDestroy(endEvent);
times.push_back(time);
}
}
/// Shows brief info about all iterations.
/// @param name name of processing method
/// @param sizeX width of processed image
/// @param sizeY height of processed image
void showPerformance(const char *name, const int sizeX, const int sizeY) {
if (!disabled) {
// compute mean and median
std::sort(times.begin(), times.end());
double sum = 0;
for (int i = times.size(); i--;) {
sum += times[i];
}
const double median =
(times[times.size() / 2] + times[(times.size() - 1) / 2]) * 0.5f;
printf(" %s: %7.3f ms (mean) %7.3f ms (median) %7.3f ms (max) "
"(%d x %d)\n",
name, (sum / times.size()), median, times[times.size() - 1], sizeX,
sizeY);
}
}
};
/// Simple cudaMemcpy wrapped in performance tester.
/// @param dest destination bufer
/// @param src source buffer
/// @param sx width of copied image
/// @param sy height of copied image
template <typename T>
inline void memCopy(T *const dest, const T *const src, const size_t sx,
const size_t sy) {
cudaError_t status;
PERF_BEGIN
status = cudaMemcpy(dest, src, sx * sy * sizeof(T), cudaMemcpyDeviceToDevice);
PERF_END(" memcpy", sx, sy)
CudaDWTTester::check(status, "memcpy device > device");
}
} // end of namespace dwt_cuda
#endif // DWT_COMMON_H

@ -1,103 +0,0 @@
///
/// @file dwt.h
/// @author Martin Jirman (207962@mail.muni.cz)
/// @brief Entry points for CUDA implementation of 9/7 and 5/3 DWT.
/// @date 2011-01-20 11:41
///
///
///
/// Copyright (c) 2011 Martin Jirman
/// All rights reserved.
///
/// Redistribution and use in source and binary forms, with or without
/// modification, are permitted provided that the following conditions are met:
///
/// * Redistributions of source code must retain the above copyright
/// notice, this list of conditions and the following disclaimer.
/// * Redistributions in binary form must reproduce the above copyright
/// notice, this list of conditions and the following disclaimer in the
/// documentation and/or other materials provided with the distribution.
///
/// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
/// AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
/// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
/// ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE
/// LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
/// CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
/// SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
/// INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
/// CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
/// ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
/// POSSIBILITY OF SUCH DAMAGE.
///
///
///
/// Following conditions are common for all four DWT functions:
/// - Both input and output images are stored in GPU memory with no padding
/// of lines or interleaving of pixels.
/// - DWT coefficients are stored as follows: Each band is saved as one
/// consecutive chunk (no padding/stride/interleaving). Deepest level bands
/// (smallest ones) are stored first (at the beginning of the input/output
/// buffers), less deep bands follow. There is no padding between stored
/// bands in the buffer. Order of bands of the same level in the buffer is
/// following: Low-low band (or deeper level subbands) is stored first.
/// Vertical-low/horizontal-high band follows. Vertical-high/horizontal-low
/// band is saved next and finally, the high-high band is saved. Out of all
/// low-low bands, only the deepest one is saved (right at the beginning of
/// the buffer), others are replaced with deeper level subbands.
/// - Input images of all functions won't be preserved (will be overwritten).
/// - Input and output buffers can't overlap.
/// - Size of output buffer must be greater or equal to size of input buffer.
///
/// There are no common compile time settings (buffer size, etc...) for
/// all DWTs, because each DWT type needs a different amount of GPU resources.
/// Instead, each DWT type has its own compile time settings, which can be
/// found in *.cu file, where it is implemented.
///
#ifndef DWT_CUDA_H
#define DWT_CUDA_H
namespace dwt_cuda {
/// Forward 5/3 2D DWT. See common rules (above) for more details.
/// @param in Expected to be normalized into range [-128, 127].
/// Will not be preserved (will be overwritten).
/// @param out output buffer on GPU
/// @param sizeX width of input image (in pixels)
/// @param sizeY height of input image (in pixels)
/// @param levels number of recursive DWT levels
void fdwt53(int *in, int *out, int sizeX, int sizeY, int levels);
/// Reverse 5/3 2D DWT. See common rules (above) for more details.
/// @param in Input DWT coefficients. Format described in common rules.
/// Will not be preserved (will be overwritten).
/// @param out output buffer on GPU - will contain original image
/// in normalized range [-128, 127].
/// @param sizeX width of input image (in pixels)
/// @param sizeY height of input image (in pixels)
/// @param levels number of recursive DWT levels
void rdwt53(int *in, int *out, int sizeX, int sizeY, int levels);
/// Forward 9/7 2D DWT. See common rules (above) for more details.
/// @param in Input DWT coefficients. Should be normalized (in range
/// [-0.5, 0.5]). Will not be preserved (will be overwritten).
/// @param out output buffer on GPU - format specified in common rules
/// @param sizeX width of input image (in pixels)
/// @param sizeY height of input image (in pixels)
/// @param levels number of recursive DWT levels
void fdwt97(float *in, float *out, int sizeX, int sizeY, int levels);
/// Reverse 9/7 2D DWT. See common rules (above) for more details.
/// @param in Input DWT coefficients. Format described in common rules.
/// Will not be preserved (will be overwritten).
/// @param out output buffer on GPU - will contain original image
/// in normalized range [-0.5, 0.5].
/// @param sizeX width of input image (in pixels)
/// @param sizeY height of input image (in pixels)
/// @param levels number of recursive DWT levels
void rdwt97(float *in, float *out, int sizeX, int sizeY, int levels);
} // namespace dwt_cuda
#endif // DWT_CUDA_H

@ -1,400 +0,0 @@
/// @file fdwt53.cu
/// @brief CUDA implementation of forward 5/3 2D DWT.
/// @author Martin Jirman (207962@mail.muni.cz)
/// @date 2011-02-04 13:23
///
///
/// Copyright (c) 2011 Martin Jirman
/// All rights reserved.
///
/// Redistribution and use in source and binary forms, with or without
/// modification, are permitted provided that the following conditions are met:
///
/// * Redistributions of source code must retain the above copyright
/// notice, this list of conditions and the following disclaimer.
/// * Redistributions in binary form must reproduce the above copyright
/// notice, this list of conditions and the following disclaimer in the
/// documentation and/or other materials provided with the distribution.
///
/// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
/// AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
/// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
/// ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE
/// LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
/// CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
/// SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
/// INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
/// CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
/// ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
/// POSSIBILITY OF SUCH DAMAGE.
///
#include "common.h"
#include "transform_buffer.h"
#include "io.h"
namespace dwt_cuda {
/// Wraps buffer and methods needed for computing one level of 5/3 FDWT
/// using sliding window approach.
/// @tparam WIN_SIZE_X width of sliding window
/// @tparam WIN_SIZE_Y height of sliding window
template <int WIN_SIZE_X, int WIN_SIZE_Y>
class FDWT53 {
private:
/// Info needed for processing of one input column.
/// @tparam CHECKED_LOADER true if column's loader should check boundaries
/// false if there are no near boundaries to check
template <bool CHECKED_LOADER>
struct FDWT53Column {
/// loader for the column
VerticalDWTPixelLoader<int, CHECKED_LOADER> loader;
/// offset of the column in shared buffer
int offset;
// backup of first 3 loaded pixels (not transformed)
int pixel0, pixel1, pixel2;
/// Sets all fields to anything to prevent 'uninitialized' warnings.
__device__ void clear() {
offset = pixel0 = pixel1 = pixel2 = 0;
loader.clear();
}
};
/// Type of shared memory buffer for 5/3 FDWT transforms.
typedef TransformBuffer<int, WIN_SIZE_X, WIN_SIZE_Y + 3, 2> FDWT53Buffer;
/// Actual shared buffer used for forward 5/3 DWT.
FDWT53Buffer buffer;
/// Difference between indices of two vertical neighbors in buffer.
enum { STRIDE = FDWT53Buffer::VERTICAL_STRIDE };
/// Forward 5/3 DWT predict operation.
struct Forward53Predict {
__device__ void operator() (const int p, int & c, const int n) const {
c -= (p + n) / 2; // F.8, page 126, ITU-T Rec. T.800 final draft
}
};
/// Forward 5/3 DWT update operation.
struct Forward53Update {
__device__ void operator() (const int p, int & c, const int n) const {
c += (p + n + 2) / 4; // F.9, page 126, ITU-T Rec. T.800 final draft
}
};
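// Illustrative note (not from the original source): applied to the small row
// 10 20 30 40 with symmetric extension, the two lifting steps above give
//   predict:  d0 = 20 - (10+30)/2 = 0,    d1 = 40 - (30+30)/2 = 10
//   update:   s0 = 10 + (0+0+2)/4 = 10,   s1 = 30 + (0+10+2)/4 = 33
// i.e. lowpass band [10, 33] and highpass band [0, 10]. (All values here are
// nonnegative, so C integer division matches the floor of the standard.)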
/// Initializes one column: computes offset of the column in shared memory
/// buffer, initializes loader and finally uses it to load first 3 pixels.
/// @tparam CHECKED true if loader of the column checks boundaries
/// @param column (uninitialized) column info to be initialized
/// @param input input image
/// @param sizeX width of the input image
/// @param sizeY height of the input image
/// @param colIndex x-axis coordinate of the column (relative to the left
/// side of this threadblock's block of input pixels)
/// @param firstY y-axis coordinate of first image row to be transformed
template <bool CHECKED>
__device__ void initColumn(FDWT53Column<CHECKED> & column,
const int * const input,
const int sizeX, const int sizeY,
const int colIndex, const int firstY) {
// get offset of the column with index 'cId'
column.offset = buffer.getColumnOffset(colIndex);
// coordinates of the first pixel to be loaded
const int firstX = blockIdx.x * WIN_SIZE_X + colIndex;
if(blockIdx.y == 0) {
// topmost block - apply mirroring rules when loading first 3 rows
column.loader.init(sizeX, sizeY, firstX, firstY);
// load pixels in mirrored way
column.pixel2 = column.loader.loadFrom(input); // loaded pixel #0
column.pixel1 = column.loader.loadFrom(input); // loaded pixel #1
column.pixel0 = column.loader.loadFrom(input); // loaded pixel #2
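// (Illustrative note: for the topmost block, pixel0..pixel2 now hold image
//  rows 2, 1 and 0, i.e. rows -2 and -1 are represented by their mirrors.)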
// reinitialize loader to start with pixel #1 again
column.loader.init(sizeX, sizeY, firstX, firstY + 1);
} else {
// non-topmost row - regular loading:
column.loader.init(sizeX, sizeY, firstX, firstY - 2);
// load 3 rows into the column
column.pixel0 = column.loader.loadFrom(input);
column.pixel1 = column.loader.loadFrom(input);
column.pixel2 = column.loader.loadFrom(input);
// Now, the next pixel, which will be loaded by loader, is pixel #1.
}
}
/// Loads and vertically transforms given column. Assumes that first 3
/// pixels are already loaded in column fields pixel0 ... pixel2.
/// @tparam CHECKED true if loader of the column checks boundaries
/// @param column column to be loaded and vertically transformed
/// @param input pointer to input image data
template <bool CHECKED>
__device__ void loadAndVerticallyTransform(FDWT53Column<CHECKED> & column,
const int * const input) {
// take 3 loaded pixels and put them into shared memory transform buffer
buffer[column.offset + 0 * STRIDE] = column.pixel0;
buffer[column.offset + 1 * STRIDE] = column.pixel1;
buffer[column.offset + 2 * STRIDE] = column.pixel2;
// load remaining pixels to be able to vertically transform the window
for(int i = 3; i < (3 + WIN_SIZE_Y); i++)
{
buffer[column.offset + i * STRIDE] = column.loader.loadFrom(input);
}
// remember last 3 pixels for use in next iteration
column.pixel0 = buffer[column.offset + (WIN_SIZE_Y + 0) * STRIDE];
column.pixel1 = buffer[column.offset + (WIN_SIZE_Y + 1) * STRIDE];
column.pixel2 = buffer[column.offset + (WIN_SIZE_Y + 2) * STRIDE];
// vertically transform the column in transform buffer
buffer.forEachVerticalOdd(column.offset, Forward53Predict());
buffer.forEachVerticalEven(column.offset, Forward53Update());
}
/// Actual implementation of 5/3 FDWT.
/// @tparam CHECK_LOADS true if input loader must check boundaries
/// @tparam CHECK_WRITES true if output writer must check boundaries
/// @param in input image
/// @param out output buffer
/// @param sizeX width of the input image
/// @param sizeY height of the input image
/// @param winSteps number of sliding window steps
template <bool CHECK_LOADS, bool CHECK_WRITES>
__device__ void transform(const int * const in, int * const out,
const int sizeX, const int sizeY,
const int winSteps) {
// info about one main and one boundary column processed by this thread
FDWT53Column<CHECK_LOADS> column;
FDWT53Column<CHECK_LOADS> boundaryColumn; // only a few threads use this
// Initialize all column info: initialize loaders, compute offset of
// column in shared buffer and initialize loader of column.
const int firstY = blockIdx.y * WIN_SIZE_Y * winSteps;
initColumn(column, in, sizeX, sizeY, threadIdx.x, firstY); //has been checked Mar 9th
// first 3 threads initialize boundary columns, others do not use them
boundaryColumn.clear();
if(threadIdx.x < 3) {
// index of boundary column (relative x-axis coordinate of the column)
const int colId = threadIdx.x + ((threadIdx.x == 0) ? WIN_SIZE_X : -3);
// initialize the column
initColumn(boundaryColumn, in, sizeX, sizeY, colId, firstY);
}
// index of column which will be written into output by this thread
const int outColumnIndex = parityIdx<WIN_SIZE_X>();
// offset of column which will be written by this thread into output
const int outColumnOffset = buffer.getColumnOffset(outColumnIndex);
// initialize output writer for this thread
const int outputFirstX = blockIdx.x * WIN_SIZE_X + outColumnIndex;
VerticalDWTBandWriter<int, CHECK_WRITES> writer;
writer.init(sizeX, sizeY, outputFirstX, firstY);
__syncthreads();
// Sliding window iterations:
// Each iteration assumes that first 3 pixels of each column are loaded.
for(int w = 0; w < winSteps; w++) {
// For each column (including boundary columns): load and vertically
// transform another WIN_SIZE_Y lines.
loadAndVerticallyTransform(column, in);
if(threadIdx.x < 3) {
loadAndVerticallyTransform(boundaryColumn, in);
}
// wait for all columns to be vertically transformed and transform all
// output rows horizontally
__syncthreads();
buffer.forEachHorizontalOdd(2, WIN_SIZE_Y, Forward53Predict());
__syncthreads();
buffer.forEachHorizontalEven(2, WIN_SIZE_Y, Forward53Update());
// wait for all output rows to be transformed horizontally and write
// them into output buffer
__syncthreads();
for(int r = 2; r < (2 + WIN_SIZE_Y); r += 2) {
// Write low coefficients from output column into low band ...
writer.writeLowInto(out, buffer[outColumnOffset + r * STRIDE]);
// ... and high coefficients into the high band.
writer.writeHighInto(out, buffer[outColumnOffset + (r+1) * STRIDE]);
}
// before proceeding to next iteration, wait for all output columns
// to be written into the output
__syncthreads();
}
}
public:
/// Determines whether this block's pixels touch the image boundary and
/// selects the right version of the algorithm accordingly - for many
/// threadblocks, it selects the version which does not deal with boundary
/// mirroring and is thus slightly faster.
/// @param in input image
/// @param out output buffer
/// @param sx width of the input image
/// @param sy height of the input image
/// @param steps number of sliding window steps
__device__ static void run(const int * const in, int * const out,
const int sx, const int sy, const int steps) {
// if(blockIdx.x==0 && blockIdx.y ==11 && threadIdx.x >=0&&threadIdx.x <64){
// object with transform buffer in shared memory
__shared__ FDWT53<WIN_SIZE_X, WIN_SIZE_Y> fdwt53;
// Compute limits of this threadblock's block of pixels and use them to
// determine, whether this threadblock will have to deal with boundary.
// (1 in next expressions is for radius of impulse response of 5/3 FDWT.)
const int maxX = (blockIdx.x + 1) * WIN_SIZE_X + 1;
const int maxY = (blockIdx.y + 1) * WIN_SIZE_Y * steps + 1;
const bool atRightBoudary = maxX >= sx;
const bool atBottomBoudary = maxY >= sy;
// Select specialized version of code according to distance of this
// threadblock's pixels from image boundary.
// if(threadIdx.x == 0) {
// printf("fdwt53 run");
// }
if(atBottomBoudary)
{
// near bottom boundary => check both writing and reading
fdwt53.transform<true, true>(in, out, sx, sy, steps);
} else if(atRightBoudary)
{
// near right boundary only => check writing only
fdwt53.transform<false, true>(in, out, sx, sy, steps);
} else
{
// no nearby boundary => check nothing
fdwt53.transform<false, false>(in, out, sx, sy, steps);
}
}
// }
}; // end of class FDWT53
/// Main GPU 5/3 FDWT entry point.
/// @tparam WIN_SX width of sliding window to be used
/// @tparam WIN_SY height of sliding window to be used
/// @param input input image
/// @param output output buffer
/// @param sizeX width of the input image
/// @param sizeY height of the input image
/// @param winSteps number of sliding window steps
template <int WIN_SX, int WIN_SY>
__launch_bounds__(WIN_SX, CTMIN(SHM_SIZE/sizeof(FDWT53<WIN_SX, WIN_SY>), 8))
__global__ void fdwt53Kernel(const int * const input, int * const output,
const int sizeX, const int sizeY,
const int winSteps) {
FDWT53<WIN_SX, WIN_SY>::run(input, output, sizeX, sizeY, winSteps);
}
/// Computes the optimal number of sliding window steps and the number of
/// threadblocks, and then launches the 5/3 FDWT kernel.
/// @tparam WIN_SX width of sliding window
/// @tparam WIN_SY height of sliding window
/// @param in input image
/// @param out output buffer
/// @param sx width of the input image
/// @param sy height of the input image
template <int WIN_SX, int WIN_SY>
void launchFDWT53Kernel (int * in, int * out, int sx, int sy) {
// compute optimal number of steps of each sliding window
const int steps = divRndUp(sy, 15 * WIN_SY);
int gx = divRndUp(sx, WIN_SX);
int gy = divRndUp(sy, WIN_SY * steps);
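// (Illustrative note: e.g. for sy = 1080 and WIN_SY = 8,
//  steps = divRndUp(1080, 120) = 9, so each threadblock column covers
//  9 * 8 = 72 rows and gy = divRndUp(1080, 72) = 15; with sx = 1920 and
//  WIN_SX = 192, gx = divRndUp(1920, 192) = 10.)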
printf("\n sliding steps = %d , gx = %d , gy = %d \n", steps, gx, gy);
// prepare grid size
dim3 gSize(divRndUp(sx, WIN_SX), divRndUp(sy, WIN_SY * steps));
// printf("\n globalx=%d, globaly=%d, blocksize=%d\n", gSize.x, gSize.y, WIN_SX);
// run kernel, possibly measure time and finally check the call
// PERF_BEGIN
fdwt53Kernel<WIN_SX, WIN_SY><<<gSize, WIN_SX>>>(in, out, sx, sy, steps);
// PERF_END(" FDWT53", sx, sy)
// CudaDWTTester::checkLastKernelCall("FDWT 5/3 kernel");
printf("fdwt53Kernel in launchFDWT53Kernel has finished");
}
/// Forward 5/3 2D DWT. See common rules (above) for more details.
/// @param in Expected to be normalized into range [-128, 127].
/// Will not be preserved (will be overwritten).
/// @param out output buffer on GPU
/// @param sizeX width of input image (in pixels)
/// @param sizeY height of input image (in pixels)
/// @param levels number of recursive DWT levels
void fdwt53(int * in, int * out, int sizeX, int sizeY, int levels) {
// select right width of kernel for the size of the image
if(sizeX >= 960) {
launchFDWT53Kernel<192, 8>(in, out, sizeX, sizeY);
} else if (sizeX >= 480) {
launchFDWT53Kernel<128, 8>(in, out, sizeX, sizeY);
} else {
launchFDWT53Kernel<64, 8>(in, out, sizeX, sizeY);
}
// if this was not the last level, continue recursively with other levels
if(levels > 1) {
// copy output's LL band back into input buffer
const int llSizeX = divRndUp(sizeX, 2);
const int llSizeY = divRndUp(sizeY, 2);
// printf("\n llSizeX = %d , llSizeY = %d \n", llSizeX, llSizeY);
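// (Illustrative note: e.g. a 1920x1080 input yields a 960x540 LL band,
//  which becomes the input of the next recursion level below.)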
memCopy(in, out, llSizeX, llSizeY); //the function memCopy in cuda_dwt/common.h line 238
// run remaining levels of FDWT
fdwt53(in, out, llSizeX, llSizeY, levels - 1);
}
}
} // end of namespace dwt_cuda

View File

@@ -1,383 +0,0 @@
///
/// @file fdwt97.cu
/// @brief CUDA implementation of forward 9/7 2D DWT.
/// @author Martin Jirman (207962@mail.muni.cz)
/// @date 2011-01-20 13:18
///
///
/// Copyright (c) 2011 Martin Jirman
/// All rights reserved.
///
/// Redistribution and use in source and binary forms, with or without
/// modification, are permitted provided that the following conditions are met:
///
/// * Redistributions of source code must retain the above copyright
/// notice, this list of conditions and the following disclaimer.
/// * Redistributions in binary form must reproduce the above copyright
/// notice, this list of conditions and the following disclaimer in the
/// documentation and/or other materials provided with the distribution.
///
/// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
/// AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
/// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
/// ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE
/// LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
/// CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
/// SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
/// INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
/// CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
/// ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
/// POSSIBILITY OF SUCH DAMAGE.
///
#include "common.h"
#include "transform_buffer.h"
#include "io.h"
namespace dwt_cuda {
/// Wraps a buffer and methods for computing 9/7 FDWT with sliding window
/// of specified size. Template arguments specify this size.
/// @tparam WIN_SIZE_X width of sliding window
/// @tparam WIN_SIZE_Y height of sliding window
template <int WIN_SIZE_X, int WIN_SIZE_Y>
class FDWT97 {
private:
/// Type of shared memory buffer used for 9/7 DWT.
typedef TransformBuffer<float, WIN_SIZE_X, WIN_SIZE_Y + 7, 4> FDWT97Buffer;
/// Actual shared buffer used for forward 9/7 DWT.
FDWT97Buffer buffer;
/// Difference of indices of two vertically neighboring items in buffer.
enum { STRIDE = FDWT97Buffer::VERTICAL_STRIDE };
/// One thread's info about loading input image
/// @tparam CHECKED true if loader should check for image boundaries
template <bool CHECKED>
struct FDWT97ColumnLoadingInfo {
/// Loader of pixels from some input image.
VerticalDWTPixelLoader<float, CHECKED> loader;
/// Offset of column loaded by loader. (Offset in shared buffer.)
int offset;
};
/// Horizontal 9/7 FDWT on specified lines of transform buffer.
/// @param lines number of lines to be transformed
/// @param firstLine index of the first line to be transformed
__device__ void horizontalFDWT97(const int lines, const int firstLine) {
__syncthreads();
buffer.forEachHorizontalOdd(firstLine, lines, AddScaledSum(f97Predict1));
__syncthreads();
buffer.forEachHorizontalEven(firstLine, lines, AddScaledSum(f97Update1));
__syncthreads();
buffer.forEachHorizontalOdd(firstLine, lines, AddScaledSum(f97Predict2));
__syncthreads();
buffer.forEachHorizontalEven(firstLine, lines, AddScaledSum(f97Update2));
__syncthreads();
buffer.scaleHorizontal(scale97Div, scale97Mul, firstLine, lines);
__syncthreads();
}
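// Illustrative note (not from the original source): the sequence above is the
// standard CDF 9/7 lifting factorization - two predict/update passes followed
// by a scaling pass. The usual lifting constants are approximately
// -1.58613, -0.05298, 0.88291 and 0.44351, with scaling constant ~1.23017
// (and its reciprocal); the exact values referenced here (f97Predict1 etc.)
// are presumably defined in common.h.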
/// Initializes one column of shared transform buffer with 7 input pixels.
/// Those 7 pixels will not be transformed. Also initializes given loader.
/// @tparam CHECKED true if loader should check for image boundaries
/// @param column (uninitialized) object for loading input pixels
/// @param columnIndex index (not offset!) of the column to be loaded
/// (relative to threadblock's first column)
/// @param input pointer to input image in GPU memory
/// @param sizeX width of the input image
/// @param sizeY height of the input image
/// @param firstY index of first row to be loaded from image
template <bool CHECKED>
__device__ void initColumn(FDWT97ColumnLoadingInfo<CHECKED> & column,
const int columnIndex, const float * const input,
const int sizeX, const int sizeY,
const int firstY) {
// get offset of the column with index 'columnIndex'
column.offset = buffer.getColumnOffset(columnIndex);
// printf(" offset: %d , threadIdx: %d, blockIdx.y %d\n ", column.offset, threadIdx.x, blockIdx.y);
// x-coordinate of the first pixel to be loaded by given loader
const int firstX = blockIdx.x * WIN_SIZE_X + columnIndex;
if(blockIdx.y == 0) {
// topmost block - apply mirroring rules when loading first 7 rows
column.loader.init(sizeX, sizeY, firstX, firstY);
// load pixels in mirrored way
buffer[column.offset + 4 * STRIDE] = column.loader.loadFrom(input);
buffer[column.offset + 3 * STRIDE] =
buffer[column.offset + 5 * STRIDE] = column.loader.loadFrom(input);
buffer[column.offset + 2 * STRIDE] =
buffer[column.offset + 6 * STRIDE] = column.loader.loadFrom(input);
buffer[column.offset + 1 * STRIDE] = column.loader.loadFrom(input);
buffer[column.offset + 0 * STRIDE] = column.loader.loadFrom(input);
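// (Illustrative note: buffer rows 0..6 of this column now hold image rows
//  4, 3, 2, 1, 0, 1, 2 - i.e. rows -4..-1 are represented by their mirrors.)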
// reinitialize loader to start with pixel #3 again
column.loader.init(sizeX, sizeY, firstX, firstY + 3);
} else {
// non-topmost row - regular loading:
column.loader.init(sizeX, sizeY, firstX, firstY - 4);
// load 7 rows into the transform buffer
for(int i = 0; i < 7; i++) {
buffer[column.offset + i * STRIDE] = column.loader.loadFrom(input);
}
}
// Now, the next pixel, which will be loaded by loader, is pixel #3.
}
/// Loads another WIN_SIZE_Y pixels into given column using given loader.
/// @tparam CHECKED true if loader should check for image boundaries
/// @param input input image to load from
/// @param column loader and offset of loaded column in shared buffer
template <bool CHECKED>
inline __device__ void loadWindowIntoColumn(const float * const input,
FDWT97ColumnLoadingInfo<CHECKED> & column) {
for(int i = 7; i < (7 + WIN_SIZE_Y); i++) {
buffer[column.offset + i * STRIDE] = column.loader.loadFrom(input);
}
}
/// Actual implementation of the 9/7 FDWT.
/// @tparam CHECK_LOADS true if boundaries should be checked when loading
/// @tparam CHECK_WRITES true if boundaries should be checked when writing
/// @param in input image
/// @param out output buffer
/// @param sizeX width of the input image
/// @param sizeY height of the input image
/// @param winSteps number of steps of sliding window
template <bool CHECK_LOADS, bool CHECK_WRITES>
__device__ void transform(const float * const in, float * const out,
const int sizeX, const int sizeY,
const int winSteps) {
// info about columns loaded by this thread: one main column and possibly
// one boundary column. (Only some threads load some boundary column.)
FDWT97ColumnLoadingInfo<CHECK_LOADS> loadedColumn;
FDWT97ColumnLoadingInfo<CHECK_LOADS> boundaryColumn;
// Initialize first 7 lines of transform buffer.
const int firstY = blockIdx.y * WIN_SIZE_Y * winSteps;
initColumn(loadedColumn, threadIdx.x, in, sizeX, sizeY, firstY);
// Some threads initialize boundary columns.
boundaryColumn.offset = 0;
boundaryColumn.loader.clear();
if(threadIdx.x < 7) {
// each thread among first 7 ones gets index of one of boundary columns
const int colId = threadIdx.x + ((threadIdx.x < 3) ? WIN_SIZE_X : -7);
// Thread initializes offset of the boundary column (in shared buffer),
// first 7 pixels of the column and a loader for this column.
initColumn(boundaryColumn, colId, in, sizeX, sizeY, firstY);
}
// horizontally transform first 7 rows in all columns
horizontalFDWT97(7, 0);
// Index of column handled by this thread. (First half of threads handle
// even columns and others handle odd columns.)
const int outColumnIndex = parityIdx<WIN_SIZE_X>();
// writer of output linear bands - initialize it
const int firstX = blockIdx.x * WIN_SIZE_X + outColumnIndex;
VerticalDWTBandWriter<float, CHECK_WRITES> writer;
writer.init(sizeX, sizeY, firstX, firstY);
// transform buffer offset of column transformed and saved by this thread
const int outColumnOffset = buffer.getColumnOffset(outColumnIndex);
// (Each iteration of this loop assumes that first 7 rows of transform
// buffer are already loaded with horizontally transformed coefficients.)
for(int w = 0; w < winSteps; w++) {
// Load another WIN_SIZE_Y lines of thread's column into the buffer.
loadWindowIntoColumn(in, loadedColumn);
// some threads also load boundary columns
if(threadIdx.x < 7) {
loadWindowIntoColumn(in, boundaryColumn);
}
// horizontally transform all newly loaded lines
horizontalFDWT97(WIN_SIZE_Y, 7);
// Using 7 registers, remember current values of last 7 rows of
// transform buffer. These rows are transformed horizontally only
// and will be used in next iteration.
float last7Lines[7];
for(int i = 0; i < 7; i++) {
last7Lines[i] = buffer[outColumnOffset + (WIN_SIZE_Y + i) * STRIDE];
}
// vertically transform all central columns (do not scale yet)
buffer.forEachVerticalOdd(outColumnOffset, AddScaledSum(f97Predict1));
buffer.forEachVerticalEven(outColumnOffset, AddScaledSum(f97Update1));
buffer.forEachVerticalOdd(outColumnOffset, AddScaledSum(f97Predict2));
buffer.forEachVerticalEven(outColumnOffset, AddScaledSum(f97Update2));
// Save all results of current window. Results are in transform buffer
// at rows from #4 to #(4 + WIN_SIZE_Y). Other rows are invalid now.
// (They only served as a boundary for vertical FDWT.)
for(int i = 4; i < (4 + WIN_SIZE_Y); i += 2) {
const int index = outColumnOffset + i * STRIDE;
// Write low coefficients from column into low band ...
writer.writeLowInto(out, buffer[index] * scale97Div);
// ... and high coefficients into the high band.
writer.writeHighInto(out, buffer[index + STRIDE] * scale97Mul);
}
// Use last 7 remembered lines as first 7 lines for next iteration.
// As expected, these lines are already horizontally transformed.
for(int i = 0; i < 7; i++) {
buffer[outColumnOffset + i * STRIDE] = last7Lines[i];
}
// Wait for all writing threads before proceeding to loading new
// pixels in next iteration. (Not to overwrite those which
// are not written yet.)
__syncthreads();
}
}
public:
/// Runs one of the specialized variants of the 9/7 FDWT according to the
/// distance of the processed pixels from the image boundary. Some variants
/// do not check for the boundary and are thus slightly faster.
/// @param input input image
/// @param output output buffer
/// @param sx width of the input image
/// @param sy height of the input image
/// @param steps number of steps of sliding window
__device__ static void run(const float * const input, float * const output,
const int sx, const int sy, const int steps) {
// object with transform buffer in shared memory
__shared__ FDWT97<WIN_SIZE_X, WIN_SIZE_Y> fdwt97;
// Compute limits of this threadblock's block of pixels and use them to
// determine, whether this threadblock will have to deal with boundary.
// (3 in next expressions is for radius of impulse response of 9/7 FDWT.)
const int maxX = (blockIdx.x + 1) * WIN_SIZE_X + 3;
const int maxY = (blockIdx.y + 1) * WIN_SIZE_Y * steps + 3;
const bool atRightBoudary = maxX >= sx;
const bool atBottomBoudary = maxY >= sy;
// Select specialized version of code according to distance of this
// threadblock's pixels from image boundary.
if(atBottomBoudary) {
// near bottom boundary => check both writing and reading
// printf("\n atBottomBoudary \n ");
fdwt97.transform<true, true>(input, output, sx, sy, steps);
} else if(atRightBoudary) {
// near right boundary only => check writing only
fdwt97.transform<false, true>(input, output, sx, sy, steps);
} else {
// no nearby boundary => check nothing
fdwt97.transform<false, false>(input, output, sx, sy, steps);
}
}
}; // end of class FDWT97
/// Main GPU 9/7 FDWT entry point.
/// @param input input image
/// @param output output buffer
/// @param sx width of the input image
/// @param sy height of the input image
/// @param steps number of steps of sliding window
template <int WIN_SX, int WIN_SY>
__launch_bounds__(WIN_SX, CTMIN(SHM_SIZE/sizeof(FDWT97<WIN_SX, WIN_SY>), 8))
__global__ void fdwt97Kernel(const float * const input, float * const output,
const int sx, const int sy, const int steps) {
// Excuse me, dear reader of this code - this call has to be here. If you
// try to simply put the contents of the following method right here, the CUDA
// compiler (version 3.2) will spit out tons of nonsensical error messages ...
// Hope they will not break it even more in future releases.
FDWT97<WIN_SX, WIN_SY>::run(input, output, sx, sy, steps);
}
/// Computes the optimal number of sliding window steps and the number of
/// threadblocks, and then launches the 9/7 FDWT kernel.
/// @tparam WIN_SX width of sliding window
/// @tparam WIN_SY height of sliding window
/// @param in input image
/// @param out output buffer
/// @param sx width of the input image
/// @param sy height of the input image
template <int WIN_SX, int WIN_SY>
void launchFDWT97Kernel (float * in, float * out, int sx, int sy) {
// compute optimal number of steps of each sliding window
const int steps = divRndUp(sy, 15 * WIN_SY);
// prepare grid size
dim3 gSize(divRndUp(sx, WIN_SX), divRndUp(sy, WIN_SY * steps));
printf("\n globalx=%d, globaly=%d, blocksize=%d\n", gSize.x, gSize.y, WIN_SX);
// run kernel, possibly measure time and finally check the call
PERF_BEGIN
fdwt97Kernel<WIN_SX, WIN_SY><<<gSize, WIN_SX>>>(in, out, sx, sy, steps);
PERF_END(" FDWT97", sx, sy)
CudaDWTTester::checkLastKernelCall("FDWT 9/7 kernel");
}
/// Forward 9/7 2D DWT. See common rules (dwt.h) for more details.
/// @param in Input image. Should be normalized (in range
/// [-0.5, 0.5]). Will not be preserved (will be overwritten).
/// @param out output buffer on GPU - format specified in common rules
/// @param sizeX width of input image (in pixels)
/// @param sizeY height of input image (in pixels)
/// @param levels number of recursive DWT levels
void fdwt97(float * in, float * out, int sizeX, int sizeY, int levels) {
// select right width of kernel for the size of the image
if(sizeX >= 960) {
launchFDWT97Kernel<192, 8>(in, out, sizeX, sizeY);
} else if (sizeX >= 480) {
launchFDWT97Kernel<128, 6>(in, out, sizeX, sizeY);
} else {
launchFDWT97Kernel<64, 6>(in, out, sizeX, sizeY);
}
// if this was not the last level, continue recursively with other levels
if(levels > 1) {
// copy output's LL band back into input buffer
const int llSizeX = divRndUp(sizeX, 2);
const int llSizeY = divRndUp(sizeY, 2);
memCopy(in, out, llSizeX, llSizeY);
// run remaining levels of FDWT
fdwt97(in, out, llSizeX, llSizeY, levels - 1);
}
}
} // end of namespace dwt_cuda

View File

@@ -1,440 +0,0 @@
///
/// @file: io.h
/// @brief Manages loading and saving linearly stored bands and input images.
/// @author Martin Jirman (207962@mail.muni.cz)
/// @date 2011-01-20 22:38
///
///
/// Copyright (c) 2011 Martin Jirman
/// All rights reserved.
///
/// Redistribution and use in source and binary forms, with or without
/// modification, are permitted provided that the following conditions are met:
///
/// * Redistributions of source code must retain the above copyright
/// notice, this list of conditions and the following disclaimer.
/// * Redistributions in binary form must reproduce the above copyright
/// notice, this list of conditions and the following disclaimer in the
/// documentation and/or other materials provided with the distribution.
///
/// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
/// AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
/// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
/// ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE
/// LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
/// CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
/// SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
/// INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
/// CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
/// ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
/// POSSIBILITY OF SUCH DAMAGE.
///
#ifndef IO_H
#define IO_H
#include "common.h"
namespace dwt_cuda {
/// Base for all IO classes - manages mirroring.
class DWTIO {
protected:
/// Handles mirroring of image at edges in a DWT correct way.
/// @param d a position in the image (will be replaced by mirrored d)
/// @param sizeD size of the image along the dimension of 'd'
__device__ static void mirror(int &d, const int &sizeD) {
// TODO: enable multiple mirroring:
// if(sizeD > 1) {
// if(d < 0) {
// const int underflow = -1 - d;
// const int phase = (underflow / (sizeD - 1)) & 1;
// const int remainder = underflow % (sizeD - 1);
// if(phase == 0) {
// d = remainder + 1;
// } else {
// d = sizeD - 2 - remainder;
// }
// } else if(d >= sizeD) {
// const int overflow = d - sizeD;
// const int phase = (overflow / (sizeD - 1)) & 1;
// const int remainder = overflow % (sizeD - 1);
// if(phase == 0) {
// d = sizeD - 2 - remainder;
// } else {
// d = remainder + 1;
// }
// }
// } else {
// d = 0;
// }
// single mirroring is used for now (see the TODO above for full multiple mirroring)
if (d >= sizeD) {
d = 2 * sizeD - 2 - d;
} else if (d < 0) {
d = -d;
}
}
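// (Illustrative note: with sizeD == 8, the mirroring above maps d == 9 to 5,
//  d == 8 to 6, d == -1 to 1 and d == -2 to 2, i.e. symmetric extension
//  without repeating the edge sample.)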
};
/// Base class for pixel loader and writer - manages computing start index,
/// stride and end of image for loading column of pixels.
/// @tparam T type of image pixels
/// @tparam CHECKED true = be prepared to image boundary, false = don't care
template <typename T, bool CHECKED> class VerticalDWTPixelIO : protected DWTIO {
protected:
int end; ///< index of bottom neighbor of last pixel of column
int stride; ///< increment of pointer to get to next pixel
/// Initializes pixel IO - sets end index and a position of first pixel.
/// @param sizeX width of the image
/// @param sizeY height of the image
/// @param firstX x-coordinate of first pixel to use
/// @param firstY y-coordinate of first pixel to use
/// @return index of pixel at position [x, y] in the image
__device__ int initialize(const int sizeX, const int sizeY, int firstX,
int firstY) {
// initialize all pointers and stride
end = CHECKED ? (sizeY * sizeX + firstX) : 0;
stride = sizeX;
return firstX + sizeX * firstY;
}
};
/// Writes reverse transformed pixels directly into output image.
/// @tparam T type of output pixels
/// @tparam CHECKED true = be prepared to image boundary, false = don't care
template <typename T, bool CHECKED>
class VerticalDWTPixelWriter : VerticalDWTPixelIO<T, CHECKED> {
private:
int next; // index of the next pixel to be written
public:
/// Initializes writer - sets output buffer and a position of first pixel.
/// @param sizeX width of the image
/// @param sizeY height of the image
/// @param firstX x-coordinate of first pixel to write into
/// @param firstY y-coordinate of first pixel to write into
__device__ void init(const int sizeX, const int sizeY, int firstX,
int firstY) {
if (firstX < sizeX) {
next = this->initialize(sizeX, sizeY, firstX, firstY);
} else {
this->end = 0;
this->stride = 0;
next = 0;
}
}
/// Writes given value at next position and advances internal pointer while
/// correctly handling mirroring.
/// @param output output image to write pixel into
/// @param value value of the pixel to be written
__device__ void writeInto(T *const output, const T &value) {
if ((!CHECKED) || (next != this->end)) {
output[next] = value;
next += this->stride;
}
}
};
/// Loads pixels from input image.
/// @tparam T type of image input pixels
/// @tparam CHECKED true = be prepared to image boundary, false = don't care
template <typename T, bool CHECKED>
class VerticalDWTPixelLoader : protected VerticalDWTPixelIO<const T, CHECKED> {
private:
int last; ///< index of last loaded pixel
public:
//******************* FOR TEST **********************
__device__ int getlast() { return last; }
__device__ int getend() { return this->end; }
__device__ int getstride() { return this->stride; }
__device__ void setend(int a) { this->end = a; }
//******************* FOR TEST **********************
/// Initializes loader - sets input size and a position of first pixel.
/// @param sizeX width of the image
/// @param sizeY height of the image
/// @param firstX x-coordinate of first pixel to load
/// @param firstY y-coordinate of first pixel to load
__device__ void init(const int sizeX, const int sizeY, int firstX,
int firstY) {
// correctly mirror x coordinate
this->mirror(firstX, sizeX);
// 'last' always points to already loaded pixel (subtract sizeX = stride)
last = this->initialize(sizeX, sizeY, firstX, firstY) - sizeX;
// last = (FirstX + sizeX * FirstY) - sizeX
}
/// Sets all fields to zeros, for compiler not to complain about
/// uninitialized stuff.
__device__ void clear() {
this->end = 0;
this->stride = 0;
this->last = 0;
}
/// Gets another pixel and advances internal pointer to following one.
/// @param input input image to load next pixel from
/// @return next pixel from given image
__device__ T loadFrom(const T *const input) {
last += this->stride;
if (CHECKED && (last == this->end)) {
last -= 2 * this->stride;
this->stride = -this->stride; // reverse loader's direction
}
// avoid reading from negative indices if loader is checked
// return (CHECKED && (last < 0)) ? 0 : input[last]; // TODO: use this
// checked variant later
if (last < 0) {
return 0;
}
return input[last];
// return this->end;
// return last;
// return this->stride;
}
};
/// Base for band writer and loader. Manages computing strides and pointers
/// to the first and last pixels of linearly stored bands.
/// @tparam T type of band coefficients
/// @tparam CHECKED true = be prepared to image boundary, false = don't care
template <typename T, bool CHECKED> class VerticalDWTBandIO : protected DWTIO {
protected:
/// index of bottom neighbor of last pixel of loaded column
int end;
/// increment of index to get from highpass band to the lowpass one
int strideHighToLow;
/// increment of index to get from the lowpass band to the highpass one
int strideLowToHigh;
/// Initializes IO - sets size of image and a position of first pixel.
/// @param imageSizeX width of the image
/// @param imageSizeY height of the image
/// @param firstX x-coordinate of first pixel to use
/// (Parity determines vertically low or high band.)
/// @param firstY y-coordinate of first pixel to use
/// (Parity determines horizontally low or high band.)
/// @return index of first item specified by firstX and firstY
__device__ int initialize(const int imageSizeX, const int imageSizeY,
int firstX, int firstY) {
// index of first pixel (topmost one) of the column with index firstX
int columnOffset = firstX / 2;
// difference between indices of two vertically neighboring pixels
// in the same band
int verticalStride;
// resolve index of first pixel according to horizontal parity
if (firstX & 1) {
// first pixel in one of right bands
verticalStride = imageSizeX / 2;
columnOffset += divRndUp(imageSizeX, 2) * divRndUp(imageSizeY, 2);
strideLowToHigh = (imageSizeX * imageSizeY) / 2;
} else {
// first pixel in one of left bands
verticalStride = imageSizeX / 2 + (imageSizeX & 1);
strideLowToHigh = divRndUp(imageSizeY, 2) * imageSizeX;
}
// set the other stride
strideHighToLow = verticalStride - strideLowToHigh;
// compute index of coefficient which indicates end of image
if (CHECKED) {
end = columnOffset // right column
+ (imageSizeY / 2) * verticalStride // right row
+ (imageSizeY & 1) * strideLowToHigh; // possibly in high band
} else {
end = 0;
}
//***********for test**************
// end = CHECKED;
//***********for test**************
// finally, return index of the first item
return columnOffset // right column
+ (firstY / 2) * verticalStride // right row
+ (firstY & 1) * strideLowToHigh; // possibly in high band
}
};
/// Directly loads coefficients from four consecutively stored transformed
/// bands.
/// @tparam T type of input band coefficients
/// @tparam CHECKED true = be prepared to image boundary, false = don't care
template <typename T, bool CHECKED>
class VerticalDWTBandLoader : public VerticalDWTBandIO<const T, CHECKED> {
private:
int last; ///< index of last loaded pixel
/// Checks internal index and possibly reverses direction of loader.
/// (Handles mirroring at the bottom of the image.)
/// @param input input image to load next coefficient from
/// @param stride stride to use now (one of two loader's strides)
/// @return loaded coefficient
__device__ T updateAndLoad(const T *const input, const int &stride) {
last += stride;
if (CHECKED && (last == this->end)) {
// undo last two updates of index (to get to previous mirrored item)
last -= (this->strideLowToHigh + this->strideHighToLow);
// swap and reverse strides (to move up in the loaded column now)
const int temp = this->strideLowToHigh;
this->strideLowToHigh = -this->strideHighToLow;
this->strideHighToLow = -temp;
}
if (last < 0) {
return 0;
}
// avoid reading from negative indices if loader is checked
// return (CHECKED && (last < 0)) ? 0 : input[last]; // TODO: use this
// checked variant later
return input[last];
}
public:
/// Initializes loader - sets input size and a position of first pixel.
/// @param imageSizeX width of the image
/// @param imageSizeY height of the image
/// @param firstX x-coordinate of first pixel to load
/// (Parity determines vertically low or high band.)
/// @param firstY y-coordinate of first pixel to load
/// (Parity determines horizontally low or high band.)
__device__ void init(const int imageSizeX, const int imageSizeY, int firstX,
const int firstY) {
this->mirror(firstX, imageSizeX);
last = this->initialize(imageSizeX, imageSizeY, firstX, firstY);
// adjust to point to previous item
last -= (firstY & 1) ? this->strideLowToHigh : this->strideHighToLow;
}
/// Sets all fields to zeros, for compiler not to complain about
/// uninitialized stuff.
__device__ void clear() {
this->end = 0;
this->strideHighToLow = 0;
this->strideLowToHigh = 0;
this->last = 0;
}
/// Gets another coefficient from lowpass band and advances internal index.
/// Call this method first if the position of the first pixel passed to init
/// was in the lowpass band.
/// @param input input image to load next coefficient from
/// @return next coefficient from the lowpass band of the given image
__device__ T loadLowFrom(const T *const input) {
return updateAndLoad(input, this->strideHighToLow);
}
/// Gets another coefficient from the highpass band and advances index.
/// Call this method first if position of first pixel passed to init
/// was in high band.
/// @param input input image to load next coefficient from
/// @return next coefficient from the highpass band of the given image
__device__ T loadHighFrom(const T *const input) {
return updateAndLoad(input, this->strideLowToHigh);
}
};
/// Directly saves coefficients into four transformed bands.
/// @tparam T type of output band coefficients
/// @tparam CHECKED true = be prepared to image boundary, false = don't care
template <typename T, bool CHECKED>
class VerticalDWTBandWriter : public VerticalDWTBandIO<T, CHECKED> {
private:
int next; ///< index of the next coefficient to be written
/// Checks internal index and possibly stops the writer.
/// (Handles mirroring at edges of the image.)
/// @param output output buffer
/// @param item item to put into the output
/// @param stride increment of the pointer to get to next output index
__device__ int saveAndUpdate(T *const output, const T &item,
const int &stride) {
// write the item unless the writer has already run past the end of the image
if ((!CHECKED) || (next != this->end)) {
output[next] = item;
next += stride;
}
return next;
}
public:
/// Initializes writer - sets output size and a position of first pixel.
/// @param output output image
/// @param imageSizeX width of the image
/// @param imageSizeY height of the image
/// @param firstX x-coordinate of first pixel to write
/// (Parity determines vertically low or high band.)
/// @param firstY y-coordinate of first pixel to write
/// (Parity determines horizontally low or high band.)
__device__ void init(const int imageSizeX, const int imageSizeY,
const int firstX, const int firstY) {
if (firstX < imageSizeX) {
next = this->initialize(imageSizeX, imageSizeY, firstX, firstY);
} else {
clear();
}
}
/// Sets all fields to zeros, for compiler not to complain about
/// uninitialized stuff.
__device__ void clear() {
this->end = 0;
this->strideHighToLow = 0;
this->strideLowToHigh = 0;
this->next = 0;
}
/// Writes another coefficient into the band which was specified using
/// init's firstX and firstY parameters and advances internal pointer.
/// Call this method first if position of first pixel passed to init
/// was in lowpass band.
/// @param output output image
/// @param low lowpass coefficient to save into the lowpass band
__device__ int writeLowInto(T *const output, const T &primary) {
return saveAndUpdate(output, primary, this->strideLowToHigh);
}
/// Writes another coefficient from the other band and advances pointer.
/// Call this method first if position of first pixel passed to init
/// was in highpass band.
/// @param output output image
/// @param high highpass coefficient to save into the highpass band
__device__ int writeHighInto(T *const output, const T &other) {
return saveAndUpdate(output, other, this->strideHighToLow);
}
//*******Accessors added to expose private values (for testing)*******
__device__ int getnext() { return next; }
__device__ int getend() { return this->end; }
__device__ int getstrideHighToLow() { return this->strideHighToLow; }
__device__ int getstrideLowToHigh() { return this->strideLowToHigh; }
//*******Accessors added to expose private values (for testing)*******
};
} // namespace dwt_cuda
#endif // IO_H

View File

@@ -1,360 +0,0 @@
///
/// @file rdwt53.cu
/// @brief CUDA implementation of reverse 5/3 2D DWT.
/// @author Martin Jirman (207962@mail.muni.cz)
/// @date 2011-02-04 14:19
///
///
/// Copyright (c) 2011 Martin Jirman
/// All rights reserved.
///
/// Redistribution and use in source and binary forms, with or without
/// modification, are permitted provided that the following conditions are met:
///
/// * Redistributions of source code must retain the above copyright
/// notice, this list of conditions and the following disclaimer.
/// * Redistributions in binary form must reproduce the above copyright
/// notice, this list of conditions and the following disclaimer in the
/// documentation and/or other materials provided with the distribution.
///
/// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
/// AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
/// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
/// ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE
/// LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
/// CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
/// SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
/// INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
/// CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
/// ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
/// POSSIBILITY OF SUCH DAMAGE.
///
#include "common.h"
#include "transform_buffer.h"
#include "io.h"
namespace dwt_cuda {
/// Wraps shared memory buffer and algorithms needed for computing 5/3 RDWT
/// using sliding window and lifting schema.
/// @tparam WIN_SIZE_X width of sliding window
/// @tparam WIN_SIZE_Y height of sliding window
template <int WIN_SIZE_X, int WIN_SIZE_Y>
class RDWT53 {
private:
/// Shared memory buffer used for 5/3 DWT transforms.
typedef TransformBuffer<int, WIN_SIZE_X, WIN_SIZE_Y + 3, 2> RDWT53Buffer;
/// Shared buffer used for reverse 5/3 DWT.
RDWT53Buffer buffer;
/// Difference between indices of two vertically neighboring items in buffer.
enum { STRIDE = RDWT53Buffer::VERTICAL_STRIDE };
/// Info needed for loading of one input column from input image.
/// @tparam CHECKED true if loader should check boundaries
template <bool CHECKED>
struct RDWT53Column {
/// loader of pixels from column in input image
VerticalDWTBandLoader<int, CHECKED> loader;
/// Offset of corresponding column in shared buffer.
int offset;
/// Sets all fields to some values to avoid 'uninitialized' warnings.
__device__ void clear() {
offset = 0;
loader.clear();
}
};
/// 5/3 DWT reverse update operation.
struct Reverse53Update {
__device__ void operator() (const int p, int & c, const int n) const {
c -= (p + n + 2) / 4; // F.3, page 118, ITU-T Rec. T.800 final draft
}
};
/// 5/3 DWT reverse predict operation.
struct Reverse53Predict {
__device__ void operator() (const int p, int & c, const int n) const {
c += (p + n) / 2; // F.4, page 118, ITU-T Rec. T.800 final draft
}
};
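// Illustrative note (not from the original source): these steps undo the
// forward 5/3 lifting. For lowpass [10, 33] and highpass [0, 10] (the result
// of forward transforming the row 10 20 30 40):
//   update:   x0 = 10 - (0+0+2)/4 = 10,   x2 = 33 - (0+10+2)/4 = 30
//   predict:  x1 = 0  + (10+30)/2 = 20,   x3 = 10 + (30+30)/2  = 40
// which recovers the original row exactly.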
/// Horizontal 5/3 RDWT on specified lines of transform buffer.
/// @param lines number of lines to be transformed
/// @param firstLine index of the first line to be transformed
__device__ void horizontalTransform(const int lines, const int firstLine) {
__syncthreads();
buffer.forEachHorizontalEven(firstLine, lines, Reverse53Update());
__syncthreads();
buffer.forEachHorizontalOdd(firstLine, lines, Reverse53Predict());
__syncthreads();
}
/// Using given loader, it loads another WIN_SIZE_Y coefficients
/// into specified column.
/// @tparam CHECKED true if loader should check image boundaries
/// @param input input coefficients to load from
/// @param col info about loaded column
template <bool CHECKED>
inline __device__ void loadWindowIntoColumn(const int * const input,
RDWT53Column<CHECKED> & col) {
for(int i = 3; i < (3 + WIN_SIZE_Y); i += 2) {
buffer[col.offset + i * STRIDE] = col.loader.loadLowFrom(input);
buffer[col.offset + (i + 1) * STRIDE] = col.loader.loadHighFrom(input);
}
}
/// Initializes one column of shared transform buffer with 3 input pixels.
/// Those 3 pixels will not be transformed. Also initializes given loader.
/// @tparam CHECKED true if loader should check image boundaries
/// @param columnX x coordinate of column in shared transform buffer
/// @param input input image
/// @param sizeX width of the input image
/// @param sizeY height of the input image
/// @param column (uninitialized) info about loaded column
/// @param firstY index of first image row to be transformed
template <bool CHECKED>
__device__ void initColumn(const int columnX, const int * const input,
const int sizeX, const int sizeY,
RDWT53Column<CHECKED> & column,
const int firstY) {
// coordinates of the first coefficient to be loaded
const int firstX = blockIdx.x * WIN_SIZE_X + columnX;
// offset of the column with index 'colIndex' in the transform buffer
column.offset = buffer.getColumnOffset(columnX);
if(blockIdx.y == 0) {
// topmost block - apply mirroring rules when loading first 3 rows
column.loader.init(sizeX, sizeY, firstX, firstY);
// load pixels in mirrored way
buffer[column.offset + 1 * STRIDE] = column.loader.loadLowFrom(input);
buffer[column.offset + 0 * STRIDE] =
buffer[column.offset + 2 * STRIDE] = column.loader.loadHighFrom(input);
} else {
// non-topmost row - regular loading:
column.loader.init(sizeX, sizeY, firstX, firstY - 1);
buffer[column.offset + 0 * STRIDE] = column.loader.loadHighFrom(input);
buffer[column.offset + 1 * STRIDE] = column.loader.loadLowFrom(input);
buffer[column.offset + 2 * STRIDE] = column.loader.loadHighFrom(input);
}
// Now, the next coefficient, which will be loaded by loader, is #2.
}
/// Actual GPU 5/3 RDWT implementation.
/// @tparam CHECKED_LOADS true if boundaries must be checked when reading
/// @tparam CHECKED_WRITES true if boundaries must be checked when writing
/// @param in input image (5/3 transformed coefficients)
/// @param out output buffer (for reverse transformed image)
/// @param sizeX width of the output image
/// @param sizeY height of the output image
/// @param winSteps number of sliding window steps
template<bool CHECKED_LOADS, bool CHECKED_WRITES>
__device__ void transform(const int * const in, int * const out,
const int sizeX, const int sizeY,
const int winSteps) {
// info about one main and one boundary column
RDWT53Column<CHECKED_LOADS> column, boundaryColumn;
// index of first row to be transformed
const int firstY = blockIdx.y * WIN_SIZE_Y * winSteps;
// some threads initialize boundary columns
boundaryColumn.clear();
if(threadIdx.x < 3) {
// First 3 threads also handle boundary columns. Thread #0 gets right
// column #0, thread #1 gets right column #1, and thread #2 gets the left column.
const int colId = threadIdx.x + ((threadIdx.x != 2) ? WIN_SIZE_X : -3);
// Thread initializes offset of the boundary column (in shared
// buffer), first 3 pixels of the column and a loader for this column.
initColumn(colId, in, sizeX, sizeY, boundaryColumn, firstY);
}
// All threads initialize central columns.
initColumn(parityIdx<WIN_SIZE_X>(), in, sizeX, sizeY, column, firstY);
// horizontally transform first 3 rows
horizontalTransform(3, 0);
// writer of output pixels - initialize it
const int outX = blockIdx.x * WIN_SIZE_X + threadIdx.x;
VerticalDWTPixelWriter<int, CHECKED_WRITES> writer;
writer.init(sizeX, sizeY, outX, firstY);
// offset of column (in transform buffer) saved by this thread
const int outputColumnOffset = buffer.getColumnOffset(threadIdx.x);
// (Each iteration assumes that first 3 rows of transform buffer are
// already loaded with horizontally transformed pixels.)
for(int w = 0; w < winSteps; w++) {
// Load another WIN_SIZE_Y lines of this thread's column
// into the transform buffer.
loadWindowIntoColumn(in, column);
// possibly load boundary columns
if(threadIdx.x < 3) {
loadWindowIntoColumn(in, boundaryColumn);
}
// horizontally transform all newly loaded lines
horizontalTransform(WIN_SIZE_Y, 3);
// Using 3 registers, remember current values of last 3 rows
// of transform buffer. These rows are transformed horizontally
// only and will be used in next iteration.
int last3Lines[3];
last3Lines[0] = buffer[outputColumnOffset + (WIN_SIZE_Y + 0) * STRIDE];
last3Lines[1] = buffer[outputColumnOffset + (WIN_SIZE_Y + 1) * STRIDE];
last3Lines[2] = buffer[outputColumnOffset + (WIN_SIZE_Y + 2) * STRIDE];
// vertically transform all central columns
buffer.forEachVerticalOdd(outputColumnOffset, Reverse53Update());
buffer.forEachVerticalEven(outputColumnOffset, Reverse53Predict());
// Save all results of current window. Results are in transform buffer
// at rows from #1 to #(1 + WIN_SIZE_Y). Other rows are invalid now.
// (They only served as a boundary for vertical RDWT.)
for(int i = 1; i < (1 + WIN_SIZE_Y); i++) {
writer.writeInto(out, buffer[outputColumnOffset + i * STRIDE]);
}
// Use last 3 remembered lines as first 3 lines for next iteration.
// As expected, these lines are already horizontally transformed.
buffer[outputColumnOffset + 0 * STRIDE] = last3Lines[0];
buffer[outputColumnOffset + 1 * STRIDE] = last3Lines[1];
buffer[outputColumnOffset + 2 * STRIDE] = last3Lines[2];
// Wait for all writing threads before proceeding to loading new
// coefficients in next iteration. (Not to overwrite those which
// are not written yet.)
__syncthreads();
}
}
public:
/// Main GPU 5/3 RDWT entry point.
/// @param input input image (5/3 transformed coefficients)
/// @param output output buffer (for reverse transformed image)
/// @param sx width of the output image
/// @param sy height of the output image
/// @param steps number of sliding window steps
__device__ static void run(const int * const input, int * const output,
const int sx, const int sy, const int steps) {
// prepare instance with buffer in shared memory
__shared__ RDWT53<WIN_SIZE_X, WIN_SIZE_Y> rdwt53;
// Compute limits of this threadblock's block of pixels and use them to
// determine, whether this threadblock will have to deal with boundary.
// (1 in next expressions is for radius of impulse response of 5/3 RDWT.)
const int maxX = (blockIdx.x + 1) * WIN_SIZE_X + 1;
const int maxY = (blockIdx.y + 1) * WIN_SIZE_Y * steps + 1;
const bool atRightBoudary = maxX >= sx;
const bool atBottomBoudary = maxY >= sy;
// Select specialized version of code according to distance of this
// threadblock's pixels from image boundary.
if(atBottomBoudary) {
// near bottom boundary => check both writing and reading
rdwt53.transform<true, true>(input, output, sx, sy, steps);
} else if(atRightBoudary) {
// near right boundary only => check writing only
rdwt53.transform<false, true>(input, output, sx, sy, steps);
} else {
// no nearby boundary => check nothing
rdwt53.transform<false, false>(input, output, sx, sy, steps);
}
}
}; // end of class RDWT53
/// Main GPU 5/3 RDWT entry point.
/// @param in input image (5/3 transformed coefficients)
/// @param out output buffer (for reverse transformed image)
/// @param sx width of the output image
/// @param sy height of the output image
/// @param steps number of sliding window steps
template <int WIN_SX, int WIN_SY>
__launch_bounds__(WIN_SX, CTMIN(SHM_SIZE/sizeof(RDWT53<WIN_SX, WIN_SY>), 8))
__global__ void rdwt53Kernel(const int * const in, int * const out,
const int sx, const int sy, const int steps) {
RDWT53<WIN_SX, WIN_SY>::run(in, out, sx, sy, steps);
}
/// Computes the optimal number of sliding window steps and the number of
/// threadblocks, and then launches the 5/3 RDWT kernel.
/// @tparam WIN_SX width of sliding window
/// @tparam WIN_SY height of sliding window
/// @param in input image
/// @param out output buffer
/// @param sx width of the input image
/// @param sy height of the input image
template <int WIN_SX, int WIN_SY>
void launchRDWT53Kernel (int * in, int * out, const int sx, const int sy) {
// compute optimal number of steps of each sliding window
const int steps = divRndUp(sy, 15 * WIN_SY);
// prepare grid size
dim3 gSize(divRndUp(sx, WIN_SX), divRndUp(sy, WIN_SY * steps));
// finally transform this level
PERF_BEGIN
rdwt53Kernel<WIN_SX, WIN_SY><<<gSize, WIN_SX>>>(in, out, sx, sy, steps);
PERF_END(" RDWT53", sx, sy)
CudaDWTTester::checkLastKernelCall("RDWT 5/3 kernel");
}
/// Reverse 5/3 2D DWT. See common rules (above) for more details.
/// @param in Input DWT coefficients. Format described in common rules.
/// Will not be preserved (will be overwritten).
/// @param out output buffer on GPU - will contain original image
/// in normalized range [-128, 127].
/// @param sizeX width of input image (in pixels)
/// @param sizeY height of input image (in pixels)
/// @param levels number of recursive DWT levels
void rdwt53(int * in, int * out, int sizeX, int sizeY, int levels) {
if(levels > 1) {
// let this function recursively reverse transform deeper levels first
const int llSizeX = divRndUp(sizeX, 2);
const int llSizeY = divRndUp(sizeY, 2);
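// (Illustrative note: e.g. for levels == 3 and a 1920x1080 input, the
//  480x270 LL band is reconstructed first, then 960x540, and finally the
//  full-resolution image by the kernel launched below.)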
rdwt53(in, out, llSizeX, llSizeY, levels - 1);
// copy reverse transformed LL band from output back into the input
memCopy(in, out, llSizeX, llSizeY);
}
// select right width of kernel for the size of the image
if(sizeX >= 960) {
launchRDWT53Kernel<192, 8>(in, out, sizeX, sizeY);
} else if (sizeX >= 480) {
launchRDWT53Kernel<128, 8>(in, out, sizeX, sizeY);
} else {
launchRDWT53Kernel<64, 8>(in, out, sizeX, sizeY);
}
}
} // end of namespace dwt_cuda

View File

@@ -1,363 +0,0 @@
///
/// @file rdwt97.cu
/// @brief CUDA implementation of reverse 9/7 2D DWT.
/// @author Martin Jirman (207962@mail.muni.cz)
/// @date 2011-02-03 21:59
///
///
/// Copyright (c) 2011 Martin Jirman
/// All rights reserved.
///
/// Redistribution and use in source and binary forms, with or without
/// modification, are permitted provided that the following conditions are met:
///
/// * Redistributions of source code must retain the above copyright
/// notice, this list of conditions and the following disclaimer.
/// * Redistributions in binary form must reproduce the above copyright
/// notice, this list of conditions and the following disclaimer in the
/// documentation and/or other materials provided with the distribution.
///
/// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
/// AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
/// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
/// ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE
/// LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
/// CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
/// SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
/// INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
/// CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
/// ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
/// POSSIBILITY OF SUCH DAMAGE.
///
#include "common.h"
#include "transform_buffer.h"
#include "io.h"
namespace dwt_cuda {
/// Wraps shared memory buffer and methods for computing 9/7 RDWT using
/// lifting schema and sliding window.
/// @tparam WIN_SIZE_X width of the sliding window
/// @tparam WIN_SIZE_Y height of the sliding window
template <int WIN_SIZE_X, int WIN_SIZE_Y>
class RDWT97 {
private:
/// Info related to loading of one input column.
/// @tparam CHECKED true if boundary should be checked,
/// false if there is no near boundary
template <bool CHECKED>
struct RDWT97Column {
/// Loader of input pixels for the given column.
VerticalDWTBandLoader<float, CHECKED> loader;
/// Offset of loaded column in shared memory buffer.
int offset;
/// Sets all fields to some values to avoid 'uninitialized' warnings.
__device__ void clear() {
loader.clear();
offset = 0;
}
};
/// Shared memory buffer used for 9/7 DWT transforms.
typedef TransformBuffer<float, WIN_SIZE_X, WIN_SIZE_Y + 7, 4> RDWT97Buffer;
/// Shared buffer used for reverse 9/7 DWT.
RDWT97Buffer buffer;
/// Difference between indices of two vertical neighbors in buffer.
enum { STRIDE = RDWT97Buffer::VERTICAL_STRIDE };
/// Horizontal 9/7 RDWT on specified lines of transform buffer.
/// @param lines number of lines to be transformed
/// @param firstLine index of the first line to be transformed
__device__ void horizontalRDWT97(int lines, int firstLine) {
__syncthreads();
buffer.scaleHorizontal(scale97Mul, scale97Div, firstLine, lines);
__syncthreads();
buffer.forEachHorizontalEven(firstLine, lines, AddScaledSum(r97update2));
__syncthreads();
buffer.forEachHorizontalOdd(firstLine, lines, AddScaledSum(r97predict2));
__syncthreads();
buffer.forEachHorizontalEven(firstLine, lines, AddScaledSum(r97update1));
__syncthreads();
buffer.forEachHorizontalOdd(firstLine, lines, AddScaledSum(r97Predict1));
__syncthreads();
}
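// --- editor's sketch (not part of the original file) ---
// Each forEachHorizontal* call above passes an AddScaledSum functor a triple
// (previous, center, next). It is assumed to perform one lifting step of the
// 9/7 scheme, i.e. center += scale * (previous + next); a minimal equivalent:
struct AddScaledSumSketch {
  const float scale; // lifting constant, e.g. r97update2
  __device__ AddScaledSumSketch(const float s) : scale(s) {}
  __device__ void operator()(const float prev, float &center,
                             const float next) const {
    center += scale * (prev + next); // in-place lifting update
  }
};
// --- end of sketch ---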
/// Initializes one column of shared transform buffer with 7 input pixels.
/// Those 7 pixels will not be transformed. Also initializes given loader.
/// @tparam CHECKED true if there are near image boundaries
/// @param colIndex index of column in shared transform buffer
/// @param input input image
/// @param sizeX width of the input image
/// @param sizeY height of the input image
/// @param column (uninitialized) info about loading one column
/// @param firstY index of first image row to be transformed
template <bool CHECKED>
__device__ void initColumn(const int colIndex, const float * const input,
const int sizeX, const int sizeY,
RDWT97Column<CHECKED> & column,
const int firstY) {
// coordinates of the first coefficient to be loaded
const int firstX = blockIdx.x * WIN_SIZE_X + colIndex;
// offset of the column with index 'colIndex' in the transform buffer
column.offset = buffer.getColumnOffset(colIndex);
if(blockIdx.y == 0) {
// topmost block - apply mirroring rules when loading first 7 rows
column.loader.init(sizeX, sizeY, firstX, firstY);
// load pixels in mirrored way
buffer[column.offset + 3 * STRIDE] = column.loader.loadLowFrom(input);
buffer[column.offset + 4 * STRIDE] =
buffer[column.offset + 2 * STRIDE] = column.loader.loadHighFrom(input);
buffer[column.offset + 5 * STRIDE] =
buffer[column.offset + 1 * STRIDE] = column.loader.loadLowFrom(input);
buffer[column.offset + 6 * STRIDE] =
buffer[column.offset + 0 * STRIDE] = column.loader.loadHighFrom(input);
} else {
// non-topmost block - regular loading:
column.loader.init(sizeX, sizeY, firstX, firstY - 3);
buffer[column.offset + 0 * STRIDE] = column.loader.loadHighFrom(input);
buffer[column.offset + 1 * STRIDE] = column.loader.loadLowFrom(input);
buffer[column.offset + 2 * STRIDE] = column.loader.loadHighFrom(input);
buffer[column.offset + 3 * STRIDE] = column.loader.loadLowFrom(input);
buffer[column.offset + 4 * STRIDE] = column.loader.loadHighFrom(input);
buffer[column.offset + 5 * STRIDE] = column.loader.loadLowFrom(input);
buffer[column.offset + 6 * STRIDE] = column.loader.loadHighFrom(input);
}
// Now, the next coefficient, which will be loaded by loader, is #4.
}
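// Editor's note (not part of the original file): for the topmost block the
// branch above fills buffer rows 0..6 with image rows 3,2,1,0,1,2,3, i.e. the
// input is mirrored around row 0 so that the 9/7 filter has valid neighbours
// above the first image row.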
/// Using given loader, it loads another WIN_SIZE_Y coefficients
/// into specified column.
/// @tparam CHECKED true if there are near image boundaries
/// @param col info about loaded column
/// @param input buffer with input coefficients
template <bool CHECKED>
inline __device__ void loadWindowIntoColumn(RDWT97Column<CHECKED> & col,
const float * const input) {
for(int i = 7; i < (7 + WIN_SIZE_Y); i += 2) {
buffer[col.offset + i * STRIDE] = col.loader.loadLowFrom(input);
buffer[col.offset + (i + 1) * STRIDE] = col.loader.loadHighFrom(input);
}
}
/// Actual GPU 9/7 RDWT sliding window lifting schema implementation.
/// @tparam CHECKED_LOADS true if loader should check boundaries
/// @tparam CHECKED_WRITES true if boundaries should be taken into account
/// when writing into output buffer
/// @param in input image (9/7 transformed coefficients)
/// @param out output buffer (for reverse transformed image)
/// @param sizeX width of the output image
/// @param sizeY height of the output image
/// @param winSteps number of steps of sliding window
template <bool CHECKED_LOADS, bool CHECKED_WRITES>
__device__ void transform(const float * const in, float * const out,
const int sizeX, const int sizeY,
const int winSteps) {
// info about one main column and one boundary column
RDWT97Column<CHECKED_LOADS> column;
RDWT97Column<CHECKED_LOADS> boundaryColumn;
// index of first image row to be transformed
const int firstY = blockIdx.y * WIN_SIZE_Y * winSteps;
// initialize boundary columns
boundaryColumn.clear();
if(threadIdx.x < 7) {
// each thread among first 7 ones gets index of one of boundary columns
const int colId = threadIdx.x + ((threadIdx.x < 4) ? WIN_SIZE_X : -7);
// Thread initializes offset of the boundary column (in shared
// buffer), first 7 pixels of the column and a loader for this column.
initColumn(colId, in, sizeX, sizeY, boundaryColumn, firstY);
}
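// Editor's note (not part of the original file): with the mapping above,
// threads 0..3 take the four right-boundary columns (WIN_SIZE_X .. WIN_SIZE_X+3)
// and threads 4..6 take the three left-boundary columns (-3 .. -1).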
// All threads initialize central columns.
initColumn(parityIdx<WIN_SIZE_X>(), in, sizeX, sizeY, column, firstY);
// horizontally transform first 7 rows
horizontalRDWT97(7, 0);
// writer of output pixels - initialize it
const int outputX = blockIdx.x * WIN_SIZE_X + threadIdx.x;
VerticalDWTPixelWriter<float, CHECKED_WRITES> writer;
writer.init(sizeX, sizeY, outputX, firstY);
// offset of column (in transform buffer) saved by this thread
const int outColumnOffset = buffer.getColumnOffset(threadIdx.x);
// (Each iteration assumes that first 7 rows of transform buffer are
// already loaded with horizontally transformed pixels.)
for(int w = 0; w < winSteps; w++) {
// Load another WIN_SIZE_Y lines of this thread's column
// into the transform buffer.
loadWindowIntoColumn(column, in);
// possibly load boundary columns
if(threadIdx.x < 7) {
loadWindowIntoColumn(boundaryColumn, in);
}
// horizontally transform all newly loaded lines
horizontalRDWT97(WIN_SIZE_Y, 7);
// Using 7 registers, remember current values of last 7 rows
// of transform buffer. These rows are transformed horizontally
// only and will be used in next iteration.
float last7Lines[7];
for(int i = 0; i < 7; i++) {
last7Lines[i] = buffer[outColumnOffset + (WIN_SIZE_Y + i) * STRIDE];
}
// vertically transform all central columns
buffer.scaleVertical(scale97Div, scale97Mul, outColumnOffset,
WIN_SIZE_Y + 7, 0);
buffer.forEachVerticalOdd(outColumnOffset, AddScaledSum(r97update2));
buffer.forEachVerticalEven(outColumnOffset, AddScaledSum(r97predict2));
buffer.forEachVerticalOdd(outColumnOffset, AddScaledSum(r97update1));
buffer.forEachVerticalEven(outColumnOffset, AddScaledSum(r97Predict1));
// Save all results of current window. Results are in transform buffer
// at rows from #3 to #(3 + WIN_SIZE_Y). Other rows are invalid now.
// (They only served as a boundary for vertical RDWT.)
for(int i = 3; i < (3 + WIN_SIZE_Y); i++) {
writer.writeInto(out, buffer[outColumnOffset + i * STRIDE]);
}
// Use last 7 remembered lines as first 7 lines for next iteration.
// As expected, these lines are already horizontally transformed.
for(int i = 0; i < 7; i++) {
buffer[outColumnOffset + i * STRIDE] = last7Lines[i];
}
// Wait for all writing threads before loading new coefficients in the
// next iteration (so that values not yet written out are not overwritten).
__syncthreads();
}
}
public:
/// Main GPU 9/7 RDWT entry point.
/// @param in input image (9/7 transformed coefficients)
/// @param out output buffer (for reverse transformed image)
/// @param sizeX width of the output image
/// @param sizeY height of the output image
__device__ static void run(const float * const input, float * const output,
const int sx, const int sy, const int steps) {
// prepare instance with buffer in shared memory
__shared__ RDWT97<WIN_SIZE_X, WIN_SIZE_Y> rdwt97;
// Compute limits of this threadblock's block of pixels and use them to
// determine whether this threadblock will have to deal with boundary.
// (3 in next expressions is for radius of impulse response of 9/7 RDWT.)
const int maxX = (blockIdx.x + 1) * WIN_SIZE_X + 3;
const int maxY = (blockIdx.y + 1) * WIN_SIZE_Y * steps + 3;
const bool atRightBoundary = maxX >= sx;
const bool atBottomBoundary = maxY >= sy;
// Select specialized version of code according to distance of this
// threadblock's pixels from image boundary.
if(atBottomBoundary) {
// near bottom boundary => check both writing and reading
rdwt97.transform<true, true>(input, output, sx, sy, steps);
} else if(atRightBoundary) {
// near right boundary only => check writing only
rdwt97.transform<false, true>(input, output, sx, sy, steps);
} else {
// no nearby boundary => check nothing
rdwt97.transform<false, false>(input, output, sx, sy, steps);
}
}
}; // end of class RDWT97
/// Main GPU 9/7 RDWT entry point.
/// @param in input image (9/7 transformed coefficients)
/// @param out output buffer (for reverse transformed image)
/// @param sizeX width of the output image
/// @param sizeY height of the output image
template <int WIN_SX, int WIN_SY>
__launch_bounds__(WIN_SX, CTMIN(SHM_SIZE/sizeof(RDWT97<WIN_SX, WIN_SY>), 8))
__global__ void rdwt97Kernel(const float * const in, float * const out,
const int sx, const int sy, const int steps) {
RDWT97<WIN_SX, WIN_SY>::run(in, out, sx, sy, steps);
}
/// Computes the optimal number of sliding window steps and the number of
/// threadblocks, then launches the 9/7 RDWT kernel.
/// @tparam WIN_SX width of sliding window
/// @tparam WIN_SY height of sliding window
/// @param in input image
/// @param out output buffer
/// @param sx width of the input image
/// @param sy height of the input image
template <int WIN_SX, int WIN_SY>
void launchRDWT97Kernel (float * in, float * out, int sx, int sy) {
// compute optimal number of steps of each sliding window
const int steps = divRndUp(sy, 15 * WIN_SY);
// prepare grid size
dim3 gSize(divRndUp(sx, WIN_SX), divRndUp(sy, WIN_SY * steps));
// finally launch kernel
PERF_BEGIN
rdwt97Kernel<WIN_SX, WIN_SY><<<gSize, WIN_SX>>>(in, out, sx, sy, steps);
PERF_END(" RDWT97", sx, sy)
CudaDWTTester::checkLastKernelCall("RDWT 9/7 kernel");
}
/// Reverse 9/7 2D DWT. See common rules (dwt.h) for more details.
/// @param in Input DWT coefficients. Format described in common rules.
/// Will not be preserved (will be overwritten).
/// @param out output buffer on GPU - will contain original image
/// in normalized range [-0.5, 0.5].
/// @param sizeX width of input image (in pixels)
/// @param sizeY height of input image (in pixels)
/// @param levels number of recursive DWT levels
void rdwt97(float * in, float * out, int sizeX, int sizeY, int levels) {
if(levels > 1) {
// let this function recursively reverse transform deeper levels first
const int llSizeX = divRndUp(sizeX, 2);
const int llSizeY = divRndUp(sizeY, 2);
rdwt97(in, out, llSizeX, llSizeY, levels - 1);
// copy reverse transformed LL band from output back into the input
memCopy(in, out, llSizeX, llSizeY);
}
// select right width of kernel for the size of the image
if(sizeX >= 960) {
launchRDWT97Kernel<192, 8>(in, out, sizeX, sizeY);
} else if (sizeX >= 480) {
launchRDWT97Kernel<128, 6>(in, out, sizeX, sizeY);
} else {
launchRDWT97Kernel<64, 6>(in, out, sizeX, sizeY);
}
}
} // end of namespace dwt_cuda


@ -1,338 +0,0 @@
/// @file transform_buffer.h
/// @brief Buffer with separated even and odd columns and related algorithms.
/// @author Martin Jirman (207962@mail.muni.cz)
/// @date 2011-01-20 18:33
///
///
/// Copyright (c) 2011 Martin Jirman
/// All rights reserved.
///
/// Redistribution and use in source and binary forms, with or without
/// modification, are permitted provided that the following conditions are met:
///
/// * Redistributions of source code must retain the above copyright
/// notice, this list of conditions and the following disclaimer.
/// * Redistributions in binary form must reproduce the above copyright
/// notice, this list of conditions and the following disclaimer in the
/// documentation and/or other materials provided with the distribution.
///
/// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
/// AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
/// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
/// ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE
/// LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
/// CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
/// SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
/// INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
/// CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
/// ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
/// POSSIBILITY OF SUCH DAMAGE.
///
#ifndef TRANSFORM_BUFFER_H
#define TRANSFORM_BUFFER_H
namespace dwt_cuda {
/// Buffer (in shared memory of GPU) where block of input image is stored,
/// but odd and even lines are separated. (Generates less bank conflicts when
/// using lifting schema.) All operations expect SIZE_X threads.
/// Also implements basic building blocks of lifting schema.
/// @tparam SIZE_X width of the buffer excluding two boundaries (Also
/// a number of threads participating on all operations.)
/// Must be divisible by 4.
/// @tparam SIZE_Y height of buffer (total number of lines)
/// @tparam BOUNDARY_X number of extra pixels at the left and right side
/// boundary is expected to be smaller than half SIZE_X
/// Must be divisible by 2.
template <typename T, int SIZE_X, int SIZE_Y, int BOUNDARY_X>
class TransformBuffer {
public:
enum {
/// difference between pointers to two vertical neighbors
VERTICAL_STRIDE = BOUNDARY_X + (SIZE_X / 2)
};
private:
enum {
/// number of shared memory banks - needed for correct padding
#ifdef __CUDA_ARCH__
SHM_BANKS = ((__CUDA_ARCH__ >= 200) ? 32 : 16),
#else
SHM_BANKS = 16, // for host code only - can be anything, won't be used
#endif
/// size of one of two buffers (odd or even)
BUFFER_SIZE = VERTICAL_STRIDE * SIZE_Y,
/// unused space between two buffers
PADDING = SHM_BANKS - ((BUFFER_SIZE + SHM_BANKS / 2) % SHM_BANKS),
/// offset of the odd columns buffer from the beginning of data buffer
ODD_OFFSET = BUFFER_SIZE + PADDING,
};
/// buffer for both even and odd columns
T data[2 * BUFFER_SIZE + PADDING];
/// Applies specified function to all central elements while also passing
/// previous and next elements as parameters.
/// @param count count of central elements to apply function to
/// @param prevOffset offset of first central element's predecessor
/// @param midOffset offset of first central element
/// @param nextOffset offset of first central element's successor
/// @param function the function itself
template <typename FUNC>
__device__ void horizontalStep(const int count, const int prevOffset,
const int midOffset, const int nextOffset,
const FUNC &function) {
// number of unchecked iterations
const int STEPS = count / SIZE_X;
// items remaining after last unchecked iteration
const int finalCount = count % SIZE_X;
// offset of items processed in last (checked) iteration
const int finalOffset = count - finalCount;
// all threads perform fixed number of iterations ...
for (int i = 0; i < STEPS; i++) {
// for(int i = 0; i < 3; i++) {
const T previous = data[prevOffset + i * SIZE_X + threadIdx.x];
const T next = data[nextOffset + i * SIZE_X + threadIdx.x];
T &center = data[midOffset + i * SIZE_X + threadIdx.x];
// function(previous, center, (nextOffset + i*SIZE_X+threadIdx.x));
function(previous, center, next); // the real one
}
// ... but not all threads participate on final iteration
if (threadIdx.x < finalCount) {
const T previous = data[prevOffset + finalOffset + threadIdx.x];
const T next = data[nextOffset + finalOffset + threadIdx.x];
T &center = data[midOffset + finalOffset + threadIdx.x];
// function(previous, center, (nextOffset+finalOffset+threadIdx.x));
// kaixi
function(previous, center, next); // the real one
}
}
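// Worked example (editor's note, not part of the original header): with
// SIZE_X == 192 threads and count == 1000 elements,
//   STEPS       = 1000 / 192 = 5   full passes by all threads,
//   finalCount  = 1000 % 192 = 40  remaining elements,
//   finalOffset = 1000 - 40  = 960,
// so only threads 0..39 handle one extra element in the final (partial) pass.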
public:
__device__ void getPrintData() {
//
for (int i = 0; i < 2 * BUFFER_SIZE + PADDING; i++) {
printf(" index: %d data: %f \n ", i, data[i]);
}
}
/// Gets offset of the column with given index. Central columns have
/// indices from 0 to SIZE_X - 1, left boundary columns have negative
/// indices and right boundary columns' indices start with SIZE_X.
/// @param columnIndex index of column to get pointer to
/// @return offset of the first item of column with specified index
__device__ int getColumnOffset(int columnIndex) {
columnIndex += BOUNDARY_X; // skip boundary
return columnIndex / 2 // select right column
+ (columnIndex & 1) * ODD_OFFSET; // select odd or even buffer
}
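// Worked example (editor's note, not part of the original header): with
// BOUNDARY_X == 4, as used by the 9/7 buffers, columnIndex + BOUNDARY_X gives
//   column -4 -> 0 -> slot 0 of the even buffer
//   column -3 -> 1 -> slot 0 of the odd buffer (+ ODD_OFFSET)
//   column -2 -> 2 -> slot 1 of the even buffer
//   column -1 -> 3 -> slot 1 of the odd buffer (+ ODD_OFFSET)
//   column  0 -> 4 -> slot 2 of the even buffer
// so neighbouring columns alternate between the two sub-buffers, which is
// what keeps the lifting steps mostly free of shared-memory bank conflicts.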
/// Provides access to data of the transform buffer.
/// @param index index of the item to work with
/// @return reference to item at given index
__device__ T &operator[](const int index) { return data[index]; }
/// Applies specified function to all horizontally even elements in
/// specified lines. (Including even elements in boundaries except
/// first even element in first left boundary.) SIZE_X threads participate
/// and synchronization is needed before result can be used.
/// @param firstLine index of first line
/// @param numLines count of lines
/// @param func function to be applied on all even elements
/// parameters: previous (odd) element, the even
/// element itself and finally next (odd) element
template <typename FUNC>
__device__ void forEachHorizontalEven(const int firstLine, const int numLines,
const FUNC &func) {
// number of even elements to apply function to
const int count = numLines * VERTICAL_STRIDE - 1;
// offset of first even element
const int centerOffset = firstLine * VERTICAL_STRIDE + 1;
// offset of odd predecessor of first even element
const int prevOffset = firstLine * VERTICAL_STRIDE + ODD_OFFSET;
// offset of odd successor of first even element
const int nextOffset = prevOffset + 1;
// if(threadIdx.x == 0) {
// printf("forEachHorizontalEven count %d, centerOffset %d prevOffset %d
// nextOffset %d \n", count, centerOffset, prevOffset, nextOffset);
// }
// call generic horizontal step function
horizontalStep(count, prevOffset, centerOffset, nextOffset, func);
}
/// Applies given function to all horizontally odd elements in specified
/// lines. (Including odd elements in boundaries except last odd element
/// in last right boundary.) SIZE_X threads participate and synchronization
/// is needed before result can be used.
/// @param firstLine index of first line
/// @param numLines count of lines
/// @param func function to be applied on all odd elements
/// parameters: previous (even) element, the odd
/// element itself and finally next (even) element
template <typename FUNC>
__device__ void forEachHorizontalOdd(const int firstLine, const int numLines,
const FUNC &func) {
// number of odd elements to apply function to
const int count = numLines * VERTICAL_STRIDE - 1;
// offset of even predecessor of first odd element
const int prevOffset = firstLine * VERTICAL_STRIDE;
// offset of first odd element
const int centerOffset = prevOffset + ODD_OFFSET;
// offset of even successor of first odd element
const int nextOffset = prevOffset + 1;
// if(threadIdx.x == 0) {
// printf("forEachHorizontalOdd count %d, centerOffset %d prevOffset %d
// nextOffset %d \n", count, centerOffset, prevOffset, nextOffset);
// }
// call generic horizontal step function
horizontalStep(count, prevOffset, centerOffset, nextOffset, func);
}
/// Applies specified function to all even elements (except element #0)
/// of given column. Each thread takes care of one column, so there's
/// no need for synchronization.
/// @param columnOffset offset of thread's column
/// @param f function to be applied on all even elements
/// parameters: previous (odd) element, the even
/// element itself and finally next (odd) element
template <typename F>
__device__ void forEachVerticalEven(const int columnOffset, const F &f) {
if (SIZE_Y > 3) { // makes no sense otherwise
const int steps = SIZE_Y / 2 - 1;
for (int i = 0; i < steps; i++) {
const int row = 2 + i * 2;
const T prev = data[columnOffset + (row - 1) * VERTICAL_STRIDE];
const T next = data[columnOffset + (row + 1) * VERTICAL_STRIDE];
f(prev, data[columnOffset + row * VERTICAL_STRIDE], next);
//--------------- FOR TEST -----------------
/* __syncthreads();
if ((blockIdx.x * blockDim.x + threadIdx.x) == 0){
diffOut[2500]++;
diffOut[diffOut[2500]] = 2;//data[columnOffset +
row * VERTICAL_STRIDE];
}
__syncthreads();
*/ //--------------- FOR TEST -----------------
}
}
}
/// Applies specified function to all odd elements of given column.
/// Each thread takes care of one column, so there's no need for
/// synchronization.
/// @param columnOffset offset of thread's column
/// @param f function to be applied on all odd elements
/// parameters: previous (even) element, the odd
/// element itself and finally next (even) element
template <typename F>
__device__ void forEachVerticalOdd(const int columnOffset, const F &f) {
const int steps = (SIZE_Y - 1) / 2;
for (int i = 0; i < steps; i++) {
const int row = i * 2 + 1;
const T prev = data[columnOffset + (row - 1) * VERTICAL_STRIDE];
const T next = data[columnOffset + (row + 1) * VERTICAL_STRIDE];
f(prev, data[columnOffset + row * VERTICAL_STRIDE], next);
//--------------- FOR TEST -----------------
/* __syncthreads();
if ((blockIdx.x * blockDim.x + threadIdx.x) == 0){
diffOut[2500]++;
diffOut[diffOut[2500]] = 1; //data[columnOffset +
row * VERTICAL_STRIDE];
}
__syncthreads();
*/ //--------------- FOR TEST -----------------
}
}
/// Scales elements at specified lines.
/// @param evenScale scaling factor for horizontally even elements
/// @param oddScale scaling factor for horizontally odd elements
/// @param numLines number of lines, whose elements should be scaled
/// @param firstLine index of first line to scale elements in
__device__ void scaleHorizontal(const T evenScale, const T oddScale,
const int firstLine, const int numLines) {
const int offset = firstLine * VERTICAL_STRIDE;
const int count = numLines * VERTICAL_STRIDE;
const int steps = count / SIZE_X;
const int finalCount = count % SIZE_X;
const int finalOffset = count - finalCount;
// printf("scaleHorizontal sizeX: %d offset %d, count, %d, steps, %d,
// finalCount %d, finalOffset %d \n", SIZE_X, offset, count, steps,
// finalCount, finalOffset);
// run iterations where all threads participate
for (int i = 0; i < steps; i++) {
data[threadIdx.x + i * SIZE_X + offset] *= evenScale;
// if(threadIdx.x + i * SIZE_X + offset == 531) {
// printf("threadidx 531: %d \n", threadIdx.x);
// }
// if(threadIdx.x + i * SIZE_X + offset + ODD_OFFSET == 531) {
// printf("threadidx 531: %d \n", threadIdx.x);
// }
data[threadIdx.x + i * SIZE_X + offset + ODD_OFFSET] *= oddScale;
}
// some threads also finish remaining unscaled items
if (threadIdx.x < finalCount) {
data[threadIdx.x + finalOffset + offset] *= evenScale;
// if(threadIdx.x + finalOffset + offset == 531) {
// printf("threadidx 531: %d \n", threadIdx.x);
// }
// if(threadIdx.x + finalOffset + offset + ODD_OFFSET == 531) {
// printf("threadidx 531: %d \n", threadIdx.x);
// }
data[threadIdx.x + finalOffset + offset + ODD_OFFSET] *= oddScale;
}
}
/// Scales elements in specified column.
/// @param evenScale scaling factor for vertically even elements
/// @param oddScale scaling factor for vertically odd elements
/// @param columnOffset offset of the column to work with
/// @param numLines number of lines, whose elements should be scaled
/// @param firstLine index of first line to scale elements in
__device__ void scaleVertical(const T evenScale, const T oddScale,
const int columnOffset, const int numLines,
const int firstLine) {
for (int i = firstLine; i < (numLines + firstLine); i++) {
if (i & 1) {
data[columnOffset + i * VERTICAL_STRIDE] *= oddScale;
} else {
data[columnOffset + i * VERTICAL_STRIDE] *= evenScale;
}
}
}
//****************For Test(Feb23), test inter parameters*************
__device__ int getVERTICAL_STRIDE() { return VERTICAL_STRIDE; }
__device__ int getSHM_BANKS() { return SHM_BANKS; }
__device__ int getBuffersize() { return BUFFER_SIZE; }
__device__ int getPADDING() { return PADDING; }
__device__ int getODD_OFFSET() { return ODD_OFFSET; }
//****************For Test(Feb23), test inter parameters*************
}; // end of class TransformBuffer
} // namespace dwt_cuda
#endif // TRANSFORM_BUFFER_H
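// --- editor's usage sketch (not part of the original header) ---
// Minimal pattern for using the buffer from a kernel; the window width (64),
// line count (15) and boundary width (2) below are illustrative only:
//   typedef dwt_cuda::TransformBuffer<int, 64, 15, 2> Buf;
//   __shared__ Buf buf;
//   const int col = buf.getColumnOffset(threadIdx.x);  // this thread's column
//   buf[col + 3 * Buf::VERTICAL_STRIDE] = 0;            // row 3 of that column
// --- end of sketch ---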


@ -1,401 +0,0 @@
/*
* Copyright (c) 2009, Jiri Matela
* All rights reserved.
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions are met:
*
* * Redistributions of source code must retain the above copyright
* notice, this list of conditions and the following disclaimer.
* * Redistributions in binary form must reproduce the above copyright
* notice, this list of conditions and the following disclaimer in the
* documentation and/or other materials provided with the distribution.
*
* THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
* AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
* IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
* ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE
* LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
* CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
* SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
* INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
* CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
* ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
* POSSIBILITY OF SUCH DAMAGE.
*/
#include <unistd.h>
#include <error.h>
#include <stdio.h>
#include <stdlib.h>
#include <fcntl.h>
#include <errno.h>
#include <string.h>
#include <assert.h>
#include <sys/time.h>
#include <getopt.h>
#include "common.h"
#include "components.h"
#include "dwt.h"
struct dwt {
char * srcFilename;
char * outFilename;
unsigned char *srcImg;
int pixWidth;
int pixHeight;
int components;
int dwtLvls;
};
int getImg(char * srcFilename, unsigned char *srcImg, int inputSize)
{
// printf("Loading ipnput: %s\n", srcFilename);
char *path = "../../data/dwt2d/";
char *newSrc = NULL;
if((newSrc = (char *)malloc(strlen(srcFilename)+strlen(path)+1)) != NULL)
{
newSrc[0] = '\0';
strcat(newSrc, path);
strcat(newSrc, srcFilename);
srcFilename= newSrc;
}
printf("Loading ipnput: %s\n", srcFilename);
//srcFilename = strcat("../../data/dwt2d/",srcFilename);
//read image
int i = open(srcFilename, O_RDONLY, 0644);
if (i == -1) {
error(0,errno,"cannot access %s", srcFilename);
return -1;
}
int ret = read(i, srcImg, inputSize);
printf("precteno %d, inputsize %d\n", ret, inputSize);
close(i);
return 0;
}
void usage() {
printf("dwt [otpions] src_img.rgb <out_img.dwt>\n\
-d, --dimension\t\tdimensions of src img, e.g. 1920x1080\n\
-c, --components\t\tnumber of color components, default 3\n\
-b, --depth\t\t\tbit depth, default 8\n\
-l, --level\t\t\tDWT level, default 3\n\
-D, --device\t\t\tcuda device\n\
-f, --forward\t\t\tforward transform\n\
-r, --reverse\t\t\treverse transform\n\
-9, --97\t\t\t9/7 transform\n\
-5, --53\t\t\t5/3 transform\n\
-w, --write-visual\t\twrite output in visual (tiled) fashion instead of linear\n");
}
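// Example invocation (editor's note, mirroring the bundled run scripts):
//   ./dwt2d rgb.bmp rgb.dwt -d 1024x1024 -f -5 -l 3
// i.e. a forward 5/3 transform of a 1024x1024 RGB image with 3 DWT levels.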
template <typename T>
void processDWT(struct dwt *d, int forward, int writeVisual)
{
int componentSize = d->pixWidth*d->pixHeight*sizeof(T);
T *c_r_out, *backup ;
cudaMalloc((void**)&c_r_out, componentSize); //< aligned component size
cudaCheckError("Alloc device memory");
cudaMemset(c_r_out, 0, componentSize);
cudaCheckError("Memset device memory");
cudaMalloc((void**)&backup, componentSize); //< aligned component size
cudaCheckError("Alloc device memory");
cudaMemset(backup, 0, componentSize);
cudaCheckError("Memset device memory");
if (d->components == 3) {
/* Alloc two more buffers for G and B */
T *c_g_out, *c_b_out;
cudaMalloc((void**)&c_g_out, componentSize); //< aligned component size
cudaCheckError("Alloc device memory");
cudaMemset(c_g_out, 0, componentSize);
cudaCheckError("Memset device memory");
cudaMalloc((void**)&c_b_out, componentSize); //< aligned component size
cudaCheckError("Alloc device memory");
cudaMemset(c_b_out, 0, componentSize);
cudaCheckError("Memset device memory");
/* Load components */
T *c_r, *c_g, *c_b;
cudaMalloc((void**)&c_r, componentSize); //< R, aligned component size
cudaCheckError("Alloc device memory");
cudaMemset(c_r, 0, componentSize);
cudaCheckError("Memset device memory");
cudaMalloc((void**)&c_g, componentSize); //< G, aligned component size
cudaCheckError("Alloc device memory");
cudaMemset(c_g, 0, componentSize);
cudaCheckError("Memset device memory");
cudaMalloc((void**)&c_b, componentSize); //< B, aligned component size
cudaCheckError("Alloc device memory");
cudaMemset(c_b, 0, componentSize);
cudaCheckError("Memset device memory");
rgbToComponents(c_r, c_g, c_b, d->srcImg, d->pixWidth, d->pixHeight);
/* Compute DWT and always store into file */
nStage2dDWT(c_r, c_r_out, backup, d->pixWidth, d->pixHeight, d->dwtLvls, forward);
nStage2dDWT(c_g, c_g_out, backup, d->pixWidth, d->pixHeight, d->dwtLvls, forward);
nStage2dDWT(c_b, c_b_out, backup, d->pixWidth, d->pixHeight, d->dwtLvls, forward);
// -------test----------
// T *h_r_out=(T*)malloc(componentSize);
// cudaMemcpy(h_r_out, c_g_out, componentSize, cudaMemcpyDeviceToHost);
// int ii;
// for(ii=0;ii<componentSize/sizeof(T);ii++) {
// fprintf(stderr, "%d ", h_r_out[ii]);
// if((ii+1) % (d->pixWidth) == 0) fprintf(stderr, "\n");
// }
// -------test----------
/* Store DWT to file */
writeLinear(c_r_out, d->pixWidth, d->pixHeight, d->outFilename, ".r");
// writeLinear(c_g_out, d->pixWidth, d->pixHeight, d->outFilename, ".g");
// writeLinear(c_b_out, d->pixWidth, d->pixHeight, d->outFilename, ".b");
#ifdef OUTPUT
if (writeVisual) {
writeNStage2DDWT(c_r_out, d->pixWidth, d->pixHeight, d->dwtLvls, d->outFilename, ".r");
writeNStage2DDWT(c_g_out, d->pixWidth, d->pixHeight, d->dwtLvls, d->outFilename, ".g");
writeNStage2DDWT(c_b_out, d->pixWidth, d->pixHeight, d->dwtLvls, d->outFilename, ".b");
} else {
writeLinear(c_r_out, d->pixWidth, d->pixHeight, d->outFilename, ".r");
writeLinear(c_g_out, d->pixWidth, d->pixHeight, d->outFilename, ".g");
writeLinear(c_b_out, d->pixWidth, d->pixHeight, d->outFilename, ".b");
}
#endif
cudaFree(c_r);
cudaCheckError("Cuda free");
cudaFree(c_g);
cudaCheckError("Cuda free");
cudaFree(c_b);
cudaCheckError("Cuda free");
cudaFree(c_g_out);
cudaCheckError("Cuda free");
cudaFree(c_b_out);
cudaCheckError("Cuda free");
}
else if (d->components == 1) {
//Load component
T *c_r;
cudaMalloc((void**)&(c_r), componentSize); //< R, aligned component size
cudaCheckError("Alloc device memory");
cudaMemset(c_r, 0, componentSize);
cudaCheckError("Memset device memory");
bwToComponent(c_r, d->srcImg, d->pixWidth, d->pixHeight);
// Compute DWT
nStage2dDWT(c_r, c_r_out, backup, d->pixWidth, d->pixHeight, d->dwtLvls, forward);
// Store DWT to file
// #ifdef OUTPUT
if (writeVisual) {
writeNStage2DDWT(c_r_out, d->pixWidth, d->pixHeight, d->dwtLvls, d->outFilename, ".out");
} else {
writeLinear(c_r_out, d->pixWidth, d->pixHeight, d->outFilename, ".lin.out");
}
// #endif
cudaFree(c_r);
cudaCheckError("Cuda free");
}
cudaFree(c_r_out);
cudaCheckError("Cuda free device");
cudaFree(backup);
cudaCheckError("Cuda free device");
}
int main(int argc, char **argv)
{
int optindex = 0;
int ch; // getopt_long() returns int, so an int is needed to compare with -1
struct option longopts[] = {
{"dimension", required_argument, 0, 'd'}, //dimensions of src img
{"components", required_argument, 0, 'c'}, //numger of components of src img
{"depth", required_argument, 0, 'b'}, //bit depth of src img
{"level", required_argument, 0, 'l'}, //level of dwt
{"device", required_argument, 0, 'D'}, //cuda device
{"forward", no_argument, 0, 'f'}, //forward transform
{"reverse", no_argument, 0, 'r'}, //reverse transform
{"97", no_argument, 0, '9'}, //9/7 transform
{"53", no_argument, 0, '5' }, //5/3transform
{"write-visual",no_argument, 0, 'w' }, //write output (subbands) in visual (tiled) order instead of linear
{"help", no_argument, 0, 'h'}
};
int pixWidth = 0; //<real pixWidth
int pixHeight = 0; //<real pixHeight
int compCount = 3; //number of components; 3 for RGB or YUV, 4 for RGBA
int bitDepth = 8;
int dwtLvls = 3; //default number of DWT levels
int device = 0;
int forward = 1; //forward transform
int dwt97 = 1; //1=dwt9/7, 0=dwt5/3 transform
int writeVisual = 0; //write output (subbands) in visual (tiled) order instead of linear
char * pos;
while ((ch = getopt_long(argc, argv, "d:c:b:l:D:fr95wh", longopts, &optindex)) != -1) {
switch (ch) {
case 'd':
pixWidth = atoi(optarg);
pos = strstr(optarg, "x");
if (pos == NULL || pixWidth == 0 || (strlen(pos) >= strlen(optarg))) {
usage();
return -1;
}
pixHeight = atoi(pos+1);
break;
case 'c':
compCount = atoi(optarg);
break;
case 'b':
bitDepth = atoi(optarg);
break;
case 'l':
dwtLvls = atoi(optarg);
break;
case 'D':
device = atoi(optarg);
break;
case 'f':
forward = 1;
break;
case 'r':
forward = 0;
break;
case '9':
dwt97 = 1;
break;
case '5':
dwt97 = 0;
break;
case 'w':
writeVisual = 1;
break;
case 'h':
usage();
return 0;
case '?':
return -1;
default :
usage();
return -1;
}
}
argc -= optind;
argv += optind;
if (argc == 0) { // at least one filename is expected
printf("Please supply src file name\n");
usage();
return -1;
}
if (pixWidth <= 0 || pixHeight <=0) {
printf("Wrong or missing dimensions\n");
usage();
return -1;
}
if (forward == 0) {
writeVisual = 0; //do not write visual when RDWT
}
// device init
int devCount;
cudaSetDevice(0);
cudaGetDeviceCount(&devCount);
cudaCheckError("Get device count");
if (devCount == 0) {
printf("No CUDA enabled device\n");
return -1;
}
if (device < 0 || device > devCount -1) {
printf("Selected device %d is out of bound. Devices on your system are in range %d - %d\n",
device, 0, devCount -1);
return -1;
}
cudaDeviceProp devProp;
cudaGetDeviceProperties(&devProp, device);
cudaCheckError("Get device properties");
// if (devProp.major < 1) {
// printf("Device %d does not support CUDA\n", device);
// return -1;
// }
printf("Using device %d: %s\n", device, devProp.name);
cudaSetDevice(device);
cudaCheckError("Set selected device");
struct dwt *d;
d = (struct dwt *)malloc(sizeof(struct dwt));
d->srcImg = NULL;
d->pixWidth = pixWidth;
d->pixHeight = pixHeight;
d->components = compCount;
d->dwtLvls = dwtLvls;
// file names
d->srcFilename = (char *)malloc(strlen(argv[0]) + 1);
strcpy(d->srcFilename, argv[0]);
if (argc == 1) { // only one filename supplied
d->outFilename = (char *)malloc(strlen(d->srcFilename) + 5);
strcpy(d->outFilename, d->srcFilename);
strcpy(d->outFilename+strlen(d->srcFilename), ".dwt");
} else {
d->outFilename = strdup(argv[1]);
}
//Input review
printf("Source file:\t\t%s\n", d->srcFilename);
printf(" Dimensions:\t\t%dx%d\n", pixWidth, pixHeight);
printf(" Components count:\t%d\n", compCount);
printf(" Bit depth:\t\t%d\n", bitDepth);
printf(" DWT levels:\t\t%d\n", dwtLvls);
printf(" Forward transform:\t%d\n", forward);
printf(" 9/7 transform:\t\t%d\n", dwt97);
//data sizes
int inputSize = pixWidth*pixHeight*compCount; //< amount of data (in bytes) to process
//load source image
cudaMallocHost((void **)&d->srcImg, inputSize);
cudaCheckError("Alloc host memory");
if (getImg(d->srcFilename, d->srcImg, inputSize) == -1)
return -1;
/* DWT */
if (forward == 1) {
if(dwt97 == 1 )
processDWT<float>(d, forward, writeVisual);
else // 5/3
processDWT<int>(d, forward, writeVisual);
}
else { // reverse
if(dwt97 == 1 )
processDWT<float>(d, forward, writeVisual);
else // 5/3
processDWT<int>(d, forward, writeVisual);
}
//writeComponent(r_cuda, pixWidth, pixHeight, srcFilename, ".g");
//writeComponent(g_wave_cuda, 512000, ".g");
//writeComponent(g_cuda, componentSize, ".g");
//writeComponent(b_wave_cuda, componentSize, ".b");
cudaFreeHost(d->srcImg);
cudaCheckError("Cuda free host");
return 0;
}


@ -1,8 +0,0 @@
./dwt2d 4.bmp z.dwt -d 4x4 -f -5 -l 3
# ./dwt2d 8.bmp -d 8x8 -f -5 -l 3
# ./dwt2d 16.bmp -d 16x16 -f -5 -l 3
# ./dwt2d 64.bmp -d 64x64 -f -5 -l 3
# ./dwt2d 192.bmp -d 192x192 -f -5 -l 3
# ls
# ./dwt2d rgb.bmp -d 1024x1024 -f -5 -l 3


@ -1,7 +0,0 @@
# ./dwt2d 192.bmp -d 192x192 -f -5 -l 3
# ls
# ./dwt2d rgb.bmp -d 1024x1024 -f -5 -l 3
# ./dwt2d 16.bmp -d 16x16 -f -9 -l 3\
./dwt2d 4.bmp -d 4x4 -r -5 -l 3
# ./dwt2d 4.bmp -d 4x4 -r -9 -l 3
# ./dwt2d 8.bmp -d 8x8 -f -9 -l 3


@ -1,14 +0,0 @@
# ./nvcc_dwt2d 192.bmp -d 192x192 -f -5 -l 3
# ls
# ./nvcc_dwt2d rgb.bmp -d 1024x1024 -f -5 -l 3
# ./nvcc_dwt2d 4.bmp -d 4x4 -f -9 -l 3
./nvcc_dwt2d 4.bmp -d 4x4 -f -5 -l 3
# ./nvcc_dwt2d 8.bmp -d 8x8 -f -9 -l 3
# ./nvcc_dwt2d 16.bmp -d 16x16 -f -5 -l 3
# ./nvcc_dwt2d 16.bmp -d 16x16 -r -5 -l 3
# ./nvcc_dwt2d 16.bmp -d 16x16 -f -9 -l 3
# ./nvcc_dwt2d 4.bmp -d 4x4 -r -9 -l 3
# ./nvcc_dwt2d 64.bmp -d 64x64 -f -5 -l 3
# ./nvcc_dwt2d 192.bmp -d 192x192 -f -5 -l 3
# ls
# ./nvcc_dwt2d rgb.bmp -d 1024x1024 -f -5 -l 3


@ -1,51 +0,0 @@
#!/bin/bash
clang++ -I. -I/include -fno-strict-aliasing dwt_cuda/fdwt53.cu dwt_cuda/fdwt97.cu dwt_cuda/common.cu dwt_cuda/rdwt97.cu dwt_cuda/rdwt53.cu components.cu dwt.cu main.cu -c --cuda-path=/usr/local/cuda-10.1 --cuda-gpu-arch=sm_50 -I. -I/include -L/usr/local/cuda-10.1/lib64 -lcudart_static -ldl -lrt -pthread -save-temps -v
export LD_LIBRARY_PATH=../../build/runtime:../../build/runtime/threadPool:$LD_LIBRARY_PATH
../../build/compilation/kernelTranslator common-cuda-nvptx64-nvidia-cuda-sm_50.bc common.bc
../../build/compilation/kernelTranslator components-cuda-nvptx64-nvidia-cuda-sm_50.bc components.bc
../../build/compilation/kernelTranslator fdwt53-cuda-nvptx64-nvidia-cuda-sm_50.bc fdwt53.bc
../../build/compilation/kernelTranslator dwt-cuda-nvptx64-nvidia-cuda-sm_50.bc dwt.bc
../../build/compilation/hostTranslator main-host-x86_64-unknown-linux-gnu.bc host.bc
../../build/compilation/hostTranslator common-host-x86_64-unknown-linux-gnu.bc common_host.bc
../../build/compilation/hostTranslator components-host-x86_64-unknown-linux-gnu.bc components_host.bc
../../build/compilation/hostTranslator dwt-host-x86_64-unknown-linux-gnu.bc dwt_host.bc
../../build/compilation/hostTranslator fdwt53-host-x86_64-unknown-linux-gnu.bc fdwt53_host.bc
../../build/compilation/hostTranslator fdwt97-host-x86_64-unknown-linux-gnu.bc fdwt97_host.bc
../../build/compilation/hostTranslator rdwt53-host-x86_64-unknown-linux-gnu.bc rdwt53_host.bc
../../build/compilation/hostTranslator rdwt97-host-x86_64-unknown-linux-gnu.bc rdwt97_host.bc
../../build/compilation/kernelTranslator fdwt97-cuda-nvptx64-nvidia-cuda-sm_50.bc fdwt97.bc
../../build/compilation/kernelTranslator rdwt97-cuda-nvptx64-nvidia-cuda-sm_50.bc rdwt97.bc
../../build/compilation/kernelTranslator rdwt53-cuda-nvptx64-nvidia-cuda-sm_50.bc rdwt53.bc
llc --relocation-model=pic --filetype=obj common.bc
llc --relocation-model=pic --filetype=obj components.bc
llc --relocation-model=pic --filetype=obj fdwt53.bc
llc --relocation-model=pic --filetype=obj dwt.bc
llc --relocation-model=pic --filetype=obj host.bc
llc --relocation-model=pic --filetype=obj common_host.bc
llc --relocation-model=pic --filetype=obj components_host.bc
llc --relocation-model=pic --filetype=obj fdwt53_host.bc
llc --relocation-model=pic --filetype=obj dwt_host.bc
llc --relocation-model=pic --filetype=obj fdwt97_host.bc
llc --relocation-model=pic --filetype=obj rdwt97_host.bc
llc --relocation-model=pic --filetype=obj rdwt53_host.bc
llc --relocation-model=pic --filetype=obj fdwt97.bc
llc --relocation-model=pic --filetype=obj rdwt97.bc
llc --relocation-model=pic --filetype=obj rdwt53.bc
g++ -g -Wall -L../../build/runtime -L../../build/runtime/threadPool -o dwt2d -fPIC -no-pie common.o components.o dwt.o fdwt53.o fdwt97.o rdwt97.o rdwt53.o host.o common_host.o components_host.o dwt_host.o fdwt53_host.o fdwt97_host.o rdwt97_host.o rdwt53_host.o -lc -lx86Runtime -lthreadPool -lpthread


@ -1,9 +0,0 @@
/usr/local/cuda/bin/nvcc -arch sm_50 -I. -I/include -O2 --compiler-options -fno-strict-aliasing -c main.cu -o main.cu.o
/usr/local/cuda/bin/nvcc -arch sm_50 -I. -I/include -O2 --compiler-options -fno-strict-aliasing -c dwt.cu -o dwt.cu.o
/usr/local/cuda/bin/nvcc -arch sm_50 -I. -I/include -O2 --compiler-options -fno-strict-aliasing -c components.cu -o components.cu.o
/usr/local/cuda/bin/nvcc -arch sm_50 -I. -I/include -O2 --compiler-options -fno-strict-aliasing -c dwt_cuda/fdwt53.cu -o dwt_cuda/fdwt53.cu.o
/usr/local/cuda/bin/nvcc -arch sm_50 -I. -I/include -O2 --compiler-options -fno-strict-aliasing -c dwt_cuda/fdwt97.cu -o dwt_cuda/fdwt97.cu.o
/usr/local/cuda/bin/nvcc -arch sm_50 -I. -I/include -O2 --compiler-options -fno-strict-aliasing -c dwt_cuda/common.cu -o dwt_cuda/common.cu.o
/usr/local/cuda/bin/nvcc -arch sm_50 -I. -I/include -O2 --compiler-options -fno-strict-aliasing -c dwt_cuda/rdwt97.cu -o dwt_cuda/rdwt97.cu.o
/usr/local/cuda/bin/nvcc -arch sm_50 -I. -I/include -O2 --compiler-options -fno-strict-aliasing -c dwt_cuda/rdwt53.cu -o dwt_cuda/rdwt53.cu.o
g++ -fPIC -o nvcc_dwt2d main.cu.o dwt.cu.o components.cu.o dwt_cuda/fdwt53.cu.o dwt_cuda/fdwt97.cu.o dwt_cuda/common.cu.o dwt_cuda/rdwt97.cu.o dwt_cuda/rdwt53.cu.o -L/usr/local/cuda/lib64 -lcudart


@ -1,396 +0,0 @@
; ModuleID = 'gaussian-cuda-nvptx64-nvidia-cuda-sm_61.bc'
source_filename = "gaussian.cu"
target datalayout = "e-i64:64-i128:128-v16:16-v32:32-n16:32:64"
target triple = "nvptx64-nvidia-cuda"
%struct.__cuda_builtin_threadIdx_t = type { i8 }
%struct.__cuda_builtin_blockIdx_t = type { i8 }
%struct.__cuda_builtin_blockDim_t = type { i8 }
%struct.cudaFuncAttributes = type { i64, i64, i64, i32, i32, i32, i32, i32, i32, i32 }
$_ZN26__cuda_builtin_threadIdx_t17__fetch_builtin_xEv = comdat any
$_ZN25__cuda_builtin_blockIdx_t17__fetch_builtin_xEv = comdat any
$_ZN25__cuda_builtin_blockDim_t17__fetch_builtin_xEv = comdat any
$_ZN26__cuda_builtin_threadIdx_t17__fetch_builtin_yEv = comdat any
$_ZN25__cuda_builtin_blockIdx_t17__fetch_builtin_yEv = comdat any
$_ZN25__cuda_builtin_blockDim_t17__fetch_builtin_yEv = comdat any
@threadIdx = extern_weak dso_local addrspace(1) global %struct.__cuda_builtin_threadIdx_t, align 1
@blockIdx = extern_weak dso_local addrspace(1) global %struct.__cuda_builtin_blockIdx_t, align 1
@blockDim = extern_weak dso_local addrspace(1) global %struct.__cuda_builtin_blockDim_t, align 1
; Function Attrs: convergent noinline nounwind optnone
define weak dso_local i32 @cudaMalloc(i8** %p, i64 %s) #0 {
entry:
%p.addr = alloca i8**, align 8
%s.addr = alloca i64, align 8
store i8** %p, i8*** %p.addr, align 8
store i64 %s, i64* %s.addr, align 8
ret i32 999
}
; Function Attrs: convergent noinline nounwind optnone
define weak dso_local i32 @cudaFuncGetAttributes(%struct.cudaFuncAttributes* %p, i8* %c) #0 {
entry:
%p.addr = alloca %struct.cudaFuncAttributes*, align 8
%c.addr = alloca i8*, align 8
store %struct.cudaFuncAttributes* %p, %struct.cudaFuncAttributes** %p.addr, align 8
store i8* %c, i8** %c.addr, align 8
ret i32 999
}
; Function Attrs: convergent noinline nounwind optnone
define weak dso_local i32 @cudaDeviceGetAttribute(i32* %value, i32 %attr, i32 %device) #0 {
entry:
%value.addr = alloca i32*, align 8
%attr.addr = alloca i32, align 4
%device.addr = alloca i32, align 4
store i32* %value, i32** %value.addr, align 8
store i32 %attr, i32* %attr.addr, align 4
store i32 %device, i32* %device.addr, align 4
ret i32 999
}
; Function Attrs: convergent noinline nounwind optnone
define weak dso_local i32 @cudaGetDevice(i32* %device) #0 {
entry:
%device.addr = alloca i32*, align 8
store i32* %device, i32** %device.addr, align 8
ret i32 999
}
; Function Attrs: convergent noinline nounwind optnone
define weak dso_local i32 @cudaOccupancyMaxActiveBlocksPerMultiprocessor(i32* %numBlocks, i8* %func, i32 %blockSize, i64 %dynamicSmemSize) #0 {
entry:
%numBlocks.addr = alloca i32*, align 8
%func.addr = alloca i8*, align 8
%blockSize.addr = alloca i32, align 4
%dynamicSmemSize.addr = alloca i64, align 8
store i32* %numBlocks, i32** %numBlocks.addr, align 8
store i8* %func, i8** %func.addr, align 8
store i32 %blockSize, i32* %blockSize.addr, align 4
store i64 %dynamicSmemSize, i64* %dynamicSmemSize.addr, align 8
ret i32 999
}
; Function Attrs: convergent noinline nounwind optnone
define weak dso_local i32 @cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags(i32* %numBlocks, i8* %func, i32 %blockSize, i64 %dynamicSmemSize, i32 %flags) #0 {
entry:
%numBlocks.addr = alloca i32*, align 8
%func.addr = alloca i8*, align 8
%blockSize.addr = alloca i32, align 4
%dynamicSmemSize.addr = alloca i64, align 8
%flags.addr = alloca i32, align 4
store i32* %numBlocks, i32** %numBlocks.addr, align 8
store i8* %func, i8** %func.addr, align 8
store i32 %blockSize, i32* %blockSize.addr, align 4
store i64 %dynamicSmemSize, i64* %dynamicSmemSize.addr, align 8
store i32 %flags, i32* %flags.addr, align 4
ret i32 999
}
; Function Attrs: convergent noinline nounwind optnone
define dso_local void @_Z4Fan1PfS_ii(float* %m_cuda, float* %a_cuda, i32 %Size, i32 %t) #0 {
entry:
%m_cuda.addr = alloca float*, align 8
%a_cuda.addr = alloca float*, align 8
%Size.addr = alloca i32, align 4
%t.addr = alloca i32, align 4
store float* %m_cuda, float** %m_cuda.addr, align 8
store float* %a_cuda, float** %a_cuda.addr, align 8
store i32 %Size, i32* %Size.addr, align 4
store i32 %t, i32* %t.addr, align 4
%call = call i32 @_ZN26__cuda_builtin_threadIdx_t17__fetch_builtin_xEv() #3
%call1 = call i32 @_ZN25__cuda_builtin_blockIdx_t17__fetch_builtin_xEv() #3
%call2 = call i32 @_ZN25__cuda_builtin_blockDim_t17__fetch_builtin_xEv() #3
%mul = mul i32 %call1, %call2
%add = add i32 %call, %mul
%0 = load i32, i32* %Size.addr, align 4
%sub = sub nsw i32 %0, 1
%1 = load i32, i32* %t.addr, align 4
%sub3 = sub nsw i32 %sub, %1
%cmp = icmp uge i32 %add, %sub3
br i1 %cmp, label %if.then, label %if.end
if.then: ; preds = %entry
br label %return
if.end: ; preds = %entry
%2 = load float*, float** %a_cuda.addr, align 8
%3 = load i32, i32* %Size.addr, align 4
%call4 = call i32 @_ZN25__cuda_builtin_blockDim_t17__fetch_builtin_xEv() #3
%call5 = call i32 @_ZN25__cuda_builtin_blockIdx_t17__fetch_builtin_xEv() #3
%mul6 = mul i32 %call4, %call5
%call7 = call i32 @_ZN26__cuda_builtin_threadIdx_t17__fetch_builtin_xEv() #3
%add8 = add i32 %mul6, %call7
%4 = load i32, i32* %t.addr, align 4
%add9 = add i32 %add8, %4
%add10 = add i32 %add9, 1
%mul11 = mul i32 %3, %add10
%idx.ext = zext i32 %mul11 to i64
%add.ptr = getelementptr inbounds float, float* %2, i64 %idx.ext
%5 = load i32, i32* %t.addr, align 4
%idx.ext12 = sext i32 %5 to i64
%add.ptr13 = getelementptr inbounds float, float* %add.ptr, i64 %idx.ext12
%6 = load float, float* %add.ptr13, align 4
%7 = load float*, float** %a_cuda.addr, align 8
%8 = load i32, i32* %Size.addr, align 4
%9 = load i32, i32* %t.addr, align 4
%mul14 = mul nsw i32 %8, %9
%idx.ext15 = sext i32 %mul14 to i64
%add.ptr16 = getelementptr inbounds float, float* %7, i64 %idx.ext15
%10 = load i32, i32* %t.addr, align 4
%idx.ext17 = sext i32 %10 to i64
%add.ptr18 = getelementptr inbounds float, float* %add.ptr16, i64 %idx.ext17
%11 = load float, float* %add.ptr18, align 4
%div = fdiv float %6, %11
%12 = load float*, float** %m_cuda.addr, align 8
%13 = load i32, i32* %Size.addr, align 4
%call19 = call i32 @_ZN25__cuda_builtin_blockDim_t17__fetch_builtin_xEv() #3
%call20 = call i32 @_ZN25__cuda_builtin_blockIdx_t17__fetch_builtin_xEv() #3
%mul21 = mul i32 %call19, %call20
%call22 = call i32 @_ZN26__cuda_builtin_threadIdx_t17__fetch_builtin_xEv() #3
%add23 = add i32 %mul21, %call22
%14 = load i32, i32* %t.addr, align 4
%add24 = add i32 %add23, %14
%add25 = add i32 %add24, 1
%mul26 = mul i32 %13, %add25
%idx.ext27 = zext i32 %mul26 to i64
%add.ptr28 = getelementptr inbounds float, float* %12, i64 %idx.ext27
%15 = load i32, i32* %t.addr, align 4
%idx.ext29 = sext i32 %15 to i64
%add.ptr30 = getelementptr inbounds float, float* %add.ptr28, i64 %idx.ext29
store float %div, float* %add.ptr30, align 4
br label %return
return: ; preds = %if.end, %if.then
ret void
}
; Function Attrs: alwaysinline convergent nounwind
define linkonce_odr dso_local i32 @_ZN26__cuda_builtin_threadIdx_t17__fetch_builtin_xEv() #1 comdat align 2 {
entry:
%0 = call i32 @llvm.nvvm.read.ptx.sreg.tid.x()
ret i32 %0
}
; Function Attrs: alwaysinline convergent nounwind
define linkonce_odr dso_local i32 @_ZN25__cuda_builtin_blockIdx_t17__fetch_builtin_xEv() #1 comdat align 2 {
entry:
%0 = call i32 @llvm.nvvm.read.ptx.sreg.ctaid.x()
ret i32 %0
}
; Function Attrs: alwaysinline convergent nounwind
define linkonce_odr dso_local i32 @_ZN25__cuda_builtin_blockDim_t17__fetch_builtin_xEv() #1 comdat align 2 {
entry:
%0 = call i32 @llvm.nvvm.read.ptx.sreg.ntid.x()
ret i32 %0
}
; Function Attrs: convergent noinline nounwind optnone
define dso_local void @_Z4Fan2PfS_S_iii(float* %m_cuda, float* %a_cuda, float* %b_cuda, i32 %Size, i32 %j1, i32 %t) #0 {
entry:
%m_cuda.addr = alloca float*, align 8
%a_cuda.addr = alloca float*, align 8
%b_cuda.addr = alloca float*, align 8
%Size.addr = alloca i32, align 4
%j1.addr = alloca i32, align 4
%t.addr = alloca i32, align 4
%xidx = alloca i32, align 4
%yidx = alloca i32, align 4
store float* %m_cuda, float** %m_cuda.addr, align 8
store float* %a_cuda, float** %a_cuda.addr, align 8
store float* %b_cuda, float** %b_cuda.addr, align 8
store i32 %Size, i32* %Size.addr, align 4
store i32 %j1, i32* %j1.addr, align 4
store i32 %t, i32* %t.addr, align 4
%call = call i32 @_ZN26__cuda_builtin_threadIdx_t17__fetch_builtin_xEv() #3
%call1 = call i32 @_ZN25__cuda_builtin_blockIdx_t17__fetch_builtin_xEv() #3
%call2 = call i32 @_ZN25__cuda_builtin_blockDim_t17__fetch_builtin_xEv() #3
%mul = mul i32 %call1, %call2
%add = add i32 %call, %mul
%0 = load i32, i32* %Size.addr, align 4
%sub = sub nsw i32 %0, 1
%1 = load i32, i32* %t.addr, align 4
%sub3 = sub nsw i32 %sub, %1
%cmp = icmp uge i32 %add, %sub3
br i1 %cmp, label %if.then, label %if.end
if.then: ; preds = %entry
br label %if.end58
if.end: ; preds = %entry
%call4 = call i32 @_ZN26__cuda_builtin_threadIdx_t17__fetch_builtin_yEv() #3
%call5 = call i32 @_ZN25__cuda_builtin_blockIdx_t17__fetch_builtin_yEv() #3
%call6 = call i32 @_ZN25__cuda_builtin_blockDim_t17__fetch_builtin_yEv() #3
%mul7 = mul i32 %call5, %call6
%add8 = add i32 %call4, %mul7
%2 = load i32, i32* %Size.addr, align 4
%3 = load i32, i32* %t.addr, align 4
%sub9 = sub nsw i32 %2, %3
%cmp10 = icmp uge i32 %add8, %sub9
br i1 %cmp10, label %if.then11, label %if.end12
if.then11: ; preds = %if.end
br label %if.end58
if.end12: ; preds = %if.end
%call13 = call i32 @_ZN25__cuda_builtin_blockIdx_t17__fetch_builtin_xEv() #3
%call14 = call i32 @_ZN25__cuda_builtin_blockDim_t17__fetch_builtin_xEv() #3
%mul15 = mul i32 %call13, %call14
%call16 = call i32 @_ZN26__cuda_builtin_threadIdx_t17__fetch_builtin_xEv() #3
%add17 = add i32 %mul15, %call16
store i32 %add17, i32* %xidx, align 4
%call18 = call i32 @_ZN25__cuda_builtin_blockIdx_t17__fetch_builtin_yEv() #3
%call19 = call i32 @_ZN25__cuda_builtin_blockDim_t17__fetch_builtin_yEv() #3
%mul20 = mul i32 %call18, %call19
%call21 = call i32 @_ZN26__cuda_builtin_threadIdx_t17__fetch_builtin_yEv() #3
%add22 = add i32 %mul20, %call21
store i32 %add22, i32* %yidx, align 4
%4 = load float*, float** %m_cuda.addr, align 8
%5 = load i32, i32* %Size.addr, align 4
%6 = load i32, i32* %xidx, align 4
%add23 = add nsw i32 %6, 1
%7 = load i32, i32* %t.addr, align 4
%add24 = add nsw i32 %add23, %7
%mul25 = mul nsw i32 %5, %add24
%8 = load i32, i32* %t.addr, align 4
%add26 = add nsw i32 %mul25, %8
%idxprom = sext i32 %add26 to i64
%arrayidx = getelementptr inbounds float, float* %4, i64 %idxprom
%9 = load float, float* %arrayidx, align 4
%10 = load float*, float** %a_cuda.addr, align 8
%11 = load i32, i32* %Size.addr, align 4
%12 = load i32, i32* %t.addr, align 4
%mul27 = mul nsw i32 %11, %12
%13 = load i32, i32* %yidx, align 4
%14 = load i32, i32* %t.addr, align 4
%add28 = add nsw i32 %13, %14
%add29 = add nsw i32 %mul27, %add28
%idxprom30 = sext i32 %add29 to i64
%arrayidx31 = getelementptr inbounds float, float* %10, i64 %idxprom30
%15 = load float, float* %arrayidx31, align 4
%mul32 = fmul contract float %9, %15
%16 = load float*, float** %a_cuda.addr, align 8
%17 = load i32, i32* %Size.addr, align 4
%18 = load i32, i32* %xidx, align 4
%add33 = add nsw i32 %18, 1
%19 = load i32, i32* %t.addr, align 4
%add34 = add nsw i32 %add33, %19
%mul35 = mul nsw i32 %17, %add34
%20 = load i32, i32* %yidx, align 4
%21 = load i32, i32* %t.addr, align 4
%add36 = add nsw i32 %20, %21
%add37 = add nsw i32 %mul35, %add36
%idxprom38 = sext i32 %add37 to i64
%arrayidx39 = getelementptr inbounds float, float* %16, i64 %idxprom38
%22 = load float, float* %arrayidx39, align 4
%sub40 = fsub contract float %22, %mul32
store float %sub40, float* %arrayidx39, align 4
%23 = load i32, i32* %yidx, align 4
%cmp41 = icmp eq i32 %23, 0
br i1 %cmp41, label %if.then42, label %if.end58
if.then42: ; preds = %if.end12
%24 = load float*, float** %m_cuda.addr, align 8
%25 = load i32, i32* %Size.addr, align 4
%26 = load i32, i32* %xidx, align 4
%add43 = add nsw i32 %26, 1
%27 = load i32, i32* %t.addr, align 4
%add44 = add nsw i32 %add43, %27
%mul45 = mul nsw i32 %25, %add44
%28 = load i32, i32* %yidx, align 4
%29 = load i32, i32* %t.addr, align 4
%add46 = add nsw i32 %28, %29
%add47 = add nsw i32 %mul45, %add46
%idxprom48 = sext i32 %add47 to i64
%arrayidx49 = getelementptr inbounds float, float* %24, i64 %idxprom48
%30 = load float, float* %arrayidx49, align 4
%31 = load float*, float** %b_cuda.addr, align 8
%32 = load i32, i32* %t.addr, align 4
%idxprom50 = sext i32 %32 to i64
%arrayidx51 = getelementptr inbounds float, float* %31, i64 %idxprom50
%33 = load float, float* %arrayidx51, align 4
%mul52 = fmul contract float %30, %33
%34 = load float*, float** %b_cuda.addr, align 8
%35 = load i32, i32* %xidx, align 4
%add53 = add nsw i32 %35, 1
%36 = load i32, i32* %t.addr, align 4
%add54 = add nsw i32 %add53, %36
%idxprom55 = sext i32 %add54 to i64
%arrayidx56 = getelementptr inbounds float, float* %34, i64 %idxprom55
%37 = load float, float* %arrayidx56, align 4
%sub57 = fsub contract float %37, %mul52
store float %sub57, float* %arrayidx56, align 4
br label %if.end58
if.end58: ; preds = %if.then, %if.then11, %if.then42, %if.end12
ret void
}
; Function Attrs: alwaysinline convergent nounwind
define linkonce_odr dso_local i32 @_ZN26__cuda_builtin_threadIdx_t17__fetch_builtin_yEv() #1 comdat align 2 {
entry:
%0 = call i32 @llvm.nvvm.read.ptx.sreg.tid.y()
ret i32 %0
}
; Function Attrs: alwaysinline convergent nounwind
define linkonce_odr dso_local i32 @_ZN25__cuda_builtin_blockIdx_t17__fetch_builtin_yEv() #1 comdat align 2 {
entry:
%0 = call i32 @llvm.nvvm.read.ptx.sreg.ctaid.y()
ret i32 %0
}
; Function Attrs: alwaysinline convergent nounwind
define linkonce_odr dso_local i32 @_ZN25__cuda_builtin_blockDim_t17__fetch_builtin_yEv() #1 comdat align 2 {
entry:
%0 = call i32 @llvm.nvvm.read.ptx.sreg.ntid.y()
ret i32 %0
}
; Function Attrs: nounwind readnone
declare i32 @llvm.nvvm.read.ptx.sreg.tid.x() #2
; Function Attrs: nounwind readnone
declare i32 @llvm.nvvm.read.ptx.sreg.ctaid.x() #2
; Function Attrs: nounwind readnone
declare i32 @llvm.nvvm.read.ptx.sreg.ntid.x() #2
; Function Attrs: nounwind readnone
declare i32 @llvm.nvvm.read.ptx.sreg.tid.y() #2
; Function Attrs: nounwind readnone
declare i32 @llvm.nvvm.read.ptx.sreg.ctaid.y() #2
; Function Attrs: nounwind readnone
declare i32 @llvm.nvvm.read.ptx.sreg.ntid.y() #2
attributes #0 = { convergent noinline nounwind optnone "correctly-rounded-divide-sqrt-fp-math"="false" "disable-tail-calls"="false" "frame-pointer"="all" "less-precise-fpmad"="false" "min-legal-vector-width"="0" "no-infs-fp-math"="false" "no-jump-tables"="false" "no-nans-fp-math"="false" "no-signed-zeros-fp-math"="false" "no-trapping-math"="false" "stack-protector-buffer-size"="8" "target-cpu"="sm_61" "target-features"="+ptx64,+sm_61" "unsafe-fp-math"="false" "use-soft-float"="false" }
attributes #1 = { alwaysinline convergent nounwind "correctly-rounded-divide-sqrt-fp-math"="false" "disable-tail-calls"="false" "frame-pointer"="all" "less-precise-fpmad"="false" "min-legal-vector-width"="0" "no-infs-fp-math"="false" "no-jump-tables"="false" "no-nans-fp-math"="false" "no-signed-zeros-fp-math"="false" "no-trapping-math"="false" "stack-protector-buffer-size"="8" "target-cpu"="sm_61" "target-features"="+ptx64,+sm_61" "unsafe-fp-math"="false" "use-soft-float"="false" }
attributes #2 = { nounwind readnone }
attributes #3 = { convergent nounwind }
!llvm.module.flags = !{!0, !1, !2}
!nvvm.annotations = !{!3, !4, !5, !6, !5, !7, !7, !7, !7, !8, !8, !7}
!llvm.ident = !{!9}
!nvvmir.version = !{!10}
!0 = !{i32 2, !"SDK Version", [2 x i32] [i32 10, i32 1]}
!1 = !{i32 1, !"wchar_size", i32 4}
!2 = !{i32 4, !"nvvm-reflect-ftz", i32 0}
!3 = !{void (float*, float*, i32, i32)* @_Z4Fan1PfS_ii, !"kernel", i32 1}
!4 = !{void (float*, float*, float*, i32, i32, i32)* @_Z4Fan2PfS_S_iii, !"kernel", i32 1}
!5 = !{null, !"align", i32 8}
!6 = !{null, !"align", i32 8, !"align", i32 65544, !"align", i32 131080}
!7 = !{null, !"align", i32 16}
!8 = !{null, !"align", i32 16, !"align", i32 65552, !"align", i32 131088}
!9 = !{!"clang version 10.0.1 (https://github.com/llvm/llvm-project.git ef32c611aa214dea855364efd7ba451ec5ec3f74)"}
!10 = !{i32 1, i32 4}

File diff suppressed because one or more lines are too long


@ -1,522 +0,0 @@
/*-----------------------------------------------------------
** gaussian.cu -- The program is to solve a linear system Ax = b
** by using Gaussian Elimination. The algorithm on page 101
** ("Foundations of Parallel Programming") is used.
** The sequential version is gaussian.c. This parallel
** implementation converts three independent for() loops
** into three Fans. Use the data file ge_3.dat to verify
** the correctness of the output.
**
** Written by Andreas Kura, 02/15/95
** Modified by Chong-wei Xu, 04/20/95
** Modified by Chris Gregg for CUDA, 07/20/2009
**-----------------------------------------------------------
*/
#include "cuda_runtime.h"
#include <math.h>
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <sys/time.h>
#ifdef TIMING
#include "timing.h"
#endif
#ifdef RD_WG_SIZE_0_0
#define MAXBLOCKSIZE RD_WG_SIZE_0_0
#elif defined(RD_WG_SIZE_0)
#define MAXBLOCKSIZE RD_WG_SIZE_0
#elif defined(RD_WG_SIZE)
#define MAXBLOCKSIZE RD_WG_SIZE
#else
#define MAXBLOCKSIZE 512
#endif
// 2D defines. Go from specific to general
#ifdef RD_WG_SIZE_1_0
#define BLOCK_SIZE_XY RD_WG_SIZE_1_0
#elif defined(RD_WG_SIZE_1)
#define BLOCK_SIZE_XY RD_WG_SIZE_1
#elif defined(RD_WG_SIZE)
#define BLOCK_SIZE_XY RD_WG_SIZE
#else
#define BLOCK_SIZE_XY 1
#endif
#ifdef TIMING
struct timeval tv;
struct timeval tv_total_start, tv_total_end;
struct timeval tv_h2d_start, tv_h2d_end;
struct timeval tv_d2h_start, tv_d2h_end;
struct timeval tv_kernel_start, tv_kernel_end;
struct timeval tv_mem_alloc_start, tv_mem_alloc_end;
struct timeval tv_close_start, tv_close_end;
float init_time = 0, mem_alloc_time = 0, h2d_time = 0, kernel_time = 0,
d2h_time = 0, close_time = 0, total_time = 0;
#endif
int Size;
float *a, *b, *finalVec;
float *m;
FILE *fp;
void InitProblemOnce(char *filename);
void InitPerRun();
void ForwardSub();
void BackSub();
__global__ void Fan1(float *m, float *a, int Size, int t);
__global__ void Fan2(float *m, float *a, float *b, int Size, int j1, int t);
void InitMat(float *ary, int nrow, int ncol);
void InitAry(float *ary, int ary_size);
void PrintMat(float *ary, int nrow, int ncolumn);
void PrintAry(float *ary, int ary_size);
void PrintDeviceProperties();
void checkCUDAError(const char *msg);
unsigned int totalKernelTime = 0;
// create both matrix and right hand side, Ke Wang 2013/08/12 11:51:06
void create_matrix(float *m, int size) {
int i, j;
float lamda = -0.01;
float coe[2 * size - 1];
float coe_i = 0.0;
for (i = 0; i < size; i++) {
coe_i = 10 * exp(lamda * i);
j = size - 1 + i;
coe[j] = coe_i;
j = size - 1 - i;
coe[j] = coe_i;
}
for (i = 0; i < size; i++) {
for (j = 0; j < size; j++) {
m[i * size + j] = coe[size - 1 - i + j];
}
}
}
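/* Added note, not in the original source: with lamda = -0.01 the loops above
** build a symmetric Toeplitz matrix whose entries are
**   m[i * size + j] = 10 * exp(lamda * |i - j|),
** i.e. the right-hand side b of all ones is paired with a dense matrix whose
** values decay slowly away from the diagonal. */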
int main(int argc, char *argv[]) {
printf("WG size of kernel 1 = %d, WG size of kernel 2= %d X %d\n",
MAXBLOCKSIZE, BLOCK_SIZE_XY, BLOCK_SIZE_XY);
int verbose = 1;
int i, j;
char flag;
if (argc < 2) {
printf("Usage: gaussian -f filename / -s size [-q]\n\n");
printf("-q (quiet) suppresses printing the matrix and result values.\n");
printf("-f (filename) path of input file\n");
printf(
"-s (size) size of matrix. Create matrix and rhs in this program \n");
printf(
"The first line of the file contains the dimension of the matrix, n.");
printf("The second line of the file is a newline.\n");
printf("The next n lines contain n tab separated values for the matrix.");
printf("The next line of the file is a newline.\n");
printf("The next line of the file is a 1xn vector with tab separated "
"values.\n");
printf("The next line of the file is a newline. (optional)\n");
printf("The final line of the file is the pre-computed solution. "
"(optional)\n");
printf("Example: matrix4.txt:\n");
printf("4\n");
printf("\n");
printf("-0.6 -0.5 0.7 0.3\n");
printf("-0.3 -0.9 0.3 0.7\n");
printf("-0.4 -0.5 -0.3 -0.8\n");
printf("0.0 -0.1 0.2 0.9\n");
printf("\n");
printf("-0.85 -0.68 0.24 -0.53\n");
printf("\n");
printf("0.7 0.0 -0.4 -0.5\n");
exit(0);
}
cudaSetDevice(0);
PrintDeviceProperties();
// char filename[100];
// sprintf(filename,"matrices/matrix%d.txt",size);
for (i = 1; i < argc; i++) {
if (argv[i][0] == '-') { // flag
flag = argv[i][1];
switch (flag) {
case 's': // platform
i++;
Size = atoi(argv[i]);
printf("Create matrix internally in parse, size = %d \n", Size);
a = (float *)malloc(Size * Size * sizeof(float));
create_matrix(a, Size);
b = (float *)malloc(Size * sizeof(float));
for (j = 0; j < Size; j++)
b[j] = 1.0;
m = (float *)malloc(Size * Size * sizeof(float));
break;
case 'f': // platform
i++;
printf("Read file from %s \n", argv[i]);
InitProblemOnce(argv[i]);
break;
      case 'q': // quiet: suppress printing of the matrices and result values
        verbose = 0;
break;
}
}
}
// InitProblemOnce(filename);
InitPerRun();
// begin timing
struct timeval time_start;
gettimeofday(&time_start, NULL);
// run kernels
ForwardSub();
// end timing
struct timeval time_end;
gettimeofday(&time_end, NULL);
unsigned int time_total = (time_end.tv_sec * 1000000 + time_end.tv_usec) -
(time_start.tv_sec * 1000000 + time_start.tv_usec);
if (verbose) {
printf("Matrix m is: \n");
PrintMat(m, Size, Size);
printf("Matrix a is: \n");
PrintMat(a, Size, Size);
printf("Array b is: \n");
PrintAry(b, Size);
}
BackSub();
if (verbose) {
printf("The final solution is: \n");
PrintAry(finalVec, Size);
}
printf("\nTime total (including memory transfers)\t%f sec\n",
time_total * 1e-6);
printf("Time for CUDA kernels:\t%f sec\n", totalKernelTime * 1e-6);
/*printf("%d,%d\n",size,time_total);
fprintf(stderr,"%d,%d\n",size,time_total);*/
free(m);
free(a);
free(b);
#ifdef TIMING
printf("Exec: %f\n", kernel_time);
#endif
}
/*------------------------------------------------------
** PrintDeviceProperties
**-----------------------------------------------------
*/
void PrintDeviceProperties() {
cudaDeviceProp deviceProp;
int nDevCount = 0;
cudaGetDeviceCount(&nDevCount);
printf("Total Device found: %d", nDevCount);
for (int nDeviceIdx = 0; nDeviceIdx < nDevCount; ++nDeviceIdx) {
memset(&deviceProp, 0, sizeof(deviceProp));
if (cudaSuccess == cudaGetDeviceProperties(&deviceProp, nDeviceIdx)) {
printf("\nDevice Name \t\t - %s ", deviceProp.name);
printf("\n**************************************");
printf("\nTotal Global Memory\t\t\t - %lu KB",
deviceProp.totalGlobalMem / 1024);
printf("\nShared memory available per block \t - %lu KB",
deviceProp.sharedMemPerBlock / 1024);
printf("\nNumber of registers per thread block \t - %d",
deviceProp.regsPerBlock);
printf("\nWarp size in threads \t\t\t - %d", deviceProp.warpSize);
printf("\nMemory Pitch \t\t\t\t - %zu bytes", deviceProp.memPitch);
printf("\nMaximum threads per block \t\t - %d",
deviceProp.maxThreadsPerBlock);
printf("\nMaximum Thread Dimension (block) \t - %d %d %d",
deviceProp.maxThreadsDim[0], deviceProp.maxThreadsDim[1],
deviceProp.maxThreadsDim[2]);
printf("\nMaximum Thread Dimension (grid) \t - %d %d %d",
deviceProp.maxGridSize[0], deviceProp.maxGridSize[1],
deviceProp.maxGridSize[2]);
printf("\nTotal constant memory \t\t\t - %zu bytes",
deviceProp.totalConstMem);
printf("\nCUDA ver \t\t\t\t - %d.%d", deviceProp.major, deviceProp.minor);
printf("\nClock rate \t\t\t\t - %d KHz", deviceProp.clockRate);
printf("\nTexture Alignment \t\t\t - %zu bytes",
deviceProp.textureAlignment);
printf("\nDevice Overlap \t\t\t\t - %s",
deviceProp.deviceOverlap ? "Allowed" : "Not Allowed");
printf("\nNumber of Multi processors \t\t - %d\n\n",
deviceProp.multiProcessorCount);
} else
printf("\n%s", cudaGetErrorString(cudaGetLastError()));
}
}
/*------------------------------------------------------
** InitProblemOnce -- Initialize all of matrices and
** vectors by opening a data file specified by the user.
**
** We used dynamic array *a, *b, and *m to allocate
** the memory storages.
**------------------------------------------------------
*/
void InitProblemOnce(char *filename) {
// char *filename = argv[1];
// printf("Enter the data file name: ");
// scanf("%s", filename);
printf("The file name is: %s\n", filename);
fp = fopen(filename, "r");
fscanf(fp, "%d", &Size);
a = (float *)malloc(Size * Size * sizeof(float));
InitMat(a, Size, Size);
printf("The input matrix a is:\n");
PrintMat(a, Size, Size);
b = (float *)malloc(Size * sizeof(float));
InitAry(b, Size);
printf("The input array b is:\n");
PrintAry(b, Size);
m = (float *)malloc(Size * Size * sizeof(float));
}
/*------------------------------------------------------
** InitPerRun() -- Initialize the contents of the
** multiplier matrix **m
**------------------------------------------------------
*/
void InitPerRun() {
int i;
for (i = 0; i < Size * Size; i++)
*(m + i) = 0.0;
}
/*-------------------------------------------------------
** Fan1() -- Calculate multiplier matrix
** Pay attention to the index. Index i gives the range
** 0 to range-1. The real index values must be adjusted
** by the value of t, which is defined in ForwardSub().
**-------------------------------------------------------
*/
__global__ void Fan1(float *m_cuda, float *a_cuda, int Size, int t) {
// if(threadIdx.x + blockIdx.x * blockDim.x >= Size-1-t) {
// printf("blockIDx.x: %d, threadIdx.x: %d, Size: %d, t:%d,
// Size-1-t: %d\n",blockIdx.x,threadIdx.x,Size,t,Size-1-t);
// }
if (threadIdx.x + blockIdx.x * blockDim.x >= Size - 1 - t)
return;
*(m_cuda + Size * (blockDim.x * blockIdx.x + threadIdx.x + t + 1) + t) =
*(a_cuda + Size * (blockDim.x * blockIdx.x + threadIdx.x + t + 1) + t) /
*(a_cuda + Size * t + t);
}
/*-------------------------------------------------------
** Fan2() -- Modify the matrix A into LUD
**-------------------------------------------------------
*/
__global__ void Fan2(float *m_cuda, float *a_cuda, float *b_cuda, int Size,
int j1, int t) {
if (threadIdx.x + blockIdx.x * blockDim.x >= Size - 1 - t)
return;
if (threadIdx.y + blockIdx.y * blockDim.y >= Size - t)
return;
int xidx = blockIdx.x * blockDim.x + threadIdx.x;
int yidx = blockIdx.y * blockDim.y + threadIdx.y;
// printf("blockIdx.x: %d, threadIdx.x: %d, blockIdx.y: %d, threadIdx.y: %d,
// blockDim.x: %d, blockDim.y:
// %d\n",blockIdx.x,threadIdx.x,blockIdx.y,threadIdx.y,blockDim.x,blockDim.y);
a_cuda[Size * (xidx + 1 + t) + (yidx + t)] -=
m_cuda[Size * (xidx + 1 + t) + t] * a_cuda[Size * t + (yidx + t)];
// a_cuda[xidx+1+t][yidx+t] -= m_cuda[xidx+1+t][t] * a_cuda[t][yidx+t];
if (yidx == 0) {
// printf("blockIdx.x:%d,threadIdx.x:%d,blockIdx.y:%d,threadIdx.y:%d,blockDim.x:%d,blockDim.y:%d\n",blockIdx.x,threadIdx.x,blockIdx.y,threadIdx.y,blockDim.x,blockDim.y);
// printf("xidx:%d,yidx:%d\n",xidx,yidx);
b_cuda[xidx + 1 + t] -=
m_cuda[Size * (xidx + 1 + t) + (yidx + t)] * b_cuda[t];
}
}
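/* A sequential sketch, added for reference and not part of the original
** benchmark: what one outer iteration t of Fan1 + Fan2 computes, useful as a
** CPU check on small inputs such as ge_3.dat. Array layout and parameter
** meanings follow the kernels above; the function name is illustrative. */
static void fan_step_reference(float *m, float *a, float *b, int size, int t) {
  // Fan1: multipliers for column t (rows t+1 .. size-1)
  for (int i = t + 1; i < size; i++)
    m[size * i + t] = a[size * i + t] / a[size * t + t];
  // Fan2: eliminate column t from the trailing submatrix and from b
  for (int i = t + 1; i < size; i++) {
    for (int j = t; j < size; j++)
      a[size * i + j] -= m[size * i + t] * a[size * t + j];
    b[i] -= m[size * i + t] * b[t];
  }
}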
/*------------------------------------------------------
** ForwardSub() -- Forward substitution of Gaussian
** elimination.
**------------------------------------------------------
*/
void ForwardSub() {
int t;
float *m_cuda, *a_cuda, *b_cuda;
int A = 1;
int B = 2;
int C = 3;
int D = 4;
int E = 5;
int F = 6;
// printf("blockIDx.x: %d, threadIdx.x: %d, Size: %d, t: %d, Size-1-t: %d\n",
// A, B, C, D, E); printf("blockIdx.x: %d, threadIdx.x: %d, blockIdx.y: %d,
// threadIdx.y: %d, blockDim.x: %d, blockDim.y: %d\n", A , B, C, D, E, F);
// allocate memory on GPU
cudaMalloc((void **)&m_cuda, Size * Size * sizeof(float));
cudaMalloc((void **)&a_cuda, Size * Size * sizeof(float));
cudaMalloc((void **)&b_cuda, Size * sizeof(float));
// copy memory to GPU
cudaMemcpy(m_cuda, m, Size * Size * sizeof(float), cudaMemcpyHostToDevice);
cudaMemcpy(a_cuda, a, Size * Size * sizeof(float), cudaMemcpyHostToDevice);
cudaMemcpy(b_cuda, b, Size * sizeof(float), cudaMemcpyHostToDevice);
int block_size, grid_size;
block_size = MAXBLOCKSIZE;
grid_size = (Size / block_size) + (!(Size % block_size) ? 0 : 1);
printf("1d grid size: %d\n", grid_size);
dim3 dimBlock(block_size);
dim3 dimGrid(grid_size);
// dim3 dimGrid( (N/dimBlock.x) + (!(N%dimBlock.x)?0:1) );
int blockSize2d, gridSize2d;
blockSize2d = BLOCK_SIZE_XY;
  gridSize2d = (Size / blockSize2d) + (!(Size % blockSize2d) ? 0 : 1);
dim3 dimBlockXY(blockSize2d, blockSize2d);
printf("BlockXY: %d \n", blockSize2d);
dim3 dimGridXY(gridSize2d, gridSize2d);
#ifdef TIMING
gettimeofday(&tv_kernel_start, NULL);
#endif
printf("first grid size: %d second: %d\n", grid_size, gridSize2d);
// begin timing kernels
struct timeval time_start;
gettimeofday(&time_start, NULL);
for (t = 0; t < (Size - 1); t++) {
Fan1<<<dimGrid, dimBlock>>>(m_cuda, a_cuda, Size, t);
cudaDeviceSynchronize();
Fan2<<<dimGridXY, dimBlockXY>>>(m_cuda, a_cuda, b_cuda, Size, Size - t, t);
cudaDeviceSynchronize();
checkCUDAError("Fan2");
}
// end timing kernels
struct timeval time_end;
gettimeofday(&time_end, NULL);
totalKernelTime = (time_end.tv_sec * 1000000 + time_end.tv_usec) -
(time_start.tv_sec * 1000000 + time_start.tv_usec);
#ifdef TIMING
tvsub(&time_end, &tv_kernel_start, &tv);
kernel_time += tv.tv_sec * 1000.0 + (float)tv.tv_usec / 1000.0;
#endif
// copy memory back to CPU
cudaMemcpy(m, m_cuda, Size * Size * sizeof(float), cudaMemcpyDeviceToHost);
cudaMemcpy(a, a_cuda, Size * Size * sizeof(float), cudaMemcpyDeviceToHost);
cudaMemcpy(b, b_cuda, Size * sizeof(float), cudaMemcpyDeviceToHost);
cudaFree(m_cuda);
cudaFree(a_cuda);
cudaFree(b_cuda);
}
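/* Added for reference, not in the original source: the grid-size expressions in
** ForwardSub are a ceiling division of Size by the block size. div_up is an
** illustrative name, not used by the benchmark. */
static int div_up(int n, int block) { return (n + block - 1) / block; }
/* so grid_size == div_up(Size, block_size) and gridSize2d == div_up(Size, blockSize2d). */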
/*------------------------------------------------------
** BackSub() -- Backward substitution
**------------------------------------------------------
*/
void BackSub() {
// create a new vector to hold the final answer
finalVec = (float *)malloc(Size * sizeof(float));
// solve "bottom up"
int i, j;
for (i = 0; i < Size; i++) {
finalVec[Size - i - 1] = b[Size - i - 1];
for (j = 0; j < i; j++) {
finalVec[Size - i - 1] -= *(a + Size * (Size - i - 1) + (Size - j - 1)) *
finalVec[Size - j - 1];
}
finalVec[Size - i - 1] =
finalVec[Size - i - 1] / *(a + Size * (Size - i - 1) + (Size - i - 1));
}
}
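/* Added note, not in the original source: the loop above performs back
** substitution, i.e. for k = Size-1 down to 0 it computes
**   finalVec[k] = ( b[k] - sum_{j > k} a[k][j] * finalVec[j] ) / a[k][k]
** with a stored in row-major order. */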
void InitMat(float *ary, int nrow, int ncol) {
int i, j;
for (i = 0; i < nrow; i++) {
for (j = 0; j < ncol; j++) {
fscanf(fp, "%f", ary + Size * i + j);
}
}
}
/*------------------------------------------------------
** PrintMat() -- Print the contents of the matrix
**------------------------------------------------------
*/
void PrintMat(float *ary, int nrow, int ncol) {
  return; // early return leaves the printing code below unreachable (matrices are not printed)
int i, j;
for (i = 0; i < nrow; i++) {
for (j = 0; j < ncol; j++) {
printf("%8.2f ", *(ary + Size * i + j));
}
printf("\n");
}
printf("\n");
}
/*------------------------------------------------------
** InitAry() -- Initialize the array (vector) by reading
** data from the data file
**------------------------------------------------------
*/
void InitAry(float *ary, int ary_size) {
int i;
for (i = 0; i < ary_size; i++) {
fscanf(fp, "%f", &ary[i]);
}
}
/*------------------------------------------------------
** PrintAry() -- Print the contents of the array (vector)
**------------------------------------------------------
*/
void PrintAry(float *ary, int ary_size) {
int i;
for (i = 0; i < ary_size; i++) {
printf("%.2f ", ary[i]);
}
printf("\n\n");
}
void checkCUDAError(const char *msg) {
cudaError_t err = cudaGetLastError();
if (cudaSuccess != err) {
fprintf(stderr, "Cuda error: %s: %s.\n", msg, cudaGetErrorString(err));
exit(EXIT_FAILURE);
}
}


@ -1,23 +0,0 @@
#!/bin/bash
set -e
llvm-as gaussian-cuda-nvptx64-nvidia-cuda-sm_61.ll
llvm-as gaussian-host-x86_64-unknown-linux-gnu.ll
../../build/compilation/kernelTranslator gaussian-cuda-nvptx64-nvidia-cuda-sm_61.bc kernel.bc
../../build/compilation/hostTranslator gaussian-host-x86_64-unknown-linux-gnu.bc host.bc
llc --relocation-model=pic --filetype=obj kernel.bc
llc --relocation-model=pic --filetype=obj host.bc
g++ -Wall -L../../build/runtime \
-L../../build/runtime/threadPool \
-o gaussian -fPIC -no-pie host.o kernel.o -lc -lx86Runtime -lthreadPool -lpthread
export LD_LIBRARY_PATH=../../build/runtime:../../build/runtime/threadPool:$LD_LIBRARY_PATH
./gaussian -f ../../rodinia-data/gaussian/matrix4.txt >> res.log
if grep -q "0.70 0.00 -0.40 -0.50" res.log; then
echo "Pass"
else
echo "Error result"
exit 1
fi

File diff suppressed because it is too large


@ -1,317 +0,0 @@
#ifdef __cplusplus
extern "C" {
#endif
/*
* avilib.h
*
* Copyright (C) Thomas Östreich - June 2001
* multiple audio track support Copyright (C) 2002 Thomas Östreich
*
* Original code:
* Copyright (C) 1999 Rainer Johanni <Rainer@Johanni.de>
*
* This file is part of transcode, a linux video stream processing tool
*
* transcode is free software; you can redistribute it and/or modify
* it under the terms of the GNU General Public License as published by
* the Free Software Foundation; either version 2, or (at your option)
* any later version.
*
* transcode is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
* GNU General Public License for more details.
*
* You should have received a copy of the GNU General Public License
* along with GNU Make; see the file COPYING. If not, write to
* the Free Software Foundation, 675 Mass Ave, Cambridge, MA 02139, USA.
*
*/
#include <fcntl.h>
#include <stdio.h>
#include <sys/stat.h>
#include <sys/types.h>
#include <unistd.h>
// #include <windows.h>
#include <errno.h>
#include <inttypes.h>
#include <limits.h>
#include <stdlib.h>
#include <string.h>
#ifndef AVILIB_H
#define AVILIB_H
#define AVI_MAX_TRACKS 8
typedef struct {
unsigned long key;
unsigned long pos;
unsigned long len;
} video_index_entry;
typedef struct {
unsigned long pos;
unsigned long len;
unsigned long tot;
} audio_index_entry;
typedef struct track_s {
long a_fmt; /* Audio format, see #defines below */
long a_chans; /* Audio channels, 0 for no audio */
long a_rate; /* Rate in Hz */
long a_bits; /* bits per audio sample */
long mp3rate; /* mp3 bitrate kbs*/
long audio_strn; /* Audio stream number */
long audio_bytes; /* Total number of bytes of audio data */
long audio_chunks; /* Chunks of audio data in the file */
char audio_tag[4]; /* Tag of audio data */
long audio_posc; /* Audio position: chunk */
long audio_posb; /* Audio position: byte within chunk */
  long a_codech_off; /* absolute offset of audio codec information */
  long a_codecf_off; /* absolute offset of audio codec information */
audio_index_entry *audio_index;
} track_t;
typedef struct {
long fdes; /* File descriptor of AVI file */
  long mode; /* 0 for writing, 1 for reading (cf. AVI_MODE_* below) */
long width; /* Width of a video frame */
long height; /* Height of a video frame */
double fps; /* Frames per second */
char compressor[8]; /* Type of compressor, 4 bytes + padding for 0 byte */
char compressor2[8]; /* Type of compressor, 4 bytes + padding for 0 byte */
long video_strn; /* Video stream number */
long video_frames; /* Number of video frames */
char video_tag[4]; /* Tag of video data */
long video_pos; /* Number of next frame to be read
(if index present) */
unsigned long max_len; /* maximum video chunk present */
track_t track[AVI_MAX_TRACKS]; // up to AVI_MAX_TRACKS audio tracks supported
unsigned long pos; /* position in file */
long n_idx; /* number of index entries actually filled */
long max_idx; /* number of index entries actually allocated */
  long v_codech_off; /* absolute offset of video codec (strh) info */
  long v_codecf_off; /* absolute offset of video codec (strf) info */
unsigned char (*idx)[16]; /* index entries (AVI idx1 tag) */
video_index_entry *video_index;
unsigned long last_pos; /* Position of last frame written */
unsigned long last_len; /* Length of last frame written */
int must_use_index; /* Flag if frames are duplicated */
unsigned long movi_start;
int anum; // total number of audio tracks
int aptr; // current audio working track
} avi_t;
#define AVI_MODE_WRITE 0
#define AVI_MODE_READ 1
/* The error codes delivered by avi_open_input_file */
#define AVI_ERR_SIZELIM \
1 /* The write of the data would exceed \
the maximum size of the AVI file. \
This is more a warning than an \
error since the file may be closed safely */
#define AVI_ERR_OPEN \
2 /* Error opening the AVI file - wrong path \
     name or file not readable/writable                \
*/
#define AVI_ERR_READ 3 /* Error reading from AVI File */
#define AVI_ERR_WRITE \
4 /* Error writing to AVI File, \
disk full ??? */
#define AVI_ERR_WRITE_INDEX \
5 /* Could not write index to AVI file \
during close, file may still be \
usable */
#define AVI_ERR_CLOSE \
6 /* Could not write header to AVI file \
or not truncate the file during \
close, file is most probably corrupted */
#define AVI_ERR_NOT_PERM \
7 /* Operation not permitted: \
trying to read from a file open \
for writing or vice versa */
#define AVI_ERR_NO_MEM 8 /* malloc failed */
#define AVI_ERR_NO_AVI 9 /* Not an AVI file */
#define AVI_ERR_NO_HDRL \
  10 /* AVI file has no header list,                   \
corrupted ??? */
#define AVI_ERR_NO_MOVI \
  11 /* AVI file has no MOVI list,                     \
corrupted ??? */
#define AVI_ERR_NO_VIDS 12 /* AVI file contains no video data */
#define AVI_ERR_NO_IDX \
13 /* The file has been opened with \
getIndex==0, but an operation has \
been performed that needs an index */
/* Possible Audio formats */
#ifndef WAVE_FORMAT_PCM
#define WAVE_FORMAT_UNKNOWN (0x0000)
#define WAVE_FORMAT_PCM (0x0001)
#define WAVE_FORMAT_ADPCM (0x0002)
#define WAVE_FORMAT_IBM_CVSD (0x0005)
#define WAVE_FORMAT_ALAW (0x0006)
#define WAVE_FORMAT_MULAW (0x0007)
#define WAVE_FORMAT_OKI_ADPCM (0x0010)
#define WAVE_FORMAT_DVI_ADPCM (0x0011)
#define WAVE_FORMAT_DIGISTD (0x0015)
#define WAVE_FORMAT_DIGIFIX (0x0016)
#define WAVE_FORMAT_YAMAHA_ADPCM (0x0020)
#define WAVE_FORMAT_DSP_TRUESPEECH (0x0022)
#define WAVE_FORMAT_GSM610 (0x0031)
#define IBM_FORMAT_MULAW (0x0101)
#define IBM_FORMAT_ALAW (0x0102)
#define IBM_FORMAT_ADPCM (0x0103)
#endif
avi_t *AVI_open_output_file(char *filename);
void AVI_set_video(avi_t *AVI, int width, int height, double fps,
char *compressor);
void AVI_set_audio(avi_t *AVI, int channels, long rate, int bits, int format,
long mp3rate);
int AVI_write_frame(avi_t *AVI, char *data, long bytes, int keyframe);
int AVI_dup_frame(avi_t *AVI);
int AVI_write_audio(avi_t *AVI, char *data, long bytes);
int AVI_append_audio(avi_t *AVI, char *data, long bytes);
long AVI_bytes_remain(avi_t *AVI);
int AVI_close(avi_t *AVI);
long AVI_bytes_written(avi_t *AVI);
avi_t *AVI_open_input_file(char *filename, int getIndex);
avi_t *AVI_open_fd(int fd, int getIndex);
int avi_parse_input_file(avi_t *AVI, int getIndex);
long AVI_audio_mp3rate(avi_t *AVI);
long AVI_video_frames(avi_t *AVI);
int AVI_video_width(avi_t *AVI);
int AVI_video_height(avi_t *AVI);
double AVI_frame_rate(avi_t *AVI);
char *AVI_video_compressor(avi_t *AVI);
int AVI_audio_channels(avi_t *AVI);
int AVI_audio_bits(avi_t *AVI);
int AVI_audio_format(avi_t *AVI);
long AVI_audio_rate(avi_t *AVI);
long AVI_audio_bytes(avi_t *AVI);
long AVI_audio_chunks(avi_t *AVI);
long AVI_max_video_chunk(avi_t *AVI);
long AVI_frame_size(avi_t *AVI, long frame);
long AVI_audio_size(avi_t *AVI, long frame);
int AVI_seek_start(avi_t *AVI);
int AVI_set_video_position(avi_t *AVI, long frame);
long AVI_get_video_position(avi_t *AVI, long frame);
long AVI_read_frame(avi_t *AVI, char *vidbuf, int *keyframe);
int AVI_set_audio_position(avi_t *AVI, long byte);
int AVI_set_audio_bitrate(avi_t *AVI, long bitrate);
long AVI_read_audio(avi_t *AVI, char *audbuf, long bytes);
long AVI_audio_codech_offset(avi_t *AVI);
long AVI_audio_codecf_offset(avi_t *AVI);
long AVI_video_codech_offset(avi_t *AVI);
long AVI_video_codecf_offset(avi_t *AVI);
int AVI_read_data(avi_t *AVI, char *vidbuf, long max_vidbuf, char *audbuf,
long max_audbuf, long *len);
void AVI_print_error(char *str);
char *AVI_strerror();
char *AVI_syserror();
int AVI_scan(char *name);
int AVI_dump(char *name, int mode);
char *AVI_codec2str(short cc);
int AVI_file_check(char *import_file);
void AVI_info(avi_t *avifile);
uint64_t AVI_max_size();
int avi_update_header(avi_t *AVI);
int AVI_set_audio_track(avi_t *AVI, int track);
int AVI_get_audio_track(avi_t *AVI);
int AVI_audio_tracks(avi_t *AVI);
struct riff_struct {
unsigned char id[4]; /* RIFF */
unsigned long len;
unsigned char wave_id[4]; /* WAVE */
};
struct chunk_struct {
unsigned char id[4];
unsigned long len;
};
struct common_struct {
unsigned short wFormatTag;
unsigned short wChannels;
unsigned long dwSamplesPerSec;
unsigned long dwAvgBytesPerSec;
unsigned short wBlockAlign;
unsigned short wBitsPerSample; /* Only for PCM */
};
struct wave_header {
struct riff_struct riff;
struct chunk_struct format;
struct common_struct common;
struct chunk_struct data;
};
struct AVIStreamHeader {
long fccType;
long fccHandler;
long dwFlags;
long dwPriority;
long dwInitialFrames;
long dwScale;
long dwRate;
long dwStart;
long dwLength;
long dwSuggestedBufferSize;
long dwQuality;
long dwSampleSize;
};
#endif
#ifdef __cplusplus
}
#endif
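For orientation, a minimal sketch of how this read-side API is typically driven (added for illustration, not part of the deleted sources; the file name "input.avi" is a placeholder and error handling is kept minimal):
#include <stdio.h>
#include <stdlib.h>
#include "avilib.h"
int main(void) {
  avi_t *avi = AVI_open_input_file((char *)"input.avi", 1); /* 1 = read the index */
  if (avi == NULL) {
    AVI_print_error((char *)"AVI_open_input_file");
    return 1;
  }
  int w = AVI_video_width(avi), h = AVI_video_height(avi);
  long frames = AVI_video_frames(avi);
  char *buf = (char *)malloc((size_t)w * (size_t)h); /* one uncompressed frame */
  int keyframe;
  AVI_set_video_position(avi, 0);                    /* seek to frame 0 */
  long bytes = AVI_read_frame(avi, buf, &keyframe);
  printf("%ld frames of %dx%d, frame 0 is %ld bytes (keyframe=%d)\n",
         frames, w, h, bytes, keyframe);
  free(buf);
  AVI_close(avi);
  return 0;
}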


@ -1,130 +0,0 @@
// #ifdef __cplusplus
// extern "C" {
// #endif
//===============================================================================================================================================================================================================
// DEFINE / INCLUDE
//===============================================================================================================================================================================================================
#include "avimod.h"
//===============================================================================================================================================================================================================
// FUNCTIONS
//===============================================================================================================================================================================================================
// Flips the specified image and crops it to the specified dimensions
// If scaled == true, all values are scaled to the range [0.0, 1.0]
fp *chop_flip_image(char *image, int height, int width, int cropped, int scaled,
int converted) {
// fixed dimensions for cropping or not cropping, square vertices starting
// from initial point in top left corner going down and right
int top;
int bottom;
int left;
int right;
if (cropped == 1) {
top = 0;
bottom = 0;
left = 0;
right = 0;
} else {
top = 0;
bottom = height - 1;
left = 0;
right = width - 1;
}
// dimensions of new cropped image
int height_new = bottom - top + 1;
int width_new = right - left + 1;
// counters
int i, j;
// allocate memory for cropped/flipped frame
fp *result = (fp *)malloc(height_new * width_new * sizeof(fp));
// crop/flip and scale frame
fp temp;
if (scaled) {
fp scale = 1.0 / 255.0;
for (i = 0; i < height_new; i++) { // rows
      for (j = 0; j < width_new; j++) { // columns
temp =
(fp)image[((height - 1 - (i + top)) * width) + (j + left)] * scale;
if (temp < 0) {
result[i * width_new + j] = temp + 256;
} else {
result[i * width_new + j] = temp;
}
}
}
} else {
for (i = 0; i < height_new; i++) { // rows
      for (j = 0; j < width_new; j++) { // columns
temp = (fp)image[((height - 1 - (i + top)) * width) + (j + left)];
if (temp < 0) {
result[i * width_new + j] = temp + 256;
} else {
result[i * width_new + j] = temp;
}
}
}
}
  // convert storage method (from row-major to column-major); only allocate a
  // second buffer when a converted copy is actually needed
  fp *result_converted;
  if (converted == 1) {
    result_converted = (fp *)malloc(height_new * width_new * sizeof(fp));
    for (i = 0; i < width_new; i++) {    // rows
      for (j = 0; j < height_new; j++) { // columns
        result_converted[i * height_new + j] = result[j * width_new + i];
      }
    }
    free(result); // the row-major copy is no longer needed
  } else {
    result_converted = result; // keep the row-major buffer and return it
  }
// return
return result_converted;
}
// Returns the specified frame from the specified video file
// If cropped == true, the frame is cropped to pre-determined dimensions
// (hardcoded to the boundaries of the blood vessel in the test video)
// If scaled == true, all values are scaled to the range [0.0, 1.0]
fp *get_frame(avi_t *cell_file, int frame_num, int cropped, int scaled,
int converted) {
// variable
int dummy;
int width = AVI_video_width(cell_file);
int height = AVI_video_height(cell_file);
int status;
// There are 600 frames in this file (i.e. frame_num = 600 causes an error)
AVI_set_video_position(cell_file, frame_num);
// Read in the frame from the AVI
char *image_buf = (char *)malloc(width * height * sizeof(char));
status = AVI_read_frame(cell_file, image_buf, &dummy);
if (status == -1) {
AVI_print_error((char *)"Error with AVI_read_frame");
exit(-1);
}
// The image is read in upside-down, so we need to flip it
fp *image_chopped;
image_chopped =
chop_flip_image(image_buf, height, width, cropped, scaled, converted);
// free image buffer
free(image_buf);
// return
return image_chopped;
}
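/* Added for illustration, not part of the original file: a minimal driver loop
** for get_frame. The path and the flag values are placeholders; the buffer
** returned by get_frame is owned by the caller and must be freed. */
static void example_read_all_frames(const char *path) {
  avi_t *avi = AVI_open_input_file((char *)path, 1);
  if (avi == NULL) {
    AVI_print_error((char *)"Error with AVI_open_input_file");
    return;
  }
  long n = AVI_video_frames(avi);
  for (long f = 0; f < n; f++) {
    // uncropped, scaled to [0.0, 1.0], row-major
    fp *frame = get_frame(avi, (int)f, 0, 1, 0);
    // ... consume the height x width values in frame here ...
    free(frame);
  }
  AVI_close(avi);
}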
// #ifdef __cplusplus
// }
// #endif


@ -1,24 +0,0 @@
#ifdef __cplusplus
extern "C" {
#endif
//===============================================================================================================================================================================================================
// DEFINE / INCLUDE
//===============================================================================================================================================================================================================
#define fp float
#include "avilib.h"
//===============================================================================================================================================================================================================
// DEFINE / INCLUDE
//===============================================================================================================================================================================================================
fp *chop_flip_image(char *image, int height, int width, int cropped, int scaled,
int converted);
fp *get_frame(avi_t *cell_file, int frame_num, int cropped, int scaled,
int converted);
#ifdef __cplusplus
}
#endif


@ -1,396 +0,0 @@
//===============================================================================================================================================================================================================
//===============================================================================================================================================================================================================
// DEFINE / INCLUDE
//===============================================================================================================================================================================================================
//===============================================================================================================================================================================================================
#define fp float
/* #define NUMBER_THREADS 512 */
#ifdef RD_WG_SIZE_0_0
#define NUMBER_THREADS RD_WG_SIZE_0_0
#elif defined(RD_WG_SIZE_0)
#define NUMBER_THREADS RD_WG_SIZE_0
#elif defined(RD_WG_SIZE)
#define NUMBER_THREADS RD_WG_SIZE
#else
#define NUMBER_THREADS 256
#endif
#define ENDO_POINTS 20
#define EPI_POINTS 31
#define ALL_POINTS 51
//===============================================================================================================================================================================================================
//===============================================================================================================================================================================================================
// PARAMS_COMMON_CHANGE STRUCT
//===============================================================================================================================================================================================================
//===============================================================================================================================================================================================================
typedef struct params_common_change {
//======================================================================================================================================================
// FRAME
//======================================================================================================================================================
fp *d_frame;
int frame_no;
} params_common_change;
//===============================================================================================================================================================================================================
//===============================================================================================================================================================================================================
// PARAMS_COMMON STRUCTURE
//===============================================================================================================================================================================================================
//===============================================================================================================================================================================================================
typedef struct params_common {
//======================================================================================================================================================
// HARDCODED INPUTS FROM MATLAB
//======================================================================================================================================================
//====================================================================================================
// CONSTANTS
//====================================================================================================
int sSize;
int tSize;
int maxMove;
fp alpha;
//====================================================================================================
// FRAME
//====================================================================================================
int no_frames;
int frame_rows;
int frame_cols;
int frame_elem;
int frame_mem;
//====================================================================================================
// ENDO POINTS
//====================================================================================================
int endoPoints;
int endo_mem;
int *endoRow;
int *endoCol;
int *tEndoRowLoc;
int *tEndoColLoc;
int *d_endoRow;
int *d_endoCol;
int *d_tEndoRowLoc;
int *d_tEndoColLoc;
fp *d_endoT;
//====================================================================================================
// EPI POINTS
//====================================================================================================
int epiPoints;
int epi_mem;
int *epiRow;
int *epiCol;
int *tEpiRowLoc;
int *tEpiColLoc;
int *d_epiRow;
int *d_epiCol;
int *d_tEpiRowLoc;
int *d_tEpiColLoc;
fp *d_epiT;
//====================================================================================================
// ALL POINTS
//====================================================================================================
int allPoints;
//======================================================================================================================================================
// RIGHT TEMPLATE FROM TEMPLATE ARRAY
//======================================================================================================================================================
int in_rows;
int in_cols;
int in_elem;
int in_mem;
//======================================================================================================================================================
// AREA AROUND POINT FROM FRAME
//======================================================================================================================================================
int in2_rows;
int in2_cols;
int in2_elem;
int in2_mem;
//======================================================================================================================================================
// CONVOLUTION
//======================================================================================================================================================
int conv_rows;
int conv_cols;
int conv_elem;
int conv_mem;
int ioffset;
int joffset;
//======================================================================================================================================================
// CUMULATIVE SUM 1
//======================================================================================================================================================
//====================================================================================================
// PAD ARRAY, VERTICAL CUMULATIVE SUM
//====================================================================================================
int in2_pad_add_rows;
int in2_pad_add_cols;
int in2_pad_cumv_rows;
int in2_pad_cumv_cols;
int in2_pad_cumv_elem;
int in2_pad_cumv_mem;
//====================================================================================================
// SELECTION
//====================================================================================================
int in2_pad_cumv_sel_rows;
int in2_pad_cumv_sel_cols;
int in2_pad_cumv_sel_elem;
int in2_pad_cumv_sel_mem;
int in2_pad_cumv_sel_rowlow;
int in2_pad_cumv_sel_rowhig;
int in2_pad_cumv_sel_collow;
int in2_pad_cumv_sel_colhig;
//====================================================================================================
// SELECTION 2, SUBTRACTION, HORIZONTAL CUMULATIVE SUM
//====================================================================================================
int in2_pad_cumv_sel2_rowlow;
int in2_pad_cumv_sel2_rowhig;
int in2_pad_cumv_sel2_collow;
int in2_pad_cumv_sel2_colhig;
int in2_sub_cumh_rows;
int in2_sub_cumh_cols;
int in2_sub_cumh_elem;
int in2_sub_cumh_mem;
//====================================================================================================
// SELECTION
//====================================================================================================
int in2_sub_cumh_sel_rows;
int in2_sub_cumh_sel_cols;
int in2_sub_cumh_sel_elem;
int in2_sub_cumh_sel_mem;
int in2_sub_cumh_sel_rowlow;
int in2_sub_cumh_sel_rowhig;
int in2_sub_cumh_sel_collow;
int in2_sub_cumh_sel_colhig;
//====================================================================================================
// SELECTION 2, SUBTRACTION
//====================================================================================================
int in2_sub_cumh_sel2_rowlow;
int in2_sub_cumh_sel2_rowhig;
int in2_sub_cumh_sel2_collow;
int in2_sub_cumh_sel2_colhig;
int in2_sub2_rows;
int in2_sub2_cols;
int in2_sub2_elem;
int in2_sub2_mem;
//======================================================================================================================================================
// CUMULATIVE SUM 2
//======================================================================================================================================================
//====================================================================================================
// MULTIPLICATION
//====================================================================================================
int in2_sqr_rows;
int in2_sqr_cols;
int in2_sqr_elem;
int in2_sqr_mem;
//====================================================================================================
// SELECTION 2, SUBTRACTION
//====================================================================================================
int in2_sqr_sub2_rows;
int in2_sqr_sub2_cols;
int in2_sqr_sub2_elem;
int in2_sqr_sub2_mem;
//======================================================================================================================================================
// FINAL
//======================================================================================================================================================
int in_sqr_rows;
int in_sqr_cols;
int in_sqr_elem;
int in_sqr_mem;
//======================================================================================================================================================
// TEMPLATE MASK CREATE
//======================================================================================================================================================
int tMask_rows;
int tMask_cols;
int tMask_elem;
int tMask_mem;
//======================================================================================================================================================
// POINT MASK INITIALIZE
//======================================================================================================================================================
int mask_rows;
int mask_cols;
int mask_elem;
int mask_mem;
//======================================================================================================================================================
// MASK CONVOLUTION
//======================================================================================================================================================
int mask_conv_rows;
int mask_conv_cols;
int mask_conv_elem;
int mask_conv_mem;
int mask_conv_ioffset;
int mask_conv_joffset;
} params_common;
//===============================================================================================================================================================================================================
//===============================================================================================================================================================================================================
// PARAMS_UNIQUE STRUCTURE
//===============================================================================================================================================================================================================
//===============================================================================================================================================================================================================
typedef struct params_unique {
//======================================================================================================================================================
// POINT NUMBER
//======================================================================================================================================================
int *d_Row;
int *d_Col;
int *d_tRowLoc;
int *d_tColLoc;
fp *d_T;
//======================================================================================================================================================
// POINT NUMBER
//======================================================================================================================================================
int point_no;
//======================================================================================================================================================
// RIGHT TEMPLATE FROM TEMPLATE ARRAY
//======================================================================================================================================================
int in_pointer;
//======================================================================================================================================================
// AREA AROUND POINT FROM FRAME
//======================================================================================================================================================
fp *d_in2;
//======================================================================================================================================================
// CONVOLUTION
//======================================================================================================================================================
fp *d_conv;
fp *d_in_mod;
//======================================================================================================================================================
// CUMULATIVE SUM
//======================================================================================================================================================
//====================================================================================================
// PAD ARRAY, VERTICAL CUMULATIVE SUM
//====================================================================================================
fp *d_in2_pad_cumv;
//====================================================================================================
// SELECTION
//====================================================================================================
fp *d_in2_pad_cumv_sel;
//====================================================================================================
// SELECTION 2, SUBTRACTION, HORIZONTAL CUMULATIVE SUM
//====================================================================================================
fp *d_in2_sub_cumh;
//====================================================================================================
// SELECTION
//====================================================================================================
fp *d_in2_sub_cumh_sel;
//====================================================================================================
// SELECTION 2, SUBTRACTION
//====================================================================================================
fp *d_in2_sub2;
//======================================================================================================================================================
// CUMULATIVE SUM 2
//======================================================================================================================================================
//====================================================================================================
// MULTIPLICATION
//====================================================================================================
fp *d_in2_sqr;
//====================================================================================================
// SELECTION 2, SUBTRACTION
//====================================================================================================
fp *d_in2_sqr_sub2;
//======================================================================================================================================================
// FINAL
//======================================================================================================================================================
fp *d_in_sqr;
//======================================================================================================================================================
// TEMPLATE MASK
//======================================================================================================================================================
fp *d_tMask;
//======================================================================================================================================================
// POINT MASK INITIALIZE
//======================================================================================================================================================
fp *d_mask;
//======================================================================================================================================================
// MASK CONVOLUTION
//======================================================================================================================================================
fp *d_mask_conv;
} params_unique;
//===============================================================================================================================================================================================================
//===============================================================================================================================================================================================================
// END OF STRUCTURE
//===============================================================================================================================================================================================================
//===============================================================================================================================================================================================================

File diff suppressed because it is too large


@ -1,795 +0,0 @@
//===============================================================================================================================================================================================================
//===============================================================================================================================================================================================================
// DEFINE / INCLUDE
//===============================================================================================================================================================================================================
//===============================================================================================================================================================================================================
//======================================================================================================================================================
// LIBRARIES
//======================================================================================================================================================
#include <math.h>
#include <stdlib.h>
#include <string.h>
#include <avilib.h>
#include <avimod.h>
#include <cuda.h>
//======================================================================================================================================================
// STRUCTURES, GLOBAL STRUCTURE VARIABLES
//======================================================================================================================================================
#include "define.c"
params_common_change common_change;
__constant__ params_common_change d_common_change;
params_common common;
__constant__ params_common d_common;
params_unique unique[ALL_POINTS]; // cannot determine size dynamically so choose
// more than usually needed
__constant__ params_unique d_unique[ALL_POINTS];
//======================================================================================================================================================
// KERNEL CODE
//======================================================================================================================================================
#include "kernel.cu"
// WRITE DATA FUNCTION
//===============================================================================================================================================================================================================200
void write_data(char *filename, int frameNo, int frames_processed,
int endoPoints, int *input_a, int *input_b, int epiPoints,
int *input_2a, int *input_2b) {
//================================================================================80
// VARIABLES
//================================================================================80
FILE *fid;
int i, j;
char c;
//================================================================================80
// OPEN FILE FOR READING
//================================================================================80
fid = fopen(filename, "w+");
if (fid == NULL) {
printf("The file was not opened for writing\n");
return;
}
//================================================================================80
// WRITE VALUES TO THE FILE
//================================================================================80
fprintf(fid, "Total AVI Frames: %d\n", frameNo);
fprintf(fid, "Frames Processed: %d\n", frames_processed);
fprintf(fid, "endoPoints: %d\n", endoPoints);
fprintf(fid, "epiPoints: %d", epiPoints);
for (j = 0; j < frames_processed; j++) {
fprintf(fid, "\n---Frame %d---", j);
fprintf(fid, "\n--endo--\n", j);
for (i = 0; i < endoPoints; i++) {
fprintf(fid, "%d\t", input_a[j + i * frameNo]);
}
fprintf(fid, "\n");
for (i = 0; i < endoPoints; i++) {
// if(input_b[j*size+i] > 2000) input_b[j*size+i]=0;
fprintf(fid, "%d\t", input_b[j + i * frameNo]);
}
fprintf(fid, "\n--epi--\n", j);
for (i = 0; i < epiPoints; i++) {
// if(input_2a[j*size_2+i] > 2000) input_2a[j*size_2+i]=0;
fprintf(fid, "%d\t", input_2a[j + i * frameNo]);
}
fprintf(fid, "\n");
for (i = 0; i < epiPoints; i++) {
// if(input_2b[j*size_2+i] > 2000) input_2b[j*size_2+i]=0;
fprintf(fid, "%d\t", input_2b[j + i * frameNo]);
}
}
// ================================================================================80
// CLOSE FILE
// ================================================================================80
fclose(fid);
}
//===============================================================================================================================================================================================================
//===============================================================================================================================================================================================================
// MAIN FUNCTION
//===============================================================================================================================================================================================================
//===============================================================================================================================================================================================================
int main(int argc, char *argv[]) {
cudaSetDevice(0);
printf("WG size of kernel = %d \n", NUMBER_THREADS);
//======================================================================================================================================================
// VARIABLES
//======================================================================================================================================================
// CUDA kernel execution parameters
dim3 threads;
dim3 blocks;
// counter
int i;
int frames_processed;
// frames
char *video_file_name;
avi_t *frames;
fp *frame;
//======================================================================================================================================================
// FRAME
//======================================================================================================================================================
if (argc != 3) {
printf("ERROR: usage: heartwall <inputfile> <num of frames>\n");
exit(1);
}
// open movie file
video_file_name = argv[1];
frames = (avi_t *)AVI_open_input_file(video_file_name, 1); // added casting
if (frames == NULL) {
AVI_print_error((char *)"Error with AVI_open_input_file");
return -1;
}
// common
common.no_frames = AVI_video_frames(frames);
common.frame_rows = AVI_video_height(frames);
common.frame_cols = AVI_video_width(frames);
common.frame_elem = common.frame_rows * common.frame_cols;
common.frame_mem = sizeof(fp) * common.frame_elem;
// pointers
cudaMalloc((void **)&common_change.d_frame, common.frame_mem);
//======================================================================================================================================================
// CHECK INPUT ARGUMENTS
//======================================================================================================================================================
frames_processed = atoi(argv[2]);
if (frames_processed < 0 || frames_processed > common.no_frames) {
printf("ERROR: %d is an incorrect number of frames specified, select in "
"the range of 0-%d\n",
frames_processed, common.no_frames);
return 0;
}
//======================================================================================================================================================
// HARDCODED INPUTS FROM MATLAB
//======================================================================================================================================================
//====================================================================================================
// CONSTANTS
//====================================================================================================
common.sSize = 40;
common.tSize = 25;
common.maxMove = 10;
common.alpha = 0.87;
//====================================================================================================
// ENDO POINTS
//====================================================================================================
common.endoPoints = ENDO_POINTS;
common.endo_mem = sizeof(int) * common.endoPoints;
common.endoRow = (int *)malloc(common.endo_mem);
common.endoRow[0] = 369;
common.endoRow[1] = 400;
common.endoRow[2] = 429;
common.endoRow[3] = 452;
common.endoRow[4] = 476;
common.endoRow[5] = 486;
common.endoRow[6] = 479;
common.endoRow[7] = 458;
common.endoRow[8] = 433;
common.endoRow[9] = 404;
common.endoRow[10] = 374;
common.endoRow[11] = 346;
common.endoRow[12] = 318;
common.endoRow[13] = 294;
common.endoRow[14] = 277;
common.endoRow[15] = 269;
common.endoRow[16] = 275;
common.endoRow[17] = 287;
common.endoRow[18] = 311;
common.endoRow[19] = 339;
cudaMalloc((void **)&common.d_endoRow, common.endo_mem);
cudaMemcpy(common.d_endoRow, common.endoRow, common.endo_mem,
cudaMemcpyHostToDevice);
common.endoCol = (int *)malloc(common.endo_mem);
common.endoCol[0] = 408;
common.endoCol[1] = 406;
common.endoCol[2] = 397;
common.endoCol[3] = 383;
common.endoCol[4] = 354;
common.endoCol[5] = 322;
common.endoCol[6] = 294;
common.endoCol[7] = 270;
common.endoCol[8] = 250;
common.endoCol[9] = 237;
common.endoCol[10] = 235;
common.endoCol[11] = 241;
common.endoCol[12] = 254;
common.endoCol[13] = 273;
common.endoCol[14] = 300;
common.endoCol[15] = 328;
common.endoCol[16] = 356;
common.endoCol[17] = 383;
common.endoCol[18] = 401;
common.endoCol[19] = 411;
cudaMalloc((void **)&common.d_endoCol, common.endo_mem);
cudaMemcpy(common.d_endoCol, common.endoCol, common.endo_mem,
cudaMemcpyHostToDevice);
common.tEndoRowLoc = (int *)malloc(common.endo_mem * common.no_frames);
cudaMalloc((void **)&common.d_tEndoRowLoc,
common.endo_mem * common.no_frames);
common.tEndoColLoc = (int *)malloc(common.endo_mem * common.no_frames);
cudaMalloc((void **)&common.d_tEndoColLoc,
common.endo_mem * common.no_frames);
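  // tEndoRowLoc / tEndoColLoc hold one tracked (row, col) pair per endocardial
  // point per frame (endoPoints * no_frames ints each), mirrored on host and
  // device; the epicardial location arrays below use the same layout.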
//====================================================================================================
// EPI POINTS
//====================================================================================================
common.epiPoints = EPI_POINTS;
common.epi_mem = sizeof(int) * common.epiPoints;
common.epiRow = (int *)malloc(common.epi_mem);
common.epiRow[0] = 390;
common.epiRow[1] = 419;
common.epiRow[2] = 448;
common.epiRow[3] = 474;
common.epiRow[4] = 501;
common.epiRow[5] = 519;
common.epiRow[6] = 535;
common.epiRow[7] = 542;
common.epiRow[8] = 543;
common.epiRow[9] = 538;
common.epiRow[10] = 528;
common.epiRow[11] = 511;
common.epiRow[12] = 491;
common.epiRow[13] = 466;
common.epiRow[14] = 438;
common.epiRow[15] = 406;
common.epiRow[16] = 376;
common.epiRow[17] = 347;
common.epiRow[18] = 318;
common.epiRow[19] = 291;
common.epiRow[20] = 275;
common.epiRow[21] = 259;
common.epiRow[22] = 256;
common.epiRow[23] = 252;
common.epiRow[24] = 252;
common.epiRow[25] = 257;
common.epiRow[26] = 266;
common.epiRow[27] = 283;
common.epiRow[28] = 305;
common.epiRow[29] = 331;
common.epiRow[30] = 360;
cudaMalloc((void **)&common.d_epiRow, common.epi_mem);
cudaMemcpy(common.d_epiRow, common.epiRow, common.epi_mem,
cudaMemcpyHostToDevice);
common.epiCol = (int *)malloc(common.epi_mem);
common.epiCol[0] = 457;
common.epiCol[1] = 454;
common.epiCol[2] = 446;
common.epiCol[3] = 431;
common.epiCol[4] = 411;
common.epiCol[5] = 388;
common.epiCol[6] = 361;
common.epiCol[7] = 331;
common.epiCol[8] = 301;
common.epiCol[9] = 273;
common.epiCol[10] = 243;
common.epiCol[11] = 218;
common.epiCol[12] = 196;
common.epiCol[13] = 178;
common.epiCol[14] = 166;
common.epiCol[15] = 157;
common.epiCol[16] = 155;
common.epiCol[17] = 165;
common.epiCol[18] = 177;
common.epiCol[19] = 197;
common.epiCol[20] = 218;
common.epiCol[21] = 248;
common.epiCol[22] = 276;
common.epiCol[23] = 304;
common.epiCol[24] = 333;
common.epiCol[25] = 361;
common.epiCol[26] = 391;
common.epiCol[27] = 415;
common.epiCol[28] = 434;
common.epiCol[29] = 448;
common.epiCol[30] = 455;
cudaMalloc((void **)&common.d_epiCol, common.epi_mem);
cudaMemcpy(common.d_epiCol, common.epiCol, common.epi_mem,
cudaMemcpyHostToDevice);
common.tEpiRowLoc = (int *)malloc(common.epi_mem * common.no_frames);
cudaMalloc((void **)&common.d_tEpiRowLoc, common.epi_mem * common.no_frames);
common.tEpiColLoc = (int *)malloc(common.epi_mem * common.no_frames);
cudaMalloc((void **)&common.d_tEpiColLoc, common.epi_mem * common.no_frames);
//====================================================================================================
// ALL POINTS
//====================================================================================================
common.allPoints = ALL_POINTS;
//======================================================================================================================================================
// TEMPLATE SIZES
//======================================================================================================================================================
// common
common.in_rows = common.tSize + 1 + common.tSize;
common.in_cols = common.in_rows;
common.in_elem = common.in_rows * common.in_cols;
common.in_mem = sizeof(fp) * common.in_elem;
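  // With tSize = 25 each template is (2*25 + 1) x (2*25 + 1) = 51 x 51,
  // i.e. in_elem = 2601 floats (about 10 KB) per tracking point.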
//======================================================================================================================================================
// CREATE ARRAY OF TEMPLATES FOR ALL POINTS
//======================================================================================================================================================
// common
cudaMalloc((void **)&common.d_endoT, common.in_mem * common.endoPoints);
cudaMalloc((void **)&common.d_epiT, common.in_mem * common.epiPoints);
//======================================================================================================================================================
// SPECIFIC TO ENDO OR EPI TO BE SET HERE
//======================================================================================================================================================
for (i = 0; i < common.endoPoints; i++) {
unique[i].point_no = i;
unique[i].d_Row = common.d_endoRow;
unique[i].d_Col = common.d_endoCol;
unique[i].d_tRowLoc = common.d_tEndoRowLoc;
unique[i].d_tColLoc = common.d_tEndoColLoc;
unique[i].d_T = common.d_endoT;
}
for (i = common.endoPoints; i < common.allPoints; i++) {
unique[i].point_no = i - common.endoPoints;
unique[i].d_Row = common.d_epiRow;
unique[i].d_Col = common.d_epiCol;
unique[i].d_tRowLoc = common.d_tEpiRowLoc;
unique[i].d_tColLoc = common.d_tEpiColLoc;
unique[i].d_T = common.d_epiT;
}
//======================================================================================================================================================
// RIGHT TEMPLATE FROM TEMPLATE ARRAY
//======================================================================================================================================================
// pointers
for (i = 0; i < common.allPoints; i++) {
unique[i].in_pointer = unique[i].point_no * common.in_elem;
}
//======================================================================================================================================================
// AREA AROUND POINT FROM FRAME
//======================================================================================================================================================
// common
common.in2_rows = 2 * common.sSize + 1;
common.in2_cols = 2 * common.sSize + 1;
common.in2_elem = common.in2_rows * common.in2_cols;
common.in2_mem = sizeof(float) * common.in2_elem;
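  // With sSize = 40 the search area is (2*40 + 1) x (2*40 + 1) = 81 x 81,
  // i.e. in2_elem = 6561 floats per tracking point.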
// pointers
for (i = 0; i < common.allPoints; i++) {
cudaMalloc((void **)&unique[i].d_in2, common.in2_mem);
}
//======================================================================================================================================================
// CONVOLUTION
//======================================================================================================================================================
// common
common.conv_rows =
common.in_rows + common.in2_rows - 1; // number of rows in I
common.conv_cols =
common.in_cols + common.in2_cols - 1; // number of columns in I
common.conv_elem = common.conv_rows * common.conv_cols; // number of elements
common.conv_mem = sizeof(float) * common.conv_elem;
common.ioffset = 0;
common.joffset = 0;
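  // Full 2-D convolution of the 51x51 template with the 81x81 search area:
  // conv_rows = conv_cols = 51 + 81 - 1 = 131, so conv_elem = 131 * 131 = 17161.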
// pointers
for (i = 0; i < common.allPoints; i++) {
cudaMalloc((void **)&unique[i].d_conv, common.conv_mem);
}
//======================================================================================================================================================
// CUMULATIVE SUM
//======================================================================================================================================================
//====================================================================================================
// PADDING OF ARRAY, VERTICAL CUMULATIVE SUM
//====================================================================================================
// common
common.in2_pad_add_rows = common.in_rows;
common.in2_pad_add_cols = common.in_cols;
common.in2_pad_cumv_rows = common.in2_rows + 2 * common.in2_pad_add_rows;
common.in2_pad_cumv_cols = common.in2_cols + 2 * common.in2_pad_add_cols;
common.in2_pad_cumv_elem =
common.in2_pad_cumv_rows * common.in2_pad_cumv_cols;
common.in2_pad_cumv_mem = sizeof(float) * common.in2_pad_cumv_elem;
// pointers
for (i = 0; i < common.allPoints; i++) {
cudaMalloc((void **)&unique[i].d_in2_pad_cumv, common.in2_pad_cumv_mem);
}
//====================================================================================================
// SELECTION
//====================================================================================================
// common
common.in2_pad_cumv_sel_rowlow = 1 + common.in_rows; // (1 to n+1)
common.in2_pad_cumv_sel_rowhig = common.in2_pad_cumv_rows - 1;
common.in2_pad_cumv_sel_collow = 1;
common.in2_pad_cumv_sel_colhig = common.in2_pad_cumv_cols;
common.in2_pad_cumv_sel_rows =
common.in2_pad_cumv_sel_rowhig - common.in2_pad_cumv_sel_rowlow + 1;
common.in2_pad_cumv_sel_cols =
common.in2_pad_cumv_sel_colhig - common.in2_pad_cumv_sel_collow + 1;
common.in2_pad_cumv_sel_elem =
common.in2_pad_cumv_sel_rows * common.in2_pad_cumv_sel_cols;
common.in2_pad_cumv_sel_mem = sizeof(float) * common.in2_pad_cumv_sel_elem;
// pointers
for (i = 0; i < common.allPoints; i++) {
cudaMalloc((void **)&unique[i].d_in2_pad_cumv_sel,
common.in2_pad_cumv_sel_mem);
}
//====================================================================================================
// SELECTION 2, SUBTRACTION, HORIZONTAL CUMULATIVE SUM
//====================================================================================================
// common
common.in2_pad_cumv_sel2_rowlow = 1;
common.in2_pad_cumv_sel2_rowhig =
common.in2_pad_cumv_rows - common.in_rows - 1;
common.in2_pad_cumv_sel2_collow = 1;
common.in2_pad_cumv_sel2_colhig = common.in2_pad_cumv_cols;
common.in2_sub_cumh_rows =
common.in2_pad_cumv_sel2_rowhig - common.in2_pad_cumv_sel2_rowlow + 1;
common.in2_sub_cumh_cols =
common.in2_pad_cumv_sel2_colhig - common.in2_pad_cumv_sel2_collow + 1;
common.in2_sub_cumh_elem =
common.in2_sub_cumh_rows * common.in2_sub_cumh_cols;
common.in2_sub_cumh_mem = sizeof(float) * common.in2_sub_cumh_elem;
// pointers
for (i = 0; i < common.allPoints; i++) {
cudaMalloc((void **)&unique[i].d_in2_sub_cumh, common.in2_sub_cumh_mem);
}
//====================================================================================================
// SELECTION
//====================================================================================================
// common
common.in2_sub_cumh_sel_rowlow = 1;
common.in2_sub_cumh_sel_rowhig = common.in2_sub_cumh_rows;
common.in2_sub_cumh_sel_collow = 1 + common.in_cols;
common.in2_sub_cumh_sel_colhig = common.in2_sub_cumh_cols - 1;
common.in2_sub_cumh_sel_rows =
common.in2_sub_cumh_sel_rowhig - common.in2_sub_cumh_sel_rowlow + 1;
common.in2_sub_cumh_sel_cols =
common.in2_sub_cumh_sel_colhig - common.in2_sub_cumh_sel_collow + 1;
common.in2_sub_cumh_sel_elem =
common.in2_sub_cumh_sel_rows * common.in2_sub_cumh_sel_cols;
common.in2_sub_cumh_sel_mem = sizeof(float) * common.in2_sub_cumh_sel_elem;
// pointers
for (i = 0; i < common.allPoints; i++) {
cudaMalloc((void **)&unique[i].d_in2_sub_cumh_sel,
common.in2_sub_cumh_sel_mem);
}
//====================================================================================================
// SELECTION 2, SUBTRACTION
//====================================================================================================
// common
common.in2_sub_cumh_sel2_rowlow = 1;
common.in2_sub_cumh_sel2_rowhig = common.in2_sub_cumh_rows;
common.in2_sub_cumh_sel2_collow = 1;
common.in2_sub_cumh_sel2_colhig =
common.in2_sub_cumh_cols - common.in_cols - 1;
common.in2_sub2_rows =
common.in2_sub_cumh_sel2_rowhig - common.in2_sub_cumh_sel2_rowlow + 1;
common.in2_sub2_cols =
common.in2_sub_cumh_sel2_colhig - common.in2_sub_cumh_sel2_collow + 1;
common.in2_sub2_elem = common.in2_sub2_rows * common.in2_sub2_cols;
common.in2_sub2_mem = sizeof(float) * common.in2_sub2_elem;
// pointers
for (i = 0; i < common.allPoints; i++) {
cudaMalloc((void **)&unique[i].d_in2_sub2, common.in2_sub2_mem);
}
//======================================================================================================================================================
// CUMULATIVE SUM 2
//======================================================================================================================================================
//====================================================================================================
// MULTIPLICATION
//====================================================================================================
// common
common.in2_sqr_rows = common.in2_rows;
common.in2_sqr_cols = common.in2_cols;
common.in2_sqr_elem = common.in2_elem;
common.in2_sqr_mem = common.in2_mem;
// pointers
for (i = 0; i < common.allPoints; i++) {
cudaMalloc((void **)&unique[i].d_in2_sqr, common.in2_sqr_mem);
}
//====================================================================================================
// SELECTION 2, SUBTRACTION
//====================================================================================================
// common
common.in2_sqr_sub2_rows = common.in2_sub2_rows;
common.in2_sqr_sub2_cols = common.in2_sub2_cols;
common.in2_sqr_sub2_elem = common.in2_sub2_elem;
common.in2_sqr_sub2_mem = common.in2_sub2_mem;
// pointers
for (i = 0; i < common.allPoints; i++) {
cudaMalloc((void **)&unique[i].d_in2_sqr_sub2, common.in2_sqr_sub2_mem);
}
//======================================================================================================================================================
// FINAL
//======================================================================================================================================================
// common
common.in_sqr_rows = common.in_rows;
common.in_sqr_cols = common.in_cols;
common.in_sqr_elem = common.in_elem;
common.in_sqr_mem = common.in_mem;
// pointers
for (i = 0; i < common.allPoints; i++) {
cudaMalloc((void **)&unique[i].d_in_sqr, common.in_sqr_mem);
}
//======================================================================================================================================================
// TEMPLATE MASK CREATE
//======================================================================================================================================================
// common
common.tMask_rows = common.in_rows + (common.sSize + 1 + common.sSize) - 1;
common.tMask_cols = common.tMask_rows;
common.tMask_elem = common.tMask_rows * common.tMask_cols;
common.tMask_mem = sizeof(float) * common.tMask_elem;
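  // tMask_rows = 51 + (40 + 1 + 40) - 1 = 131, matching the convolution output
  // size computed above.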
// pointers
for (i = 0; i < common.allPoints; i++) {
cudaMalloc((void **)&unique[i].d_tMask, common.tMask_mem);
}
//======================================================================================================================================================
// POINT MASK INITIALIZE
//======================================================================================================================================================
// common
common.mask_rows = common.maxMove;
common.mask_cols = common.mask_rows;
common.mask_elem = common.mask_rows * common.mask_cols;
common.mask_mem = sizeof(float) * common.mask_elem;
//======================================================================================================================================================
// MASK CONVOLUTION
//======================================================================================================================================================
// common
  common.mask_conv_rows = common.tMask_rows; // same number of rows as tMask
  common.mask_conv_cols = common.tMask_cols; // same number of columns as tMask
common.mask_conv_elem =
common.mask_conv_rows * common.mask_conv_cols; // number of elements
common.mask_conv_mem = sizeof(float) * common.mask_conv_elem;
common.mask_conv_ioffset = (common.mask_rows - 1) / 2;
if ((common.mask_rows - 1) % 2 > 0.5) {
common.mask_conv_ioffset = common.mask_conv_ioffset + 1;
}
common.mask_conv_joffset = (common.mask_cols - 1) / 2;
if ((common.mask_cols - 1) % 2 > 0.5) {
common.mask_conv_joffset = common.mask_conv_joffset + 1;
}
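  // With maxMove = 10: (mask_rows - 1) / 2 = 4 by integer division and
  // (mask_rows - 1) % 2 = 1, so the offset is rounded up to 5. The "% 2 > 0.5"
  // test is simply a nonzero-remainder check, i.e. ceil((mask_rows - 1) / 2).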
// pointers
for (i = 0; i < common.allPoints; i++) {
cudaMalloc((void **)&unique[i].d_mask_conv, common.mask_conv_mem);
}
//======================================================================================================================================================
// KERNEL
//======================================================================================================================================================
//====================================================================================================
// THREAD BLOCK
//====================================================================================================
  // All operations inside the kernel use the same maximum number of threads.
  // The block size is chosen for the largest operation (on the padded matrix);
  // the remaining operations use subsets of that block.
threads.x = NUMBER_THREADS; // define the number of threads in the block
threads.y = 1;
blocks.x = common.allPoints; // define the number of blocks in the grid
blocks.y = 1;
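  // Note: the kernel launch inside the frame loop below uses a fixed
  // <<<1, 32>>> configuration, so the threads/blocks values computed here are
  // not actually passed to the launch.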
//====================================================================================================
// COPY ARGUMENTS
//====================================================================================================
cudaMemcpyToSymbol(d_common, &common, sizeof(params_common));
cudaMemcpyToSymbol(d_unique, &unique, sizeof(params_unique) * ALL_POINTS);
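  // The fully initialized common/unique structures (sizes and device pointers)
  // are copied once into the constant symbols d_common / d_unique; only
  // d_common_change (current frame number and frame pointer) is re-copied
  // inside the per-frame loop below.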
//====================================================================================================
// PRINT FRAME PROGRESS START
//====================================================================================================
printf("frame progress: ");
fflush(NULL);
//====================================================================================================
// LAUNCH
//====================================================================================================
for (common_change.frame_no = 0; common_change.frame_no < frames_processed;
common_change.frame_no++) {
printf("get frame\n");
    // Extract the current frame from the video file
    // (not cropped, not scaled, converted)
frame = get_frame(
frames, // pointer to video file
common_change.frame_no, // number of frame that needs to be returned
0, // cropped?
0, // scaled?
1); // converted
printf("memcpy\n");
// copy frame to GPU memory
cudaMemcpy(common_change.d_frame, frame, common.frame_mem,
cudaMemcpyHostToDevice);
printf("toSymbol\n");
cudaMemcpyToSymbol(d_common_change, &common_change,
sizeof(params_common_change));
// launch GPU kernel
printf("launch\n");
kernel<<<1, 32>>>();
cudaDeviceSynchronize();
printf("return\n");
// free frame after each loop iteration, since AVI library allocates memory
// for every frame fetched
printf("free\n");
free(frame);
// print frame progress
printf("%d ", common_change.frame_no);
fflush(NULL);
}
//====================================================================================================
// PRINT FRAME PROGRESS END
//====================================================================================================
printf("\n");
fflush(NULL);
//====================================================================================================
// OUTPUT
//====================================================================================================
cudaMemcpy(common.tEndoRowLoc, common.d_tEndoRowLoc,
common.endo_mem * common.no_frames, cudaMemcpyDeviceToHost);
cudaMemcpy(common.tEndoColLoc, common.d_tEndoColLoc,
common.endo_mem * common.no_frames, cudaMemcpyDeviceToHost);
cudaMemcpy(common.tEpiRowLoc, common.d_tEpiRowLoc,
common.epi_mem * common.no_frames, cudaMemcpyDeviceToHost);
cudaMemcpy(common.tEpiColLoc, common.d_tEpiColLoc,
common.epi_mem * common.no_frames, cudaMemcpyDeviceToHost);
#ifdef OUTPUT
//==================================================50
// DUMP DATA TO FILE
//==================================================50
write_data("result.txt", common.no_frames, frames_processed,
common.endoPoints, common.tEndoRowLoc, common.tEndoColLoc,
common.epiPoints, common.tEpiRowLoc, common.tEpiColLoc);
//==================================================50
// End
//==================================================50
#endif
//======================================================================================================================================================
// DEALLOCATION
//======================================================================================================================================================
//====================================================================================================
// COMMON
//====================================================================================================
// frame
cudaFree(common_change.d_frame);
// endo points
free(common.endoRow);
free(common.endoCol);
free(common.tEndoRowLoc);
free(common.tEndoColLoc);
cudaFree(common.d_endoRow);
cudaFree(common.d_endoCol);
cudaFree(common.d_tEndoRowLoc);
cudaFree(common.d_tEndoColLoc);
cudaFree(common.d_endoT);
// epi points
free(common.epiRow);
free(common.epiCol);
free(common.tEpiRowLoc);
free(common.tEpiColLoc);
cudaFree(common.d_epiRow);
cudaFree(common.d_epiCol);
cudaFree(common.d_tEpiRowLoc);
cudaFree(common.d_tEpiColLoc);
cudaFree(common.d_epiT);
//====================================================================================================
// POINTERS
//====================================================================================================
for (i = 0; i < common.allPoints; i++) {
cudaFree(unique[i].d_in2);
cudaFree(unique[i].d_conv);
cudaFree(unique[i].d_in2_pad_cumv);
cudaFree(unique[i].d_in2_pad_cumv_sel);
cudaFree(unique[i].d_in2_sub_cumh);
cudaFree(unique[i].d_in2_sub_cumh_sel);
cudaFree(unique[i].d_in2_sub2);
cudaFree(unique[i].d_in2_sqr);
cudaFree(unique[i].d_in2_sqr_sub2);
cudaFree(unique[i].d_in_sqr);
cudaFree(unique[i].d_tMask);
cudaFree(unique[i].d_mask_conv);
}
}
//===============================================================================================================================================================================================================
//===============================================================================================================================================================================================================
// MAIN FUNCTION
//===============================================================================================================================================================================================================
//===============================================================================================================================================================================================================

View File

@@ -1,17 +0,0 @@
#!/bin/bash
# Build the AVI I/O library used by the benchmark.
cd AVI; make; cd ..;
# Compile the CUDA source with clang, keeping the intermediate device (nvptx64) and host bitcode (-save-temps).
clang++ -DOUTPUT main.cu -I./AVI --cuda-path=/usr/local/cuda-10.1 --cuda-gpu-arch=sm_61 -L/usr/local/cuda-10.1/lib64 -lcudart_static -ldl -lrt -pthread -save-temps -v
# Translate the device-side and host-side bitcode with the project's translators.
/home/robinhan/repo/open_source_template/build/compilation/kernelTranslator main-cuda-nvptx64-nvidia-cuda-sm_61.bc kernel.bc
/home/robinhan/repo/open_source_template/build/compilation/hostTranslator main-host-x86_64-unknown-linux-gnu.bc host.bc
# Lower the translated bitcode to position-independent object files.
llc --relocation-model=pic --filetype=obj kernel.bc
llc --relocation-model=pic --filetype=obj host.bc
# Link against the AVI library and the x86 runtime / thread-pool libraries.
g++ -Wall -L/home/robinhan/repo/open_source_template/build/runtime -L/home/robinhan/repo/open_source_template/build/runtime/threadPool -o heartwall -fPIC -no-pie host.o kernel.o ./AVI/avilib.o ./AVI/avimod.o -lc -lx86Runtime -lthreadPool -lpthread
# Run the translated binary on the sample video, processing 20 frames.
./heartwall /home/robinhan/repo/open_source_template/runtime/examples/rodinia-data/heartwall/test.avi 20

View File

@@ -1,5 +0,0 @@
////////////////////////////////////////////////////////////////////////////////
// Set Device
////////////////////////////////////////////////////////////////////////////////
void setdevice(void) { cudaSetDevice(0); }

View File

@@ -1,719 +0,0 @@
; ModuleID = 'hotspot-cuda-nvptx64-nvidia-cuda-sm_61.bc'
source_filename = "hotspot.cu"
target datalayout = "e-i64:64-i128:128-v16:16-v32:32-n16:32:64"
target triple = "nvptx64-nvidia-cuda"
%struct.__cuda_builtin_blockIdx_t = type { i8 }
%struct.__cuda_builtin_threadIdx_t = type { i8 }
%struct.cudaFuncAttributes = type { i64, i64, i64, i32, i32, i32, i32, i32, i32, i32 }
$_ZN25__cuda_builtin_blockIdx_t17__fetch_builtin_xEv = comdat any
$_ZN25__cuda_builtin_blockIdx_t17__fetch_builtin_yEv = comdat any
$_ZN26__cuda_builtin_threadIdx_t17__fetch_builtin_xEv = comdat any
$_ZN26__cuda_builtin_threadIdx_t17__fetch_builtin_yEv = comdat any
@_ZZ14calculate_tempiPfS_S_iiiiffffffE12temp_on_cuda = internal addrspace(3) global [16 x [16 x float]] undef, align 4
@_ZZ14calculate_tempiPfS_S_iiiiffffffE13power_on_cuda = internal addrspace(3) global [16 x [16 x float]] undef, align 4
@_ZZ14calculate_tempiPfS_S_iiiiffffffE6temp_t = internal addrspace(3) global [16 x [16 x float]] undef, align 4
@blockIdx = extern_weak dso_local addrspace(1) global %struct.__cuda_builtin_blockIdx_t, align 1
@threadIdx = extern_weak dso_local addrspace(1) global %struct.__cuda_builtin_threadIdx_t, align 1
; Function Attrs: convergent noinline nounwind optnone
define weak dso_local i32 @cudaMalloc(i8** %p, i64 %s) #0 {
entry:
%p.addr = alloca i8**, align 8
%s.addr = alloca i64, align 8
store i8** %p, i8*** %p.addr, align 8
store i64 %s, i64* %s.addr, align 8
ret i32 999
}
; Function Attrs: convergent noinline nounwind optnone
define weak dso_local i32 @cudaFuncGetAttributes(%struct.cudaFuncAttributes* %p, i8* %c) #0 {
entry:
%p.addr = alloca %struct.cudaFuncAttributes*, align 8
%c.addr = alloca i8*, align 8
store %struct.cudaFuncAttributes* %p, %struct.cudaFuncAttributes** %p.addr, align 8
store i8* %c, i8** %c.addr, align 8
ret i32 999
}
; Function Attrs: convergent noinline nounwind optnone
define weak dso_local i32 @cudaDeviceGetAttribute(i32* %value, i32 %attr, i32 %device) #0 {
entry:
%value.addr = alloca i32*, align 8
%attr.addr = alloca i32, align 4
%device.addr = alloca i32, align 4
store i32* %value, i32** %value.addr, align 8
store i32 %attr, i32* %attr.addr, align 4
store i32 %device, i32* %device.addr, align 4
ret i32 999
}
; Function Attrs: convergent noinline nounwind optnone
define weak dso_local i32 @cudaGetDevice(i32* %device) #0 {
entry:
%device.addr = alloca i32*, align 8
store i32* %device, i32** %device.addr, align 8
ret i32 999
}
; Function Attrs: convergent noinline nounwind optnone
define weak dso_local i32 @cudaOccupancyMaxActiveBlocksPerMultiprocessor(i32* %numBlocks, i8* %func, i32 %blockSize, i64 %dynamicSmemSize) #0 {
entry:
%numBlocks.addr = alloca i32*, align 8
%func.addr = alloca i8*, align 8
%blockSize.addr = alloca i32, align 4
%dynamicSmemSize.addr = alloca i64, align 8
store i32* %numBlocks, i32** %numBlocks.addr, align 8
store i8* %func, i8** %func.addr, align 8
store i32 %blockSize, i32* %blockSize.addr, align 4
store i64 %dynamicSmemSize, i64* %dynamicSmemSize.addr, align 8
ret i32 999
}
; Function Attrs: convergent noinline nounwind optnone
define weak dso_local i32 @cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags(i32* %numBlocks, i8* %func, i32 %blockSize, i64 %dynamicSmemSize, i32 %flags) #0 {
entry:
%numBlocks.addr = alloca i32*, align 8
%func.addr = alloca i8*, align 8
%blockSize.addr = alloca i32, align 4
%dynamicSmemSize.addr = alloca i64, align 8
%flags.addr = alloca i32, align 4
store i32* %numBlocks, i32** %numBlocks.addr, align 8
store i8* %func, i8** %func.addr, align 8
store i32 %blockSize, i32* %blockSize.addr, align 4
store i64 %dynamicSmemSize, i64* %dynamicSmemSize.addr, align 8
store i32 %flags, i32* %flags.addr, align 4
ret i32 999
}
; Function Attrs: convergent noinline nounwind optnone
define dso_local void @_Z14calculate_tempiPfS_S_iiiiffffff(i32 %iteration, float* %power, float* %temp_src, float* %temp_dst, i32 %grid_cols, i32 %grid_rows, i32 %border_cols, i32 %border_rows, float %Cap, float %Rx, float %Ry, float %Rz, float %step, float %time_elapsed) #0 {
entry:
%iteration.addr = alloca i32, align 4
%power.addr = alloca float*, align 8
%temp_src.addr = alloca float*, align 8
%temp_dst.addr = alloca float*, align 8
%grid_cols.addr = alloca i32, align 4
%grid_rows.addr = alloca i32, align 4
%border_cols.addr = alloca i32, align 4
%border_rows.addr = alloca i32, align 4
%Cap.addr = alloca float, align 4
%Rx.addr = alloca float, align 4
%Ry.addr = alloca float, align 4
%Rz.addr = alloca float, align 4
%step.addr = alloca float, align 4
%time_elapsed.addr = alloca float, align 4
%amb_temp = alloca float, align 4
%step_div_Cap = alloca float, align 4
%Rx_1 = alloca float, align 4
%Ry_1 = alloca float, align 4
%Rz_1 = alloca float, align 4
%bx = alloca i32, align 4
%by = alloca i32, align 4
%tx = alloca i32, align 4
%ty = alloca i32, align 4
%small_block_rows = alloca i32, align 4
%small_block_cols = alloca i32, align 4
%blkY = alloca i32, align 4
%blkX = alloca i32, align 4
%blkYmax = alloca i32, align 4
%blkXmax = alloca i32, align 4
%yidx = alloca i32, align 4
%xidx = alloca i32, align 4
%loadYidx = alloca i32, align 4
%loadXidx = alloca i32, align 4
%index = alloca i32, align 4
%validYmin = alloca i32, align 4
%validYmax = alloca i32, align 4
%validXmin = alloca i32, align 4
%validXmax = alloca i32, align 4
%N = alloca i32, align 4
%S = alloca i32, align 4
%W = alloca i32, align 4
%E = alloca i32, align 4
%computed = alloca i8, align 1
%i = alloca i32, align 4
store i32 %iteration, i32* %iteration.addr, align 4
store float* %power, float** %power.addr, align 8
store float* %temp_src, float** %temp_src.addr, align 8
store float* %temp_dst, float** %temp_dst.addr, align 8
store i32 %grid_cols, i32* %grid_cols.addr, align 4
store i32 %grid_rows, i32* %grid_rows.addr, align 4
store i32 %border_cols, i32* %border_cols.addr, align 4
store i32 %border_rows, i32* %border_rows.addr, align 4
store float %Cap, float* %Cap.addr, align 4
store float %Rx, float* %Rx.addr, align 4
store float %Ry, float* %Ry.addr, align 4
store float %Rz, float* %Rz.addr, align 4
store float %step, float* %step.addr, align 4
store float %time_elapsed, float* %time_elapsed.addr, align 4
store float 8.000000e+01, float* %amb_temp, align 4
%call = call i32 @_ZN25__cuda_builtin_blockIdx_t17__fetch_builtin_xEv() #2
store i32 %call, i32* %bx, align 4
%call1 = call i32 @_ZN25__cuda_builtin_blockIdx_t17__fetch_builtin_yEv() #2
store i32 %call1, i32* %by, align 4
%call2 = call i32 @_ZN26__cuda_builtin_threadIdx_t17__fetch_builtin_xEv() #2
store i32 %call2, i32* %tx, align 4
%call3 = call i32 @_ZN26__cuda_builtin_threadIdx_t17__fetch_builtin_yEv() #2
store i32 %call3, i32* %ty, align 4
%0 = load float, float* %step.addr, align 4
%1 = load float, float* %Cap.addr, align 4
%div = fdiv float %0, %1
store float %div, float* %step_div_Cap, align 4
%2 = load float, float* %Rx.addr, align 4
%div4 = fdiv float 1.000000e+00, %2
store float %div4, float* %Rx_1, align 4
%3 = load float, float* %Ry.addr, align 4
%div5 = fdiv float 1.000000e+00, %3
store float %div5, float* %Ry_1, align 4
%4 = load float, float* %Rz.addr, align 4
%div6 = fdiv float 1.000000e+00, %4
store float %div6, float* %Rz_1, align 4
%5 = load i32, i32* %iteration.addr, align 4
%mul = mul nsw i32 %5, 2
%sub = sub nsw i32 16, %mul
store i32 %sub, i32* %small_block_rows, align 4
%6 = load i32, i32* %iteration.addr, align 4
%mul7 = mul nsw i32 %6, 2
%sub8 = sub nsw i32 16, %mul7
store i32 %sub8, i32* %small_block_cols, align 4
%7 = load i32, i32* %small_block_rows, align 4
%8 = load i32, i32* %by, align 4
%mul9 = mul nsw i32 %7, %8
%9 = load i32, i32* %border_rows.addr, align 4
%sub10 = sub nsw i32 %mul9, %9
store i32 %sub10, i32* %blkY, align 4
%10 = load i32, i32* %small_block_cols, align 4
%11 = load i32, i32* %bx, align 4
%mul11 = mul nsw i32 %10, %11
%12 = load i32, i32* %border_cols.addr, align 4
%sub12 = sub nsw i32 %mul11, %12
store i32 %sub12, i32* %blkX, align 4
%13 = load i32, i32* %blkY, align 4
%add = add nsw i32 %13, 16
%sub13 = sub nsw i32 %add, 1
store i32 %sub13, i32* %blkYmax, align 4
%14 = load i32, i32* %blkX, align 4
%add14 = add nsw i32 %14, 16
%sub15 = sub nsw i32 %add14, 1
store i32 %sub15, i32* %blkXmax, align 4
%15 = load i32, i32* %blkY, align 4
%16 = load i32, i32* %ty, align 4
%add16 = add nsw i32 %15, %16
store i32 %add16, i32* %yidx, align 4
%17 = load i32, i32* %blkX, align 4
%18 = load i32, i32* %tx, align 4
%add17 = add nsw i32 %17, %18
store i32 %add17, i32* %xidx, align 4
%19 = load i32, i32* %yidx, align 4
store i32 %19, i32* %loadYidx, align 4
%20 = load i32, i32* %xidx, align 4
store i32 %20, i32* %loadXidx, align 4
%21 = load i32, i32* %grid_cols.addr, align 4
%22 = load i32, i32* %loadYidx, align 4
%mul18 = mul nsw i32 %21, %22
%23 = load i32, i32* %loadXidx, align 4
%add19 = add nsw i32 %mul18, %23
store i32 %add19, i32* %index, align 4
%24 = load i32, i32* %loadYidx, align 4
%cmp = icmp sge i32 %24, 0
br i1 %cmp, label %land.lhs.true, label %if.end
land.lhs.true: ; preds = %entry
%25 = load i32, i32* %loadYidx, align 4
%26 = load i32, i32* %grid_rows.addr, align 4
%sub20 = sub nsw i32 %26, 1
%cmp21 = icmp sle i32 %25, %sub20
br i1 %cmp21, label %land.lhs.true22, label %if.end
land.lhs.true22: ; preds = %land.lhs.true
%27 = load i32, i32* %loadXidx, align 4
%cmp23 = icmp sge i32 %27, 0
br i1 %cmp23, label %land.lhs.true24, label %if.end
land.lhs.true24: ; preds = %land.lhs.true22
%28 = load i32, i32* %loadXidx, align 4
%29 = load i32, i32* %grid_cols.addr, align 4
%sub25 = sub nsw i32 %29, 1
%cmp26 = icmp sle i32 %28, %sub25
br i1 %cmp26, label %if.then, label %if.end
if.then: ; preds = %land.lhs.true24
%30 = load float*, float** %temp_src.addr, align 8
%31 = load i32, i32* %index, align 4
%idxprom = sext i32 %31 to i64
%arrayidx = getelementptr inbounds float, float* %30, i64 %idxprom
%32 = load float, float* %arrayidx, align 4
%33 = load i32, i32* %ty, align 4
%idxprom27 = sext i32 %33 to i64
%arrayidx28 = getelementptr inbounds [16 x [16 x float]], [16 x [16 x float]]* addrspacecast ([16 x [16 x float]] addrspace(3)* @_ZZ14calculate_tempiPfS_S_iiiiffffffE12temp_on_cuda to [16 x [16 x float]]*), i64 0, i64 %idxprom27
%34 = load i32, i32* %tx, align 4
%idxprom29 = sext i32 %34 to i64
%arrayidx30 = getelementptr inbounds [16 x float], [16 x float]* %arrayidx28, i64 0, i64 %idxprom29
store float %32, float* %arrayidx30, align 4
%35 = load float*, float** %power.addr, align 8
%36 = load i32, i32* %index, align 4
%idxprom31 = sext i32 %36 to i64
%arrayidx32 = getelementptr inbounds float, float* %35, i64 %idxprom31
%37 = load float, float* %arrayidx32, align 4
%38 = load i32, i32* %ty, align 4
%idxprom33 = sext i32 %38 to i64
%arrayidx34 = getelementptr inbounds [16 x [16 x float]], [16 x [16 x float]]* addrspacecast ([16 x [16 x float]] addrspace(3)* @_ZZ14calculate_tempiPfS_S_iiiiffffffE13power_on_cuda to [16 x [16 x float]]*), i64 0, i64 %idxprom33
%39 = load i32, i32* %tx, align 4
%idxprom35 = sext i32 %39 to i64
%arrayidx36 = getelementptr inbounds [16 x float], [16 x float]* %arrayidx34, i64 0, i64 %idxprom35
store float %37, float* %arrayidx36, align 4
br label %if.end
if.end: ; preds = %if.then, %land.lhs.true24, %land.lhs.true22, %land.lhs.true, %entry
call void @llvm.nvvm.barrier0()
%40 = load i32, i32* %blkY, align 4
%cmp37 = icmp slt i32 %40, 0
br i1 %cmp37, label %cond.true, label %cond.false
cond.true: ; preds = %if.end
%41 = load i32, i32* %blkY, align 4
%sub38 = sub nsw i32 0, %41
br label %cond.end
cond.false: ; preds = %if.end
br label %cond.end
cond.end: ; preds = %cond.false, %cond.true
%cond = phi i32 [ %sub38, %cond.true ], [ 0, %cond.false ]
store i32 %cond, i32* %validYmin, align 4
%42 = load i32, i32* %blkYmax, align 4
%43 = load i32, i32* %grid_rows.addr, align 4
%sub39 = sub nsw i32 %43, 1
%cmp40 = icmp sgt i32 %42, %sub39
br i1 %cmp40, label %cond.true41, label %cond.false45
cond.true41: ; preds = %cond.end
%44 = load i32, i32* %blkYmax, align 4
%45 = load i32, i32* %grid_rows.addr, align 4
%sub42 = sub nsw i32 %44, %45
%add43 = add nsw i32 %sub42, 1
%sub44 = sub nsw i32 15, %add43
br label %cond.end46
cond.false45: ; preds = %cond.end
br label %cond.end46
cond.end46: ; preds = %cond.false45, %cond.true41
%cond47 = phi i32 [ %sub44, %cond.true41 ], [ 15, %cond.false45 ]
store i32 %cond47, i32* %validYmax, align 4
%46 = load i32, i32* %blkX, align 4
%cmp48 = icmp slt i32 %46, 0
br i1 %cmp48, label %cond.true49, label %cond.false51
cond.true49: ; preds = %cond.end46
%47 = load i32, i32* %blkX, align 4
%sub50 = sub nsw i32 0, %47
br label %cond.end52
cond.false51: ; preds = %cond.end46
br label %cond.end52
cond.end52: ; preds = %cond.false51, %cond.true49
%cond53 = phi i32 [ %sub50, %cond.true49 ], [ 0, %cond.false51 ]
store i32 %cond53, i32* %validXmin, align 4
%48 = load i32, i32* %blkXmax, align 4
%49 = load i32, i32* %grid_cols.addr, align 4
%sub54 = sub nsw i32 %49, 1
%cmp55 = icmp sgt i32 %48, %sub54
br i1 %cmp55, label %cond.true56, label %cond.false60
cond.true56: ; preds = %cond.end52
%50 = load i32, i32* %blkXmax, align 4
%51 = load i32, i32* %grid_cols.addr, align 4
%sub57 = sub nsw i32 %50, %51
%add58 = add nsw i32 %sub57, 1
%sub59 = sub nsw i32 15, %add58
br label %cond.end61
cond.false60: ; preds = %cond.end52
br label %cond.end61
cond.end61: ; preds = %cond.false60, %cond.true56
%cond62 = phi i32 [ %sub59, %cond.true56 ], [ 15, %cond.false60 ]
store i32 %cond62, i32* %validXmax, align 4
%52 = load i32, i32* %ty, align 4
%sub63 = sub nsw i32 %52, 1
store i32 %sub63, i32* %N, align 4
%53 = load i32, i32* %ty, align 4
%add64 = add nsw i32 %53, 1
store i32 %add64, i32* %S, align 4
%54 = load i32, i32* %tx, align 4
%sub65 = sub nsw i32 %54, 1
store i32 %sub65, i32* %W, align 4
%55 = load i32, i32* %tx, align 4
%add66 = add nsw i32 %55, 1
store i32 %add66, i32* %E, align 4
%56 = load i32, i32* %N, align 4
%57 = load i32, i32* %validYmin, align 4
%cmp67 = icmp slt i32 %56, %57
br i1 %cmp67, label %cond.true68, label %cond.false69
cond.true68: ; preds = %cond.end61
%58 = load i32, i32* %validYmin, align 4
br label %cond.end70
cond.false69: ; preds = %cond.end61
%59 = load i32, i32* %N, align 4
br label %cond.end70
cond.end70: ; preds = %cond.false69, %cond.true68
%cond71 = phi i32 [ %58, %cond.true68 ], [ %59, %cond.false69 ]
store i32 %cond71, i32* %N, align 4
%60 = load i32, i32* %S, align 4
%61 = load i32, i32* %validYmax, align 4
%cmp72 = icmp sgt i32 %60, %61
br i1 %cmp72, label %cond.true73, label %cond.false74
cond.true73: ; preds = %cond.end70
%62 = load i32, i32* %validYmax, align 4
br label %cond.end75
cond.false74: ; preds = %cond.end70
%63 = load i32, i32* %S, align 4
br label %cond.end75
cond.end75: ; preds = %cond.false74, %cond.true73
%cond76 = phi i32 [ %62, %cond.true73 ], [ %63, %cond.false74 ]
store i32 %cond76, i32* %S, align 4
%64 = load i32, i32* %W, align 4
%65 = load i32, i32* %validXmin, align 4
%cmp77 = icmp slt i32 %64, %65
br i1 %cmp77, label %cond.true78, label %cond.false79
cond.true78: ; preds = %cond.end75
%66 = load i32, i32* %validXmin, align 4
br label %cond.end80
cond.false79: ; preds = %cond.end75
%67 = load i32, i32* %W, align 4
br label %cond.end80
cond.end80: ; preds = %cond.false79, %cond.true78
%cond81 = phi i32 [ %66, %cond.true78 ], [ %67, %cond.false79 ]
store i32 %cond81, i32* %W, align 4
%68 = load i32, i32* %E, align 4
%69 = load i32, i32* %validXmax, align 4
%cmp82 = icmp sgt i32 %68, %69
br i1 %cmp82, label %cond.true83, label %cond.false84
cond.true83: ; preds = %cond.end80
%70 = load i32, i32* %validXmax, align 4
br label %cond.end85
cond.false84: ; preds = %cond.end80
%71 = load i32, i32* %E, align 4
br label %cond.end85
cond.end85: ; preds = %cond.false84, %cond.true83
%cond86 = phi i32 [ %70, %cond.true83 ], [ %71, %cond.false84 ]
store i32 %cond86, i32* %E, align 4
store i32 0, i32* %i, align 4
br label %for.cond
for.cond: ; preds = %for.inc, %cond.end85
%72 = load i32, i32* %i, align 4
%73 = load i32, i32* %iteration.addr, align 4
%cmp87 = icmp slt i32 %72, %73
br i1 %cmp87, label %for.body, label %for.end
for.body: ; preds = %for.cond
store i8 0, i8* %computed, align 1
%74 = load i32, i32* %tx, align 4
%75 = load i32, i32* %i, align 4
%add88 = add nsw i32 %75, 1
%cmp89 = icmp sge i32 %74, %add88
br i1 %cmp89, label %land.lhs.true90, label %if.end175
land.lhs.true90: ; preds = %for.body
%76 = load i32, i32* %tx, align 4
%77 = load i32, i32* %i, align 4
%sub91 = sub nsw i32 16, %77
%sub92 = sub nsw i32 %sub91, 2
%cmp93 = icmp sle i32 %76, %sub92
br i1 %cmp93, label %land.lhs.true94, label %if.end175
land.lhs.true94: ; preds = %land.lhs.true90
%78 = load i32, i32* %ty, align 4
%79 = load i32, i32* %i, align 4
%add95 = add nsw i32 %79, 1
%cmp96 = icmp sge i32 %78, %add95
br i1 %cmp96, label %land.lhs.true97, label %if.end175
land.lhs.true97: ; preds = %land.lhs.true94
%80 = load i32, i32* %ty, align 4
%81 = load i32, i32* %i, align 4
%sub98 = sub nsw i32 16, %81
%sub99 = sub nsw i32 %sub98, 2
%cmp100 = icmp sle i32 %80, %sub99
br i1 %cmp100, label %land.lhs.true101, label %if.end175
land.lhs.true101: ; preds = %land.lhs.true97
%82 = load i32, i32* %tx, align 4
%83 = load i32, i32* %validXmin, align 4
%cmp102 = icmp sge i32 %82, %83
br i1 %cmp102, label %land.lhs.true103, label %if.end175
land.lhs.true103: ; preds = %land.lhs.true101
%84 = load i32, i32* %tx, align 4
%85 = load i32, i32* %validXmax, align 4
%cmp104 = icmp sle i32 %84, %85
br i1 %cmp104, label %land.lhs.true105, label %if.end175
land.lhs.true105: ; preds = %land.lhs.true103
%86 = load i32, i32* %ty, align 4
%87 = load i32, i32* %validYmin, align 4
%cmp106 = icmp sge i32 %86, %87
br i1 %cmp106, label %land.lhs.true107, label %if.end175
land.lhs.true107: ; preds = %land.lhs.true105
%88 = load i32, i32* %ty, align 4
%89 = load i32, i32* %validYmax, align 4
%cmp108 = icmp sle i32 %88, %89
br i1 %cmp108, label %if.then109, label %if.end175
if.then109: ; preds = %land.lhs.true107
store i8 1, i8* %computed, align 1
%90 = load i32, i32* %ty, align 4
%idxprom110 = sext i32 %90 to i64
%arrayidx111 = getelementptr inbounds [16 x [16 x float]], [16 x [16 x float]]* addrspacecast ([16 x [16 x float]] addrspace(3)* @_ZZ14calculate_tempiPfS_S_iiiiffffffE12temp_on_cuda to [16 x [16 x float]]*), i64 0, i64 %idxprom110
%91 = load i32, i32* %tx, align 4
%idxprom112 = sext i32 %91 to i64
%arrayidx113 = getelementptr inbounds [16 x float], [16 x float]* %arrayidx111, i64 0, i64 %idxprom112
%92 = load float, float* %arrayidx113, align 4
%conv = fpext float %92 to double
%93 = load float, float* %step_div_Cap, align 4
%conv114 = fpext float %93 to double
%94 = load i32, i32* %ty, align 4
%idxprom115 = sext i32 %94 to i64
%arrayidx116 = getelementptr inbounds [16 x [16 x float]], [16 x [16 x float]]* addrspacecast ([16 x [16 x float]] addrspace(3)* @_ZZ14calculate_tempiPfS_S_iiiiffffffE13power_on_cuda to [16 x [16 x float]]*), i64 0, i64 %idxprom115
%95 = load i32, i32* %tx, align 4
%idxprom117 = sext i32 %95 to i64
%arrayidx118 = getelementptr inbounds [16 x float], [16 x float]* %arrayidx116, i64 0, i64 %idxprom117
%96 = load float, float* %arrayidx118, align 4
%conv119 = fpext float %96 to double
%97 = load i32, i32* %S, align 4
%idxprom120 = sext i32 %97 to i64
%arrayidx121 = getelementptr inbounds [16 x [16 x float]], [16 x [16 x float]]* addrspacecast ([16 x [16 x float]] addrspace(3)* @_ZZ14calculate_tempiPfS_S_iiiiffffffE12temp_on_cuda to [16 x [16 x float]]*), i64 0, i64 %idxprom120
%98 = load i32, i32* %tx, align 4
%idxprom122 = sext i32 %98 to i64
%arrayidx123 = getelementptr inbounds [16 x float], [16 x float]* %arrayidx121, i64 0, i64 %idxprom122
%99 = load float, float* %arrayidx123, align 4
%100 = load i32, i32* %N, align 4
%idxprom124 = sext i32 %100 to i64
%arrayidx125 = getelementptr inbounds [16 x [16 x float]], [16 x [16 x float]]* addrspacecast ([16 x [16 x float]] addrspace(3)* @_ZZ14calculate_tempiPfS_S_iiiiffffffE12temp_on_cuda to [16 x [16 x float]]*), i64 0, i64 %idxprom124
%101 = load i32, i32* %tx, align 4
%idxprom126 = sext i32 %101 to i64
%arrayidx127 = getelementptr inbounds [16 x float], [16 x float]* %arrayidx125, i64 0, i64 %idxprom126
%102 = load float, float* %arrayidx127, align 4
%add128 = fadd contract float %99, %102
%conv129 = fpext float %add128 to double
%103 = load i32, i32* %ty, align 4
%idxprom130 = sext i32 %103 to i64
%arrayidx131 = getelementptr inbounds [16 x [16 x float]], [16 x [16 x float]]* addrspacecast ([16 x [16 x float]] addrspace(3)* @_ZZ14calculate_tempiPfS_S_iiiiffffffE12temp_on_cuda to [16 x [16 x float]]*), i64 0, i64 %idxprom130
%104 = load i32, i32* %tx, align 4
%idxprom132 = sext i32 %104 to i64
%arrayidx133 = getelementptr inbounds [16 x float], [16 x float]* %arrayidx131, i64 0, i64 %idxprom132
%105 = load float, float* %arrayidx133, align 4
%conv134 = fpext float %105 to double
%mul135 = fmul contract double 2.000000e+00, %conv134
%sub136 = fsub contract double %conv129, %mul135
%106 = load float, float* %Ry_1, align 4
%conv137 = fpext float %106 to double
%mul138 = fmul contract double %sub136, %conv137
%add139 = fadd contract double %conv119, %mul138
%107 = load i32, i32* %ty, align 4
%idxprom140 = sext i32 %107 to i64
%arrayidx141 = getelementptr inbounds [16 x [16 x float]], [16 x [16 x float]]* addrspacecast ([16 x [16 x float]] addrspace(3)* @_ZZ14calculate_tempiPfS_S_iiiiffffffE12temp_on_cuda to [16 x [16 x float]]*), i64 0, i64 %idxprom140
%108 = load i32, i32* %E, align 4
%idxprom142 = sext i32 %108 to i64
%arrayidx143 = getelementptr inbounds [16 x float], [16 x float]* %arrayidx141, i64 0, i64 %idxprom142
%109 = load float, float* %arrayidx143, align 4
%110 = load i32, i32* %ty, align 4
%idxprom144 = sext i32 %110 to i64
%arrayidx145 = getelementptr inbounds [16 x [16 x float]], [16 x [16 x float]]* addrspacecast ([16 x [16 x float]] addrspace(3)* @_ZZ14calculate_tempiPfS_S_iiiiffffffE12temp_on_cuda to [16 x [16 x float]]*), i64 0, i64 %idxprom144
%111 = load i32, i32* %W, align 4
%idxprom146 = sext i32 %111 to i64
%arrayidx147 = getelementptr inbounds [16 x float], [16 x float]* %arrayidx145, i64 0, i64 %idxprom146
%112 = load float, float* %arrayidx147, align 4
%add148 = fadd contract float %109, %112
%conv149 = fpext float %add148 to double
%113 = load i32, i32* %ty, align 4
%idxprom150 = sext i32 %113 to i64
%arrayidx151 = getelementptr inbounds [16 x [16 x float]], [16 x [16 x float]]* addrspacecast ([16 x [16 x float]] addrspace(3)* @_ZZ14calculate_tempiPfS_S_iiiiffffffE12temp_on_cuda to [16 x [16 x float]]*), i64 0, i64 %idxprom150
%114 = load i32, i32* %tx, align 4
%idxprom152 = sext i32 %114 to i64
%arrayidx153 = getelementptr inbounds [16 x float], [16 x float]* %arrayidx151, i64 0, i64 %idxprom152
%115 = load float, float* %arrayidx153, align 4
%conv154 = fpext float %115 to double
%mul155 = fmul contract double 2.000000e+00, %conv154
%sub156 = fsub contract double %conv149, %mul155
%116 = load float, float* %Rx_1, align 4
%conv157 = fpext float %116 to double
%mul158 = fmul contract double %sub156, %conv157
%add159 = fadd contract double %add139, %mul158
%117 = load float, float* %amb_temp, align 4
%118 = load i32, i32* %ty, align 4
%idxprom160 = sext i32 %118 to i64
%arrayidx161 = getelementptr inbounds [16 x [16 x float]], [16 x [16 x float]]* addrspacecast ([16 x [16 x float]] addrspace(3)* @_ZZ14calculate_tempiPfS_S_iiiiffffffE12temp_on_cuda to [16 x [16 x float]]*), i64 0, i64 %idxprom160
%119 = load i32, i32* %tx, align 4
%idxprom162 = sext i32 %119 to i64
%arrayidx163 = getelementptr inbounds [16 x float], [16 x float]* %arrayidx161, i64 0, i64 %idxprom162
%120 = load float, float* %arrayidx163, align 4
%sub164 = fsub contract float %117, %120
%121 = load float, float* %Rz_1, align 4
%mul165 = fmul contract float %sub164, %121
%conv166 = fpext float %mul165 to double
%add167 = fadd contract double %add159, %conv166
%mul168 = fmul contract double %conv114, %add167
%add169 = fadd contract double %conv, %mul168
%conv170 = fptrunc double %add169 to float
%122 = load i32, i32* %ty, align 4
%idxprom171 = sext i32 %122 to i64
%arrayidx172 = getelementptr inbounds [16 x [16 x float]], [16 x [16 x float]]* addrspacecast ([16 x [16 x float]] addrspace(3)* @_ZZ14calculate_tempiPfS_S_iiiiffffffE6temp_t to [16 x [16 x float]]*), i64 0, i64 %idxprom171
%123 = load i32, i32* %tx, align 4
%idxprom173 = sext i32 %123 to i64
%arrayidx174 = getelementptr inbounds [16 x float], [16 x float]* %arrayidx172, i64 0, i64 %idxprom173
store float %conv170, float* %arrayidx174, align 4
br label %if.end175
if.end175: ; preds = %if.then109, %land.lhs.true107, %land.lhs.true105, %land.lhs.true103, %land.lhs.true101, %land.lhs.true97, %land.lhs.true94, %land.lhs.true90, %for.body
call void @llvm.nvvm.barrier0()
%124 = load i32, i32* %i, align 4
%125 = load i32, i32* %iteration.addr, align 4
%sub176 = sub nsw i32 %125, 1
%cmp177 = icmp eq i32 %124, %sub176
br i1 %cmp177, label %if.then178, label %if.end179
if.then178: ; preds = %if.end175
br label %for.end
if.end179: ; preds = %if.end175
%126 = load i8, i8* %computed, align 1
%tobool = trunc i8 %126 to i1
br i1 %tobool, label %if.then180, label %if.end189
if.then180: ; preds = %if.end179
%127 = load i32, i32* %ty, align 4
%idxprom181 = sext i32 %127 to i64
%arrayidx182 = getelementptr inbounds [16 x [16 x float]], [16 x [16 x float]]* addrspacecast ([16 x [16 x float]] addrspace(3)* @_ZZ14calculate_tempiPfS_S_iiiiffffffE6temp_t to [16 x [16 x float]]*), i64 0, i64 %idxprom181
%128 = load i32, i32* %tx, align 4
%idxprom183 = sext i32 %128 to i64
%arrayidx184 = getelementptr inbounds [16 x float], [16 x float]* %arrayidx182, i64 0, i64 %idxprom183
%129 = load float, float* %arrayidx184, align 4
%130 = load i32, i32* %ty, align 4
%idxprom185 = sext i32 %130 to i64
%arrayidx186 = getelementptr inbounds [16 x [16 x float]], [16 x [16 x float]]* addrspacecast ([16 x [16 x float]] addrspace(3)* @_ZZ14calculate_tempiPfS_S_iiiiffffffE12temp_on_cuda to [16 x [16 x float]]*), i64 0, i64 %idxprom185
%131 = load i32, i32* %tx, align 4
%idxprom187 = sext i32 %131 to i64
%arrayidx188 = getelementptr inbounds [16 x float], [16 x float]* %arrayidx186, i64 0, i64 %idxprom187
store float %129, float* %arrayidx188, align 4
br label %if.end189
if.end189: ; preds = %if.then180, %if.end179
call void @llvm.nvvm.barrier0()
br label %for.inc
for.inc: ; preds = %if.end189
%132 = load i32, i32* %i, align 4
%inc = add nsw i32 %132, 1
store i32 %inc, i32* %i, align 4
br label %for.cond
for.end: ; preds = %if.then178, %for.cond
%133 = load i8, i8* %computed, align 1
%tobool190 = trunc i8 %133 to i1
br i1 %tobool190, label %if.then191, label %if.end198
if.then191: ; preds = %for.end
%134 = load i32, i32* %ty, align 4
%idxprom192 = sext i32 %134 to i64
%arrayidx193 = getelementptr inbounds [16 x [16 x float]], [16 x [16 x float]]* addrspacecast ([16 x [16 x float]] addrspace(3)* @_ZZ14calculate_tempiPfS_S_iiiiffffffE6temp_t to [16 x [16 x float]]*), i64 0, i64 %idxprom192
%135 = load i32, i32* %tx, align 4
%idxprom194 = sext i32 %135 to i64
%arrayidx195 = getelementptr inbounds [16 x float], [16 x float]* %arrayidx193, i64 0, i64 %idxprom194
%136 = load float, float* %arrayidx195, align 4
%137 = load float*, float** %temp_dst.addr, align 8
%138 = load i32, i32* %index, align 4
%idxprom196 = sext i32 %138 to i64
%arrayidx197 = getelementptr inbounds float, float* %137, i64 %idxprom196
store float %136, float* %arrayidx197, align 4
br label %if.end198
if.end198: ; preds = %if.then191, %for.end
ret void
}
; Function Attrs: alwaysinline convergent nounwind
define linkonce_odr dso_local i32 @_ZN25__cuda_builtin_blockIdx_t17__fetch_builtin_xEv() #1 comdat align 2 {
entry:
%0 = call i32 @llvm.nvvm.read.ptx.sreg.ctaid.x()
ret i32 %0
}
; Function Attrs: alwaysinline convergent nounwind
define linkonce_odr dso_local i32 @_ZN25__cuda_builtin_blockIdx_t17__fetch_builtin_yEv() #1 comdat align 2 {
entry:
%0 = call i32 @llvm.nvvm.read.ptx.sreg.ctaid.y()
ret i32 %0
}
; Function Attrs: alwaysinline convergent nounwind
define linkonce_odr dso_local i32 @_ZN26__cuda_builtin_threadIdx_t17__fetch_builtin_xEv() #1 comdat align 2 {
entry:
%0 = call i32 @llvm.nvvm.read.ptx.sreg.tid.x()
ret i32 %0
}
; Function Attrs: alwaysinline convergent nounwind
define linkonce_odr dso_local i32 @_ZN26__cuda_builtin_threadIdx_t17__fetch_builtin_yEv() #1 comdat align 2 {
entry:
%0 = call i32 @llvm.nvvm.read.ptx.sreg.tid.y()
ret i32 %0
}
; Function Attrs: convergent nounwind
declare void @llvm.nvvm.barrier0() #2
; Function Attrs: nounwind readnone
declare i32 @llvm.nvvm.read.ptx.sreg.ctaid.x() #3
; Function Attrs: nounwind readnone
declare i32 @llvm.nvvm.read.ptx.sreg.ctaid.y() #3
; Function Attrs: nounwind readnone
declare i32 @llvm.nvvm.read.ptx.sreg.tid.x() #3
; Function Attrs: nounwind readnone
declare i32 @llvm.nvvm.read.ptx.sreg.tid.y() #3
attributes #0 = { convergent noinline nounwind optnone "correctly-rounded-divide-sqrt-fp-math"="false" "disable-tail-calls"="false" "frame-pointer"="all" "less-precise-fpmad"="false" "min-legal-vector-width"="0" "no-infs-fp-math"="false" "no-jump-tables"="false" "no-nans-fp-math"="false" "no-signed-zeros-fp-math"="false" "no-trapping-math"="false" "stack-protector-buffer-size"="8" "target-cpu"="sm_61" "target-features"="+ptx64,+sm_61" "unsafe-fp-math"="false" "use-soft-float"="false" }
attributes #1 = { alwaysinline convergent nounwind "correctly-rounded-divide-sqrt-fp-math"="false" "disable-tail-calls"="false" "frame-pointer"="all" "less-precise-fpmad"="false" "min-legal-vector-width"="0" "no-infs-fp-math"="false" "no-jump-tables"="false" "no-nans-fp-math"="false" "no-signed-zeros-fp-math"="false" "no-trapping-math"="false" "stack-protector-buffer-size"="8" "target-cpu"="sm_61" "target-features"="+ptx64,+sm_61" "unsafe-fp-math"="false" "use-soft-float"="false" }
attributes #2 = { convergent nounwind }
attributes #3 = { nounwind readnone }
!llvm.module.flags = !{!0, !1, !2}
!nvvm.annotations = !{!3, !4, !5, !4, !6, !6, !6, !6, !7, !7, !6}
!llvm.ident = !{!8}
!nvvmir.version = !{!9}
!0 = !{i32 2, !"SDK Version", [2 x i32] [i32 10, i32 1]}
!1 = !{i32 1, !"wchar_size", i32 4}
!2 = !{i32 4, !"nvvm-reflect-ftz", i32 0}
!3 = !{void (i32, float*, float*, float*, i32, i32, i32, i32, float, float, float, float, float, float)* @_Z14calculate_tempiPfS_S_iiiiffffff, !"kernel", i32 1}
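; Annotation (added for readability, not part of the generated IR): !3 above is
; the nvvm.annotations entry that marks _Z14calculate_tempiPfS_S_iiiiffffff as
; a CUDA kernel entry point; the !4-!7 entries appear to be the "align" hints
; clang emits alongside it.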
!4 = !{null, !"align", i32 8}
!5 = !{null, !"align", i32 8, !"align", i32 65544, !"align", i32 131080}
!6 = !{null, !"align", i32 16}
!7 = !{null, !"align", i32 16, !"align", i32 65552, !"align", i32 131088}
!8 = !{!"clang version 10.0.1 (https://github.com/llvm/llvm-project.git ef32c611aa214dea855364efd7ba451ec5ec3f74)"}
!9 = !{i32 1, i32 4}

File diff suppressed because one or more lines are too long


@ -1,353 +0,0 @@
#include <assert.h>
#include <stdio.h>
#include <stdlib.h>
#include <time.h>
#ifdef RD_WG_SIZE_0_0
#define BLOCK_SIZE RD_WG_SIZE_0_0
#elif defined(RD_WG_SIZE_0)
#define BLOCK_SIZE RD_WG_SIZE_0
#elif defined(RD_WG_SIZE)
#define BLOCK_SIZE RD_WG_SIZE
#else
#define BLOCK_SIZE 16
#endif
#define STR_SIZE 256
/* maximum power density possible (say 300W for a 10mm x 10mm chip) */
#define MAX_PD (3.0e6)
/* required precision in degrees */
#define PRECISION 0.001
#define SPEC_HEAT_SI 1.75e6
#define K_SI 100
/* capacitance fitting factor */
#define FACTOR_CHIP 0.5
/* chip parameters */
float t_chip = 0.0005;
float chip_height = 0.016;
float chip_width = 0.016;
/* ambient temperature, assuming no package at all */
float amb_temp = 80.0;
void run(int argc, char **argv);
/* define timer macros */
#define pin_stats_reset() startCycle()
#define pin_stats_pause(cycles) stopCycle(cycles)
#define pin_stats_dump(cycles) printf("timer: %Lu\n", cycles)
void fatal(char *s) { fprintf(stderr, "error: %s\n", s); }
void writeoutput(float *vect, int grid_rows, int grid_cols, char *file) {
int i, j, index = 0;
FILE *fp;
char str[STR_SIZE];
if ((fp = fopen(file, "w")) == 0)
printf("The file was not opened\n");
for (i = 0; i < grid_rows; i++)
for (j = 0; j < grid_cols; j++) {
sprintf(str, "%d\t%g\n", index, vect[i * grid_cols + j]);
fputs(str, fp);
index++;
}
fclose(fp);
}
void readinput(float *vect, int grid_rows, int grid_cols, char *file) {
int i, j;
FILE *fp;
char str[STR_SIZE];
float val;
if ((fp = fopen(file, "r")) == 0)
printf("The file was not opened\n");
for (i = 0; i <= grid_rows - 1; i++)
for (j = 0; j <= grid_cols - 1; j++) {
fgets(str, STR_SIZE, fp);
if (feof(fp))
fatal("not enough lines in file");
// if ((sscanf(str, "%d%f", &index, &val) != 2) || (index !=
// ((i-1)*(grid_cols-2)+j-1)))
if ((sscanf(str, "%f", &val) != 1))
fatal("invalid file format");
vect[i * grid_cols + j] = val;
}
fclose(fp);
}
#define IN_RANGE(x, min, max) ((x) >= (min) && (x) <= (max))
#define CLAMP_RANGE(x, min, max) x = (x < (min)) ? min : ((x > (max)) ? max : x)
#define MIN(a, b) ((a) <= (b) ? (a) : (b))
__global__ void calculate_temp(int iteration, // number of iteration
float *power, // power input
float *temp_src, // temperature input/output
float *temp_dst, // temperature input/output
int grid_cols, // Col of grid
int grid_rows, // Row of grid
int border_cols, // border offset
int border_rows, // border offset
float Cap, // Capacitance
float Rx, float Ry, float Rz, float step,
float time_elapsed) {
__shared__ float temp_on_cuda[BLOCK_SIZE][BLOCK_SIZE];
__shared__ float power_on_cuda[BLOCK_SIZE][BLOCK_SIZE];
__shared__ float temp_t[BLOCK_SIZE][BLOCK_SIZE]; // saving temporary temperature result
float amb_temp = 80.0;
float step_div_Cap;
float Rx_1, Ry_1, Rz_1;
int bx = blockIdx.x;
int by = blockIdx.y;
int tx = threadIdx.x;
int ty = threadIdx.y;
step_div_Cap = step / Cap;
Rx_1 = 1 / Rx;
Ry_1 = 1 / Ry;
Rz_1 = 1 / Rz;
// each block finally computes result for a small block
// after N iterations.
// it is the non-overlapping small blocks that cover
// all the input data
// calculate the small block size
int small_block_rows = BLOCK_SIZE - iteration * 2; // EXPAND_RATE
int small_block_cols = BLOCK_SIZE - iteration * 2; // EXPAND_RATE
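  // Illustrative numbers (annotation, not in the original comment): with
  // BLOCK_SIZE = 16 and iteration = 2, each block loads a 16x16 tile but only
  // produces a 12x12 result, because every time step consumes one halo cell on
  // each side of the tile.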
// calculate the boundary for the block according to
// the boundary of its small block
int blkY = small_block_rows * by - border_rows;
int blkX = small_block_cols * bx - border_cols;
int blkYmax = blkY + BLOCK_SIZE - 1;
int blkXmax = blkX + BLOCK_SIZE - 1;
// calculate the global thread coordinates
int yidx = blkY + ty;
int xidx = blkX + tx;
// load data if it is within the valid input range
int loadYidx = yidx, loadXidx = xidx;
int index = grid_cols * loadYidx + loadXidx;
if (IN_RANGE(loadYidx, 0, grid_rows - 1) &&
IN_RANGE(loadXidx, 0, grid_cols - 1)) {
temp_on_cuda[ty][tx] = temp_src[index]; // Load the temperature data from
// global memory to shared memory
power_on_cuda[ty][tx] =
power[index]; // Load the power data from global memory to shared memory
}
__syncthreads();
// effective range within this block that falls within
// the valid range of the input data
// used to rule out computation outside the boundary.
int validYmin = (blkY < 0) ? -blkY : 0;
int validYmax = (blkYmax > grid_rows - 1)
? BLOCK_SIZE - 1 - (blkYmax - grid_rows + 1)
: BLOCK_SIZE - 1;
int validXmin = (blkX < 0) ? -blkX : 0;
int validXmax = (blkXmax > grid_cols - 1)
? BLOCK_SIZE - 1 - (blkXmax - grid_cols + 1)
: BLOCK_SIZE - 1;
int N = ty - 1;
int S = ty + 1;
int W = tx - 1;
int E = tx + 1;
N = (N < validYmin) ? validYmin : N;
S = (S > validYmax) ? validYmax : S;
W = (W < validXmin) ? validXmin : W;
E = (E > validXmax) ? validXmax : E;
bool computed;
for (int i = 0; i < iteration; i++) {
computed = false;
if (IN_RANGE(tx, i + 1, BLOCK_SIZE - i - 2) &&
IN_RANGE(ty, i + 1, BLOCK_SIZE - i - 2) &&
IN_RANGE(tx, validXmin, validXmax) &&
IN_RANGE(ty, validYmin, validYmax)) {
computed = true;
temp_t[ty][tx] =
temp_on_cuda[ty][tx] +
step_div_Cap * (power_on_cuda[ty][tx] +
(temp_on_cuda[S][tx] + temp_on_cuda[N][tx] -
2.0 * temp_on_cuda[ty][tx]) *
Ry_1 +
(temp_on_cuda[ty][E] + temp_on_cuda[ty][W] -
2.0 * temp_on_cuda[ty][tx]) *
Rx_1 +
(amb_temp - temp_on_cuda[ty][tx]) * Rz_1);
}
__syncthreads();
if (i == iteration - 1)
break;
if (computed) // Assign the computation range
temp_on_cuda[ty][tx] = temp_t[ty][tx];
__syncthreads();
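    // Annotation (not in the original source): the earlier __syncthreads()
    // ensures every thread has finished reading its neighbors from
    // temp_on_cuda before the copy overwrites it; the one just above makes the
    // copied values visible before the next iteration reads them.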
}
// update the global memory
// after the last iteration, only the threads whose coordinates fall within the
// small block have performed the calculation and have ``computed'' set
if (computed) {
temp_dst[index] = temp_t[ty][tx];
}
}
/*
compute N time steps
*/
int compute_tran_temp(float *MatrixPower, float *MatrixTemp[2], int col,
int row, int total_iterations, int num_iterations,
int blockCols, int blockRows, int borderCols,
int borderRows) {
dim3 dimBlock(BLOCK_SIZE, BLOCK_SIZE);
dim3 dimGrid(blockCols, blockRows);
float grid_height = chip_height / row;
float grid_width = chip_width / col;
float Cap = FACTOR_CHIP * SPEC_HEAT_SI * t_chip * grid_width * grid_height;
float Rx = grid_width / (2.0 * K_SI * t_chip * grid_height);
float Ry = grid_height / (2.0 * K_SI * t_chip * grid_width);
float Rz = t_chip / (K_SI * grid_height * grid_width);
float max_slope = MAX_PD / (FACTOR_CHIP * t_chip * SPEC_HEAT_SI);
float step = PRECISION / max_slope;
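  // Annotation (not in the original source): max_slope bounds the fastest
  // possible temperature rise given the maximum power density and the chip's
  // heat capacity, so step = PRECISION / max_slope keeps the temperature
  // change per time step within the required PRECISION.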
float t;
float time_elapsed;
time_elapsed = 0.001;
int src = 1, dst = 0;
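  // Annotation (not in the original source): MatrixTemp[0] and MatrixTemp[1]
  // act as ping-pong buffers; src and dst are swapped at the top of each pass
  // so the kernel reads the previous result and writes the next one without
  // any extra device-to-device copies.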
for (t = 0; t < total_iterations; t += num_iterations) {
int temp = src;
src = dst;
dst = temp;
calculate_temp<<<dimGrid, dimBlock>>>(
MIN(num_iterations, total_iterations - t), MatrixPower, MatrixTemp[src],
MatrixTemp[dst], col, row, borderCols, borderRows, Cap, Rx, Ry, Rz,
step, time_elapsed);
cudaDeviceSynchronize();
}
return dst;
}
void usage(int argc, char **argv) {
fprintf(stderr,
"Usage: %s <grid_rows/grid_cols> <pyramid_height> <sim_time> "
"<temp_file> <power_file> <output_file>\n",
argv[0]);
fprintf(stderr, "\t<grid_rows/grid_cols> - number of rows/cols in the grid "
"(positive integer)\n");
fprintf(stderr, "\t<pyramid_height> - pyramid heigh(positive integer)\n");
fprintf(stderr, "\t<sim_time> - number of iterations\n");
fprintf(stderr, "\t<temp_file> - name of the file containing the initial "
"temperature values of each cell\n");
fprintf(stderr, "\t<power_file> - name of the file containing the dissipated "
"power values of each cell\n");
fprintf(stderr, "\t<output_file> - name of the output file\n");
exit(1);
}
int main(int argc, char **argv) {
cudaSetDevice(0);
printf("WG size of kernel = %d X %d\n", BLOCK_SIZE, BLOCK_SIZE);
run(argc, argv);
return EXIT_SUCCESS;
}
void run(int argc, char **argv) {
int size;
int grid_rows, grid_cols;
float *FilesavingTemp, *FilesavingPower, *MatrixOut;
char *tfile, *pfile, *ofile;
int total_iterations = 60;
int pyramid_height = 1; // number of iterations
if (argc != 7)
usage(argc, argv);
if ((grid_rows = atoi(argv[1])) <= 0 || (grid_cols = atoi(argv[1])) <= 0 ||
(pyramid_height = atoi(argv[2])) <= 0 ||
(total_iterations = atoi(argv[3])) <= 0)
usage(argc, argv);
tfile = argv[4];
pfile = argv[5];
ofile = argv[6];
size = grid_rows * grid_cols;
/* --------------- pyramid parameters --------------- */
#define EXPAND_RATE \
  2 // adding one iteration extends the pyramid base by 2 on each border
int borderCols = (pyramid_height)*EXPAND_RATE / 2;
int borderRows = (pyramid_height)*EXPAND_RATE / 2;
int smallBlockCol = BLOCK_SIZE - (pyramid_height)*EXPAND_RATE;
int smallBlockRow = BLOCK_SIZE - (pyramid_height)*EXPAND_RATE;
int blockCols =
grid_cols / smallBlockCol + ((grid_cols % smallBlockCol == 0) ? 0 : 1);
int blockRows =
grid_rows / smallBlockRow + ((grid_rows % smallBlockRow == 0) ? 0 : 1);
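  // Annotation (not in the original source): the expression above is the usual
  // integer ceiling division, ceil(grid / smallBlock); e.g. with 512 columns
  // and smallBlockCol = 12 (illustrative numbers), 43 blocks are launched
  // along that dimension.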
FilesavingTemp = (float *)malloc(size * sizeof(float));
FilesavingPower = (float *)malloc(size * sizeof(float));
MatrixOut = (float *)calloc(size, sizeof(float));
if (!FilesavingPower || !FilesavingTemp || !MatrixOut)
fatal("unable to allocate memory");
printf("pyramidHeight: %d\ngridSize: [%d, %d]\nborder:[%d, "
"%d]\nblockGrid:[%d, %d]\ntargetBlock:[%d, %d]\n",
pyramid_height, grid_cols, grid_rows, borderCols, borderRows,
blockCols, blockRows, smallBlockCol, smallBlockRow);
readinput(FilesavingTemp, grid_rows, grid_cols, tfile);
readinput(FilesavingPower, grid_rows, grid_cols, pfile);
float *MatrixTemp[2], *MatrixPower;
cudaMalloc((void **)&MatrixTemp[0], sizeof(float) * size);
cudaMalloc((void **)&MatrixTemp[1], sizeof(float) * size);
cudaMemcpy(MatrixTemp[0], FilesavingTemp, sizeof(float) * size,
cudaMemcpyHostToDevice);
cudaMalloc((void **)&MatrixPower, sizeof(float) * size);
cudaMemcpy(MatrixPower, FilesavingPower, sizeof(float) * size,
cudaMemcpyHostToDevice);
printf("Start computing the transient temperature\n");
int ret = compute_tran_temp(MatrixPower, MatrixTemp, grid_cols, grid_rows,
total_iterations, pyramid_height, blockCols,
blockRows, borderCols, borderRows);
printf("Ending simulation\n");
cudaMemcpy(MatrixOut, MatrixTemp[ret], sizeof(float) * size,
cudaMemcpyDeviceToHost);
writeoutput(MatrixOut, grid_rows, grid_cols, ofile);
cudaFree(MatrixPower);
cudaFree(MatrixTemp[0]);
cudaFree(MatrixTemp[1]);
free(MatrixOut);
}


@ -1,21 +0,0 @@
#!/bin/bash
set -e
llvm-as hotspot-cuda-nvptx64-nvidia-cuda-sm_61.ll
llvm-as hotspot-host-x86_64-unknown-linux-gnu.ll
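# Annotation (not part of the original script): kernelTranslator and
# hostTranslator are assumed here to rewrite the GPU kernel bitcode and the
# host bitcode into plain CPU bitcode, which llc then lowers to x86 objects
# that g++ links against the project's x86 runtime libraries.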
../../build/compilation/kernelTranslator hotspot-cuda-nvptx64-nvidia-cuda-sm_61.bc kernel.bc
../../build/compilation/hostTranslator hotspot-host-x86_64-unknown-linux-gnu.bc host.bc
llc --relocation-model=pic --filetype=obj kernel.bc
llc --relocation-model=pic --filetype=obj host.bc
g++ -Wall -L../../build/runtime -L../../build/runtime/threadPool \
-o hotspot -fPIC -no-pie host.o kernel.o -lc -lx86Runtime -lthreadPool -lpthread
export LD_LIBRARY_PATH=../../build/runtime:../../build/runtime/threadPool:$LD_LIBRARY_PATH
./hotspot 512 2 2 ../../rodinia-data/hotspot/temp_512 ../../rodinia-data/hotspot/power_512 output.out
if head output.out | grep -q "323.829"; then
echo "Pass"
else
echo "Error result"
exit 1
fi


@ -1,587 +0,0 @@
; ModuleID = '3D-cuda-nvptx64-nvidia-cuda-sm_61.bc'
source_filename = "3D.cu"
target datalayout = "e-i64:64-i128:128-v16:16-v32:32-n16:32:64"
target triple = "nvptx64-nvidia-cuda"
%struct.__cuda_builtin_blockDim_t = type { i8 }
%struct.__cuda_builtin_blockIdx_t = type { i8 }
%struct.__cuda_builtin_threadIdx_t = type { i8 }
%struct.cudaFuncAttributes = type { i64, i64, i64, i32, i32, i32, i32, i32, i32, i32 }
$_ZN25__cuda_builtin_blockDim_t17__fetch_builtin_xEv = comdat any
$_ZN25__cuda_builtin_blockIdx_t17__fetch_builtin_xEv = comdat any
$_ZN26__cuda_builtin_threadIdx_t17__fetch_builtin_xEv = comdat any
$_ZN25__cuda_builtin_blockDim_t17__fetch_builtin_yEv = comdat any
$_ZN25__cuda_builtin_blockIdx_t17__fetch_builtin_yEv = comdat any
$_ZN26__cuda_builtin_threadIdx_t17__fetch_builtin_yEv = comdat any
@blockDim = extern_weak dso_local addrspace(1) global %struct.__cuda_builtin_blockDim_t, align 1
@blockIdx = extern_weak dso_local addrspace(1) global %struct.__cuda_builtin_blockIdx_t, align 1
@threadIdx = extern_weak dso_local addrspace(1) global %struct.__cuda_builtin_threadIdx_t, align 1
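; Annotation (added here, not part of the original module): the weak
; definitions below stub out host-side CUDA API calls (cudaMalloc,
; cudaFuncGetAttributes, cudaDeviceGetAttribute, ...) and simply return 999;
; presumably they exist only so the module links, and are meant to be
; overridden by real definitions when the code is translated for the CPU.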
; Function Attrs: convergent noinline nounwind optnone
define weak dso_local i32 @cudaMalloc(i8** %p, i64 %s) #0 {
entry:
%p.addr = alloca i8**, align 8
%s.addr = alloca i64, align 8
store i8** %p, i8*** %p.addr, align 8
store i64 %s, i64* %s.addr, align 8
ret i32 999
}
; Function Attrs: convergent noinline nounwind optnone
define weak dso_local i32 @cudaFuncGetAttributes(%struct.cudaFuncAttributes* %p, i8* %c) #0 {
entry:
%p.addr = alloca %struct.cudaFuncAttributes*, align 8
%c.addr = alloca i8*, align 8
store %struct.cudaFuncAttributes* %p, %struct.cudaFuncAttributes** %p.addr, align 8
store i8* %c, i8** %c.addr, align 8
ret i32 999
}
; Function Attrs: convergent noinline nounwind optnone
define weak dso_local i32 @cudaDeviceGetAttribute(i32* %value, i32 %attr, i32 %device) #0 {
entry:
%value.addr = alloca i32*, align 8
%attr.addr = alloca i32, align 4
%device.addr = alloca i32, align 4
store i32* %value, i32** %value.addr, align 8
store i32 %attr, i32* %attr.addr, align 4
store i32 %device, i32* %device.addr, align 4
ret i32 999
}
; Function Attrs: convergent noinline nounwind optnone
define weak dso_local i32 @cudaGetDevice(i32* %device) #0 {
entry:
%device.addr = alloca i32*, align 8
store i32* %device, i32** %device.addr, align 8
ret i32 999
}
; Function Attrs: convergent noinline nounwind optnone
define weak dso_local i32 @cudaOccupancyMaxActiveBlocksPerMultiprocessor(i32* %numBlocks, i8* %func, i32 %blockSize, i64 %dynamicSmemSize) #0 {
entry:
%numBlocks.addr = alloca i32*, align 8
%func.addr = alloca i8*, align 8
%blockSize.addr = alloca i32, align 4
%dynamicSmemSize.addr = alloca i64, align 8
store i32* %numBlocks, i32** %numBlocks.addr, align 8
store i8* %func, i8** %func.addr, align 8
store i32 %blockSize, i32* %blockSize.addr, align 4
store i64 %dynamicSmemSize, i64* %dynamicSmemSize.addr, align 8
ret i32 999
}
; Function Attrs: convergent noinline nounwind optnone
define weak dso_local i32 @cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags(i32* %numBlocks, i8* %func, i32 %blockSize, i64 %dynamicSmemSize, i32 %flags) #0 {
entry:
%numBlocks.addr = alloca i32*, align 8
%func.addr = alloca i8*, align 8
%blockSize.addr = alloca i32, align 4
%dynamicSmemSize.addr = alloca i64, align 8
%flags.addr = alloca i32, align 4
store i32* %numBlocks, i32** %numBlocks.addr, align 8
store i8* %func, i8** %func.addr, align 8
store i32 %blockSize, i32* %blockSize.addr, align 4
store i64 %dynamicSmemSize, i64* %dynamicSmemSize.addr, align 8
store i32 %flags, i32* %flags.addr, align 4
ret i32 999
}
; Function Attrs: convergent noinline nounwind optnone
define dso_local void @_Z11hotspotOpt1PfS_S_fiiifffffff(float* %p, float* %tIn, float* %tOut, float %sdc, i32 %nx, i32 %ny, i32 %nz, float %ce, float %cw, float %cn, float %cs, float %ct, float %cb, float %cc) #0 {
entry:
%p.addr = alloca float*, align 8
%tIn.addr = alloca float*, align 8
%tOut.addr = alloca float*, align 8
%sdc.addr = alloca float, align 4
%nx.addr = alloca i32, align 4
%ny.addr = alloca i32, align 4
%nz.addr = alloca i32, align 4
%ce.addr = alloca float, align 4
%cw.addr = alloca float, align 4
%cn.addr = alloca float, align 4
%cs.addr = alloca float, align 4
%ct.addr = alloca float, align 4
%cb.addr = alloca float, align 4
%cc.addr = alloca float, align 4
%amb_temp = alloca float, align 4
%i = alloca i32, align 4
%j = alloca i32, align 4
%c = alloca i32, align 4
%xy = alloca i32, align 4
%W = alloca i32, align 4
%E = alloca i32, align 4
%N = alloca i32, align 4
%S = alloca i32, align 4
%temp1 = alloca float, align 4
%temp2 = alloca float, align 4
%temp3 = alloca float, align 4
%k = alloca i32, align 4
store float* %p, float** %p.addr, align 8
store float* %tIn, float** %tIn.addr, align 8
store float* %tOut, float** %tOut.addr, align 8
store float %sdc, float* %sdc.addr, align 4
store i32 %nx, i32* %nx.addr, align 4
store i32 %ny, i32* %ny.addr, align 4
store i32 %nz, i32* %nz.addr, align 4
store float %ce, float* %ce.addr, align 4
store float %cw, float* %cw.addr, align 4
store float %cn, float* %cn.addr, align 4
store float %cs, float* %cs.addr, align 4
store float %ct, float* %ct.addr, align 4
store float %cb, float* %cb.addr, align 4
store float %cc, float* %cc.addr, align 4
store float 8.000000e+01, float* %amb_temp, align 4
%call = call i32 @_ZN25__cuda_builtin_blockDim_t17__fetch_builtin_xEv() #3
%call1 = call i32 @_ZN25__cuda_builtin_blockIdx_t17__fetch_builtin_xEv() #3
%mul = mul i32 %call, %call1
%call2 = call i32 @_ZN26__cuda_builtin_threadIdx_t17__fetch_builtin_xEv() #3
%add = add i32 %mul, %call2
store i32 %add, i32* %i, align 4
%call3 = call i32 @_ZN25__cuda_builtin_blockDim_t17__fetch_builtin_yEv() #3
%call4 = call i32 @_ZN25__cuda_builtin_blockIdx_t17__fetch_builtin_yEv() #3
%mul5 = mul i32 %call3, %call4
%call6 = call i32 @_ZN26__cuda_builtin_threadIdx_t17__fetch_builtin_yEv() #3
%add7 = add i32 %mul5, %call6
store i32 %add7, i32* %j, align 4
%0 = load i32, i32* %i, align 4
%1 = load i32, i32* %j, align 4
%2 = load i32, i32* %nx.addr, align 4
%mul8 = mul nsw i32 %1, %2
%add9 = add nsw i32 %0, %mul8
store i32 %add9, i32* %c, align 4
%3 = load i32, i32* %nx.addr, align 4
%4 = load i32, i32* %ny.addr, align 4
%mul10 = mul nsw i32 %3, %4
store i32 %mul10, i32* %xy, align 4
%5 = load i32, i32* %i, align 4
%cmp = icmp eq i32 %5, 0
br i1 %cmp, label %cond.true, label %cond.false
cond.true: ; preds = %entry
%6 = load i32, i32* %c, align 4
br label %cond.end
cond.false: ; preds = %entry
%7 = load i32, i32* %c, align 4
%sub = sub nsw i32 %7, 1
br label %cond.end
cond.end: ; preds = %cond.false, %cond.true
%cond = phi i32 [ %6, %cond.true ], [ %sub, %cond.false ]
store i32 %cond, i32* %W, align 4
%8 = load i32, i32* %i, align 4
%9 = load i32, i32* %nx.addr, align 4
%sub11 = sub nsw i32 %9, 1
%cmp12 = icmp eq i32 %8, %sub11
br i1 %cmp12, label %cond.true13, label %cond.false14
cond.true13: ; preds = %cond.end
%10 = load i32, i32* %c, align 4
br label %cond.end16
cond.false14: ; preds = %cond.end
%11 = load i32, i32* %c, align 4
%add15 = add nsw i32 %11, 1
br label %cond.end16
cond.end16: ; preds = %cond.false14, %cond.true13
%cond17 = phi i32 [ %10, %cond.true13 ], [ %add15, %cond.false14 ]
store i32 %cond17, i32* %E, align 4
%12 = load i32, i32* %j, align 4
%cmp18 = icmp eq i32 %12, 0
br i1 %cmp18, label %cond.true19, label %cond.false20
cond.true19: ; preds = %cond.end16
%13 = load i32, i32* %c, align 4
br label %cond.end22
cond.false20: ; preds = %cond.end16
%14 = load i32, i32* %c, align 4
%15 = load i32, i32* %nx.addr, align 4
%sub21 = sub nsw i32 %14, %15
br label %cond.end22
cond.end22: ; preds = %cond.false20, %cond.true19
%cond23 = phi i32 [ %13, %cond.true19 ], [ %sub21, %cond.false20 ]
store i32 %cond23, i32* %N, align 4
%16 = load i32, i32* %j, align 4
%17 = load i32, i32* %ny.addr, align 4
%sub24 = sub nsw i32 %17, 1
%cmp25 = icmp eq i32 %16, %sub24
br i1 %cmp25, label %cond.true26, label %cond.false27
cond.true26: ; preds = %cond.end22
%18 = load i32, i32* %c, align 4
br label %cond.end29
cond.false27: ; preds = %cond.end22
%19 = load i32, i32* %c, align 4
%20 = load i32, i32* %nx.addr, align 4
%add28 = add nsw i32 %19, %20
br label %cond.end29
cond.end29: ; preds = %cond.false27, %cond.true26
%cond30 = phi i32 [ %18, %cond.true26 ], [ %add28, %cond.false27 ]
store i32 %cond30, i32* %S, align 4
%21 = load float*, float** %tIn.addr, align 8
%22 = load i32, i32* %c, align 4
%idxprom = sext i32 %22 to i64
%arrayidx = getelementptr inbounds float, float* %21, i64 %idxprom
%23 = load float, float* %arrayidx, align 4
store float %23, float* %temp2, align 4
store float %23, float* %temp1, align 4
%24 = load float*, float** %tIn.addr, align 8
%25 = load i32, i32* %c, align 4
%26 = load i32, i32* %xy, align 4
%add31 = add nsw i32 %25, %26
%idxprom32 = sext i32 %add31 to i64
%arrayidx33 = getelementptr inbounds float, float* %24, i64 %idxprom32
%27 = load float, float* %arrayidx33, align 4
store float %27, float* %temp3, align 4
%28 = load float, float* %cc.addr, align 4
%29 = load float, float* %temp2, align 4
%mul34 = fmul contract float %28, %29
%30 = load float, float* %cw.addr, align 4
%31 = load float*, float** %tIn.addr, align 8
%32 = load i32, i32* %W, align 4
%idxprom35 = sext i32 %32 to i64
%arrayidx36 = getelementptr inbounds float, float* %31, i64 %idxprom35
%33 = load float, float* %arrayidx36, align 4
%mul37 = fmul contract float %30, %33
%add38 = fadd contract float %mul34, %mul37
%34 = load float, float* %ce.addr, align 4
%35 = load float*, float** %tIn.addr, align 8
%36 = load i32, i32* %E, align 4
%idxprom39 = sext i32 %36 to i64
%arrayidx40 = getelementptr inbounds float, float* %35, i64 %idxprom39
%37 = load float, float* %arrayidx40, align 4
%mul41 = fmul contract float %34, %37
%add42 = fadd contract float %add38, %mul41
%38 = load float, float* %cs.addr, align 4
%39 = load float*, float** %tIn.addr, align 8
%40 = load i32, i32* %S, align 4
%idxprom43 = sext i32 %40 to i64
%arrayidx44 = getelementptr inbounds float, float* %39, i64 %idxprom43
%41 = load float, float* %arrayidx44, align 4
%mul45 = fmul contract float %38, %41
%add46 = fadd contract float %add42, %mul45
%42 = load float, float* %cn.addr, align 4
%43 = load float*, float** %tIn.addr, align 8
%44 = load i32, i32* %N, align 4
%idxprom47 = sext i32 %44 to i64
%arrayidx48 = getelementptr inbounds float, float* %43, i64 %idxprom47
%45 = load float, float* %arrayidx48, align 4
%mul49 = fmul contract float %42, %45
%add50 = fadd contract float %add46, %mul49
%46 = load float, float* %cb.addr, align 4
%47 = load float, float* %temp1, align 4
%mul51 = fmul contract float %46, %47
%add52 = fadd contract float %add50, %mul51
%48 = load float, float* %ct.addr, align 4
%49 = load float, float* %temp3, align 4
%mul53 = fmul contract float %48, %49
%add54 = fadd contract float %add52, %mul53
%50 = load float, float* %sdc.addr, align 4
%51 = load float*, float** %p.addr, align 8
%52 = load i32, i32* %c, align 4
%idxprom55 = sext i32 %52 to i64
%arrayidx56 = getelementptr inbounds float, float* %51, i64 %idxprom55
%53 = load float, float* %arrayidx56, align 4
%mul57 = fmul contract float %50, %53
%add58 = fadd contract float %add54, %mul57
%54 = load float, float* %ct.addr, align 4
%55 = load float, float* %amb_temp, align 4
%mul59 = fmul contract float %54, %55
%add60 = fadd contract float %add58, %mul59
%56 = load float*, float** %tOut.addr, align 8
%57 = load i32, i32* %c, align 4
%idxprom61 = sext i32 %57 to i64
%arrayidx62 = getelementptr inbounds float, float* %56, i64 %idxprom61
store float %add60, float* %arrayidx62, align 4
%58 = load i32, i32* %xy, align 4
%59 = load i32, i32* %c, align 4
%add63 = add nsw i32 %59, %58
store i32 %add63, i32* %c, align 4
%60 = load i32, i32* %xy, align 4
%61 = load i32, i32* %W, align 4
%add64 = add nsw i32 %61, %60
store i32 %add64, i32* %W, align 4
%62 = load i32, i32* %xy, align 4
%63 = load i32, i32* %E, align 4
%add65 = add nsw i32 %63, %62
store i32 %add65, i32* %E, align 4
%64 = load i32, i32* %xy, align 4
%65 = load i32, i32* %N, align 4
%add66 = add nsw i32 %65, %64
store i32 %add66, i32* %N, align 4
%66 = load i32, i32* %xy, align 4
%67 = load i32, i32* %S, align 4
%add67 = add nsw i32 %67, %66
store i32 %add67, i32* %S, align 4
store i32 1, i32* %k, align 4
br label %for.cond
for.cond: ; preds = %for.inc, %cond.end29
%68 = load i32, i32* %k, align 4
%69 = load i32, i32* %nz.addr, align 4
%sub68 = sub nsw i32 %69, 1
%cmp69 = icmp slt i32 %68, %sub68
br i1 %cmp69, label %for.body, label %for.end
for.body: ; preds = %for.cond
%70 = load float, float* %temp2, align 4
store float %70, float* %temp1, align 4
%71 = load float, float* %temp3, align 4
store float %71, float* %temp2, align 4
%72 = load float*, float** %tIn.addr, align 8
%73 = load i32, i32* %c, align 4
%74 = load i32, i32* %xy, align 4
%add70 = add nsw i32 %73, %74
%idxprom71 = sext i32 %add70 to i64
%arrayidx72 = getelementptr inbounds float, float* %72, i64 %idxprom71
%75 = load float, float* %arrayidx72, align 4
store float %75, float* %temp3, align 4
%76 = load float, float* %cc.addr, align 4
%77 = load float, float* %temp2, align 4
%mul73 = fmul contract float %76, %77
%78 = load float, float* %cw.addr, align 4
%79 = load float*, float** %tIn.addr, align 8
%80 = load i32, i32* %W, align 4
%idxprom74 = sext i32 %80 to i64
%arrayidx75 = getelementptr inbounds float, float* %79, i64 %idxprom74
%81 = load float, float* %arrayidx75, align 4
%mul76 = fmul contract float %78, %81
%add77 = fadd contract float %mul73, %mul76
%82 = load float, float* %ce.addr, align 4
%83 = load float*, float** %tIn.addr, align 8
%84 = load i32, i32* %E, align 4
%idxprom78 = sext i32 %84 to i64
%arrayidx79 = getelementptr inbounds float, float* %83, i64 %idxprom78
%85 = load float, float* %arrayidx79, align 4
%mul80 = fmul contract float %82, %85
%add81 = fadd contract float %add77, %mul80
%86 = load float, float* %cs.addr, align 4
%87 = load float*, float** %tIn.addr, align 8
%88 = load i32, i32* %S, align 4
%idxprom82 = sext i32 %88 to i64
%arrayidx83 = getelementptr inbounds float, float* %87, i64 %idxprom82
%89 = load float, float* %arrayidx83, align 4
%mul84 = fmul contract float %86, %89
%add85 = fadd contract float %add81, %mul84
%90 = load float, float* %cn.addr, align 4
%91 = load float*, float** %tIn.addr, align 8
%92 = load i32, i32* %N, align 4
%idxprom86 = sext i32 %92 to i64
%arrayidx87 = getelementptr inbounds float, float* %91, i64 %idxprom86
%93 = load float, float* %arrayidx87, align 4
%mul88 = fmul contract float %90, %93
%add89 = fadd contract float %add85, %mul88
%94 = load float, float* %cb.addr, align 4
%95 = load float, float* %temp1, align 4
%mul90 = fmul contract float %94, %95
%add91 = fadd contract float %add89, %mul90
%96 = load float, float* %ct.addr, align 4
%97 = load float, float* %temp3, align 4
%mul92 = fmul contract float %96, %97
%add93 = fadd contract float %add91, %mul92
%98 = load float, float* %sdc.addr, align 4
%99 = load float*, float** %p.addr, align 8
%100 = load i32, i32* %c, align 4
%idxprom94 = sext i32 %100 to i64
%arrayidx95 = getelementptr inbounds float, float* %99, i64 %idxprom94
%101 = load float, float* %arrayidx95, align 4
%mul96 = fmul contract float %98, %101
%add97 = fadd contract float %add93, %mul96
%102 = load float, float* %ct.addr, align 4
%103 = load float, float* %amb_temp, align 4
%mul98 = fmul contract float %102, %103
%add99 = fadd contract float %add97, %mul98
%104 = load float*, float** %tOut.addr, align 8
%105 = load i32, i32* %c, align 4
%idxprom100 = sext i32 %105 to i64
%arrayidx101 = getelementptr inbounds float, float* %104, i64 %idxprom100
store float %add99, float* %arrayidx101, align 4
%106 = load i32, i32* %xy, align 4
%107 = load i32, i32* %c, align 4
%add102 = add nsw i32 %107, %106
store i32 %add102, i32* %c, align 4
%108 = load i32, i32* %xy, align 4
%109 = load i32, i32* %W, align 4
%add103 = add nsw i32 %109, %108
store i32 %add103, i32* %W, align 4
%110 = load i32, i32* %xy, align 4
%111 = load i32, i32* %E, align 4
%add104 = add nsw i32 %111, %110
store i32 %add104, i32* %E, align 4
%112 = load i32, i32* %xy, align 4
%113 = load i32, i32* %N, align 4
%add105 = add nsw i32 %113, %112
store i32 %add105, i32* %N, align 4
%114 = load i32, i32* %xy, align 4
%115 = load i32, i32* %S, align 4
%add106 = add nsw i32 %115, %114
store i32 %add106, i32* %S, align 4
br label %for.inc
for.inc: ; preds = %for.body
%116 = load i32, i32* %k, align 4
%inc = add nsw i32 %116, 1
store i32 %inc, i32* %k, align 4
br label %for.cond
for.end: ; preds = %for.cond
%117 = load float, float* %temp2, align 4
store float %117, float* %temp1, align 4
%118 = load float, float* %temp3, align 4
store float %118, float* %temp2, align 4
%119 = load float, float* %cc.addr, align 4
%120 = load float, float* %temp2, align 4
%mul107 = fmul contract float %119, %120
%121 = load float, float* %cw.addr, align 4
%122 = load float*, float** %tIn.addr, align 8
%123 = load i32, i32* %W, align 4
%idxprom108 = sext i32 %123 to i64
%arrayidx109 = getelementptr inbounds float, float* %122, i64 %idxprom108
%124 = load float, float* %arrayidx109, align 4
%mul110 = fmul contract float %121, %124
%add111 = fadd contract float %mul107, %mul110
%125 = load float, float* %ce.addr, align 4
%126 = load float*, float** %tIn.addr, align 8
%127 = load i32, i32* %E, align 4
%idxprom112 = sext i32 %127 to i64
%arrayidx113 = getelementptr inbounds float, float* %126, i64 %idxprom112
%128 = load float, float* %arrayidx113, align 4
%mul114 = fmul contract float %125, %128
%add115 = fadd contract float %add111, %mul114
%129 = load float, float* %cs.addr, align 4
%130 = load float*, float** %tIn.addr, align 8
%131 = load i32, i32* %S, align 4
%idxprom116 = sext i32 %131 to i64
%arrayidx117 = getelementptr inbounds float, float* %130, i64 %idxprom116
%132 = load float, float* %arrayidx117, align 4
%mul118 = fmul contract float %129, %132
%add119 = fadd contract float %add115, %mul118
%133 = load float, float* %cn.addr, align 4
%134 = load float*, float** %tIn.addr, align 8
%135 = load i32, i32* %N, align 4
%idxprom120 = sext i32 %135 to i64
%arrayidx121 = getelementptr inbounds float, float* %134, i64 %idxprom120
%136 = load float, float* %arrayidx121, align 4
%mul122 = fmul contract float %133, %136
%add123 = fadd contract float %add119, %mul122
%137 = load float, float* %cb.addr, align 4
%138 = load float, float* %temp1, align 4
%mul124 = fmul contract float %137, %138
%add125 = fadd contract float %add123, %mul124
%139 = load float, float* %ct.addr, align 4
%140 = load float, float* %temp3, align 4
%mul126 = fmul contract float %139, %140
%add127 = fadd contract float %add125, %mul126
%141 = load float, float* %sdc.addr, align 4
%142 = load float*, float** %p.addr, align 8
%143 = load i32, i32* %c, align 4
%idxprom128 = sext i32 %143 to i64
%arrayidx129 = getelementptr inbounds float, float* %142, i64 %idxprom128
%144 = load float, float* %arrayidx129, align 4
%mul130 = fmul contract float %141, %144
%add131 = fadd contract float %add127, %mul130
%145 = load float, float* %ct.addr, align 4
%146 = load float, float* %amb_temp, align 4
%mul132 = fmul contract float %145, %146
%add133 = fadd contract float %add131, %mul132
%147 = load float*, float** %tOut.addr, align 8
%148 = load i32, i32* %c, align 4
%idxprom134 = sext i32 %148 to i64
%arrayidx135 = getelementptr inbounds float, float* %147, i64 %idxprom134
store float %add133, float* %arrayidx135, align 4
ret void
}
; Function Attrs: alwaysinline convergent nounwind
define linkonce_odr dso_local i32 @_ZN25__cuda_builtin_blockDim_t17__fetch_builtin_xEv() #1 comdat align 2 {
entry:
%0 = call i32 @llvm.nvvm.read.ptx.sreg.ntid.x()
ret i32 %0
}
; Function Attrs: alwaysinline convergent nounwind
define linkonce_odr dso_local i32 @_ZN25__cuda_builtin_blockIdx_t17__fetch_builtin_xEv() #1 comdat align 2 {
entry:
%0 = call i32 @llvm.nvvm.read.ptx.sreg.ctaid.x()
ret i32 %0
}
; Function Attrs: alwaysinline convergent nounwind
define linkonce_odr dso_local i32 @_ZN26__cuda_builtin_threadIdx_t17__fetch_builtin_xEv() #1 comdat align 2 {
entry:
%0 = call i32 @llvm.nvvm.read.ptx.sreg.tid.x()
ret i32 %0
}
; Function Attrs: alwaysinline convergent nounwind
define linkonce_odr dso_local i32 @_ZN25__cuda_builtin_blockDim_t17__fetch_builtin_yEv() #1 comdat align 2 {
entry:
%0 = call i32 @llvm.nvvm.read.ptx.sreg.ntid.y()
ret i32 %0
}
; Function Attrs: alwaysinline convergent nounwind
define linkonce_odr dso_local i32 @_ZN25__cuda_builtin_blockIdx_t17__fetch_builtin_yEv() #1 comdat align 2 {
entry:
%0 = call i32 @llvm.nvvm.read.ptx.sreg.ctaid.y()
ret i32 %0
}
; Function Attrs: alwaysinline convergent nounwind
define linkonce_odr dso_local i32 @_ZN26__cuda_builtin_threadIdx_t17__fetch_builtin_yEv() #1 comdat align 2 {
entry:
%0 = call i32 @llvm.nvvm.read.ptx.sreg.tid.y()
ret i32 %0
}
; Function Attrs: nounwind readnone
declare i32 @llvm.nvvm.read.ptx.sreg.ntid.x() #2
; Function Attrs: nounwind readnone
declare i32 @llvm.nvvm.read.ptx.sreg.ctaid.x() #2
; Function Attrs: nounwind readnone
declare i32 @llvm.nvvm.read.ptx.sreg.tid.x() #2
; Function Attrs: nounwind readnone
declare i32 @llvm.nvvm.read.ptx.sreg.ntid.y() #2
; Function Attrs: nounwind readnone
declare i32 @llvm.nvvm.read.ptx.sreg.ctaid.y() #2
; Function Attrs: nounwind readnone
declare i32 @llvm.nvvm.read.ptx.sreg.tid.y() #2
attributes #0 = { convergent noinline nounwind optnone "correctly-rounded-divide-sqrt-fp-math"="false" "disable-tail-calls"="false" "frame-pointer"="all" "less-precise-fpmad"="false" "min-legal-vector-width"="0" "no-infs-fp-math"="false" "no-jump-tables"="false" "no-nans-fp-math"="false" "no-signed-zeros-fp-math"="false" "no-trapping-math"="false" "stack-protector-buffer-size"="8" "target-cpu"="sm_61" "target-features"="+ptx64,+sm_61" "unsafe-fp-math"="false" "use-soft-float"="false" }
attributes #1 = { alwaysinline convergent nounwind "correctly-rounded-divide-sqrt-fp-math"="false" "disable-tail-calls"="false" "frame-pointer"="all" "less-precise-fpmad"="false" "min-legal-vector-width"="0" "no-infs-fp-math"="false" "no-jump-tables"="false" "no-nans-fp-math"="false" "no-signed-zeros-fp-math"="false" "no-trapping-math"="false" "stack-protector-buffer-size"="8" "target-cpu"="sm_61" "target-features"="+ptx64,+sm_61" "unsafe-fp-math"="false" "use-soft-float"="false" }
attributes #2 = { nounwind readnone }
attributes #3 = { convergent nounwind }
!llvm.module.flags = !{!0, !1, !2}
!nvvm.annotations = !{!3, !4, !5, !4, !6, !6, !6, !6, !7, !7, !6}
!llvm.ident = !{!8}
!nvvmir.version = !{!9}
!0 = !{i32 2, !"SDK Version", [2 x i32] [i32 10, i32 1]}
!1 = !{i32 1, !"wchar_size", i32 4}
!2 = !{i32 4, !"nvvm-reflect-ftz", i32 0}
!3 = !{void (float*, float*, float*, float, i32, i32, i32, float, float, float, float, float, float, float)* @_Z11hotspotOpt1PfS_S_fiiifffffff, !"kernel", i32 1}
!4 = !{null, !"align", i32 8}
!5 = !{null, !"align", i32 8, !"align", i32 65544, !"align", i32 131080}
!6 = !{null, !"align", i32 16}
!7 = !{null, !"align", i32 16, !"align", i32 65552, !"align", i32 131088}
!8 = !{!"clang version 10.0.1 (https://github.com/llvm/llvm-project.git ef32c611aa214dea855364efd7ba451ec5ec3f74)"}
!9 = !{i32 1, i32 4}

File diff suppressed because one or more lines are too long


@ -1,205 +0,0 @@
#include <assert.h>
#include <math.h>
#include <stdio.h>
#include <stdlib.h>
#include <sys/time.h>
#include <time.h>
#define BLOCK_SIZE 16
#define STR_SIZE 256
#define block_x_ 128
#define block_y_ 2
#define block_z_ 1
#define MAX_PD (3.0e6)
/* required precision in degrees */
#define PRECISION 0.001
#define SPEC_HEAT_SI 1.75e6
#define K_SI 100
/* capacitance fitting factor */
#define FACTOR_CHIP 0.5
#include "opt1.cu"
/* chip parameters */
float t_chip = 0.0005;
float chip_height = 0.016;
float chip_width = 0.016;
/* ambient temperature, assuming no package at all */
float amb_temp = 80.0;
void fatal(const char *s) { fprintf(stderr, "Error: %s\n", s); }
void readinput(float *vect, int grid_rows, int grid_cols, int layers,
char *file) {
int i, j, k;
FILE *fp;
char str[STR_SIZE];
float val;
if ((fp = fopen(file, "r")) == 0)
fatal("The file was not opened");
for (i = 0; i <= grid_rows - 1; i++)
for (j = 0; j <= grid_cols - 1; j++)
for (k = 0; k <= layers - 1; k++) {
if (fgets(str, STR_SIZE, fp) == NULL)
fatal("Error reading file\n");
if (feof(fp))
fatal("not enough lines in file");
if ((sscanf(str, "%f", &val) != 1))
fatal("invalid file format");
vect[i * grid_cols + j + k * grid_rows * grid_cols] = val;
}
fclose(fp);
}
void writeoutput(float *vect, int grid_rows, int grid_cols, int layers,
char *file) {
int i, j, k, index = 0;
FILE *fp;
char str[STR_SIZE];
if ((fp = fopen(file, "w")) == 0)
printf("The file was not opened\n");
for (i = 0; i < grid_rows; i++)
for (j = 0; j < grid_cols; j++)
for (k = 0; k < layers; k++) {
sprintf(str, "%d\t%g\n", index,
vect[i * grid_cols + j + k * grid_rows * grid_cols]);
fputs(str, fp);
index++;
}
fclose(fp);
}
void computeTempCPU(float *pIn, float *tIn, float *tOut, int nx, int ny, int nz,
float Cap, float Rx, float Ry, float Rz, float dt,
int numiter) {
float ce, cw, cn, cs, ct, cb, cc;
float stepDivCap = dt / Cap;
ce = cw = stepDivCap / Rx;
cn = cs = stepDivCap / Ry;
ct = cb = stepDivCap / Rz;
cc = 1.0 - (2.0 * ce + 2.0 * cn + 3.0 * ct);
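  // Annotation (not in the original source): the update in the loop below is
  // an explicit 7-point stencil; cc folds in the center cell's own
  // contribution, and the extra ct * amb_temp term models heat exchange with
  // the ambient above the chip.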
int c, w, e, n, s, b, t;
int x, y, z;
int i = 0;
do {
for (z = 0; z < nz; z++)
for (y = 0; y < ny; y++)
for (x = 0; x < nx; x++) {
c = x + y * nx + z * nx * ny;
w = (x == 0) ? c : c - 1;
e = (x == nx - 1) ? c : c + 1;
n = (y == 0) ? c : c - nx;
s = (y == ny - 1) ? c : c + nx;
b = (z == 0) ? c : c - nx * ny;
t = (z == nz - 1) ? c : c + nx * ny;
tOut[c] = tIn[c] * cc + tIn[n] * cn + tIn[s] * cs + tIn[e] * ce +
tIn[w] * cw + tIn[t] * ct + tIn[b] * cb +
(dt / Cap) * pIn[c] + ct * amb_temp;
}
float *temp = tIn;
tIn = tOut;
tOut = temp;
i++;
} while (i < numiter);
}
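/* Annotation (not in the original source): accuracy() below reports the
   root-mean-square difference between the GPU result and the CPU reference. */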
float accuracy(float *arr1, float *arr2, int len) {
float err = 0.0;
int i;
for (i = 0; i < len; i++) {
err += (arr1[i] - arr2[i]) * (arr1[i] - arr2[i]);
}
return (float)sqrt(err / len);
}
void usage(int argc, char **argv) {
fprintf(stderr,
"Usage: %s <rows/cols> <layers> <iterations> <powerFile> <tempFile> "
"<outputFile>\n",
argv[0]);
fprintf(
stderr,
"\t<rows/cols> - number of rows/cols in the grid (positive integer)\n");
fprintf(stderr,
"\t<layers> - number of layers in the grid (positive integer)\n");
fprintf(stderr, "\t<iteration> - number of iterations\n");
fprintf(stderr, "\t<powerFile> - name of the file containing the initial "
"power values of each cell\n");
fprintf(stderr, "\t<tempFile> - name of the file containing the initial "
"temperature values of each cell\n");
fprintf(stderr, "\t<outputFile - output file\n");
exit(1);
}
int main(int argc, char **argv) {
cudaSetDevice(0);
if (argc != 7) {
usage(argc, argv);
}
char *pfile, *tfile, *ofile;
int iterations = atoi(argv[3]);
pfile = argv[4];
tfile = argv[5];
ofile = argv[6];
int numCols = atoi(argv[1]);
int numRows = atoi(argv[1]);
int layers = atoi(argv[2]);
/* calculating parameters*/
float dx = chip_height / numRows;
float dy = chip_width / numCols;
float dz = t_chip / layers;
float Cap = FACTOR_CHIP * SPEC_HEAT_SI * t_chip * dx * dy;
float Rx = dy / (2.0 * K_SI * t_chip * dx);
float Ry = dx / (2.0 * K_SI * t_chip * dy);
float Rz = dz / (K_SI * dx * dy);
float max_slope = MAX_PD / (FACTOR_CHIP * t_chip * SPEC_HEAT_SI);
float dt = PRECISION / max_slope;
float *powerIn, *tempOut, *tempIn, *tempCopy;
int size = numCols * numRows * layers;
powerIn = (float *)calloc(size, sizeof(float));
tempCopy = (float *)malloc(size * sizeof(float));
tempIn = (float *)calloc(size, sizeof(float));
tempOut = (float *)calloc(size, sizeof(float));
float *answer = (float *)calloc(size, sizeof(float));
readinput(powerIn, numRows, numCols, layers, pfile);
readinput(tempIn, numRows, numCols, layers, tfile);
memcpy(tempCopy, tempIn, size * sizeof(float));
hotspot_opt1(powerIn, tempIn, tempOut, numCols, numRows, layers, Cap, Rx, Ry,
Rz, dt, iterations);
computeTempCPU(powerIn, tempCopy, answer, numCols, numRows, layers, Cap, Rx,
Ry, Rz, dt, iterations);
float acc = accuracy(tempOut, answer, numRows * numCols * layers);
printf("Accuracy: %e\n", acc);
writeoutput(tempOut, numRows, numCols, layers, ofile);
free(tempIn);
free(tempOut);
free(powerIn);
return 0;
}


@ -1,22 +0,0 @@
#!/bin/bash
set -e
llvm-as 3D-cuda-nvptx64-nvidia-cuda-sm_61.ll
llvm-as 3D-host-x86_64-unknown-linux-gnu.ll
../../build/compilation/kernelTranslator 3D-cuda-nvptx64-nvidia-cuda-sm_61.bc kernel.bc
../../build/compilation/hostTranslator 3D-host-x86_64-unknown-linux-gnu.bc host.bc
llc --relocation-model=pic --filetype=obj kernel.bc
llc --relocation-model=pic --filetype=obj host.bc
g++ -g -Wall -L../../build/runtime -L../../build/runtime/threadPool -o 3D \
-fPIC -no-pie host.o kernel.o -lc -lx86Runtime -lthreadPool -lpthread
export LD_LIBRARY_PATH=../../build/runtime:../../build/runtime/threadPool:$LD_LIBRARY_PATH
./3D 512 8 100 ../../rodinia-data/hotspot3D/power_512x8 ../../rodinia-data/hotspot3D/temp_512x8 output.out
if head output.out | grep -q "334.017"; then
echo "Pass"
else
echo "Error result"
exit 1
fi


@ -1,24 +0,0 @@
#ifndef _COMPARISON_HELPERS_H_
#define _COMPARISON_HELPERS_H_
#include <stdio.h>
template <typename T>
__inline int compare_vectors(T *data1, T *data2, unsigned int size) {
printf("Comparing vectors: \n");
bool match = true;
for (unsigned int i = 0; i < size; i++)
if (data1[i] != data2[i]) {
match = false;
printf("Diff: data1[%d]=%d, data1[%d]=%d.\n", i, data1[i], i, data2[i]);
}
if (match) {
printf("PASS! vectors are matching!\n");
return 0;
} else {
printf("FAIL! vectors are NOT matching!\n");
exit(1);
return -1;
}
}
#endif


@ -1,116 +0,0 @@
#include "stdafx.h"
#include "cpuencode.h"
#include "print_helpers.h"
using namespace std;
#if 1
// The max. codeword length for each byte symbol is 32-bits
extern "C" void cpu_vlc_encode(unsigned int *indata, unsigned int num_elements,
unsigned int *outdata, unsigned int *outsize,
unsigned int *codewords,
unsigned int *codewordlens) {
unsigned int *bitstreamPt =
(unsigned int *)outdata; /* Pointer to current byte */
*bitstreamPt = 0x00000000U;
unsigned int startbit = 0;
unsigned int totalBytes = 0;
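  /* Worked example (annotation, not in the original source): if the current
     output word already holds 30 bits (startbit == 30) and the next codeword
     is 5 bits long, writebits == 2, so 2 bits are appended here and the
     remaining 3 bits spill into the next 32-bit word on the following pass of
     the while loop. */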
for (unsigned int k = 0; k < num_elements; k++) {
unsigned int cw32 = 0;
unsigned int val32 = indata[k];
unsigned int numbits = 0;
unsigned int mask32;
for (unsigned int i = 0; i < 4; i++) {
unsigned char symbol = (unsigned char)(val32 >> (8 * (3 - i)));
cw32 = codewords[symbol];
numbits = codewordlens[symbol];
while (numbits > 0) {
int writebits = min(32 - startbit, numbits);
if (numbits == writebits)
mask32 = (cw32 & ((1 << numbits) - 1))
<< (32 - startbit -
numbits); // first make sure that the start of the word
// is clean, then shift to the left as many
// places as you need
else
mask32 = cw32 >>
(numbits - writebits); // shift out the bits that can not fit
*bitstreamPt = (*bitstreamPt) | mask32;
numbits = numbits - writebits;
startbit = (startbit + writebits) % 32;
if (startbit == 0) {
bitstreamPt++;
*bitstreamPt = 0x00000000;
totalBytes += 4;
}
}
}
}
totalBytes += (startbit / 8) +
((startbit % 8 == 0) ? 0 : 1); // return aligned to 8-bits
*outsize = totalBytes;
}
//////////////////////////////////////////////////////////////////////
/// ALTERNATIVE CODER
/// ASSUMPTION: The max. length of 4 combined codewords can be 2x original data,
/// i.e. 64 bits
///////////////////////////////////////////////////////////////////////
#else
extern "C" void cpu_vlc_encode(unsigned int *indata, unsigned int num_elements,
unsigned int *outdata, unsigned int *outsize,
unsigned int *codewords,
unsigned int *codewordlens) {
unsigned int *bitstreamPt =
(unsigned int *)outdata; /* Pointer to current byte */
// assume memset is done.
*bitstreamPt = 0x00000000U;
unsigned int startbit = 0;
unsigned int totalBytes = 0;
for (unsigned int k = 0; k < num_elements; k++) {
unsigned long long cw64 = 0, mask64 = 0;
unsigned int val32 = indata[k];
unsigned int numbits = 0;
unsigned int mask32, temp32;
for (unsigned int i = 0; i < 4; i++) {
unsigned char symbol = (unsigned char)(val32 >> (8 * (3 - i)));
cw64 = (cw64 << codewordlens[symbol]) | codewords[symbol];
numbits += codewordlens[symbol];
// if (numbits>32) printf("WARRNING! Element %d is combined into numbits =
// %d!!!!!!!\n", k, numbits);
}
while (numbits > 0) {
int writebits = min(32 - startbit, numbits);
if (numbits == writebits) {
temp32 = (unsigned int)cw64; //(cw64 & 0xFFFFFFFF);
mask32 = temp32 << (32 - startbit - numbits);
} else {
mask32 = (unsigned int)(cw64 >> (numbits - writebits));
cw64 = cw64 & ((1 << (numbits - writebits)) - 1);
}
*bitstreamPt = (*bitstreamPt) | mask32;
numbits = numbits - writebits;
startbit = (startbit + writebits) % 32;
if (startbit == 0) {
bitstreamPt++;
*bitstreamPt = 0x00000000;
totalBytes += 4;
}
}
}
totalBytes += (startbit / 8) +
((startbit % 8 == 0) ? 0 : 1); // return aligned to 8-bits
*outsize = totalBytes;
}
#endif


@ -1,8 +0,0 @@
#ifndef _CE_H_
#define _CE_H_
extern "C" void cpu_vlc_encode(unsigned int *indata, unsigned int num_elements,
unsigned int *outdata, unsigned int *outsize,
unsigned int *codewords,
unsigned int *codewordlens);
#endif


@ -1,20 +0,0 @@
#ifndef __CUDA_HELPERS__
#define __CUDA_HELPERS__
#include <stdio.h>
/************************************************************************/
/* Init CUDA */
/************************************************************************/
#if __DEVICE_EMULATION__
bool InitCUDA(void) { return true; }
#else
bool InitCUDA(void) {
cudaSetDevice(0);
printf("CUDA initialized.\n");
return true;
}
#endif
#endif


@ -1,931 +0,0 @@
/*
* Copyright 1993-2010 NVIDIA Corporation. All rights reserved.
*
* Please refer to the NVIDIA end user license agreement (EULA) associated
* with this source code for terms and conditions that govern your use of
* this software. Any use, reproduction, disclosure, or distribution of
* this software and related documentation outside the terms of the EULA
* is strictly prohibited.
*
*/
/*
* Copyright 1993-2010 NVIDIA Corporation. All rights reserved.
*
* Please refer to the NVIDIA end user license agreement (EULA) associated
* with this source code for terms and conditions that govern your use of
* this software. Any use, reproduction, disclosure, or distribution of
* this software and related documentation outside the terms of the EULA
* is strictly prohibited.
*
*/
/* CUda UTility Library */
#ifndef _CUTIL_H_
#define _CUTIL_H_
#ifdef _WIN32
#pragma warning(disable : 4996) // disable deprecated warning
#endif
#include <stdio.h>
#include <stdlib.h>
// helper typedefs for building DLL
#ifdef _WIN32
#ifdef BUILD_DLL
#define DLL_MAPPING __declspec(dllexport)
#else
#define DLL_MAPPING __declspec(dllimport)
#endif
#else
#define DLL_MAPPING
#endif
#ifdef _WIN32
#define CUTIL_API __stdcall
#else
#define CUTIL_API
#endif
////////////////////////////////////////////////////////////////////////////
//! CUT bool type
////////////////////////////////////////////////////////////////////////////
enum CUTBoolean { CUTFalse = 0, CUTTrue = 1 };
////////////////////////////////////////////////////////////////////////////
//! Deallocate memory allocated within Cutil
//! @param pointer to memory
////////////////////////////////////////////////////////////////////////////
DLL_MAPPING
void CUTIL_API cutFree(void *ptr);
////////////////////////////////////////////////////////////////////////////
//! Helper for bank conflict checking (should only be used with the
//! CUT_BANK_CHECKER macro)
//! @param tidx thread id in x dimension of block
//! @param tidy thread id in y dimension of block
//! @param tidz thread id in z dimension of block
//! @param bdimx block size in x dimension
//! @param bdimy block size in y dimension
//! @param bdimz block size in z dimension
//! @param file name of the source file where the access takes place
//! @param line line in the source file where the access takes place
//! @param aname name of the array which is accessed
//! @param index index into the array
////////////////////////////////////////////////////////////////////////////
DLL_MAPPING
void CUTIL_API cutCheckBankAccess(unsigned int tidx, unsigned int tidy,
unsigned int tidz, unsigned int bdimx,
unsigned int bdimy, unsigned int bdimz,
const char *file, const int line,
const char *aname, const int index);
////////////////////////////////////////////////////////////////////////////
//! Find the path for a filename
//! @return the path if succeeded, otherwise 0
//! @param filename name of the file
//! @param executablePath optional absolute path of the executable
////////////////////////////////////////////////////////////////////////////
DLL_MAPPING
char *CUTIL_API cutFindFilePath(const char *filename,
const char *executablePath);
////////////////////////////////////////////////////////////////////////////
//! Read file \filename containing single precision floating point data
//! @return CUTTrue if reading the file succeeded, otherwise false
//! @param filename name of the source file
//! @param data uninitialized pointer, returned initialized and pointing to
//! the data read
//! @param len number of data elements in data, -1 on error
//! @note If a NULL pointer is passed to this function and it is
//! initialized within Cutil then cutFree() has to be used to
//! deallocate the memory
////////////////////////////////////////////////////////////////////////////
DLL_MAPPING
CUTBoolean CUTIL_API cutReadFilef(const char *filename, float **data,
unsigned int *len, bool verbose = false);
////////////////////////////////////////////////////////////////////////////
//! Read file \filename containing double precision floating point data
//! @return CUTTrue if reading the file succeeded, otherwise false
//! @param filename name of the source file
//! @param data uninitialized pointer, returned initialized and pointing to
//! the data read
//! @param len number of data elements in data, -1 on error
//! @note If a NULL pointer is passed to this function and it is
//! initialized within Cutil then cutFree() has to be used to
//! deallocate the memory
////////////////////////////////////////////////////////////////////////////
DLL_MAPPING
CUTBoolean CUTIL_API cutReadFiled(const char *filename, double **data,
unsigned int *len, bool verbose = false);
////////////////////////////////////////////////////////////////////////////
//! Read file \filename containing integer data
//! @return CUTTrue if reading the file succeeded, otherwise false
//! @param filename name of the source file
//! @param data uninitialized pointer, returned initialized and pointing to
//! the data read
//! @param len number of data elements in data, -1 on error
//! @note If a NULL pointer is passed to this function and it is
//! initialized within Cutil then cutFree() has to be used to
//! deallocate the memory
////////////////////////////////////////////////////////////////////////////
DLL_MAPPING
CUTBoolean CUTIL_API cutReadFilei(const char *filename, int **data,
unsigned int *len, bool verbose = false);
////////////////////////////////////////////////////////////////////////////
//! Read file \filename containing unsigned integer data
//! @return CUTTrue if reading the file succeeded, otherwise false
//! @param filename name of the source file
//! @param data uninitialized pointer, returned initialized and pointing to
//! the data read
//! @param len number of data elements in data, -1 on error
//! @note If a NULL pointer is passed to this function and it is
//! initialized within Cutil then cutFree() has to be used to
//! deallocate the memory
////////////////////////////////////////////////////////////////////////////
DLL_MAPPING
CUTBoolean CUTIL_API cutReadFileui(const char *filename, unsigned int **data,
unsigned int *len, bool verbose = false);
////////////////////////////////////////////////////////////////////////////
//! Read file \filename containing char / byte data
//! @return CUTTrue if reading the file succeeded, otherwise false
//! @param filename name of the source file
//! @param data uninitialized pointer, returned initialized and pointing to
//! the data read
//! @param len number of data elements in data, -1 on error
//! @note If a NULL pointer is passed to this function and it is
//! initialized within Cutil then cutFree() has to be used to
//! deallocate the memory
////////////////////////////////////////////////////////////////////////////
DLL_MAPPING
CUTBoolean CUTIL_API cutReadFileb(const char *filename, char **data,
unsigned int *len, bool verbose = false);
////////////////////////////////////////////////////////////////////////////
//! Read file \filename containing unsigned char / byte data
//! @return CUTTrue if reading the file succeeded, otherwise false
//! @param filename name of the source file
//! @param data uninitialized pointer, returned initialized and pointing to
//! the data read
//! @param len number of data elements in data, -1 on error
//! @note If a NULL pointer is passed to this function and it is
//! initialized within Cutil then cutFree() has to be used to
//! deallocate the memory
////////////////////////////////////////////////////////////////////////////
DLL_MAPPING
CUTBoolean CUTIL_API cutReadFileub(const char *filename, unsigned char **data,
unsigned int *len, bool verbose = false);
////////////////////////////////////////////////////////////////////////////
//! Write a data file \filename containing single precision floating point
//! data
//! @return CUTTrue if writing the file succeeded, otherwise false
//! @param filename name of the file to write
//! @param data pointer to data to write
//! @param len number of data elements in data, -1 on error
//! @param epsilon epsilon for comparison
////////////////////////////////////////////////////////////////////////////
DLL_MAPPING
CUTBoolean CUTIL_API cutWriteFilef(const char *filename, const float *data,
unsigned int len, const float epsilon,
bool verbose = false);
////////////////////////////////////////////////////////////////////////////
//! Write a data file \filename containing double precision floating point
//! data
//! @return CUTTrue if writing the file succeeded, otherwise false
//! @param filename name of the file to write
//! @param data pointer to data to write
//! @param len number of data elements in data, -1 on error
//! @param epsilon epsilon for comparison
////////////////////////////////////////////////////////////////////////////
DLL_MAPPING
CUTBoolean CUTIL_API cutWriteFiled(const char *filename, const float *data,
unsigned int len, const double epsilon,
bool verbose = false);
////////////////////////////////////////////////////////////////////////////
//! Write a data file \filename containing integer data
//! @return CUTTrue if writing the file succeeded, otherwise false
//! @param filename name of the file to write
//! @param data pointer to data to write
//! @param len number of data elements in data, -1 on error
////////////////////////////////////////////////////////////////////////////
DLL_MAPPING
CUTBoolean CUTIL_API cutWriteFilei(const char *filename, const int *data,
unsigned int len, bool verbose = false);
////////////////////////////////////////////////////////////////////////////
//! Write a data file \filename containing unsigned integer data
//! @return CUTTrue if writing the file succeeded, otherwise false
//! @param filename name of the file to write
//! @param data pointer to data to write
//! @param len number of data elements in data, -1 on error
////////////////////////////////////////////////////////////////////////////
DLL_MAPPING
CUTBoolean CUTIL_API cutWriteFileui(const char *filename,
const unsigned int *data, unsigned int len,
bool verbose = false);
////////////////////////////////////////////////////////////////////////////
//! Write a data file \filename containing char / byte data
//! @return CUTTrue if writing the file succeeded, otherwise false
//! @param filename name of the file to write
//! @param data pointer to data to write
//! @param len number of data elements in data, -1 on error
////////////////////////////////////////////////////////////////////////////
DLL_MAPPING
CUTBoolean CUTIL_API cutWriteFileb(const char *filename, const char *data,
unsigned int len, bool verbose = false);
////////////////////////////////////////////////////////////////////////////
//! Write a data file \filename containing unsigned char / byte data
//! @return CUTTrue if writing the file succeeded, otherwise false
//! @param filename name of the file to write
//! @param data pointer to data to write
//! @param len number of data elements to write
////////////////////////////////////////////////////////////////////////////
DLL_MAPPING
CUTBoolean CUTIL_API cutWriteFileub(const char *filename,
const unsigned char *data, unsigned int len,
bool verbose = false);
////////////////////////////////////////////////////////////////////////////
//! Load PGM image file (with unsigned char as data element type)
//! @return CUTTrue if reading the file succeeded, otherwise false
//! @param file name of the image file
//! @param data handle to the data read
//! @param w width of the image
//! @param h height of the image
//! @note If a NULL pointer is passed to this function and it is
//! initialized within Cutil then cutFree() has to be used to
//! deallocate the memory
////////////////////////////////////////////////////////////////////////////
DLL_MAPPING
CUTBoolean CUTIL_API cutLoadPGMub(const char *file, unsigned char **data,
unsigned int *w, unsigned int *h);
////////////////////////////////////////////////////////////////////////////
//! Load PPM image file (with unsigned char as data element type)
//! @return CUTTrue if reading the file succeeded, otherwise false
//! @param file name of the image file
//! @param data handle to the data read
//! @param w width of the image
//! @param h height of the image
////////////////////////////////////////////////////////////////////////////
DLL_MAPPING
CUTBoolean CUTIL_API cutLoadPPMub(const char *file, unsigned char **data,
unsigned int *w, unsigned int *h);
////////////////////////////////////////////////////////////////////////////
//! Load PPM image file (with unsigned char as data element type), padding
//! 4th component
//! @return CUTTrue if reading the file succeeded, otherwise false
//! @param file name of the image file
//! @param data handle to the data read
//! @param w width of the image
//! @param h height of the image
////////////////////////////////////////////////////////////////////////////
DLL_MAPPING
CUTBoolean CUTIL_API cutLoadPPM4ub(const char *file, unsigned char **data,
unsigned int *w, unsigned int *h);
////////////////////////////////////////////////////////////////////////////
//! Load PGM image file (with unsigned int as data element type)
//! @return CUTTrue if reading the file succeeded, otherwise false
//! @param file name of the image file
//! @param data handle to the data read
//! @param w width of the image
//! @param h height of the image
//! @note If a NULL pointer is passed to this function and it is
//! initialized within Cutil then cutFree() has to be used to
//! deallocate the memory
////////////////////////////////////////////////////////////////////////////
DLL_MAPPING
CUTBoolean CUTIL_API cutLoadPGMi(const char *file, unsigned int **data,
unsigned int *w, unsigned int *h);
////////////////////////////////////////////////////////////////////////////
//! Load PGM image file (with unsigned short as data element type)
//! @return CUTTrue if reading the file succeeded, otherwise false
//! @param file name of the image file
//! @param data handle to the data read
//! @param w width of the image
//! @param h height of the image
//! @note If a NULL pointer is passed to this function and it is
//! initialized within Cutil then cutFree() has to be used to
//! deallocate the memory
////////////////////////////////////////////////////////////////////////////
DLL_MAPPING
CUTBoolean CUTIL_API cutLoadPGMs(const char *file, unsigned short **data,
unsigned int *w, unsigned int *h);
////////////////////////////////////////////////////////////////////////////
//! Load PGM image file (with float as data element type)
//! @param file name of the image file
//! @param data handle to the data read
//! @param w width of the image
//! @param h height of the image
//! @note If a NULL pointer is passed to this function and it is
//! initialized within Cutil then cutFree() has to be used to
//! deallocate the memory
////////////////////////////////////////////////////////////////////////////
DLL_MAPPING
CUTBoolean CUTIL_API cutLoadPGMf(const char *file, float **data,
unsigned int *w, unsigned int *h);
////////////////////////////////////////////////////////////////////////////
//! Save PGM image file (with unsigned char as data element type)
//! @param file name of the image file
//! @param data handle to the data read
//! @param w width of the image
//! @param h height of the image
////////////////////////////////////////////////////////////////////////////
DLL_MAPPING
CUTBoolean CUTIL_API cutSavePGMub(const char *file, unsigned char *data,
unsigned int w, unsigned int h);
////////////////////////////////////////////////////////////////////////////
//! Save PPM image file (with unsigned char as data element type)
//! @param file name of the image file
//! @param data handle to the data read
//! @param w width of the image
//! @param h height of the image
////////////////////////////////////////////////////////////////////////////
DLL_MAPPING
CUTBoolean CUTIL_API cutSavePPMub(const char *file, unsigned char *data,
unsigned int w, unsigned int h);
////////////////////////////////////////////////////////////////////////////
//! Save PPM image file (with unsigned char as data element type, padded to
//! 4 bytes)
//! @param file name of the image file
//! @param data handle to the data read
//! @param w width of the image
//! @param h height of the image
////////////////////////////////////////////////////////////////////////////
DLL_MAPPING
CUTBoolean CUTIL_API cutSavePPM4ub(const char *file, unsigned char *data,
unsigned int w, unsigned int h);
////////////////////////////////////////////////////////////////////////////
//! Save PGM image file (with unsigned int as data element type)
//! @param file name of the image file
//! @param data handle to the data read
//! @param w width of the image
//! @param h height of the image
////////////////////////////////////////////////////////////////////////////
DLL_MAPPING
CUTBoolean CUTIL_API cutSavePGMi(const char *file, unsigned int *data,
unsigned int w, unsigned int h);
////////////////////////////////////////////////////////////////////////////
//! Save PGM image file (with unsigned short as data element type)
//! @param file name of the image file
//! @param data handle to the data read
//! @param w width of the image
//! @param h height of the image
////////////////////////////////////////////////////////////////////////////
DLL_MAPPING
CUTBoolean CUTIL_API cutSavePGMs(const char *file, unsigned short *data,
unsigned int w, unsigned int h);
////////////////////////////////////////////////////////////////////////////
//! Save PGM image file (with float as data element type)
//! @param file name of the image file
//! @param data handle to the data read
//! @param w width of the image
//! @param h height of the image
////////////////////////////////////////////////////////////////////////////
DLL_MAPPING
CUTBoolean CUTIL_API cutSavePGMf(const char *file, float *data, unsigned int w,
unsigned int h);
////////////////////////////////////////////////////////////////////////////
// Command line arguments: General notes
// * All command line arguments begin with '--' followed by the token;
// token and value are separated by '='; example --samples=50
// * Arrays have the form --model=[one.obj,two.obj,three.obj]
// (without whitespaces)
////////////////////////////////////////////////////////////////////////////
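////////////////////////////////////////////////////////////////////////////
// Usage sketch (illustrative only; "samples" is an example argument name,
// not part of the library): parsing "--samples=50 --quiet" with the helpers
// declared below.
//
//   int samples = 0;
//   cutGetCmdLineArgumenti(argc, (const char **)argv, "samples", &samples);
//   if (cutCheckCmdLineFlag(argc, (const char **)argv, "quiet") == CUTFalse)
//     printf("samples = %d\n", samples);
////////////////////////////////////////////////////////////////////////////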
////////////////////////////////////////////////////////////////////////////
//! Check if command line argument \a flag_name is given
//! @return CUTTrue if command line argument \a flag_name has been given,
//! otherwise CUTFalse
//! @param argc argc as passed to main()
//! @param argv argv as passed to main()
//! @param flag_name name of command line flag
////////////////////////////////////////////////////////////////////////////
DLL_MAPPING
CUTBoolean CUTIL_API cutCheckCmdLineFlag(const int argc, const char **argv,
const char *flag_name);
////////////////////////////////////////////////////////////////////////////
//! Get the value of a command line argument of type int
//! @return CUTTrue if command line argument \a arg_name has been given and
//! is of the requested type, otherwise CUTFalse
//! @param argc argc as passed to main()
//! @param argv argv as passed to main()
//! @param arg_name name of the command line argument
//! @param val value of the command line argument
////////////////////////////////////////////////////////////////////////////
DLL_MAPPING
CUTBoolean CUTIL_API cutGetCmdLineArgumenti(const int argc, const char **argv,
const char *arg_name, int *val);
////////////////////////////////////////////////////////////////////////////
//! Get the value of a command line argument of type float
//! @return CUTTrue if command line argument \a arg_name has been given and
//! is of the requested type, otherwise CUTFalse
//! @param argc argc as passed to main()
//! @param argv argv as passed to main()
//! @param arg_name name of the command line argument
//! @param val value of the command line argument
////////////////////////////////////////////////////////////////////////////
DLL_MAPPING
CUTBoolean CUTIL_API cutGetCmdLineArgumentf(const int argc, const char **argv,
const char *arg_name, float *val);
////////////////////////////////////////////////////////////////////////////
//! Get the value of a command line argument of type string
//! @return CUTTrue if command line argument \a arg_name has been given and
//! is of the requested type, otherwise CUTFalse
//! @param argc argc as passed to main()
//! @param argv argv as passed to main()
//! @param arg_name name of the command line argument
//! @param val value of the command line argument
////////////////////////////////////////////////////////////////////////////
DLL_MAPPING
CUTBoolean CUTIL_API cutGetCmdLineArgumentstr(const int argc, const char **argv,
const char *arg_name, char **val);
////////////////////////////////////////////////////////////////////////////
//! Get the values of a command line argument list whose elements are strings
//! @return CUTTrue if command line argument \a arg_name has been given and
//! is of the requested type, otherwise CUTFalse
//! @param argc argc as passed to main()
//! @param argv argv as passed to main()
//! @param arg_name name of the command line argument
//! @param val command line argument list
//! @param len length of the list / number of elements
////////////////////////////////////////////////////////////////////////////
DLL_MAPPING
CUTBoolean CUTIL_API cutGetCmdLineArgumentListstr(const int argc,
const char **argv,
const char *arg_name,
char **val,
unsigned int *len);
////////////////////////////////////////////////////////////////////////////
//! Extended assert
//! @return CUTTrue if the condition \a val holds, otherwise CUTFalse
//! @param val condition to test
//! @param file __FILE__ macro
//! @param line __LINE__ macro
//! @note This function should be used via the CONDITION(val) macro
////////////////////////////////////////////////////////////////////////////
DLL_MAPPING
CUTBoolean CUTIL_API cutCheckCondition(int val, const char *file,
const int line);
////////////////////////////////////////////////////////////////////////////
//! Compare two float arrays
//! @return CUTTrue if \a reference and \a data are identical,
//! otherwise CUTFalse
//! @param reference handle to the reference data / gold image
//! @param data handle to the computed data
//! @param len number of elements in reference and data
////////////////////////////////////////////////////////////////////////////
DLL_MAPPING
CUTBoolean CUTIL_API cutComparef(const float *reference, const float *data,
const unsigned int len);
////////////////////////////////////////////////////////////////////////////
//! Compare two integer arrays
//! @return CUTTrue if \a reference and \a data are identical,
//! otherwise CUTFalse
//! @param reference handle to the reference data / gold image
//! @param data handle to the computed data
//! @param len number of elements in reference and data
////////////////////////////////////////////////////////////////////////////
DLL_MAPPING
CUTBoolean CUTIL_API cutComparei(const int *reference, const int *data,
const unsigned int len);
////////////////////////////////////////////////////////////////////////////////
//! Compare two unsigned integer arrays, with epsilon and threshold
//! @return CUTTrue if \a reference and \a data are identical,
//! otherwise CUTFalse
//! @param reference handle to the reference data / gold image
//! @param data handle to the computed data
//! @param len number of elements in reference and data
//! @param epsilon epsilon to use for the comparison
//! @param threshold tolerance % # of comparison errors (0.15f = 15%)
////////////////////////////////////////////////////////////////////////////////
DLL_MAPPING
CUTBoolean CUTIL_API cutCompareuit(const unsigned int *reference,
const unsigned int *data,
const unsigned int len, const float epsilon,
const float threshold);
////////////////////////////////////////////////////////////////////////////
//! Compare two unsigned char arrays
//! @return CUTTrue if \a reference and \a data are identical,
//! otherwise CUTFalse
//! @param reference handle to the reference data / gold image
//! @param data handle to the computed data
//! @param len number of elements in reference and data
////////////////////////////////////////////////////////////////////////////
DLL_MAPPING
CUTBoolean CUTIL_API cutCompareub(const unsigned char *reference,
const unsigned char *data,
const unsigned int len);
////////////////////////////////////////////////////////////////////////////////
//! Compare two unsigned char arrays, with an epsilon and a tolerance for # of byte errors
//! @return CUTTrue if \a reference and \a data are identical,
//! otherwise CUTFalse
//! @param reference handle to the reference data / gold image
//! @param data handle to the computed data
//! @param len number of elements in reference and data
//! @param epsilon epsilon to use for the comparison
//! @param threshold tolerance % # of comparison errors (0.15f = 15%)
////////////////////////////////////////////////////////////////////////////////
DLL_MAPPING
CUTBoolean CUTIL_API cutCompareubt(const unsigned char *reference,
const unsigned char *data,
const unsigned int len, const float epsilon,
const float threshold);
////////////////////////////////////////////////////////////////////////////////
//! Compare two unsigned char arrays with an epsilon tolerance for equality
//! @return CUTTrue if \a reference and \a data are identical,
//! otherwise CUTFalse
//! @param reference handle to the reference data / gold image
//! @param data handle to the computed data
//! @param len number of elements in reference and data
//! @param epsilon epsilon to use for the comparison
////////////////////////////////////////////////////////////////////////////////
DLL_MAPPING
CUTBoolean CUTIL_API cutCompareube(const unsigned char *reference,
const unsigned char *data,
const unsigned int len, const float epsilon);
////////////////////////////////////////////////////////////////////////////
//! Compare two float arrays with an epsilon tolerance for equality
//! @return CUTTrue if \a reference and \a data are identical,
//! otherwise CUTFalse
//! @param reference handle to the reference data / gold image
//! @param data handle to the computed data
//! @param len number of elements in reference and data
//! @param epsilon epsilon to use for the comparison
////////////////////////////////////////////////////////////////////////////
DLL_MAPPING
CUTBoolean CUTIL_API cutComparefe(const float *reference, const float *data,
const unsigned int len, const float epsilon);
////////////////////////////////////////////////////////////////////////////////
//! Compare two float arrays with an epsilon tolerance for equality and a
//! threshold for # pixel errors
//! @return CUTTrue if \a reference and \a data are identical,
//! otherwise CUTFalse
//! @param reference handle to the reference data / gold image
//! @param data handle to the computed data
//! @param len number of elements in reference and data
//! @param epsilon epsilon to use for the comparison
//! @param threshold tolerance % # of comparison errors (0.15f = 15%)
////////////////////////////////////////////////////////////////////////////////
DLL_MAPPING
CUTBoolean CUTIL_API cutComparefet(const float *reference, const float *data,
const unsigned int len, const float epsilon,
const float threshold);
////////////////////////////////////////////////////////////////////////////
//! Compare two float arrays using L2-norm with an epsilon tolerance for
//! equality
//! @return CUTTrue if \a reference and \a data are identical,
//! otherwise CUTFalse
//! @param reference handle to the reference data / gold image
//! @param data handle to the computed data
//! @param len number of elements in reference and data
//! @param epsilon epsilon to use for the comparison
////////////////////////////////////////////////////////////////////////////
DLL_MAPPING
CUTBoolean CUTIL_API cutCompareL2fe(const float *reference, const float *data,
const unsigned int len,
const float epsilon);
////////////////////////////////////////////////////////////////////////////////
//! Compare two PPM image files with an epsilon tolerance for equality
//! @return CUTTrue if \a reference and \a data are identical,
//! otherwise CUTFalse
//! @param src_file filename of the image to be compared
//! @param ref_file filename of the reference data / gold image
//! @param epsilon epsilon to use for the comparison
//! @param threshold fraction of pixels that may mismatch while the comparison
//! still passes (i.e. 0.15f = 15%)
//! @param verboseErrors output details of image mismatches to stderr
////////////////////////////////////////////////////////////////////////////////
DLL_MAPPING
CUTBoolean CUTIL_API cutComparePPM(const char *src_file, const char *ref_file,
const float epsilon, const float threshold,
bool verboseErrors = false);
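////////////////////////////////////////////////////////////////////////////
// Usage sketch (illustrative only; h_ref, h_out and n are hypothetical host
// arrays / sizes): checking GPU output against a CPU gold reference with an
// epsilon tolerance.
//
//   if (cutComparefe(h_ref, h_out, n, 1e-5f) == CUTTrue)
//     printf("Test PASSED\n");
//   else
//     printf("Test FAILED\n");
////////////////////////////////////////////////////////////////////////////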
////////////////////////////////////////////////////////////////////////////
//! Timer functionality
////////////////////////////////////////////////////////////////////////////
//! Create a new timer
//! @return CUTTrue if a timer has been created, otherwise false
//! @param name handle of the new timer, set to 0 if the creation failed
////////////////////////////////////////////////////////////////////////////
DLL_MAPPING
CUTBoolean CUTIL_API cutCreateTimer(unsigned int *name);
////////////////////////////////////////////////////////////////////////////
//! Delete a timer
//! @return CUTTrue if the timer has been deleted, otherwise false
//! @param name name of the timer to delete
////////////////////////////////////////////////////////////////////////////
DLL_MAPPING
CUTBoolean CUTIL_API cutDeleteTimer(unsigned int name);
////////////////////////////////////////////////////////////////////////////
//! Start the timer with name \a name
//! @param name name of the timer to start
////////////////////////////////////////////////////////////////////////////
DLL_MAPPING
CUTBoolean CUTIL_API cutStartTimer(const unsigned int name);
////////////////////////////////////////////////////////////////////////////
//! Stop the timer with name \a name. Does not reset.
//! @param name name of the timer to stop
////////////////////////////////////////////////////////////////////////////
DLL_MAPPING
CUTBoolean CUTIL_API cutStopTimer(const unsigned int name);
////////////////////////////////////////////////////////////////////////////
//! Resets the timer's counter.
//! @param name name of the timer to reset.
////////////////////////////////////////////////////////////////////////////
DLL_MAPPING
CUTBoolean CUTIL_API cutResetTimer(const unsigned int name);
////////////////////////////////////////////////////////////////////////////
//! Returns total execution time in milliseconds for the timer over all
//! runs since the last reset or timer creation.
//! @param name name of the timer to return the time of
////////////////////////////////////////////////////////////////////////////
DLL_MAPPING
float CUTIL_API cutGetTimerValue(const unsigned int name);
////////////////////////////////////////////////////////////////////////////
//! Return the average time in milliseconds for timer execution as the
//! total time for the timer divided by the number of completed (stopped)
//! runs the timer has made.
//! Excludes the current running time if the timer is currently running.
//! @param name name of the timer to return the time of
////////////////////////////////////////////////////////////////////////////
DLL_MAPPING
float CUTIL_API cutGetAverageTimerValue(const unsigned int name);
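////////////////////////////////////////////////////////////////////////////
// Usage sketch (illustrative only): timing a section of host/GPU work with
// the timer API declared above and the CUT_SAFE_CALL macro defined below.
//
//   unsigned int timer = 0;
//   CUT_SAFE_CALL(cutCreateTimer(&timer));
//   CUT_SAFE_CALL(cutStartTimer(timer));
//   /* ... work to be timed ... */
//   CUT_SAFE_CALL(cutStopTimer(timer));
//   printf("Elapsed: %f ms\n", cutGetTimerValue(timer));
//   CUT_SAFE_CALL(cutDeleteTimer(timer));
////////////////////////////////////////////////////////////////////////////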
////////////////////////////////////////////////////////////////////////////
//! Macros
#if CUDART_VERSION >= 4000
#define CUT_DEVICE_SYNCHRONIZE() cudaDeviceSynchronize();
#else
#define CUT_DEVICE_SYNCHRONIZE() cudaThreadSynchronize();
#endif
#if CUDART_VERSION >= 4000
#define CUT_DEVICE_RESET() cudaDeviceReset();
#else
#define CUT_DEVICE_RESET() cudaThreadExit();
#endif
// This is for the CUTIL bank checker
#ifdef _DEBUG
#if __DEVICE_EMULATION__
// Interface for bank conflict checker
#define CUT_BANK_CHECKER(array, index) \
(cutCheckBankAccess(threadIdx.x, threadIdx.y, threadIdx.z, blockDim.x, \
blockDim.y, blockDim.z, __FILE__, __LINE__, #array, \
index), \
array[index])
#else
#define CUT_BANK_CHECKER(array, index) array[index]
#endif
#else
#define CUT_BANK_CHECKER(array, index) array[index]
#endif
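// Usage sketch (illustrative): replace a shared-memory access such as
//   sdata[tid] = x;
// with
//   CUT_BANK_CHECKER(sdata, tid) = x;
// In _DEBUG device-emulation builds the access is routed through
// cutCheckBankAccess so bank conflicts are reported; in all other builds the
// macro reduces to the plain array access.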
#define CU_SAFE_CALL_NO_SYNC(call) \
{ \
CUresult err = call; \
if (CUDA_SUCCESS != err) { \
fprintf(stderr, "Cuda driver error %x in file '%s' in line %i.\n", err, \
__FILE__, __LINE__); \
exit(EXIT_FAILURE); \
} \
}
#define CU_SAFE_CALL(call) CU_SAFE_CALL_NO_SYNC(call);
#define CU_SAFE_CTX_SYNC() \
{ \
CUresult err = cuCtxSynchronize(); \
if (CUDA_SUCCESS != err) { \
fprintf(stderr, "Cuda driver error %x in file '%s' in line %i.\n", err, \
__FILE__, __LINE__); \
exit(EXIT_FAILURE); \
} \
}
#define CUDA_SAFE_CALL_NO_SYNC(call) \
{ \
cudaError err = call; \
if (cudaSuccess != err) { \
fprintf(stderr, "Cuda error in file '%s' in line %i : %s.\n", __FILE__, \
__LINE__, cudaGetErrorString(err)); \
exit(EXIT_FAILURE); \
} \
}
#define CUDA_SAFE_CALL(call) CUDA_SAFE_CALL_NO_SYNC(call);
#define CUDA_SAFE_THREAD_SYNC() \
{ \
cudaError err = CUT_DEVICE_SYNCHRONIZE(); \
if (cudaSuccess != err) { \
fprintf(stderr, "Cuda error in file '%s' in line %i : %s.\n", __FILE__, \
__LINE__, cudaGetErrorString(err)); \
} \
}
#define CUFFT_SAFE_CALL(call) \
{ \
cufftResult err = call; \
if (CUFFT_SUCCESS != err) { \
fprintf(stderr, "CUFFT error in file '%s' in line %i.\n", __FILE__, \
__LINE__); \
exit(EXIT_FAILURE); \
} \
}
#define CUT_SAFE_CALL(call) \
if (CUTTrue != call) { \
fprintf(stderr, "Cut error in file '%s' in line %i.\n", __FILE__, \
__LINE__); \
exit(EXIT_FAILURE); \
}
//! Check for CUDA error
#ifdef _DEBUG
#define CUT_CHECK_ERROR(errorMessage) \
{ \
cudaError_t err = cudaGetLastError(); \
if (cudaSuccess != err) { \
fprintf(stderr, "Cuda error: %s in file '%s' in line %i : %s.\n", \
errorMessage, __FILE__, __LINE__, cudaGetErrorString(err)); \
exit(EXIT_FAILURE); \
} \
err = CUT_DEVICE_SYNCHRONIZE(); \
if (cudaSuccess != err) { \
fprintf(stderr, "Cuda error: %s in file '%s' in line %i : %s.\n", \
errorMessage, __FILE__, __LINE__, cudaGetErrorString(err)); \
exit(EXIT_FAILURE); \
} \
}
#else
#define CUT_CHECK_ERROR(errorMessage) \
{ \
cudaError_t err = cudaGetLastError(); \
if (cudaSuccess != err) { \
fprintf(stderr, "Cuda error: %s in file '%s' in line %i : %s.\n", \
errorMessage, __FILE__, __LINE__, cudaGetErrorString(err)); \
exit(EXIT_FAILURE); \
} \
}
#endif
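// Usage sketch (illustrative only; myKernel, d_buf, n, grid and block are
// hypothetical): wrapping an allocation and a kernel launch with the
// error-checking macros above.
//
//   float *d_buf;
//   CUDA_SAFE_CALL(cudaMalloc((void **)&d_buf, n * sizeof(float)));
//   myKernel<<<grid, block>>>(d_buf, n);
//   CUT_CHECK_ERROR("myKernel launch failed");
//   CUDA_SAFE_CALL(cudaFree(d_buf));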
//! Check for malloc error
#define CUT_SAFE_MALLOC(mallocCall)                                            \
  do {                                                                         \
    if (!(mallocCall)) {                                                       \
      fprintf(stderr, "Host malloc failure in file '%s' in line %i\n",         \
              __FILE__, __LINE__);                                             \
      exit(EXIT_FAILURE);                                                      \
    }                                                                          \
  } while (0)
//! Check if condition is true (flexible assert)
#define CUT_CONDITION(val) \
if (CUTFalse == cutCheckCondition(val, __FILE__, __LINE__)) { \
exit(EXIT_FAILURE); \
}
#if __DEVICE_EMULATION__
#define CUT_DEVICE_INIT(ARGC, ARGV)
#else
#define CUT_DEVICE_INIT(ARGC, ARGV) \
{ \
int deviceCount; \
CUDA_SAFE_CALL_NO_SYNC(cudaGetDeviceCount(&deviceCount)); \
if (deviceCount == 0) { \
fprintf(stderr, "cutil error: no devices supporting CUDA.\n"); \
exit(EXIT_FAILURE); \
} \
int dev = 0; \
cutGetCmdLineArgumenti(ARGC, (const char **)ARGV, "device", &dev); \
if (dev < 0) \
dev = 0; \
if (dev > deviceCount - 1) \
dev = deviceCount - 1; \
cudaDeviceProp deviceProp; \
CUDA_SAFE_CALL_NO_SYNC(cudaGetDeviceProperties(&deviceProp, dev)); \
if (cutCheckCmdLineFlag(ARGC, (const char **)ARGV, "quiet") == CUTFalse) \
fprintf(stderr, "Using device %d: %s\n", dev, deviceProp.name); \
CUDA_SAFE_CALL(cudaSetDevice(dev)); \
}
//! Check for CUDA context lost
#define CUDA_CHECK_CTX_LOST(errorMessage) \
{ \
cudaError_t err = cudaGetLastError(); \
if (cudaSuccess != err) { \
fprintf(stderr, "Cuda error: %s in file '%s' in line %i : %s.\n", \
errorMessage, __FILE__, __LINE__, cudaGetErrorString(err)); \
exit(EXIT_FAILURE); \
} \
err = CUT_DEVICE_SYNCHRONIZE(); \
if (cudaSuccess != err) { \
fprintf(stderr, "Cuda error: %s in file '%s' in line %i : %s.\n", \
errorMessage, __FILE__, __LINE__, cudaGetErrorString(err)); \
exit(EXIT_FAILURE); \
} \
}
//! Check for CUDA context lost
#define CU_CHECK_CTX_LOST(errorMessage) \
{ \
cudaError_t err = cudaGetLastError(); \
if (CUDA_ERROR_INVALID_CONTEXT != err) { \
fprintf(stderr, "Cuda error: %s in file '%s' in line %i : %s.\n", \
errorMessage, __FILE__, __LINE__, cudaGetErrorString(err)); \
exit(EXIT_FAILURE); \
} \
err = CUT_DEVICE_SYNCHRONIZE(); \
if (cudaSuccess != err) { \
fprintf(stderr, "Cuda error: %s in file '%s' in line %i : %s.\n", \
errorMessage, __FILE__, __LINE__, cudaGetErrorString(err)); \
exit(EXIT_FAILURE); \
} \
}
#endif
#define CUT_DEVICE_INIT_DRV(cuDevice, ARGC, ARGV) \
{ \
cuDevice = 0; \
int deviceCount = 0; \
CUresult err = cuInit(0); \
if (CUDA_SUCCESS == err) \
CU_SAFE_CALL_NO_SYNC(cuDeviceGetCount(&deviceCount)); \
if (deviceCount == 0) { \
fprintf(stderr, "cutil error: no devices supporting CUDA\n"); \
exit(EXIT_FAILURE); \
} \
int dev = 0; \
cutGetCmdLineArgumenti(ARGC, (const char **)ARGV, "device", &dev); \
if (dev < 0) \
dev = 0; \
if (dev > deviceCount - 1) \
dev = deviceCount - 1; \
CU_SAFE_CALL_NO_SYNC(cuDeviceGet(&cuDevice, dev)); \
char name[100]; \
cuDeviceGetName(name, 100, cuDevice); \
if (cutCheckCmdLineFlag(ARGC, (const char **)ARGV, "quiet") == CUTFalse) \
fprintf(stderr, "Using device %d: %s\n", dev, name); \
}
#define CUT_EXIT(argc, argv) \
if (!cutCheckCmdLineFlag(argc, (const char **)argv, "noprompt")) { \
printf("\nPress ENTER to exit...\n"); \
fflush(stdout); \
fflush(stderr); \
getchar(); \
} \
exit(EXIT_SUCCESS);
#endif // #ifndef _CUTIL_H_

View File

@@ -1,104 +0,0 @@
/*
* Copyright 1993-2010 NVIDIA Corporation. All rights reserved.
*
 * NVIDIA Corporation and its licensors retain all intellectual property and
* proprietary rights in and to this software and related documentation. Any
* use, reproduction, disclosure, or distribution of this software and related
* documentation without an express license agreement from NVIDIA Corporation is
* strictly prohibited.
*
* Please refer to the applicable NVIDIA end user license agreement (EULA)
* associated with this source code for terms and conditions that govern
* your use of this NVIDIA software.
*
*/
#include <iostream>
#include <stdio.h>
#define CHECK(ans) \
{ gpuAssert((ans), __FILE__, __LINE__); }
inline void gpuAssert(cudaError_t code, const char *file, int line,
bool abort = true) {
if (code != cudaSuccess) {
fprintf(stderr, "GPUassert: %s %s %d\n", cudaGetErrorString(code), file,
line);
if (abort)
exit(code);
}
}
using namespace std;
#define SIZE (100 * 1024 * 1024)
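// histo_kernel builds a 256-bin byte histogram: each block accumulates into a
// privatized copy in shared memory (temp[256]) using shared-memory atomics
// while walking the input with a grid-stride loop, then merges its partial
// histogram into the global result with one atomicAdd per bin.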
__global__ void histo_kernel(unsigned char *buffer, long size,
unsigned int *histo) {
__shared__ unsigned int temp[256];
temp[threadIdx.x] = 0;
__syncthreads();
int i = threadIdx.x + blockIdx.x * blockDim.x;
int offset = blockDim.x * gridDim.x;
while (i < size) {
atomicAdd(&temp[buffer[i]], 1);
i += offset;
}
__syncthreads();
atomicAdd(&(histo[threadIdx.x]), temp[threadIdx.x]);
}
int runHisto(char *file, unsigned int *freq, unsigned int memSize,
unsigned int *source) {
FILE *f = fopen(file, "rb");
if (!f) {
perror(file);
exit(1);
}
fseek(f, 0, SEEK_SET);
size_t result = fread(source, 1, memSize, f);
if (result != memSize)
fputs("Cannot read input file", stderr);
fclose(f);
unsigned char *buffer = (unsigned char *)source;
int blocks = 2;
// allocate memory on the GPU for the file's data
int partSize = memSize / 32;
int totalNum = memSize / sizeof(unsigned int);
int partialNum = partSize / sizeof(unsigned int);
unsigned char *dev_buffer0;
unsigned char *dev_buffer1;
unsigned int *dev_histo;
cudaMalloc((void **)&dev_buffer0, partSize);
cudaMalloc((void **)&dev_buffer1, partSize);
cudaMalloc((void **)&dev_histo, 256 * sizeof(int));
cudaMemset(dev_histo, 0, 256 * sizeof(int));
for (int i = 0; i < totalNum; i += partialNum * 2) {
CHECK(
cudaMemcpy(dev_buffer0, buffer + i, partSize, cudaMemcpyHostToDevice));
CHECK(cudaMemcpy(dev_buffer1, buffer + i + partialNum, partSize,
cudaMemcpyHostToDevice));
// kernel launch - 2x the number of mps gave best timing
histo_kernel<<<blocks * 2, 256>>>(dev_buffer0, partSize, dev_histo);
cudaDeviceSynchronize();
histo_kernel<<<blocks * 2, 256>>>(dev_buffer1, partSize, dev_histo);
cudaDeviceSynchronize();
}
cudaMemcpy(freq, dev_histo, 256 * sizeof(int), cudaMemcpyDeviceToHost);
cudaFree(dev_histo);
cudaFree(dev_buffer0);
cudaFree(dev_buffer1);
return 0;
}

View File

@@ -1,90 +0,0 @@
#include "stdio.h"
#include <algorithm>
#include <climits> // for CHAR_BIT
#include <iostream>
#include <iterator>
#include <map>
#include <math.h>
#include <queue>
using namespace std;
const int UniqueSymbols = 1 << CHAR_BIT;
void printBits(unsigned int val, int numbits) {
for (int i = numbits - 1; i >= 0; i--)
putchar('0' + ((val >> i) & 1));
}
typedef vector<bool> HuffCode;
typedef map<unsigned char, HuffCode> HuffCodeMap;
class INode {
public:
const int f;
virtual ~INode() {}
protected:
INode(int f) : f(f) {}
};
class InternalNode : public INode {
public:
INode *const left;
INode *const right;
InternalNode(INode *c0, INode *c1)
: INode(c0->f + c1->f), left(c0), right(c1) {}
~InternalNode() {
delete left;
delete right;
}
};
class LeafNode : public INode {
public:
const char c;
LeafNode(int f, char c) : INode(f), c(c) {}
};
struct NodeCmp {
bool operator()(const INode *lhs, const INode *rhs) const {
return lhs->f > rhs->f;
}
};
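// BuildTree performs the standard greedy Huffman construction: a leaf is
// created for every symbol with non-zero frequency, and the two
// lowest-frequency trees are repeatedly merged (NodeCmp makes the
// priority_queue a min-heap on frequency) until a single root remains.
// GenerateCodes then walks the tree, appending 0 for a left edge and 1 for a
// right edge, to obtain each symbol's bit vector.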
INode *BuildTree(unsigned int (&frequencies)[UniqueSymbols]) {
std::priority_queue<INode *, std::vector<INode *>, NodeCmp> trees;
for (int i = 0; i < UniqueSymbols; ++i) {
if (frequencies[i] != 0)
trees.push(new LeafNode(frequencies[i], (char)i));
}
while (trees.size() > 1) {
INode *childR = trees.top();
trees.pop();
INode *childL = trees.top();
trees.pop();
INode *parent = new InternalNode(childR, childL);
trees.push(parent);
}
return trees.top();
}
void GenerateCodes(const INode *node, const HuffCode &prefix,
HuffCodeMap &outCodes) {
if (const LeafNode *lf = dynamic_cast<const LeafNode *>(node)) {
outCodes[lf->c] = prefix;
} else if (const InternalNode *in =
dynamic_cast<const InternalNode *>(node)) {
HuffCode leftPrefix = prefix;
leftPrefix.push_back(false);
GenerateCodes(in->left, leftPrefix, outCodes);
HuffCode rightPrefix = prefix;
rightPrefix.push_back(true);
GenerateCodes(in->right, rightPrefix, outCodes);
}
}

View File

@@ -1,65 +0,0 @@
#ifndef _LOADTESTDATA_H_
#define _LOADTESTDATA_H_
//#include "testdatagen.h"
#include "hist.cu"
#include "huffTree.h"
inline void initParams(char *file_name, uint num_block_threads,
uint &num_blocks, uint &num_elements, uint &mem_size,
uint symbol_type_size) {
if (file_name == NULL) {
num_elements = num_blocks * num_block_threads;
mem_size = num_elements * symbol_type_size;
} else {
FILE *f = fopen(file_name, "rb");
if (!f) {
perror(file_name);
exit(1);
}
fseek(f, 0, SEEK_END);
mem_size = ftell(f);
fclose(f);
num_elements = mem_size / symbol_type_size;
// todo add check if we need 1 more block!
num_blocks = num_elements / num_block_threads;
}
}
inline void loadData(char *file_name, uint *sourceData, uint *codewords,
uint *codewordlens, uint num_elements, uint mem_size,
double &H) {
if (file_name == NULL) {
printf("No input file\n");
exit(-1);
} else {
unsigned int freqs[UniqueSymbols] = {0};
runHisto(file_name, freqs, mem_size, sourceData);
INode *root = BuildTree(freqs);
HuffCodeMap codes;
GenerateCodes(root, HuffCode(), codes);
delete root;
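    // Pack each symbol's Huffman code (a vector<bool>) into an integer
    // codeword plus its bit length, then compute the source entropy
    // H = -sum_i p_i * log2(p_i), the lower bound on achievable bits/symbol.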
for (HuffCodeMap::const_iterator it = codes.begin(); it != codes.end();
++it) {
unsigned int count = distance(it->second.begin(), it->second.end());
for (int i = 0; i < count; i++)
if (it->second[i])
codewords[(unsigned int)(it->first)] +=
(uint)pow(2.0f, (int)count - i - 1);
codewordlens[(unsigned int)(it->first)] = count;
}
H = 0.0;
for (unsigned int i = 0; i < 256; i++)
if (freqs[i] > 0) {
double p = (double)freqs[i] / (double)mem_size;
H += p * log(p) / log(2.0);
}
H = -H;
printf("\n%s, %u bytes, entropy %f\n\n", file_name, mem_size, H);
}
}
#endif

File diff suppressed because it is too large

File diff suppressed because one or more lines are too long

View File

@@ -1,225 +0,0 @@
/*
* PAVLE - Parallel Variable-Length Encoder for CUDA. Main file.
*
* Copyright (C) 2009 Ana Balevic <ana.balevic@gmail.com>
* All rights reserved.
*
* This program is free software; you can redistribute it and/or modify it under
* the terms of the MIT License. Read the full licence:
* http://www.opensource.org/licenses/mit-license.php
*
* If you find this program useful, please contact me and reference PAVLE home
* page in your work.
*
*/
#include "comparison_helpers.h"
#include "cuda_helpers.h"
#include "load_data.h"
#include "print_helpers.h"
#include "stats_logger.h"
#include "stdafx.h"
#include <cuda_runtime.h>
#include <sys/time.h>
//#include "vlc_kernel_gm32.cu"
//#include "vlc_kernel_sm32.cu"
#include "vlc_kernel_sm64huff.cu"
//#include "vlc_kernel_dpt.cu"
//#include "vlc_kernel_dptt.cu"
//#include "scan_kernel.cu"
#include "cpuencode.h"
#include "pack_kernels.cu"
#include "scan.cu"
long long get_time() {
struct timeval tv;
gettimeofday(&tv, NULL);
return (tv.tv_sec * 1000000) + tv.tv_usec;
}
void runVLCTest(char *file_name, uint num_block_threads, uint num_blocks = 1);
extern "C" void cpu_vlc_encode(unsigned int *indata, unsigned int num_elements,
unsigned int *outdata, unsigned int *outsize,
unsigned int *codewords,
unsigned int *codewordlens);
int main(int argc, char *argv[]) {
if (!InitCUDA()) {
return 0;
}
unsigned int num_block_threads = 256;
if (argc > 1)
for (int i = 1; i < argc; i++)
runVLCTest(argv[i], num_block_threads);
else {
runVLCTest(NULL, num_block_threads, 1024);
}
return 0;
}
void runVLCTest(char *file_name, uint num_block_threads, uint num_blocks) {
printf("CUDA! Starting VLC Tests!\n");
unsigned int
num_elements; // uint num_elements = num_blocks * num_block_threads;
unsigned int mem_size; // uint mem_size = num_elements * sizeof(int);
unsigned int symbol_type_size = sizeof(int);
//////// LOAD DATA ///////////////
double H; // entropy
initParams(file_name, num_block_threads, num_blocks, num_elements, mem_size,
symbol_type_size);
printf("Parameters: num_elements: %d, num_blocks: %d, num_block_threads: "
"%d\n----------------------------\n",
num_elements, num_blocks, num_block_threads);
////////LOAD DATA ///////////////
uint *sourceData = (uint *)malloc(mem_size);
uint *destData = (uint *)malloc(mem_size);
uint *crefData = (uint *)malloc(mem_size);
uint *codewords = (uint *)malloc(NUM_SYMBOLS * symbol_type_size);
uint *codewordlens = (uint *)malloc(NUM_SYMBOLS * symbol_type_size);
uint *cw32 = (uint *)malloc(mem_size);
uint *cw32len = (uint *)malloc(mem_size);
uint *cw32idx = (uint *)malloc(mem_size);
uint *cindex2 = (uint *)malloc(num_blocks * sizeof(int));
memset(sourceData, 0, mem_size);
memset(destData, 0, mem_size);
memset(crefData, 0, mem_size);
memset(cw32, 0, mem_size);
memset(cw32len, 0, mem_size);
memset(cw32idx, 0, mem_size);
memset(codewords, 0, NUM_SYMBOLS * symbol_type_size);
memset(codewordlens, 0, NUM_SYMBOLS * symbol_type_size);
memset(cindex2, 0, num_blocks * sizeof(int));
//////// LOAD DATA ///////////////
loadData(file_name, sourceData, codewords, codewordlens, num_elements,
mem_size, H);
//////// LOAD DATA ///////////////
unsigned int *d_sourceData, *d_destData, *d_destDataPacked;
unsigned int *d_codewords, *d_codewordlens;
unsigned int *d_cw32, *d_cw32len, *d_cw32idx, *d_cindex, *d_cindex2;
CUDA_SAFE_CALL(cudaMalloc((void **)&d_sourceData, mem_size));
CUDA_SAFE_CALL(cudaMalloc((void **)&d_destData, mem_size));
CUDA_SAFE_CALL(cudaMalloc((void **)&d_destDataPacked, mem_size));
CUDA_SAFE_CALL(
cudaMalloc((void **)&d_codewords, NUM_SYMBOLS * symbol_type_size));
CUDA_SAFE_CALL(
cudaMalloc((void **)&d_codewordlens, NUM_SYMBOLS * symbol_type_size));
CUDA_SAFE_CALL(cudaMalloc((void **)&d_cw32, mem_size));
CUDA_SAFE_CALL(cudaMalloc((void **)&d_cw32len, mem_size));
CUDA_SAFE_CALL(cudaMalloc((void **)&d_cw32idx, mem_size));
CUDA_SAFE_CALL(
cudaMalloc((void **)&d_cindex, num_blocks * sizeof(unsigned int)));
CUDA_SAFE_CALL(
cudaMalloc((void **)&d_cindex2, num_blocks * sizeof(unsigned int)));
// printf("source data\n");
// for (int i = 0; i < 200; i++) {
// printf("%d ", sourceData[i]);
// }
// printf("\n");
// printf("codewords\n");
// for (int i = 0; i < 200; i++) {
// printf("%d ", codewords[i]);
// }
// printf("\n");
// printf("codeword lens\n");
// for (int i = 0; i < 200; i++) {
// printf("%d ", codewordlens[i]);
// }
// printf("\n");
// return;
CUDA_SAFE_CALL(
cudaMemcpy(d_sourceData, sourceData, mem_size, cudaMemcpyHostToDevice));
CUDA_SAFE_CALL(cudaMemcpy(d_codewords, codewords,
NUM_SYMBOLS * symbol_type_size,
cudaMemcpyHostToDevice));
CUDA_SAFE_CALL(cudaMemcpy(d_codewordlens, codewordlens,
NUM_SYMBOLS * symbol_type_size,
cudaMemcpyHostToDevice));
CUDA_SAFE_CALL(
cudaMemcpy(d_destData, destData, mem_size, cudaMemcpyHostToDevice));
dim3 grid_size(num_blocks, 1, 1);
dim3 block_size(num_block_threads, 1, 1);
unsigned int sm_size;
unsigned int NT = 10; // number of runs for each execution time
//////////////////* CPU ENCODER *///////////////////////////////////
unsigned int refbytesize;
long long timer = get_time();
cpu_vlc_encode((unsigned int *)sourceData, num_elements,
(unsigned int *)crefData, &refbytesize, codewords,
codewordlens);
float msec = (float)((get_time() - timer) / 1000.0);
printf("CPU Encoding time (CPU): %f (ms)\n", msec);
printf("CPU Encoded to %d [B]\n", refbytesize);
unsigned int num_ints = refbytesize / 4 + ((refbytesize % 4 == 0) ? 0 : 1);
//////////////////* END CPU *///////////////////////////////////
//////////////////* SM64HUFF KERNEL *///////////////////////////////////
grid_size.x = num_blocks;
block_size.x = num_block_threads;
sm_size = block_size.x * sizeof(unsigned int);
#ifdef CACHECWLUT
sm_size = 2 * NUM_SYMBOLS * sizeof(int) + block_size.x * sizeof(unsigned int);
#endif
for (int i = 0; i < NT; i++) {
vlc_encode_kernel_sm64huff<<<grid_size, block_size>>>(
d_sourceData, d_codewords, d_codewordlens,
#ifdef TESTING
d_cw32, d_cw32len, d_cw32idx,
#endif
d_destData, d_cindex); // testedOK2
cudaThreadSynchronize();
}
// //////////////////* END KERNEL *///////////////////////////////////
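// The TESTING path compacts the per-block bitstreams: prescanArray computes an
// exclusive prefix sum of the per-block bit counts (d_cindex -> d_cindex2), and
// the pack2 kernel shifts each block's output to its final bit offset in
// d_destDataPacked so it can be compared word-for-word against the CPU encoder.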
#ifdef TESTING
unsigned int num_scan_elements = grid_size.x;
preallocBlockSums(num_scan_elements);
cudaMemset(d_destDataPacked, 0, mem_size);
printf("Num_blocks to be passed to scan is %d.\n", num_scan_elements);
prescanArray(d_cindex2, d_cindex, num_scan_elements);
pack2<<<num_scan_elements / 32, 32>>>(
(unsigned int *)d_destData, d_cindex, d_cindex2,
(unsigned int *)d_destDataPacked, num_elements / num_scan_elements);
cudaThreadSynchronize();
CUT_CHECK_ERROR("Pack2 Kernel execution failed\n");
deallocBlockSums();
// return;
CUDA_SAFE_CALL(
cudaMemcpy(destData, d_destDataPacked, mem_size, cudaMemcpyDeviceToHost));
compare_vectors((unsigned int *)crefData, (unsigned int *)destData, num_ints);
#endif
free(sourceData);
free(destData);
free(codewords);
free(codewordlens);
free(cw32);
free(cw32len);
free(crefData);
CUDA_SAFE_CALL(cudaFree(d_sourceData));
CUDA_SAFE_CALL(cudaFree(d_destData));
CUDA_SAFE_CALL(cudaFree(d_destDataPacked));
CUDA_SAFE_CALL(cudaFree(d_codewords));
CUDA_SAFE_CALL(cudaFree(d_codewordlens));
CUDA_SAFE_CALL(cudaFree(d_cw32));
CUDA_SAFE_CALL(cudaFree(d_cw32len));
CUDA_SAFE_CALL(cudaFree(d_cw32idx));
CUDA_SAFE_CALL(cudaFree(d_cindex));
CUDA_SAFE_CALL(cudaFree(d_cindex2));
free(cindex2);
}

View File

@@ -1,62 +0,0 @@
/*
* Copyright Ana Balevic, 2008-2009. All rights reserved.
*/
#ifndef _PABIO_KERNEL2_H_
#define _PABIO_KERNEL2_H_
#include "parameters.h"
/* PARALLEL PUT BITS IMPLEMENTATION (CUDA1.1+ compatible)
* Set numbits in the destination word out[kc] starting from the position startbit
* Implementation comments:
 * The first atomic operation (atomicAnd) is a necessary preparation - it clears only the bits that the codeword
 * will occupy, so that the subsequent OR can set them.
 * The second atomic operation (atomicOr) then writes the codeword value into exactly those bits; all other bits are left untouched.
* TODOs: benchmark performance 1) gm atomics vs sm atomics; 2) memset at init time vs. atomicOr
*/
__device__ void static put_bits_atomic2(unsigned int* out, unsigned int kc,
unsigned int startbit, unsigned int numbits,
unsigned int codeword) {
unsigned int cw32 = codeword;
unsigned int restbits = 32-startbit-numbits;
/* 1. Prepare the memory location */
#ifndef MEMSET0 //Can remove this part if the contents of the memory are already set to all 0s
unsigned int mask = ((1<<numbits)-1); // -> 0000...001111
mask<<=restbits; //fill in zeros at the back positions -> 0000...001111000 -> 11111110000111111111111 (in order not to and other positions)
atomicAnd(&out[kc], ~mask); //set 0s in the destination from startbit in the len of numbits
#endif
/* 2. Write the codeword */
cw32 = cw32<<restbits;
atomicOr(&out[kc], cw32);
}
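/* Worked example (illustrative): with startbit = 4, numbits = 3 and
 * codeword = 0b101, restbits = 32 - 4 - 3 = 25 and
 * mask = ((1 << 3) - 1) << 25 = 0x0E000000. atomicAnd(&out[kc], ~mask) clears
 * bits 27..25 of out[kc], and atomicOr(&out[kc], 0b101 << 25) writes the
 * codeword into exactly those bits, leaving every other bit untouched.
 */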
/* PARALLEL PUT BITS IMPLEMENTATION (CUDA1.1+ compatible)
 * Checks if the part of the word to be written covers the whole memory location, and if so, avoids using the atomics.
* Experience: no benefits, even a bit slower on CUDA.
*/
__device__ void static put_bits_atomic2a(unsigned int* out, unsigned int kc,
unsigned int startbit, unsigned int numbits,
unsigned int codeword) {
unsigned int cw32 = codeword;
unsigned int restbits = 32-startbit-numbits;
/* 1. Prepare the memory location */
#ifndef MEMSET0 //Can remove this part if the contents of the memory are already set to all 0s
unsigned int mask = ((1<<numbits)-1); // -> 0000...001111
mask<<=restbits; //fill in zeros at the back positions -> 0000...001111000 -> 11111110000111111111111 (in order not to and other positions)
atomicAnd(&out[kc], ~mask); //set 0s in the destination from startbit in the len of numbits
#endif
/* 2. Write the codeword */
if (startbit == 0 && restbits == 0) {
out[kc] = cw32;
} else {
cw32 = cw32<<restbits;
atomicOr(&out[kc], cw32);
}
}
#endif //ifndef _PABIO_KERNEL_H_

View File

@@ -1,43 +0,0 @@
#ifndef _PACK_KERNELS_H_
#define _PACK_KERNELS_H_
#include "parameters.h"
__global__ static void pack2(unsigned int *srcData, unsigned int *cindex,
unsigned int *cindex2, unsigned int *dstData,
unsigned int original_num_block_elements) {
unsigned int tid = blockDim.x * blockIdx.x + threadIdx.x;
// source index
unsigned int offset = tid * original_num_block_elements; // DPB,
unsigned int bitsize = cindex[tid];
// destination index
unsigned int pos = cindex2[tid], dword = pos / 32, bit = pos % 32;
unsigned int i, dw, tmp;
dw = srcData[offset]; // load the first dword from srcData[]
tmp = dw >> bit; // cut off those bits that do not fit into the initial
// location in destData[]
atomicOr(&dstData[dword], tmp); // fill up this initial location
tmp = (bit == 0) ? 0 : (dw << 32 - bit);
for (i = 1; i < bitsize / 32;
i++) { // from now on, we have exclusive access to destData[]
dw = srcData[offset + i]; // load next dword from srcData[]
tmp |= dw >> bit; // fill up tmp
dstData[dword + i] = tmp; // write complete dword to destData[]
tmp = (bit == 0) ? 0 : (dw << 32 - bit);
}
// exclusive access to dstData[] ends here
// the remaining block can, or rather should be further optimized
// write the remaining bits in tmp, UNLESS bit is 0 and bitsize is divisible
// by 32, in this case do nothing
if (bit != 0 || bitsize % 32 != 0)
atomicOr(&dstData[dword + i], tmp);
if (bitsize % 32 != 0) {
dw = srcData[offset + i];
atomicOr(&dstData[dword + i], dw >> bit);
atomicOr(&dstData[dword + i + 1], (bit == 0) ? 0 : (dw << 32 - bit));
}
}
#endif

View File

@@ -1,27 +0,0 @@
#ifndef _PARAMS_H_
#define _PARAMS_H_
typedef unsigned int uint;
typedef unsigned char uint8;
#define BENCH 0
/* 0 - MEASURE TIME, NO TESTING
** 1 - TEST
** 2 - TEST & VERBOSE
*/
#define TESTING
#define DPT 4 // data (dwords) per thread
#define CACHECWLUT // MAX DPT = 8
//#define CACHESRCDATA // MAX DPT = 4
#define SMATOMICS
#define MEMSET0
#define MAX_SM_BLOCK_SIZE_GPU 16384 // B
#define NUM_SYMBOLS 256 // fixed to 256.
#endif

View File

@@ -1,217 +0,0 @@
#ifndef _PRINT_HELPERS_H_
#define _PRINT_HELPERS_H_
#include "parameters.h"
#include <stdio.h>
__inline void printdbg_data_bin(const char *filename, unsigned int *data,
unsigned int num_ints) {
FILE *dump = fopen((const char *)filename, "wt");
for (unsigned int i = 0; i < num_ints; i++) {
unsigned int mask = 0x80000000;
for (unsigned int j = 0; j < 32; j++) {
if (data[i] & mask)
fprintf(dump, "1"); // printf("1");
else
fprintf(dump, "0"); // printf("0");
mask = mask >> 1;
}
fprintf(dump, "\n");
}
fclose(dump);
}
__inline void printdbg_data_int(const char *filename, unsigned int *data,
unsigned int num_ints) {
FILE *dump = fopen((const char *)filename, "wt");
for (unsigned int i = 0; i < num_ints; i++) {
fprintf(dump, "%d: %d\n", i, data[i]);
}
fclose(dump);
}
__inline void printdbg_gpu_data_detailed(FILE *gpudump, unsigned int *cw32,
unsigned int *cw32len,
unsigned int *cw32idx,
unsigned int num_elements) {
for (unsigned int i = 0; i < num_elements; i++) {
fprintf(gpudump, "bp: %d, kc: %d, startbit: %d, cwlen: %d, cw:\t\t",
cw32idx[i], cw32idx[i] / 32, cw32idx[i] % 32, cw32len[i]);
// print codeword:
unsigned int mask = 0x80000000;
mask = mask >> (32 - cw32len[i]);
for (unsigned int j = 0; j < cw32len[i]; j++) {
if (cw32[i] & mask)
fprintf(gpudump, "1"); // printf("1");
else
fprintf(gpudump, "0"); // printf("0");
mask = mask >> 1;
}
fprintf(gpudump, "\n");
}
}
__inline void printdbg_gpu_data_detailed2(const char *filename,
unsigned int *cw32,
unsigned int *cw32len,
unsigned int *cw32idx,
unsigned int num_elements) {
FILE *gpudump = fopen((const char *)filename, "wt");
for (unsigned int i = 0; i < num_elements; i++) {
fprintf(gpudump, "bp: %d, kc: %d, startbit: %d, cwlen: %d, cw:\t\t",
cw32idx[i], cw32idx[i] / 32, cw32idx[i] % 32, cw32len[i]);
// print codeword:
unsigned int mask = 0x80000000;
mask = mask >> (32 - cw32len[i]);
for (unsigned int j = 0; j < cw32len[i]; j++) {
if (cw32[i] & mask)
fprintf(gpudump, "1"); // printf("1");
else
fprintf(gpudump, "0"); // printf("0");
mask = mask >> 1;
}
fprintf(gpudump, "\n");
}
fclose(gpudump);
}
/************************************************************************/
/* BIT PRINTS */
/************************************************************************/
__inline void printBits(unsigned char number) {
unsigned char mask = 0x80;
for (unsigned int j = 0; j < 8; j++) {
if (number & mask)
printf("1");
else
printf("0");
mask = mask >> 1;
}
printf(" ");
}
__inline void print32Bits(unsigned int number) {
unsigned int mask = 0x80000000;
for (unsigned int j = 0; j < 32; j++) {
if (number & mask)
printf("1");
else
printf("0");
mask = mask >> 1;
}
printf("\n");
}
__inline void print32BitsM(unsigned int marker) {
for (unsigned int j = 0; j < 32; j++) {
if (marker == (j + 1))
printf("|");
else
printf(".");
}
printf("\n");
}
__inline void print_array_char_as_bits(unsigned char *a, unsigned int len) {
printf(
" ========================= Printing vector =======================\n");
printf("Total number of elements is %d\n", len);
for (unsigned int i = 0; i < len; i++) {
printf("a[%d]=%d \t", i, a[i]);
printBits(a[i]);
printf("\n");
}
printf("\n");
printf(
" ==================================================================\n");
}
__inline void print_array_ints_as_bits(unsigned int *a, unsigned int len) {
printf(
" ========================= Printing vector =======================\n");
for (unsigned int i = 0; i < len; i++) {
print32Bits(a[i]);
printf("\n");
}
printf("\n");
printf(
" ==================================================================\n");
}
__inline void print_compare_array_ints_as_bits(unsigned int *a, unsigned int *b,
unsigned int len) {
printf(
" ========================= Printing vector =======================\n");
for (unsigned int i = 0; i < len; i++) {
print32Bits(a[i]);
print32Bits(b[i]);
printf("\n");
}
printf("\n");
printf(
" ==================================================================\n");
}
__inline void print_array_in_hex(unsigned int *a, unsigned int len) {
printf(
" ========================= Printing vector =======================\n");
// printf("Total number of elements is %d\n", len);
for (unsigned int i = 0; i < len; i++) {
printf("%#X\t", a[i]);
}
printf("\n");
printf(
" ==================================================================\n");
}
/************************************************************************/
/* ARRAY PRINTS */
/***********************************************************************/
template <typename T> __inline void print_array(T *a, unsigned int len) {
printf(
" ========================= Printing vector =======================\n");
printf("Total number of elements is %d\n", len);
for (unsigned int i = 0; i < len; i++) {
printf("a[%d]=%d \t", i, a[i]);
}
printf("\n");
printf(
" ==================================================================\n");
}
template <typename ST, typename CT>
__inline void print_rled_arrays(ST *rle_symbols, CT *rle_counts,
unsigned int rle_len) {
ST current_symbol;
CT current_count;
printf(" ========================= Printing RLE vector "
"=======================\n");
printf(" Total number of RL Pairs is %d\n", rle_len);
for (unsigned int k = 0; k < rle_len; k++) {
current_symbol = rle_symbols[k];
current_count = rle_counts[k];
printf("(%d,%d) ,\t", current_symbol, current_count);
}
printf("\n");
}
__inline void print_packed_rle_array(unsigned int *rle, unsigned int rle_len) {
unsigned short current_symbol;
unsigned short current_count;
printf(" ========================= Printing RLE vector "
"=======================\n");
printf(" Total number of RL Pairs is %d\n", rle_len);
for (unsigned int k = 0; k < rle_len; k++) {
current_symbol = (unsigned short)(rle[k] >> 16); // get the higher half-word
    current_count =
        (unsigned short)(rle[k] & 0x0000FFFF); // get the lower half-word
printf("(%d,%d) ,\t", current_symbol, current_count);
}
printf("\n");
}
#endif // _PRINT_HELPERS_H_

View File

@@ -1,20 +0,0 @@
#!/bin/bash
set -e
# clang++ main_test_cu.cu --cuda-path=/usr/local/cuda-10.1 --cuda-gpu-arch=sm_61 -L/usr/local/cuda-10.1/lib64 -lcudart_static -ldl -lrt -pthread -save-temps -v
clang -c -emit-llvm cpuencode.cpp
llvm-as main_test_cu-cuda-nvptx64-nvidia-cuda-sm_61.ll
llvm-as main_test_cu-host-x86_64-unknown-linux-gnu.ll
../../build/compilation/kernelTranslator main_test_cu-cuda-nvptx64-nvidia-cuda-sm_61.bc kernel.bc
../../build/compilation/hostTranslator main_test_cu-host-x86_64-unknown-linux-gnu.bc host.bc
llc --relocation-model=pic --filetype=obj kernel.bc
llc --relocation-model=pic --filetype=obj host.bc
llc --relocation-model=pic --filetype=obj cpuencode.bc
g++ -Wall -L../../build/runtime \
-L../../build/runtime/threadPool -o pavle \
-fPIC -no-pie host.o kernel.o cpuencode.o -lc -lx86Runtime -lthreadPool -lpthread
export LD_LIBRARY_PATH=../../build/runtime:../../build/runtime/threadPool:$LD_LIBRARY_PATH
./pavle ../../rodinia-data/huffman/test1024_H2.206587175259.in

View File

@@ -1,216 +0,0 @@
/*
* Copyright 1993-2006 NVIDIA Corporation. All rights reserved.
*
* NOTICE TO USER:
*
* This source code is subject to NVIDIA ownership rights under U.S. and
* international Copyright laws.
*
* NVIDIA MAKES NO REPRESENTATION ABOUT THE SUITABILITY OF THIS SOURCE
* CODE FOR ANY PURPOSE. IT IS PROVIDED "AS IS" WITHOUT EXPRESS OR
* IMPLIED WARRANTY OF ANY KIND. NVIDIA DISCLAIMS ALL WARRANTIES WITH
* REGARD TO THIS SOURCE CODE, INCLUDING ALL IMPLIED WARRANTIES OF
* MERCHANTABILITY, NONINFRINGEMENT, AND FITNESS FOR A PARTICULAR PURPOSE.
* IN NO EVENT SHALL NVIDIA BE LIABLE FOR ANY SPECIAL, INDIRECT, INCIDENTAL,
* OR CONSEQUENTIAL DAMAGES, OR ANY DAMAGES WHATSOEVER RESULTING FROM LOSS
* OF USE, DATA OR PROFITS, WHETHER IN AN ACTION OF CONTRACT, NEGLIGENCE
* OR OTHER TORTIOUS ACTION, ARISING OUT OF OR IN CONNECTION WITH THE USE
* OR PERFORMANCE OF THIS SOURCE CODE.
*
* U.S. Government End Users. This source code is a "commercial item" as
* that term is defined at 48 C.F.R. 2.101 (OCT 1995), consisting of
* "commercial computer software" and "commercial computer software
* documentation" as such terms are used in 48 C.F.R. 12.212 (SEPT 1995)
* and is provided to the U.S. Government only as a commercial end item.
* Consistent with 48 C.F.R.12.212 and 48 C.F.R. 227.7202-1 through
* 227.7202-4 (JUNE 1995), all U.S. Government End Users acquire the
* source code with only those rights set forth herein.
*/
#ifndef _PRESCAN_CU_
#define _PRESCAN_CU_
// includes, kernels
#include "cutil.h"
#include "scanLargeArray_kernel.cu"
#include <assert.h>
#include <stdio.h>
#define max(a, b) (a > b ? a : b)
inline bool isPowerOfTwo(int n) { return ((n & (n - 1)) == 0); }
inline int floorPow2(int n) {
#ifdef WIN32
// method 2
return 1 << (int)logb((float)n);
#else
// method 1
// float nf = (float)n;
// return 1 << (((*(int*)&nf) >> 23) - 127);
int exp;
frexp((float)n, &exp);
return 1 << (exp - 1);
#endif
}
#define BLOCK_SIZE 256
static unsigned int **g_scanBlockSums;
static unsigned int g_numEltsAllocated = 0;
static unsigned int g_numLevelsAllocated = 0;
static void preallocBlockSums(unsigned int maxNumElements) {
assert(g_numEltsAllocated == 0); // shouldn't be called
g_numEltsAllocated = maxNumElements;
unsigned int blockSize = BLOCK_SIZE; // max size of the thread blocks
unsigned int numElts = maxNumElements;
int level = 0;
do {
unsigned int numBlocks =
max(1, (int)ceil((float)numElts / (2.f * blockSize)));
if (numBlocks > 1)
level++;
numElts = numBlocks;
} while (numElts > 1);
g_scanBlockSums = (unsigned int **)malloc(level * sizeof(unsigned int *));
g_numLevelsAllocated = level;
numElts = maxNumElements;
level = 0;
do {
unsigned int numBlocks =
max(1, (int)ceil((float)numElts / (2.f * blockSize)));
if (numBlocks > 1)
CUDA_SAFE_CALL(cudaMalloc((void **)&g_scanBlockSums[level++],
numBlocks * sizeof(unsigned int)));
numElts = numBlocks;
} while (numElts > 1);
CUT_CHECK_ERROR("preallocBlockSums");
}
static void deallocBlockSums() {
for (unsigned int i = 0; i < g_numLevelsAllocated; i++) {
cudaFree(g_scanBlockSums[i]);
}
CUT_CHECK_ERROR("deallocBlockSums");
free((void **)g_scanBlockSums);
g_scanBlockSums = 0;
g_numEltsAllocated = 0;
g_numLevelsAllocated = 0;
}
static void prescanArrayRecursive(unsigned int *outArray,
const unsigned int *inArray, int numElements,
int level) {
unsigned int blockSize = BLOCK_SIZE; // max size of the thread blocks
unsigned int numBlocks =
max(1, (int)ceil((float)numElements / (2.f * blockSize)));
unsigned int numThreads;
if (numBlocks > 1)
numThreads = blockSize;
else if (isPowerOfTwo(numElements))
numThreads = numElements / 2;
else
numThreads = floorPow2(numElements);
unsigned int numEltsPerBlock = numThreads * 2;
// if this is a non-power-of-2 array, the last block will be non-full
// compute the smallest power of 2 able to compute its scan.
unsigned int numEltsLastBlock =
numElements - (numBlocks - 1) * numEltsPerBlock;
unsigned int numThreadsLastBlock = max(1, numEltsLastBlock / 2);
unsigned int np2LastBlock = 0;
unsigned int sharedMemLastBlock = 0;
if (numEltsLastBlock != numEltsPerBlock) {
np2LastBlock = 1;
if (!isPowerOfTwo(numEltsLastBlock))
numThreadsLastBlock = floorPow2(numEltsLastBlock);
unsigned int extraSpace = (2 * numThreadsLastBlock) / NUM_BANKS;
sharedMemLastBlock =
sizeof(unsigned int) * (2 * numThreadsLastBlock + extraSpace);
}
// padding space is used to avoid shared memory bank conflicts
unsigned int extraSpace = numEltsPerBlock / NUM_BANKS;
unsigned int sharedMemSize =
sizeof(unsigned int) * (numEltsPerBlock + extraSpace);
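  // Worked example (added note): for numElements == 1000 and blockSize == 256,
  // numBlocks == 2 and numEltsPerBlock == 512, so the last block holds
  // 1000 - 512 = 488 elements; 488 is not a power of two, so it is scanned by
  // floorPow2(488) == 256 threads with np2LastBlock set.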
#ifdef DEBUG
if (numBlocks > 1) {
assert(g_numEltsAllocated >= numElements);
}
#endif
// setup execution parameters
// if NP2, we process the last block separately
dim3 grid(max(1, numBlocks - np2LastBlock), 1, 1);
dim3 threads(numThreads, 1, 1);
// make sure there are no CUDA errors before we start
CUT_CHECK_ERROR("prescanArrayRecursive before kernels");
// execute the scan
if (numBlocks > 1) {
prescan<true, false><<<grid, threads>>>(
outArray, inArray, g_scanBlockSums[level], numThreads * 2, 0, 0);
cudaThreadSynchronize();
CUT_CHECK_ERROR("prescanWithBlockSums");
if (np2LastBlock) {
prescan<true, true><<<1, numThreadsLastBlock>>>(
outArray, inArray, g_scanBlockSums[level], numEltsLastBlock,
numBlocks - 1, numElements - numEltsLastBlock);
cudaThreadSynchronize();
CUT_CHECK_ERROR("prescanNP2WithBlockSums");
}
    // After scanning all the sub-blocks we are mostly done; we still need to
    // take the last value of each sub-block and scan those block sums.
    // This gives a per-block offset that must be added to every element of the
    // corresponding block to produce the final results.
// recursive (CPU) call
prescanArrayRecursive(g_scanBlockSums[level], g_scanBlockSums[level],
numBlocks, level + 1);
uniformAdd<<<grid, threads>>>(outArray, g_scanBlockSums[level],
numElements - numEltsLastBlock, 0, 0);
cudaThreadSynchronize();
CUT_CHECK_ERROR("uniformAdd");
if (np2LastBlock) {
uniformAdd<<<1, numThreadsLastBlock>>>(outArray, g_scanBlockSums[level],
numEltsLastBlock, numBlocks - 1,
numElements - numEltsLastBlock);
cudaThreadSynchronize();
CUT_CHECK_ERROR("uniformAdd");
}
} else if (isPowerOfTwo(numElements)) {
prescan<false, false>
<<<grid, threads>>>(outArray, inArray, 0, numThreads * 2, 0, 0);
cudaThreadSynchronize();
CUT_CHECK_ERROR("prescan");
} else {
prescan<false, true>
<<<grid, threads>>>(outArray, inArray, 0, numElements, 0, 0);
cudaThreadSynchronize();
CUT_CHECK_ERROR("prescanNP2");
}
}
static void prescanArray(unsigned int *outArray, unsigned int *inArray,
int numElements) {
prescanArrayRecursive(outArray, inArray, numElements, 0);
}
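// Added usage sketch (hypothetical helper, not part of the original file):
// drives one complete scan of n device words, assuming the block-sum buffers
// have not already been allocated for this or a larger size.
static void prescanArrayExample(unsigned int *d_out, unsigned int *d_in,
                                int n) {
  preallocBlockSums(n);         // per-level block-sum buffers, allocated once
  prescanArray(d_out, d_in, n); // exclusive scan of the n input elements
  deallocBlockSums();           // release the block-sum buffers
}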
#endif // _PRESCAN_CU_

View File

@ -1,237 +0,0 @@
/*
* Copyright 1993-2006 NVIDIA Corporation. All rights reserved.
*
* NOTICE TO USER:
*
* This source code is subject to NVIDIA ownership rights under U.S. and
* international Copyright laws.
*
* NVIDIA MAKES NO REPRESENTATION ABOUT THE SUITABILITY OF THIS SOURCE
* CODE FOR ANY PURPOSE. IT IS PROVIDED "AS IS" WITHOUT EXPRESS OR
* IMPLIED WARRANTY OF ANY KIND. NVIDIA DISCLAIMS ALL WARRANTIES WITH
* REGARD TO THIS SOURCE CODE, INCLUDING ALL IMPLIED WARRANTIES OF
* MERCHANTABILITY, NONINFRINGEMENT, AND FITNESS FOR A PARTICULAR PURPOSE.
* IN NO EVENT SHALL NVIDIA BE LIABLE FOR ANY SPECIAL, INDIRECT, INCIDENTAL,
* OR CONSEQUENTIAL DAMAGES, OR ANY DAMAGES WHATSOEVER RESULTING FROM LOSS
* OF USE, DATA OR PROFITS, WHETHER IN AN ACTION OF CONTRACT, NEGLIGENCE
* OR OTHER TORTIOUS ACTION, ARISING OUT OF OR IN CONNECTION WITH THE USE
* OR PERFORMANCE OF THIS SOURCE CODE.
*
* U.S. Government End Users. This source code is a "commercial item" as
* that term is defined at 48 C.F.R. 2.101 (OCT 1995), consisting of
* "commercial computer software" and "commercial computer software
* documentation" as such terms are used in 48 C.F.R. 12.212 (SEPT 1995)
* and is provided to the U.S. Government only as a commercial end item.
* Consistent with 48 C.F.R.12.212 and 48 C.F.R. 227.7202-1 through
* 227.7202-4 (JUNE 1995), all U.S. Government End Users acquire the
* source code with only those rights set forth herein.
*/
#ifndef _SCAN_BEST_KERNEL_CU_
#define _SCAN_BEST_KERNEL_CU_
// Define this to more rigorously avoid bank conflicts,
// even at the lower (root) levels of the tree
// Note that due to the higher addressing overhead, performance
// is lower with ZERO_BANK_CONFLICTS enabled. It is provided
// as an example.
//#define ZERO_BANK_CONFLICTS
// 16 banks on G80
#define NUM_BANKS 16
#define LOG_NUM_BANKS 4
#ifdef ZERO_BANK_CONFLICTS
#define CONFLICT_FREE_OFFSET(index) \
((index) >> LOG_NUM_BANKS + (index) >> (2 * LOG_NUM_BANKS))
#else
#define CONFLICT_FREE_OFFSET(index) ((index) >> LOG_NUM_BANKS)
#endif
///////////////////////////////////////////////////////////////////////////////
// Work-efficient compute implementation of scan, one thread per 2 elements
// Work-efficient: O(log(n)) steps, and O(n) adds.
// Also shared-storage efficient: uses n + n/NUM_BANKS shared memory -- no
// ping-ponging. Also avoids most bank conflicts using single-element offsets
// every NUM_BANKS elements.
//
// In addition, if ZERO_BANK_CONFLICTS is defined, uses
// n + n/NUM_BANKS + n/(NUM_BANKS*NUM_BANKS)
// shared memory. If ZERO_BANK_CONFLICTS is defined, avoids ALL bank conflicts
// using single-element offsets every NUM_BANKS elements, plus additional
// single-element offsets after every NUM_BANKS^2 elements.
//
// Uses a balanced tree type algorithm. See Blelloch, 1990 "Prefix Sums
// and Their Applications", or Prins and Chatterjee PRAM course notes:
// http://www.cs.unc.edu/~prins/Classes/203/Handouts/pram.pdf
//
// This work-efficient version is based on the algorithm presented in Guy
// Blelloch's excellent paper "Prefix sums and their applications".
// http://www-2.cs.cmu.edu/afs/cs.cmu.edu/project/scandal/public/papers/CMU-CS-90-190.html
//
// Pro: work-efficient, very few bank conflicts (or zero if ZERO_BANK_CONFLICTS
// is defined). Con: more instructions to compute bank-conflict-free shared
// memory addressing, and slightly more shared memory storage used.
//
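// Added reference sketch (not part of the original kernels): the sequential
// exclusive scan that the device code below reproduces in parallel; the name
// cpuExclusiveScanReference is hypothetical and the function is unused here.
static inline void cpuExclusiveScanReference(unsigned int *out,
                                             const unsigned int *in, int n) {
  unsigned int sum = 0;
  for (int i = 0; i < n; i++) {
    out[i] = sum; // element i receives the sum of in[0..i-1]
    sum += in[i];
  }
}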
template <bool isNP2>
__device__ static void
loadSharedChunkFromMem(unsigned int *s_data, const unsigned int *g_idata, int n,
int baseIndex, int &ai, int &bi, int &mem_ai,
int &mem_bi, int &bankOffsetA, int &bankOffsetB) {
int thid = threadIdx.x;
mem_ai = baseIndex + threadIdx.x;
mem_bi = mem_ai + blockDim.x;
ai = thid;
bi = thid + blockDim.x;
// compute spacing to avoid bank conflicts
bankOffsetA = CONFLICT_FREE_OFFSET(ai);
bankOffsetB = CONFLICT_FREE_OFFSET(bi);
// Cache the computational window in shared memory
// pad values beyond n with zeros
s_data[ai + bankOffsetA] = g_idata[mem_ai];
if (isNP2) // compile-time decision
{
s_data[bi + bankOffsetB] = (bi < n) ? g_idata[mem_bi] : 0;
} else {
s_data[bi + bankOffsetB] = g_idata[mem_bi];
}
}
template <bool isNP2>
__device__ static void
storeSharedChunkToMem(unsigned int *g_odata, const unsigned int *s_data, int n,
int ai, int bi, int mem_ai, int mem_bi, int bankOffsetA,
int bankOffsetB) {
__syncthreads();
// write results to global memory
g_odata[mem_ai] = s_data[ai + bankOffsetA];
if (isNP2) // compile-time decision
{
if (bi < n)
g_odata[mem_bi] = s_data[bi + bankOffsetB];
} else {
g_odata[mem_bi] = s_data[bi + bankOffsetB];
}
}
template <bool storeSum>
__device__ static void clearLastElement(unsigned int *s_data,
unsigned int *g_blockSums,
int blockIndex) {
if (threadIdx.x == 0) {
int index = (blockDim.x << 1) - 1;
index += CONFLICT_FREE_OFFSET(index);
if (storeSum) // compile-time decision
{
// write this block's total sum to the corresponding index in the
// blockSums array
g_blockSums[blockIndex] = s_data[index];
}
// zero the last element in the scan so it will propagate back to the front
s_data[index] = 0;
}
}
__device__ static unsigned int buildSum(unsigned int *s_data) {
unsigned int thid = threadIdx.x;
unsigned int stride = 1;
// build the sum in place up the tree
for (int d = blockDim.x; d > 0; d >>= 1) {
__syncthreads();
if (thid < d) {
int i = __mul24(__mul24(2, stride), thid);
int ai = i + stride - 1;
int bi = ai + stride;
ai += CONFLICT_FREE_OFFSET(ai);
bi += CONFLICT_FREE_OFFSET(bi);
s_data[bi] += s_data[ai];
}
stride *= 2;
}
return stride;
}
__device__ static void scanRootToLeaves(unsigned int *s_data,
unsigned int stride) {
unsigned int thid = threadIdx.x;
// traverse down the tree building the scan in place
for (int d = 1; d <= blockDim.x; d *= 2) {
stride >>= 1;
__syncthreads();
if (thid < d) {
int i = __mul24(__mul24(2, stride), thid);
int ai = i + stride - 1;
int bi = ai + stride;
ai += CONFLICT_FREE_OFFSET(ai);
bi += CONFLICT_FREE_OFFSET(bi);
unsigned int t = s_data[ai];
s_data[ai] = s_data[bi];
s_data[bi] += t;
}
}
}
template <bool storeSum>
__device__ static void prescanBlock(unsigned int *data, int blockIndex,
unsigned int *blockSums) {
int stride = buildSum(data); // build the sum in place up the tree
clearLastElement<storeSum>(data, blockSums,
(blockIndex == 0) ? blockIdx.x : blockIndex);
scanRootToLeaves(data, stride); // traverse down tree to build the scan
}
template <bool storeSum, bool isNP2>
__global__ static void
prescan(unsigned int *g_odata, const unsigned int *g_idata,
unsigned int *g_blockSums, int n, int blockIndex, int baseIndex) {
int ai, bi, mem_ai, mem_bi, bankOffsetA, bankOffsetB;
__shared__ unsigned int s_data[3072];
// load data into shared memory
loadSharedChunkFromMem<isNP2>(
s_data, g_idata, n,
(baseIndex == 0) ? __mul24(blockIdx.x, (blockDim.x << 1)) : baseIndex, ai,
bi, mem_ai, mem_bi, bankOffsetA, bankOffsetB);
// scan the data in each block
prescanBlock<storeSum>(s_data, blockIndex, g_blockSums);
// write results to device memory
storeSharedChunkToMem<isNP2>(g_odata, s_data, n, ai, bi, mem_ai, mem_bi,
bankOffsetA, bankOffsetB);
}
__global__ static void uniformAdd(unsigned int *g_data, unsigned int *uniforms,
int n, int blockOffset, int baseIndex) {
__shared__ unsigned int uni;
if (threadIdx.x == 0)
uni = uniforms[blockIdx.x + blockOffset];
unsigned int address =
__mul24(blockIdx.x, (blockDim.x << 1)) + baseIndex + threadIdx.x;
__syncthreads();
  // Note: two adds per thread; the second add is guarded by the boolean factor
  // (threadIdx.x + blockDim.x < n), which zeroes the increment for threads
  // whose second element lies beyond the end of the array.
  g_data[address] += uni;
  g_data[address + blockDim.x] += (threadIdx.x + blockDim.x < n) * uni;
}
#endif // #ifndef _SCAN_BEST_KERNEL_CU_

View File

@ -1,43 +0,0 @@
/*
* Copyright 2009 Tjark Bringewat. All rights reserved.
*/
#include "stats_logger.h"
#include "stdafx.h"
#include <cstdio>
#include <map>
#include <sstream>
std::map<std::string, unsigned int> filenames;
void LogStats(const char *graphname, const char *seriesname, float xValue,
float yValue, const char *xAxisQuantity,
const char *yAxisQuantity, const char *xAxisUnit,
const char *yAxisUnit, const char *xAxisScaleType,
const char *yAxisScaleType, unsigned int seriesnumber,
const char *description) {
std::ostringstream temp, temp2;
temp << graphname << "__" << seriesname;
size_t exists = filenames.count(temp.str());
if (!exists)
filenames[temp.str()] = seriesnumber;
temp2 << graphname << "__" << filenames[temp.str()] << "_" << seriesname
<< ".txt";
  FILE *f;
  if (!exists) {
    f = fopen(temp2.str().c_str(), "wt");
    if (!f) // bail out if the log file cannot be created
      return;
    fprintf(f, "SERIES_NAME\n%s\n", seriesname);
    fprintf(f, "X_AXIS_QUANTITY\n%s\n", xAxisQuantity);
    fprintf(f, "Y_AXIS_QUANTITY\n%s\n", yAxisQuantity);
    fprintf(f, "X_AXIS_UNIT\n%s\n", xAxisUnit);
    fprintf(f, "Y_AXIS_UNIT\n%s\n", yAxisUnit);
    fprintf(f, "X_AXIS_SCALE_TYPE\n%s\n", xAxisScaleType);
    fprintf(f, "Y_AXIS_SCALE_TYPE\n%s\n", yAxisScaleType);
    fprintf(f, "DESCRIPTION\n%s\n", description);
    fprintf(f, "__DATA__\n");
  } else {
    f = fopen(temp2.str().c_str(), "at");
    if (!f) // bail out if the log file cannot be opened for appending
      return;
  }
fprintf(f, "%f %f\n", xValue, yValue);
fclose(f);
}
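/*
 * Illustrative usage (added sketch, values are hypothetical): each call appends
 * one "x y" line to "<graphname>__<seriesnumber>_<seriesname>.txt", writing the
 * header block the first time the series is seen:
 *
 *   LogStats("encode", "gpu", 64.0f, 12.5f, "Data size", "Time", "MB", "ms",
 *            "log", "lin", 0, "VLE encode kernel");
 */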

View File

@ -1,45 +0,0 @@
/*
* Copyright Tjark Bringewat. All rights reserved.
*/
#ifndef _STATS_LOGGER_H_
#define _STATS_LOGGER_H_
#include <cstring>
#pragma warning(disable : 4996)
extern "C" void
LogStats(const char *graphname, const char *seriesname, float xValue,
float yValue, const char *xAxisQuantity, const char *yAxisQuantity,
const char *xAxisUnit = "", const char *yAxisUnit = "",
const char *xAxisScaleType = "lin", const char *yAxisScaleType = "lin",
unsigned int seriesnumber = 0, const char *description = "");
inline void LogStats2(
const char *graph, // Groups several functions into one graph. Only appears
// in the file name.
const char *function, // Name of the particular function. Appears in file
// name and legend.
float yValue, float xValue, const char *yAxisName = "Time",
const char *yAxisUnit = "ms", const char *xAxisName = "Data size",
const char *xAxisUnit = "MB",
const char *yAxisScaleType = "lin", // Can be lin or log for linear or
// logarithmic scale, respectively.
const char *xAxisScaleType = "log",
unsigned int fId =
0, // Determines the order in which different functions are plotted to a
// common graph. Only appears in the file name.
const char *description = "") {
LogStats(graph, function, xValue, yValue, xAxisName, yAxisName, xAxisUnit,
yAxisUnit, xAxisScaleType, yAxisScaleType, fId, description);
if (strcmp(xAxisUnit, "MB") == 0 && strcmp(yAxisUnit, "ms") == 0) {
char buffer[100];
strcpy(buffer, graph);
strcat(buffer, "_datarate");
LogStats(buffer, function, xValue, (xValue * 1000.0f) / (yValue * 1024.0f),
xAxisName, "Data rate", xAxisUnit, "GB/s", xAxisScaleType,
yAxisScaleType, fId, description);
}
}
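// Illustrative usage (added sketch, values are hypothetical): with the default
// MB / ms units, LogStats2 also emits a derived "<graph>_datarate" series in
// GB/s, e.g. 64 MB in 10 ms -> (64 * 1000) / (10 * 1024) = 6.25 GB/s:
//
//   LogStats2("encode", "gpu_sm64huff", /*yValue=*/10.0f, /*xValue=*/64.0f);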
#endif

View File

@ -1,11 +0,0 @@
#pragma once
#include "cutil.h"
#include <iostream>
#include <malloc.h>
#include <math.h>
#include <memory.h>
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <time.h>

View File

@ -1,83 +0,0 @@
#ifndef _TESTDATA_GEN_H_
#define _TESTDATA_GEN_H_
#include "parameters.h"
template <typename T>
__inline__ void generateRLETestData(T *data, unsigned int num_blocks,
unsigned int num_block_threads) {
unsigned int i, j;
/* generate first block*/
for (i = 0; i < num_block_threads; i += 8) {
data[i] = 1;
data[i + 1] = 2;
data[i + 2] = 3;
data[i + 3] = 3;
data[i + 4] = 3;
data[i + 5] = 4;
data[i + 6] = 4;
data[i + 7] = 5;
}
/* copy contents of the first block to all other blocks (for testing only)*/
for (j = 1; j < num_blocks; j++)
for (i = 0; i < num_block_threads; i++)
*(data + j * num_block_threads + i) = data[i];
}
template <typename T>
__inline__ void generateRLETestDataLongRuns1(T *data, unsigned int num_blocks,
unsigned int num_block_threads,
unsigned int avg_run_len) {
unsigned int i, j;
/* generate first block*/
for (i = 0; i < num_block_threads / avg_run_len; i++)
for (j = 0; j < avg_run_len; j++)
data[i * avg_run_len + j] = i;
/* copy contents of the first block to all other blocks (for testing only)*/
for (j = 1; j < num_blocks; j++)
for (i = 0; i < num_block_threads; i++)
*(data + j * num_block_threads + i) = data[i];
}
// VLE TEST DATA VER 2.0
// For testing only: generates codewords of lengths 1, 2, 3, 4, 4, 5, 6, 7
// and dummy codewords 1, 10, 100, 1000, 1000, 10000, 100000, 1000000 (binary),
// i.e. 0x01, 0x02, 0x04, 0x08, 0x08, 0x10, 0x20, 0x40.
// num_symbols = 256 and must be a multiple of 8.
inline void generateCodewords(unsigned int *codewords,
unsigned int *codewordlens,
unsigned int num_symbols) {
unsigned int idx, i, j, numbits, k; // val, k;
/* Generate codeword lengths*/
for (j = 0; j < num_symbols / 8; j++) {
    for (i = 0; i < 4; i++) { // generate first half: lengths 1, 2, 3, 4
idx = j * 8 + i;
codewordlens[idx] = i % 4 + 1;
}
    for (i = 0; i < 4; i++) { // generate second half: lengths 4, 5, 6, 7
idx = j * 8 + 4 + i;
codewordlens[idx] = i % 4 + 4;
}
}
/* Generate codewords*/
for (k = 0; k < num_symbols; k++) {
numbits = codewordlens[k];
codewords[k] = 0x01 << (numbits - 1);
}
}
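/* Worked example (added note): for the first 8 symbols the loops above yield
   lengths   {1, 2, 3, 4, 4, 5, 6, 7} and
   codewords {0x01, 0x02, 0x04, 0x08, 0x08, 0x10, 0x20, 0x40},
   i.e. each codeword is a single set bit at its most significant position;
   the same pattern repeats for every further group of 8 symbols. */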
inline void generateData(unsigned int *data, unsigned int num_elements,
unsigned int *codewords, unsigned int *codewordlens,
unsigned int num_symbols) {
unsigned int i;
for (i = 0; i < num_elements; i++) {
    // use a float divisor to avoid signed overflow when RAND_MAX == INT_MAX
    data[i] =
        (unsigned int)(((float)rand() / ((float)RAND_MAX + 1.0f)) * num_symbols);
}
}
#endif

View File

@ -1,160 +0,0 @@
#ifndef _VLC_SM64HUFF_KERNEL_H_
#define _VLC_SM64HUFF_KERNEL_H_
#include "pabio_kernels_v2.cu"
#include "parameters.h"
#include <cstdio>
#ifdef SMATOMICS
/* HUFFMAN-FRIENDLY PAVLE
   CHARACTERISTICS:
   1. CACHE CW_LUT INTO SM, LOADED AS 2 INT ARRAYS
   2. PARALLEL PREFIX SUM
   3. PARALLEL BIT I/O USING SHARED-MEMORY ATOMIC OPERATIONS (COMPATIBLE WITH
      CUDA 1.3+)
   NOTES & ASSUMPTIONS:
   - HUFFMAN-CODING FRIENDLY; SUPPORTS CODEWORDS OF 2X THE SIZE OF THE ORIGINAL
     SYMBOLS (BYTES).
   - NUMBER OF THREADS PER BLOCK IS 256; IF YOU WANT TO PLAY WITH DIFFERENT
     NUMBERS, THE CW CACHING SHOULD BE MODIFIED (SEE DPT* KERNELS).
   - SM USAGE: 1X SIZE OF THE INPUT DATA (REUSED) + SIZE OF CW_LUT.
   TURN ON CACHING FOR HIGH-ENTROPY DATA!
*/
__global__ static void
vlc_encode_kernel_sm64huff(unsigned int *data, const unsigned int *gm_codewords,
const unsigned int *gm_codewordlens,
#ifdef TESTING
unsigned int *cw32, unsigned int *cw32len,
unsigned int *cw32idx,
#endif
unsigned int *out, unsigned int *outidx) {
unsigned int kn = blockIdx.x * blockDim.x + threadIdx.x;
unsigned int k = threadIdx.x;
unsigned int kc, startbit, wrbits;
unsigned long long cw64 = 0;
unsigned int val32, codewordlen = 0;
unsigned char tmpbyte, tmpcwlen;
unsigned int tmpcw32;
__shared__ unsigned int sm[3072];
__shared__ unsigned int kcmax;
#ifdef CACHECWLUT
unsigned int *codewords = (unsigned int *)sm;
unsigned int *codewordlens = (unsigned int *)(sm + NUM_SYMBOLS);
unsigned int *as = (unsigned int *)(sm + 2 * NUM_SYMBOLS);
/* Load the codewords and the original data*/
codewords[k] = gm_codewords[k];
codewordlens[k] = gm_codewordlens[k];
val32 = data[kn];
__syncthreads();
for (unsigned int i = 0; i < 4; i++) {
tmpbyte = (unsigned char)(val32 >> ((3 - i) * 8));
tmpcw32 = codewords[tmpbyte];
tmpcwlen = codewordlens[tmpbyte];
cw64 = (cw64 << tmpcwlen) | tmpcw32;
codewordlen += tmpcwlen;
}
#else
unsigned int *as = (unsigned int *)sm;
val32 = data[kn];
for (unsigned int i = 0; i < 4; i++) {
tmpbyte = (unsigned char)(val32 >> ((3 - i) * 8));
tmpcw32 = gm_codewords[tmpbyte];
tmpcwlen = gm_codewordlens[tmpbyte];
cw64 = (cw64 << tmpcwlen) | tmpcw32;
codewordlen += tmpcwlen;
}
#endif
as[k] = codewordlen;
__syncthreads();
/* Prefix sum of codeword lengths (denoted in bits) [inplace implementation]
*/
unsigned int offset = 1;
/* Build the sum in place up the tree */
for (unsigned int d = (blockDim.x) >> 1; d > 0; d >>= 1) {
__syncthreads();
if (k < d) {
unsigned char ai = offset * (2 * k + 1) - 1;
unsigned char bi = offset * (2 * k + 2) - 1;
as[bi] += as[ai];
}
offset *= 2;
}
/* scan back down the tree */
/* clear the last element */
if (k == 0)
as[blockDim.x - 1] = 0;
// traverse down the tree building the scan in place
for (unsigned int d = 1; d < blockDim.x; d *= 2) {
offset >>= 1;
__syncthreads();
if (k < d) {
unsigned char ai = offset * (2 * k + 1) - 1;
unsigned char bi = offset * (2 * k + 2) - 1;
unsigned int t = as[ai];
as[ai] = as[bi];
as[bi] += t;
}
}
__syncthreads();
if (k == blockDim.x - 1) {
outidx[blockIdx.x] = as[k] + codewordlen;
kcmax = (as[k] + codewordlen) / 32;
// printf("kcmax: %d\n", kcmax);
}
/* Write the codes */
kc = as[k] / 32;
startbit = as[k] % 32;
as[k] = 0U;
__syncthreads();
/* Part 1*/
wrbits = codewordlen > (32 - startbit) ? (32 - startbit) : codewordlen;
tmpcw32 = (unsigned int)(cw64 >> (codewordlen - wrbits));
  // A special case for wrbits == 32 (as[kc] = tmpcw32) would only add
  // unnecessary branching overhead, so the atomicOr below is used uniformly.
  atomicOr(&as[kc],
           tmpcw32 << (32 - startbit - wrbits)); // shift left in case the
                                                 // codeword is shorter than
                                                 // the available space
codewordlen -= wrbits;
/*Part 2*/
if (codewordlen) {
wrbits = codewordlen > 32 ? 32 : codewordlen;
tmpcw32 =
(unsigned int)(cw64 >> (codewordlen - wrbits)) & ((1 << wrbits) - 1);
// if (wrbits == 32) as[kc+1] = tmpcw32;
// else
atomicOr(&as[kc + 1], tmpcw32 << (32 - wrbits));
codewordlen -= wrbits;
}
/*Part 3*/
if (codewordlen) {
tmpcw32 = (unsigned int)(cw64 & ((1 << codewordlen) - 1));
// if (wrbits == 32) as[kc+2] = tmpcw32;
// else
atomicOr(&as[kc + 2], tmpcw32 << (32 - codewordlen));
}
__syncthreads();
if (k <= kcmax)
out[kn] = as[k];
}
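/* Illustrative launch sketch (added; device pointer names are hypothetical and
   TESTING is assumed undefined): each block of 256 threads encodes 256 input
   words and writes its total bit count to d_outidx[blockIdx.x].

     vlc_encode_kernel_sm64huff<<<num_blocks, 256>>>(
         d_data, d_codewords, d_codewordlens, d_out, d_outidx);
     cudaThreadSynchronize();
*/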
//////////////////////////////////////////////////////////////////////////////
#endif
#endif

Some files were not shown because too many files have changed in this diff.