remove useless examples
This commit is contained in:
parent
49adfd026c
commit
9152feb24f
|
@ -1,454 +0,0 @@
|
|||
#include "backprop.h"
|
||||
#include <math.h>
|
||||
#include <stdio.h>
|
||||
#include <stdlib.h>
|
||||
|
||||
//#define OPEN
|
||||
|
||||
/* Absolute value of x. The argument is expanded (and evaluated) twice. */
#define ABS(x) ((x) > 0.0 ? (x) : -(x))
|
||||
|
||||
/*
 * Copy `len` bytes from `from` to `to`, one byte at a time in ascending
 * address order. Each argument is evaluated exactly once.
 *
 * The body is wrapped in do { ... } while (0) so that `fastcopy(a, b, n);`
 * is a single statement and can be used safely in an unbraced if/else.
 */
#define fastcopy(to, from, len)                                              \
  do {                                                                       \
    char *_to = (char *)(to);                                                \
    const char *_from = (const char *)(from);                                \
    int _l = (len);                                                          \
    int _i;                                                                  \
    for (_i = 0; _i < _l; _i++)                                              \
      *_to++ = *_from++;                                                     \
  } while (0)
|
||||
|
||||
/*** Return random number between 0.0 and 1.0 ***/
|
||||
float drnd(void) {
  /*
   * rand() yields values in [0, RAND_MAX]. Dividing by RAND_MAX (instead of
   * an unrelated fixed constant) keeps the result in [0.0, 1.0] regardless of
   * the platform's RAND_MAX.
   */
  return (float)rand() / (float)RAND_MAX;
}
|
||||
|
||||
/*** Return random number between -1.0 and 1.0 ***/
|
||||
float dpn1() {
  /* Take one sample from drnd() and map it linearly: v -> 2v - 1. */
  float unit = drnd();
  return ((unit * 2.0) - 1.0);
}
|
||||
|
||||
/*** The squashing function. Currently, it's a sigmoid. ***/
|
||||
|
||||
float squash(x)
|
||||
float x;
|
||||
{
|
||||
float m;
|
||||
// x = -x;
|
||||
// m = 1 + x + x*x/2 + x*x*x/6 + x*x*x*x/24 + x*x*x*x*x/120;
|
||||
// return(1.0 / (1.0 + m));
|
||||
return (1.0 / (1.0 + exp(-x)));
|
||||
}
|
||||
|
||||
/*** Allocate 1d array of floats ***/
|
||||
|
||||
/*
 * Allocate an uninitialized array of n floats.
 *
 * Returns a malloc'ed pointer (release it with free()), or NULL after
 * printing a diagnostic when n is negative or the allocation fails.
 */
float *alloc_1d_dbl(int n) {
  float *arr = NULL;

  /*
   * The byte count is computed in size_t. The previous (unsigned) cast
   * truncated requests larger than UINT_MAX bytes, silently returning a
   * buffer that was too small.
   */
  if (n >= 0) {
    arr = malloc((size_t)n * sizeof *arr);
  }
  if (arr == NULL) {
    printf("ALLOC_1D_DBL: Couldn't allocate array of floats\n");
    return NULL;
  }
  return arr;
}
|
||||
|
||||
/*** Allocate 2d array of floats ***/
|
||||
|
||||
/*
 * Allocate an m x n matrix of floats as an array of m row pointers, each row
 * obtained from alloc_1d_dbl(n). Contents are uninitialized.
 *
 * Returns the row-pointer array, or NULL if any allocation fails. On a row
 * failure every row allocated so far, and the row-pointer array itself, is
 * released so nothing leaks and callers never see a half-built matrix.
 */
float **alloc_2d_dbl(int m, int n) {
  float **rows = NULL;
  int i;

  /* size_t arithmetic: avoid truncating the byte count to unsigned int. */
  if (m >= 0) {
    rows = malloc((size_t)m * sizeof *rows);
  }
  if (rows == NULL) {
    printf("ALLOC_2D_DBL: Couldn't allocate array of dbl ptrs\n");
    return NULL;
  }

  for (i = 0; i < m; i++) {
    rows[i] = alloc_1d_dbl(n);
    if (rows[i] == NULL) {
      /* alloc_1d_dbl already printed a diagnostic; unwind and give up. */
      while (i-- > 0) {
        free(rows[i]);
      }
      free(rows);
      return NULL;
    }
  }

  return rows;
}
|
||||
|
||||
/*
 * Fill w[0..m][0..n] (both bounds inclusive, i.e. (m+1) x (n+1) entries)
 * with pseudo-random values rand() / RAND_MAX in [0.0, 1.0].
 *
 * Declared void: the function never returned a value, and the former
 * implicit-int return type is not valid C99 or later.
 */
void bpnn_randomize_weights(float **w, int m, int n) {
  int i, j;

  for (i = 0; i <= m; i++) {
    for (j = 0; j <= n; j++) {
      w[i][j] = (float)rand() / RAND_MAX;
    }
  }
}
|
||||
|
||||
/*
 * Set w[0..m] (inclusive, m + 1 entries) to the constant 0.1.
 * Despite the name, no random values are generated here.
 *
 * Declared void: the function never returned a value, and the former
 * implicit-int return type is not valid C99 or later.
 */
void bpnn_randomize_row(float *w, int m) {
  int i;

  for (i = 0; i <= m; i++) {
    w[i] = 0.1;
  }
}
|
||||
|
||||
/*
 * Set w[0..m][0..n] (both bounds inclusive, i.e. (m+1) x (n+1) entries)
 * to 0.0.
 *
 * Declared void: the function never returned a value, and the former
 * implicit-int return type is not valid C99 or later.
 */
void bpnn_zero_weights(float **w, int m, int n) {
  int i, j;

  for (i = 0; i <= m; i++) {
    for (j = 0; j <= n; j++) {
      w[i][j] = 0.0;
    }
  }
}
|
||||
|
||||
/*
 * Seed the C library pseudo-random generator and report the seed on stdout.
 * The parameter now has an explicit int type; an undeclared K&R parameter
 * relies on implicit int, which C99 removed.
 */
void bpnn_initialize(int seed) {
  printf("Random number generator seed: %d\n", seed);
  srand((unsigned int)seed);
}
|
||||
|
||||
BPNN *bpnn_internal_create(n_in, n_hidden, n_out)
|
||||
int n_in, n_hidden, n_out;
|
||||
{
|
||||
BPNN *newnet;
|
||||
|
||||
newnet = (BPNN *)malloc(sizeof(BPNN));
|
||||
if (newnet == NULL) {
|
||||
printf("BPNN_CREATE: Couldn't allocate neural network\n");
|
||||
return (NULL);
|
||||
}
|
||||
|
||||
newnet->input_n = n_in;
|
||||
newnet->hidden_n = n_hidden;
|
||||
newnet->output_n = n_out;
|
||||
newnet->input_units = alloc_1d_dbl(n_in + 1);
|
||||
newnet->hidden_units = alloc_1d_dbl(n_hidden + 1);
|
||||
newnet->output_units = alloc_1d_dbl(n_out + 1);
|
||||
|
||||
newnet->hidden_delta = alloc_1d_dbl(n_hidden + 1);
|
||||
newnet->output_delta = alloc_1d_dbl(n_out + 1);
|
||||
newnet->target = alloc_1d_dbl(n_out + 1);
|
||||
|
||||
newnet->input_weights = alloc_2d_dbl(n_in + 1, n_hidden + 1);
|
||||
newnet->hidden_weights = alloc_2d_dbl(n_hidden + 1, n_out + 1);
|
||||
|
||||
newnet->input_prev_weights = alloc_2d_dbl(n_in + 1, n_hidden + 1);
|
||||
newnet->hidden_prev_weights = alloc_2d_dbl(n_hidden + 1, n_out + 1);
|
||||
|
||||
return (newnet);
|
||||
}
|
||||
|
||||
void bpnn_free(net) BPNN *net;
|
||||
{
|
||||
int n1, n2, i;
|
||||
|
||||
n1 = net->input_n;
|
||||
n2 = net->hidden_n;
|
||||
|
||||
free((char *)net->input_units);
|
||||
free((char *)net->hidden_units);
|
||||
free((char *)net->output_units);
|
||||
|
||||
free((char *)net->hidden_delta);
|
||||
free((char *)net->output_delta);
|
||||
free((char *)net->target);
|
||||
|
||||
for (i = 0; i <= n1; i++) {
|
||||
free((char *)net->input_weights[i]);
|
||||
free((char *)net->input_prev_weights[i]);
|
||||
}
|
||||
free((char *)net->input_weights);
|
||||
free((char *)net->input_prev_weights);
|
||||
|
||||
for (i = 0; i <= n2; i++) {
|
||||
free((char *)net->hidden_weights[i]);
|
||||
free((char *)net->hidden_prev_weights[i]);
|
||||
}
|
||||
free((char *)net->hidden_weights);
|
||||
free((char *)net->hidden_prev_weights);
|
||||
|
||||
free((char *)net);
|
||||
}
|
||||
|
||||
/*** Creates a new fully-connected network from scratch,
|
||||
with the given numbers of input, hidden, and output units.
|
||||
Threshold units are automatically included. All weights are
|
||||
randomly initialized.
|
||||
Space is also allocated for temporary storage (momentum weights,
|
||||
error computations, etc).
|
||||
***/
|
||||
|
||||
BPNN *bpnn_create(n_in, n_hidden, n_out)
|
||||
int n_in, n_hidden, n_out;
|
||||
{
|
||||
|
||||
BPNN *newnet;
|
||||
|
||||
newnet = bpnn_internal_create(n_in, n_hidden, n_out);
|
||||
|
||||
#ifdef INITZERO
|
||||
bpnn_zero_weights(newnet->input_weights, n_in, n_hidden);
|
||||
#else
|
||||
bpnn_randomize_weights(newnet->input_weights, n_in, n_hidden);
|
||||
#endif
|
||||
bpnn_randomize_weights(newnet->hidden_weights, n_hidden, n_out);
|
||||
bpnn_zero_weights(newnet->input_prev_weights, n_in, n_hidden);
|
||||
bpnn_zero_weights(newnet->hidden_prev_weights, n_hidden, n_out);
|
||||
bpnn_randomize_row(newnet->target, n_out);
|
||||
return (newnet);
|
||||
}
|
||||
|
||||
void bpnn_layerforward(l1, l2, conn, n1, n2) float *l1, *l2, **conn;
int n1, n2;
{
  /*
   * Propagate activations from layer l1 (indices 0..n1) to layer l2
   * (indices 1..n2): l2[j] = squash(sum over k of conn[k][j] * l1[k]).
   */
  float sum;
  int j, k;

  /* Slot 0 of the source layer is the threshold unit, pinned to 1. */
  l1[0] = 1.0;
#ifdef OPEN
  omp_set_num_threads(NUM_THREAD);
#pragma omp parallel for shared(conn, n1, n2, l1) private(k, j) reduction(+: sum) schedule(static)
#endif
  for (j = 1; j <= n2; j++) {
    /* Accumulate the weighted input of destination unit j. */
    sum = 0.0;
    k = 0;
    while (k <= n1) {
      sum = sum + conn[k][j] * l1[k];
      k++;
    }
    l2[j] = squash(sum);
  }
}
|
||||
|
||||
// extern "C"
|
||||
void bpnn_output_error(delta, target, output, nj, err) float *delta, *target,
    *output, *err;
int nj;
{
  /*
   * For units 1..nj: delta[j] = o * (1 - o) * (t - o), where o = output[j]
   * and t = target[j]. The sum of |delta[j]| is stored through err.
   */
  float total = 0.0;
  int unit;

  for (unit = 1; unit <= nj; unit++) {
    const float o = output[unit];
    const float t = target[unit];

    delta[unit] = o * (1.0 - o) * (t - o);
    total += ABS(delta[unit]);
  }
  *err = total;
}
|
||||
|
||||
void bpnn_hidden_error(delta_h, nh, delta_o, no, who, hidden,
                       err) float *delta_h,
    *delta_o, *hidden, **who, *err;
int nh, no;
{
  /*
   * For hidden units 1..nh: back-propagate the output deltas through the
   * hidden-to-output weights who[j][1..no], scale by h * (1 - h) with
   * h = hidden[j], and store the result in delta_h[j]. The sum of
   * |delta_h[j]| is stored through err.
   */
  float total = 0.0;
  int hid_idx, out_idx;

  for (hid_idx = 1; hid_idx <= nh; hid_idx++) {
    const float h = hidden[hid_idx];
    float weighted = 0.0;

    for (out_idx = 1; out_idx <= no; out_idx++) {
      weighted += delta_o[out_idx] * who[hid_idx][out_idx];
    }
    delta_h[hid_idx] = h * (1.0 - h) * weighted;
    total += ABS(delta_h[hid_idx]);
  }
  *err = total;
}
|
||||
|
||||
void bpnn_adjust_weights(delta, ndelta, ly, nly, w, oldw) float *delta, *ly,
|
||||
**w, **oldw;
|
||||
{
|
||||
float new_dw;
|
||||
int k, j;
|
||||
ly[0] = 1.0;
|
||||
// eta = 0.3;
|
||||
// momentum = 0.3;
|
||||
|
||||
#ifdef OPEN
|
||||
omp_set_num_threads(NUM_THREAD);
|
||||
#pragma omp parallel for shared(oldw, w, delta) private(j, k, new_dw) \
|
||||
firstprivate(ndelta, nly, momentum)
|
||||
#endif
|
||||
for (j = 1; j <= ndelta; j++) {
|
||||
for (k = 0; k <= nly; k++) {
|
||||
new_dw = ((ETA * delta[j] * ly[k]) + (MOMENTUM * oldw[k][j]));
|
||||
w[k][j] += new_dw;
|
||||
oldw[k][j] = new_dw;
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
void bpnn_feedforward(net) BPNN *net;
|
||||
{
|
||||
int in, hid, out;
|
||||
|
||||
in = net->input_n;
|
||||
hid = net->hidden_n;
|
||||
out = net->output_n;
|
||||
|
||||
/*** Feed forward input activations. ***/
|
||||
bpnn_layerforward(net->input_units, net->hidden_units, net->input_weights, in,
|
||||
hid);
|
||||
bpnn_layerforward(net->hidden_units, net->output_units, net->hidden_weights,
|
||||
hid, out);
|
||||
}
|
||||
|
||||
void bpnn_train(net, eo, eh) BPNN *net;
|
||||
float *eo, *eh;
|
||||
{
|
||||
int in, hid, out;
|
||||
float out_err, hid_err;
|
||||
|
||||
in = net->input_n;
|
||||
hid = net->hidden_n;
|
||||
out = net->output_n;
|
||||
|
||||
/*** Feed forward input activations. ***/
|
||||
bpnn_layerforward(net->input_units, net->hidden_units, net->input_weights, in,
|
||||
hid);
|
||||
bpnn_layerforward(net->hidden_units, net->output_units, net->hidden_weights,
|
||||
hid, out);
|
||||
|
||||
/*** Compute error on output and hidden units. ***/
|
||||
bpnn_output_error(net->output_delta, net->target, net->output_units, out,
|
||||
&out_err);
|
||||
bpnn_hidden_error(net->hidden_delta, hid, net->output_delta, out,
|
||||
net->hidden_weights, net->hidden_units, &hid_err);
|
||||
*eo = out_err;
|
||||
*eh = hid_err;
|
||||
|
||||
/*** Adjust input and hidden weights. ***/
|
||||
bpnn_adjust_weights(net->output_delta, out, net->hidden_units, hid,
|
||||
net->hidden_weights, net->hidden_prev_weights);
|
||||
bpnn_adjust_weights(net->hidden_delta, hid, net->input_units, in,
|
||||
net->input_weights, net->input_prev_weights);
|
||||
}
|
||||
|
||||
/*
 * Write the network to `filename` in the binary layout bpnn_read() expects:
 *   int input_n, int hidden_n, int output_n,
 *   (input_n + 1) * (hidden_n + 1) floats of input_weights, row by row,
 *   (hidden_n + 1) * (output_n + 1) floats of hidden_weights, row by row.
 *
 * Fixes relative to the previous version:
 *   - each dimension was written as a single byte (sizeof(char)) although
 *     the reader consumes sizeof(int) bytes;
 *   - the weight fwrite() calls passed a byte count as the element count
 *     with an element size of sizeof(float), reading four times past the
 *     end of the staging buffer;
 *   - fopen() failure was not detected;
 *   - the file is opened in binary mode.
 * Rows are written straight from the weight matrices, so no staging buffer
 * is needed. Failures are reported on stdout; nothing is returned.
 */
void bpnn_save(BPNN *net, char *filename) {
  const int n1 = net->input_n;
  const int n2 = net->hidden_n;
  const int n3 = net->output_n;
  int i;
  int ok;
  FILE *pFile;

  pFile = fopen(filename, "wb");
  if (pFile == NULL) {
    printf("BPNN_SAVE: Cannot create '%s'\n", filename);
    return;
  }

  printf("Saving %dx%dx%d network to '%s'\n", n1, n2, n3, filename);

  ok = fwrite(&n1, sizeof n1, 1, pFile) == 1 &&
       fwrite(&n2, sizeof n2, 1, pFile) == 1 &&
       fwrite(&n3, sizeof n3, 1, pFile) == 1;

  for (i = 0; ok && i <= n1; i++) {
    ok = fwrite(net->input_weights[i], sizeof(float), (size_t)n2 + 1,
                pFile) == (size_t)n2 + 1;
  }
  for (i = 0; ok && i <= n2; i++) {
    ok = fwrite(net->hidden_weights[i], sizeof(float), (size_t)n3 + 1,
                pFile) == (size_t)n3 + 1;
  }

  /* fclose() flushes buffered data, so its result matters for a writer. */
  if (fclose(pFile) != 0) {
    ok = 0;
  }
  if (!ok) {
    printf("BPNN_SAVE: Error writing '%s'\n", filename);
  }
}
|
||||
|
||||
BPNN *bpnn_read(filename)
|
||||
char *filename;
|
||||
{
|
||||
char *mem;
|
||||
BPNN *new;
|
||||
int fd, n1, n2, n3, i, j, memcnt;
|
||||
|
||||
if ((fd = open(filename, 0, 0644)) == -1) {
|
||||
return (NULL);
|
||||
}
|
||||
|
||||
printf("Reading '%s'\n", filename); // fflush(stdout);
|
||||
|
||||
read(fd, (char *)&n1, sizeof(int));
|
||||
read(fd, (char *)&n2, sizeof(int));
|
||||
read(fd, (char *)&n3, sizeof(int));
|
||||
new = bpnn_internal_create(n1, n2, n3);
|
||||
|
||||
printf("'%s' contains a %dx%dx%d network\n", filename, n1, n2, n3);
|
||||
printf("Reading input weights..."); // fflush(stdout);
|
||||
|
||||
memcnt = 0;
|
||||
mem = (char *)malloc((unsigned)((n1 + 1) * (n2 + 1) * sizeof(float)));
|
||||
read(fd, mem, (n1 + 1) * (n2 + 1) * sizeof(float));
|
||||
for (i = 0; i <= n1; i++) {
|
||||
for (j = 0; j <= n2; j++) {
|
||||
fastcopy(&(new->input_weights[i][j]), &mem[memcnt], sizeof(float));
|
||||
memcnt += sizeof(float);
|
||||
}
|
||||
}
|
||||
free(mem);
|
||||
|
||||
printf("Done\nReading hidden weights..."); // fflush(stdout);
|
||||
|
||||
memcnt = 0;
|
||||
mem = (char *)malloc((unsigned)((n2 + 1) * (n3 + 1) * sizeof(float)));
|
||||
read(fd, mem, (n2 + 1) * (n3 + 1) * sizeof(float));
|
||||
for (i = 0; i <= n2; i++) {
|
||||
for (j = 0; j <= n3; j++) {
|
||||
fastcopy(&(new->hidden_weights[i][j]), &mem[memcnt], sizeof(float));
|
||||
memcnt += sizeof(float);
|
||||
}
|
||||
}
|
||||
free(mem);
|
||||
close(fd);
|
||||
|
||||
printf("Done\n"); // fflush(stdout);
|
||||
|
||||
bpnn_zero_weights(new->input_prev_weights, n1, n2);
|
||||
bpnn_zero_weights(new->hidden_prev_weights, n2, n3);
|
||||
|
||||
return (new);
|
||||
}
|
|
@ -1,50 +0,0 @@
|
|||
#ifndef _BACKPROP_H_
#define _BACKPROP_H_

/* Divisor used by drnd() when scaling rand(). */
#define BIGRND 0x7fffffff

#define GPU
#define THREADS 256
#define WIDTH 16  /* shared memory width */
#define HEIGHT 16 /* shared memory height */

#define ETA 0.3      /* learning rate used by bpnn_adjust_weights() */
#define MOMENTUM 0.3 /* momentum factor used by bpnn_adjust_weights() */
#define NUM_THREAD 4 /* OpenMP thread count (used when OPEN is defined) */

/*
 * Three-layer back-propagation network. All arrays are allocated with one
 * extra slot per dimension; slot 0 of each layer is the threshold unit.
 */
typedef struct {
  /* Unit counts per layer. */
  int input_n;
  int hidden_n;
  int output_n;

  /* Unit activations. */
  float *input_units;
  float *hidden_units;
  float *output_units;

  /* Per-unit error terms. */
  float *hidden_delta;
  float *output_delta;

  /* Target vector for the output layer. */
  float *target;

  /* Connection weights: input->hidden and hidden->output. */
  float **input_weights;
  float **hidden_weights;

  /* Previous weight changes, kept for the momentum term. */
  float **input_prev_weights;
  float **hidden_prev_weights;
} BPNN;

/* User-level functions (declared without prototypes, as defined in backprop.c). */

void bpnn_initialize();

BPNN *bpnn_create();
void bpnn_free();

void bpnn_train();
void bpnn_feedforward();

void bpnn_save();
BPNN *bpnn_read();

#endif
|
|
@ -1,615 +0,0 @@
|
|||
; ModuleID = 'backprop_cuda-cuda-nvptx64-nvidia-cuda-sm_61.bc'
; Device-side LLVM IR module produced from backprop_cuda.cu for the
; nvptx64-nvidia-cuda target (see source_filename / target triple below).
source_filename = "backprop_cuda.cu"
target datalayout = "e-i64:64-i128:128-v16:16-v32:32-n16:32:64"
target triple = "nvptx64-nvidia-cuda"

; Opaque one-byte placeholder types for the CUDA builtin index objects, and
; the layout of cudaFuncAttributes used by the runtime stub below.
%struct.__cuda_builtin_blockIdx_t = type { i8 }
%struct.__cuda_builtin_threadIdx_t = type { i8 }
%struct.cudaFuncAttributes = type { i64, i64, i64, i32, i32, i32, i32, i32, i32, i32 }

; COMDAT groups for the linkonce_odr builtin-index accessor functions.
$_ZN25__cuda_builtin_blockIdx_t17__fetch_builtin_yEv = comdat any

$_ZN26__cuda_builtin_threadIdx_t17__fetch_builtin_xEv = comdat any

$_ZN26__cuda_builtin_threadIdx_t17__fetch_builtin_yEv = comdat any

; blockIdx / threadIdx are external weak symbols in addrspace(1).
@blockIdx = extern_weak dso_local addrspace(1) global %struct.__cuda_builtin_blockIdx_t, align 1
@threadIdx = extern_weak dso_local addrspace(1) global %struct.__cuda_builtin_threadIdx_t, align 1
; Function-local statics of bpnn_layerforward_CUDA: a 16-float vector and a
; 16x16 float tile, both in addrspace(3) (presumably CUDA __shared__ memory
; on NVPTX - confirm against the .cu source).
@_ZZ22bpnn_layerforward_CUDAPfS_S_S_iiE10input_node = internal addrspace(3) global [16 x float] undef, align 4
@_ZZ22bpnn_layerforward_CUDAPfS_S_S_iiE13weight_matrix = internal addrspace(3) global [16 x [16 x float]] undef, align 4
@"$str" = private addrspace(1) constant [11 x i8] c"__CUDA_FTZ\00"
|
||||
|
||||
; Function Attrs: convergent noinline nounwind optnone
|
||||
; Weak device-side stub for cudaMalloc: spills both arguments to stack slots
; and unconditionally returns the constant 999. No allocation is performed.
define weak dso_local i32 @cudaMalloc(i8** %p, i64 %s) #0 {
entry:
  %p.addr = alloca i8**, align 8
  %s.addr = alloca i64, align 8
  store i8** %p, i8*** %p.addr, align 8
  store i64 %s, i64* %s.addr, align 8
  ret i32 999
}
|
||||
|
||||
; Function Attrs: convergent noinline nounwind optnone
|
||||
; Weak device-side stub for cudaFuncGetAttributes: spills its arguments and
; returns the constant 999. The attribute structure is never written.
define weak dso_local i32 @cudaFuncGetAttributes(%struct.cudaFuncAttributes* %p, i8* %c) #0 {
entry:
  %p.addr = alloca %struct.cudaFuncAttributes*, align 8
  %c.addr = alloca i8*, align 8
  store %struct.cudaFuncAttributes* %p, %struct.cudaFuncAttributes** %p.addr, align 8
  store i8* %c, i8** %c.addr, align 8
  ret i32 999
}
|
||||
|
||||
; Function Attrs: convergent noinline nounwind optnone
|
||||
; Weak device-side stub for cudaDeviceGetAttribute: spills its arguments and
; returns the constant 999. Nothing is stored through %value.
define weak dso_local i32 @cudaDeviceGetAttribute(i32* %value, i32 %attr, i32 %device) #0 {
entry:
  %value.addr = alloca i32*, align 8
  %attr.addr = alloca i32, align 4
  %device.addr = alloca i32, align 4
  store i32* %value, i32** %value.addr, align 8
  store i32 %attr, i32* %attr.addr, align 4
  store i32 %device, i32* %device.addr, align 4
  ret i32 999
}
|
||||
|
||||
; Function Attrs: convergent noinline nounwind optnone
|
||||
; Weak device-side stub for cudaGetDevice: spills its argument and returns
; the constant 999. Nothing is stored through %device.
define weak dso_local i32 @cudaGetDevice(i32* %device) #0 {
entry:
  %device.addr = alloca i32*, align 8
  store i32* %device, i32** %device.addr, align 8
  ret i32 999
}
|
||||
|
||||
; Function Attrs: convergent noinline nounwind optnone
|
||||
; Weak device-side stub for cudaOccupancyMaxActiveBlocksPerMultiprocessor:
; spills its four arguments and returns the constant 999. Nothing is stored
; through %numBlocks.
define weak dso_local i32 @cudaOccupancyMaxActiveBlocksPerMultiprocessor(i32* %numBlocks, i8* %func, i32 %blockSize, i64 %dynamicSmemSize) #0 {
entry:
  %numBlocks.addr = alloca i32*, align 8
  %func.addr = alloca i8*, align 8
  %blockSize.addr = alloca i32, align 4
  %dynamicSmemSize.addr = alloca i64, align 8
  store i32* %numBlocks, i32** %numBlocks.addr, align 8
  store i8* %func, i8** %func.addr, align 8
  store i32 %blockSize, i32* %blockSize.addr, align 4
  store i64 %dynamicSmemSize, i64* %dynamicSmemSize.addr, align 8
  ret i32 999
}
|
||||
|
||||
; Function Attrs: convergent noinline nounwind optnone
|
||||
; Weak device-side stub for
; cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags: spills its five
; arguments and returns the constant 999. Nothing is stored through
; %numBlocks.
define weak dso_local i32 @cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags(i32* %numBlocks, i8* %func, i32 %blockSize, i64 %dynamicSmemSize, i32 %flags) #0 {
entry:
  %numBlocks.addr = alloca i32*, align 8
  %func.addr = alloca i8*, align 8
  %blockSize.addr = alloca i32, align 4
  %dynamicSmemSize.addr = alloca i64, align 8
  %flags.addr = alloca i32, align 4
  store i32* %numBlocks, i32** %numBlocks.addr, align 8
  store i8* %func, i8** %func.addr, align 8
  store i32 %blockSize, i32* %blockSize.addr, align 4
  store i64 %dynamicSmemSize, i64* %dynamicSmemSize.addr, align 8
  store i32 %flags, i32* %flags.addr, align 4
  ret i32 999
}
|
||||
|
||||
; Function Attrs: convergent noinline nounwind optnone
|
||||
; bpnn_layerforward_CUDA(input_cuda, output_hidden_cuda, input_hidden_cuda,
;                        hidden_partial_sum, in, hid)
;
; Unoptimized (optnone) kernel body. With by = ctaid.y, tx = tid.x,
; ty = tid.y it:
;   1. computes index    = (hid+1)*16*by + (hid+1)*ty + tx + 1 + (hid+1)
;               index_in = 16*by + ty + 1
;   2. tx == 0 threads load input_cuda[index_in] into input_node[ty]
;   3. loads input_hidden_cuda[index] into weight_matrix[ty][tx] and
;      multiplies it by input_node[ty]
;   4. for i = 1 .. log2(16): rows with ty % 2^i == 0 add in row ty + 2^i / 2
;   5. writes weight_matrix[ty][tx] back to input_hidden_cuda[index]
;   6. tx == 0 threads store weight_matrix[tx][ty] to
;      hidden_partial_sum[by*hid + ty]
; llvm.nvvm.barrier0 is called between the phases. %output_hidden_cuda and
; %in are spilled but never read afterwards.
define dso_local void @_Z22bpnn_layerforward_CUDAPfS_S_S_ii(float* %input_cuda, float* %output_hidden_cuda, float* %input_hidden_cuda, float* %hidden_partial_sum, i32 %in, i32 %hid) #0 {
entry:
  ; Stack slots for the parameters and locals.
  %input_cuda.addr = alloca float*, align 8
  %output_hidden_cuda.addr = alloca float*, align 8
  %input_hidden_cuda.addr = alloca float*, align 8
  %hidden_partial_sum.addr = alloca float*, align 8
  %in.addr = alloca i32, align 4
  %hid.addr = alloca i32, align 4
  %by = alloca i32, align 4
  %tx = alloca i32, align 4
  %ty = alloca i32, align 4
  %index = alloca i32, align 4
  %index_in = alloca i32, align 4
  %i = alloca i32, align 4
  %power_two = alloca i32, align 4
  store float* %input_cuda, float** %input_cuda.addr, align 8
  store float* %output_hidden_cuda, float** %output_hidden_cuda.addr, align 8
  store float* %input_hidden_cuda, float** %input_hidden_cuda.addr, align 8
  store float* %hidden_partial_sum, float** %hidden_partial_sum.addr, align 8
  store i32 %in, i32* %in.addr, align 4
  store i32 %hid, i32* %hid.addr, align 4
  ; by = blockIdx.y, tx = threadIdx.x, ty = threadIdx.y
  %call = call i32 @_ZN25__cuda_builtin_blockIdx_t17__fetch_builtin_yEv() #2
  store i32 %call, i32* %by, align 4
  %call1 = call i32 @_ZN26__cuda_builtin_threadIdx_t17__fetch_builtin_xEv() #2
  store i32 %call1, i32* %tx, align 4
  %call2 = call i32 @_ZN26__cuda_builtin_threadIdx_t17__fetch_builtin_yEv() #2
  store i32 %call2, i32* %ty, align 4
  ; index = (hid+1)*16*by + (hid+1)*ty + tx + 1 + (hid+1)
  %0 = load i32, i32* %hid.addr, align 4
  %add = add nsw i32 %0, 1
  %mul = mul nsw i32 %add, 16
  %1 = load i32, i32* %by, align 4
  %mul3 = mul nsw i32 %mul, %1
  %2 = load i32, i32* %hid.addr, align 4
  %add4 = add nsw i32 %2, 1
  %3 = load i32, i32* %ty, align 4
  %mul5 = mul nsw i32 %add4, %3
  %add6 = add nsw i32 %mul3, %mul5
  %4 = load i32, i32* %tx, align 4
  %add7 = add nsw i32 %add6, %4
  %add8 = add nsw i32 %add7, 1
  %5 = load i32, i32* %hid.addr, align 4
  %add9 = add nsw i32 %5, 1
  %add10 = add nsw i32 %add8, %add9
  store i32 %add10, i32* %index, align 4
  ; index_in = 16*by + ty + 1
  %6 = load i32, i32* %by, align 4
  %mul11 = mul nsw i32 16, %6
  %7 = load i32, i32* %ty, align 4
  %add12 = add nsw i32 %mul11, %7
  %add13 = add nsw i32 %add12, 1
  store i32 %add13, i32* %index_in, align 4
  ; if (tx == 0)
  %8 = load i32, i32* %tx, align 4
  %cmp = icmp eq i32 %8, 0
  br i1 %cmp, label %if.then, label %if.end

if.then: ; preds = %entry
  ; input_node[ty] = input_cuda[index_in]
  %9 = load float*, float** %input_cuda.addr, align 8
  %10 = load i32, i32* %index_in, align 4
  %idxprom = sext i32 %10 to i64
  %arrayidx = getelementptr inbounds float, float* %9, i64 %idxprom
  %11 = load float, float* %arrayidx, align 4
  %12 = load i32, i32* %ty, align 4
  %idxprom14 = sext i32 %12 to i64
  %arrayidx15 = getelementptr inbounds [16 x float], [16 x float]* addrspacecast ([16 x float] addrspace(3)* @_ZZ22bpnn_layerforward_CUDAPfS_S_S_iiE10input_node to [16 x float]*), i64 0, i64 %idxprom14
  store float %11, float* %arrayidx15, align 4
  br label %if.end

if.end: ; preds = %if.then, %entry
  call void @llvm.nvvm.barrier0()
  ; weight_matrix[ty][tx] = input_hidden_cuda[index]
  %13 = load float*, float** %input_hidden_cuda.addr, align 8
  %14 = load i32, i32* %index, align 4
  %idxprom16 = sext i32 %14 to i64
  %arrayidx17 = getelementptr inbounds float, float* %13, i64 %idxprom16
  %15 = load float, float* %arrayidx17, align 4
  %16 = load i32, i32* %ty, align 4
  %idxprom18 = sext i32 %16 to i64
  %arrayidx19 = getelementptr inbounds [16 x [16 x float]], [16 x [16 x float]]* addrspacecast ([16 x [16 x float]] addrspace(3)* @_ZZ22bpnn_layerforward_CUDAPfS_S_S_iiE13weight_matrix to [16 x [16 x float]]*), i64 0, i64 %idxprom18
  %17 = load i32, i32* %tx, align 4
  %idxprom20 = sext i32 %17 to i64
  %arrayidx21 = getelementptr inbounds [16 x float], [16 x float]* %arrayidx19, i64 0, i64 %idxprom20
  store float %15, float* %arrayidx21, align 4
  call void @llvm.nvvm.barrier0()
  ; weight_matrix[ty][tx] = weight_matrix[ty][tx] * input_node[ty]
  %18 = load i32, i32* %ty, align 4
  %idxprom22 = sext i32 %18 to i64
  %arrayidx23 = getelementptr inbounds [16 x [16 x float]], [16 x [16 x float]]* addrspacecast ([16 x [16 x float]] addrspace(3)* @_ZZ22bpnn_layerforward_CUDAPfS_S_S_iiE13weight_matrix to [16 x [16 x float]]*), i64 0, i64 %idxprom22
  %19 = load i32, i32* %tx, align 4
  %idxprom24 = sext i32 %19 to i64
  %arrayidx25 = getelementptr inbounds [16 x float], [16 x float]* %arrayidx23, i64 0, i64 %idxprom24
  %20 = load float, float* %arrayidx25, align 4
  %21 = load i32, i32* %ty, align 4
  %idxprom26 = sext i32 %21 to i64
  %arrayidx27 = getelementptr inbounds [16 x float], [16 x float]* addrspacecast ([16 x float] addrspace(3)* @_ZZ22bpnn_layerforward_CUDAPfS_S_S_iiE10input_node to [16 x float]*), i64 0, i64 %idxprom26
  %22 = load float, float* %arrayidx27, align 4
  %mul28 = fmul contract float %20, %22
  %23 = load i32, i32* %ty, align 4
  %idxprom29 = sext i32 %23 to i64
  %arrayidx30 = getelementptr inbounds [16 x [16 x float]], [16 x [16 x float]]* addrspacecast ([16 x [16 x float]] addrspace(3)* @_ZZ22bpnn_layerforward_CUDAPfS_S_S_iiE13weight_matrix to [16 x [16 x float]]*), i64 0, i64 %idxprom29
  %24 = load i32, i32* %tx, align 4
  %idxprom31 = sext i32 %24 to i64
  %arrayidx32 = getelementptr inbounds [16 x float], [16 x float]* %arrayidx30, i64 0, i64 %idxprom31
  store float %mul28, float* %arrayidx32, align 4
  call void @llvm.nvvm.barrier0()
  ; i = 1
  store i32 1, i32* %i, align 4
  br label %for.cond

for.cond: ; preds = %for.inc, %if.end
  ; loop while (float)i <= __log2f(16.0)
  %25 = load i32, i32* %i, align 4
  %conv = sitofp i32 %25 to float
  %call33 = call float @_ZL7__log2ff(float 1.600000e+01) #2
  %cmp34 = fcmp ole float %conv, %call33
  br i1 %cmp34, label %for.body, label %for.end

for.body: ; preds = %for.cond
  ; power_two = (int)__powf(2.0, i); if (ty % power_two == 0)
  %26 = load i32, i32* %i, align 4
  %conv35 = sitofp i32 %26 to float
  %call36 = call float @_ZL6__powfff(float 2.000000e+00, float %conv35) #2
  %conv37 = fptosi float %call36 to i32
  store i32 %conv37, i32* %power_two, align 4
  %27 = load i32, i32* %ty, align 4
  %28 = load i32, i32* %power_two, align 4
  %rem = srem i32 %27, %28
  %cmp38 = icmp eq i32 %rem, 0
  br i1 %cmp38, label %if.then39, label %if.end54

if.then39: ; preds = %for.body
  ; weight_matrix[ty][tx] += weight_matrix[ty + power_two / 2][tx]
  %29 = load i32, i32* %ty, align 4
  %idxprom40 = sext i32 %29 to i64
  %arrayidx41 = getelementptr inbounds [16 x [16 x float]], [16 x [16 x float]]* addrspacecast ([16 x [16 x float]] addrspace(3)* @_ZZ22bpnn_layerforward_CUDAPfS_S_S_iiE13weight_matrix to [16 x [16 x float]]*), i64 0, i64 %idxprom40
  %30 = load i32, i32* %tx, align 4
  %idxprom42 = sext i32 %30 to i64
  %arrayidx43 = getelementptr inbounds [16 x float], [16 x float]* %arrayidx41, i64 0, i64 %idxprom42
  %31 = load float, float* %arrayidx43, align 4
  %32 = load i32, i32* %ty, align 4
  %33 = load i32, i32* %power_two, align 4
  %div = sdiv i32 %33, 2
  %add44 = add nsw i32 %32, %div
  %idxprom45 = sext i32 %add44 to i64
  %arrayidx46 = getelementptr inbounds [16 x [16 x float]], [16 x [16 x float]]* addrspacecast ([16 x [16 x float]] addrspace(3)* @_ZZ22bpnn_layerforward_CUDAPfS_S_S_iiE13weight_matrix to [16 x [16 x float]]*), i64 0, i64 %idxprom45
  %34 = load i32, i32* %tx, align 4
  %idxprom47 = sext i32 %34 to i64
  %arrayidx48 = getelementptr inbounds [16 x float], [16 x float]* %arrayidx46, i64 0, i64 %idxprom47
  %35 = load float, float* %arrayidx48, align 4
  %add49 = fadd contract float %31, %35
  %36 = load i32, i32* %ty, align 4
  %idxprom50 = sext i32 %36 to i64
  %arrayidx51 = getelementptr inbounds [16 x [16 x float]], [16 x [16 x float]]* addrspacecast ([16 x [16 x float]] addrspace(3)* @_ZZ22bpnn_layerforward_CUDAPfS_S_S_iiE13weight_matrix to [16 x [16 x float]]*), i64 0, i64 %idxprom50
  %37 = load i32, i32* %tx, align 4
  %idxprom52 = sext i32 %37 to i64
  %arrayidx53 = getelementptr inbounds [16 x float], [16 x float]* %arrayidx51, i64 0, i64 %idxprom52
  store float %add49, float* %arrayidx53, align 4
  br label %if.end54

if.end54: ; preds = %if.then39, %for.body
  ; Barrier executed by every thread on each loop iteration.
  call void @llvm.nvvm.barrier0()
  br label %for.inc

for.inc: ; preds = %if.end54
  ; i++
  %38 = load i32, i32* %i, align 4
  %inc = add nsw i32 %38, 1
  store i32 %inc, i32* %i, align 4
  br label %for.cond

for.end: ; preds = %for.cond
  ; input_hidden_cuda[index] = weight_matrix[ty][tx]
  %39 = load i32, i32* %ty, align 4
  %idxprom55 = sext i32 %39 to i64
  %arrayidx56 = getelementptr inbounds [16 x [16 x float]], [16 x [16 x float]]* addrspacecast ([16 x [16 x float]] addrspace(3)* @_ZZ22bpnn_layerforward_CUDAPfS_S_S_iiE13weight_matrix to [16 x [16 x float]]*), i64 0, i64 %idxprom55
  %40 = load i32, i32* %tx, align 4
  %idxprom57 = sext i32 %40 to i64
  %arrayidx58 = getelementptr inbounds [16 x float], [16 x float]* %arrayidx56, i64 0, i64 %idxprom57
  %41 = load float, float* %arrayidx58, align 4
  %42 = load float*, float** %input_hidden_cuda.addr, align 8
  %43 = load i32, i32* %index, align 4
  %idxprom59 = sext i32 %43 to i64
  %arrayidx60 = getelementptr inbounds float, float* %42, i64 %idxprom59
  store float %41, float* %arrayidx60, align 4
  call void @llvm.nvvm.barrier0()
  ; if (tx == 0)
  %44 = load i32, i32* %tx, align 4
  %cmp61 = icmp eq i32 %44, 0
  br i1 %cmp61, label %if.then62, label %if.end71

if.then62: ; preds = %for.end
  ; hidden_partial_sum[by * hid + ty] = weight_matrix[tx][ty]
  %45 = load i32, i32* %tx, align 4
  %idxprom63 = sext i32 %45 to i64
  %arrayidx64 = getelementptr inbounds [16 x [16 x float]], [16 x [16 x float]]* addrspacecast ([16 x [16 x float]] addrspace(3)* @_ZZ22bpnn_layerforward_CUDAPfS_S_S_iiE13weight_matrix to [16 x [16 x float]]*), i64 0, i64 %idxprom63
  %46 = load i32, i32* %ty, align 4
  %idxprom65 = sext i32 %46 to i64
  %arrayidx66 = getelementptr inbounds [16 x float], [16 x float]* %arrayidx64, i64 0, i64 %idxprom65
  %47 = load float, float* %arrayidx66, align 4
  %48 = load float*, float** %hidden_partial_sum.addr, align 8
  %49 = load i32, i32* %by, align 4
  %50 = load i32, i32* %hid.addr, align 4
  %mul67 = mul nsw i32 %49, %50
  %51 = load i32, i32* %ty, align 4
  %add68 = add nsw i32 %mul67, %51
  %idxprom69 = sext i32 %add68 to i64
  %arrayidx70 = getelementptr inbounds float, float* %48, i64 %idxprom69
  store float %47, float* %arrayidx70, align 4
  br label %if.end71

if.end71: ; preds = %if.then62, %for.end
  ret void
}
|
||||
|
||||
; Function Attrs: alwaysinline convergent nounwind
|
||||
; Accessor for blockIdx.y: returns the value of the ctaid.y special register.
define linkonce_odr dso_local i32 @_ZN25__cuda_builtin_blockIdx_t17__fetch_builtin_yEv() #1 comdat align 2 {
entry:
  %0 = call i32 @llvm.nvvm.read.ptx.sreg.ctaid.y()
  ret i32 %0
}
|
||||
|
||||
; Function Attrs: alwaysinline convergent nounwind
|
||||
; Accessor for threadIdx.x: returns the value of the tid.x special register.
define linkonce_odr dso_local i32 @_ZN26__cuda_builtin_threadIdx_t17__fetch_builtin_xEv() #1 comdat align 2 {
entry:
  %0 = call i32 @llvm.nvvm.read.ptx.sreg.tid.x()
  ret i32 %0
}
|
||||
|
||||
; Function Attrs: alwaysinline convergent nounwind
|
||||
; Accessor for threadIdx.y: returns the value of the tid.y special register.
define linkonce_odr dso_local i32 @_ZN26__cuda_builtin_threadIdx_t17__fetch_builtin_yEv() #1 comdat align 2 {
entry:
  %0 = call i32 @llvm.nvvm.read.ptx.sreg.tid.y()
  ret i32 %0
}
|
||||
|
||||
; Function Attrs: convergent nounwind
|
||||
declare void @llvm.nvvm.barrier0() #2
|
||||
|
||||
; Function Attrs: alwaysinline convergent nounwind
|
||||
; Internal wrapper for __log2f: spills the argument, reloads it and forwards
; it to __nv_fast_log2f, returning that call's result.
define internal float @_ZL7__log2ff(float %__a) #1 {
entry:
  %__a.addr = alloca float, align 4
  store float %__a, float* %__a.addr, align 4
  %0 = load float, float* %__a.addr, align 4
  %call = call float @__nv_fast_log2f(float %0) #2
  ret float %call
}
|
||||
|
||||
; Function Attrs: alwaysinline convergent nounwind
|
||||
; Internal wrapper for __powf: spills both arguments, reloads them and
; forwards them (base %__a, exponent %__b) to __nv_fast_powf, returning that
; call's result.
define internal float @_ZL6__powfff(float %__a, float %__b) #1 {
entry:
  %__a.addr = alloca float, align 4
  %__b.addr = alloca float, align 4
  store float %__a, float* %__a.addr, align 4
  store float %__b, float* %__b.addr, align 4
  %0 = load float, float* %__a.addr, align 4
  %1 = load float, float* %__b.addr, align 4
  %call = call float @__nv_fast_powf(float %0, float %1) #2
  ret float %call
}
|
||||
|
||||
; Function Attrs: convergent noinline nounwind optnone
|
||||
; Unoptimized (optnone) IR for the kernel
; bpnn_adjust_weights_cuda(delta, hid, ly, in, w, oldw).
; Every thread updates one element of w/oldw at a flattened index; threads
; with ty == 0 and by == 0 additionally update w[index_x]/oldw[index_x].
; The two 3.000000e-01 constants are the scale factors applied to the
; delta term and to the previous-weight term respectively.
define dso_local void @_Z24bpnn_adjust_weights_cudaPfiS_iS_S_(float* %delta, i32 %hid, float* %ly, i32 %in, float* %w, float* %oldw) #0 {
|
||||
entry:
|
||||
%delta.addr = alloca float*, align 8
|
||||
%hid.addr = alloca i32, align 4
|
||||
%ly.addr = alloca float*, align 8
|
||||
%in.addr = alloca i32, align 4
|
||||
%w.addr = alloca float*, align 8
|
||||
%oldw.addr = alloca float*, align 8
|
||||
%by = alloca i32, align 4
|
||||
%tx = alloca i32, align 4
|
||||
%ty = alloca i32, align 4
|
||||
%index = alloca i32, align 4
|
||||
%index_y = alloca i32, align 4
|
||||
%index_x = alloca i32, align 4
|
||||
store float* %delta, float** %delta.addr, align 8
|
||||
store i32 %hid, i32* %hid.addr, align 4
|
||||
store float* %ly, float** %ly.addr, align 8
|
||||
store i32 %in, i32* %in.addr, align 4
|
||||
store float* %w, float** %w.addr, align 8
|
||||
store float* %oldw, float** %oldw.addr, align 8
|
||||
; by = blockIdx.y, tx = threadIdx.x, ty = threadIdx.y
%call = call i32 @_ZN25__cuda_builtin_blockIdx_t17__fetch_builtin_yEv() #2
|
||||
store i32 %call, i32* %by, align 4
|
||||
%call1 = call i32 @_ZN26__cuda_builtin_threadIdx_t17__fetch_builtin_xEv() #2
|
||||
store i32 %call1, i32* %tx, align 4
|
||||
%call2 = call i32 @_ZN26__cuda_builtin_threadIdx_t17__fetch_builtin_yEv() #2
|
||||
store i32 %call2, i32* %ty, align 4
|
||||
; index = (hid + 1) * 16 * by + (hid + 1) * ty + tx + 1 + (hid + 1)
%0 = load i32, i32* %hid.addr, align 4
|
||||
%add = add nsw i32 %0, 1
|
||||
%mul = mul nsw i32 %add, 16
|
||||
%1 = load i32, i32* %by, align 4
|
||||
%mul3 = mul nsw i32 %mul, %1
|
||||
%2 = load i32, i32* %hid.addr, align 4
|
||||
%add4 = add nsw i32 %2, 1
|
||||
%3 = load i32, i32* %ty, align 4
|
||||
%mul5 = mul nsw i32 %add4, %3
|
||||
%add6 = add nsw i32 %mul3, %mul5
|
||||
%4 = load i32, i32* %tx, align 4
|
||||
%add7 = add nsw i32 %add6, %4
|
||||
%add8 = add nsw i32 %add7, 1
|
||||
%5 = load i32, i32* %hid.addr, align 4
|
||||
%add9 = add nsw i32 %5, 1
|
||||
%add10 = add nsw i32 %add8, %add9
|
||||
store i32 %add10, i32* %index, align 4
|
||||
; index_y = 16 * by + ty + 1
%6 = load i32, i32* %by, align 4
|
||||
%mul11 = mul nsw i32 16, %6
|
||||
%7 = load i32, i32* %ty, align 4
|
||||
%add12 = add nsw i32 %mul11, %7
|
||||
%add13 = add nsw i32 %add12, 1
|
||||
store i32 %add13, i32* %index_y, align 4
|
||||
; index_x = tx + 1
%8 = load i32, i32* %tx, align 4
|
||||
%add14 = add nsw i32 %8, 1
|
||||
store i32 %add14, i32* %index_x, align 4
|
||||
; w[index] += 0.3 * delta[index_x] * ly[index_y] + 0.3 * oldw[index]
; (evaluated in double, truncated back to float before the store)
%9 = load float*, float** %delta.addr, align 8
|
||||
%10 = load i32, i32* %index_x, align 4
|
||||
%idxprom = sext i32 %10 to i64
|
||||
%arrayidx = getelementptr inbounds float, float* %9, i64 %idxprom
|
||||
%11 = load float, float* %arrayidx, align 4
|
||||
%conv = fpext float %11 to double
|
||||
%mul15 = fmul contract double 3.000000e-01, %conv
|
||||
%12 = load float*, float** %ly.addr, align 8
|
||||
%13 = load i32, i32* %index_y, align 4
|
||||
%idxprom16 = sext i32 %13 to i64
|
||||
%arrayidx17 = getelementptr inbounds float, float* %12, i64 %idxprom16
|
||||
%14 = load float, float* %arrayidx17, align 4
|
||||
%conv18 = fpext float %14 to double
|
||||
%mul19 = fmul contract double %mul15, %conv18
|
||||
%15 = load float*, float** %oldw.addr, align 8
|
||||
%16 = load i32, i32* %index, align 4
|
||||
%idxprom20 = sext i32 %16 to i64
|
||||
%arrayidx21 = getelementptr inbounds float, float* %15, i64 %idxprom20
|
||||
%17 = load float, float* %arrayidx21, align 4
|
||||
%conv22 = fpext float %17 to double
|
||||
%mul23 = fmul contract double 3.000000e-01, %conv22
|
||||
%add24 = fadd contract double %mul19, %mul23
|
||||
%18 = load float*, float** %w.addr, align 8
|
||||
%19 = load i32, i32* %index, align 4
|
||||
%idxprom25 = sext i32 %19 to i64
|
||||
%arrayidx26 = getelementptr inbounds float, float* %18, i64 %idxprom25
|
||||
%20 = load float, float* %arrayidx26, align 4
|
||||
%conv27 = fpext float %20 to double
|
||||
%add28 = fadd contract double %conv27, %add24
|
||||
%conv29 = fptrunc double %add28 to float
|
||||
store float %conv29, float* %arrayidx26, align 4
|
||||
; oldw[index] = 0.3 * delta[index_x] * ly[index_y] + 0.3 * oldw[index]
; (the same term is recomputed from memory rather than reused)
%21 = load float*, float** %delta.addr, align 8
|
||||
%22 = load i32, i32* %index_x, align 4
|
||||
%idxprom30 = sext i32 %22 to i64
|
||||
%arrayidx31 = getelementptr inbounds float, float* %21, i64 %idxprom30
|
||||
%23 = load float, float* %arrayidx31, align 4
|
||||
%conv32 = fpext float %23 to double
|
||||
%mul33 = fmul contract double 3.000000e-01, %conv32
|
||||
%24 = load float*, float** %ly.addr, align 8
|
||||
%25 = load i32, i32* %index_y, align 4
|
||||
%idxprom34 = sext i32 %25 to i64
|
||||
%arrayidx35 = getelementptr inbounds float, float* %24, i64 %idxprom34
|
||||
%26 = load float, float* %arrayidx35, align 4
|
||||
%conv36 = fpext float %26 to double
|
||||
%mul37 = fmul contract double %mul33, %conv36
|
||||
%27 = load float*, float** %oldw.addr, align 8
|
||||
%28 = load i32, i32* %index, align 4
|
||||
%idxprom38 = sext i32 %28 to i64
|
||||
%arrayidx39 = getelementptr inbounds float, float* %27, i64 %idxprom38
|
||||
%29 = load float, float* %arrayidx39, align 4
|
||||
%conv40 = fpext float %29 to double
|
||||
%mul41 = fmul contract double 3.000000e-01, %conv40
|
||||
%add42 = fadd contract double %mul37, %mul41
|
||||
%conv43 = fptrunc double %add42 to float
|
||||
%30 = load float*, float** %oldw.addr, align 8
|
||||
%31 = load i32, i32* %index, align 4
|
||||
%idxprom44 = sext i32 %31 to i64
|
||||
%arrayidx45 = getelementptr inbounds float, float* %30, i64 %idxprom44
|
||||
store float %conv43, float* %arrayidx45, align 4
|
||||
; barrier call emitted before the conditional tail of the kernel
call void @llvm.nvvm.barrier0()
|
||||
; branch to if.then only when ty == 0 and by == 0
%32 = load i32, i32* %ty, align 4
|
||||
%cmp = icmp eq i32 %32, 0
|
||||
br i1 %cmp, label %land.lhs.true, label %if.end
|
||||
|
||||
land.lhs.true: ; preds = %entry
|
||||
%33 = load i32, i32* %by, align 4
|
||||
%cmp46 = icmp eq i32 %33, 0
|
||||
br i1 %cmp46, label %if.then, label %if.end
|
||||
|
||||
if.then: ; preds = %land.lhs.true
|
||||
; w[index_x] += 0.3 * delta[index_x] + 0.3 * oldw[index_x]
%34 = load float*, float** %delta.addr, align 8
|
||||
%35 = load i32, i32* %index_x, align 4
|
||||
%idxprom47 = sext i32 %35 to i64
|
||||
%arrayidx48 = getelementptr inbounds float, float* %34, i64 %idxprom47
|
||||
%36 = load float, float* %arrayidx48, align 4
|
||||
%conv49 = fpext float %36 to double
|
||||
%mul50 = fmul contract double 3.000000e-01, %conv49
|
||||
%37 = load float*, float** %oldw.addr, align 8
|
||||
%38 = load i32, i32* %index_x, align 4
|
||||
%idxprom51 = sext i32 %38 to i64
|
||||
%arrayidx52 = getelementptr inbounds float, float* %37, i64 %idxprom51
|
||||
%39 = load float, float* %arrayidx52, align 4
|
||||
%conv53 = fpext float %39 to double
|
||||
%mul54 = fmul contract double 3.000000e-01, %conv53
|
||||
%add55 = fadd contract double %mul50, %mul54
|
||||
%40 = load float*, float** %w.addr, align 8
|
||||
%41 = load i32, i32* %index_x, align 4
|
||||
%idxprom56 = sext i32 %41 to i64
|
||||
%arrayidx57 = getelementptr inbounds float, float* %40, i64 %idxprom56
|
||||
%42 = load float, float* %arrayidx57, align 4
|
||||
%conv58 = fpext float %42 to double
|
||||
%add59 = fadd contract double %conv58, %add55
|
||||
%conv60 = fptrunc double %add59 to float
|
||||
store float %conv60, float* %arrayidx57, align 4
|
||||
; oldw[index_x] = 0.3 * delta[index_x] + 0.3 * oldw[index_x]
%43 = load float*, float** %delta.addr, align 8
|
||||
%44 = load i32, i32* %index_x, align 4
|
||||
%idxprom61 = sext i32 %44 to i64
|
||||
%arrayidx62 = getelementptr inbounds float, float* %43, i64 %idxprom61
|
||||
%45 = load float, float* %arrayidx62, align 4
|
||||
%conv63 = fpext float %45 to double
|
||||
%mul64 = fmul contract double 3.000000e-01, %conv63
|
||||
%46 = load float*, float** %oldw.addr, align 8
|
||||
%47 = load i32, i32* %index_x, align 4
|
||||
%idxprom65 = sext i32 %47 to i64
|
||||
%arrayidx66 = getelementptr inbounds float, float* %46, i64 %idxprom65
|
||||
%48 = load float, float* %arrayidx66, align 4
|
||||
%conv67 = fpext float %48 to double
|
||||
%mul68 = fmul contract double 3.000000e-01, %conv67
|
||||
%add69 = fadd contract double %mul64, %mul68
|
||||
%conv70 = fptrunc double %add69 to float
|
||||
%49 = load float*, float** %oldw.addr, align 8
|
||||
%50 = load i32, i32* %index_x, align 4
|
||||
%idxprom71 = sext i32 %50 to i64
|
||||
%arrayidx72 = getelementptr inbounds float, float* %49, i64 %idxprom71
|
||||
store float %conv70, float* %arrayidx72, align 4
|
||||
br label %if.end
|
||||
|
||||
if.end: ; preds = %if.then, %land.lhs.true, %entry
|
||||
ret void
|
||||
}
|
||||
|
||||
; Function Attrs: nounwind readnone
|
||||
declare i32 @llvm.nvvm.read.ptx.sreg.ctaid.y() #3
|
||||
|
||||
; Function Attrs: nounwind readnone
|
||||
declare i32 @llvm.nvvm.read.ptx.sreg.tid.x() #3
|
||||
|
||||
; Function Attrs: nounwind readnone
|
||||
declare i32 @llvm.nvvm.read.ptx.sreg.tid.y() #3
|
||||
|
||||
; Function Attrs: alwaysinline convergent inlinehint nounwind
|
||||
; Approximate log2 of %a. The result of @__nvvm_reflect on the @"$str"
; string selects the variant: non-zero -> llvm.nvvm.lg2.approx.ftz.f,
; zero -> llvm.nvvm.lg2.approx.f. The chosen value is merged by the phi
; and returned.
; NOTE(review): @"$str" is defined outside this view; presumably it names
; the FTZ reflect setting (cf. the "nvvm-reflect-ftz" module flag) - confirm.
define internal float @__nv_fast_log2f(float %a) #4 {
|
||||
%call.i = call i32 @__nvvm_reflect(i8* addrspacecast (i8 addrspace(1)* getelementptr inbounds ([11 x i8], [11 x i8] addrspace(1)* @"$str", i32 0, i32 0) to i8*))
|
||||
%1 = icmp ne i32 %call.i, 0
|
||||
br i1 %1, label %2, label %4
|
||||
|
||||
2: ; preds = %0
|
||||
%3 = call float @llvm.nvvm.lg2.approx.ftz.f(float %a)
|
||||
br label %__nvvm_builtin_log2f.exit
|
||||
|
||||
4: ; preds = %0
|
||||
%5 = call float @llvm.nvvm.lg2.approx.f(float %a)
|
||||
br label %__nvvm_builtin_log2f.exit
|
||||
|
||||
__nvvm_builtin_log2f.exit: ; preds = %4, %2
|
||||
%retval.0.i = phi float [ %3, %2 ], [ %5, %4 ]
|
||||
ret float %retval.0.i
|
||||
}
|
||||
|
||||
; Function Attrs: convergent nounwind
|
||||
declare i32 @__nvvm_reflect(i8*) #5
|
||||
|
||||
; Function Attrs: nounwind readnone
|
||||
declare float @llvm.nvvm.lg2.approx.ftz.f(float) #3
|
||||
|
||||
; Function Attrs: nounwind readnone
|
||||
declare float @llvm.nvvm.lg2.approx.f(float) #3
|
||||
|
||||
; Function Attrs: alwaysinline convergent inlinehint nounwind
|
||||
; Approximate pow(%a, %b), computed as ex2(%b * lg2(%a)) using the
; approximate lg2/ex2 intrinsics. Each of the two steps queries
; @__nvvm_reflect on @"$str" and takes the .ftz variant when the result is
; non-zero, the plain variant otherwise.
; NOTE(review): @"$str" is defined outside this view; presumably it names
; the FTZ reflect setting - confirm.
define internal float @__nv_fast_powf(float %a, float %b) #4 {
|
||||
%call.i.i = call i32 @__nvvm_reflect(i8* addrspacecast (i8 addrspace(1)* getelementptr inbounds ([11 x i8], [11 x i8] addrspace(1)* @"$str", i32 0, i32 0) to i8*))
|
||||
%1 = icmp ne i32 %call.i.i, 0
|
||||
br i1 %1, label %2, label %4
|
||||
|
||||
2: ; preds = %0
|
||||
%3 = call float @llvm.nvvm.lg2.approx.ftz.f(float %a)
|
||||
br label %__nv_fast_log2f.exit
|
||||
|
||||
4: ; preds = %0
|
||||
%5 = call float @llvm.nvvm.lg2.approx.f(float %a)
|
||||
br label %__nv_fast_log2f.exit
|
||||
|
||||
__nv_fast_log2f.exit: ; preds = %4, %2
|
||||
%retval.0.i.i = phi float [ %3, %2 ], [ %5, %4 ]
|
||||
; scale the base-2 logarithm of %a by the exponent %b
%6 = fmul float %b, %retval.0.i.i
|
||||
%call.i.i1 = call i32 @__nvvm_reflect(i8* addrspacecast (i8 addrspace(1)* getelementptr inbounds ([11 x i8], [11 x i8] addrspace(1)* @"$str", i32 0, i32 0) to i8*))
|
||||
%7 = icmp ne i32 %call.i.i1, 0
|
||||
br i1 %7, label %8, label %10
|
||||
|
||||
8: ; preds = %__nv_fast_log2f.exit
|
||||
%9 = call float @llvm.nvvm.ex2.approx.ftz.f(float %6)
|
||||
br label %__nv_exp2f.exit
|
||||
|
||||
10: ; preds = %__nv_fast_log2f.exit
|
||||
%11 = call float @llvm.nvvm.ex2.approx.f(float %6)
|
||||
br label %__nv_exp2f.exit
|
||||
|
||||
__nv_exp2f.exit: ; preds = %10, %8
|
||||
%retval.0.i.i2 = phi float [ %9, %8 ], [ %11, %10 ]
|
||||
ret float %retval.0.i.i2
|
||||
}
|
||||
|
||||
; Function Attrs: nounwind readnone
|
||||
declare float @llvm.nvvm.ex2.approx.ftz.f(float) #3
|
||||
|
||||
; Function Attrs: nounwind readnone
|
||||
declare float @llvm.nvvm.ex2.approx.f(float) #3
|
||||
|
||||
attributes #0 = { convergent noinline nounwind optnone "correctly-rounded-divide-sqrt-fp-math"="false" "disable-tail-calls"="false" "frame-pointer"="all" "less-precise-fpmad"="false" "min-legal-vector-width"="0" "no-infs-fp-math"="false" "no-jump-tables"="false" "no-nans-fp-math"="false" "no-signed-zeros-fp-math"="false" "no-trapping-math"="false" "stack-protector-buffer-size"="8" "target-cpu"="sm_61" "target-features"="+ptx64,+sm_61" "unsafe-fp-math"="false" "use-soft-float"="false" }
|
||||
attributes #1 = { alwaysinline convergent nounwind "correctly-rounded-divide-sqrt-fp-math"="false" "disable-tail-calls"="false" "frame-pointer"="all" "less-precise-fpmad"="false" "min-legal-vector-width"="0" "no-infs-fp-math"="false" "no-jump-tables"="false" "no-nans-fp-math"="false" "no-signed-zeros-fp-math"="false" "no-trapping-math"="false" "stack-protector-buffer-size"="8" "target-cpu"="sm_61" "target-features"="+ptx64,+sm_61" "unsafe-fp-math"="false" "use-soft-float"="false" }
|
||||
attributes #2 = { convergent nounwind }
|
||||
attributes #3 = { nounwind readnone }
|
||||
attributes #4 = { alwaysinline convergent inlinehint nounwind "correctly-rounded-divide-sqrt-fp-math"="false" "frame-pointer"="all" "less-precise-fpmad"="false" "no-infs-fp-math"="false" "no-nans-fp-math"="false" "no-signed-zeros-fp-math"="false" "no-trapping-math"="false" "stack-protector-buffer-size"="8" "unsafe-fp-math"="false" "use-soft-float"="false" }
|
||||
attributes #5 = { convergent nounwind "correctly-rounded-divide-sqrt-fp-math"="false" "frame-pointer"="all" "less-precise-fpmad"="false" "no-infs-fp-math"="false" "no-nans-fp-math"="false" "no-signed-zeros-fp-math"="false" "no-trapping-math"="false" "stack-protector-buffer-size"="8" "unsafe-fp-math"="false" "use-soft-float"="false" }
|
||||
|
||||
!llvm.module.flags = !{!0, !1, !2}
|
||||
!nvvm.annotations = !{!3, !4, !5, !6, !5, !7, !7, !7, !7, !8, !8, !7}
|
||||
!llvm.ident = !{!9}
|
||||
!nvvmir.version = !{!10}
|
||||
|
||||
!0 = !{i32 2, !"SDK Version", [2 x i32] [i32 10, i32 1]}
|
||||
!1 = !{i32 1, !"wchar_size", i32 4}
|
||||
!2 = !{i32 4, !"nvvm-reflect-ftz", i32 0}
|
||||
!3 = !{void (float*, float*, float*, float*, i32, i32)* @_Z22bpnn_layerforward_CUDAPfS_S_S_ii, !"kernel", i32 1}
|
||||
!4 = !{void (float*, i32, float*, i32, float*, float*)* @_Z24bpnn_adjust_weights_cudaPfiS_iS_S_, !"kernel", i32 1}
|
||||
!5 = !{null, !"align", i32 8}
|
||||
!6 = !{null, !"align", i32 8, !"align", i32 65544, !"align", i32 131080}
|
||||
!7 = !{null, !"align", i32 16}
|
||||
!8 = !{null, !"align", i32 16, !"align", i32 65552, !"align", i32 131088}
|
||||
!9 = !{!"clang version 10.0.1 (https://github.com/llvm/llvm-project.git ef32c611aa214dea855364efd7ba451ec5ec3f74)"}
|
||||
!10 = !{i32 1, i32 4}
|
File diff suppressed because one or more lines are too long
|
@ -1,195 +0,0 @@
|
|||
#include <cuda.h>
|
||||
#include <math.h>
|
||||
#include <stdio.h>
|
||||
#include <stdlib.h>
|
||||
#include <string.h>
|
||||
#include <sys/time.h>
|
||||
|
||||
// includes, kernels
|
||||
#include "backprop.h"
|
||||
#include "backprop_cuda_kernel.cu"
|
||||
|
||||
////////////////////////////////////////////////////////////////////////////////
|
||||
|
||||
extern "C" void bpnn_layerforward(float *l1, float *l2, float **conn, int n1,
|
||||
int n2);
|
||||
|
||||
extern "C" void bpnn_output_error(float *delta, float *target, float *output,
|
||||
int nj, float *err);
|
||||
|
||||
extern "C" void bpnn_hidden_error(float *delta_h, int nh, float *delta_o,
|
||||
int no, float **who, float *hidden,
|
||||
float *err);
|
||||
|
||||
extern "C" void bpnn_adjust_weights(float *delta, int ndelta, float *ly,
|
||||
int nly, float **w, float **oldw);
|
||||
|
||||
extern "C" int setup(int argc, char **argv);
|
||||
|
||||
extern "C" float **alloc_2d_dbl(int m, int n);
|
||||
|
||||
extern "C" float squash(float x);
|
||||
|
||||
/* Return the current time of day in seconds, combining the seconds and
 * microseconds fields filled in by gettimeofday(). */
double gettime() {
  struct timeval now;
  gettimeofday(&now, NULL);

  const double whole_seconds = now.tv_sec;
  const double fraction = now.tv_usec * 1e-6;
  return whole_seconds + fraction;
}
|
||||
|
||||
unsigned int num_threads = 0;
|
||||
unsigned int num_blocks = 0;
|
||||
|
||||
////////////////////////////////////////////////////////////////////////////////
|
||||
// Program main
|
||||
////////////////////////////////////////////////////////////////////////////////
|
||||
/*
 * Program entry point: selects CUDA device 0 and hands the command line to
 * setup(), which parses it and runs the demo.
 *
 * Returns EXIT_FAILURE when the device cannot be selected; otherwise the
 * value returned by setup() is propagated as the exit status.
 */
int main(int argc, char **argv) {
  cudaError_t status = cudaSetDevice(0);
  if (status != cudaSuccess) {
    fprintf(stderr, "cudaSetDevice(0) failed: %s\n",
            cudaGetErrorString(status));
    return EXIT_FAILURE;
  }

  return setup(argc, argv);
}
|
||||
|
||||
/*
 * Run one training pass over `net`.
 *
 * With GPU defined, the input->hidden forward pass and the input-weight
 * adjustment run as CUDA kernels; with CPU defined they run through
 * bpnn_layerforward() / bpnn_adjust_weights(). The hidden->output forward
 * pass, both error computations and the hidden-weight adjustment always run
 * on the host.
 *
 * net  network to train; its unit, delta and weight arrays are updated.
 * eo   if non-NULL, receives the output-layer error.
 * eh   if non-NULL, receives the hidden-layer error.
 *
 * In the GPU path the adjusted input weights are copied back into a flat
 * host buffer and printed to stdout; they are not written back into
 * net->input_weights.
 */
extern "C" void bpnn_train_cuda(BPNN *net, float *eo, float *eh) {
  int in, hid, out;
  /* Zero-initialised so the values reported through eo/eh are defined even
   * if an error routine leaves its output untouched. */
  float out_err = 0.0f, hid_err = 0.0f;

  in = net->input_n;
  hid = net->hidden_n;
  out = net->output_n;

#ifdef GPU
  int m = 0;
  float *input_hidden_cuda;
  float *input_cuda;
  float *output_hidden_cuda;
  float *partial_sum;
  float *hidden_partial_sum;
  float *hidden_delta_cuda;
  float *input_prev_weights_cuda;
  float sum;
  float *input_weights_one_dim;
  float *input_weights_prev_one_dim;
  num_blocks = in / 16;
  dim3 grid(1, num_blocks);
  dim3 threads(16, 16);

  input_weights_one_dim = (float *)malloc((in + 1) * (hid + 1) * sizeof(float));
  input_weights_prev_one_dim =
      (float *)malloc((in + 1) * (hid + 1) * sizeof(float));
  partial_sum = (float *)malloc(num_blocks * WIDTH * sizeof(float));

  /* A zero-byte request may legitimately return NULL, so partial_sum only
   * counts as a failure when there is at least one block. */
  if (input_weights_one_dim == NULL || input_weights_prev_one_dim == NULL ||
      (num_blocks > 0 && partial_sum == NULL)) {
    fprintf(stderr, "bpnn_train_cuda: host memory allocation failed\n");
    exit(EXIT_FAILURE);
  }

  // this preprocessing stage is added to correct the bugs of wrong memcopy
  // using two-dimensional net->inputweights
  for (int k = 0; k <= in; k++) {
    for (int j = 0; j <= hid; j++) {
      input_weights_one_dim[m] = net->input_weights[k][j];
      input_weights_prev_one_dim[m] = net->input_prev_weights[k][j];
      m++;
    }
  }

  cudaMalloc((void **)&input_cuda, (in + 1) * sizeof(float));
  cudaMalloc((void **)&output_hidden_cuda, (hid + 1) * sizeof(float));
  cudaMalloc((void **)&input_hidden_cuda, (in + 1) * (hid + 1) * sizeof(float));
  cudaMalloc((void **)&hidden_partial_sum, num_blocks * WIDTH * sizeof(float));

#endif

#ifdef CPU

  printf("Performing CPU computation\n");
  bpnn_layerforward(net->input_units, net->hidden_units, net->input_weights, in,
                    hid);

#endif

#ifdef GPU

  printf("Performing GPU computation\n");

  // printf("in= %d, hid = %d, numblocks = %d\n", in, hid, num_blocks);

  cudaMemcpy(input_cuda, net->input_units, (in + 1) * sizeof(float),
             cudaMemcpyHostToDevice);
  cudaMemcpy(input_hidden_cuda, input_weights_one_dim,
             (in + 1) * (hid + 1) * sizeof(float), cudaMemcpyHostToDevice);

  bpnn_layerforward_CUDA<<<grid, threads>>>(input_cuda, output_hidden_cuda,
                                            input_hidden_cuda,
                                            hidden_partial_sum, in, hid);

  cudaThreadSynchronize();

  cudaError_t error = cudaGetLastError();
  if (error != cudaSuccess) {
    printf("bpnn kernel error: %s\n", cudaGetErrorString(error));
    exit(EXIT_FAILURE);
  }

  cudaMemcpy(partial_sum, hidden_partial_sum,
             num_blocks * WIDTH * sizeof(float), cudaMemcpyDeviceToHost);

  /* Finish the forward pass on the host: add up the per-block partial sums
   * for each hidden unit, add input_weights[0][j], apply the sigmoid. */
  for (int j = 1; j <= hid; j++) {
    sum = 0.0;
    for (int k = 0; k < num_blocks; k++) {
      sum += partial_sum[k * hid + j - 1];
    }
    sum += net->input_weights[0][j];
    net->hidden_units[j] = float(1.0 / (1.0 + exp(-sum)));
  }
#endif

  bpnn_layerforward(net->hidden_units, net->output_units, net->hidden_weights,
                    hid, out);
  bpnn_output_error(net->output_delta, net->target, net->output_units, out,
                    &out_err);
  bpnn_hidden_error(net->hidden_delta, hid, net->output_delta, out,
                    net->hidden_weights, net->hidden_units, &hid_err);
  bpnn_adjust_weights(net->output_delta, out, net->hidden_units, hid,
                      net->hidden_weights, net->hidden_prev_weights);

  /* Report the computed errors to the caller. */
  if (eo != NULL) {
    *eo = out_err;
  }
  if (eh != NULL) {
    *eh = hid_err;
  }

#ifdef CPU

  bpnn_adjust_weights(net->hidden_delta, hid, net->input_units, in,
                      net->input_weights, net->input_prev_weights);

#endif

#ifdef GPU

  cudaMalloc((void **)&hidden_delta_cuda, (hid + 1) * sizeof(float));
  cudaMalloc((void **)&input_prev_weights_cuda,
             (in + 1) * (hid + 1) * sizeof(float));

  cudaMemcpy(hidden_delta_cuda, net->hidden_delta, (hid + 1) * sizeof(float),
             cudaMemcpyHostToDevice);
  cudaMemcpy(input_prev_weights_cuda, input_weights_prev_one_dim,
             (in + 1) * (hid + 1) * sizeof(float), cudaMemcpyHostToDevice);
  /* Upload the original weights again: the forward kernel wrote its
   * intermediate values into input_hidden_cuda. */
  cudaMemcpy(input_hidden_cuda, input_weights_one_dim,
             (in + 1) * (hid + 1) * sizeof(float), cudaMemcpyHostToDevice);

  bpnn_adjust_weights_cuda<<<grid, threads>>>(hidden_delta_cuda, hid,
                                              input_cuda, in, input_hidden_cuda,
                                              input_prev_weights_cuda);

  cudaMemcpy(net->input_units, input_cuda, (in + 1) * sizeof(float),
             cudaMemcpyDeviceToHost);
  cudaMemcpy(input_weights_one_dim, input_hidden_cuda,
             (in + 1) * (hid + 1) * sizeof(float), cudaMemcpyDeviceToHost);

  /* Dump every adjusted input weight on one line of stdout. */
  for (int i = 0; i < (in + 1) * (hid + 1); i++) {
    printf("%f ", input_weights_one_dim[i]);
  }
  printf("\n");

  cudaFree(input_cuda);
  cudaFree(output_hidden_cuda);
  cudaFree(input_hidden_cuda);
  cudaFree(hidden_partial_sum);
  cudaFree(input_prev_weights_cuda);
  cudaFree(hidden_delta_cuda);

  free(partial_sum);
  free(input_weights_one_dim);
  free(input_weights_prev_one_dim);

#endif
}
|
|
@ -1,96 +0,0 @@
|
|||
#ifndef _BACKPROP_CUDA_KERNEL_H_
|
||||
#define _BACKPROP_CUDA_KERNEL_H_
|
||||
|
||||
#include "backprop.h"
|
||||
#include "cuda.h"
|
||||
#include "math.h"
|
||||
#include <stdio.h>
|
||||
|
||||
/*
 * Forward-pass kernel for the input->hidden layer.
 *
 * Each thread loads one weight into a HEIGHT x WIDTH shared tile and
 * multiplies it by the input value of its row. The tile is then summed down
 * its rows by pairwise reduction so that row 0 holds one partial sum per
 * column, and row 0 is written to hidden_partial_sum for this block.
 *
 * input_cuda          input unit values, read at HEIGHT * by + ty + 1.
 * output_hidden_cuda  not accessed by this kernel.
 * input_hidden_cuda   flattened weights with row stride (hid + 1); the
 *                     indexing skips row 0 and column 0. The per-thread
 *                     intermediate value is written back into this buffer.
 * hidden_partial_sum  receives the entries at by * hid + ty.
 * in                  not accessed by this kernel.
 * hid                 number of hidden units; sets the weight row stride.
 */
__global__ void bpnn_layerforward_CUDA(float *input_cuda,
                                       float *output_hidden_cuda,
                                       float *input_hidden_cuda,
                                       float *hidden_partial_sum, int in,
                                       int hid) {
  int by = blockIdx.y;
  int tx = threadIdx.x;
  int ty = threadIdx.y;

  int index = (hid + 1) * HEIGHT * by + (hid + 1) * ty + tx + 1 + (hid + 1);

  int index_in = HEIGHT * by + ty + 1;

  __shared__ float input_node[HEIGHT];
  __shared__ float weight_matrix[HEIGHT][WIDTH];

  /* One thread per row stages that row's input value. */
  if (tx == 0)
    input_node[ty] = input_cuda[index_in];

  __syncthreads();

  weight_matrix[ty][tx] = input_hidden_cuda[index];

  __syncthreads();

  weight_matrix[ty][tx] = weight_matrix[ty][tx] * input_node[ty];

  __syncthreads();

  /* Pairwise reduction over rows with strides 2, 4, ..., HEIGHT. The stride
   * is kept as an exact integer; deriving it from the approximate
   * __log2f/__powf intrinsics and truncating could yield a wrong stride. */
  for (int power_two = 2; power_two <= HEIGHT; power_two *= 2) {
    if (ty % power_two == 0)
      weight_matrix[ty][tx] =
          weight_matrix[ty][tx] + weight_matrix[ty + power_two / 2][tx];

    __syncthreads();
  }

  input_hidden_cuda[index] = weight_matrix[ty][tx];

  __syncthreads();

  /* With tx == 0 this reads row 0, column ty of the reduced tile. */
  if (tx == 0) {
    hidden_partial_sum[by * hid + ty] = weight_matrix[tx][ty];
  }
}
|
||||
|
||||
/*
 * Weight-adjustment kernel: each thread updates one entry of the flattened
 * weight array `w` and records the applied change in `oldw`.
 *
 * delta  delta values, read at tx + 1.
 * hid    sets the row stride (hid + 1) of w / oldw.
 * ly     layer unit values, read at HEIGHT * by + ty + 1.
 * in     not accessed by this kernel.
 * w      weights, updated in place.
 * oldw   previous weight changes, updated in place.
 *
 * Threads with ty == 0 in the block with by == 0 additionally update the
 * entries at tx + 1 (row 0 of the flattened arrays), where no ly factor
 * is applied.
 */
__global__ void bpnn_adjust_weights_cuda(float *delta, int hid, float *ly,
                                         int in, float *w, float *oldw) {
  const int block_row = blockIdx.y;
  const int col = threadIdx.x;
  const int row = threadIdx.y;

  const int weight_idx = (hid + 1) * HEIGHT * block_row + (hid + 1) * row +
                         col + 1 + (hid + 1);
  const int unit_idx = HEIGHT * block_row + row + 1;
  const int delta_idx = col + 1;

  w[weight_idx] += ((ETA * delta[delta_idx] * ly[unit_idx]) +
                    (MOMENTUM * oldw[weight_idx]));
  oldw[weight_idx] = ((ETA * delta[delta_idx] * ly[unit_idx]) +
                      (MOMENTUM * oldw[weight_idx]));

  __syncthreads();

  /* Only the first thread row of the first block handles row 0. */
  if (row != 0 || block_row != 0) {
    return;
  }

  w[delta_idx] += ((ETA * delta[delta_idx]) + (MOMENTUM * oldw[delta_idx]));
  oldw[delta_idx] = ((ETA * delta[delta_idx]) + (MOMENTUM * oldw[delta_idx]));
}
|
||||
#endif
|
|
@ -1,48 +0,0 @@
|
|||
#include "backprop.h"
|
||||
#include <math.h>
|
||||
#include <stdio.h>
|
||||
#include <stdlib.h>
|
||||
|
||||
extern char *strcpy();
|
||||
extern void exit();
|
||||
|
||||
int layer_size = 0;
|
||||
|
||||
backprop_face() {
|
||||
BPNN *net;
|
||||
int i;
|
||||
float out_err, hid_err;
|
||||
net = bpnn_create(layer_size, 16, 1); // (16, 1 can not be changed)
|
||||
|
||||
printf("Input layer size : %d\n", layer_size);
|
||||
load(net);
|
||||
// entering the training kernel, only one iteration
|
||||
printf("Starting training kernel\n");
|
||||
bpnn_train_cuda(net, &out_err, &hid_err);
|
||||
bpnn_free(net);
|
||||
printf("Training done\n");
|
||||
}
|
||||
|
||||
int setup(argc, argv)
|
||||
int argc;
|
||||
char *argv[];
|
||||
{
|
||||
|
||||
int seed;
|
||||
|
||||
if (argc != 2) {
|
||||
fprintf(stderr, "usage: backprop <num of input elements>\n");
|
||||
exit(0);
|
||||
}
|
||||
layer_size = atoi(argv[1]);
|
||||
if (layer_size % 16 != 0) {
|
||||
fprintf(stderr, "The number of input points must be divided by 16\n");
|
||||
exit(0);
|
||||
}
|
||||
|
||||
seed = 7;
|
||||
bpnn_initialize(seed);
|
||||
backprop_face();
|
||||
|
||||
exit(0);
|
||||
}
|
|
@ -1,22 +0,0 @@
|
|||
#include "backprop.h"
|
||||
#include <stdio.h>
|
||||
#include <stdlib.h>
|
||||
|
||||
extern layer_size;
|
||||
|
||||
/*
 * Fill the network's input units with pseudo-random values.
 *
 * Writes (float)rand() / RAND_MAX into net->input_units[1] through
 * net->input_units[layer_size]; element 0 is left untouched.
 *
 * Returns 0.
 */
int load(BPNN *net) {
  float *units = net->input_units;
  int i;

  for (i = 0; i < layer_size; i++) {
    units[i + 1] = (float)rand() / RAND_MAX;
  }
  return 0;
}
|
|
@ -1,28 +0,0 @@
|
|||
#!/bin/bash
# Build the backprop demo from its C sources and pre-generated CUDA LLVM IR,
# run it with 1024 input elements and check the output for a known result.
set -e

# Compile the plain C sources to LLVM bitcode.
for src in backprop facetrain imagenet; do
  clang -c -emit-llvm "${src}.c"
done

# Assemble the device and host IR, then run the project's translators.
llvm-as backprop_cuda-cuda-nvptx64-nvidia-cuda-sm_61.ll
llvm-as backprop_cuda-host-x86_64-unknown-linux-gnu.ll
../../build/compilation/kernelTranslator backprop_cuda-cuda-nvptx64-nvidia-cuda-sm_61.bc kernel.bc
../../build/compilation/hostTranslator backprop_cuda-host-x86_64-unknown-linux-gnu.bc host.bc

# Lower every bitcode module to a position-independent object file.
for module in kernel host backprop facetrain imagenet; do
  llc --relocation-model=pic --filetype=obj "${module}.bc"
done

# Append the previous value only when it is non-empty, so an unset variable
# does not leave a trailing empty entry in the search path.
export LD_LIBRARY_PATH="../../build/runtime:../../build/runtime/threadPool${LD_LIBRARY_PATH:+:$LD_LIBRARY_PATH}"
g++ -Wall -L../../build/runtime -L../../build/runtime/threadPool -o demo \
  -fPIC -no-pie host.o kernel.o backprop.o facetrain.o imagenet.o \
  -lc -lx86Runtime -lthreadPool -lpthread

./demo 1024 > res.log
# -F matches the expected numbers literally; without it each '.' would match
# any character.
if grep -q -F -e "0.173289 0.259645 0.350836" res.log; then
  echo "Pass"
else
  echo "Error result"
  exit 1
fi
|
|
@ -1,307 +0,0 @@
|
|||
; ModuleID = 'bfs-cuda-nvptx64-nvidia-cuda-sm_61.bc'
|
||||
source_filename = "bfs.cu"
|
||||
target datalayout = "e-i64:64-i128:128-v16:16-v32:32-n16:32:64"
|
||||
target triple = "nvptx64-nvidia-cuda"
|
||||
|
||||
%struct.__cuda_builtin_blockIdx_t = type { i8 }
|
||||
%struct.__cuda_builtin_threadIdx_t = type { i8 }
|
||||
%struct.cudaFuncAttributes = type { i64, i64, i64, i32, i32, i32, i32, i32, i32, i32 }
|
||||
%struct.Node = type { i32, i32 }
|
||||
|
||||
$_ZN25__cuda_builtin_blockIdx_t17__fetch_builtin_xEv = comdat any
|
||||
|
||||
$_ZN26__cuda_builtin_threadIdx_t17__fetch_builtin_xEv = comdat any
|
||||
|
||||
@blockIdx = extern_weak dso_local addrspace(1) global %struct.__cuda_builtin_blockIdx_t, align 1
|
||||
@threadIdx = extern_weak dso_local addrspace(1) global %struct.__cuda_builtin_threadIdx_t, align 1
|
||||
|
||||
; Weak-linkage stub for cudaMalloc: it copies both arguments into local stack
; slots and returns the constant 999. No memory is allocated here.
; Function Attrs: convergent noinline nounwind optnone
|
||||
define weak dso_local i32 @cudaMalloc(i8** %p, i64 %s) #0 {
|
||||
entry:
|
||||
%p.addr = alloca i8**, align 8
|
||||
%s.addr = alloca i64, align 8
|
||||
store i8** %p, i8*** %p.addr, align 8
|
||||
store i64 %s, i64* %s.addr, align 8
|
||||
; The stored arguments are never read back; the result is always 999.
ret i32 999
|
||||
}
|
||||
|
||||
; Weak-linkage stub for cudaFuncGetAttributes: it copies both arguments into
; local stack slots and returns the constant 999. %p is never written through.
; Function Attrs: convergent noinline nounwind optnone
|
||||
define weak dso_local i32 @cudaFuncGetAttributes(%struct.cudaFuncAttributes* %p, i8* %c) #0 {
|
||||
entry:
|
||||
%p.addr = alloca %struct.cudaFuncAttributes*, align 8
|
||||
%c.addr = alloca i8*, align 8
|
||||
store %struct.cudaFuncAttributes* %p, %struct.cudaFuncAttributes** %p.addr, align 8
|
||||
store i8* %c, i8** %c.addr, align 8
|
||||
; The stored arguments are never read back; the result is always 999.
ret i32 999
|
||||
}
|
||||
|
||||
; Weak-linkage stub for cudaDeviceGetAttribute: it copies its three arguments
; into local stack slots and returns the constant 999. %value is never written
; through.
; Function Attrs: convergent noinline nounwind optnone
|
||||
define weak dso_local i32 @cudaDeviceGetAttribute(i32* %value, i32 %attr, i32 %device) #0 {
|
||||
entry:
|
||||
%value.addr = alloca i32*, align 8
|
||||
%attr.addr = alloca i32, align 4
|
||||
%device.addr = alloca i32, align 4
|
||||
store i32* %value, i32** %value.addr, align 8
|
||||
store i32 %attr, i32* %attr.addr, align 4
|
||||
store i32 %device, i32* %device.addr, align 4
|
||||
; The stored arguments are never read back; the result is always 999.
ret i32 999
|
||||
}
|
||||
|
||||
; Weak-linkage stub for cudaGetDevice: it copies its pointer argument into a
; local stack slot and returns the constant 999. %device is never written
; through.
; Function Attrs: convergent noinline nounwind optnone
|
||||
define weak dso_local i32 @cudaGetDevice(i32* %device) #0 {
|
||||
entry:
|
||||
%device.addr = alloca i32*, align 8
|
||||
store i32* %device, i32** %device.addr, align 8
|
||||
; The stored argument is never read back; the result is always 999.
ret i32 999
|
||||
}
|
||||
|
||||
; Weak-linkage stub for cudaOccupancyMaxActiveBlocksPerMultiprocessor: it
; copies its four arguments into local stack slots and returns the constant
; 999. %numBlocks is never written through.
; Function Attrs: convergent noinline nounwind optnone
|
||||
define weak dso_local i32 @cudaOccupancyMaxActiveBlocksPerMultiprocessor(i32* %numBlocks, i8* %func, i32 %blockSize, i64 %dynamicSmemSize) #0 {
|
||||
entry:
|
||||
%numBlocks.addr = alloca i32*, align 8
|
||||
%func.addr = alloca i8*, align 8
|
||||
%blockSize.addr = alloca i32, align 4
|
||||
%dynamicSmemSize.addr = alloca i64, align 8
|
||||
store i32* %numBlocks, i32** %numBlocks.addr, align 8
|
||||
store i8* %func, i8** %func.addr, align 8
|
||||
store i32 %blockSize, i32* %blockSize.addr, align 4
|
||||
store i64 %dynamicSmemSize, i64* %dynamicSmemSize.addr, align 8
|
||||
; The stored arguments are never read back; the result is always 999.
ret i32 999
|
||||
}
|
||||
|
||||
; Weak-linkage stub for cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags:
; it copies its five arguments into local stack slots and returns the constant
; 999. %numBlocks is never written through.
; Function Attrs: convergent noinline nounwind optnone
|
||||
define weak dso_local i32 @cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags(i32* %numBlocks, i8* %func, i32 %blockSize, i64 %dynamicSmemSize, i32 %flags) #0 {
|
||||
entry:
|
||||
%numBlocks.addr = alloca i32*, align 8
|
||||
%func.addr = alloca i8*, align 8
|
||||
%blockSize.addr = alloca i32, align 4
|
||||
%dynamicSmemSize.addr = alloca i64, align 8
|
||||
%flags.addr = alloca i32, align 4
|
||||
store i32* %numBlocks, i32** %numBlocks.addr, align 8
|
||||
store i8* %func, i8** %func.addr, align 8
|
||||
store i32 %blockSize, i32* %blockSize.addr, align 4
|
||||
store i64 %dynamicSmemSize, i64* %dynamicSmemSize.addr, align 8
|
||||
store i32 %flags, i32* %flags.addr, align 4
|
||||
; The stored arguments are never read back; the result is always 999.
ret i32 999
|
||||
}
|
||||
|
||||
; Unoptimized IR for the first BFS kernel (registered as a kernel in
; !nvvm.annotations). Per thread:
;   tid = ctaid.x * 512 + tid.x
;   if (tid < no_of_nodes && g_graph_mask[tid]) {
;     g_graph_mask[tid] = 0;
;     for (i = nodes[tid].starting; i < nodes[tid].no_of_edges + nodes[tid].starting; i++) {
;       id = g_graph_edges[i];
;       if (!g_graph_visited[id]) { g_cost[id] = g_cost[tid] + 1; g_updating_graph_mask[id] = 1; }
;     }
;   }
; The mask/visited arrays are i8 elements that are truncated to i1 when tested.
; Function Attrs: convergent noinline nounwind optnone
|
||||
define dso_local void @_Z6KernelP4NodePiPbS2_S2_S1_i(%struct.Node* %g_graph_nodes, i32* %g_graph_edges, i8* %g_graph_mask, i8* %g_updating_graph_mask, i8* %g_graph_visited, i32* %g_cost, i32 %no_of_nodes) #0 {
|
||||
; entry: spill every parameter to a stack slot, compute tid, test tid < no_of_nodes.
entry:
|
||||
%g_graph_nodes.addr = alloca %struct.Node*, align 8
|
||||
%g_graph_edges.addr = alloca i32*, align 8
|
||||
%g_graph_mask.addr = alloca i8*, align 8
|
||||
%g_updating_graph_mask.addr = alloca i8*, align 8
|
||||
%g_graph_visited.addr = alloca i8*, align 8
|
||||
%g_cost.addr = alloca i32*, align 8
|
||||
%no_of_nodes.addr = alloca i32, align 4
|
||||
%tid = alloca i32, align 4
|
||||
%i = alloca i32, align 4
|
||||
%id = alloca i32, align 4
|
||||
store %struct.Node* %g_graph_nodes, %struct.Node** %g_graph_nodes.addr, align 8
|
||||
store i32* %g_graph_edges, i32** %g_graph_edges.addr, align 8
|
||||
store i8* %g_graph_mask, i8** %g_graph_mask.addr, align 8
|
||||
store i8* %g_updating_graph_mask, i8** %g_updating_graph_mask.addr, align 8
|
||||
store i8* %g_graph_visited, i8** %g_graph_visited.addr, align 8
|
||||
store i32* %g_cost, i32** %g_cost.addr, align 8
|
||||
store i32 %no_of_nodes, i32* %no_of_nodes.addr, align 4
|
||||
; tid = blockIdx.x * 512 + threadIdx.x (the multiply/add carry no nsw/nuw flags).
%call = call i32 @_ZN25__cuda_builtin_blockIdx_t17__fetch_builtin_xEv() #3
|
||||
%mul = mul i32 %call, 512
|
||||
%call1 = call i32 @_ZN26__cuda_builtin_threadIdx_t17__fetch_builtin_xEv() #3
|
||||
%add = add i32 %mul, %call1
|
||||
store i32 %add, i32* %tid, align 4
|
||||
%0 = load i32, i32* %tid, align 4
|
||||
%1 = load i32, i32* %no_of_nodes.addr, align 4
|
||||
; Signed comparison: threads with tid >= no_of_nodes jump straight to the exit.
%cmp = icmp slt i32 %0, %1
|
||||
br i1 %cmp, label %land.lhs.true, label %if.end26
|
||||
|
||||
; Second half of the && condition: load g_graph_mask[tid] and test it.
land.lhs.true: ; preds = %entry
|
||||
%2 = load i8*, i8** %g_graph_mask.addr, align 8
|
||||
%3 = load i32, i32* %tid, align 4
|
||||
%idxprom = sext i32 %3 to i64
|
||||
%arrayidx = getelementptr inbounds i8, i8* %2, i64 %idxprom
|
||||
%4 = load i8, i8* %arrayidx, align 1
|
||||
%tobool = trunc i8 %4 to i1
|
||||
br i1 %tobool, label %if.then, label %if.end26
|
||||
|
||||
; Clear g_graph_mask[tid] and initialise the loop counter i = nodes[tid].starting.
if.then: ; preds = %land.lhs.true
|
||||
%5 = load i8*, i8** %g_graph_mask.addr, align 8
|
||||
%6 = load i32, i32* %tid, align 4
|
||||
%idxprom2 = sext i32 %6 to i64
|
||||
%arrayidx3 = getelementptr inbounds i8, i8* %5, i64 %idxprom2
|
||||
store i8 0, i8* %arrayidx3, align 1
|
||||
%7 = load %struct.Node*, %struct.Node** %g_graph_nodes.addr, align 8
|
||||
%8 = load i32, i32* %tid, align 4
|
||||
%idxprom4 = sext i32 %8 to i64
|
||||
%arrayidx5 = getelementptr inbounds %struct.Node, %struct.Node* %7, i64 %idxprom4
|
||||
%starting = getelementptr inbounds %struct.Node, %struct.Node* %arrayidx5, i32 0, i32 0
|
||||
%9 = load i32, i32* %starting, align 4
|
||||
store i32 %9, i32* %i, align 4
|
||||
br label %for.cond
|
||||
|
||||
; Loop test: i < nodes[tid].no_of_edges + nodes[tid].starting. Both node fields
; are reloaded from memory on every iteration.
for.cond: ; preds = %for.inc, %if.then
|
||||
%10 = load i32, i32* %i, align 4
|
||||
%11 = load %struct.Node*, %struct.Node** %g_graph_nodes.addr, align 8
|
||||
%12 = load i32, i32* %tid, align 4
|
||||
%idxprom6 = sext i32 %12 to i64
|
||||
%arrayidx7 = getelementptr inbounds %struct.Node, %struct.Node* %11, i64 %idxprom6
|
||||
%no_of_edges = getelementptr inbounds %struct.Node, %struct.Node* %arrayidx7, i32 0, i32 1
|
||||
%13 = load i32, i32* %no_of_edges, align 4
|
||||
%14 = load %struct.Node*, %struct.Node** %g_graph_nodes.addr, align 8
|
||||
%15 = load i32, i32* %tid, align 4
|
||||
%idxprom8 = sext i32 %15 to i64
|
||||
%arrayidx9 = getelementptr inbounds %struct.Node, %struct.Node* %14, i64 %idxprom8
|
||||
%starting10 = getelementptr inbounds %struct.Node, %struct.Node* %arrayidx9, i32 0, i32 0
|
||||
%16 = load i32, i32* %starting10, align 4
|
||||
%add11 = add nsw i32 %13, %16
|
||||
%cmp12 = icmp slt i32 %10, %add11
|
||||
br i1 %cmp12, label %for.body, label %for.end
|
||||
|
||||
; Loop body: id = g_graph_edges[i]; skip the update when g_graph_visited[id] is set.
for.body: ; preds = %for.cond
|
||||
%17 = load i32*, i32** %g_graph_edges.addr, align 8
|
||||
%18 = load i32, i32* %i, align 4
|
||||
%idxprom13 = sext i32 %18 to i64
|
||||
%arrayidx14 = getelementptr inbounds i32, i32* %17, i64 %idxprom13
|
||||
%19 = load i32, i32* %arrayidx14, align 4
|
||||
store i32 %19, i32* %id, align 4
|
||||
%20 = load i8*, i8** %g_graph_visited.addr, align 8
|
||||
%21 = load i32, i32* %id, align 4
|
||||
%idxprom15 = sext i32 %21 to i64
|
||||
%arrayidx16 = getelementptr inbounds i8, i8* %20, i64 %idxprom15
|
||||
%22 = load i8, i8* %arrayidx16, align 1
|
||||
%tobool17 = trunc i8 %22 to i1
|
||||
br i1 %tobool17, label %if.end, label %if.then18
|
||||
|
||||
; Unvisited neighbour: g_cost[id] = g_cost[tid] + 1 and g_updating_graph_mask[id] = 1.
if.then18: ; preds = %for.body
|
||||
%23 = load i32*, i32** %g_cost.addr, align 8
|
||||
%24 = load i32, i32* %tid, align 4
|
||||
%idxprom19 = sext i32 %24 to i64
|
||||
%arrayidx20 = getelementptr inbounds i32, i32* %23, i64 %idxprom19
|
||||
%25 = load i32, i32* %arrayidx20, align 4
|
||||
%add21 = add nsw i32 %25, 1
|
||||
%26 = load i32*, i32** %g_cost.addr, align 8
|
||||
%27 = load i32, i32* %id, align 4
|
||||
%idxprom22 = sext i32 %27 to i64
|
||||
%arrayidx23 = getelementptr inbounds i32, i32* %26, i64 %idxprom22
|
||||
store i32 %add21, i32* %arrayidx23, align 4
|
||||
%28 = load i8*, i8** %g_updating_graph_mask.addr, align 8
|
||||
%29 = load i32, i32* %id, align 4
|
||||
%idxprom24 = sext i32 %29 to i64
|
||||
%arrayidx25 = getelementptr inbounds i8, i8* %28, i64 %idxprom24
|
||||
store i8 1, i8* %arrayidx25, align 1
|
||||
br label %if.end
|
||||
|
||||
if.end: ; preds = %if.then18, %for.body
|
||||
br label %for.inc
|
||||
|
||||
; i++ and back to the loop test.
for.inc: ; preds = %if.end
|
||||
%30 = load i32, i32* %i, align 4
|
||||
%inc = add nsw i32 %30, 1
|
||||
store i32 %inc, i32* %i, align 4
|
||||
br label %for.cond
|
||||
|
||||
for.end: ; preds = %for.cond
|
||||
br label %if.end26
|
||||
|
||||
; Common exit for out-of-range threads, unmasked nodes and finished loops.
if.end26: ; preds = %for.end, %land.lhs.true, %entry
|
||||
ret void
|
||||
}
|
||||
|
||||
; Accessor behind blockIdx.x: returns the value of the
; llvm.nvvm.read.ptx.sreg.ctaid.x intrinsic.
; Function Attrs: alwaysinline convergent nounwind
|
||||
define linkonce_odr dso_local i32 @_ZN25__cuda_builtin_blockIdx_t17__fetch_builtin_xEv() #1 comdat align 2 {
|
||||
entry:
|
||||
%0 = call i32 @llvm.nvvm.read.ptx.sreg.ctaid.x()
|
||||
ret i32 %0
|
||||
}
|
||||
|
||||
; Accessor behind threadIdx.x: returns the value of the
; llvm.nvvm.read.ptx.sreg.tid.x intrinsic.
; Function Attrs: alwaysinline convergent nounwind
|
||||
define linkonce_odr dso_local i32 @_ZN26__cuda_builtin_threadIdx_t17__fetch_builtin_xEv() #1 comdat align 2 {
|
||||
entry:
|
||||
%0 = call i32 @llvm.nvvm.read.ptx.sreg.tid.x()
|
||||
ret i32 %0
|
||||
}
|
||||
|
||||
; Unoptimized IR for the second BFS kernel (registered as a kernel in
; !nvvm.annotations). Per thread:
;   tid = ctaid.x * 512 + tid.x
;   if (tid < no_of_nodes && g_updating_graph_mask[tid]) {
;     g_graph_mask[tid] = 1; g_graph_visited[tid] = 1;
;     *g_over = 1; g_updating_graph_mask[tid] = 0;
;   }
; Function Attrs: convergent noinline nounwind optnone
|
||||
define dso_local void @_Z7Kernel2PbS_S_S_i(i8* %g_graph_mask, i8* %g_updating_graph_mask, i8* %g_graph_visited, i8* %g_over, i32 %no_of_nodes) #0 {
|
||||
; entry: spill every parameter to a stack slot, compute tid, test tid < no_of_nodes.
entry:
|
||||
%g_graph_mask.addr = alloca i8*, align 8
|
||||
%g_updating_graph_mask.addr = alloca i8*, align 8
|
||||
%g_graph_visited.addr = alloca i8*, align 8
|
||||
%g_over.addr = alloca i8*, align 8
|
||||
%no_of_nodes.addr = alloca i32, align 4
|
||||
%tid = alloca i32, align 4
|
||||
store i8* %g_graph_mask, i8** %g_graph_mask.addr, align 8
|
||||
store i8* %g_updating_graph_mask, i8** %g_updating_graph_mask.addr, align 8
|
||||
store i8* %g_graph_visited, i8** %g_graph_visited.addr, align 8
|
||||
store i8* %g_over, i8** %g_over.addr, align 8
|
||||
store i32 %no_of_nodes, i32* %no_of_nodes.addr, align 4
|
||||
; tid = blockIdx.x * 512 + threadIdx.x
%call = call i32 @_ZN25__cuda_builtin_blockIdx_t17__fetch_builtin_xEv() #3
|
||||
%mul = mul i32 %call, 512
|
||||
%call1 = call i32 @_ZN26__cuda_builtin_threadIdx_t17__fetch_builtin_xEv() #3
|
||||
%add = add i32 %mul, %call1
|
||||
store i32 %add, i32* %tid, align 4
|
||||
%0 = load i32, i32* %tid, align 4
|
||||
%1 = load i32, i32* %no_of_nodes.addr, align 4
|
||||
%cmp = icmp slt i32 %0, %1
|
||||
br i1 %cmp, label %land.lhs.true, label %if.end
|
||||
|
||||
; Second half of the && condition: load g_updating_graph_mask[tid] and test it.
land.lhs.true: ; preds = %entry
|
||||
%2 = load i8*, i8** %g_updating_graph_mask.addr, align 8
|
||||
%3 = load i32, i32* %tid, align 4
|
||||
%idxprom = sext i32 %3 to i64
|
||||
%arrayidx = getelementptr inbounds i8, i8* %2, i64 %idxprom
|
||||
%4 = load i8, i8* %arrayidx, align 1
|
||||
%tobool = trunc i8 %4 to i1
|
||||
br i1 %tobool, label %if.then, label %if.end
|
||||
|
||||
; Set g_graph_mask[tid] and g_graph_visited[tid], raise *g_over, then clear
; g_updating_graph_mask[tid].
if.then: ; preds = %land.lhs.true
|
||||
%5 = load i8*, i8** %g_graph_mask.addr, align 8
|
||||
%6 = load i32, i32* %tid, align 4
|
||||
%idxprom2 = sext i32 %6 to i64
|
||||
%arrayidx3 = getelementptr inbounds i8, i8* %5, i64 %idxprom2
|
||||
store i8 1, i8* %arrayidx3, align 1
|
||||
%7 = load i8*, i8** %g_graph_visited.addr, align 8
|
||||
%8 = load i32, i32* %tid, align 4
|
||||
%idxprom4 = sext i32 %8 to i64
|
||||
%arrayidx5 = getelementptr inbounds i8, i8* %7, i64 %idxprom4
|
||||
store i8 1, i8* %arrayidx5, align 1
|
||||
%9 = load i8*, i8** %g_over.addr, align 8
|
||||
store i8 1, i8* %9, align 1
|
||||
%10 = load i8*, i8** %g_updating_graph_mask.addr, align 8
|
||||
%11 = load i32, i32* %tid, align 4
|
||||
%idxprom6 = sext i32 %11 to i64
|
||||
%arrayidx7 = getelementptr inbounds i8, i8* %10, i64 %idxprom6
|
||||
store i8 0, i8* %arrayidx7, align 1
|
||||
br label %if.end
|
||||
|
||||
; Common exit for every path.
if.end: ; preds = %if.then, %land.lhs.true, %entry
|
||||
ret void
|
||||
}
|
||||
|
||||
; Function Attrs: nounwind readnone
|
||||
declare i32 @llvm.nvvm.read.ptx.sreg.ctaid.x() #2
|
||||
|
||||
; Function Attrs: nounwind readnone
|
||||
declare i32 @llvm.nvvm.read.ptx.sreg.tid.x() #2
|
||||
|
||||
attributes #0 = { convergent noinline nounwind optnone "correctly-rounded-divide-sqrt-fp-math"="false" "disable-tail-calls"="false" "frame-pointer"="all" "less-precise-fpmad"="false" "min-legal-vector-width"="0" "no-infs-fp-math"="false" "no-jump-tables"="false" "no-nans-fp-math"="false" "no-signed-zeros-fp-math"="false" "no-trapping-math"="false" "stack-protector-buffer-size"="8" "target-cpu"="sm_61" "target-features"="+ptx64,+sm_61" "unsafe-fp-math"="false" "use-soft-float"="false" }
|
||||
attributes #1 = { alwaysinline convergent nounwind "correctly-rounded-divide-sqrt-fp-math"="false" "disable-tail-calls"="false" "frame-pointer"="all" "less-precise-fpmad"="false" "min-legal-vector-width"="0" "no-infs-fp-math"="false" "no-jump-tables"="false" "no-nans-fp-math"="false" "no-signed-zeros-fp-math"="false" "no-trapping-math"="false" "stack-protector-buffer-size"="8" "target-cpu"="sm_61" "target-features"="+ptx64,+sm_61" "unsafe-fp-math"="false" "use-soft-float"="false" }
|
||||
attributes #2 = { nounwind readnone }
|
||||
attributes #3 = { convergent nounwind }
|
||||
|
||||
!llvm.module.flags = !{!0, !1, !2}
|
||||
!nvvm.annotations = !{!3, !4, !5, !6, !5, !7, !7, !7, !7, !8, !8, !7}
|
||||
!llvm.ident = !{!9}
|
||||
!nvvmir.version = !{!10}
|
||||
|
||||
!0 = !{i32 2, !"SDK Version", [2 x i32] [i32 10, i32 1]}
|
||||
!1 = !{i32 1, !"wchar_size", i32 4}
|
||||
!2 = !{i32 4, !"nvvm-reflect-ftz", i32 0}
|
||||
!3 = !{void (%struct.Node*, i32*, i8*, i8*, i8*, i32*, i32)* @_Z6KernelP4NodePiPbS2_S2_S1_i, !"kernel", i32 1}
|
||||
!4 = !{void (i8*, i8*, i8*, i8*, i32)* @_Z7Kernel2PbS_S_S_i, !"kernel", i32 1}
|
||||
!5 = !{null, !"align", i32 8}
|
||||
!6 = !{null, !"align", i32 8, !"align", i32 65544, !"align", i32 131080}
|
||||
!7 = !{null, !"align", i32 16}
|
||||
!8 = !{null, !"align", i32 16, !"align", i32 65552, !"align", i32 131088}
|
||||
!9 = !{!"clang version 10.0.1 (https://github.com/llvm/llvm-project.git ef32c611aa214dea855364efd7ba451ec5ec3f74)"}
|
||||
!10 = !{i32 1, i32 4}
|
File diff suppressed because one or more lines are too long
|
@ -1,213 +0,0 @@
|
|||
#include <cuda.h>
|
||||
#include <math.h>
|
||||
#include <stdio.h>
|
||||
#include <stdlib.h>
|
||||
#include <string.h>
|
||||
|
||||
#define MAX_THREADS_PER_BLOCK 512
|
||||
|
||||
int no_of_nodes;
|
||||
int edge_list_size;
|
||||
FILE *fp;
|
||||
|
||||
// Per-node record of the graph: a window into the flat edge array.
struct Node {
  // Index into the edge array of this node's first edge.
  int starting;
  // Number of consecutive edge-array entries that belong to this node.
  int no_of_edges;
};
|
||||
|
||||
#include "kernel.cu"
|
||||
#include "kernel2.cu"
|
||||
|
||||
void BFSGraph(int argc, char **argv);
|
||||
|
||||
////////////////////////////////////////////////////////////////////////////////
|
||||
// Main Program
|
||||
////////////////////////////////////////////////////////////////////////////////
|
||||
// Entry point: selects CUDA device 0, zeroes the global graph sizes and hands
// the command line to BFSGraph.
int main(int argc, char **argv) {
  cudaSetDevice(0);

  // Both counters are filled in by BFSGraph while it parses the input file.
  edge_list_size = 0;
  no_of_nodes = 0;

  BFSGraph(argc, argv);
  return 0;
}
|
||||
|
||||
// Writes the one-line usage message to stderr, using argv[0] as the program
// name. argc is accepted for symmetry with main but is not consulted.
void Usage(int argc, char **argv) {
  const char *program_name = argv[0];
  fprintf(stderr, "Usage: %s <input_file>\n", program_name);
}
|
||||
////////////////////////////////////////////////////////////////////////////////
|
||||
// Apply BFS on a Graph using CUDA
|
||||
////////////////////////////////////////////////////////////////////////////////
|
||||
// Reports a fatal input or host-memory problem on stderr and terminates the
// process with status 1.
static void BFSFatal(const char *message) {
  fprintf(stderr, "Error: %s\n", message);
  exit(1);
}

// Reads a graph from the file named by argv[1], runs a breadth-first search
// from node 0 on the device and writes one "<node>) cost:<cost>" line per node
// to result.txt.
//
// Input layout, as consumed by the fscanf calls below:
//   <no_of_nodes>
//   no_of_nodes pairs     "<starting> <no_of_edges>"
//   <source>              (parsed, then overridden with 0)
//   <edge_list_size>
//   edge_list_size pairs  "<destination id> <cost>" (cost is parsed, not kept)
//
// Parameters: argc/argv are main's arguments; exactly one argument (the input
// path) is expected.
//
// Termination behaviour:
//   - wrong argument count: prints the usage text and exits with status 0;
//   - input file cannot be opened: prints a message and returns;
//   - malformed or inconsistent input, or a failed host allocation: prints a
//     message to stderr and exits with status 1.
void BFSGraph(int argc, char **argv) {
  if (argc != 2) {
    Usage(argc, argv);
    exit(0);
  }

  char *input_f = argv[1];
  printf("Reading File\n");
  // Read in the graph from a file.
  fp = fopen(input_f, "r");
  if (!fp) {
    printf("Error Reading graph file\n");
    return;
  }

  int source = 0;

  // A non-positive node count would make the source-node writes below land
  // outside the allocations, so reject it up front.
  if (fscanf(fp, "%d", &no_of_nodes) != 1 || no_of_nodes <= 0)
    BFSFatal("missing or non-positive node count in graph file");

  int num_of_blocks = 1;
  int num_of_threads_per_block = no_of_nodes;

  // Make execution parameters according to the number of nodes;
  // distribute threads across multiple blocks if necessary.
  if (no_of_nodes > MAX_THREADS_PER_BLOCK) {
    num_of_blocks = (int)ceil(no_of_nodes / (double)MAX_THREADS_PER_BLOCK);
    num_of_threads_per_block = MAX_THREADS_PER_BLOCK;
  }

  // Allocate host memory.
  Node *h_graph_nodes = (Node *)malloc(sizeof(Node) * no_of_nodes);
  bool *h_graph_mask = (bool *)malloc(sizeof(bool) * no_of_nodes);
  bool *h_updating_graph_mask = (bool *)malloc(sizeof(bool) * no_of_nodes);
  bool *h_graph_visited = (bool *)malloc(sizeof(bool) * no_of_nodes);
  if (!h_graph_nodes || !h_graph_mask || !h_updating_graph_mask ||
      !h_graph_visited)
    BFSFatal("out of host memory for the node arrays");

  // Initialize the per-node arrays from the node list.
  for (int i = 0; i < no_of_nodes; i++) {
    int start, edgeno;
    if (fscanf(fp, "%d %d", &start, &edgeno) != 2)
      BFSFatal("truncated or malformed node list in graph file");
    h_graph_nodes[i].starting = start;
    h_graph_nodes[i].no_of_edges = edgeno;
    h_graph_mask[i] = false;
    h_updating_graph_mask[i] = false;
    h_graph_visited[i] = false;
  }

  // The file carries a source node, but the traversal always starts at node 0.
  if (fscanf(fp, "%d", &source) != 1)
    BFSFatal("missing source node in graph file");
  source = 0;

  // Set the source node as true in the mask.
  h_graph_mask[source] = true;
  h_graph_visited[source] = true;

  if (fscanf(fp, "%d", &edge_list_size) != 1 || edge_list_size < 0)
    BFSFatal("missing or negative edge count in graph file");

  int *h_graph_edges = (int *)malloc(sizeof(int) * edge_list_size);
  // malloc(0) may legitimately return NULL, so only a non-empty list is fatal.
  if (!h_graph_edges && edge_list_size > 0)
    BFSFatal("out of host memory for the edge list");

  for (int i = 0; i < edge_list_size; i++) {
    int id, cost;
    if (fscanf(fp, "%d %d", &id, &cost) != 2)
      BFSFatal("truncated or malformed edge list in graph file");
    // The kernels index the visited/cost/mask arrays with this id.
    if (id < 0 || id >= no_of_nodes)
      BFSFatal("edge destination is not a valid node id");
    h_graph_edges[i] = id;
  }

  // The first kernel walks edges [starting, starting + no_of_edges) for each
  // node, so every such window must lie inside the edge list.
  for (int i = 0; i < no_of_nodes; i++) {
    const int start = h_graph_nodes[i].starting;
    const int count = h_graph_nodes[i].no_of_edges;
    if (start < 0 || count < 0 || count > edge_list_size ||
        start > edge_list_size - count)
      BFSFatal("node edge range lies outside the edge list");
  }

  fclose(fp);

  printf("Read File\n");

  // Copy the node list to device memory.
  Node *d_graph_nodes;
  cudaMalloc((void **)&d_graph_nodes, sizeof(Node) * no_of_nodes);
  cudaMemcpy(d_graph_nodes, h_graph_nodes, sizeof(Node) * no_of_nodes,
             cudaMemcpyHostToDevice);

  // Copy the edge list to device memory.
  int *d_graph_edges;
  cudaMalloc((void **)&d_graph_edges, sizeof(int) * edge_list_size);
  cudaMemcpy(d_graph_edges, h_graph_edges, sizeof(int) * edge_list_size,
             cudaMemcpyHostToDevice);

  // Copy the mask to device memory.
  bool *d_graph_mask;
  cudaMalloc((void **)&d_graph_mask, sizeof(bool) * no_of_nodes);
  cudaMemcpy(d_graph_mask, h_graph_mask, sizeof(bool) * no_of_nodes,
             cudaMemcpyHostToDevice);

  bool *d_updating_graph_mask;
  cudaMalloc((void **)&d_updating_graph_mask, sizeof(bool) * no_of_nodes);
  cudaMemcpy(d_updating_graph_mask, h_updating_graph_mask,
             sizeof(bool) * no_of_nodes, cudaMemcpyHostToDevice);

  // Copy the visited-nodes array to device memory.
  bool *d_graph_visited;
  cudaMalloc((void **)&d_graph_visited, sizeof(bool) * no_of_nodes);
  cudaMemcpy(d_graph_visited, h_graph_visited, sizeof(bool) * no_of_nodes,
             cudaMemcpyHostToDevice);

  // Allocate memory for the result on the host side: -1 everywhere except the
  // source, which starts at cost 0.
  int *h_cost = (int *)malloc(sizeof(int) * no_of_nodes);
  if (!h_cost)
    BFSFatal("out of host memory for the cost array");
  for (int i = 0; i < no_of_nodes; i++)
    h_cost[i] = -1;
  h_cost[source] = 0;

  // Allocate device memory for the result.
  int *d_cost;
  cudaMalloc((void **)&d_cost, sizeof(int) * no_of_nodes);
  cudaMemcpy(d_cost, h_cost, sizeof(int) * no_of_nodes, cudaMemcpyHostToDevice);

  // Device-side flag that Kernel2 raises whenever it updates a node.
  bool *d_over;
  cudaMalloc((void **)&d_over, sizeof(bool));

  printf("Copied Everything to GPU memory\n");

  // Set up execution parameters.
  dim3 grid(num_of_blocks, 1, 1);
  dim3 threads(num_of_threads_per_block, 1, 1);

  int k = 0;
  printf("Start traversing the tree\n");
  bool stop;
  // Launch both kernels repeatedly until a pass leaves the flag untouched.
  do {
    // If no thread changes this value then the loop stops.
    stop = false;
    cudaMemcpy(d_over, &stop, sizeof(bool), cudaMemcpyHostToDevice);

    Kernel<<<grid, threads, 0>>>(d_graph_nodes, d_graph_edges, d_graph_mask,
                                 d_updating_graph_mask, d_graph_visited, d_cost,
                                 no_of_nodes);
    cudaDeviceSynchronize();

    Kernel2<<<grid, threads, 0>>>(d_graph_mask, d_updating_graph_mask,
                                  d_graph_visited, d_over, no_of_nodes);
    cudaDeviceSynchronize();

    cudaMemcpy(&stop, d_over, sizeof(bool), cudaMemcpyDeviceToHost);

    k++;
  } while (stop);

  printf("Kernel Executed %d times\n", k);

  // Copy the result from device to host.
  cudaMemcpy(h_cost, d_cost, sizeof(int) * no_of_nodes, cudaMemcpyDeviceToHost);

  // Store the result into a file; a failure to open it is reported but the
  // cleanup below still runs.
  FILE *fpo = fopen("result.txt", "w");
  if (fpo) {
    for (int i = 0; i < no_of_nodes; i++)
      fprintf(fpo, "%d) cost:%d\n", i, h_cost[i]);
    fclose(fpo);
    printf("Result stored in result.txt\n");
  } else {
    fprintf(stderr, "Error: could not open result.txt for writing\n");
  }

  // Clean up host memory.
  free(h_graph_nodes);
  free(h_graph_edges);
  free(h_graph_mask);
  free(h_updating_graph_mask);
  free(h_graph_visited);
  free(h_cost);

  // Clean up device memory, including the termination flag.
  cudaFree(d_graph_nodes);
  cudaFree(d_graph_edges);
  cudaFree(d_graph_mask);
  cudaFree(d_updating_graph_mask);
  cudaFree(d_graph_visited);
  cudaFree(d_cost);
  cudaFree(d_over);
}
|
|
@ -1,23 +0,0 @@
|
|||
#ifndef _KERNEL_H_
|
||||
#define _KERNEL_H_
|
||||
|
||||
// First BFS kernel. Thread tid = blockIdx.x * MAX_THREADS_PER_BLOCK +
// threadIdx.x handles node tid: if that node is flagged in g_graph_mask, the
// flag is cleared and every not-yet-visited neighbour gets
// cost = g_cost[tid] + 1 and is flagged in g_updating_graph_mask.
__global__ void
Kernel(Node *g_graph_nodes, int *g_graph_edges, bool *g_graph_mask,
       bool *g_updating_graph_mask, bool *g_graph_visited, int *g_cost,
       int no_of_nodes)
{
  const int tid = blockIdx.x * MAX_THREADS_PER_BLOCK + threadIdx.x;

  // Threads beyond the node count, or whose node is not flagged, have no work.
  if (tid >= no_of_nodes || !g_graph_mask[tid])
    return;

  g_graph_mask[tid] = false;

  // Walk this node's window of the edge array.
  int edge = g_graph_nodes[tid].starting;
  while (edge < (g_graph_nodes[tid].no_of_edges + g_graph_nodes[tid].starting)) {
    const int neighbour = g_graph_edges[edge];
    if (!g_graph_visited[neighbour]) {
      g_cost[neighbour] = g_cost[tid] + 1;
      g_updating_graph_mask[neighbour] = true;
    }
    edge++;
  }
}
|
||||
|
||||
#endif
|
|
@ -1,18 +0,0 @@
|
|||
#ifndef _KERNEL2_H_
|
||||
#define _KERNEL2_H_
|
||||
|
||||
// Second BFS kernel. Thread tid = blockIdx.x * MAX_THREADS_PER_BLOCK +
// threadIdx.x handles node tid: if the node is flagged in
// g_updating_graph_mask it is flagged in g_graph_mask, marked visited, *g_over
// is set to true, and its updating flag is cleared.
__global__ void
Kernel2(bool *g_graph_mask, bool *g_updating_graph_mask, bool *g_graph_visited,
        bool *g_over, int no_of_nodes)
{
  const int tid = blockIdx.x * MAX_THREADS_PER_BLOCK + threadIdx.x;

  // Out-of-range threads never touch the arrays.
  if (tid >= no_of_nodes)
    return;
  if (!g_updating_graph_mask[tid])
    return;

  g_graph_mask[tid] = true;
  g_graph_visited[tid] = true;
  // Any thread that updates a node raises the shared flag.
  *g_over = true;
  g_updating_graph_mask[tid] = false;
}
|
||||
|
||||
#endif
|
|
@ -1,21 +0,0 @@
|
|||
#!/bin/bash
# Builds the BFS example from its pre-generated LLVM IR, runs it on the 65536
# node graph and checks that the source node is reported with cost 0.
# Prints "Pass" on success; prints "Error result" and exits 1 otherwise.
# Any failing build or run step aborts the script (set -e).
set -e

# Assemble the device and host IR into bitcode.
llvm-as bfs-cuda-nvptx64-nvidia-cuda-sm_61.ll
llvm-as bfs-host-x86_64-unknown-linux-gnu.ll

# Run the project's kernel and host translators over the bitcode.
../../build/compilation/kernelTranslator bfs-cuda-nvptx64-nvidia-cuda-sm_61.bc kernel.bc
../../build/compilation/hostTranslator bfs-host-x86_64-unknown-linux-gnu.bc host.bc

# Lower both translated modules to position-independent object files.
llc --relocation-model=pic --filetype=obj kernel.bc
llc --relocation-model=pic --filetype=obj host.bc

g++ -Wall -L../../build/runtime -L../../build/runtime/threadPool \
  -o bfs.out -fPIC -no-pie host.o kernel.o -lc -lx86Runtime -lthreadPool -lpthread

# Append the previous value only when there is one: a trailing ':' would add an
# empty entry, which the dynamic loader treats as the current directory.
export LD_LIBRARY_PATH="../../build/runtime:../../build/runtime/threadPool${LD_LIBRARY_PATH:+:$LD_LIBRARY_PATH}"

# Drop any result from an earlier run so the check below cannot pass on stale
# output (the binary can exit 0 without writing a new file).
rm -f result.txt

./bfs.out ../../rodinia-data/bfs/graph65536.txt

# -x -F: require an exact whole-line match, so "10) cost:0" does not count.
if grep -qxF "0) cost:0" result.txt; then
  echo "Pass"
else
  echo "Error result"
  exit 1
fi
|
|
@ -1,343 +0,0 @@
|
|||
// # ifdef __cplusplus
|
||||
// extern "C" {
|
||||
// # endif
|
||||
|
||||
// #ifndef LIST_H
|
||||
// # define LIST_H
|
||||
|
||||
//===============================================================================================================================================================================================================200
|
||||
// DEFINE/INCLUDE
|
||||
//===============================================================================================================================================================================================================200
|
||||
|
||||
//======================================================================================================================================================150
|
||||
// INCLUDE (for some reason these are not recognized when defined in main
|
||||
// file before this one is included)
|
||||
//======================================================================================================================================================150
|
||||
|
||||
#include <stdbool.h> // (in path known to compiler) needed by true/false, bool
|
||||
#include <stdint.h> // (in path known to compiler) needed by uint32_t
|
||||
#include <stdio.h> // (in path known to compiler) needed by fprintf in the malloc wrapper
|
||||
#include <stdlib.h> // (in path known to compiler) needed by malloc
|
||||
|
||||
//======================================================================================================================================================150
|
||||
// DEFINE
|
||||
//======================================================================================================================================================150
|
||||
|
||||
/* Alias that expands to float wherever the token fp is used after this point. */
#define fp float
|
||||
|
||||
/* Version string of this code. */
#define Version "1.5"
|
||||
|
||||
/* On WINDOWS builds, bool/false/true are provided as plain macros.
 * NOTE(review): <stdbool.h> is already included above and also provides these
 * names, so this branch redefines them - confirm that is intended. */
#ifdef WINDOWS
|
||||
#define bool char
|
||||
#define false 0
|
||||
#define true 1
|
||||
#endif
|
||||
|
||||
/* #define DEFAULT_ORDER 256 */
|
||||
|
||||
/* DEFAULT_ORDER is taken from the first of RD_WG_SIZE_0_0, RD_WG_SIZE_0 and
 * RD_WG_SIZE that is defined, and falls back to 256 when none is. It sizes the
 * indices[] and keys[] arrays of struct knode below (DEFAULT_ORDER + 1 each). */
#ifdef RD_WG_SIZE_0_0
|
||||
#define DEFAULT_ORDER RD_WG_SIZE_0_0
|
||||
#elif defined(RD_WG_SIZE_0)
|
||||
#define DEFAULT_ORDER RD_WG_SIZE_0
|
||||
#elif defined(RD_WG_SIZE)
|
||||
#define DEFAULT_ORDER RD_WG_SIZE
|
||||
#else
|
||||
#define DEFAULT_ORDER 256
|
||||
#endif
|
||||
|
||||
/* #ifdef RD_WG_SIZE_1_0 */
|
||||
/* #define DEFAULT_ORDER_2 RD_WG_SIZE_1_0 */
|
||||
/* #elif defined(RD_WG_SIZE_1) */
|
||||
/* #define DEFAULT_ORDER_2 RD_WG_SIZE_1 */
|
||||
/* #elif defined(RD_WG_SIZE) */
|
||||
/* #define DEFAULT_ORDER_2 RD_WG_SIZE */
|
||||
/* #else */
|
||||
/* #define DEFAULT_ORDER_2 256 */
|
||||
/* #endif */
|
||||
|
||||
/* #define DEFAULT_ORDER 508 */
|
||||
|
||||
/* Checked allocation wrapper.
 *
 * After this definition every malloc(size) in the including file expands to a
 * call of the real malloc (a macro is not re-expanded inside its own
 * replacement) followed by a NULL check. On failure it prints
 * "Allocation failed at <file>:<line>!" to stderr and terminates with
 * exit(-1); otherwise the expression evaluates to the allocated pointer.
 *
 * Implementation notes:
 *  - uses a GNU statement expression ({ ... }), so it needs GCC or Clang;
 *  - needs <stdio.h> (fprintf) and <stdlib.h> (malloc, exit);
 *  - the temporary has a deliberately unusual name so that a caller writing
 *    malloc(_tmp) or similar is not shadowed by the wrapper's own variable.
 */
#define malloc(size)                                                           \
  ({                                                                           \
    void *checked_malloc_result_;                                              \
                                                                               \
    if (!(checked_malloc_result_ = malloc(size))) {                            \
      fprintf(stderr, "Allocation failed at %s:%d!\n", __FILE__, __LINE__);    \
      exit(-1);                                                                \
    }                                                                          \
                                                                               \
    checked_malloc_result_;                                                    \
  })
|
||||
|
||||
//======================================================================================================================================================150
|
||||
// STRUCTURES
|
||||
//======================================================================================================================================================150
|
||||
|
||||
/* Forward declaration of the list element type; only pointers to it are
 * needed by the types declared here. */
typedef struct list_item list_item_t;

/* List header: two item pointers, a length counter and two callbacks. */
typedef struct list_t {
  list_item_t *head;
  list_item_t *tail;
  uint32_t length;
  /* Comparison callback taking a key and the value to compare it with. */
  int32_t (*compare)(const void *key, const void *with);
  /* Callback taking a stored datum pointer; presumably releases it - confirm
   * against the list implementation. */
  void (*datum_delete)(void *);
} list_t;

/* Forward and reverse iterators are both plain pointers to a list item. */
typedef list_item_t *list_iterator_t;
typedef list_item_t *list_reverse_iterator_t;
|
||||
|
||||
/* The record that a key refers to.
 *
 * In a real B+ tree system the record would carry actual data (a database
 * row, a file in an operating system, or some other information). Here it is
 * a single int; users may change the type and content of the value field to
 * suit their needs.
 */
typedef struct record {
  int value;
} record;
|
||||
|
||||
/* A node of the B+ tree, general enough to serve as both a leaf and an
 * internal node. Its core is the keys array and the matching pointers array;
 * how the two relate depends on the node kind.
 *
 * Leaf:
 *   - the index of each key equals the index of its pointer, with at most
 *     order - 1 key/pointer pairs;
 *   - the last pointer refers to the leaf to the right, or is NULL for the
 *     rightmost leaf;
 *   - the number of valid data pointers is always num_keys.
 *
 * Internal node:
 *   - the first pointer refers to lower nodes whose keys are less than the
 *     smallest key in the keys array;
 *   - for indices i starting at 0, the pointer at i + 1 refers to the subtree
 *     whose keys are greater than or equal to keys[i];
 *   - the number of valid pointers is always num_keys + 1.
 *
 * num_keys tracks how many keys are currently valid.
 */
typedef struct node {
  void **pointers;
  int *keys;
  struct node *parent;
  bool is_leaf;
  int num_keys;
  struct node *next; /* Used for queue. */
} node;
|
||||
|
||||
//
|
||||
typedef struct knode {
|
||||
int location;
|
||||
int indices[DEFAULT_ORDER + 1];
|
||||
int keys[DEFAULT_ORDER + 1];
|
||||
bool is_leaf;
|
||||
int num_keys;
|
||||
} knode;
|
||||
|
||||
/* One element of a list: links to the neighbouring items in both directions
 * plus an untyped pointer to the stored datum. */
struct list_item {
  struct list_item *pred;
  struct list_item *next;
  void *datum;
};
|
||||
|
||||
//===============================================================================================================================================================================================================200
|
||||
// PROTOTYPES
|
||||
//===============================================================================================================================================================================================================200
|
||||
|
||||
//======================================================================================================================================================150
|
||||
// Other
|
||||
//======================================================================================================================================================150
|
||||
|
||||
void list_item_init(list_item_t *li, void *datum);
|
||||
|
||||
void list_item_delete(list_item_t *li, void (*datum_delete)(void *datum));
|
||||
|
||||
void list_insert_item_tail(list_t *l, list_item_t *i);
|
||||
|
||||
void list_insert_item_before(list_t *l, list_item_t *next, list_item_t *i);
|
||||
|
||||
void list_insert_item_after(list_t *l, list_item_t *pred, list_item_t *i);
|
||||
|
||||
void list_insert_item_sorted(list_t *l, list_item_t *i);
|
||||
|
||||
//======================================================================================================================================================150
|
||||
// List, item-lookup and iterator operations
|
||||
//======================================================================================================================================================150
|
||||
|
||||
void list_init(list_t *l, int32_t (*compare)(const void *key, const void *with),
|
||||
void (*datum_delete)(void *datum));
|
||||
|
||||
void list_delete(list_t *l);
|
||||
|
||||
void list_reset(list_t *l);
|
||||
|
||||
void list_insert_head(list_t *l, void *v);
|
||||
|
||||
void list_insert_tail(list_t *l, void *v);
|
||||
|
||||
void list_insert_before(list_t *l, list_item_t *next, void *v);
|
||||
|
||||
void list_insert_after(list_t *l, list_item_t *pred, void *v);
|
||||
|
||||
void list_insert_sorted(list_t *l, void *v);
|
||||
|
||||
void list_insert_item_head(list_t *l, list_item_t *i);
|
||||
|
||||
void list_remove_item(list_t *l, list_item_t *i);
|
||||
|
||||
void list_remove_head(list_t *l);
|
||||
|
||||
void list_remove_tail(list_t *l);
|
||||
|
||||
list_item_t *list_find_item(list_t *l, void *datum);
|
||||
|
||||
list_item_t *list_get_head_item(list_t *l);
|
||||
|
||||
list_item_t *list_get_tail_item(list_t *l);
|
||||
|
||||
void *list_find(list_t *l, void *datum);
|
||||
|
||||
void *list_get_head(list_t *l);
|
||||
|
||||
void *list_get_tail(list_t *l);
|
||||
|
||||
uint32_t list_get_length(list_t *l);
|
||||
|
||||
bool list_is_empty(list_t *l);
|
||||
|
||||
bool list_not_empty(list_t *l);
|
||||
|
||||
void list_visit_items(list_t *l, void (*visitor)(void *v));
|
||||
|
||||
void *list_item_get_datum(list_item_t *li);
|
||||
|
||||
void list_iterator_init(list_t *l, list_iterator_t *li);
|
||||
|
||||
void list_iterator_delete(list_iterator_t *li);
|
||||
|
||||
void list_iterator_next(list_iterator_t *li);
|
||||
|
||||
void list_iterator_prev(list_iterator_t *li);
|
||||
|
||||
void *list_iterator_get_datum(list_iterator_t *li);
|
||||
|
||||
bool list_iterator_is_valid(list_iterator_t *li);
|
||||
|
||||
void list_reverse_iterator_init(list_t *l, list_iterator_t *li);
|
||||
|
||||
void list_reverse_iterator_delete(list_iterator_t *li);
|
||||
|
||||
void list_reverse_iterator_next(list_iterator_t *li);
|
||||
|
||||
void list_reverse_iterator_prev(list_iterator_t *li);
|
||||
|
||||
void *list_reverse_iterator_get_datum(list_iterator_t *li);
|
||||
|
||||
bool list_reverse_iterator_is_valid(list_reverse_iterator_t *li);
|
||||
|
||||
//======================================================================================================================================================150
|
||||
// Output and utility
|
||||
//======================================================================================================================================================150
|
||||
|
||||
void *kmalloc(int size);
|
||||
|
||||
long transform_to_cuda(node *n,
|
||||
bool verbose); // returns actual mem used in a long
|
||||
|
||||
void usage_1(void);
|
||||
|
||||
void usage_2(void);
|
||||
|
||||
void enqueue(node *new_node);
|
||||
|
||||
node *dequeue(void);
|
||||
|
||||
int height(node *root);
|
||||
|
||||
int path_to_root(node *root, node *child);
|
||||
|
||||
void print_leaves(node *root);
|
||||
|
||||
void print_tree(node *root);
|
||||
|
||||
node *find_leaf(node *root, int key, bool verbose);
|
||||
|
||||
record *find(node *root, int key, bool verbose);
|
||||
|
||||
int cut(int length);
|
||||
|
||||
//======================================================================================================================================================150
|
||||
// Insertion
|
||||
//======================================================================================================================================================150
|
||||
|
||||
record *make_record(int value);
|
||||
|
||||
node *make_node(void);
|
||||
|
||||
node *make_leaf(void);
|
||||
|
||||
int get_left_index(node *parent, node *left);
|
||||
|
||||
node *insert_into_leaf(node *leaf, int key, record *pointer);
|
||||
|
||||
node *insert_into_leaf_after_splitting(node *root, node *leaf, int key,
|
||||
record *pointer);
|
||||
|
||||
node *insert_into_node(node *root, node *parent, int left_index, int key,
|
||||
node *right);
|
||||
|
||||
node *insert_into_node_after_splitting(node *root, node *parent, int left_index,
|
||||
int key, node *right);
|
||||
|
||||
node *insert_into_parent(node *root, node *left, int key, node *right);
|
||||
|
||||
node *insert_into_new_root(node *left, int key, node *right);
|
||||
|
||||
node *start_new_tree(int key, record *pointer);
|
||||
|
||||
node *insert(node *root, int key, int value);
|
||||
|
||||
//======================================================================================================================================================150
|
||||
// Deletion
|
||||
//======================================================================================================================================================150
|
||||
|
||||
int get_neighbor_index(node *n);
|
||||
|
||||
node *adjust_root(node *root);
|
||||
|
||||
node *coalesce_nodes(node *root, node *n, node *neighbor, int neighbor_index,
|
||||
int k_prime);
|
||||
|
||||
node *redistribute_nodes(node *root, node *n, node *neighbor,
|
||||
int neighbor_index, int k_prime_index, int k_prime);
|
||||
|
||||
node *delete_entry(node *root, node *n, int key, void *pointer);
|
||||
|
||||
node *deleteVal(node *root, int key);
|
||||
|
||||
//===============================================================================================================================================================================================================200
|
||||
// HEADER
|
||||
//===============================================================================================================================================================================================================200
|
||||
|
||||
// int main( int argc,
|
||||
// char *argv []);
|
||||
|
||||
//===============================================================================================================================================================================================================200
|
||||
// END
|
||||
//===============================================================================================================================================================================================================200
|
||||
|
||||
// #endif
|
||||
|
||||
// # ifdef __cplusplus
|
||||
// }
|
||||
// # endif
|
|
@ -1,54 +0,0 @@
|
|||
//========================================================================================================================================================================================================200
|
||||
// findK function
|
||||
//========================================================================================================================================================================================================200
|
||||
|
||||
// findK: point lookup of one key per thread block.
//
// Each block (bid = blockIdx.x) handles the query keysD[bid]; each thread
// (thid = threadIdx.x) inspects slot thid of the block's current node.
// For `height` iterations the thread whose slot satisfies
// keys[thid] <= key < keys[thid+1] records the child index in offsetD[bid];
// thread 0 then publishes it to currKnodeD[bid] for the next iteration.
// After the loop, the thread whose key equals the query copies the matching
// record's value into ansD[bid].value. ansD[bid] is not written when no slot
// matches.
//
// Parameters:
//   height       number of loop iterations (tree levels to descend)
//   knodesD      node array; each node exposes keys[] and indices[]
//   knodes_elem  upper bound (exclusive) accepted for a child index
//   recordsD     record array indexed by the final node's indices[thid]
//   currKnodeD   per-block current node index (read and updated here)
//   offsetD      per-block next node index (read and updated here)
//   keysD        per-block search key
//   ansD         per-block result; only .value is written
//
// Because keys[thid+1] is read, every node's keys[] must hold at least one
// more entry than the number of threads in the block.
__global__ void
|
||||
findK( long height,
|
||||
knode *knodesD,
|
||||
long knodes_elem,
|
||||
record *recordsD,
|
||||
|
||||
long *currKnodeD,
|
||||
long *offsetD,
|
||||
int *keysD,
|
||||
record *ansD)
|
||||
{
|
||||
|
||||
// private thread IDs
|
||||
int thid = threadIdx.x;
|
||||
int bid = blockIdx.x;
|
||||
|
||||
// process tree levels
|
||||
int i;
|
||||
for(i = 0; i < height; i++){
|
||||
|
||||
// if the searched key lies between this slot's key and the next slot's key
|
||||
if((knodesD[currKnodeD[bid]].keys[thid]) <= keysD[bid] && (knodesD[currKnodeD[bid]].keys[thid+1] > keysD[bid])){
|
||||
// this conditional statement is inserted to avoid a crash due to a bug in the original code:
|
||||
// "offset[bid]" calculated below, which addresses knodes[] in the next iteration, can go outside of its bounds and cause a segmentation fault;
|
||||
// more specifically, values saved into knodes->indices in the main function are out of bounds of the knodes array that they address
|
||||
// NOTE(review): the bound check and the assignment index the node through offsetD[bid], while the key test above uses currKnodeD[bid];
|
||||
// presumably both hold the same node index at this point - confirm against the host-side initialisation
|
||||
if(knodesD[offsetD[bid]].indices[thid] < knodes_elem){
|
||||
offsetD[bid] = knodesD[offsetD[bid]].indices[thid];
|
||||
}
|
||||
}
|
||||
// barrier: the offsetD[bid] update must be complete before thread 0 reads it
|
||||
__syncthreads();
|
||||
|
||||
// set for next tree level
|
||||
if(thid==0){
|
||||
currKnodeD[bid] = offsetD[bid];
|
||||
}
|
||||
// barrier: all threads must see the new currKnodeD[bid] before the next iteration
|
||||
__syncthreads();
|
||||
|
||||
}
|
||||
|
||||
// At this point, we have a candidate leaf node which may contain
|
||||
// the target record. Check each key to hopefully find the record;
|
||||
// if no key matches, ansD[bid] keeps whatever value it had on entry
|
||||
if(knodesD[currKnodeD[bid]].keys[thid] == keysD[bid]){
|
||||
ansD[bid].value = recordsD[knodesD[currKnodeD[bid]].indices[thid]].value;
|
||||
}
|
||||
|
||||
}
|
||||
|
||||
//========================================================================================================================================================================================================200
|
||||
// End
|
||||
//========================================================================================================================================================================================================200
|
|
@ -1,70 +0,0 @@
|
|||
//========================================================================================================================================================================================================200
|
||||
// findRangeK function
|
||||
//========================================================================================================================================================================================================200
|
||||
|
||||
// findRangeK: range lookup of one [start, end] pair per thread block.
//
// Each block (bid = blockIdx.x) handles the range startD[bid]..endD[bid];
// each thread (thid = threadIdx.x) inspects slot thid of two nodes at once:
// the node on the path towards the start key (currKnodeD[bid]) and the node
// on the path towards the end key (lastKnodeD[bid]). After `height`
// iterations the thread whose key equals the start key stores that slot's
// index in RecstartD[bid], and the thread whose key equals the end key stores
// (end index - RecstartD[bid] + 1) in ReclenD[bid]. Neither output is written
// when the corresponding key is not found.
//
// Parameters:
//   height       number of loop iterations (tree levels to descend)
//   knodesD      node array; each node exposes keys[] and indices[]
//   knodes_elem  upper bound (exclusive) accepted for a child index
//   currKnodeD   per-block current node on the start-key path (updated here)
//   offsetD      per-block next node on the start-key path (updated here)
//   lastKnodeD   per-block current node on the end-key path (updated here)
//   offset_2D    per-block next node on the end-key path (updated here)
//   startD       per-block first key of the range
//   endD         per-block last key of the range
//   RecstartD    per-block output: index stored for the start key
//   ReclenD      per-block output: number of entries from start to end, inclusive
//
// Because keys[thid+1] is read, every node's keys[] must hold at least one
// more entry than the number of threads in the block.
__global__ void
|
||||
findRangeK( long height,
|
||||
|
||||
knode *knodesD,
|
||||
long knodes_elem,
|
||||
|
||||
long *currKnodeD,
|
||||
long *offsetD,
|
||||
long *lastKnodeD,
|
||||
long *offset_2D,
|
||||
int *startD,
|
||||
int *endD,
|
||||
int *RecstartD,
|
||||
int *ReclenD)
|
||||
{
|
||||
|
||||
// private thread IDs
|
||||
int thid = threadIdx.x;
|
||||
int bid = blockIdx.x;
|
||||
|
||||
// process tree levels, following the start key and the end key in parallel
|
||||
int i;
|
||||
for(i = 0; i < height; i++){
|
||||
|
||||
// start-key path: this slot is chosen when the start key lies between its key and the next slot's key
|
||||
if((knodesD[currKnodeD[bid]].keys[thid] <= startD[bid]) && (knodesD[currKnodeD[bid]].keys[thid+1] > startD[bid])){
|
||||
// this conditional statement is inserted to avoid a crash due to a bug in the original code:
|
||||
// "offset[bid]" calculated below, which later addresses part of knodes, can go outside of its bounds and cause a segmentation fault;
|
||||
// more specifically, values saved into knodes->indices in the main function are out of bounds of the knodes array that they address
|
||||
if(knodesD[currKnodeD[bid]].indices[thid] < knodes_elem){
|
||||
offsetD[bid] = knodesD[currKnodeD[bid]].indices[thid];
|
||||
}
|
||||
}
|
||||
// end-key path: same test against the end key, on the node tracked by lastKnodeD
|
||||
if((knodesD[lastKnodeD[bid]].keys[thid] <= endD[bid]) && (knodesD[lastKnodeD[bid]].keys[thid+1] > endD[bid])){
|
||||
// this conditional statement is inserted to avoid a crash due to a bug in the original code:
|
||||
// "offset_2[bid]" calculated below, which later addresses part of knodes, can go outside of its bounds and cause a segmentation fault;
|
||||
// more specifically, values saved into knodes->indices in the main function are out of bounds of the knodes array that they address
|
||||
if(knodesD[lastKnodeD[bid]].indices[thid] < knodes_elem){
|
||||
offset_2D[bid] = knodesD[lastKnodeD[bid]].indices[thid];
|
||||
}
|
||||
}
|
||||
// barrier: both offset updates must be complete before thread 0 reads them
|
||||
__syncthreads();
|
||||
|
||||
// set for next tree level
|
||||
if(thid==0){
|
||||
currKnodeD[bid] = offsetD[bid];
|
||||
lastKnodeD[bid] = offset_2D[bid];
|
||||
}
|
||||
// barrier: all threads must see the new node indices before the next iteration
|
||||
__syncthreads();
|
||||
}
|
||||
|
||||
// Find the index of the starting record
|
||||
if(knodesD[currKnodeD[bid]].keys[thid] == startD[bid]){
|
||||
RecstartD[bid] = knodesD[currKnodeD[bid]].indices[thid];
|
||||
}
|
||||
// barrier: RecstartD[bid] may have been written by a different thread than the one that reads it below
|
||||
__syncthreads();
|
||||
|
||||
// Find the index of the ending record and derive the inclusive range length
|
||||
if(knodesD[lastKnodeD[bid]].keys[thid] == endD[bid]){
|
||||
ReclenD[bid] = knodesD[lastKnodeD[bid]].indices[thid] - RecstartD[bid]+1;
|
||||
}
|
||||
|
||||
}
|
||||
|
||||
//========================================================================================================================================================================================================200
|
||||
// End
|
||||
//========================================================================================================================================================================================================200
|
|
@ -1,292 +0,0 @@
|
|||
#ifdef __cplusplus
|
||||
extern "C" {
|
||||
#endif
|
||||
|
||||
//========================================================================================================================================================================================================200
|
||||
// DEFINE/INCLUDE
|
||||
//========================================================================================================================================================================================================200
|
||||
|
||||
//======================================================================================================================================================150
|
||||
// COMMON
|
||||
//======================================================================================================================================================150
|
||||
|
||||
#include "../common.h" // (in main program directory) needed to recognize input variables
|
||||
|
||||
//======================================================================================================================================================150
|
||||
// UTILITIES
|
||||
//======================================================================================================================================================150
|
||||
|
||||
#include "../util/cuda/cuda.h" // (in path specified to compiler) needed for device functions
|
||||
#include "../util/timer/timer.h" // (in path specified to compiler) needed by timer
|
||||
|
||||
//======================================================================================================================================================150
|
||||
// KERNEL
|
||||
//======================================================================================================================================================150
|
||||
|
||||
#include "./kernel_gpu_cuda.cu" // (in current directory) GPU kernel, cannot include with header file because of complications with passing of constant memory variables
|
||||
|
||||
//======================================================================================================================================================150
|
||||
// HEADER
|
||||
//======================================================================================================================================================150
|
||||
|
||||
#include "./kernel_gpu_cuda_wrapper.h" // (in current directory)
|
||||
|
||||
//========================================================================================================================================================================================================200
|
||||
// KERNEL_GPU_CUDA_WRAPPER FUNCTION
|
||||
//========================================================================================================================================================================================================200
|
||||
|
||||
// Host-side driver for the findK kernel.
//
// Allocates device buffers, copies the tree and the per-query arrays to the
// device, launches findK with one block per query, copies the answers back,
// frees the device buffers and prints a per-stage timing report to stdout.
//
// Parameters:
//   records      host record array, copied to the device
//   records_mem  size of `records` in bytes
//   knodes       host node array, copied to the device
//   knodes_elem  forwarded to findK as the child-index bound
//   knodes_mem   size of `knodes` in bytes
//   order        requested threads per block (capped at 1024)
//   maxheight    forwarded to findK as the number of levels to descend
//   count        number of queries: one block per query, and the element
//                count of currKnode, offset, keys and ans
//   currKnode    host array of `count` longs, copied to the device
//   offset       host array of `count` longs, copied to the device
//   keys         host array of `count` search keys, copied to the device
//   ans          host array of `count` records; copied to the device before
//                the launch and overwritten with the device copy afterwards
//
// CUDA errors are reported through checkCUDAError() after each runtime call.
void
kernel_gpu_cuda_wrapper(record *records,
						long records_mem,
						knode *knodes,
						long knodes_elem,
						long knodes_mem,

						int order,
						long maxheight,
						int count,

						long *currKnode,
						long *offset,
						int *keys,
						record *ans)
{

	//======================================================================================================================================================150
	//	CPU VARIABLES
	//======================================================================================================================================================150

	// timestamps taken at each stage boundary; the report below divides the
	// differences by 1000000 to print seconds
	long long time0;
	long long time1;
	long long time2;
	long long time3;
	long long time4;
	long long time5;
	long long time6;

	time0 = get_time();

	//======================================================================================================================================================150
	//	GPU SETUP
	//======================================================================================================================================================150

	// INITIAL DRIVER OVERHEAD
	// first runtime call, so that driver initialisation is charged to this stage.
	// NOTE(review): cudaThreadSynchronize() is deprecated in the CUDA runtime in
	// favour of cudaDeviceSynchronize(); kept as-is to match the rest of the project.
	cudaThreadSynchronize();

	// EXECUTION PARAMETERS
	int numBlocks;
	numBlocks = count;									// max # of blocks can be 65,535
	int threadsPerBlock;
	threadsPerBlock = order < 1024 ? order : 1024;

	printf("# of blocks = %d, # of threads/block = %d (ensure that device can handle)\n", numBlocks, threadsPerBlock);

	time1 = get_time();

	//======================================================================================================================================================150
	//	GPU MEMORY (MALLOC)
	//======================================================================================================================================================150

	// DEVICE IN: recordsD
	record *recordsD;
	cudaMalloc((void**)&recordsD, records_mem);
	checkCUDAError("cudaMalloc recordsD");

	// DEVICE IN: knodesD
	knode *knodesD;
	cudaMalloc((void**)&knodesD, knodes_mem);
	checkCUDAError("cudaMalloc knodesD");				// label previously named recordsD by mistake

	// DEVICE IN: currKnodeD
	long *currKnodeD;
	cudaMalloc((void**)&currKnodeD, count*sizeof(long));
	checkCUDAError("cudaMalloc currKnodeD");

	// DEVICE IN: offsetD
	long *offsetD;
	cudaMalloc((void**)&offsetD, count*sizeof(long));
	checkCUDAError("cudaMalloc offsetD");

	// DEVICE IN: keysD
	int *keysD;
	cudaMalloc((void**)&keysD, count*sizeof(int));
	checkCUDAError("cudaMalloc keysD");

	// DEVICE IN/OUT: ansD
	record *ansD;
	cudaMalloc((void**)&ansD, count*sizeof(record));
	checkCUDAError("cudaMalloc ansD");

	time2 = get_time();

	//======================================================================================================================================================150
	//	GPU MEMORY COPY (IN)
	//======================================================================================================================================================150

	// recordsD
	cudaMemcpy(recordsD, records, records_mem, cudaMemcpyHostToDevice);
	checkCUDAError("cudaMalloc cudaMemcpy recordsD");	// label previously said memD

	// knodesD
	cudaMemcpy(knodesD, knodes, knodes_mem, cudaMemcpyHostToDevice);
	checkCUDAError("cudaMalloc cudaMemcpy knodesD");	// label previously said memD

	// currKnodeD
	cudaMemcpy(currKnodeD, currKnode, count*sizeof(long), cudaMemcpyHostToDevice);
	checkCUDAError("cudaMalloc cudaMemcpy currKnodeD");

	// offsetD
	cudaMemcpy(offsetD, offset, count*sizeof(long), cudaMemcpyHostToDevice);
	checkCUDAError("cudaMalloc cudaMemcpy offsetD");

	// keysD
	cudaMemcpy(keysD, keys, count*sizeof(int), cudaMemcpyHostToDevice);
	checkCUDAError("cudaMalloc cudaMemcpy keysD");

	// ansD: copied in as well, because the whole array is copied back after the launch
	cudaMemcpy(ansD, ans, count*sizeof(record), cudaMemcpyHostToDevice);
	checkCUDAError("cudaMalloc cudaMemcpy ansD");

	time3 = get_time();

	//======================================================================================================================================================150
	//	findK kernel
	//======================================================================================================================================================150

	findK<<<numBlocks, threadsPerBlock>>>(	maxheight,

											knodesD,
											knodes_elem,

											recordsD,

											currKnodeD,
											offsetD,
											keysD,
											ansD);
	// wait for the kernel so that time4 covers its execution and errors surface here
	cudaThreadSynchronize();
	checkCUDAError("findK");

	time4 = get_time();

	//======================================================================================================================================================150
	//	GPU MEMORY COPY (OUT)
	//======================================================================================================================================================150

	// ansD
	cudaMemcpy(ans, ansD, count*sizeof(record), cudaMemcpyDeviceToHost);
	checkCUDAError("cudaMemcpy ansD");

	time5 = get_time();

	//======================================================================================================================================================150
	//	GPU MEMORY DEALLOCATION
	//======================================================================================================================================================150

	cudaFree(recordsD);
	cudaFree(knodesD);

	cudaFree(currKnodeD);
	cudaFree(offsetD);
	cudaFree(keysD);
	cudaFree(ansD);

	time6 = get_time();

	//======================================================================================================================================================150
	//	DISPLAY TIMING
	//======================================================================================================================================================150

	// total elapsed time, used as the denominator of every percentage below
	float total = (float) (time6-time0);

	printf("Time spent in different stages of GPU_CUDA KERNEL:\n");

	// a literal percent sign must be written "%%"; the former "% :" was an
	// invalid conversion specification
	printf("%15.12f s, %15.12f %% : GPU: SET DEVICE / DRIVER INIT\n",	(float) (time1-time0) / 1000000, (float) (time1-time0) / total * 100);
	printf("%15.12f s, %15.12f %% : GPU MEM: ALO\n", 					(float) (time2-time1) / 1000000, (float) (time2-time1) / total * 100);
	printf("%15.12f s, %15.12f %% : GPU MEM: COPY IN\n",					(float) (time3-time2) / 1000000, (float) (time3-time2) / total * 100);

	printf("%15.12f s, %15.12f %% : GPU: KERNEL\n",						(float) (time4-time3) / 1000000, (float) (time4-time3) / total * 100);

	printf("%15.12f s, %15.12f %% : GPU MEM: COPY OUT\n",				(float) (time5-time4) / 1000000, (float) (time5-time4) / total * 100);
	printf("%15.12f s, %15.12f %% : GPU MEM: FRE\n", 					(float) (time6-time5) / 1000000, (float) (time6-time5) / total * 100);

	printf("Total time:\n");
	printf("%.12f s\n", 												(float) (time6-time0) / 1000000);

}
|
||||
|
||||
//========================================================================================================================================================================================================200
|
||||
// END
|
||||
//========================================================================================================================================================================================================200
|
||||
|
||||
#ifdef __cplusplus
|
||||
}
|
||||
#endif
|
|
@ -1,23 +0,0 @@
|
|||
#ifdef __cplusplus
|
||||
extern "C" {
|
||||
#endif
|
||||
|
||||
//========================================================================================================================================================================================================200
|
||||
// KERNEL_GPU_CUDA_WRAPPER HEADER
|
||||
//========================================================================================================================================================================================================200
|
||||
|
||||
void kernel_gpu_cuda_wrapper(record *records, long records_mem, knode *knodes,
|
||||
long knodes_elem, long knodes_mem,
|
||||
|
||||
int order, long maxheight, int count,
|
||||
|
||||
long *currKnode, long *offset, int *keys,
|
||||
record *ans);
|
||||
|
||||
//========================================================================================================================================================================================================200
|
||||
// End
|
||||
//========================================================================================================================================================================================================200
|
||||
|
||||
#ifdef __cplusplus
|
||||
}
|
||||
#endif
|
|
@ -1,347 +0,0 @@
|
|||
#ifdef __cplusplus
|
||||
extern "C" {
|
||||
#endif
|
||||
|
||||
//========================================================================================================================================================================================================200
|
||||
// INCLUDE
|
||||
//========================================================================================================================================================================================================200
|
||||
|
||||
//======================================================================================================================================================150
|
||||
// COMMON
|
||||
//======================================================================================================================================================150
|
||||
|
||||
#include "../common.h" // (in the main program folder) needed to recognize input parameters
|
||||
|
||||
//======================================================================================================================================================150
|
||||
// UTILITIES
|
||||
//======================================================================================================================================================150
|
||||
|
||||
#include "../util/cuda/cuda.h" // (in library path specified to compiler) needed for device functions
|
||||
#include "../util/timer/timer.h" // (in library path specified to compiler) needed by timer
|
||||
|
||||
//======================================================================================================================================================150
|
||||
// KERNEL
|
||||
//======================================================================================================================================================150
|
||||
|
||||
#include "./kernel_gpu_cuda_2.cu" // (in the current directory) GPU kernel, cannot include with header file because of complications with passing of constant memory variables
|
||||
|
||||
//======================================================================================================================================================150
|
||||
// HEADER
|
||||
//======================================================================================================================================================150
|
||||
|
||||
#include "./kernel_gpu_cuda_wrapper_2.h" // (in the current directory)
|
||||
|
||||
//========================================================================================================================================================================================================200
|
||||
// FUNCTION
|
||||
//========================================================================================================================================================================================================200
|
||||
|
||||
/*
 * Host-side driver for the findRangeK kernel.
 *
 * Allocates the device buffers, copies the node array and the per-query
 * arrays to the device, launches findRangeK with `count` blocks of
 * min(order, 1024) threads, copies the two answer arrays back to the host,
 * frees the device buffers and prints a per-stage timing breakdown to stdout.
 *
 * knodes       host node array; knodes_mem bytes of it are copied to the device
 * knodes_elem  forwarded to the kernel unchanged
 * knodes_mem   size in bytes of the knodes buffer
 * order        only used to size the thread block (capped at 1024)
 * maxheight    forwarded to the kernel unchanged
 * count        element count of every per-query array; also the grid size
 * currKnode, offset, lastKnode, offset_2, start, end
 *              host input arrays, count elements each
 * recstart, reclength
 *              host arrays, count elements each; uploaded before the launch
 *              and overwritten with the device contents after it
 *
 * Every allocation, copy and the kernel launch is followed by a
 * checkCUDAError() call labelled with the operation that was just issued.
 */
void
kernel_gpu_cuda_wrapper_2(	knode *knodes,
							long knodes_elem,
							long knodes_mem,

							int order,
							long maxheight,
							int count,

							long *currKnode,
							long *offset,
							long *lastKnode,
							long *offset_2,
							int *start,
							int *end,
							int *recstart,
							int *reclength)
{

	//======================================================================================================================================================150
	//	CPU VARIABLES
	//======================================================================================================================================================150

	// stage timestamps; NOTE(review): presumably microseconds, since the
	// report divides by 1000000 to print seconds -- confirm against get_time()
	long long time0;
	long long time1;
	long long time2;
	long long time3;
	long long time4;
	long long time5;
	long long time6;

	time0 = get_time();

	//======================================================================================================================================================150
	//	GPU SETUP
	//======================================================================================================================================================150

	// INITIAL DRIVER OVERHEAD: synchronize once up front so this cost is
	// attributed to the first timed stage.
	// cudaDeviceSynchronize() replaces the deprecated cudaThreadSynchronize().
	cudaDeviceSynchronize();

	// EXECUTION PARAMETERS: one block per query, at most 1024 threads per block
	int numBlocks;
	numBlocks = count;
	int threadsPerBlock;
	threadsPerBlock = order < 1024 ? order : 1024;

	printf("# of blocks = %d, # of threads/block = %d (ensure that device can handle)\n", numBlocks, threadsPerBlock);

	// byte sizes shared by the allocations and copies below
	size_t longBytes = count*sizeof(long);
	size_t intBytes = count*sizeof(int);

	time1 = get_time();

	//======================================================================================================================================================150
	//	GPU MEMORY MALLOC
	//======================================================================================================================================================150

	//====================================================================================================100
	//	DEVICE IN
	//====================================================================================================100

	knode *knodesD;
	cudaMalloc((void**)&knodesD, knodes_mem);
	checkCUDAError("cudaMalloc knodesD");

	long *currKnodeD;
	cudaMalloc((void**)&currKnodeD, longBytes);
	checkCUDAError("cudaMalloc currKnodeD");

	long *offsetD;
	cudaMalloc((void**)&offsetD, longBytes);
	checkCUDAError("cudaMalloc offsetD");

	long *lastKnodeD;
	cudaMalloc((void**)&lastKnodeD, longBytes);
	checkCUDAError("cudaMalloc lastKnodeD");

	long *offset_2D;
	cudaMalloc((void**)&offset_2D, longBytes);
	checkCUDAError("cudaMalloc offset_2D");

	int *startD;
	cudaMalloc((void**)&startD, intBytes);
	checkCUDAError("cudaMalloc startD");

	int *endD;
	cudaMalloc((void**)&endD, intBytes);
	checkCUDAError("cudaMalloc endD");

	//====================================================================================================100
	//	DEVICE IN/OUT
	//====================================================================================================100

	int *ansDStart;
	cudaMalloc((void**)&ansDStart, intBytes);
	checkCUDAError("cudaMalloc ansDStart");

	int *ansDLength;
	cudaMalloc((void**)&ansDLength, intBytes);
	checkCUDAError("cudaMalloc ansDLength");

	time2 = get_time();

	//======================================================================================================================================================150
	//	GPU MEMORY COPY
	//======================================================================================================================================================150

	//====================================================================================================100
	//	DEVICE IN
	//====================================================================================================100

	cudaMemcpy(knodesD, knodes, knodes_mem, cudaMemcpyHostToDevice);
	checkCUDAError("cudaMemcpy knodesD");

	cudaMemcpy(currKnodeD, currKnode, longBytes, cudaMemcpyHostToDevice);
	checkCUDAError("cudaMemcpy currKnodeD");

	cudaMemcpy(offsetD, offset, longBytes, cudaMemcpyHostToDevice);
	checkCUDAError("cudaMemcpy offsetD");

	cudaMemcpy(lastKnodeD, lastKnode, longBytes, cudaMemcpyHostToDevice);
	checkCUDAError("cudaMemcpy lastKnodeD");

	cudaMemcpy(offset_2D, offset_2, longBytes, cudaMemcpyHostToDevice);
	checkCUDAError("cudaMemcpy offset_2D");

	cudaMemcpy(startD, start, intBytes, cudaMemcpyHostToDevice);
	checkCUDAError("cudaMemcpy startD");

	cudaMemcpy(endD, end, intBytes, cudaMemcpyHostToDevice);
	checkCUDAError("cudaMemcpy endD");

	//====================================================================================================100
	//	DEVICE IN/OUT
	//====================================================================================================100

	cudaMemcpy(ansDStart, recstart, intBytes, cudaMemcpyHostToDevice);
	checkCUDAError("cudaMemcpy ansDStart");

	cudaMemcpy(ansDLength, reclength, intBytes, cudaMemcpyHostToDevice);
	checkCUDAError("cudaMemcpy ansDLength");

	time3 = get_time();

	//======================================================================================================================================================150
	//	KERNEL
	//======================================================================================================================================================150

	// [GPU] findRangeK kernel
	findRangeK<<<numBlocks, threadsPerBlock>>>(	maxheight,
												knodesD,
												knodes_elem,

												currKnodeD,
												offsetD,
												lastKnodeD,
												offset_2D,
												startD,
												endD,
												ansDStart,
												ansDLength);
	// wait for the kernel so that time4 covers its execution and so that
	// checkCUDAError() runs after the kernel has finished
	cudaDeviceSynchronize();
	checkCUDAError("findRangeK");

	time4 = get_time();

	//======================================================================================================================================================150
	//	GPU MEMORY COPY (CONTD.)
	//======================================================================================================================================================150

	//====================================================================================================100
	//	DEVICE IN/OUT
	//====================================================================================================100

	cudaMemcpy(recstart, ansDStart, intBytes, cudaMemcpyDeviceToHost);
	checkCUDAError("cudaMemcpy ansDStart");

	cudaMemcpy(reclength, ansDLength, intBytes, cudaMemcpyDeviceToHost);
	checkCUDAError("cudaMemcpy ansDLength");

	time5 = get_time();

	//======================================================================================================================================================150
	//	GPU MEMORY DEALLOCATION
	//======================================================================================================================================================150

	cudaFree(knodesD);

	cudaFree(currKnodeD);
	cudaFree(offsetD);
	cudaFree(lastKnodeD);
	cudaFree(offset_2D);
	cudaFree(startD);
	cudaFree(endD);
	cudaFree(ansDStart);
	cudaFree(ansDLength);

	time6 = get_time();

	//======================================================================================================================================================150
	//	DISPLAY TIMING
	//======================================================================================================================================================150

	// "%%" emits a literal percent sign; a lone '%' followed by " :" is not a
	// valid printf conversion specification.
	printf("Time spent in different stages of GPU_CUDA KERNEL:\n");

	printf("%15.12f s, %15.12f %% : GPU: SET DEVICE / DRIVER INIT\n",	(float) (time1-time0) / 1000000, (float) (time1-time0) / (float) (time6-time0) * 100);
	printf("%15.12f s, %15.12f %% : GPU MEM: ALO\n", 					(float) (time2-time1) / 1000000, (float) (time2-time1) / (float) (time6-time0) * 100);
	printf("%15.12f s, %15.12f %% : GPU MEM: COPY IN\n",					(float) (time3-time2) / 1000000, (float) (time3-time2) / (float) (time6-time0) * 100);

	printf("%15.12f s, %15.12f %% : GPU: KERNEL\n",						(float) (time4-time3) / 1000000, (float) (time4-time3) / (float) (time6-time0) * 100);

	printf("%15.12f s, %15.12f %% : GPU MEM: COPY OUT\n",				(float) (time5-time4) / 1000000, (float) (time5-time4) / (float) (time6-time0) * 100);
	printf("%15.12f s, %15.12f %% : GPU MEM: FRE\n", 					(float) (time6-time5) / 1000000, (float) (time6-time5) / (float) (time6-time0) * 100);

	printf("Total time:\n");
	printf("%.12f s\n", 												(float) (time6-time0) / 1000000);

}
|
||||
|
||||
//========================================================================================================================================================================================================200
|
||||
// END
|
||||
//========================================================================================================================================================================================================200
|
||||
|
||||
#ifdef __cplusplus
|
||||
}
|
||||
#endif
|
|
@ -1,23 +0,0 @@
|
|||
#ifdef __cplusplus
|
||||
extern "C" {
|
||||
#endif
|
||||
|
||||
//========================================================================================================================================================================================================200
|
||||
// KERNEL_GPU_CUDA_WRAPPER HEADER
|
||||
//========================================================================================================================================================================================================200
|
||||
|
||||
/*
 * Host entry point that runs the findRangeK kernel over `count` queries.
 *
 * knodes / knodes_mem describe the host node buffer and its size in bytes;
 * knodes_elem and maxheight are forwarded to the kernel; order sizes the
 * thread block. The remaining pointers are per-query arrays of `count`
 * elements; recstart and reclength are overwritten with the results.
 */
void kernel_gpu_cuda_wrapper_2(knode *knodes,
                               long knodes_elem,
                               long knodes_mem,

                               int order,
                               long maxheight,
                               int count,

                               long *currKnode,
                               long *offset,
                               long *lastKnode,
                               long *offset_2,
                               int *start,
                               int *end,
                               int *recstart,
                               int *reclength);
|
||||
|
||||
//========================================================================================================================================================================================================200
|
||||
// End
|
||||
//========================================================================================================================================================================================================200
|
||||
|
||||
#ifdef __cplusplus
|
||||
}
|
||||
#endif
|
|
@ -1,332 +0,0 @@
|
|||
; ModuleID = 'kernel_gpu_cuda_wrapper-cuda-nvptx64-nvidia-cuda-sm_61.bc'
|
||||
source_filename = "kernel/kernel_gpu_cuda_wrapper.cu"
|
||||
target datalayout = "e-i64:64-i128:128-v16:16-v32:32-n16:32:64"
|
||||
target triple = "nvptx64-nvidia-cuda"
|
||||
|
||||
%struct.__cuda_builtin_threadIdx_t = type { i8 }
|
||||
%struct.__cuda_builtin_blockIdx_t = type { i8 }
|
||||
%struct.cudaFuncAttributes = type { i64, i64, i64, i32, i32, i32, i32, i32, i32, i32 }
|
||||
%struct.knode = type { i32, [257 x i32], [257 x i32], i8, i32 }
|
||||
%struct.record = type { i32 }
|
||||
|
||||
$_ZN26__cuda_builtin_threadIdx_t17__fetch_builtin_xEv = comdat any
|
||||
|
||||
$_ZN25__cuda_builtin_blockIdx_t17__fetch_builtin_xEv = comdat any
|
||||
|
||||
@threadIdx = extern_weak dso_local addrspace(1) global %struct.__cuda_builtin_threadIdx_t, align 1
|
||||
@blockIdx = extern_weak dso_local addrspace(1) global %struct.__cuda_builtin_blockIdx_t, align 1
|
||||
|
||||
; Weak definition of cudaMalloc emitted into this nvptx64 module.
; It only spills both arguments to stack slots and returns the constant 999;
; nothing is allocated and %p is never written through.
; NOTE(review): 999 presumably is the numeric value of cudaErrorUnknown - confirm against the CUDA headers.
; Function Attrs: convergent noinline nounwind optnone
|
||||
define weak dso_local i32 @cudaMalloc(i8** %p, i64 %s) #0 {
|
||||
entry:
|
||||
%p.addr = alloca i8**, align 8
|
||||
%s.addr = alloca i64, align 8
|
||||
store i8** %p, i8*** %p.addr, align 8
|
||||
store i64 %s, i64* %s.addr, align 8
|
||||
ret i32 999
|
||||
}
|
||||
|
||||
; Weak definition of cudaFuncGetAttributes emitted into this nvptx64 module.
; It spills both arguments to stack slots and returns the constant 999;
; the attributes struct pointed to by %p is never written.
; NOTE(review): 999 presumably is the numeric value of cudaErrorUnknown - confirm against the CUDA headers.
; Function Attrs: convergent noinline nounwind optnone
|
||||
define weak dso_local i32 @cudaFuncGetAttributes(%struct.cudaFuncAttributes* %p, i8* %c) #0 {
|
||||
entry:
|
||||
%p.addr = alloca %struct.cudaFuncAttributes*, align 8
|
||||
%c.addr = alloca i8*, align 8
|
||||
store %struct.cudaFuncAttributes* %p, %struct.cudaFuncAttributes** %p.addr, align 8
|
||||
store i8* %c, i8** %c.addr, align 8
|
||||
ret i32 999
|
||||
}
|
||||
|
||||
; Weak definition of cudaDeviceGetAttribute emitted into this nvptx64 module.
; It spills its three arguments to stack slots and returns the constant 999;
; nothing is stored through %value.
; NOTE(review): 999 presumably is the numeric value of cudaErrorUnknown - confirm against the CUDA headers.
; Function Attrs: convergent noinline nounwind optnone
|
||||
define weak dso_local i32 @cudaDeviceGetAttribute(i32* %value, i32 %attr, i32 %device) #0 {
|
||||
entry:
|
||||
%value.addr = alloca i32*, align 8
|
||||
%attr.addr = alloca i32, align 4
|
||||
%device.addr = alloca i32, align 4
|
||||
store i32* %value, i32** %value.addr, align 8
|
||||
store i32 %attr, i32* %attr.addr, align 4
|
||||
store i32 %device, i32* %device.addr, align 4
|
||||
ret i32 999
|
||||
}
|
||||
|
||||
; Weak definition of cudaGetDevice emitted into this nvptx64 module.
; It spills its argument to a stack slot and returns the constant 999;
; nothing is stored through %device.
; NOTE(review): 999 presumably is the numeric value of cudaErrorUnknown - confirm against the CUDA headers.
; Function Attrs: convergent noinline nounwind optnone
|
||||
define weak dso_local i32 @cudaGetDevice(i32* %device) #0 {
|
||||
entry:
|
||||
%device.addr = alloca i32*, align 8
|
||||
store i32* %device, i32** %device.addr, align 8
|
||||
ret i32 999
|
||||
}
|
||||
|
||||
; Weak definition of cudaOccupancyMaxActiveBlocksPerMultiprocessor emitted into this nvptx64 module.
; It spills its four arguments to stack slots and returns the constant 999;
; nothing is stored through %numBlocks.
; NOTE(review): 999 presumably is the numeric value of cudaErrorUnknown - confirm against the CUDA headers.
; Function Attrs: convergent noinline nounwind optnone
|
||||
define weak dso_local i32 @cudaOccupancyMaxActiveBlocksPerMultiprocessor(i32* %numBlocks, i8* %func, i32 %blockSize, i64 %dynamicSmemSize) #0 {
|
||||
entry:
|
||||
%numBlocks.addr = alloca i32*, align 8
|
||||
%func.addr = alloca i8*, align 8
|
||||
%blockSize.addr = alloca i32, align 4
|
||||
%dynamicSmemSize.addr = alloca i64, align 8
|
||||
store i32* %numBlocks, i32** %numBlocks.addr, align 8
|
||||
store i8* %func, i8** %func.addr, align 8
|
||||
store i32 %blockSize, i32* %blockSize.addr, align 4
|
||||
store i64 %dynamicSmemSize, i64* %dynamicSmemSize.addr, align 8
|
||||
ret i32 999
|
||||
}
|
||||
|
||||
; Weak definition of cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags emitted into this nvptx64 module.
; It spills its five arguments to stack slots and returns the constant 999;
; nothing is stored through %numBlocks.
; NOTE(review): 999 presumably is the numeric value of cudaErrorUnknown - confirm against the CUDA headers.
; Function Attrs: convergent noinline nounwind optnone
|
||||
define weak dso_local i32 @cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags(i32* %numBlocks, i8* %func, i32 %blockSize, i64 %dynamicSmemSize, i32 %flags) #0 {
|
||||
entry:
|
||||
%numBlocks.addr = alloca i32*, align 8
|
||||
%func.addr = alloca i8*, align 8
|
||||
%blockSize.addr = alloca i32, align 4
|
||||
%dynamicSmemSize.addr = alloca i64, align 8
|
||||
%flags.addr = alloca i32, align 4
|
||||
store i32* %numBlocks, i32** %numBlocks.addr, align 8
|
||||
store i8* %func, i8** %func.addr, align 8
|
||||
store i32 %blockSize, i32* %blockSize.addr, align 4
|
||||
store i64 %dynamicSmemSize, i64* %dynamicSmemSize.addr, align 8
|
||||
store i32 %flags, i32* %flags.addr, align 4
|
||||
ret i32 999
|
||||
}
|
||||
|
||||
; findK: kernel entry point (registered as "kernel" in !nvvm.annotations).
; With thid = tid.x and bid = ctaid.x, the loop runs i = 0 .. height-1 and in each iteration:
;   - reads node N = knodesD[currKnodeD[bid]] and tests
;     N.keys[thid] <= keysD[bid]  and  N.keys[thid+1] > keysD[bid];
;   - on a match, reads idx = knodesD[offsetD[bid]].indices[thid] and, if idx < knodes_elem,
;     stores idx into offsetD[bid];
;   - executes barrier0, then thread 0 copies offsetD[bid] into currKnodeD[bid], then barrier0 again.
; After the loop, a thread for which knodesD[currKnodeD[bid]].keys[thid] == keysD[bid]
; copies recordsD[indices[thid]].value into ansD[bid].value.
; Field names (keys = struct field 2, indices = struct field 1) are taken from the IR value names.
; Function Attrs: convergent noinline nounwind optnone
|
||||
define dso_local void @findK(i64 %height, %struct.knode* %knodesD, i64 %knodes_elem, %struct.record* %recordsD, i64* %currKnodeD, i64* %offsetD, i32* %keysD, %struct.record* %ansD) #0 {
|
||||
; entry: spill every argument to a stack slot, cache tid.x / ctaid.x in %thid / %bid, set i = 0
entry:
|
||||
%height.addr = alloca i64, align 8
|
||||
%knodesD.addr = alloca %struct.knode*, align 8
|
||||
%knodes_elem.addr = alloca i64, align 8
|
||||
%recordsD.addr = alloca %struct.record*, align 8
|
||||
%currKnodeD.addr = alloca i64*, align 8
|
||||
%offsetD.addr = alloca i64*, align 8
|
||||
%keysD.addr = alloca i32*, align 8
|
||||
%ansD.addr = alloca %struct.record*, align 8
|
||||
%thid = alloca i32, align 4
|
||||
%bid = alloca i32, align 4
|
||||
%i = alloca i32, align 4
|
||||
store i64 %height, i64* %height.addr, align 8
|
||||
store %struct.knode* %knodesD, %struct.knode** %knodesD.addr, align 8
|
||||
store i64 %knodes_elem, i64* %knodes_elem.addr, align 8
|
||||
store %struct.record* %recordsD, %struct.record** %recordsD.addr, align 8
|
||||
store i64* %currKnodeD, i64** %currKnodeD.addr, align 8
|
||||
store i64* %offsetD, i64** %offsetD.addr, align 8
|
||||
store i32* %keysD, i32** %keysD.addr, align 8
|
||||
store %struct.record* %ansD, %struct.record** %ansD.addr, align 8
|
||||
%call = call i32 @_ZN26__cuda_builtin_threadIdx_t17__fetch_builtin_xEv() #2
|
||||
store i32 %call, i32* %thid, align 4
|
||||
%call1 = call i32 @_ZN25__cuda_builtin_blockIdx_t17__fetch_builtin_xEv() #2
|
||||
store i32 %call1, i32* %bid, align 4
|
||||
store i32 0, i32* %i, align 4
|
||||
br label %for.cond
|
||||
|
||||
; loop test: continue while (i64)i < height
for.cond: ; preds = %for.inc, %entry
|
||||
%0 = load i32, i32* %i, align 4
|
||||
%conv = sext i32 %0 to i64
|
||||
%1 = load i64, i64* %height.addr, align 8
|
||||
%cmp = icmp slt i64 %conv, %1
|
||||
br i1 %cmp, label %for.body, label %for.end
|
||||
|
||||
; first half of the range test: knodesD[currKnodeD[bid]].keys[thid] <= keysD[bid]
for.body: ; preds = %for.cond
|
||||
%2 = load %struct.knode*, %struct.knode** %knodesD.addr, align 8
|
||||
%3 = load i64*, i64** %currKnodeD.addr, align 8
|
||||
%4 = load i32, i32* %bid, align 4
|
||||
%idxprom = sext i32 %4 to i64
|
||||
%arrayidx = getelementptr inbounds i64, i64* %3, i64 %idxprom
|
||||
%5 = load i64, i64* %arrayidx, align 8
|
||||
%arrayidx2 = getelementptr inbounds %struct.knode, %struct.knode* %2, i64 %5
|
||||
%keys = getelementptr inbounds %struct.knode, %struct.knode* %arrayidx2, i32 0, i32 2
|
||||
%6 = load i32, i32* %thid, align 4
|
||||
%idxprom3 = sext i32 %6 to i64
|
||||
%arrayidx4 = getelementptr inbounds [257 x i32], [257 x i32]* %keys, i64 0, i64 %idxprom3
|
||||
%7 = load i32, i32* %arrayidx4, align 4
|
||||
%8 = load i32*, i32** %keysD.addr, align 8
|
||||
%9 = load i32, i32* %bid, align 4
|
||||
%idxprom5 = sext i32 %9 to i64
|
||||
%arrayidx6 = getelementptr inbounds i32, i32* %8, i64 %idxprom5
|
||||
%10 = load i32, i32* %arrayidx6, align 4
|
||||
%cmp7 = icmp sle i32 %7, %10
|
||||
br i1 %cmp7, label %land.lhs.true, label %if.end34
|
||||
|
||||
; second half of the range test: knodesD[currKnodeD[bid]].keys[thid+1] > keysD[bid]
land.lhs.true: ; preds = %for.body
|
||||
%11 = load %struct.knode*, %struct.knode** %knodesD.addr, align 8
|
||||
%12 = load i64*, i64** %currKnodeD.addr, align 8
|
||||
%13 = load i32, i32* %bid, align 4
|
||||
%idxprom8 = sext i32 %13 to i64
|
||||
%arrayidx9 = getelementptr inbounds i64, i64* %12, i64 %idxprom8
|
||||
%14 = load i64, i64* %arrayidx9, align 8
|
||||
%arrayidx10 = getelementptr inbounds %struct.knode, %struct.knode* %11, i64 %14
|
||||
%keys11 = getelementptr inbounds %struct.knode, %struct.knode* %arrayidx10, i32 0, i32 2
|
||||
%15 = load i32, i32* %thid, align 4
|
||||
%add = add nsw i32 %15, 1
|
||||
%idxprom12 = sext i32 %add to i64
|
||||
%arrayidx13 = getelementptr inbounds [257 x i32], [257 x i32]* %keys11, i64 0, i64 %idxprom12
|
||||
%16 = load i32, i32* %arrayidx13, align 4
|
||||
%17 = load i32*, i32** %keysD.addr, align 8
|
||||
%18 = load i32, i32* %bid, align 4
|
||||
%idxprom14 = sext i32 %18 to i64
|
||||
%arrayidx15 = getelementptr inbounds i32, i32* %17, i64 %idxprom14
|
||||
%19 = load i32, i32* %arrayidx15, align 4
|
||||
%cmp16 = icmp sgt i32 %16, %19
|
||||
br i1 %cmp16, label %if.then, label %if.end34
|
||||
|
||||
; key is in range: bounds-check knodesD[offsetD[bid]].indices[thid] against knodes_elem
if.then: ; preds = %land.lhs.true
|
||||
%20 = load %struct.knode*, %struct.knode** %knodesD.addr, align 8
|
||||
%21 = load i64*, i64** %offsetD.addr, align 8
|
||||
%22 = load i32, i32* %bid, align 4
|
||||
%idxprom17 = sext i32 %22 to i64
|
||||
%arrayidx18 = getelementptr inbounds i64, i64* %21, i64 %idxprom17
|
||||
%23 = load i64, i64* %arrayidx18, align 8
|
||||
%arrayidx19 = getelementptr inbounds %struct.knode, %struct.knode* %20, i64 %23
|
||||
%indices = getelementptr inbounds %struct.knode, %struct.knode* %arrayidx19, i32 0, i32 1
|
||||
%24 = load i32, i32* %thid, align 4
|
||||
%idxprom20 = sext i32 %24 to i64
|
||||
%arrayidx21 = getelementptr inbounds [257 x i32], [257 x i32]* %indices, i64 0, i64 %idxprom20
|
||||
%25 = load i32, i32* %arrayidx21, align 4
|
||||
%conv22 = sext i32 %25 to i64
|
||||
%26 = load i64, i64* %knodes_elem.addr, align 8
|
||||
%cmp23 = icmp slt i64 %conv22, %26
|
||||
br i1 %cmp23, label %if.then24, label %if.end
|
||||
|
||||
; index is below knodes_elem: offsetD[bid] = (i64) knodesD[offsetD[bid]].indices[thid]
if.then24: ; preds = %if.then
|
||||
%27 = load %struct.knode*, %struct.knode** %knodesD.addr, align 8
|
||||
%28 = load i64*, i64** %offsetD.addr, align 8
|
||||
%29 = load i32, i32* %bid, align 4
|
||||
%idxprom25 = sext i32 %29 to i64
|
||||
%arrayidx26 = getelementptr inbounds i64, i64* %28, i64 %idxprom25
|
||||
%30 = load i64, i64* %arrayidx26, align 8
|
||||
%arrayidx27 = getelementptr inbounds %struct.knode, %struct.knode* %27, i64 %30
|
||||
%indices28 = getelementptr inbounds %struct.knode, %struct.knode* %arrayidx27, i32 0, i32 1
|
||||
%31 = load i32, i32* %thid, align 4
|
||||
%idxprom29 = sext i32 %31 to i64
|
||||
%arrayidx30 = getelementptr inbounds [257 x i32], [257 x i32]* %indices28, i64 0, i64 %idxprom29
|
||||
%32 = load i32, i32* %arrayidx30, align 4
|
||||
%conv31 = sext i32 %32 to i64
|
||||
%33 = load i64*, i64** %offsetD.addr, align 8
|
||||
%34 = load i32, i32* %bid, align 4
|
||||
%idxprom32 = sext i32 %34 to i64
|
||||
%arrayidx33 = getelementptr inbounds i64, i64* %33, i64 %idxprom32
|
||||
store i64 %conv31, i64* %arrayidx33, align 8
|
||||
br label %if.end
|
||||
|
||||
if.end: ; preds = %if.then24, %if.then
|
||||
br label %if.end34
|
||||
|
||||
; all paths of the iteration meet here: barrier0, then only thread 0 takes %if.then36
if.end34: ; preds = %if.end, %land.lhs.true, %for.body
|
||||
call void @llvm.nvvm.barrier0()
|
||||
%35 = load i32, i32* %thid, align 4
|
||||
%cmp35 = icmp eq i32 %35, 0
|
||||
br i1 %cmp35, label %if.then36, label %if.end41
|
||||
|
||||
; thread 0: currKnodeD[bid] = offsetD[bid]
if.then36: ; preds = %if.end34
|
||||
%36 = load i64*, i64** %offsetD.addr, align 8
|
||||
%37 = load i32, i32* %bid, align 4
|
||||
%idxprom37 = sext i32 %37 to i64
|
||||
%arrayidx38 = getelementptr inbounds i64, i64* %36, i64 %idxprom37
|
||||
%38 = load i64, i64* %arrayidx38, align 8
|
||||
%39 = load i64*, i64** %currKnodeD.addr, align 8
|
||||
%40 = load i32, i32* %bid, align 4
|
||||
%idxprom39 = sext i32 %40 to i64
|
||||
%arrayidx40 = getelementptr inbounds i64, i64* %39, i64 %idxprom39
|
||||
store i64 %38, i64* %arrayidx40, align 8
|
||||
br label %if.end41
|
||||
|
||||
; second barrier0 of the iteration, placed after thread 0's currKnodeD store
if.end41: ; preds = %if.then36, %if.end34
|
||||
call void @llvm.nvvm.barrier0()
|
||||
br label %for.inc
|
||||
|
||||
; i = i + 1
for.inc: ; preds = %if.end41
|
||||
%41 = load i32, i32* %i, align 4
|
||||
%inc = add nsw i32 %41, 1
|
||||
store i32 %inc, i32* %i, align 4
|
||||
br label %for.cond
|
||||
|
||||
; after the loop: exact-match test knodesD[currKnodeD[bid]].keys[thid] == keysD[bid]
for.end: ; preds = %for.cond
|
||||
%42 = load %struct.knode*, %struct.knode** %knodesD.addr, align 8
|
||||
%43 = load i64*, i64** %currKnodeD.addr, align 8
|
||||
%44 = load i32, i32* %bid, align 4
|
||||
%idxprom42 = sext i32 %44 to i64
|
||||
%arrayidx43 = getelementptr inbounds i64, i64* %43, i64 %idxprom42
|
||||
%45 = load i64, i64* %arrayidx43, align 8
|
||||
%arrayidx44 = getelementptr inbounds %struct.knode, %struct.knode* %42, i64 %45
|
||||
%keys45 = getelementptr inbounds %struct.knode, %struct.knode* %arrayidx44, i32 0, i32 2
|
||||
%46 = load i32, i32* %thid, align 4
|
||||
%idxprom46 = sext i32 %46 to i64
|
||||
%arrayidx47 = getelementptr inbounds [257 x i32], [257 x i32]* %keys45, i64 0, i64 %idxprom46
|
||||
%47 = load i32, i32* %arrayidx47, align 4
|
||||
%48 = load i32*, i32** %keysD.addr, align 8
|
||||
%49 = load i32, i32* %bid, align 4
|
||||
%idxprom48 = sext i32 %49 to i64
|
||||
%arrayidx49 = getelementptr inbounds i32, i32* %48, i64 %idxprom48
|
||||
%50 = load i32, i32* %arrayidx49, align 4
|
||||
%cmp50 = icmp eq i32 %47, %50
|
||||
br i1 %cmp50, label %if.then51, label %if.end63
|
||||
|
||||
; match: ansD[bid].value = recordsD[ knodesD[currKnodeD[bid]].indices[thid] ].value
if.then51: ; preds = %for.end
|
||||
%51 = load %struct.record*, %struct.record** %recordsD.addr, align 8
|
||||
%52 = load %struct.knode*, %struct.knode** %knodesD.addr, align 8
|
||||
%53 = load i64*, i64** %currKnodeD.addr, align 8
|
||||
%54 = load i32, i32* %bid, align 4
|
||||
%idxprom52 = sext i32 %54 to i64
|
||||
%arrayidx53 = getelementptr inbounds i64, i64* %53, i64 %idxprom52
|
||||
%55 = load i64, i64* %arrayidx53, align 8
|
||||
%arrayidx54 = getelementptr inbounds %struct.knode, %struct.knode* %52, i64 %55
|
||||
%indices55 = getelementptr inbounds %struct.knode, %struct.knode* %arrayidx54, i32 0, i32 1
|
||||
%56 = load i32, i32* %thid, align 4
|
||||
%idxprom56 = sext i32 %56 to i64
|
||||
%arrayidx57 = getelementptr inbounds [257 x i32], [257 x i32]* %indices55, i64 0, i64 %idxprom56
|
||||
%57 = load i32, i32* %arrayidx57, align 4
|
||||
%idxprom58 = sext i32 %57 to i64
|
||||
%arrayidx59 = getelementptr inbounds %struct.record, %struct.record* %51, i64 %idxprom58
|
||||
%value = getelementptr inbounds %struct.record, %struct.record* %arrayidx59, i32 0, i32 0
|
||||
%58 = load i32, i32* %value, align 4
|
||||
%59 = load %struct.record*, %struct.record** %ansD.addr, align 8
|
||||
%60 = load i32, i32* %bid, align 4
|
||||
%idxprom60 = sext i32 %60 to i64
|
||||
%arrayidx61 = getelementptr inbounds %struct.record, %struct.record* %59, i64 %idxprom60
|
||||
%value62 = getelementptr inbounds %struct.record, %struct.record* %arrayidx61, i32 0, i32 0
|
||||
store i32 %58, i32* %value62, align 4
|
||||
br label %if.end63
|
||||
|
||||
if.end63: ; preds = %if.then51, %for.end
|
||||
ret void
|
||||
}
|
||||
|
||||
; Accessor for threadIdx.x: returns the value of the PTX special register tid.x
; via the llvm.nvvm.read.ptx.sreg.tid.x intrinsic.
; Function Attrs: alwaysinline convergent nounwind
|
||||
define linkonce_odr dso_local i32 @_ZN26__cuda_builtin_threadIdx_t17__fetch_builtin_xEv() #1 comdat align 2 {
|
||||
entry:
|
||||
%0 = call i32 @llvm.nvvm.read.ptx.sreg.tid.x()
|
||||
ret i32 %0
|
||||
}
|
||||
|
||||
; Accessor for blockIdx.x: returns the value of the PTX special register ctaid.x
; via the llvm.nvvm.read.ptx.sreg.ctaid.x intrinsic.
; Function Attrs: alwaysinline convergent nounwind
|
||||
define linkonce_odr dso_local i32 @_ZN25__cuda_builtin_blockIdx_t17__fetch_builtin_xEv() #1 comdat align 2 {
|
||||
entry:
|
||||
%0 = call i32 @llvm.nvvm.read.ptx.sreg.ctaid.x()
|
||||
ret i32 %0
|
||||
}
|
||||
|
||||
; Function Attrs: convergent nounwind
|
||||
declare void @llvm.nvvm.barrier0() #2
|
||||
|
||||
; Function Attrs: nounwind readnone
|
||||
declare i32 @llvm.nvvm.read.ptx.sreg.tid.x() #3
|
||||
|
||||
; Function Attrs: nounwind readnone
|
||||
declare i32 @llvm.nvvm.read.ptx.sreg.ctaid.x() #3
|
||||
|
||||
attributes #0 = { convergent noinline nounwind optnone "correctly-rounded-divide-sqrt-fp-math"="false" "disable-tail-calls"="false" "frame-pointer"="all" "less-precise-fpmad"="false" "min-legal-vector-width"="0" "no-infs-fp-math"="false" "no-jump-tables"="false" "no-nans-fp-math"="false" "no-signed-zeros-fp-math"="false" "no-trapping-math"="false" "stack-protector-buffer-size"="8" "target-cpu"="sm_61" "target-features"="+ptx64,+sm_61" "unsafe-fp-math"="false" "use-soft-float"="false" }
|
||||
attributes #1 = { alwaysinline convergent nounwind "correctly-rounded-divide-sqrt-fp-math"="false" "disable-tail-calls"="false" "frame-pointer"="all" "less-precise-fpmad"="false" "min-legal-vector-width"="0" "no-infs-fp-math"="false" "no-jump-tables"="false" "no-nans-fp-math"="false" "no-signed-zeros-fp-math"="false" "no-trapping-math"="false" "stack-protector-buffer-size"="8" "target-cpu"="sm_61" "target-features"="+ptx64,+sm_61" "unsafe-fp-math"="false" "use-soft-float"="false" }
|
||||
attributes #2 = { convergent nounwind }
|
||||
attributes #3 = { nounwind readnone }
|
||||
|
||||
!llvm.module.flags = !{!0, !1, !2}
|
||||
!nvvm.annotations = !{!3, !4, !5, !4, !6, !6, !6, !6, !7, !7, !6}
|
||||
!llvm.ident = !{!8}
|
||||
!nvvmir.version = !{!9}
|
||||
|
||||
!0 = !{i32 2, !"SDK Version", [2 x i32] [i32 10, i32 1]}
|
||||
!1 = !{i32 1, !"wchar_size", i32 4}
|
||||
!2 = !{i32 4, !"nvvm-reflect-ftz", i32 0}
|
||||
!3 = !{void (i64, %struct.knode*, i64, %struct.record*, i64*, i64*, i32*, %struct.record*)* @findK, !"kernel", i32 1}
|
||||
!4 = !{null, !"align", i32 8}
|
||||
!5 = !{null, !"align", i32 8, !"align", i32 65544, !"align", i32 131080}
|
||||
!6 = !{null, !"align", i32 16}
|
||||
!7 = !{null, !"align", i32 16, !"align", i32 65552, !"align", i32 131088}
|
||||
!8 = !{!"clang version 10.0.1 (https://github.com/llvm/llvm-project.git ef32c611aa214dea855364efd7ba451ec5ec3f74)"}
|
||||
!9 = !{i32 1, i32 4}
|
File diff suppressed because one or more lines are too long
|
@ -1,475 +0,0 @@
|
|||
; ModuleID = 'kernel_gpu_cuda_wrapper_2-cuda-nvptx64-nvidia-cuda-sm_61.bc'
|
||||
source_filename = "kernel/kernel_gpu_cuda_wrapper_2.cu"
|
||||
target datalayout = "e-i64:64-i128:128-v16:16-v32:32-n16:32:64"
|
||||
target triple = "nvptx64-nvidia-cuda"
|
||||
|
||||
%struct.__cuda_builtin_threadIdx_t = type { i8 }
|
||||
%struct.__cuda_builtin_blockIdx_t = type { i8 }
|
||||
%struct.cudaFuncAttributes = type { i64, i64, i64, i32, i32, i32, i32, i32, i32, i32 }
|
||||
%struct.knode = type { i32, [257 x i32], [257 x i32], i8, i32 }
|
||||
|
||||
$_ZN26__cuda_builtin_threadIdx_t17__fetch_builtin_xEv = comdat any
|
||||
|
||||
$_ZN25__cuda_builtin_blockIdx_t17__fetch_builtin_xEv = comdat any
|
||||
|
||||
@threadIdx = extern_weak dso_local addrspace(1) global %struct.__cuda_builtin_threadIdx_t, align 1
|
||||
@blockIdx = extern_weak dso_local addrspace(1) global %struct.__cuda_builtin_blockIdx_t, align 1
|
||||
|
||||
; Function Attrs: convergent noinline nounwind optnone
|
||||
; Weak device-side definition of cudaMalloc. It only spills its two arguments
; (%p, %s) to stack slots and returns the constant 999; nothing is written
; through %p.
define weak dso_local i32 @cudaMalloc(i8** %p, i64 %s) #0 {
|
||||
entry:
|
||||
%p.addr = alloca i8**, align 8
|
||||
%s.addr = alloca i64, align 8
|
||||
store i8** %p, i8*** %p.addr, align 8
|
||||
store i64 %s, i64* %s.addr, align 8
|
||||
ret i32 999
|
||||
}
|
||||
|
||||
; Function Attrs: convergent noinline nounwind optnone
|
||||
; Weak device-side definition of cudaFuncGetAttributes. The arguments are
; copied to stack slots and the constant 999 is returned; the attribute
; struct behind %p is never written.
define weak dso_local i32 @cudaFuncGetAttributes(%struct.cudaFuncAttributes* %p, i8* %c) #0 {
|
||||
entry:
|
||||
%p.addr = alloca %struct.cudaFuncAttributes*, align 8
|
||||
%c.addr = alloca i8*, align 8
|
||||
store %struct.cudaFuncAttributes* %p, %struct.cudaFuncAttributes** %p.addr, align 8
|
||||
store i8* %c, i8** %c.addr, align 8
|
||||
ret i32 999
|
||||
}
|
||||
|
||||
; Function Attrs: convergent noinline nounwind optnone
|
||||
; Weak device-side definition of cudaDeviceGetAttribute. All three arguments
; are spilled to stack slots and the constant 999 is returned; nothing is
; stored through %value.
define weak dso_local i32 @cudaDeviceGetAttribute(i32* %value, i32 %attr, i32 %device) #0 {
|
||||
entry:
|
||||
%value.addr = alloca i32*, align 8
|
||||
%attr.addr = alloca i32, align 4
|
||||
%device.addr = alloca i32, align 4
|
||||
store i32* %value, i32** %value.addr, align 8
|
||||
store i32 %attr, i32* %attr.addr, align 4
|
||||
store i32 %device, i32* %device.addr, align 4
|
||||
ret i32 999
|
||||
}
|
||||
|
||||
; Function Attrs: convergent noinline nounwind optnone
|
||||
; Weak device-side definition of cudaGetDevice. The pointer argument is
; spilled to a stack slot and the constant 999 is returned; nothing is stored
; through %device.
define weak dso_local i32 @cudaGetDevice(i32* %device) #0 {
|
||||
entry:
|
||||
%device.addr = alloca i32*, align 8
|
||||
store i32* %device, i32** %device.addr, align 8
|
||||
ret i32 999
|
||||
}
|
||||
|
||||
; Function Attrs: convergent noinline nounwind optnone
|
||||
; Weak device-side definition of cudaOccupancyMaxActiveBlocksPerMultiprocessor.
; The four arguments are spilled to stack slots and the constant 999 is
; returned; nothing is stored through %numBlocks.
define weak dso_local i32 @cudaOccupancyMaxActiveBlocksPerMultiprocessor(i32* %numBlocks, i8* %func, i32 %blockSize, i64 %dynamicSmemSize) #0 {
|
||||
entry:
|
||||
%numBlocks.addr = alloca i32*, align 8
|
||||
%func.addr = alloca i8*, align 8
|
||||
%blockSize.addr = alloca i32, align 4
|
||||
%dynamicSmemSize.addr = alloca i64, align 8
|
||||
store i32* %numBlocks, i32** %numBlocks.addr, align 8
|
||||
store i8* %func, i8** %func.addr, align 8
|
||||
store i32 %blockSize, i32* %blockSize.addr, align 4
|
||||
store i64 %dynamicSmemSize, i64* %dynamicSmemSize.addr, align 8
|
||||
ret i32 999
|
||||
}
|
||||
|
||||
; Function Attrs: convergent noinline nounwind optnone
|
||||
; Weak device-side definition of
; cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags. The five arguments
; are spilled to stack slots and the constant 999 is returned; nothing is
; stored through %numBlocks.
define weak dso_local i32 @cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags(i32* %numBlocks, i8* %func, i32 %blockSize, i64 %dynamicSmemSize, i32 %flags) #0 {
|
||||
entry:
|
||||
%numBlocks.addr = alloca i32*, align 8
|
||||
%func.addr = alloca i8*, align 8
|
||||
%blockSize.addr = alloca i32, align 4
|
||||
%dynamicSmemSize.addr = alloca i64, align 8
|
||||
%flags.addr = alloca i32, align 4
|
||||
store i32* %numBlocks, i32** %numBlocks.addr, align 8
|
||||
store i8* %func, i8** %func.addr, align 8
|
||||
store i32 %blockSize, i32* %blockSize.addr, align 4
|
||||
store i64 %dynamicSmemSize, i64* %dynamicSmemSize.addr, align 8
|
||||
store i32 %flags, i32* %flags.addr, align 4
|
||||
ret i32 999
|
||||
}
|
||||
|
||||
; Function Attrs: convergent noinline nounwind optnone
|
||||
; findRangeK: unoptimized (optnone) kernel body; the module metadata registers
; this function as a "kernel".
;
; Summary of what the instructions below do, using thid = threadIdx.x and
; bid = blockIdx.x:
;   * Loop %i from 0 while %i < %height. In each iteration:
;       - node = knodesD[currKnodeD[bid]]; if keys[thid] <= startD[bid] and
;         keys[thid + 1] > startD[bid], and indices[thid] < knodes_elem, then
;         offsetD[bid] = indices[thid].
;       - The same test against knodesD[lastKnodeD[bid]] and endD[bid] writes
;         offset_2D[bid].
;       - barrier; the thread with thid == 0 copies offsetD[bid] into
;         currKnodeD[bid] and offset_2D[bid] into lastKnodeD[bid]; barrier.
;   * After the loop: if keys[thid] of knodesD[currKnodeD[bid]] equals
;     startD[bid], RecstartD[bid] = indices[thid]; barrier; if keys[thid] of
;     knodesD[lastKnodeD[bid]] equals endD[bid],
;     ReclenD[bid] = indices[thid] - RecstartD[bid] + 1.
; "indices" and "keys" are the value names this IR gives to fields 1 and 2 of
; %struct.knode (both [257 x i32]). All comparisons are signed.
define dso_local void @findRangeK(i64 %height, %struct.knode* %knodesD, i64 %knodes_elem, i64* %currKnodeD, i64* %offsetD, i64* %lastKnodeD, i64* %offset_2D, i32* %startD, i32* %endD, i32* %RecstartD, i32* %ReclenD) #0 {
|
||||
entry:
|
||||
%height.addr = alloca i64, align 8
|
||||
%knodesD.addr = alloca %struct.knode*, align 8
|
||||
%knodes_elem.addr = alloca i64, align 8
|
||||
%currKnodeD.addr = alloca i64*, align 8
|
||||
%offsetD.addr = alloca i64*, align 8
|
||||
%lastKnodeD.addr = alloca i64*, align 8
|
||||
%offset_2D.addr = alloca i64*, align 8
|
||||
%startD.addr = alloca i32*, align 8
|
||||
%endD.addr = alloca i32*, align 8
|
||||
%RecstartD.addr = alloca i32*, align 8
|
||||
%ReclenD.addr = alloca i32*, align 8
|
||||
%thid = alloca i32, align 4
|
||||
%bid = alloca i32, align 4
|
||||
%i = alloca i32, align 4
|
||||
store i64 %height, i64* %height.addr, align 8
|
||||
store %struct.knode* %knodesD, %struct.knode** %knodesD.addr, align 8
|
||||
store i64 %knodes_elem, i64* %knodes_elem.addr, align 8
|
||||
store i64* %currKnodeD, i64** %currKnodeD.addr, align 8
|
||||
store i64* %offsetD, i64** %offsetD.addr, align 8
|
||||
store i64* %lastKnodeD, i64** %lastKnodeD.addr, align 8
|
||||
store i64* %offset_2D, i64** %offset_2D.addr, align 8
|
||||
store i32* %startD, i32** %startD.addr, align 8
|
||||
store i32* %endD, i32** %endD.addr, align 8
|
||||
store i32* %RecstartD, i32** %RecstartD.addr, align 8
|
||||
store i32* %ReclenD, i32** %ReclenD.addr, align 8
|
||||
; thid <- threadIdx.x, bid <- blockIdx.x, i <- 0
%call = call i32 @_ZN26__cuda_builtin_threadIdx_t17__fetch_builtin_xEv() #2
|
||||
store i32 %call, i32* %thid, align 4
|
||||
%call1 = call i32 @_ZN25__cuda_builtin_blockIdx_t17__fetch_builtin_xEv() #2
|
||||
store i32 %call1, i32* %bid, align 4
|
||||
store i32 0, i32* %i, align 4
|
||||
br label %for.cond
|
||||
|
||||
; Loop test: continue while (i64)i < height.
for.cond: ; preds = %for.inc, %entry
|
||||
%0 = load i32, i32* %i, align 4
|
||||
%conv = sext i32 %0 to i64
|
||||
%1 = load i64, i64* %height.addr, align 8
|
||||
%cmp = icmp slt i64 %conv, %1
|
||||
br i1 %cmp, label %for.body, label %for.end
|
||||
|
||||
; First half of the start test: knodesD[currKnodeD[bid]].keys[thid] <= startD[bid].
for.body: ; preds = %for.cond
|
||||
%2 = load %struct.knode*, %struct.knode** %knodesD.addr, align 8
|
||||
%3 = load i64*, i64** %currKnodeD.addr, align 8
|
||||
%4 = load i32, i32* %bid, align 4
|
||||
%idxprom = sext i32 %4 to i64
|
||||
%arrayidx = getelementptr inbounds i64, i64* %3, i64 %idxprom
|
||||
%5 = load i64, i64* %arrayidx, align 8
|
||||
%arrayidx2 = getelementptr inbounds %struct.knode, %struct.knode* %2, i64 %5
|
||||
%keys = getelementptr inbounds %struct.knode, %struct.knode* %arrayidx2, i32 0, i32 2
|
||||
%6 = load i32, i32* %thid, align 4
|
||||
%idxprom3 = sext i32 %6 to i64
|
||||
%arrayidx4 = getelementptr inbounds [257 x i32], [257 x i32]* %keys, i64 0, i64 %idxprom3
|
||||
%7 = load i32, i32* %arrayidx4, align 4
|
||||
%8 = load i32*, i32** %startD.addr, align 8
|
||||
%9 = load i32, i32* %bid, align 4
|
||||
%idxprom5 = sext i32 %9 to i64
|
||||
%arrayidx6 = getelementptr inbounds i32, i32* %8, i64 %idxprom5
|
||||
%10 = load i32, i32* %arrayidx6, align 4
|
||||
%cmp7 = icmp sle i32 %7, %10
|
||||
br i1 %cmp7, label %land.lhs.true, label %if.end34
|
||||
|
||||
; Second half of the start test: knodesD[currKnodeD[bid]].keys[thid + 1] > startD[bid].
land.lhs.true: ; preds = %for.body
|
||||
%11 = load %struct.knode*, %struct.knode** %knodesD.addr, align 8
|
||||
%12 = load i64*, i64** %currKnodeD.addr, align 8
|
||||
%13 = load i32, i32* %bid, align 4
|
||||
%idxprom8 = sext i32 %13 to i64
|
||||
%arrayidx9 = getelementptr inbounds i64, i64* %12, i64 %idxprom8
|
||||
%14 = load i64, i64* %arrayidx9, align 8
|
||||
%arrayidx10 = getelementptr inbounds %struct.knode, %struct.knode* %11, i64 %14
|
||||
%keys11 = getelementptr inbounds %struct.knode, %struct.knode* %arrayidx10, i32 0, i32 2
|
||||
%15 = load i32, i32* %thid, align 4
|
||||
%add = add nsw i32 %15, 1
|
||||
%idxprom12 = sext i32 %add to i64
|
||||
%arrayidx13 = getelementptr inbounds [257 x i32], [257 x i32]* %keys11, i64 0, i64 %idxprom12
|
||||
%16 = load i32, i32* %arrayidx13, align 4
|
||||
%17 = load i32*, i32** %startD.addr, align 8
|
||||
%18 = load i32, i32* %bid, align 4
|
||||
%idxprom14 = sext i32 %18 to i64
|
||||
%arrayidx15 = getelementptr inbounds i32, i32* %17, i64 %idxprom14
|
||||
%19 = load i32, i32* %arrayidx15, align 4
|
||||
%cmp16 = icmp sgt i32 %16, %19
|
||||
br i1 %cmp16, label %if.then, label %if.end34
|
||||
|
||||
; Bounds check before the write: indices[thid] of the current node < knodes_elem.
if.then: ; preds = %land.lhs.true
|
||||
%20 = load %struct.knode*, %struct.knode** %knodesD.addr, align 8
|
||||
%21 = load i64*, i64** %currKnodeD.addr, align 8
|
||||
%22 = load i32, i32* %bid, align 4
|
||||
%idxprom17 = sext i32 %22 to i64
|
||||
%arrayidx18 = getelementptr inbounds i64, i64* %21, i64 %idxprom17
|
||||
%23 = load i64, i64* %arrayidx18, align 8
|
||||
%arrayidx19 = getelementptr inbounds %struct.knode, %struct.knode* %20, i64 %23
|
||||
%indices = getelementptr inbounds %struct.knode, %struct.knode* %arrayidx19, i32 0, i32 1
|
||||
%24 = load i32, i32* %thid, align 4
|
||||
%idxprom20 = sext i32 %24 to i64
|
||||
%arrayidx21 = getelementptr inbounds [257 x i32], [257 x i32]* %indices, i64 0, i64 %idxprom20
|
||||
%25 = load i32, i32* %arrayidx21, align 4
|
||||
%conv22 = sext i32 %25 to i64
|
||||
%26 = load i64, i64* %knodes_elem.addr, align 8
|
||||
%cmp23 = icmp slt i64 %conv22, %26
|
||||
br i1 %cmp23, label %if.then24, label %if.end
|
||||
|
||||
; offsetD[bid] = (i64) knodesD[currKnodeD[bid]].indices[thid]
if.then24: ; preds = %if.then
|
||||
%27 = load %struct.knode*, %struct.knode** %knodesD.addr, align 8
|
||||
%28 = load i64*, i64** %currKnodeD.addr, align 8
|
||||
%29 = load i32, i32* %bid, align 4
|
||||
%idxprom25 = sext i32 %29 to i64
|
||||
%arrayidx26 = getelementptr inbounds i64, i64* %28, i64 %idxprom25
|
||||
%30 = load i64, i64* %arrayidx26, align 8
|
||||
%arrayidx27 = getelementptr inbounds %struct.knode, %struct.knode* %27, i64 %30
|
||||
%indices28 = getelementptr inbounds %struct.knode, %struct.knode* %arrayidx27, i32 0, i32 1
|
||||
%31 = load i32, i32* %thid, align 4
|
||||
%idxprom29 = sext i32 %31 to i64
|
||||
%arrayidx30 = getelementptr inbounds [257 x i32], [257 x i32]* %indices28, i64 0, i64 %idxprom29
|
||||
%32 = load i32, i32* %arrayidx30, align 4
|
||||
%conv31 = sext i32 %32 to i64
|
||||
%33 = load i64*, i64** %offsetD.addr, align 8
|
||||
%34 = load i32, i32* %bid, align 4
|
||||
%idxprom32 = sext i32 %34 to i64
|
||||
%arrayidx33 = getelementptr inbounds i64, i64* %33, i64 %idxprom32
|
||||
store i64 %conv31, i64* %arrayidx33, align 8
|
||||
br label %if.end
|
||||
|
||||
if.end: ; preds = %if.then24, %if.then
|
||||
br label %if.end34
|
||||
|
||||
; First half of the end test: knodesD[lastKnodeD[bid]].keys[thid] <= endD[bid].
if.end34: ; preds = %if.end, %land.lhs.true, %for.body
|
||||
%35 = load %struct.knode*, %struct.knode** %knodesD.addr, align 8
|
||||
%36 = load i64*, i64** %lastKnodeD.addr, align 8
|
||||
%37 = load i32, i32* %bid, align 4
|
||||
%idxprom35 = sext i32 %37 to i64
|
||||
%arrayidx36 = getelementptr inbounds i64, i64* %36, i64 %idxprom35
|
||||
%38 = load i64, i64* %arrayidx36, align 8
|
||||
%arrayidx37 = getelementptr inbounds %struct.knode, %struct.knode* %35, i64 %38
|
||||
%keys38 = getelementptr inbounds %struct.knode, %struct.knode* %arrayidx37, i32 0, i32 2
|
||||
%39 = load i32, i32* %thid, align 4
|
||||
%idxprom39 = sext i32 %39 to i64
|
||||
%arrayidx40 = getelementptr inbounds [257 x i32], [257 x i32]* %keys38, i64 0, i64 %idxprom39
|
||||
%40 = load i32, i32* %arrayidx40, align 4
|
||||
%41 = load i32*, i32** %endD.addr, align 8
|
||||
%42 = load i32, i32* %bid, align 4
|
||||
%idxprom41 = sext i32 %42 to i64
|
||||
%arrayidx42 = getelementptr inbounds i32, i32* %41, i64 %idxprom41
|
||||
%43 = load i32, i32* %arrayidx42, align 4
|
||||
%cmp43 = icmp sle i32 %40, %43
|
||||
br i1 %cmp43, label %land.lhs.true44, label %if.end75
|
||||
|
||||
; Second half of the end test: knodesD[lastKnodeD[bid]].keys[thid + 1] > endD[bid].
land.lhs.true44: ; preds = %if.end34
|
||||
%44 = load %struct.knode*, %struct.knode** %knodesD.addr, align 8
|
||||
%45 = load i64*, i64** %lastKnodeD.addr, align 8
|
||||
%46 = load i32, i32* %bid, align 4
|
||||
%idxprom45 = sext i32 %46 to i64
|
||||
%arrayidx46 = getelementptr inbounds i64, i64* %45, i64 %idxprom45
|
||||
%47 = load i64, i64* %arrayidx46, align 8
|
||||
%arrayidx47 = getelementptr inbounds %struct.knode, %struct.knode* %44, i64 %47
|
||||
%keys48 = getelementptr inbounds %struct.knode, %struct.knode* %arrayidx47, i32 0, i32 2
|
||||
%48 = load i32, i32* %thid, align 4
|
||||
%add49 = add nsw i32 %48, 1
|
||||
%idxprom50 = sext i32 %add49 to i64
|
||||
%arrayidx51 = getelementptr inbounds [257 x i32], [257 x i32]* %keys48, i64 0, i64 %idxprom50
|
||||
%49 = load i32, i32* %arrayidx51, align 4
|
||||
%50 = load i32*, i32** %endD.addr, align 8
|
||||
%51 = load i32, i32* %bid, align 4
|
||||
%idxprom52 = sext i32 %51 to i64
|
||||
%arrayidx53 = getelementptr inbounds i32, i32* %50, i64 %idxprom52
|
||||
%52 = load i32, i32* %arrayidx53, align 4
|
||||
%cmp54 = icmp sgt i32 %49, %52
|
||||
br i1 %cmp54, label %if.then55, label %if.end75
|
||||
|
||||
; Bounds check before the write: indices[thid] of the last node < knodes_elem.
if.then55: ; preds = %land.lhs.true44
|
||||
%53 = load %struct.knode*, %struct.knode** %knodesD.addr, align 8
|
||||
%54 = load i64*, i64** %lastKnodeD.addr, align 8
|
||||
%55 = load i32, i32* %bid, align 4
|
||||
%idxprom56 = sext i32 %55 to i64
|
||||
%arrayidx57 = getelementptr inbounds i64, i64* %54, i64 %idxprom56
|
||||
%56 = load i64, i64* %arrayidx57, align 8
|
||||
%arrayidx58 = getelementptr inbounds %struct.knode, %struct.knode* %53, i64 %56
|
||||
%indices59 = getelementptr inbounds %struct.knode, %struct.knode* %arrayidx58, i32 0, i32 1
|
||||
%57 = load i32, i32* %thid, align 4
|
||||
%idxprom60 = sext i32 %57 to i64
|
||||
%arrayidx61 = getelementptr inbounds [257 x i32], [257 x i32]* %indices59, i64 0, i64 %idxprom60
|
||||
%58 = load i32, i32* %arrayidx61, align 4
|
||||
%conv62 = sext i32 %58 to i64
|
||||
%59 = load i64, i64* %knodes_elem.addr, align 8
|
||||
%cmp63 = icmp slt i64 %conv62, %59
|
||||
br i1 %cmp63, label %if.then64, label %if.end74
|
||||
|
||||
; offset_2D[bid] = (i64) knodesD[lastKnodeD[bid]].indices[thid]
if.then64: ; preds = %if.then55
|
||||
%60 = load %struct.knode*, %struct.knode** %knodesD.addr, align 8
|
||||
%61 = load i64*, i64** %lastKnodeD.addr, align 8
|
||||
%62 = load i32, i32* %bid, align 4
|
||||
%idxprom65 = sext i32 %62 to i64
|
||||
%arrayidx66 = getelementptr inbounds i64, i64* %61, i64 %idxprom65
|
||||
%63 = load i64, i64* %arrayidx66, align 8
|
||||
%arrayidx67 = getelementptr inbounds %struct.knode, %struct.knode* %60, i64 %63
|
||||
%indices68 = getelementptr inbounds %struct.knode, %struct.knode* %arrayidx67, i32 0, i32 1
|
||||
%64 = load i32, i32* %thid, align 4
|
||||
%idxprom69 = sext i32 %64 to i64
|
||||
%arrayidx70 = getelementptr inbounds [257 x i32], [257 x i32]* %indices68, i64 0, i64 %idxprom69
|
||||
%65 = load i32, i32* %arrayidx70, align 4
|
||||
%conv71 = sext i32 %65 to i64
|
||||
%66 = load i64*, i64** %offset_2D.addr, align 8
|
||||
%67 = load i32, i32* %bid, align 4
|
||||
%idxprom72 = sext i32 %67 to i64
|
||||
%arrayidx73 = getelementptr inbounds i64, i64* %66, i64 %idxprom72
|
||||
store i64 %conv71, i64* %arrayidx73, align 8
|
||||
br label %if.end74
|
||||
|
||||
if.end74: ; preds = %if.then64, %if.then55
|
||||
br label %if.end75
|
||||
|
||||
; Barrier, then only the thread with thid == 0 takes the update branch.
if.end75: ; preds = %if.end74, %land.lhs.true44, %if.end34
|
||||
call void @llvm.nvvm.barrier0()
|
||||
%68 = load i32, i32* %thid, align 4
|
||||
%cmp76 = icmp eq i32 %68, 0
|
||||
br i1 %cmp76, label %if.then77, label %if.end86
|
||||
|
||||
; currKnodeD[bid] = offsetD[bid]; lastKnodeD[bid] = offset_2D[bid]
if.then77: ; preds = %if.end75
|
||||
%69 = load i64*, i64** %offsetD.addr, align 8
|
||||
%70 = load i32, i32* %bid, align 4
|
||||
%idxprom78 = sext i32 %70 to i64
|
||||
%arrayidx79 = getelementptr inbounds i64, i64* %69, i64 %idxprom78
|
||||
%71 = load i64, i64* %arrayidx79, align 8
|
||||
%72 = load i64*, i64** %currKnodeD.addr, align 8
|
||||
%73 = load i32, i32* %bid, align 4
|
||||
%idxprom80 = sext i32 %73 to i64
|
||||
%arrayidx81 = getelementptr inbounds i64, i64* %72, i64 %idxprom80
|
||||
store i64 %71, i64* %arrayidx81, align 8
|
||||
%74 = load i64*, i64** %offset_2D.addr, align 8
|
||||
%75 = load i32, i32* %bid, align 4
|
||||
%idxprom82 = sext i32 %75 to i64
|
||||
%arrayidx83 = getelementptr inbounds i64, i64* %74, i64 %idxprom82
|
||||
%76 = load i64, i64* %arrayidx83, align 8
|
||||
%77 = load i64*, i64** %lastKnodeD.addr, align 8
|
||||
%78 = load i32, i32* %bid, align 4
|
||||
%idxprom84 = sext i32 %78 to i64
|
||||
%arrayidx85 = getelementptr inbounds i64, i64* %77, i64 %idxprom84
|
||||
store i64 %76, i64* %arrayidx85, align 8
|
||||
br label %if.end86
|
||||
|
||||
; Second barrier of the iteration, placed after the thid == 0 update.
if.end86: ; preds = %if.then77, %if.end75
|
||||
call void @llvm.nvvm.barrier0()
|
||||
br label %for.inc
|
||||
|
||||
for.inc: ; preds = %if.end86
|
||||
%79 = load i32, i32* %i, align 4
|
||||
%inc = add nsw i32 %79, 1
|
||||
store i32 %inc, i32* %i, align 4
|
||||
br label %for.cond
|
||||
|
||||
; After the loop: does knodesD[currKnodeD[bid]].keys[thid] equal startD[bid]?
for.end: ; preds = %for.cond
|
||||
%80 = load %struct.knode*, %struct.knode** %knodesD.addr, align 8
|
||||
%81 = load i64*, i64** %currKnodeD.addr, align 8
|
||||
%82 = load i32, i32* %bid, align 4
|
||||
%idxprom87 = sext i32 %82 to i64
|
||||
%arrayidx88 = getelementptr inbounds i64, i64* %81, i64 %idxprom87
|
||||
%83 = load i64, i64* %arrayidx88, align 8
|
||||
%arrayidx89 = getelementptr inbounds %struct.knode, %struct.knode* %80, i64 %83
|
||||
%keys90 = getelementptr inbounds %struct.knode, %struct.knode* %arrayidx89, i32 0, i32 2
|
||||
%84 = load i32, i32* %thid, align 4
|
||||
%idxprom91 = sext i32 %84 to i64
|
||||
%arrayidx92 = getelementptr inbounds [257 x i32], [257 x i32]* %keys90, i64 0, i64 %idxprom91
|
||||
%85 = load i32, i32* %arrayidx92, align 4
|
||||
%86 = load i32*, i32** %startD.addr, align 8
|
||||
%87 = load i32, i32* %bid, align 4
|
||||
%idxprom93 = sext i32 %87 to i64
|
||||
%arrayidx94 = getelementptr inbounds i32, i32* %86, i64 %idxprom93
|
||||
%88 = load i32, i32* %arrayidx94, align 4
|
||||
%cmp95 = icmp eq i32 %85, %88
|
||||
br i1 %cmp95, label %if.then96, label %if.end105
|
||||
|
||||
; RecstartD[bid] = knodesD[currKnodeD[bid]].indices[thid]
if.then96: ; preds = %for.end
|
||||
%89 = load %struct.knode*, %struct.knode** %knodesD.addr, align 8
|
||||
%90 = load i64*, i64** %currKnodeD.addr, align 8
|
||||
%91 = load i32, i32* %bid, align 4
|
||||
%idxprom97 = sext i32 %91 to i64
|
||||
%arrayidx98 = getelementptr inbounds i64, i64* %90, i64 %idxprom97
|
||||
%92 = load i64, i64* %arrayidx98, align 8
|
||||
%arrayidx99 = getelementptr inbounds %struct.knode, %struct.knode* %89, i64 %92
|
||||
%indices100 = getelementptr inbounds %struct.knode, %struct.knode* %arrayidx99, i32 0, i32 1
|
||||
%93 = load i32, i32* %thid, align 4
|
||||
%idxprom101 = sext i32 %93 to i64
|
||||
%arrayidx102 = getelementptr inbounds [257 x i32], [257 x i32]* %indices100, i64 0, i64 %idxprom101
|
||||
%94 = load i32, i32* %arrayidx102, align 4
|
||||
%95 = load i32*, i32** %RecstartD.addr, align 8
|
||||
%96 = load i32, i32* %bid, align 4
|
||||
%idxprom103 = sext i32 %96 to i64
|
||||
%arrayidx104 = getelementptr inbounds i32, i32* %95, i64 %idxprom103
|
||||
store i32 %94, i32* %arrayidx104, align 4
|
||||
br label %if.end105
|
||||
|
||||
; Barrier, then: does knodesD[lastKnodeD[bid]].keys[thid] equal endD[bid]?
if.end105: ; preds = %if.then96, %for.end
|
||||
call void @llvm.nvvm.barrier0()
|
||||
%97 = load %struct.knode*, %struct.knode** %knodesD.addr, align 8
|
||||
%98 = load i64*, i64** %lastKnodeD.addr, align 8
|
||||
%99 = load i32, i32* %bid, align 4
|
||||
%idxprom106 = sext i32 %99 to i64
|
||||
%arrayidx107 = getelementptr inbounds i64, i64* %98, i64 %idxprom106
|
||||
%100 = load i64, i64* %arrayidx107, align 8
|
||||
%arrayidx108 = getelementptr inbounds %struct.knode, %struct.knode* %97, i64 %100
|
||||
%keys109 = getelementptr inbounds %struct.knode, %struct.knode* %arrayidx108, i32 0, i32 2
|
||||
%101 = load i32, i32* %thid, align 4
|
||||
%idxprom110 = sext i32 %101 to i64
|
||||
%arrayidx111 = getelementptr inbounds [257 x i32], [257 x i32]* %keys109, i64 0, i64 %idxprom110
|
||||
%102 = load i32, i32* %arrayidx111, align 4
|
||||
%103 = load i32*, i32** %endD.addr, align 8
|
||||
%104 = load i32, i32* %bid, align 4
|
||||
%idxprom112 = sext i32 %104 to i64
|
||||
%arrayidx113 = getelementptr inbounds i32, i32* %103, i64 %idxprom112
|
||||
%105 = load i32, i32* %arrayidx113, align 4
|
||||
%cmp114 = icmp eq i32 %102, %105
|
||||
br i1 %cmp114, label %if.then115, label %if.end127
|
||||
|
||||
; ReclenD[bid] = knodesD[lastKnodeD[bid]].indices[thid] - RecstartD[bid] + 1
if.then115: ; preds = %if.end105
|
||||
%106 = load %struct.knode*, %struct.knode** %knodesD.addr, align 8
|
||||
%107 = load i64*, i64** %lastKnodeD.addr, align 8
|
||||
%108 = load i32, i32* %bid, align 4
|
||||
%idxprom116 = sext i32 %108 to i64
|
||||
%arrayidx117 = getelementptr inbounds i64, i64* %107, i64 %idxprom116
|
||||
%109 = load i64, i64* %arrayidx117, align 8
|
||||
%arrayidx118 = getelementptr inbounds %struct.knode, %struct.knode* %106, i64 %109
|
||||
%indices119 = getelementptr inbounds %struct.knode, %struct.knode* %arrayidx118, i32 0, i32 1
|
||||
%110 = load i32, i32* %thid, align 4
|
||||
%idxprom120 = sext i32 %110 to i64
|
||||
%arrayidx121 = getelementptr inbounds [257 x i32], [257 x i32]* %indices119, i64 0, i64 %idxprom120
|
||||
%111 = load i32, i32* %arrayidx121, align 4
|
||||
%112 = load i32*, i32** %RecstartD.addr, align 8
|
||||
%113 = load i32, i32* %bid, align 4
|
||||
%idxprom122 = sext i32 %113 to i64
|
||||
%arrayidx123 = getelementptr inbounds i32, i32* %112, i64 %idxprom122
|
||||
%114 = load i32, i32* %arrayidx123, align 4
|
||||
%sub = sub nsw i32 %111, %114
|
||||
%add124 = add nsw i32 %sub, 1
|
||||
%115 = load i32*, i32** %ReclenD.addr, align 8
|
||||
%116 = load i32, i32* %bid, align 4
|
||||
%idxprom125 = sext i32 %116 to i64
|
||||
%arrayidx126 = getelementptr inbounds i32, i32* %115, i64 %idxprom125
|
||||
store i32 %add124, i32* %arrayidx126, align 4
|
||||
br label %if.end127
|
||||
|
||||
if.end127: ; preds = %if.then115, %if.end105
|
||||
ret void
|
||||
}
|
||||
|
||||
; Function Attrs: alwaysinline convergent nounwind
|
||||
; Accessor used by findRangeK for its %thid value: returns the result of the
; llvm.nvvm.read.ptx.sreg.tid.x intrinsic unchanged.
define linkonce_odr dso_local i32 @_ZN26__cuda_builtin_threadIdx_t17__fetch_builtin_xEv() #1 comdat align 2 {
|
||||
entry:
|
||||
%0 = call i32 @llvm.nvvm.read.ptx.sreg.tid.x()
|
||||
ret i32 %0
|
||||
}
|
||||
|
||||
; Function Attrs: alwaysinline convergent nounwind
|
||||
; Accessor used by findRangeK for its %bid value: returns the result of the
; llvm.nvvm.read.ptx.sreg.ctaid.x intrinsic unchanged.
define linkonce_odr dso_local i32 @_ZN25__cuda_builtin_blockIdx_t17__fetch_builtin_xEv() #1 comdat align 2 {
|
||||
entry:
|
||||
%0 = call i32 @llvm.nvvm.read.ptx.sreg.ctaid.x()
|
||||
ret i32 %0
|
||||
}
|
||||
|
||||
; Function Attrs: convergent nounwind
|
||||
declare void @llvm.nvvm.barrier0() #2
|
||||
|
||||
; Function Attrs: nounwind readnone
|
||||
declare i32 @llvm.nvvm.read.ptx.sreg.tid.x() #3
|
||||
|
||||
; Function Attrs: nounwind readnone
|
||||
declare i32 @llvm.nvvm.read.ptx.sreg.ctaid.x() #3
|
||||
|
||||
attributes #0 = { convergent noinline nounwind optnone "correctly-rounded-divide-sqrt-fp-math"="false" "disable-tail-calls"="false" "frame-pointer"="all" "less-precise-fpmad"="false" "min-legal-vector-width"="0" "no-infs-fp-math"="false" "no-jump-tables"="false" "no-nans-fp-math"="false" "no-signed-zeros-fp-math"="false" "no-trapping-math"="false" "stack-protector-buffer-size"="8" "target-cpu"="sm_61" "target-features"="+ptx64,+sm_61" "unsafe-fp-math"="false" "use-soft-float"="false" }
|
||||
attributes #1 = { alwaysinline convergent nounwind "correctly-rounded-divide-sqrt-fp-math"="false" "disable-tail-calls"="false" "frame-pointer"="all" "less-precise-fpmad"="false" "min-legal-vector-width"="0" "no-infs-fp-math"="false" "no-jump-tables"="false" "no-nans-fp-math"="false" "no-signed-zeros-fp-math"="false" "no-trapping-math"="false" "stack-protector-buffer-size"="8" "target-cpu"="sm_61" "target-features"="+ptx64,+sm_61" "unsafe-fp-math"="false" "use-soft-float"="false" }
|
||||
attributes #2 = { convergent nounwind }
|
||||
attributes #3 = { nounwind readnone }
|
||||
|
||||
!llvm.module.flags = !{!0, !1, !2}
|
||||
!nvvm.annotations = !{!3, !4, !5, !4, !6, !6, !6, !6, !7, !7, !6}
|
||||
!llvm.ident = !{!8}
|
||||
!nvvmir.version = !{!9}
|
||||
|
||||
!0 = !{i32 2, !"SDK Version", [2 x i32] [i32 10, i32 1]}
|
||||
!1 = !{i32 1, !"wchar_size", i32 4}
|
||||
!2 = !{i32 4, !"nvvm-reflect-ftz", i32 0}
|
||||
!3 = !{void (i64, %struct.knode*, i64, i64*, i64*, i64*, i64*, i32*, i32*, i32*, i32*)* @findRangeK, !"kernel", i32 1}
|
||||
!4 = !{null, !"align", i32 8}
|
||||
!5 = !{null, !"align", i32 8, !"align", i32 65544, !"align", i32 131080}
|
||||
!6 = !{null, !"align", i32 16}
|
||||
!7 = !{null, !"align", i32 16, !"align", i32 65552, !"align", i32 131088}
|
||||
!8 = !{!"clang version 10.0.1 (https://github.com/llvm/llvm-project.git ef32c611aa214dea855364efd7ba451ec5ec3f74)"}
|
||||
!9 = !{i32 1, i32 4}
|
File diff suppressed because one or more lines are too long
File diff suppressed because it is too large
Load Diff
|
@ -1,40 +0,0 @@
|
|||
#!/bin/bash
# Build the b+tree example through the kernel/host translators, link it against
# the x86 runtime, run it on the Rodinia data set and check output.txt.
# Any failing command aborts the script (set -e).
set -e

translator_dir=../../build/compilation
runtime_dir=../../build/runtime
data_dir=../../rodinia-data/b+tree

# Plain C sources -> LLVM bitcode.
for c_source in util/timer/timer.c util/num/num.c main.c; do
  clang -c -emit-llvm "$c_source"
done

# The device/host .ll files assembled below are not generated by this script.
# The commands that were previously used for the CUDA sources are kept here
# for reference only:
#   clang -c -emit-llvm util/cuda/cuda.cu --cuda-gpu-arch=sm_61
#   clang -c -emit-llvm kernel/kernel_gpu_cuda_wrapper.cu --cuda-gpu-arch=sm_61
#   clang++ kernel/kernel_gpu_cuda_wrapper.cu --cuda-path=/usr/local/cuda-10.1 --cuda-gpu-arch=sm_61 -L/usr/local/cuda-10.1/lib64 -lcudart_static -ldl -lrt -pthread -save-temps -v
#   clang++ kernel/kernel_gpu_cuda_wrapper_2.cu --cuda-path=/usr/local/cuda-10.1 --cuda-gpu-arch=sm_61 -L/usr/local/cuda-10.1/lib64 -lcudart_static -ldl -lrt -pthread -save-temps -v

# Textual IR -> bitcode, device modules first, then host modules.
for ir_stem in \
  kernel_gpu_cuda_wrapper-cuda-nvptx64-nvidia-cuda-sm_61 \
  kernel_gpu_cuda_wrapper_2-cuda-nvptx64-nvidia-cuda-sm_61 \
  kernel_gpu_cuda_wrapper-host-x86_64-unknown-linux-gnu \
  kernel_gpu_cuda_wrapper_2-host-x86_64-unknown-linux-gnu; do
  llvm-as "${ir_stem}.ll"
done

# Run the project's translators over the device and host bitcode.
"$translator_dir/kernelTranslator" kernel_gpu_cuda_wrapper-cuda-nvptx64-nvidia-cuda-sm_61.bc kernel1.bc
"$translator_dir/kernelTranslator" kernel_gpu_cuda_wrapper_2-cuda-nvptx64-nvidia-cuda-sm_61.bc kernel2.bc
"$translator_dir/hostTranslator" kernel_gpu_cuda_wrapper-host-x86_64-unknown-linux-gnu.bc host1.bc
"$translator_dir/hostTranslator" kernel_gpu_cuda_wrapper_2-host-x86_64-unknown-linux-gnu.bc host2.bc

# Bitcode -> position-independent object files.
for module in main cuda num timer kernel1 kernel2 host1 host2; do
  llc --relocation-model=pic --filetype=obj "${module}.bc"
done

export LD_LIBRARY_PATH="$runtime_dir:$runtime_dir/threadPool:$LD_LIBRARY_PATH"

g++ -Wall -L"$runtime_dir" -L"$runtime_dir/threadPool" -o b+tree.out \
  -fPIC -no-pie main.o host1.o host2.o kernel1.o kernel2.o cuda.o num.o timer.o \
  -lc -lx86Runtime -lthreadPool -lpthread

./b+tree.out file "$data_dir/mil.txt" \
  command "$data_dir/command.txt"

# The run passes when output.txt contains the expected result line.
if ! grep -q "0 840187 6001" output.txt; then
  echo "Error result"
  exit 1
fi
echo "Pass"
|
|
@ -1,75 +0,0 @@
|
|||
#ifdef __cplusplus
|
||||
extern "C" {
|
||||
#endif
|
||||
|
||||
//===============================================================================================================================================================================================================200
|
||||
// SET_DEVICE CODE
|
||||
//===============================================================================================================================================================================================================200
|
||||
|
||||
//======================================================================================================================================================150
|
||||
// INCLUDE/DEFINE
|
||||
//======================================================================================================================================================150
|
||||
|
||||
#include "cuda.h" // (in library path specified to compiler)
|
||||
|
||||
//======================================================================================================================================================150
|
||||
// FUNCTIONS
|
||||
//======================================================================================================================================================150
|
||||
|
||||
//====================================================================================================100
|
||||
// SET DEVICE
|
||||
//====================================================================================================100
|
||||
|
||||
void setdevice(void){
|
||||
|
||||
// variables
|
||||
int num_devices;
|
||||
int device;
|
||||
|
||||
// work
|
||||
cudaGetDeviceCount(&num_devices);
|
||||
if (num_devices > 1) {
|
||||
|
||||
// variables
|
||||
int max_multiprocessors;
|
||||
int max_device;
|
||||
cudaDeviceProp properties;
|
||||
|
||||
// initialize variables
|
||||
max_multiprocessors = 0;
|
||||
max_device = 0;
|
||||
|
||||
for (device = 0; device < num_devices; device++) {
|
||||
cudaGetDeviceProperties(&properties, device);
|
||||
if (max_multiprocessors < properties.multiProcessorCount) {
|
||||
max_multiprocessors = properties.multiProcessorCount;
|
||||
max_device = device;
|
||||
}
|
||||
}
|
||||
cudaSetDevice(max_device);
|
||||
}
|
||||
|
||||
}
|
||||
|
||||
//====================================================================================================100
|
||||
// GET LAST ERROR
|
||||
//====================================================================================================100
|
||||
|
||||
void checkCUDAError(const char *msg)
|
||||
{
|
||||
cudaError_t err = cudaGetLastError();
|
||||
if( cudaSuccess != err) {
|
||||
// fprintf(stderr, "Cuda error: %s: %s.\n", msg, cudaGetErrorString( err) );
|
||||
printf("Cuda error: %s: %s.\n", msg, cudaGetErrorString( err) );
|
||||
fflush(NULL);
|
||||
exit(EXIT_FAILURE);
|
||||
}
|
||||
}
|
||||
|
||||
//===============================================================================================================================================================================================================200
|
||||
// END
|
||||
//===============================================================================================================================================================================================================200
|
||||
|
||||
#ifdef __cplusplus
|
||||
}
|
||||
#endif
|
|
@ -1,37 +0,0 @@
|
|||
#ifdef __cplusplus
|
||||
extern "C" {
|
||||
#endif
|
||||
|
||||
//===============================================================================================================================================================================================================200
|
||||
// SET_DEVICE HEADER
|
||||
//===============================================================================================================================================================================================================200
|
||||
|
||||
//======================================================================================================================================================150
|
||||
// INCLUDE/DEFINE
|
||||
//======================================================================================================================================================150
|
||||
|
||||
#include <stdio.h> // (in library path known to compiler) needed by printf
|
||||
|
||||
//======================================================================================================================================================150
|
||||
// FUNCTION PROTOTYPES
|
||||
//======================================================================================================================================================150
|
||||
|
||||
//====================================================================================================100
|
||||
// SET DEVICE
|
||||
//====================================================================================================100
|
||||
|
||||
// When more than one CUDA device is present, selects the device that reports
// the largest multiprocessor count. Takes no arguments and returns nothing.
void setdevice(void);
|
||||
|
||||
//====================================================================================================100
|
||||
// GET LAST ERROR
|
||||
//====================================================================================================100
|
||||
|
||||
// Checks cudaGetLastError(); on any error prints "Cuda error: <msg>: <error string>."
// and terminates the program with EXIT_FAILURE. msg labels the failing step.
void checkCUDAError(const char *msg);
|
||||
|
||||
//===============================================================================================================================================================================================================200
|
||||
// END SET_DEVICE HEADER
|
||||
//===============================================================================================================================================================================================================200
|
||||
|
||||
#ifdef __cplusplus
|
||||
}
|
||||
#endif
|
|
@ -1,55 +0,0 @@
|
|||
#ifdef __cplusplus
|
||||
extern "C" {
|
||||
#endif
|
||||
|
||||
//===============================================================================================================================================================================================================200
|
||||
// DESCRIPTION
|
||||
//===============================================================================================================================================================================================================200
|
||||
|
||||
// Returns: 0 if string does not represent integer
|
||||
// 1 if string represents integer
|
||||
|
||||
//===============================================================================================================================================================================================================200
|
||||
// NUM CODE
|
||||
//===============================================================================================================================================================================================================200
|
||||
|
||||
//======================================================================================================================================================150
|
||||
// ISINTEGER FUNCTION
|
||||
//======================================================================================================================================================150
|
||||
|
||||
// Reports whether a string is a non-empty run of decimal digits.
//
// str: NUL-terminated string to examine; a null pointer is tolerated.
//
// Returns 1 when str is non-empty and every character lies in '0'..'9'.
// Returns 0 for a null pointer, for the empty string, or as soon as any other
// character (sign, decimal point, whitespace, ...) is encountered.
int isInteger(char *str) {
  // a null pointer or an empty string cannot represent an integer
  if (!str || *str == '\0') {
    return 0;
  }

  // reject at the first character that is not a decimal digit
  // (a '.' would have to be accepted here if floats were to be recognised)
  for (const char *p = str; *p != '\0'; p++) {
    if (*p < '0' || *p > '9') {
      return 0;
    }
  }

  // every character was a digit
  return 1;
}
|
||||
|
||||
//===============================================================================================================================================================================================================200
|
||||
// END NUM CODE
|
||||
//===============================================================================================================================================================================================================200
|
||||
|
||||
#ifdef __cplusplus
|
||||
}
|
||||
#endif
|
|
@ -1,21 +0,0 @@
|
|||
#ifdef __cplusplus
|
||||
extern "C" {
|
||||
#endif
|
||||
|
||||
//===============================================================================================================================================================================================================200
|
||||
// FILE HEADER
|
||||
//===============================================================================================================================================================================================================200
|
||||
|
||||
//======================================================================================================================================================150
|
||||
// ISINTEGER FUNCTION PROTOTYPE
|
||||
//======================================================================================================================================================150
|
||||
|
||||
int isInteger(char *str);
|
||||
|
||||
//===============================================================================================================================================================================================================200
|
||||
// END FILE HEADER
|
||||
//===============================================================================================================================================================================================================200
|
||||
|
||||
#ifdef __cplusplus
|
||||
}
|
||||
#endif
|
|
@ -1,36 +0,0 @@
|
|||
#ifdef __cplusplus
|
||||
extern "C" {
|
||||
#endif
|
||||
|
||||
//===============================================================================================================================================================================================================200
|
||||
// TIMER CODE
|
||||
//===============================================================================================================================================================================================================200
|
||||
|
||||
//======================================================================================================================================================150
|
||||
// INCLUDE/DEFINE
|
||||
//======================================================================================================================================================150
|
||||
|
||||
#include <stdlib.h>
|
||||
|
||||
//======================================================================================================================================================150
|
||||
// FUNCTIONS
|
||||
//======================================================================================================================================================150
|
||||
|
||||
//====================================================================================================100
|
||||
// DISPLAY TIME
|
||||
//====================================================================================================100
|
||||
|
||||
// Returns the time reported by gettimeofday(), expressed in microseconds.
//
// The seconds field is widened to long long before it is scaled, so the
// multiplication cannot overflow on platforms where time_t is only 32 bits.
//
// NOTE(review): gettimeofday() and struct timeval are normally declared by
// <sys/time.h>, which is not among the includes visible in this file --
// confirm it is pulled in before this definition.
long long get_time() {
  struct timeval tv;
  gettimeofday(&tv, NULL);
  return ((long long)tv.tv_sec * 1000000LL) + (long long)tv.tv_usec;
}
|
||||
|
||||
//===============================================================================================================================================================================================================200
|
||||
// END TIMER CODE
|
||||
//===============================================================================================================================================================================================================200
|
||||
|
||||
#ifdef __cplusplus
|
||||
}
|
||||
#endif
|
|
@ -1,21 +0,0 @@
|
|||
#ifdef __cplusplus
|
||||
extern "C" {
|
||||
#endif
|
||||
|
||||
//===============================================================================================================================================================================================================200
|
||||
// TIMER HEADER
|
||||
//===============================================================================================================================================================================================================200
|
||||
|
||||
//======================================================================================================================================================150
|
||||
// FUNCTION PROTOTYPES
|
||||
//======================================================================================================================================================150
|
||||
|
||||
long long get_time();
|
||||
|
||||
//===============================================================================================================================================================================================================200
|
||||
// END TIMER HEADER
|
||||
//===============================================================================================================================================================================================================200
|
||||
|
||||
#ifdef __cplusplus
|
||||
}
|
||||
#endif
|
|
@ -1,662 +0,0 @@
|
|||
#include <fstream>
|
||||
#include <helper_cuda.h>
|
||||
#include <helper_timer.h>
|
||||
#include <iostream>
|
||||
|
||||
/*
|
||||
* Options
|
||||
*
|
||||
*/
|
||||
#define GAMMA 1.4f
|
||||
#define iterations 2
|
||||
// #ifndef block_length
|
||||
// #define block_length 192
|
||||
// #endif
|
||||
|
||||
#define NDIM 3
|
||||
#define NNB 4
|
||||
|
||||
#define RK 3 // 3rd order RK
|
||||
#define ff_mach 1.2f
|
||||
#define deg_angle_of_attack 0.0f
|
||||
|
||||
/*
|
||||
* not options
|
||||
*/
|
||||
|
||||
#ifdef RD_WG_SIZE_0_0
|
||||
#define BLOCK_SIZE_0 RD_WG_SIZE_0_0
|
||||
#elif defined(RD_WG_SIZE_0)
|
||||
#define BLOCK_SIZE_0 RD_WG_SIZE_0
|
||||
#elif defined(RD_WG_SIZE)
|
||||
#define BLOCK_SIZE_0 RD_WG_SIZE
|
||||
#else
|
||||
#define BLOCK_SIZE_0 192
|
||||
#endif
|
||||
|
||||
#ifdef RD_WG_SIZE_1_0
|
||||
#define BLOCK_SIZE_1 RD_WG_SIZE_1_0
|
||||
#elif defined(RD_WG_SIZE_1)
|
||||
#define BLOCK_SIZE_1 RD_WG_SIZE_1
|
||||
#elif defined(RD_WG_SIZE)
|
||||
#define BLOCK_SIZE_1 RD_WG_SIZE
|
||||
#else
|
||||
#define BLOCK_SIZE_1 192
|
||||
#endif
|
||||
|
||||
// Thread-block size used when launching cuda_compute_step_factor.
// Selection order: RD_WG_SIZE_2_0, then RD_WG_SIZE_2, then the global
// RD_WG_SIZE, otherwise the default of 192.
// (The second test previously checked RD_WG_SIZE_1, which made BLOCK_SIZE_2
// expand to an undefined RD_WG_SIZE_2 whenever only RD_WG_SIZE_1 was set.)
#ifdef RD_WG_SIZE_2_0
#define BLOCK_SIZE_2 RD_WG_SIZE_2_0
#elif defined(RD_WG_SIZE_2)
#define BLOCK_SIZE_2 RD_WG_SIZE_2
#elif defined(RD_WG_SIZE)
#define BLOCK_SIZE_2 RD_WG_SIZE
#else
#define BLOCK_SIZE_2 192
#endif
|
||||
|
||||
#ifdef RD_WG_SIZE_3_0
|
||||
#define BLOCK_SIZE_3 RD_WG_SIZE_3_0
|
||||
#elif defined(RD_WG_SIZE_3)
|
||||
#define BLOCK_SIZE_3 RD_WG_SIZE_3
|
||||
#elif defined(RD_WG_SIZE)
|
||||
#define BLOCK_SIZE_3 RD_WG_SIZE
|
||||
#else
|
||||
#define BLOCK_SIZE_3 192
|
||||
#endif
|
||||
|
||||
#ifdef RD_WG_SIZE_4_0
|
||||
#define BLOCK_SIZE_4 RD_WG_SIZE_4_0
|
||||
#elif defined(RD_WG_SIZE_4)
|
||||
#define BLOCK_SIZE_4 RD_WG_SIZE_4
|
||||
#elif defined(RD_WG_SIZE)
|
||||
#define BLOCK_SIZE_4 RD_WG_SIZE
|
||||
#else
|
||||
#define BLOCK_SIZE_4 192
|
||||
#endif
|
||||
|
||||
// #if block_length > 128
|
||||
// #warning "the kernels may fail to launch on some systems if the block length
|
||||
// is too large" #endif
|
||||
|
||||
#define VAR_DENSITY 0
|
||||
#define VAR_MOMENTUM 1
|
||||
#define VAR_DENSITY_ENERGY (VAR_MOMENTUM + NDIM)
|
||||
#define NVAR (VAR_DENSITY_ENERGY + 1)
|
||||
|
||||
/*
|
||||
* Generic functions
|
||||
*/
|
||||
// Allocates room for N elements of type T with cudaMalloc and returns the
// resulting pointer. The cudaMalloc status is passed to checkCudaErrors.
template <typename T> T *alloc(int N) {
  T *t;
  checkCudaErrors(cudaMalloc((void **)&t, sizeof(T) * N));
  return t;
}
|
||||
|
||||
// Releases a buffer with cudaFree; the status is passed to checkCudaErrors.
template <typename T> void dealloc(T *array) {
  checkCudaErrors(cudaFree((void *)array));
}
|
||||
|
||||
// Copies N elements of type T from src to dst using cudaMemcpy with
// cudaMemcpyDeviceToDevice; the status is passed to checkCudaErrors.
template <typename T> void copy(T *dst, T *src, int N) {
  checkCudaErrors(
      cudaMemcpy((void *)dst, (void *)src, N * sizeof(T),
                 cudaMemcpyDeviceToDevice));
}
|
||||
|
||||
// Copies N elements of type T from src to dst using cudaMemcpy with
// cudaMemcpyHostToDevice; the status is passed to checkCudaErrors.
template <typename T> void upload(T *dst, T *src, int N) {
  checkCudaErrors(
      cudaMemcpy((void *)dst, (void *)src, N * sizeof(T),
                 cudaMemcpyHostToDevice));
}
|
||||
|
||||
// Copies N elements of type T from src to dst using cudaMemcpy with
// cudaMemcpyDeviceToHost; the status is passed to checkCudaErrors.
template <typename T> void download(T *dst, T *src, int N) {
  checkCudaErrors(
      cudaMemcpy((void *)dst, (void *)src, N * sizeof(T),
                 cudaMemcpyDeviceToHost));
}
|
||||
|
||||
void dump(float *variables, int nel, int nelr) {
|
||||
float *h_variables = new float[nelr * NVAR];
|
||||
download(h_variables, variables, nelr * NVAR);
|
||||
|
||||
{
|
||||
std::ofstream file("density");
|
||||
file << nel << " " << nelr << std::endl;
|
||||
for (int i = 0; i < nel; i++)
|
||||
file << h_variables[i + VAR_DENSITY * nelr] << std::endl;
|
||||
}
|
||||
|
||||
{
|
||||
std::ofstream file("momentum");
|
||||
file << nel << " " << nelr << std::endl;
|
||||
for (int i = 0; i < nel; i++) {
|
||||
for (int j = 0; j != NDIM; j++)
|
||||
file << h_variables[i + (VAR_MOMENTUM + j) * nelr] << " ";
|
||||
file << std::endl;
|
||||
}
|
||||
}
|
||||
|
||||
{
|
||||
std::ofstream file("density_energy");
|
||||
file << nel << " " << nelr << std::endl;
|
||||
for (int i = 0; i < nel; i++)
|
||||
file << h_variables[i + VAR_DENSITY_ENERGY * nelr] << std::endl;
|
||||
}
|
||||
delete[] h_variables;
|
||||
}
|
||||
|
||||
/*
|
||||
* Element-based Cell-centered FVM solver functions
|
||||
*/
|
||||
__constant__ float ff_variable[NVAR];
|
||||
__constant__ float3 ff_flux_contribution_momentum_x[1];
|
||||
__constant__ float3 ff_flux_contribution_momentum_y[1];
|
||||
__constant__ float3 ff_flux_contribution_momentum_z[1];
|
||||
__constant__ float3 ff_flux_contribution_density_energy[1];
|
||||
|
||||
__global__ void cuda_initialize_variables(int nelr, float *variables) {
|
||||
const int i = (blockDim.x * blockIdx.x + threadIdx.x);
|
||||
for (int j = 0; j < NVAR; j++)
|
||||
variables[i + j * nelr] = ff_variable[j];
|
||||
}
|
||||
void initialize_variables(int nelr, float *variables) {
|
||||
dim3 Dg(nelr / BLOCK_SIZE_1), Db(BLOCK_SIZE_1);
|
||||
cuda_initialize_variables<<<Dg, Db>>>(nelr, variables);
|
||||
getLastCudaError("initialize_variables failed");
|
||||
}
|
||||
|
||||
// Fills the three momentum flux-contribution vectors and the density-energy
// flux-contribution vector from momentum, velocity, pressure and
// density_energy. The density argument is accepted but not read.
//
// The momentum contributions are symmetric: the off-diagonal entries of the
// y and z vectors are copied from the entries already computed for x and y.
__device__ __host__ inline void compute_flux_contribution(
    float &density, float3 &momentum, float &density_energy, float &pressure,
    float3 &velocity, float3 &fc_momentum_x, float3 &fc_momentum_y,
    float3 &fc_momentum_z, float3 &fc_density_energy) {
  // x row
  fc_momentum_x.x = velocity.x * momentum.x + pressure;
  fc_momentum_x.y = velocity.x * momentum.y;
  fc_momentum_x.z = velocity.x * momentum.z;

  // y row (first entry reuses the x row)
  fc_momentum_y.x = fc_momentum_x.y;
  fc_momentum_y.y = velocity.y * momentum.y + pressure;
  fc_momentum_y.z = velocity.y * momentum.z;

  // z row (first two entries reuse the x and y rows)
  fc_momentum_z.x = fc_momentum_x.z;
  fc_momentum_z.y = fc_momentum_y.z;
  fc_momentum_z.z = velocity.z * momentum.z + pressure;

  // density energy: velocity scaled by (density_energy + pressure)
  const float energy_plus_pressure = density_energy + pressure;
  fc_density_energy.x = velocity.x * energy_plus_pressure;
  fc_density_energy.y = velocity.y * energy_plus_pressure;
  fc_density_energy.z = velocity.z * energy_plus_pressure;
}
|
||||
|
||||
// Writes momentum divided by density, component by component, into velocity.
__device__ inline void compute_velocity(float &density, float3 &momentum,
                                        float3 &velocity) {
  velocity.x = momentum.x / density;
  velocity.y = momentum.y / density;
  velocity.z = momentum.z / density;
}
|
||||
|
||||
// Returns the squared magnitude of velocity (x*x + y*y + z*z).
__device__ inline float compute_speed_sqd(float3 &velocity) {
  const float vx = velocity.x;
  const float vy = velocity.y;
  const float vz = velocity.z;
  return vx * vx + vy * vy + vz * vz;
}
|
||||
|
||||
// Returns (GAMMA - 1) * (density_energy - 0.5 * density * speed_sqd).
__device__ inline float compute_pressure(float &density, float &density_energy,
                                         float &speed_sqd) {
  const float gamma_minus_one = float(GAMMA) - float(1.0f);
  return gamma_minus_one *
         (density_energy - float(0.5f) * density * speed_sqd);
}
|
||||
|
||||
// Returns sqrtf(GAMMA * pressure / density).
__device__ inline float compute_speed_of_sound(float &density,
                                               float &pressure) {
  const float ratio = float(GAMMA) * pressure / density;
  return sqrtf(ratio);
}
|
||||
|
||||
__global__ void cuda_compute_step_factor(int nelr, float *variables,
|
||||
float *areas, float *step_factors) {
|
||||
const int i = (blockDim.x * blockIdx.x + threadIdx.x);
|
||||
|
||||
float density = variables[i + VAR_DENSITY * nelr];
|
||||
float3 momentum;
|
||||
momentum.x = variables[i + (VAR_MOMENTUM + 0) * nelr];
|
||||
momentum.y = variables[i + (VAR_MOMENTUM + 1) * nelr];
|
||||
momentum.z = variables[i + (VAR_MOMENTUM + 2) * nelr];
|
||||
|
||||
float density_energy = variables[i + VAR_DENSITY_ENERGY * nelr];
|
||||
|
||||
float3 velocity;
|
||||
compute_velocity(density, momentum, velocity);
|
||||
float speed_sqd = compute_speed_sqd(velocity);
|
||||
float pressure = compute_pressure(density, density_energy, speed_sqd);
|
||||
float speed_of_sound = compute_speed_of_sound(density, pressure);
|
||||
|
||||
// dt = float(0.5f) * sqrtf(areas[i]) / (||v|| + c).... but when we do time
|
||||
// stepping, this later would need to be divided by the area, so we just do it
|
||||
// all at once
|
||||
step_factors[i] =
|
||||
float(0.5f) / (sqrtf(areas[i]) * (sqrtf(speed_sqd) + speed_of_sound));
|
||||
}
|
||||
void compute_step_factor(int nelr, float *variables, float *areas,
|
||||
float *step_factors) {
|
||||
dim3 Dg(nelr / BLOCK_SIZE_2), Db(BLOCK_SIZE_2);
|
||||
cuda_compute_step_factor<<<Dg, Db>>>(nelr, variables, areas, step_factors);
|
||||
getLastCudaError("compute_step_factor failed");
|
||||
}
|
||||
|
||||
/*
|
||||
*
|
||||
*
|
||||
*/
|
||||
__global__ void cuda_compute_flux(int nelr, int *elements_surrounding_elements,
|
||||
float *normals, float *variables,
|
||||
float *fluxes) {
|
||||
const float smoothing_coefficient = float(0.2f);
|
||||
const int i = (blockDim.x * blockIdx.x + threadIdx.x);
|
||||
|
||||
int j, nb;
|
||||
float3 normal;
|
||||
float normal_len;
|
||||
float factor;
|
||||
|
||||
float density_i = variables[i + VAR_DENSITY * nelr];
|
||||
float3 momentum_i;
|
||||
momentum_i.x = variables[i + (VAR_MOMENTUM + 0) * nelr];
|
||||
momentum_i.y = variables[i + (VAR_MOMENTUM + 1) * nelr];
|
||||
momentum_i.z = variables[i + (VAR_MOMENTUM + 2) * nelr];
|
||||
|
||||
float density_energy_i = variables[i + VAR_DENSITY_ENERGY * nelr];
|
||||
|
||||
float3 velocity_i;
|
||||
compute_velocity(density_i, momentum_i, velocity_i);
|
||||
float speed_sqd_i = compute_speed_sqd(velocity_i);
|
||||
float speed_i = sqrtf(speed_sqd_i);
|
||||
float pressure_i = compute_pressure(density_i, density_energy_i, speed_sqd_i);
|
||||
float speed_of_sound_i = compute_speed_of_sound(density_i, pressure_i);
|
||||
float3 flux_contribution_i_momentum_x, flux_contribution_i_momentum_y,
|
||||
flux_contribution_i_momentum_z;
|
||||
float3 flux_contribution_i_density_energy;
|
||||
compute_flux_contribution(
|
||||
density_i, momentum_i, density_energy_i, pressure_i, velocity_i,
|
||||
flux_contribution_i_momentum_x, flux_contribution_i_momentum_y,
|
||||
flux_contribution_i_momentum_z, flux_contribution_i_density_energy);
|
||||
|
||||
float flux_i_density = float(0.0f);
|
||||
float3 flux_i_momentum;
|
||||
flux_i_momentum.x = float(0.0f);
|
||||
flux_i_momentum.y = float(0.0f);
|
||||
flux_i_momentum.z = float(0.0f);
|
||||
float flux_i_density_energy = float(0.0f);
|
||||
|
||||
float3 velocity_nb;
|
||||
float density_nb, density_energy_nb;
|
||||
float3 momentum_nb;
|
||||
float3 flux_contribution_nb_momentum_x, flux_contribution_nb_momentum_y,
|
||||
flux_contribution_nb_momentum_z;
|
||||
float3 flux_contribution_nb_density_energy;
|
||||
float speed_sqd_nb, speed_of_sound_nb, pressure_nb;
|
||||
|
||||
#pragma unroll
|
||||
for (j = 0; j < NNB; j++) {
|
||||
nb = elements_surrounding_elements[i + j * nelr];
|
||||
normal.x = normals[i + (j + 0 * NNB) * nelr];
|
||||
normal.y = normals[i + (j + 1 * NNB) * nelr];
|
||||
normal.z = normals[i + (j + 2 * NNB) * nelr];
|
||||
normal_len =
|
||||
sqrtf(normal.x * normal.x + normal.y * normal.y + normal.z * normal.z);
|
||||
|
||||
if (nb >= 0) // a legitimate neighbor
|
||||
{
|
||||
density_nb = variables[nb + VAR_DENSITY * nelr];
|
||||
momentum_nb.x = variables[nb + (VAR_MOMENTUM + 0) * nelr];
|
||||
momentum_nb.y = variables[nb + (VAR_MOMENTUM + 1) * nelr];
|
||||
momentum_nb.z = variables[nb + (VAR_MOMENTUM + 2) * nelr];
|
||||
density_energy_nb = variables[nb + VAR_DENSITY_ENERGY * nelr];
|
||||
compute_velocity(density_nb, momentum_nb, velocity_nb);
|
||||
speed_sqd_nb = compute_speed_sqd(velocity_nb);
|
||||
pressure_nb =
|
||||
compute_pressure(density_nb, density_energy_nb, speed_sqd_nb);
|
||||
speed_of_sound_nb = compute_speed_of_sound(density_nb, pressure_nb);
|
||||
compute_flux_contribution(
|
||||
density_nb, momentum_nb, density_energy_nb, pressure_nb, velocity_nb,
|
||||
flux_contribution_nb_momentum_x, flux_contribution_nb_momentum_y,
|
||||
flux_contribution_nb_momentum_z, flux_contribution_nb_density_energy);
|
||||
|
||||
// artificial viscosity
|
||||
factor = -normal_len * smoothing_coefficient * float(0.5f) *
|
||||
(speed_i + sqrtf(speed_sqd_nb) + speed_of_sound_i +
|
||||
speed_of_sound_nb);
|
||||
flux_i_density += factor * (density_i - density_nb);
|
||||
flux_i_density_energy += factor * (density_energy_i - density_energy_nb);
|
||||
flux_i_momentum.x += factor * (momentum_i.x - momentum_nb.x);
|
||||
flux_i_momentum.y += factor * (momentum_i.y - momentum_nb.y);
|
||||
flux_i_momentum.z += factor * (momentum_i.z - momentum_nb.z);
|
||||
|
||||
// accumulate cell-centered fluxes
|
||||
factor = float(0.5f) * normal.x;
|
||||
flux_i_density += factor * (momentum_nb.x + momentum_i.x);
|
||||
flux_i_density_energy += factor * (flux_contribution_nb_density_energy.x +
|
||||
flux_contribution_i_density_energy.x);
|
||||
flux_i_momentum.x += factor * (flux_contribution_nb_momentum_x.x +
|
||||
flux_contribution_i_momentum_x.x);
|
||||
flux_i_momentum.y += factor * (flux_contribution_nb_momentum_y.x +
|
||||
flux_contribution_i_momentum_y.x);
|
||||
flux_i_momentum.z += factor * (flux_contribution_nb_momentum_z.x +
|
||||
flux_contribution_i_momentum_z.x);
|
||||
|
||||
factor = float(0.5f) * normal.y;
|
||||
flux_i_density += factor * (momentum_nb.y + momentum_i.y);
|
||||
flux_i_density_energy += factor * (flux_contribution_nb_density_energy.y +
|
||||
flux_contribution_i_density_energy.y);
|
||||
flux_i_momentum.x += factor * (flux_contribution_nb_momentum_x.y +
|
||||
flux_contribution_i_momentum_x.y);
|
||||
flux_i_momentum.y += factor * (flux_contribution_nb_momentum_y.y +
|
||||
flux_contribution_i_momentum_y.y);
|
||||
flux_i_momentum.z += factor * (flux_contribution_nb_momentum_z.y +
|
||||
flux_contribution_i_momentum_z.y);
|
||||
|
||||
factor = float(0.5f) * normal.z;
|
||||
flux_i_density += factor * (momentum_nb.z + momentum_i.z);
|
||||
flux_i_density_energy += factor * (flux_contribution_nb_density_energy.z +
|
||||
flux_contribution_i_density_energy.z);
|
||||
flux_i_momentum.x += factor * (flux_contribution_nb_momentum_x.z +
|
||||
flux_contribution_i_momentum_x.z);
|
||||
flux_i_momentum.y += factor * (flux_contribution_nb_momentum_y.z +
|
||||
flux_contribution_i_momentum_y.z);
|
||||
flux_i_momentum.z += factor * (flux_contribution_nb_momentum_z.z +
|
||||
flux_contribution_i_momentum_z.z);
|
||||
} else if (nb == -1) // a wing boundary
|
||||
{
|
||||
flux_i_momentum.x += normal.x * pressure_i;
|
||||
flux_i_momentum.y += normal.y * pressure_i;
|
||||
flux_i_momentum.z += normal.z * pressure_i;
|
||||
} else if (nb == -2) // a far field boundary
|
||||
{
|
||||
factor = float(0.5f) * normal.x;
|
||||
flux_i_density += factor * (ff_variable[VAR_MOMENTUM + 0] + momentum_i.x);
|
||||
flux_i_density_energy +=
|
||||
factor * (ff_flux_contribution_density_energy[0].x +
|
||||
flux_contribution_i_density_energy.x);
|
||||
flux_i_momentum.x += factor * (ff_flux_contribution_momentum_x[0].x +
|
||||
flux_contribution_i_momentum_x.x);
|
||||
flux_i_momentum.y += factor * (ff_flux_contribution_momentum_y[0].x +
|
||||
flux_contribution_i_momentum_y.x);
|
||||
flux_i_momentum.z += factor * (ff_flux_contribution_momentum_z[0].x +
|
||||
flux_contribution_i_momentum_z.x);
|
||||
|
||||
factor = float(0.5f) * normal.y;
|
||||
flux_i_density += factor * (ff_variable[VAR_MOMENTUM + 1] + momentum_i.y);
|
||||
flux_i_density_energy +=
|
||||
factor * (ff_flux_contribution_density_energy[0].y +
|
||||
flux_contribution_i_density_energy.y);
|
||||
flux_i_momentum.x += factor * (ff_flux_contribution_momentum_x[0].y +
|
||||
flux_contribution_i_momentum_x.y);
|
||||
flux_i_momentum.y += factor * (ff_flux_contribution_momentum_y[0].y +
|
||||
flux_contribution_i_momentum_y.y);
|
||||
flux_i_momentum.z += factor * (ff_flux_contribution_momentum_z[0].y +
|
||||
flux_contribution_i_momentum_z.y);
|
||||
|
||||
factor = float(0.5f) * normal.z;
|
||||
flux_i_density += factor * (ff_variable[VAR_MOMENTUM + 2] + momentum_i.z);
|
||||
flux_i_density_energy +=
|
||||
factor * (ff_flux_contribution_density_energy[0].z +
|
||||
flux_contribution_i_density_energy.z);
|
||||
flux_i_momentum.x += factor * (ff_flux_contribution_momentum_x[0].z +
|
||||
flux_contribution_i_momentum_x.z);
|
||||
flux_i_momentum.y += factor * (ff_flux_contribution_momentum_y[0].z +
|
||||
flux_contribution_i_momentum_y.z);
|
||||
flux_i_momentum.z += factor * (ff_flux_contribution_momentum_z[0].z +
|
||||
flux_contribution_i_momentum_z.z);
|
||||
}
|
||||
}
|
||||
|
||||
fluxes[i + VAR_DENSITY * nelr] = flux_i_density;
|
||||
fluxes[i + (VAR_MOMENTUM + 0) * nelr] = flux_i_momentum.x;
|
||||
fluxes[i + (VAR_MOMENTUM + 1) * nelr] = flux_i_momentum.y;
|
||||
fluxes[i + (VAR_MOMENTUM + 2) * nelr] = flux_i_momentum.z;
|
||||
fluxes[i + VAR_DENSITY_ENERGY * nelr] = flux_i_density_energy;
|
||||
}
|
||||
void compute_flux(int nelr, int *elements_surrounding_elements, float *normals,
|
||||
float *variables, float *fluxes) {
|
||||
dim3 Dg(nelr / BLOCK_SIZE_3), Db(BLOCK_SIZE_3);
|
||||
cuda_compute_flux<<<Dg, Db>>>(nelr, elements_surrounding_elements, normals,
|
||||
variables, fluxes);
|
||||
getLastCudaError("compute_flux failed");
|
||||
}
|
||||
|
||||
__global__ void cuda_time_step(int j, int nelr, float *old_variables,
|
||||
float *variables, float *step_factors,
|
||||
float *fluxes) {
|
||||
const int i = (blockDim.x * blockIdx.x + threadIdx.x);
|
||||
|
||||
float factor = step_factors[i] / float(RK + 1 - j);
|
||||
|
||||
variables[i + VAR_DENSITY * nelr] = old_variables[i + VAR_DENSITY * nelr] +
|
||||
factor * fluxes[i + VAR_DENSITY * nelr];
|
||||
variables[i + VAR_DENSITY_ENERGY * nelr] =
|
||||
old_variables[i + VAR_DENSITY_ENERGY * nelr] +
|
||||
factor * fluxes[i + VAR_DENSITY_ENERGY * nelr];
|
||||
variables[i + (VAR_MOMENTUM + 0) * nelr] =
|
||||
old_variables[i + (VAR_MOMENTUM + 0) * nelr] +
|
||||
factor * fluxes[i + (VAR_MOMENTUM + 0) * nelr];
|
||||
variables[i + (VAR_MOMENTUM + 1) * nelr] =
|
||||
old_variables[i + (VAR_MOMENTUM + 1) * nelr] +
|
||||
factor * fluxes[i + (VAR_MOMENTUM + 1) * nelr];
|
||||
variables[i + (VAR_MOMENTUM + 2) * nelr] =
|
||||
old_variables[i + (VAR_MOMENTUM + 2) * nelr] +
|
||||
factor * fluxes[i + (VAR_MOMENTUM + 2) * nelr];
|
||||
}
|
||||
void time_step(int j, int nelr, float *old_variables, float *variables,
|
||||
float *step_factors, float *fluxes) {
|
||||
dim3 Dg(nelr / BLOCK_SIZE_4), Db(BLOCK_SIZE_4);
|
||||
cuda_time_step<<<Dg, Db>>>(j, nelr, old_variables, variables, step_factors,
|
||||
fluxes);
|
||||
getLastCudaError("update failed");
|
||||
}
|
||||
|
||||
/*
|
||||
* Main function
|
||||
*/
|
||||
int main(int argc, char **argv) {
|
||||
printf("WG size of kernel:initialize = %d, WG size of "
|
||||
"kernel:compute_step_factor = %d, WG size of kernel:compute_flux = "
|
||||
"%d, WG size of kernel:time_step = %d\n",
|
||||
BLOCK_SIZE_1, BLOCK_SIZE_2, BLOCK_SIZE_3, BLOCK_SIZE_4);
|
||||
|
||||
if (argc < 2) {
|
||||
std::cout << "specify data file name" << std::endl;
|
||||
return 0;
|
||||
}
|
||||
const char *data_file_name = argv[1];
|
||||
|
||||
cudaDeviceProp prop;
|
||||
int dev;
|
||||
|
||||
checkCudaErrors(cudaSetDevice(0));
|
||||
|
||||
// set far field conditions and load them into constant memory on the gpu
|
||||
{
|
||||
float h_ff_variable[NVAR];
|
||||
const float angle_of_attack =
|
||||
float(3.1415926535897931 / 180.0f) * float(deg_angle_of_attack);
|
||||
|
||||
h_ff_variable[VAR_DENSITY] = float(1.4);
|
||||
|
||||
float ff_pressure = float(1.0f);
|
||||
float ff_speed_of_sound =
|
||||
sqrt(GAMMA * ff_pressure / h_ff_variable[VAR_DENSITY]);
|
||||
float ff_speed = float(ff_mach) * ff_speed_of_sound;
|
||||
|
||||
float3 ff_velocity;
|
||||
ff_velocity.x = ff_speed * float(cos((float)angle_of_attack));
|
||||
ff_velocity.y = ff_speed * float(sin((float)angle_of_attack));
|
||||
ff_velocity.z = 0.0f;
|
||||
|
||||
h_ff_variable[VAR_MOMENTUM + 0] =
|
||||
h_ff_variable[VAR_DENSITY] * ff_velocity.x;
|
||||
h_ff_variable[VAR_MOMENTUM + 1] =
|
||||
h_ff_variable[VAR_DENSITY] * ff_velocity.y;
|
||||
h_ff_variable[VAR_MOMENTUM + 2] =
|
||||
h_ff_variable[VAR_DENSITY] * ff_velocity.z;
|
||||
|
||||
h_ff_variable[VAR_DENSITY_ENERGY] =
|
||||
h_ff_variable[VAR_DENSITY] * (float(0.5f) * (ff_speed * ff_speed)) +
|
||||
(ff_pressure / float(GAMMA - 1.0f));
|
||||
|
||||
float3 h_ff_momentum;
|
||||
h_ff_momentum.x = *(h_ff_variable + VAR_MOMENTUM + 0);
|
||||
h_ff_momentum.y = *(h_ff_variable + VAR_MOMENTUM + 1);
|
||||
h_ff_momentum.z = *(h_ff_variable + VAR_MOMENTUM + 2);
|
||||
float3 h_ff_flux_contribution_momentum_x;
|
||||
float3 h_ff_flux_contribution_momentum_y;
|
||||
float3 h_ff_flux_contribution_momentum_z;
|
||||
float3 h_ff_flux_contribution_density_energy;
|
||||
compute_flux_contribution(h_ff_variable[VAR_DENSITY], h_ff_momentum,
|
||||
h_ff_variable[VAR_DENSITY_ENERGY], ff_pressure,
|
||||
ff_velocity, h_ff_flux_contribution_momentum_x,
|
||||
h_ff_flux_contribution_momentum_y,
|
||||
h_ff_flux_contribution_momentum_z,
|
||||
h_ff_flux_contribution_density_energy);
|
||||
|
||||
// copy far field conditions to the gpu
|
||||
checkCudaErrors(
|
||||
cudaMemcpyToSymbol(ff_variable, h_ff_variable, NVAR * sizeof(float)));
|
||||
checkCudaErrors(cudaMemcpyToSymbol(ff_flux_contribution_momentum_x,
|
||||
&h_ff_flux_contribution_momentum_x,
|
||||
sizeof(float3)));
|
||||
checkCudaErrors(cudaMemcpyToSymbol(ff_flux_contribution_momentum_y,
|
||||
&h_ff_flux_contribution_momentum_y,
|
||||
sizeof(float3)));
|
||||
checkCudaErrors(cudaMemcpyToSymbol(ff_flux_contribution_momentum_z,
|
||||
&h_ff_flux_contribution_momentum_z,
|
||||
sizeof(float3)));
|
||||
|
||||
checkCudaErrors(cudaMemcpyToSymbol(ff_flux_contribution_density_energy,
|
||||
&h_ff_flux_contribution_density_energy,
|
||||
sizeof(float3)));
|
||||
}
|
||||
int nel;
|
||||
int nelr;
|
||||
|
||||
// read in domain geometry
|
||||
float *areas;
|
||||
int *elements_surrounding_elements;
|
||||
float *normals;
|
||||
{
|
||||
std::ifstream file(data_file_name);
|
||||
|
||||
file >> nel;
|
||||
nelr =
|
||||
BLOCK_SIZE_0 * ((nel / BLOCK_SIZE_0) + std::min(1, nel % BLOCK_SIZE_0));
|
||||
|
||||
float *h_areas = new float[nelr];
|
||||
int *h_elements_surrounding_elements = new int[nelr * NNB];
|
||||
float *h_normals = new float[nelr * NDIM * NNB];
|
||||
|
||||
// read in data
|
||||
for (int i = 0; i < nel; i++) {
|
||||
file >> h_areas[i];
|
||||
for (int j = 0; j < NNB; j++) {
|
||||
file >> h_elements_surrounding_elements[i + j * nelr];
|
||||
if (h_elements_surrounding_elements[i + j * nelr] < 0)
|
||||
h_elements_surrounding_elements[i + j * nelr] = -1;
|
||||
h_elements_surrounding_elements[i + j * nelr]--; // it's coming in with
|
||||
// Fortran numbering
|
||||
|
||||
for (int k = 0; k < NDIM; k++) {
|
||||
file >> h_normals[i + (j + k * NNB) * nelr];
|
||||
h_normals[i + (j + k * NNB) * nelr] =
|
||||
-h_normals[i + (j + k * NNB) * nelr];
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
// fill in remaining data
|
||||
int last = nel - 1;
|
||||
for (int i = nel; i < nelr; i++) {
|
||||
h_areas[i] = h_areas[last];
|
||||
for (int j = 0; j < NNB; j++) {
|
||||
// duplicate the last element
|
||||
h_elements_surrounding_elements[i + j * nelr] =
|
||||
h_elements_surrounding_elements[last + j * nelr];
|
||||
for (int k = 0; k < NDIM; k++)
|
||||
h_normals[last + (j + k * NNB) * nelr] =
|
||||
h_normals[last + (j + k * NNB) * nelr];
|
||||
}
|
||||
}
|
||||
|
||||
areas = alloc<float>(nelr);
|
||||
upload<float>(areas, h_areas, nelr);
|
||||
|
||||
elements_surrounding_elements = alloc<int>(nelr * NNB);
|
||||
upload<int>(elements_surrounding_elements, h_elements_surrounding_elements,
|
||||
nelr * NNB);
|
||||
|
||||
normals = alloc<float>(nelr * NDIM * NNB);
|
||||
upload<float>(normals, h_normals, nelr * NDIM * NNB);
|
||||
|
||||
delete[] h_areas;
|
||||
delete[] h_elements_surrounding_elements;
|
||||
delete[] h_normals;
|
||||
}
|
||||
|
||||
// Create arrays and set initial conditions
|
||||
float *variables = alloc<float>(nelr * NVAR);
|
||||
initialize_variables(nelr, variables);
|
||||
|
||||
float *old_variables = alloc<float>(nelr * NVAR);
|
||||
float *fluxes = alloc<float>(nelr * NVAR);
|
||||
float *step_factors = alloc<float>(nelr);
|
||||
|
||||
// make sure all memory is floatly allocated before we start timing
|
||||
initialize_variables(nelr, old_variables);
|
||||
initialize_variables(nelr, fluxes);
|
||||
cudaMemset((void *)step_factors, 0, sizeof(float) * nelr);
|
||||
// make sure CUDA isn't still doing something before we start timing
|
||||
cudaThreadSynchronize();
|
||||
|
||||
// these need to be computed the first time in order to compute time step
|
||||
std::cout << "Starting..." << std::endl;
|
||||
|
||||
StopWatchInterface *timer = 0;
|
||||
// unsigned int timer = 0;
|
||||
|
||||
// CUT_SAFE_CALL( cutCreateTimer( &timer));
|
||||
// CUT_SAFE_CALL( cutStartTimer( timer));
|
||||
sdkCreateTimer(&timer);
|
||||
sdkStartTimer(&timer);
|
||||
// Begin iterations
|
||||
for (int i = 0; i < iterations; i++) {
|
||||
copy<float>(old_variables, variables, nelr * NVAR);
|
||||
|
||||
// for the first iteration we compute the time step
|
||||
compute_step_factor(nelr, variables, areas, step_factors);
|
||||
getLastCudaError("compute_step_factor failed");
|
||||
|
||||
for (int j = 0; j < RK; j++) {
|
||||
compute_flux(nelr, elements_surrounding_elements, normals, variables,
|
||||
fluxes);
|
||||
getLastCudaError("compute_flux failed");
|
||||
time_step(j, nelr, old_variables, variables, step_factors, fluxes);
|
||||
getLastCudaError("time_step failed");
|
||||
}
|
||||
}
|
||||
|
||||
cudaThreadSynchronize();
|
||||
// CUT_SAFE_CALL( cutStopTimer(timer) );
|
||||
sdkStopTimer(&timer);
|
||||
|
||||
std::cout << (sdkGetAverageTimerValue(&timer) / 1000.0) / iterations
|
||||
<< " seconds per iteration" << std::endl;
|
||||
|
||||
std::cout << "Saving solution..." << std::endl;
|
||||
dump(variables, nel, nelr);
|
||||
std::cout << "Saved solution..." << std::endl;
|
||||
|
||||
std::cout << "Cleaning up..." << std::endl;
|
||||
dealloc<float>(areas);
|
||||
dealloc<int>(elements_surrounding_elements);
|
||||
dealloc<float>(normals);
|
||||
|
||||
dealloc<float>(variables);
|
||||
dealloc<float>(old_variables);
|
||||
dealloc<float>(fluxes);
|
||||
dealloc<float>(step_factors);
|
||||
|
||||
std::cout << "Done..." << std::endl;
|
||||
|
||||
return 0;
|
||||
}
|
|
@ -1,15 +0,0 @@
|
|||
#!/bin/bash
# Build and run the euler3d (CFD) example.
# All tool and library locations below are hard-coded absolute paths.

# Compile the CUDA source; -save-temps keeps the intermediate device and host
# bitcode files that the next two commands consume.
clang++ euler3d.cu -I/usr/local/cuda-10.1/samples/common/inc --cuda-path=/usr/local/cuda-10.1 --cuda-gpu-arch=sm_50 -L/usr/local/cuda-10.1/lib64 -lcudart_static -ldl -lrt -pthread -save-temps -v

# Run the project's kernel/host translators over the saved bitcode.
/home/robinhan/repo/open_source_template/build/compilation/kernelTranslator euler3d-cuda-nvptx64-nvidia-cuda-sm_50.bc kernel.bc
/home/robinhan/repo/open_source_template/build/compilation/hostTranslator euler3d-host-x86_64-unknown-linux-gnu.bc host.bc

# Lower both translated modules to position-independent object files.
llc --relocation-model=pic --filetype=obj kernel.bc
llc --relocation-model=pic --filetype=obj host.bc

# Link against the project's runtime and thread-pool libraries.
g++ -Wall -L/home/robinhan/repo/open_source_template/build/runtime -L/home/robinhan/repo/open_source_template/build/runtime/threadPool -o a.out -fPIC -no-pie host.o kernel.o -lc -lx86Runtime -lthreadPool -lpthread

./a.out ../rodinia-data/cfd/fvcorr.domn.097K
# ./demo 1024
# # # ./demo -f ../../data/matrix3.txt
# # # run -f ../../data/gaussian/matrix3.txt
|
|
@ -1,64 +0,0 @@
|
|||
/*
|
||||
* Copyright (c) 2009, Jiri Matela
|
||||
* All rights reserved.
|
||||
*
|
||||
* Redistribution and use in source and binary forms, with or without
|
||||
* modification, are permitted provided that the following conditions are met:
|
||||
*
|
||||
* * Redistributions of source code must retain the above copyright
|
||||
* notice, this list of conditions and the following disclaimer.
|
||||
* * Redistributions in binary form must reproduce the above copyright
|
||||
* notice, this list of conditions and the following disclaimer in the
|
||||
* documentation and/or other materials provided with the distribution.
|
||||
*
|
||||
* THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
|
||||
* AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
|
||||
* IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
|
||||
* ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE
|
||||
* LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
|
||||
* CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
|
||||
* SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
|
||||
* INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
|
||||
* CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
|
||||
* ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
|
||||
* POSSIBILITY OF SUCH DAMAGE.
|
||||
*/
|
||||
|
||||
#ifndef _COMMON_H
#define _COMMON_H

/*
 * 24-bit multiplication is faster on G80, but the operands must stay
 * within the [-8M, 8M - 1] range.
 */
#define IMUL(a, b) __mul24((a), (b))

/*
 * cuda timing macros (disabled)
 *
 * #define CTIMERINIT   cudaEvent_t cstart, cstop;
 *                      cudaEventCreate(&cstart);
 *                      cudaEventCreate(&cstop);
 *                      float elapsedTime
 * #define CTIMERSTART(cstart) cudaEventRecord(cstart,0)
 * #define CTIMERSTOP(cstop)   cudaEventRecord(cstop,0);
 *                             cudaEventSynchronize(cstop);
 *                             cudaEventElapsedTime(&elapsedTime, cstart, cstop)
 */

/* Integer division of a by b, bumped by one when there is a remainder. */
#define DIVANDRND(a, b) ((a) / (b) + ((((a) % (b)) != 0) ? 1 : 0))

/*
 * Fetch the last CUDA runtime error; if there is one, print file, line,
 * msg and the CUDA error string to stderr and terminate with exit(-1).
 * Expands to a brace block, so it also works without a trailing semicolon.
 */
#define cudaCheckError(msg)                                                    \
  {                                                                            \
    cudaError_t lastCudaError = cudaGetLastError();                            \
    if (lastCudaError != cudaSuccess) {                                        \
      fprintf(stderr, "%s: %i: %s: %s.\n", __FILE__, __LINE__, msg,            \
              cudaGetErrorString(lastCudaError));                              \
      exit(-1);                                                                \
    }                                                                          \
  }

/* Synchronize first, then run cudaCheckError(msg). */
#define cudaCheckAsyncError(msg)                                               \
  {                                                                            \
    cudaThreadSynchronize();                                                   \
    cudaCheckError(msg);                                                       \
  }

#endif
|
|
@ -1,193 +0,0 @@
|
|||
/*
|
||||
* Copyright (c) 2009, Jiri Matela
|
||||
* All rights reserved.
|
||||
*
|
||||
* Redistribution and use in source and binary forms, with or without
|
||||
* modification, are permitted provided that the following conditions are met:
|
||||
*
|
||||
* * Redistributions of source code must retain the above copyright
|
||||
* notice, this list of conditions and the following disclaimer.
|
||||
* * Redistributions in binary form must reproduce the above copyright
|
||||
* notice, this list of conditions and the following disclaimer in the
|
||||
* documentation and/or other materials provided with the distribution.
|
||||
*
|
||||
* THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
|
||||
* AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
|
||||
* IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
|
||||
* ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE
|
||||
* LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
|
||||
* CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
|
||||
* SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
|
||||
* INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
|
||||
* CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
|
||||
* ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
|
||||
* POSSIBILITY OF SUCH DAMAGE.
|
||||
*/
|
||||
|
||||
#include <unistd.h>
|
||||
#include <error.h>
|
||||
#include <stdio.h>
|
||||
#include <stdlib.h>
|
||||
#include <errno.h>
|
||||
#include <assert.h>
|
||||
|
||||
#include "components.h"
|
||||
#include "common.h"
|
||||
|
||||
#define THREADS 256
|
||||
|
||||
/* Store 3 RGB float components */
|
||||
__device__ void storeComponents(float *d_r, float *d_g, float *d_b, float r, float g, float b, int pos)
|
||||
{
|
||||
d_r[pos] = (r/255.0f) - 0.5f;
|
||||
d_g[pos] = (g/255.0f) - 0.5f;
|
||||
d_b[pos] = (b/255.0f) - 0.5f;
|
||||
}
|
||||
|
||||
/* Store 3 RGB intege components */
|
||||
__device__ void storeComponents(int *d_r, int *d_g, int *d_b, int r, int g, int b, int pos)
|
||||
{
|
||||
d_r[pos] = r - 128;
|
||||
d_g[pos] = g - 128;
|
||||
d_b[pos] = b - 128;
|
||||
}
|
||||
|
||||
/* Store float component */
|
||||
__device__ void storeComponent(float *d_c, float c, int pos)
|
||||
{
|
||||
d_c[pos] = (c/255.0f) - 0.5f;
|
||||
}
|
||||
|
||||
/* Store integer component */
|
||||
__device__ void storeComponent(int *d_c, int c, int pos)
|
||||
{
|
||||
d_c[pos] = c - 128;
|
||||
}
|
||||
|
||||
/* Copy img src data into three separated component buffers */
|
||||
template<typename T>
|
||||
__global__ void c_CopySrcToComponents(T *d_r, T *d_g, T *d_b,
|
||||
unsigned char * d_src,
|
||||
int pixels)
|
||||
{
|
||||
int x = threadIdx.x;
|
||||
int gX = blockDim.x*blockIdx.x;
|
||||
|
||||
__shared__ unsigned char sData[THREADS*3];
|
||||
|
||||
/* Copy data to shared mem by 4bytes
|
||||
other checks are not necessary, since
|
||||
d_src buffer is aligned to sharedDataSize */
|
||||
if ( (x*4) < THREADS*3 ) {
|
||||
float *s = (float *)d_src;
|
||||
float *d = (float *)sData;
|
||||
d[x] = s[((gX*3)>>2) + x];
|
||||
}
|
||||
__syncthreads();
|
||||
|
||||
T r, g, b;
|
||||
|
||||
int offset = x*3;
|
||||
r = (T)(sData[offset]);
|
||||
g = (T)(sData[offset+1]);
|
||||
b = (T)(sData[offset+2]);
|
||||
|
||||
int globalOutputPosition = gX + x;
|
||||
if (globalOutputPosition < pixels) {
|
||||
storeComponents(d_r, d_g, d_b, r, g, b, globalOutputPosition);
|
||||
}
|
||||
}
|
||||
|
||||
/* Copy img src data into three separated component buffers */
|
||||
template<typename T>
|
||||
__global__ void c_CopySrcToComponent(T *d_c, unsigned char * d_src, int pixels)
|
||||
{
|
||||
int x = threadIdx.x;
|
||||
int gX = blockDim.x*blockIdx.x;
|
||||
|
||||
__shared__ unsigned char sData[THREADS];
|
||||
|
||||
/* Copy data to shared mem by 4bytes
|
||||
other checks are not necessary, since
|
||||
d_src buffer is aligned to sharedDataSize */
|
||||
if ( (x*4) < THREADS) {
|
||||
float *s = (float *)d_src;
|
||||
float *d = (float *)sData;
|
||||
d[x] = s[(gX>>2) + x];
|
||||
}
|
||||
__syncthreads();
|
||||
|
||||
T c;
|
||||
|
||||
c = (T)(sData[x]);
|
||||
|
||||
int globalOutputPosition = gX + x;
|
||||
if (globalOutputPosition < pixels) {
|
||||
storeComponent(d_c, c, globalOutputPosition);
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
/* Separate compoents of 8bit RGB source image */
|
||||
template<typename T>
|
||||
void rgbToComponents(T *d_r, T *d_g, T *d_b, unsigned char * src, int width, int height)
|
||||
{
|
||||
unsigned char * d_src;
|
||||
int pixels = width*height;
|
||||
int alignedSize = DIVANDRND(width*height, THREADS) * THREADS * 3; //aligned to thread block size -- THREADS
|
||||
|
||||
/* Alloc d_src buffer */
|
||||
cudaMalloc((void **)&d_src, alignedSize);
|
||||
cudaCheckAsyncError("Cuda malloc")
|
||||
cudaMemset(d_src, 0, alignedSize);
|
||||
|
||||
/* Copy data to device */
|
||||
cudaMemcpy(d_src, src, pixels*3, cudaMemcpyHostToDevice);
|
||||
cudaCheckError("Copy data to device")
|
||||
|
||||
/* Kernel */
|
||||
dim3 threads(THREADS);
|
||||
dim3 grid(alignedSize/(THREADS*3));
|
||||
assert(alignedSize%(THREADS*3) == 0);
|
||||
c_CopySrcToComponents<<<grid, threads>>>(d_r, d_g, d_b, d_src, pixels);
|
||||
cudaCheckAsyncError("CopySrcToComponents kernel")
|
||||
|
||||
/* Free Memory */
|
||||
cudaFree(d_src);
|
||||
cudaCheckAsyncError("Free memory")
|
||||
}
|
||||
template void rgbToComponents<float>(float *d_r, float *d_g, float *d_b, unsigned char * src, int width, int height);
|
||||
template void rgbToComponents<int>(int *d_r, int *d_g, int *d_b, unsigned char * src, int width, int height);
|
||||
|
||||
|
||||
/* Copy a 8bit source image data into a color compoment of type T */
|
||||
template<typename T>
|
||||
void bwToComponent(T *d_c, unsigned char * src, int width, int height)
|
||||
{
|
||||
unsigned char * d_src;
|
||||
int pixels = width*height;
|
||||
int alignedSize = DIVANDRND(pixels, THREADS) * THREADS; //aligned to thread block size -- THREADS
|
||||
|
||||
/* Alloc d_src buffer */
|
||||
cudaMalloc((void **)&d_src, alignedSize);
|
||||
cudaCheckAsyncError("Cuda malloc")
|
||||
cudaMemset(d_src, 0, alignedSize);
|
||||
|
||||
/* Copy data to device */
|
||||
cudaMemcpy(d_src, src, pixels, cudaMemcpyHostToDevice);
|
||||
cudaCheckError("Copy data to device")
|
||||
|
||||
/* Kernel */
|
||||
dim3 threads(THREADS);
|
||||
dim3 grid(alignedSize/(THREADS));
|
||||
assert(alignedSize%(THREADS) == 0);
|
||||
c_CopySrcToComponent<<<grid, threads>>>(d_c, d_src, pixels);
|
||||
cudaCheckAsyncError("CopySrcToComponent kernel")
|
||||
|
||||
/* Free Memory */
|
||||
cudaFree(d_src);
|
||||
cudaCheckAsyncError("Free memory")
|
||||
}
|
||||
|
||||
template void bwToComponent<float>(float *d_c, unsigned char *src, int width, int height);
|
||||
template void bwToComponent<int>(int *d_c, unsigned char *src, int width, int height);
|
|
@ -1,39 +0,0 @@
|
|||
/*
|
||||
* Copyright (c) 2009, Jiri Matela
|
||||
* All rights reserved.
|
||||
*
|
||||
* Redistribution and use in source and binary forms, with or without
|
||||
* modification, are permitted provided that the following conditions are met:
|
||||
*
|
||||
* * Redistributions of source code must retain the above copyright
|
||||
* notice, this list of conditions and the following disclaimer.
|
||||
* * Redistributions in binary form must reproduce the above copyright
|
||||
* notice, this list of conditions and the following disclaimer in the
|
||||
* documentation and/or other materials provided with the distribution.
|
||||
*
|
||||
* THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
|
||||
* AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
|
||||
* IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
|
||||
* ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE
|
||||
* LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
|
||||
* CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
|
||||
* SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
|
||||
* INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
|
||||
* CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
|
||||
* ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
|
||||
* POSSIBILITY OF SUCH DAMAGE.
|
||||
*/
|
||||
|
||||
#ifndef _COMPONENTS_H
#define _COMPONENTS_H

/*
 * Separate the components of an 8-bit RGB source image.
 *   d_r, d_g, d_b  output buffers, one per color component
 *   src            source image bytes
 *   width, height  image dimensions in pixels
 */
template <typename T>
void rgbToComponents(T *d_r, T *d_g, T *d_b, unsigned char *src, int width,
                     int height);

/*
 * Copy 8-bit source image data into a single color component of type T.
 *   d_c            output buffer
 *   src            source image bytes
 *   width, height  image dimensions in pixels
 */
template <typename T>
void bwToComponent(T *d_c, unsigned char *src, int width, int height);

#endif
|
|
@ -1,385 +0,0 @@
|
|||
/*
|
||||
* Copyright (c) 2009, Jiri Matela
|
||||
* All rights reserved.
|
||||
*
|
||||
* Redistribution and use in source and binary forms, with or without
|
||||
* modification, are permitted provided that the following conditions are met:
|
||||
*
|
||||
* * Redistributions of source code must retain the above copyright
|
||||
* notice, this list of conditions and the following disclaimer.
|
||||
* * Redistributions in binary form must reproduce the above copyright
|
||||
* notice, this list of conditions and the following disclaimer in the
|
||||
* documentation and/or other materials provided with the distribution.
|
||||
*
|
||||
* THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
|
||||
* AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
|
||||
* IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
|
||||
* ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE
|
||||
* LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
|
||||
* CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
|
||||
* SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
|
||||
* INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
|
||||
* CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
|
||||
* ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
|
||||
* POSSIBILITY OF SUCH DAMAGE.
|
||||
*/
|
||||
|
||||
#include <stdio.h>
#include <fcntl.h>
#include <assert.h>
#include <errno.h>
#include <sys/time.h>
#include <unistd.h>
#include <error.h>
#include "dwt_cuda/dwt.h"
#include "dwt_cuda/common.h"
#include "dwt.h"
#include "common.h"
#include <iostream>
#include <fstream>
#include <string>
|
||||
|
||||
inline void fdwt(float *in, float *out, int width, int height, int levels)
|
||||
{
|
||||
printf(" Running fdwt97 Float \n");
|
||||
dwt_cuda::fdwt97(in, out, width, height, levels);
|
||||
}
|
||||
/*
|
||||
inline void fdwt(float *in, float *out, int width, int height, int levels, float *diffOut)
|
||||
{
|
||||
dwt_cuda::fdwt97(in, out, width, height, levels, diffOut);
|
||||
}
|
||||
*/
|
||||
|
||||
|
||||
|
||||
inline void fdwt(int *in, int *out, int width, int height, int levels)
|
||||
{
|
||||
printf(" Running fdwt53 Int \n");
|
||||
|
||||
dwt_cuda::fdwt53(in, out, width, height, levels);
|
||||
}
|
||||
/*
|
||||
inline void fdwt(int *in, int *out, int width, int height, int levels, int *diffOut)
|
||||
{
|
||||
dwt_cuda::fdwt53(in, out, width, height, levels, diffOut);
|
||||
}
|
||||
*/
|
||||
|
||||
|
||||
|
||||
inline void rdwt(float *in, float *out, int width, int height, int levels)
|
||||
{
|
||||
printf(" Running rdwt97 Float \n");
|
||||
|
||||
dwt_cuda::rdwt97(in, out, width, height, levels);
|
||||
}
|
||||
|
||||
inline void rdwt(int *in, int *out, int width, int height, int levels)
|
||||
{
|
||||
printf(" Running rdwt53 Int \n");
|
||||
|
||||
dwt_cuda::rdwt53(in, out, width, height, levels);
|
||||
}
|
||||
|
||||
template<typename T>
|
||||
int nStage2dDWT(T * in, T * out, T * backup, int pixWidth, int pixHeight, int stages, bool forward)
|
||||
{
|
||||
printf("\n*** %d stages of 2D forward DWT:\n", stages);
|
||||
|
||||
/* create backup of input, because each test iteration overwrites it */
|
||||
const int size = pixHeight * pixWidth * sizeof(T);
|
||||
cudaMemcpy(backup, in, size, cudaMemcpyDeviceToDevice);
|
||||
cudaCheckError("Memcopy device to device");
|
||||
|
||||
/* Measure time of individual levels. */
|
||||
if(forward)
|
||||
fdwt(in, out, pixWidth, pixHeight, stages);
|
||||
else
|
||||
rdwt(in, out, pixWidth, pixHeight, stages);
|
||||
|
||||
// Measure overall time of DWT.
|
||||
/* #ifdef GPU_DWT_TESTING_1
|
||||
|
||||
dwt_cuda::CudaDWTTester tester;
|
||||
for(int i = tester.getNumIterations(); i--; ) {
|
||||
// Recover input and measure one overall DWT run.
|
||||
cudaMemcpy(in, backup, size, cudaMemcpyDeviceToDevice);
|
||||
cudaCheckError("Memcopy device to device");
|
||||
tester.beginTestIteration();
|
||||
if(forward)
|
||||
fdwt(in, out, pixWidth, pixHeight, stages);
|
||||
else
|
||||
rdwt(in, out, pixWidth, pixHeight, stages);
|
||||
tester.endTestIteration();
|
||||
}
|
||||
tester.showPerformance(" Overall DWT", pixWidth, pixHeight);
|
||||
#endif // GPU_DWT_TESTING
|
||||
|
||||
cudaCheckAsyncError("DWT Kernel calls");
|
||||
*/ return 0;
|
||||
}
|
||||
template int nStage2dDWT<float>(float*, float*, float*, int, int, int, bool);
|
||||
template int nStage2dDWT<int>(int*, int*, int*, int, int, int, bool);
|
||||
|
||||
|
||||
|
||||
/*
|
||||
template<typename T>
|
||||
int nStage2dDWT(T * in, T * out, T * backup, int pixWidth, int pixHeight, int stages, bool forward, T * diffOut)
|
||||
{
|
||||
printf("*** %d stages of 2D forward DWT:\n", stages);
|
||||
|
||||
// create backup of input, because each test iteration overwrites it
|
||||
const int size = pixHeight * pixWidth * sizeof(T);
|
||||
cudaMemcpy(backup, in, size, cudaMemcpyDeviceToDevice);
|
||||
cudaCheckError("Memcopy device to device");
|
||||
|
||||
// Measure time of individual levels.
|
||||
if(forward)
|
||||
fdwt(in, out, pixWidth, pixHeight, stages, diffOut);
|
||||
else
|
||||
rdwt(in, out, pixWidth, pixHeight, stages);
|
||||
|
||||
// Measure overall time of DWT.
|
||||
#ifdef GPU_DWT_TESTING_1
|
||||
|
||||
dwt_cuda::CudaDWTTester tester;
|
||||
for(int i = tester.getNumIterations(); i--; ) {
|
||||
// Recover input and measure one overall DWT run.
|
||||
cudaMemcpy(in, backup, size, cudaMemcpyDeviceToDevice);
|
||||
cudaCheckError("Memcopy device to device");
|
||||
tester.beginTestIteration();
|
||||
if(forward)
|
||||
fdwt(in, out, pixWidth, pixHeight, stages, diffOut);
|
||||
else
|
||||
rdwt(in, out, pixWidth, pixHeight, stages);
|
||||
tester.endTestIteration();
|
||||
}
|
||||
tester.showPerformance(" Overall DWT", pixWidth, pixHeight);
|
||||
#endif // GPU_DWT_TESTING
|
||||
|
||||
cudaCheckAsyncError("DWT Kernel calls");
|
||||
return 0;
|
||||
}
|
||||
template int nStage2dDWT<float>(float*, float*, float*, int, int, int, bool, float*);
|
||||
template int nStage2dDWT<int>(int*, int*, int*, int, int, int, bool, int*);
|
||||
|
||||
*/
|
||||
|
||||
/*
 * Convert float samples to bytes and log every converted value.
 *
 * Each sample is mapped with (src[i] + 0.5) * 255, clamped to [0, 255] and
 * truncated into dst[i]. The clamped value (before truncation) is also
 * written, one line per sample, to the text file "<filename>.txt".
 *
 *   dst         output buffer, at least samplesNum bytes
 *   src         input samples, at least samplesNum floats
 *   samplesNum  number of samples to convert
 *   filename    base name of the log file; ".txt" is appended
 */
void samplesToChar(unsigned char * dst, float * src, int samplesNum, const char * filename)
{
    /* std::string sizes itself; the former fixed array had no room for the
       terminating NUL and was overrun by one byte. */
    const std::string outfile = std::string(filename) + ".txt";
    std::ofstream outputFile;
    outputFile.open(outfile.c_str());

    for (int i = 0; i < samplesNum; i++) {
        float r = (src[i]+0.5f) * 255;
        if (r > 255) r = 255;
        if (r < 0) r = 0;
        dst[i] = (unsigned char)r;
        outputFile << "index: " << i << " val: "<< r <<" \n";
    }
    outputFile.close();
}
|
||||
|
||||
/*
 * Convert int samples to bytes and log every converted value.
 *
 * Each sample is shifted by +128, clamped to [0, 255] and stored in dst[i].
 * The clamped value is also written, one line per sample, to the text file
 * "<filename>.txt" (kept as an output check).
 *
 *   dst         output buffer, at least samplesNum bytes
 *   src         input samples, at least samplesNum ints
 *   samplesNum  number of samples to convert
 *   filename    base name of the log file; ".txt" is appended
 */
void samplesToChar(unsigned char * dst, int * src, int samplesNum, const char * filename)
{
    /* std::string sizes itself; the former fixed array had no room for the
       terminating NUL and was overrun by one byte. */
    const std::string outfile = std::string(filename) + ".txt";
    std::ofstream outputFile;
    outputFile.open(outfile.c_str());

    for (int i = 0; i < samplesNum; i++) {
        int r = src[i]+128;
        if (r > 255) r = 255;
        if (r < 0) r = 0;
        dst[i] = (unsigned char)r;
        outputFile << "index: " << i << " val: "<< r <<" \n";
    }
    outputFile.close();
}
|
||||
|
||||
///* Write output linear orderd*/
|
||||
template<typename T>
|
||||
int writeLinear(T *component_cuda, int pixWidth, int pixHeight,
|
||||
const char * filename, const char * suffix)
|
||||
{
|
||||
unsigned char * result;
|
||||
T *gpu_output;
|
||||
int i;
|
||||
int size;
|
||||
int samplesNum = pixWidth*pixHeight;
|
||||
|
||||
size = samplesNum*sizeof(T);
|
||||
cudaMallocHost((void **)&gpu_output, size);
|
||||
cudaCheckError("Malloc host");
|
||||
memset(gpu_output, 0, size);
|
||||
result = (unsigned char *)malloc(samplesNum);
|
||||
cudaMemcpy(gpu_output, component_cuda, size, cudaMemcpyDeviceToHost);
|
||||
cudaCheckError("Memcopy device to host");
|
||||
|
||||
/* T to char */
|
||||
samplesToChar(result, gpu_output, samplesNum, filename);
|
||||
|
||||
/* Write component */
|
||||
char outfile[strlen(filename)+strlen(suffix)];
|
||||
strcpy(outfile, filename);
|
||||
strcpy(outfile+strlen(filename), suffix);
|
||||
i = open(outfile, O_CREAT|O_WRONLY, 0644);
|
||||
if (i == -1) {
|
||||
error(0,errno,"cannot access %s", outfile);
|
||||
return -1;
|
||||
}
|
||||
printf("\nWriting to %s (%d x %d)\n", outfile, pixWidth, pixHeight);
|
||||
ssize_t x ;
|
||||
x = write(i, result, samplesNum);
|
||||
close(i);
|
||||
|
||||
/* Clean up */
|
||||
cudaFreeHost(gpu_output);
|
||||
cudaCheckError("Cuda free host memory");
|
||||
free(result);
|
||||
if(x == 0) return 1;
|
||||
return 0;
|
||||
}
|
||||
template int writeLinear<float>(float *component_cuda, int pixWidth, int pixHeight, const char * filename, const char * suffix);
|
||||
template int writeLinear<int>(int *component_cuda, int pixWidth, int pixHeight, const char * filename, const char * suffix);
|
||||
|
||||
/* Write output visual ordered */
|
||||
template<typename T>
|
||||
int writeNStage2DDWT(T *component_cuda, int pixWidth, int pixHeight,
|
||||
int stages, const char * filename, const char * suffix)
|
||||
{
|
||||
struct band {
|
||||
int dimX;
|
||||
int dimY;
|
||||
};
|
||||
struct dimensions {
|
||||
struct band LL;
|
||||
struct band HL;
|
||||
struct band LH;
|
||||
struct band HH;
|
||||
};
|
||||
|
||||
unsigned char * result;
|
||||
T *src, *dst;
|
||||
int i,s;
|
||||
int size;
|
||||
int offset;
|
||||
int yOffset;
|
||||
int samplesNum = pixWidth*pixHeight;
|
||||
struct dimensions * bandDims;
|
||||
|
||||
bandDims = (struct dimensions *)malloc(stages * sizeof(struct dimensions));
|
||||
|
||||
bandDims[0].LL.dimX = DIVANDRND(pixWidth,2);
|
||||
bandDims[0].LL.dimY = DIVANDRND(pixHeight,2);
|
||||
bandDims[0].HL.dimX = pixWidth - bandDims[0].LL.dimX;
|
||||
bandDims[0].HL.dimY = bandDims[0].LL.dimY;
|
||||
bandDims[0].LH.dimX = bandDims[0].LL.dimX;
|
||||
bandDims[0].LH.dimY = pixHeight - bandDims[0].LL.dimY;
|
||||
bandDims[0].HH.dimX = bandDims[0].HL.dimX;
|
||||
bandDims[0].HH.dimY = bandDims[0].LH.dimY;
|
||||
|
||||
for (i = 1; i < stages; i++) {
|
||||
bandDims[i].LL.dimX = DIVANDRND(bandDims[i-1].LL.dimX,2);
|
||||
bandDims[i].LL.dimY = DIVANDRND(bandDims[i-1].LL.dimY,2);
|
||||
bandDims[i].HL.dimX = bandDims[i-1].LL.dimX - bandDims[i].LL.dimX;
|
||||
bandDims[i].HL.dimY = bandDims[i].LL.dimY;
|
||||
bandDims[i].LH.dimX = bandDims[i].LL.dimX;
|
||||
bandDims[i].LH.dimY = bandDims[i-1].LL.dimY - bandDims[i].LL.dimY;
|
||||
bandDims[i].HH.dimX = bandDims[i].HL.dimX;
|
||||
bandDims[i].HH.dimY = bandDims[i].LH.dimY;
|
||||
}
|
||||
|
||||
#if 0
|
||||
printf("Original image pixWidth x pixHeight: %d x %d\n", pixWidth, pixHeight);
|
||||
for (i = 0; i < stages; i++) {
|
||||
printf("Stage %d: LL: pixWidth x pixHeight: %d x %d\n", i, bandDims[i].LL.dimX, bandDims[i].LL.dimY);
|
||||
printf("Stage %d: HL: pixWidth x pixHeight: %d x %d\n", i, bandDims[i].HL.dimX, bandDims[i].HL.dimY);
|
||||
printf("Stage %d: LH: pixWidth x pixHeight: %d x %d\n", i, bandDims[i].LH.dimX, bandDims[i].LH.dimY);
|
||||
printf("Stage %d: HH: pixWidth x pixHeight: %d x %d\n", i, bandDims[i].HH.dimX, bandDims[i].HH.dimY);
|
||||
}
|
||||
#endif
|
||||
|
||||
size = samplesNum*sizeof(T);
|
||||
cudaMallocHost((void **)&src, size);
|
||||
cudaCheckError("Malloc host");
|
||||
dst = (T*)malloc(size);
|
||||
memset(src, 0, size);
|
||||
memset(dst, 0, size);
|
||||
result = (unsigned char *)malloc(samplesNum);
|
||||
cudaMemcpy(src, component_cuda, size, cudaMemcpyDeviceToHost);
|
||||
cudaCheckError("Memcopy device to host");
|
||||
|
||||
// LL Band
|
||||
size = bandDims[stages-1].LL.dimX * sizeof(T);
|
||||
for (i = 0; i < bandDims[stages-1].LL.dimY; i++) {
|
||||
memcpy(dst+i*pixWidth, src+i*bandDims[stages-1].LL.dimX, size);
|
||||
}
|
||||
|
||||
for (s = stages - 1; s >= 0; s--) {
|
||||
// HL Band
|
||||
size = bandDims[s].HL.dimX * sizeof(T);
|
||||
offset = bandDims[s].LL.dimX * bandDims[s].LL.dimY;
|
||||
for (i = 0; i < bandDims[s].HL.dimY; i++) {
|
||||
memcpy(dst+i*pixWidth+bandDims[s].LL.dimX,
|
||||
src+offset+i*bandDims[s].HL.dimX,
|
||||
size);
|
||||
}
|
||||
|
||||
// LH band
|
||||
size = bandDims[s].LH.dimX * sizeof(T);
|
||||
offset += bandDims[s].HL.dimX * bandDims[s].HL.dimY;
|
||||
yOffset = bandDims[s].LL.dimY;
|
||||
for (i = 0; i < bandDims[s].HL.dimY; i++) {
|
||||
memcpy(dst+(yOffset+i)*pixWidth,
|
||||
src+offset+i*bandDims[s].LH.dimX,
|
||||
size);
|
||||
}
|
||||
|
||||
//HH band
|
||||
size = bandDims[s].HH.dimX * sizeof(T);
|
||||
offset += bandDims[s].LH.dimX * bandDims[s].LH.dimY;
|
||||
yOffset = bandDims[s].HL.dimY;
|
||||
for (i = 0; i < bandDims[s].HH.dimY; i++) {
|
||||
memcpy(dst+(yOffset+i)*pixWidth+bandDims[s].LH.dimX,
|
||||
src+offset+i*bandDims[s].HH.dimX,
|
||||
size);
|
||||
}
|
||||
}
|
||||
|
||||
/* Write component */
|
||||
samplesToChar(result, dst, samplesNum, filename);
|
||||
|
||||
char outfile[strlen(filename)+strlen(suffix)];
|
||||
strcpy(outfile, filename);
|
||||
strcpy(outfile+strlen(filename), suffix);
|
||||
i = open(outfile, O_CREAT|O_WRONLY, 0644);
|
||||
if (i == -1) {
|
||||
error(0,errno,"cannot access %s", outfile);
|
||||
return -1;
|
||||
}
|
||||
printf("\nWriting to %s (%d x %d)\n", outfile, pixWidth, pixHeight);
|
||||
ssize_t x;
|
||||
x = write(i, result, samplesNum);
|
||||
close(i);
|
||||
|
||||
cudaFreeHost(src);
|
||||
cudaCheckError("Cuda free host memory");
|
||||
free(dst);
|
||||
free(result);
|
||||
free(bandDims);
|
||||
if (x == 0) return 1;
|
||||
return 0;
|
||||
}
|
||||
template int writeNStage2DDWT<float>(float *component_cuda, int pixWidth, int pixHeight, int stages, const char * filename, const char * suffix);
|
||||
template int writeNStage2DDWT<int>(int *component_cuda, int pixWidth, int pixHeight, int stages, const char * filename, const char * suffix);
|
|
@ -1,41 +0,0 @@
|
|||
/*
|
||||
* Copyright (c) 2009, Jiri Matela
|
||||
* All rights reserved.
|
||||
*
|
||||
* Redistribution and use in source and binary forms, with or without
|
||||
* modification, are permitted provided that the following conditions are met:
|
||||
*
|
||||
* * Redistributions of source code must retain the above copyright
|
||||
* notice, this list of conditions and the following disclaimer.
|
||||
* * Redistributions in binary form must reproduce the above copyright
|
||||
* notice, this list of conditions and the following disclaimer in the
|
||||
* documentation and/or other materials provided with the distribution.
|
||||
*
|
||||
* THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
|
||||
* AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
|
||||
* IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
|
||||
* ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE
|
||||
* LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
|
||||
* CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
|
||||
* SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
|
||||
* INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
|
||||
* CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
|
||||
* ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
|
||||
* POSSIBILITY OF SUCH DAMAGE.
|
||||
*/
|
||||
|
||||
#ifndef _DWT_H
#define _DWT_H

// Declarations only; the definitions live in the implementation file.

/// N-stage 2D DWT driver.
/// @param in        input buffer
/// @param out       output buffer
/// @param backup    auxiliary buffer (NOTE(review): exact role is defined by
///                  the implementation, which is not visible here)
/// @param pixWidth  image width in pixels
/// @param pixHeight image height in pixels
/// @param stages    number of DWT stages
/// @param forward   selects the transform direction
/// @return status code defined by the implementation
template <class T>
int nStage2dDWT(T *in, T *out, T *backup,
                int pixWidth, int pixHeight,
                int stages, bool forward);

/// Writes the result of an n-stage 2D DWT; output naming is derived from
/// `filename` and `suffix` by the implementation.
/// @return status code defined by the implementation
template <class T>
int writeNStage2DDWT(T *component_cuda,
                     int width, int height, int stages,
                     const char *filename, const char *suffix);

/// Writes a component buffer without band reordering (presumably a plain
/// row-major dump -- confirm against the implementation).
/// @return status code defined by the implementation
template <class T>
int writeLinear(T *component_cuda,
                int width, int height,
                const char *filename, const char *suffix);

#endif // _DWT_H
|
|
@ -1,35 +0,0 @@
|
|||
///
|
||||
/// @file common.cu
|
||||
/// @author Martin Jirman (207962@mail.muni.cz)
|
||||
/// @date 2011-01-20 14:37
|
||||
///
|
||||
/// Copyright (c) 2011 Martin Jirman
|
||||
/// All rights reserved.
|
||||
///
|
||||
/// Redistribution and use in source and binary forms, with or without
|
||||
/// modification, are permitted provided that the following conditions are met:
|
||||
///
|
||||
/// * Redistributions of source code must retain the above copyright
|
||||
/// notice, this list of conditions and the following disclaimer.
|
||||
/// * Redistributions in binary form must reproduce the above copyright
|
||||
/// notice, this list of conditions and the following disclaimer in the
|
||||
/// documentation and/or other materials provided with the distribution.
|
||||
///
|
||||
/// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
|
||||
/// AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
|
||||
/// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
|
||||
/// ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE
|
||||
/// LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
|
||||
/// CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
|
||||
/// SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
|
||||
/// INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
|
||||
/// CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
|
||||
/// ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
|
||||
/// POSSIBILITY OF SUCH DAMAGE.
|
||||
///
|
||||
|
||||
#include "common.h"
|
||||
|
||||
namespace dwt_cuda {

// Out-of-class definition of the static flag declared in CudaDWTTester.
// It starts out false, i.e. no test iteration is in progress.
bool CudaDWTTester::testRunning = false;

} // namespace dwt_cuda
|
|
@ -1,232 +0,0 @@
|
|||
///
|
||||
/// @file common.h
|
||||
/// @author Martin Jirman (207962@mail.muni.cz)
|
||||
/// @brief Common stuff for all CUDA dwt functions.
|
||||
/// @date 2011-01-20 14:19
|
||||
///
|
||||
/// Copyright (c) 2011 Martin Jirman
|
||||
/// All rights reserved.
|
||||
///
|
||||
/// Redistribution and use in source and binary forms, with or without
|
||||
/// modification, are permitted provided that the following conditions are met:
|
||||
///
|
||||
/// * Redistributions of source code must retain the above copyright
|
||||
/// notice, this list of conditions and the following disclaimer.
|
||||
/// * Redistributions in binary form must reproduce the above copyright
|
||||
/// notice, this list of conditions and the following disclaimer in the
|
||||
/// documentation and/or other materials provided with the distribution.
|
||||
///
|
||||
/// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
|
||||
/// AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
|
||||
/// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
|
||||
/// ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE
|
||||
/// LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
|
||||
/// CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
|
||||
/// SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
|
||||
/// INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
|
||||
/// CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
|
||||
/// ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
|
||||
/// POSSIBILITY OF SUCH DAMAGE.
|
||||
///
|
||||
|
||||
#ifndef DWT_COMMON_H
|
||||
#define DWT_COMMON_H
|
||||
|
||||
#include <algorithm>
|
||||
#include <cstdio>
|
||||
#include <vector>
|
||||
|
||||
// compile time minimum macro
|
||||
#define CTMIN(a, b) (((a) < (b)) ? (a) : (b))
|
||||
|
||||
// Performance-testing macros.
//
// PERF_BEGIN and PERF_END(name, w, h) must be used as a pair around the
// statements to be measured. When GPU_DWT_TESTING is defined they open a
// scope holding a dwt_cuda::CudaDWTTester, repeat the enclosed statements
// getNumIterations() times (bracketing each repetition with
// beginTestIteration()/endTestIteration()) and finally report through
// showPerformance(). Without GPU_DWT_TESTING both macros expand to nothing,
// so the enclosed statements run exactly once.
#ifdef GPU_DWT_TESTING

#define PERF_BEGIN                                                             \
  {                                                                            \
    dwt_cuda::CudaDWTTester PERF_TESTER;                                       \
    for (int PERF_N = PERF_TESTER.getNumIterations(); PERF_N--;) {             \
      PERF_TESTER.beginTestIteration();

#define PERF_END(PERF_NAME, PERF_W, PERF_H)                                    \
      PERF_TESTER.endTestIteration();                                          \
    }                                                                          \
    PERF_TESTER.showPerformance(PERF_NAME, PERF_W, PERF_H);                    \
  }

#else // GPU_DWT_TESTING

#define PERF_BEGIN
#define PERF_END(PERF_NAME, PERF_W, PERF_H)

#endif // GPU_DWT_TESTING
|
||||
|
||||
namespace dwt_cuda {
|
||||
|
||||
/// Integer division that rounds the quotient up whenever the division
/// leaves a remainder.
/// @param n dividend
/// @param d divisor (must be non-zero)
/// @return n / d, plus one if n is not a multiple of d
template <typename T>
__device__ __host__ inline T divRndUp(const T &n, const T &d) {
  const bool hasRemainder = (n % d) ? true : false;
  return (n / d) + (hasRemainder ? 1 : 0);
}
|
||||
|
||||
// 9/7 forward DWT lifting schema coefficients
|
||||
const float f97Predict1 = -1.586134342; ///< forward 9/7 predict 1
|
||||
const float f97Update1 = -0.05298011854; ///< forward 9/7 update 1
|
||||
const float f97Predict2 = 0.8829110762; ///< forward 9/7 predict 2
|
||||
const float f97Update2 = 0.4435068522; ///< forward 9/7 update 2
|
||||
|
||||
// 9/7 reverse DWT lifting schema coefficients
|
||||
const float r97update2 = -f97Update2; ///< undo 9/7 update 2
|
||||
const float r97predict2 = -f97Predict2; ///< undo 9/7 predict 2
|
||||
const float r97update1 = -f97Update1; ///< undo 9/7 update 1
|
||||
const float r97Predict1 = -f97Predict1; ///< undo 9/7 predict 1
|
||||
|
||||
// FDWT 9/7 scaling coefficients
|
||||
const float scale97Mul = 1.23017410491400f;
|
||||
const float scale97Div = 1.0 / scale97Mul;
|
||||
|
||||
// 5/3 forward DWT lifting schema coefficients
|
||||
const float forward53Predict = -0.5f; /// forward 5/3 predict
|
||||
const float forward53Update = 0.25f; /// forward 5/3 update
|
||||
|
||||
// 5/3 reverse DWT lifting schema coefficients
|
||||
const float reverse53Update = -forward53Update; /// undo 5/3 update
|
||||
const float reverse53Predict = -forward53Predict; /// undo 5/3 predict
|
||||
|
||||
/// Functor that adds the scaled sum of both neighbors to the central pixel:
/// c += scale * (p + n).
struct AddScaledSum {
  const float scale; ///< factor applied to the sum of the two neighbors

  /// @param scale factor applied to the sum of the two neighbors
  __device__ AddScaledSum(const float scale) : scale(scale) {}

  /// @param p previous neighbor
  /// @param c central pixel, updated in place
  /// @param n next neighbor
  __device__ void operator()(const float p, float &c, const float n) const {
    c += scale * (p + n);
  }
};
|
||||
|
||||
/// Maps threadIdx.x to a parity-separated index: the first half of the
/// threads receive the even indices, the second half the odd ones, and no
/// two threads share an index.
/// Example (8 threads)  threadIdx.x: 0 1 2 3 4 5 6 7
///                      parityIdx:   0 2 4 6 1 3 5 7
/// @tparam THREADS total count of participating threads
/// @return parity-separated index of the calling thread
template <int THREADS> __device__ inline int parityIdx() {
  // 0 for threads in the first half, 1 for threads in the second half
  // (for threadIdx.x < THREADS)
  const unsigned int upperHalf = threadIdx.x / (THREADS / 2);
  return (threadIdx.x * 2) - (THREADS - 1) * upperHalf;
}
|
||||
|
||||
/// size of shared memory
|
||||
#if defined(__CUDA_ARCH__) && (__CUDA_ARCH__ >= 200)
|
||||
const int SHM_SIZE = 48 * 1024;
|
||||
#else
|
||||
const int SHM_SIZE = 16 * 1024;
|
||||
#endif
|
||||
|
||||
/// Performance and return code tester.
///
/// An instance times repeated runs of a piece of GPU code with CUDA events
/// (see the PERF_BEGIN / PERF_END macros). Only one tester is active at a
/// time: an instance constructed while another test iteration is running is
/// created disabled and does nothing.
class CudaDWTTester {
private:
  static bool testRunning;  ///< true if any test is currently running
  cudaEvent_t beginEvent;   ///< begin CUDA event
  cudaEvent_t endEvent;     ///< end CUDA event
  std::vector<float> times; ///< collected times (milliseconds)
  const bool disabled;      ///< true if this object is disabled

public:
  /// Checks CUDA related error.
  /// Errors are reported only when GPU_DWT_TESTING is defined and no test
  /// iteration is running; otherwise the status is ignored.
  /// @param status   return code to be checked
  /// @param message  message to be shown if there was an error
  /// @return true if there was no (reported) error, false otherwise
  static bool check(const cudaError_t &status, const char *message) {
#if defined(GPU_DWT_TESTING)
    if ((!testRunning) && status != cudaSuccess) {
      const char *errorString = cudaGetErrorString(status);
      fprintf(stderr, "CUDA ERROR: '%s': %s\n", message, errorString);
      fflush(stderr);
      return false;
    }
#endif // GPU_DWT_TESTING
    return true;
  }

  /// Checks last kernel call for errors.
  /// With GPU_DWT_TESTING defined and no test iteration running, this
  /// synchronizes with the device before checking.
  /// @param message  description of the kernel call
  /// @return true if there was no error, false otherwise
  static bool checkLastKernelCall(const char *message) {
#if defined(GPU_DWT_TESTING)
    // cudaDeviceSynchronize() replaces the deprecated
    // cudaThreadSynchronize().
    return testRunning ? true : check(cudaDeviceSynchronize(), message);
#else  // GPU_DWT_TESTING
    return true;
#endif // GPU_DWT_TESTING
  }

  /// Initializes DWT tester for time measurement. The tester is disabled
  /// if another test iteration is already running.
  CudaDWTTester() : disabled(testRunning) {}

  /// Gets preferred number of iterations (a single one when disabled).
  int getNumIterations() { return disabled ? 1 : 31; }

  /// Starts one test iteration.
  void beginTestIteration() {
    if (!disabled) {
      cudaEventCreate(&beginEvent);
      cudaEventCreate(&endEvent);
      cudaEventRecord(beginEvent, 0);
      testRunning = true;
    }
  }

  /// Ends one test iteration and stores its elapsed time.
  void endTestIteration() {
    if (!disabled) {
      float time;
      testRunning = false;
      cudaEventRecord(endEvent, 0);
      cudaEventSynchronize(endEvent);
      cudaEventElapsedTime(&time, beginEvent, endEvent);
      cudaEventDestroy(beginEvent);
      cudaEventDestroy(endEvent);
      times.push_back(time);
    }
  }

  /// Shows brief info about all iterations. Sorts the collected times as a
  /// side effect. Prints nothing if the tester is disabled or no iteration
  /// has been recorded.
  /// @param name   name of processing method
  /// @param sizeX  width of processed image
  /// @param sizeY  height of processed image
  void showPerformance(const char *name, const int sizeX, const int sizeY) {
    // Without any sample the median/max lookups below would index an
    // empty vector.
    if (disabled || times.empty()) {
      return;
    }

    // compute mean and median
    std::sort(times.begin(), times.end());
    double sum = 0;
    for (int i = times.size(); i--;) {
      sum += times[i];
    }
    const double median =
        (times[times.size() / 2] + times[(times.size() - 1) / 2]) * 0.5f;
    printf(" %s: %7.3f ms (mean) %7.3f ms (median) %7.3f ms (max) "
           "(%d x %d)\n",
           name, (sum / times.size()), median, times[times.size() - 1], sizeX,
           sizeY);
  }
};
|
||||
|
||||
/// Device-to-device cudaMemcpy of an sx * sy image, wrapped in the
/// performance tester macros.
/// @param dest  destination buffer
/// @param src   source buffer
/// @param sx    width of copied image
/// @param sy    height of copied image
template <typename T>
inline void memCopy(T *const dest, const T *const src, const size_t sx,
                    const size_t sy) {
  const size_t byteCount = sx * sy * sizeof(T);
  cudaError_t copyStatus;
  PERF_BEGIN
  copyStatus = cudaMemcpy(dest, src, byteCount, cudaMemcpyDeviceToDevice);
  PERF_END(" memcpy", sx, sy)
  CudaDWTTester::check(copyStatus, "memcpy device > device");
}
|
||||
|
||||
} // end of namespace dwt_cuda
|
||||
|
||||
#endif // DWT_COMMON_H
|
|
@ -1,103 +0,0 @@
|
|||
///
|
||||
/// @file dwt.h
|
||||
/// @author Martin Jirman (207962@mail.muni.cz)
|
||||
/// @brief Entry points for CUDA implementation of 9/7 and 5/3 DWT.
|
||||
/// @date 2011-01-20 11:41
|
||||
///
|
||||
///
|
||||
///
|
||||
/// Copyright (c) 2011 Martin Jirman
|
||||
/// All rights reserved.
|
||||
///
|
||||
/// Redistribution and use in source and binary forms, with or without
|
||||
/// modification, are permitted provided that the following conditions are met:
|
||||
///
|
||||
/// * Redistributions of source code must retain the above copyright
|
||||
/// notice, this list of conditions and the following disclaimer.
|
||||
/// * Redistributions in binary form must reproduce the above copyright
|
||||
/// notice, this list of conditions and the following disclaimer in the
|
||||
/// documentation and/or other materials provided with the distribution.
|
||||
///
|
||||
/// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
|
||||
/// AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
|
||||
/// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
|
||||
/// ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE
|
||||
/// LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
|
||||
/// CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
|
||||
/// SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
|
||||
/// INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
|
||||
/// CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
|
||||
/// ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
|
||||
/// POSSIBILITY OF SUCH DAMAGE.
|
||||
///
|
||||
///
|
||||
///
|
||||
/// Following conditions are common for all four DWT functions:
|
||||
/// - Both input and output images are stored in GPU memory with no padding
|
||||
/// of lines or interleaving of pixels.
|
||||
/// - DWT coefficients are stored as follows: Each band is saved as one
|
||||
/// consecutive chunk (no padding/stride/interleaving). Deepest level bands
|
||||
/// (smallest ones) are stored first (at the beginning of the input/output
|
||||
/// buffers), less deep bands follow. There is no padding between stored
|
||||
/// bands in the buffer. Order of bands of the same level in the buffer is
|
||||
/// following: Low-low band (or deeper level subbands) is stored first.
|
||||
/// Vertical-low/horizontal-high band follows. Vertical-high/horizontal-low
|
||||
/// band is saved next and finally, the high-high band is saved. Out of all
|
||||
/// low-low bands, only the deepest one is saved (right at the beginning of
|
||||
/// the buffer), others are replaced with deeper level subbands.
|
||||
/// - Input images of all functions won't be preserved (will be overwritten).
|
||||
/// - Input and output buffers can't overlap.
|
||||
/// - Size of output buffer must be greater or equal to size of input buffer.
|
||||
///
|
||||
/// There are no common compile time settings (buffer size, etc...) for
|
||||
/// all DWTs, because each DWT type needs different amount of GPU resources.
|
||||
/// Instead, each DWT type has its own compile time settings, which can be
|
||||
/// found in *.cu file, where it is implemented.
|
||||
///
|
||||
|
||||
#ifndef DWT_CUDA_H
|
||||
#define DWT_CUDA_H
|
||||
|
||||
namespace dwt_cuda {
|
||||
|
||||
/// Forward 5/3 2D DWT. See common rules (above) for more details.
|
||||
/// @param in Expected to be normalized into range [-128, 127].
|
||||
/// Will not be preserved (will be overwritten).
|
||||
/// @param out output buffer on GPU
|
||||
/// @param sizeX width of input image (in pixels)
|
||||
/// @param sizeY height of input image (in pixels)
|
||||
/// @param levels number of recursive DWT levels
|
||||
void fdwt53(int *in, int *out, int sizeX, int sizeY, int levels);
|
||||
|
||||
/// Reverse 5/3 2D DWT. See common rules (above) for more details.
|
||||
/// @param in Input DWT coefficients. Format described in common rules.
|
||||
/// Will not be preserved (will be overwritten).
|
||||
/// @param out output buffer on GPU - will contain original image
|
||||
/// in normalized range [-128, 127].
|
||||
/// @param sizeX width of input image (in pixels)
|
||||
/// @param sizeY height of input image (in pixels)
|
||||
/// @param levels number of recursive DWT levels
|
||||
void rdwt53(int *in, int *out, int sizeX, int sizeY, int levels);
|
||||
|
||||
/// Forward 9/7 2D DWT. See common rules (above) for more details.
|
||||
/// @param in Input image. Should be normalized (in range
|
||||
/// [-0.5, 0.5]). Will not be preserved (will be overwritten).
|
||||
/// @param out output buffer on GPU - format specified in common rules
|
||||
/// @param sizeX width of input image (in pixels)
|
||||
/// @param sizeY height of input image (in pixels)
|
||||
/// @param levels number of recursive DWT levels
|
||||
void fdwt97(float *in, float *out, int sizeX, int sizeY, int levels);
|
||||
|
||||
/// Reverse 9/7 2D DWT. See common rules (above) for more details.
|
||||
/// @param in Input DWT coefficients. Format described in common rules.
|
||||
/// Will not be preserved (will be overwritten).
|
||||
/// @param out output buffer on GPU - will contain original image
|
||||
/// in normalized range [-0.5, 0.5].
|
||||
/// @param sizeX width of input image (in pixels)
|
||||
/// @param sizeY height of input image (in pixels)
|
||||
/// @param levels number of recursive DWT levels
|
||||
void rdwt97(float *in, float *out, int sizeX, int sizeY, int levels);
|
||||
|
||||
} // namespace dwt_cuda
|
||||
|
||||
#endif // DWT_CUDA_H
|
|
@ -1,400 +0,0 @@
|
|||
/// @file fdwt53.cu
|
||||
/// @brief CUDA implementation of forward 5/3 2D DWT.
|
||||
/// @author Martin Jirman (207962@mail.muni.cz)
|
||||
/// @date 2011-02-04 13:23
|
||||
///
|
||||
///
|
||||
/// Copyright (c) 2011 Martin Jirman
|
||||
/// All rights reserved.
|
||||
///
|
||||
/// Redistribution and use in source and binary forms, with or without
|
||||
/// modification, are permitted provided that the following conditions are met:
|
||||
///
|
||||
/// * Redistributions of source code must retain the above copyright
|
||||
/// notice, this list of conditions and the following disclaimer.
|
||||
/// * Redistributions in binary form must reproduce the above copyright
|
||||
/// notice, this list of conditions and the following disclaimer in the
|
||||
/// documentation and/or other materials provided with the distribution.
|
||||
///
|
||||
/// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
|
||||
/// AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
|
||||
/// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
|
||||
/// ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE
|
||||
/// LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
|
||||
/// CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
|
||||
/// SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
|
||||
/// INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
|
||||
/// CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
|
||||
/// ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
|
||||
/// POSSIBILITY OF SUCH DAMAGE.
|
||||
///
|
||||
|
||||
|
||||
#include "common.h"
|
||||
#include "transform_buffer.h"
|
||||
#include "io.h"
|
||||
|
||||
namespace dwt_cuda {
|
||||
|
||||
|
||||
/// One level of the forward 5/3 DWT computed with a sliding window.
/// Bundles the transform buffer with the device routines that fill it,
/// transform it and write it out.
/// @tparam WIN_SIZE_X  width of sliding window
/// @tparam WIN_SIZE_Y  height of sliding window
template <int WIN_SIZE_X, int WIN_SIZE_Y>
class FDWT53 {
private:

  /// State a thread keeps for one input column between window steps.
  /// @tparam CHECKED_LOADER  true if the column's loader should check
  ///                         boundaries, false if no boundary is nearby
  template <bool CHECKED_LOADER>
  struct FDWT53Column {
    /// loader for the column
    VerticalDWTPixelLoader<int, CHECKED_LOADER> loader;

    /// offset of the column in the shared buffer
    int offset;

    /// backup of the first 3 loaded pixels (not transformed)
    int pixel0, pixel1, pixel2;

    /// Zeroes the plain fields and clears the loader; exists only to
    /// prevent 'uninitialized' warnings.
    __device__ void clear() {
      pixel2 = 0;
      pixel1 = 0;
      pixel0 = 0;
      offset = 0;
      loader.clear();
    }
  };


  /// Type of the buffer used for 5/3 FDWT transforms.
  typedef TransformBuffer<int, WIN_SIZE_X, WIN_SIZE_Y + 3, 2> FDWT53Buffer;

  /// The transform buffer itself (the enclosing object is declared
  /// __shared__ in run()).
  FDWT53Buffer buffer;

  /// Difference between buffer indices of two vertical neighbors.
  enum { STRIDE = FDWT53Buffer::VERTICAL_STRIDE };


  /// Forward 5/3 DWT predict operation.
  struct Forward53Predict {
    /// @param p previous neighbor
    /// @param c center sample, updated in place
    /// @param n next neighbor
    __device__ void operator() (const int p, int & c, const int n) const {
      // F.8, page 126, ITU-T Rec. T.800 final draft
      // NOTE(review): integer division truncates toward zero for negative
      // sums; confirm this matches the rounding the reverse transform uses.
      c = c - (p + n) / 2;
    }
  };


  /// Forward 5/3 DWT update operation.
  struct Forward53Update {
    /// @param p previous neighbor
    /// @param c center sample, updated in place
    /// @param n next neighbor
    __device__ void operator() (const int p, int & c, const int n) const {
      // F.9, page 126, ITU-T Rec. T.800 final draft
      c = c + (p + n + 2) / 4;
    }
  };


  /// Initializes one column: computes its offset in the transform buffer,
  /// initializes its loader and uses it to load the first 3 pixels.
  /// @tparam CHECKED  true if loader of the column checks boundaries
  /// @param column    (uninitialized) column info to be initialized
  /// @param input     input image
  /// @param sizeX     width of the input image
  /// @param sizeY     height of the input image
  /// @param colIndex  x-axis coordinate of the column (relative to the left
  ///                  side of this threadblock's block of input pixels)
  /// @param firstY    y-axis coordinate of first image row to be transformed
  template <bool CHECKED>
  __device__ void initColumn(FDWT53Column<CHECKED> & column,
                             const int * const input,
                             const int sizeX, const int sizeY,
                             const int colIndex, const int firstY) {
    // offset of the column with index 'colIndex' in the buffer
    column.offset = buffer.getColumnOffset(colIndex);

    // x-coordinate of the first pixel to be loaded
    const int firstX = blockIdx.x * WIN_SIZE_X + colIndex;

    if (blockIdx.y != 0) {
      // not the topmost block - regular loading, starting two rows above
      // firstY
      column.loader.init(sizeX, sizeY, firstX, firstY - 2);

      // load 3 rows into the column
      column.pixel0 = column.loader.loadFrom(input);
      column.pixel1 = column.loader.loadFrom(input);
      column.pixel2 = column.loader.loadFrom(input);
      // The next pixel loaded by the loader continues right after these.
      return;
    }

    // topmost block - apply mirroring rules when loading first 3 rows
    column.loader.init(sizeX, sizeY, firstX, firstY);

    // load pixels in mirrored way: loaded pixels #0, #1, #2 are stored in
    // reverse order
    column.pixel2 = column.loader.loadFrom(input);  // loaded pixel #0
    column.pixel1 = column.loader.loadFrom(input);  // loaded pixel #1
    column.pixel0 = column.loader.loadFrom(input);  // loaded pixel #2

    // reinitialize loader to start with pixel #1 again
    column.loader.init(sizeX, sizeY, firstX, firstY + 1);
  }


  /// Loads and vertically transforms given column. Assumes that the first 3
  /// pixels are already present in column fields pixel0 ... pixel2.
  /// @tparam CHECKED  true if loader of the column checks boundaries
  /// @param column    column to be loaded and vertically transformed
  /// @param input     pointer to input image data
  template <bool CHECKED>
  __device__ void loadAndVerticallyTransform(FDWT53Column<CHECKED> & column,
                                             const int * const input) {
    const int base = column.offset;

    // the 3 already loaded pixels go to the top of the column
    buffer[base + 0 * STRIDE] = column.pixel0;
    buffer[base + 1 * STRIDE] = column.pixel1;
    buffer[base + 2 * STRIDE] = column.pixel2;

    // load the remaining WIN_SIZE_Y pixels below them
    for (int row = 0; row < WIN_SIZE_Y; ++row) {
      buffer[base + (row + 3) * STRIDE] = column.loader.loadFrom(input);
    }

    // remember last 3 (still untransformed) pixels for the next iteration
    column.pixel0 = buffer[base + (WIN_SIZE_Y + 0) * STRIDE];
    column.pixel1 = buffer[base + (WIN_SIZE_Y + 1) * STRIDE];
    column.pixel2 = buffer[base + (WIN_SIZE_Y + 2) * STRIDE];

    // vertically transform the column in the transform buffer
    buffer.forEachVerticalOdd(base, Forward53Predict());
    buffer.forEachVerticalEven(base, Forward53Update());
  }


  /// Actual implementation of 5/3 FDWT.
  /// @tparam CHECK_LOADS   true if input loader must check boundaries
  /// @tparam CHECK_WRITES  true if output writer must check boundaries
  /// @param in        input image
  /// @param out       output buffer
  /// @param sizeX     width of the input image
  /// @param sizeY     height of the input image
  /// @param winSteps  number of sliding window steps
  template <bool CHECK_LOADS, bool CHECK_WRITES>
  __device__ void transform(const int * const in, int * const out,
                            const int sizeX, const int sizeY,
                            const int winSteps) {
    // one main column per thread, plus one boundary column that only the
    // first few threads use
    FDWT53Column<CHECK_LOADS> column;
    FDWT53Column<CHECK_LOADS> boundaryColumn;

    // first image row handled by this threadblock
    const int firstY = blockIdx.y * WIN_SIZE_Y * winSteps;

    // set up the main column: loader, buffer offset, first 3 pixels
    initColumn(column, in, sizeX, sizeY, threadIdx.x, firstY);

    // first 3 threads initialize boundary columns, others do not use them
    boundaryColumn.clear();
    if (threadIdx.x < 3) {
      // relative x-axis coordinate of the boundary column: thread 0 takes
      // the column just past the window, threads 1 and 2 the two columns
      // in front of it
      const int colId = threadIdx.x + ((threadIdx.x == 0) ? WIN_SIZE_X : -3);

      initColumn(boundaryColumn, in, sizeX, sizeY, colId, firstY);
    }

    // index and buffer offset of the column this thread writes to output
    const int outColumnIndex = parityIdx<WIN_SIZE_X>();
    const int outColumnOffset = buffer.getColumnOffset(outColumnIndex);

    // initialize output writer for this thread
    const int outputFirstX = blockIdx.x * WIN_SIZE_X + outColumnIndex;
    VerticalDWTBandWriter<int, CHECK_WRITES> writer;
    writer.init(sizeX, sizeY, outputFirstX, firstY);
    __syncthreads();

    // Sliding window iterations:
    // each one assumes that the first 3 pixels of each column are loaded.
    for (int step = 0; step < winSteps; ++step) {

      // every column (boundary columns included): load and vertically
      // transform another WIN_SIZE_Y lines
      loadAndVerticallyTransform(column, in);
      if (threadIdx.x < 3) {
        loadAndVerticallyTransform(boundaryColumn, in);
      }

      // all columns must be vertically transformed before the horizontal
      // pass over the output rows starts
      __syncthreads();

      buffer.forEachHorizontalOdd(2, WIN_SIZE_Y, Forward53Predict());
      __syncthreads();

      buffer.forEachHorizontalEven(2, WIN_SIZE_Y, Forward53Update());

      // all output rows must be horizontally transformed before they are
      // written into the output buffer
      __syncthreads();

      for (int row = 2; row < (2 + WIN_SIZE_Y); row += 2) {
        // low coefficients of the output column go into the low band ...
        writer.writeLowInto(out, buffer[outColumnOffset + row * STRIDE]);
        // ... and high coefficients into the high band
        writer.writeHighInto(out, buffer[outColumnOffset + (row + 1) * STRIDE]);
      }

      // the next iteration overwrites the buffer, so wait until all output
      // columns have been written
      __syncthreads();
    }
  }


public:
  /// Determines whether this block's pixels touch a boundary and selects the
  /// matching version of the algorithm - for many threadblocks this is the
  /// version that does not deal with boundary mirroring and thus is slightly
  /// faster.
  /// @param in     input image
  /// @param out    output buffer
  /// @param sx     width of the input image
  /// @param sy     height of the input image
  /// @param steps  number of sliding window steps
  __device__ static void run(const int * const in, int * const out,
                             const int sx, const int sy, const int steps) {
    // object with transform buffer in shared memory
    __shared__ FDWT53<WIN_SIZE_X, WIN_SIZE_Y> sharedTransform;

    // Limits of this threadblock's block of pixels, used to decide whether
    // the threadblock has to deal with a boundary.
    // (The extra 1 is the margin the original author attributes to the
    // radius of the filter's impulse response; this is the 5/3 FDWT.)
    const int maxX = (blockIdx.x + 1) * WIN_SIZE_X + 1;
    const int maxY = (blockIdx.y + 1) * WIN_SIZE_Y * steps + 1;
    const bool touchesRight = maxX >= sx;
    const bool touchesBottom = maxY >= sy;

    // Select specialized version of code according to distance of this
    // threadblock's pixels from image boundary.
    if (touchesBottom) {
      // near bottom boundary => check both writing and reading
      sharedTransform.transform<true, true>(in, out, sx, sy, steps);
    } else if (touchesRight) {
      // near right boundary only => check writing only
      sharedTransform.transform<false, true>(in, out, sx, sy, steps);
    } else {
      // no nearby boundary => check nothing
      sharedTransform.transform<false, false>(in, out, sx, sy, steps);
    }
  }

}; // end of class FDWT53
|
||||
|
||||
|
||||
|
||||
/// GPU kernel for one level of the forward 5/3 DWT; simply forwards to
/// FDWT53<WIN_SX, WIN_SY>::run. Launch bounds: WIN_SX threads per block and
/// at most 8 blocks per multiprocessor, fewer if SHM_SIZE cannot hold that
/// many FDWT53 objects.
/// @tparam WIN_SX   width of sliding window to be used
/// @tparam WIN_SY   height of sliding window to be used
/// @param input     input image
/// @param output    output buffer
/// @param sizeX     width of the input image
/// @param sizeY     height of the input image
/// @param winSteps  number of sliding window steps
template <int WIN_SX, int WIN_SY>
__launch_bounds__(WIN_SX, CTMIN(SHM_SIZE/sizeof(FDWT53<WIN_SX, WIN_SY>), 8))
__global__ void fdwt53Kernel(const int * const input, int * const output,
                             const int sizeX, const int sizeY,
                             const int winSteps) {
  typedef FDWT53<WIN_SX, WIN_SY> Transform;
  Transform::run(input, output, sizeX, sizeY, winSteps);
}
|
||||
|
||||
|
||||
|
||||
/// Computes the number of sliding window steps and the grid size, then
/// launches the 5/3 FDWT kernel. Does nothing for an empty image
/// (non-positive width or height).
/// @tparam WIN_SX  width of sliding window
/// @tparam WIN_SY  height of sliding window
/// @param in   input image
/// @param out  output buffer
/// @param sx   width of the input image
/// @param sy   height of the input image
template <int WIN_SX, int WIN_SY>
void launchFDWT53Kernel (int * in, int * out, int sx, int sy) {
  // An empty image has nothing to transform. Without this guard sy == 0
  // yields steps == 0 and the grid computation below divides by zero.
  if (sx <= 0 || sy <= 0) {
    return;
  }

  // number of steps of each sliding window
  const int steps = divRndUp(sy, 15 * WIN_SY);

  // grid dimensions: one threadblock per WIN_SX columns and per
  // WIN_SY * steps rows
  const int gx = divRndUp(sx, WIN_SX);
  const int gy = divRndUp(sy, WIN_SY * steps);

  printf("\n sliding steps = %d , gx = %d , gy = %d \n", steps, gx, gy);

  // prepare grid size
  const dim3 gSize(gx, gy);

  // Run the kernel. Timing (PERF_BEGIN/PERF_END) and the post-launch error
  // check were disabled in the original and remain so.
  fdwt53Kernel<WIN_SX, WIN_SY><<<gSize, WIN_SX>>>(in, out, sx, sy, steps);

  // NOTE(review): no synchronization follows the launch, so this message
  // only marks that the launch call returned.
  printf("fdwt53Kernel in launchFDWT53Kernel has finished");
}
|
||||
|
||||
|
||||
|
||||
/// Forward 5/3 2D DWT. See common rules (above) for more details.
/// Each level transforms the current image and, if more levels remain,
/// copies the LL band of the output back into the input buffer and repeats
/// on that (half-sized, rounded up) image.
/// @param in      Expected to be normalized into range [-128, 127].
///                Will not be preserved (will be overwritten).
/// @param out     output buffer on GPU
/// @param sizeX   width of input image (in pixels)
/// @param sizeY   height of input image (in pixels)
/// @param levels  number of recursive DWT levels (at least one level is
///                always computed)
void fdwt53(int * in, int * out, int sizeX, int sizeY, int levels) {
  for(;;) {
    // pick the sliding window width that matches the image width
    if(sizeX < 480) {
      launchFDWT53Kernel<64, 8>(in, out, sizeX, sizeY);
    } else if(sizeX < 960) {
      launchFDWT53Kernel<128, 8>(in, out, sizeX, sizeY);
    } else {
      launchFDWT53Kernel<192, 8>(in, out, sizeX, sizeY);
    }

    // that was the last requested level
    if(levels <= 1) {
      return;
    }

    // the LL band of the output becomes the input of the next level
    const int llSizeX = divRndUp(sizeX, 2);
    const int llSizeY = divRndUp(sizeY, 2);
    memCopy(in, out, llSizeX, llSizeY);

    // continue with the remaining levels on the LL band
    sizeX = llSizeX;
    sizeY = llSizeY;
    levels -= 1;
  }
}
|
||||
|
||||
|
||||
|
||||
} // end of namespace dwt_cuda
|
|
@ -1,383 +0,0 @@
|
|||
///
|
||||
/// @file fdwt97.cu
|
||||
/// @brief CUDA implementation of forward 9/7 2D DWT.
|
||||
/// @author Martin Jirman (207962@mail.muni.cz)
|
||||
/// @date 2011-01-20 13:18
|
||||
///
|
||||
///
|
||||
/// Copyright (c) 2011 Martin Jirman
|
||||
/// All rights reserved.
|
||||
///
|
||||
/// Redistribution and use in source and binary forms, with or without
|
||||
/// modification, are permitted provided that the following conditions are met:
|
||||
///
|
||||
/// * Redistributions of source code must retain the above copyright
|
||||
/// notice, this list of conditions and the following disclaimer.
|
||||
/// * Redistributions in binary form must reproduce the above copyright
|
||||
/// notice, this list of conditions and the following disclaimer in the
|
||||
/// documentation and/or other materials provided with the distribution.
|
||||
///
|
||||
/// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
|
||||
/// AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
|
||||
/// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
|
||||
/// ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE
|
||||
/// LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
|
||||
/// CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
|
||||
/// SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
|
||||
/// INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
|
||||
/// CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
|
||||
/// ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
|
||||
/// POSSIBILITY OF SUCH DAMAGE.
|
||||
///
|
||||
|
||||
|
||||
#include "common.h"
|
||||
#include "transform_buffer.h"
|
||||
#include "io.h"
|
||||
|
||||
|
||||
namespace dwt_cuda {
|
||||
|
||||
|
||||
|
||||
/// Bundles the transform buffer with the device code computing the forward
/// 9/7 DWT by a sliding window. The window size is fixed at compile time.
/// @tparam WIN_SIZE_X  width of sliding window
/// @tparam WIN_SIZE_Y  height of sliding window
template <int WIN_SIZE_X, int WIN_SIZE_Y>
class FDWT97 {
private:
  /// TransformBuffer specialization used by the 9/7 DWT.
  typedef TransformBuffer<float, WIN_SIZE_X, WIN_SIZE_Y + 7, 4> FDWT97Buffer;

  /// The transform buffer of this object (the object itself is declared
  /// __shared__ in run()).
  FDWT97Buffer buffer;

  /// Index distance between two vertically adjacent items of the buffer.
  enum { STRIDE = FDWT97Buffer::VERTICAL_STRIDE };


  /// What one thread needs to know to keep loading one input column.
  /// @tparam CHECKED  true if loader should check for image boundaries
  template <bool CHECKED>
  struct FDWT97ColumnLoadingInfo {
    /// Reads pixels of the column from the input image.
    VerticalDWTPixelLoader<float, CHECKED> loader;

    /// Where the column starts in the transform buffer (an offset).
    int offset;
  };


  /// Runs the horizontal 9/7 lifting steps (predict 1, update 1, predict 2,
  /// update 2) and the final scaling on a range of buffer lines. A block-wide
  /// barrier precedes every step and follows the scaling.
  /// @param lines      number of lines to be transformed
  /// @param firstLine  index of the first line to be transformed
  __device__ void horizontalFDWT97(const int lines, const int firstLine) {
    __syncthreads();
    buffer.forEachHorizontalOdd(firstLine, lines, AddScaledSum(f97Predict1));

    __syncthreads();
    buffer.forEachHorizontalEven(firstLine, lines, AddScaledSum(f97Update1));

    __syncthreads();
    buffer.forEachHorizontalOdd(firstLine, lines, AddScaledSum(f97Predict2));

    __syncthreads();
    buffer.forEachHorizontalEven(firstLine, lines, AddScaledSum(f97Update2));

    __syncthreads();
    buffer.scaleHorizontal(scale97Div, scale97Mul, firstLine, lines);

    __syncthreads();
  }


  /// Fills the first 7 rows of one buffer column with (untransformed) input
  /// pixels and leaves the given loader positioned for the rows that follow.
  /// @tparam CHECKED     true if loader should check for image boundaries
  /// @param column       (uninitialized) loading info to be set up
  /// @param columnIndex  index (not offset!) of the column to be loaded,
  ///                     relative to threadblock's first column
  /// @param input        pointer to input image in GPU memory
  /// @param sizeX        width of the input image
  /// @param sizeY        height of the input image
  /// @param firstY       index of first row to be loaded from image
  template <bool CHECKED>
  __device__ void initColumn(FDWT97ColumnLoadingInfo<CHECKED> & column,
                             const int columnIndex, const float * const input,
                             const int sizeX, const int sizeY,
                             const int firstY) {
    // buffer offset of the column with index 'columnIndex'
    column.offset = buffer.getColumnOffset(columnIndex);

    // image x-coordinate of that column
    const int firstX = blockIdx.x * WIN_SIZE_X + columnIndex;

    if(blockIdx.y != 0) {
      // Not the topmost block: the 7 rows are the image rows starting
      // 4 rows above firstY, loaded in order.
      column.loader.init(sizeX, sizeY, firstX, firstY - 4);
      int item = column.offset;
      for(int row = 0; row < 7; row++, item += STRIDE) {
        buffer[item] = column.loader.loadFrom(input);
      }
    } else {
      // Topmost block: rows above the image do not exist, so the first
      // loaded pixels are placed symmetrically around buffer row 4.
      column.loader.init(sizeX, sizeY, firstX, firstY);

      const int base = column.offset;
      buffer[base + 4 * STRIDE] = column.loader.loadFrom(input);
      buffer[base + 3 * STRIDE] =
        buffer[base + 5 * STRIDE] = column.loader.loadFrom(input);
      buffer[base + 2 * STRIDE] =
        buffer[base + 6 * STRIDE] = column.loader.loadFrom(input);
      buffer[base + 1 * STRIDE] = column.loader.loadFrom(input);
      buffer[base + 0 * STRIDE] = column.loader.loadFrom(input);

      // restart the loader so that it continues with pixel #3
      column.loader.init(sizeX, sizeY, firstX, firstY + 3);
    }
    // In both cases the loader now delivers pixel #3 next.
  }


  /// Appends WIN_SIZE_Y more pixels to a column: buffer rows 7 up to
  /// (7 + WIN_SIZE_Y - 1) are overwritten with pixels from the loader.
  /// @tparam CHECKED  true if loader should check for image boundaries
  /// @param input     input image to load from
  /// @param column    loader and offset of loaded column in shared buffer
  template <bool CHECKED>
  inline __device__ void loadWindowIntoColumn(const float * const input,
                              FDWT97ColumnLoadingInfo<CHECKED> & column) {
    for(int row = 0; row < WIN_SIZE_Y; row++) {
      buffer[column.offset + (7 + row) * STRIDE] =
        column.loader.loadFrom(input);
    }
  }


  /// Body of the 9/7 FDWT for one threadblock: slides the window down the
  /// image 'winSteps' times, transforming and writing out each window.
  /// @tparam CHECK_LOADS   true if boundaries should be checked when loading
  /// @tparam CHECK_WRITES  true if boundaries should be checked when writing
  /// @param in        input image
  /// @param out       output buffer
  /// @param sizeX     width of the input image
  /// @param sizeY     height of the input image
  /// @param winSteps  number of steps of sliding window
  template <bool CHECK_LOADS, bool CHECK_WRITES>
  __device__ void transform(const float * const in, float * const out,
                            const int sizeX, const int sizeY,
                            const int winSteps) {
    // Every thread loads one main column; the first 7 threads additionally
    // load one boundary column each.
    FDWT97ColumnLoadingInfo<CHECK_LOADS> loadedColumn;
    FDWT97ColumnLoadingInfo<CHECK_LOADS> boundaryColumn;

    // first image row handled by this threadblock
    const int firstY = blockIdx.y * WIN_SIZE_Y * winSteps;

    // first 7 rows of the thread's main column
    initColumn(loadedColumn, threadIdx.x, in, sizeX, sizeY, firstY);

    // boundary column: zeroed for all threads, set up by the first 7 only
    boundaryColumn.offset = 0;
    boundaryColumn.loader.clear();
    if(threadIdx.x < 7) {
      // threads 0..2 take the 3 columns right of the window,
      // threads 3..6 take the 4 columns left of it
      const int colId = threadIdx.x + ((threadIdx.x < 3) ? WIN_SIZE_X : -7);

      // offset, first 7 pixels and loader of that boundary column
      initColumn(boundaryColumn, colId, in, sizeX, sizeY, firstY);
    }

    // horizontal transform of the 7 initial rows, all columns
    horizontalFDWT97(7, 0);

    // Column transformed vertically and written by this thread. (First half
    // of threads handle even columns and others handle odd columns.)
    const int outColumnIndex = parityIdx<WIN_SIZE_X>();

    // writer of the output bands for that column
    const int firstX = blockIdx.x * WIN_SIZE_X + outColumnIndex;
    VerticalDWTBandWriter<float, CHECK_WRITES> writer;
    writer.init(sizeX, sizeY, firstX, firstY);

    // buffer offset of that column
    const int outColumnOffset = buffer.getColumnOffset(outColumnIndex);

    // Invariant at the top of each iteration: buffer rows 0..6 hold
    // horizontally transformed coefficients.
    for(int step = 0; step < winSteps; step++) {
      // next WIN_SIZE_Y rows of the main column ...
      loadWindowIntoColumn(in, loadedColumn);

      // ... and of the boundary column, for threads that own one
      if(threadIdx.x < 7) {
        loadWindowIntoColumn(in, boundaryColumn);
      }

      // horizontal transform of the freshly loaded rows
      horizontalFDWT97(WIN_SIZE_Y, 7);

      // Keep a private copy of the last 7 rows of the column: at this point
      // they are transformed horizontally only, which is what the next
      // iteration needs as its first 7 rows.
      float last7Lines[7];
      for(int k = 0; k < 7; k++) {
        last7Lines[k] = buffer[outColumnOffset + (WIN_SIZE_Y + k) * STRIDE];
      }

      // vertical lifting steps on the thread's column (scaling comes later)
      buffer.forEachVerticalOdd(outColumnOffset, AddScaledSum(f97Predict1));
      buffer.forEachVerticalEven(outColumnOffset, AddScaledSum(f97Update1));
      buffer.forEachVerticalOdd(outColumnOffset, AddScaledSum(f97Predict2));
      buffer.forEachVerticalEven(outColumnOffset, AddScaledSum(f97Update2));

      // Valid results occupy rows 4 .. (4 + WIN_SIZE_Y - 1); the remaining
      // rows only served as boundary for the vertical transform. Rows are
      // written in (low, high) pairs, scaled on the way out.
      for(int pair = 0; pair < WIN_SIZE_Y; pair += 2) {
        const int lowItem = outColumnOffset + (4 + pair) * STRIDE;
        writer.writeLowInto(out, buffer[lowItem] * scale97Div);
        writer.writeHighInto(out, buffer[lowItem + STRIDE] * scale97Mul);
      }

      // saved rows become the first 7 rows of the next iteration
      for(int k = 0; k < 7; k++) {
        buffer[outColumnOffset + k * STRIDE] = last7Lines[k];
      }

      // All threads must be done with this window before the next iteration
      // starts overwriting the buffer with new pixels.
      __syncthreads();
    }
  }


public:
  /// Picks one of the specialized variants of the 9/7 FDWT according to how
  /// close this threadblock's pixels are to the image boundary, and runs it.
  /// Variants that skip boundary checks are slightly faster.
  /// @param input   input image
  /// @param output  output buffer
  /// @param sx      width of the input image
  /// @param sy      height of the input image
  /// @param steps   number of steps of sliding window
  __device__ static void run(const float * const input, float * const output,
                             const int sx, const int sy, const int steps) {
    // the object (and thus its transform buffer) lives in shared memory
    __shared__ FDWT97<WIN_SIZE_X, WIN_SIZE_Y> fdwt97;

    // Right and bottom limits of the pixels touched by this threadblock;
    // the extra 3 is the radius of impulse response of the 9/7 FDWT.
    const int maxX = (blockIdx.x + 1) * WIN_SIZE_X + 3;
    const int maxY = (blockIdx.y + 1) * WIN_SIZE_Y * steps + 3;
    const bool touchesRight = maxX >= sx;
    const bool touchesBottom = maxY >= sy;

    if(!touchesBottom && !touchesRight) {
      // far from any boundary => no checks at all
      fdwt97.transform<false, false>(input, output, sx, sy, steps);
    } else if(touchesBottom) {
      // bottom boundary nearby => check both loads and writes
      fdwt97.transform<true, true>(input, output, sx, sy, steps);
    } else {
      // right boundary only => check writes only
      fdwt97.transform<false, true>(input, output, sx, sy, steps);
    }
  }

}; // end of class FDWT97
|
||||
|
||||
|
||||
|
||||
/// Main GPU 9/7 FDWT entry point.
|
||||
/// @param input input image
|
||||
/// @param output output buffer
|
||||
/// @param sx width of the input image
|
||||
/// @param sy height of the input image
|
||||
/// @param steps number of steps of sliding window
|
||||
template <int WIN_SX, int WIN_SY>
|
||||
// Launch bounds: first argument caps the threadblock size at WIN_SX threads;
// second argument is the requested minimum number of resident blocks per
// multiprocessor - presumably min(number of FDWT97 objects fitting into
// SHM_SIZE bytes, 8). CTMIN and SHM_SIZE are defined elsewhere - TODO confirm.
__launch_bounds__(WIN_SX, CTMIN(SHM_SIZE/sizeof(FDWT97<WIN_SX, WIN_SY>), 8))
|
||||
__global__ void fdwt97Kernel(const float * const input, float * const output,
|
||||
const int sx, const int sy, const int steps) {
|
||||
// Excuse me, dear reader of this code - this call has to be here. If you
|
||||
// try to simply put the contents of the following method right here, the
|
||||
// CUDA compiler (version 3.2) will spit tons of nonsense messy errors ...
|
||||
// Hope they will not break it even more in future releases.
// The kernel therefore only forwards its arguments, unchanged, to the
// static device method FDWT97::run.
|
||||
FDWT97<WIN_SX, WIN_SY>::run(input, output, sx, sy, steps);
|
||||
}
|
||||
|
||||
|
||||
|
||||
/// Computes the number of sliding window steps and the grid size, then
/// launches the 9/7 FDWT kernel with WIN_SX threads per threadblock and
/// checks the launch. Returns without launching if the image is empty.
/// @tparam WIN_SX  width of sliding window
/// @tparam WIN_SY  height of sliding window
/// @param in   input image
/// @param out  output buffer
/// @param sx   width of the input image
/// @param sy   height of the input image
template <int WIN_SX, int WIN_SY>
void launchFDWT97Kernel (float * in, float * out, int sx, int sy) {
  // Empty image: 'steps' below would come out as 0 (assuming divRndUp(0, n)
  // is 0), and the grid height would then be computed by dividing by
  // WIN_SY * 0. There is nothing to transform, so bail out early.
  if(sx <= 0 || sy <= 0) {
    return;
  }

  // number of steps of each sliding window
  const int steps = divRndUp(sy, 15 * WIN_SY);

  // grid size: one threadblock per WIN_SX columns and per
  // (WIN_SY * steps) rows
  dim3 gSize(divRndUp(sx, WIN_SX), divRndUp(sy, WIN_SY * steps));

  // dim3 components are unsigned, hence %u
  printf("\n globalx=%u, globaly=%u, blocksize=%d\n", gSize.x, gSize.y, WIN_SX);

  // run kernel, possibly measure time and finally check the call
  PERF_BEGIN
  fdwt97Kernel<WIN_SX, WIN_SY><<<gSize, WIN_SX>>>(in, out, sx, sy, steps);
  PERF_END(" FDWT97", sx, sy)
  CudaDWTTester::checkLastKernelCall("FDWT 9/7 kernel");
}
|
||||
|
||||
|
||||
|
||||
/// Forward 9/7 2D DWT. See common rules (dwt.h) for more details.
/// Each level transforms the current image and, if more levels remain,
/// copies the LL band of the output back into the input buffer and repeats
/// on that (half-sized, rounded up) image.
/// @param in      Input DWT coefficients. Should be normalized (in range
///                [-0.5, 0.5]). Will not be preserved (will be overwritten).
/// @param out     output buffer on GPU - format specified in common rules
/// @param sizeX   width of input image (in pixels)
/// @param sizeY   height of input image (in pixels)
/// @param levels  number of recursive DWT levels (at least one level is
///                always computed)
void fdwt97(float * in, float * out, int sizeX, int sizeY, int levels) {
  for(;;) {
    // pick the sliding window size that matches the image width
    if(sizeX < 480) {
      launchFDWT97Kernel<64, 6>(in, out, sizeX, sizeY);
    } else if(sizeX < 960) {
      launchFDWT97Kernel<128, 6>(in, out, sizeX, sizeY);
    } else {
      launchFDWT97Kernel<192, 8>(in, out, sizeX, sizeY);
    }

    // that was the last requested level
    if(levels <= 1) {
      return;
    }

    // the LL band of the output becomes the input of the next level
    const int llSizeX = divRndUp(sizeX, 2);
    const int llSizeY = divRndUp(sizeY, 2);
    memCopy(in, out, llSizeX, llSizeY);

    // continue with the remaining levels on the LL band
    sizeX = llSizeX;
    sizeY = llSizeY;
    levels -= 1;
  }
}
|
||||
|
||||
|
||||
|
||||
} // end of namespace dwt_cuda
|
|
@ -1,440 +0,0 @@
|
|||
///
|
||||
/// @file: io.h
|
||||
/// @brief Manages loading and saving linearly stored bands and input images.
|
||||
/// @author Martin Jirman (207962@mail.muni.cz)
|
||||
/// @date 2011-01-20 22:38
|
||||
///
|
||||
///
|
||||
/// Copyright (c) 2011 Martin Jirman
|
||||
/// All rights reserved.
|
||||
///
|
||||
/// Redistribution and use in source and binary forms, with or without
|
||||
/// modification, are permitted provided that the following conditions are met:
|
||||
///
|
||||
/// * Redistributions of source code must retain the above copyright
|
||||
/// notice, this list of conditions and the following disclaimer.
|
||||
/// * Redistributions in binary form must reproduce the above copyright
|
||||
/// notice, this list of conditions and the following disclaimer in the
|
||||
/// documentation and/or other materials provided with the distribution.
|
||||
///
|
||||
/// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
|
||||
/// AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
|
||||
/// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
|
||||
/// ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE
|
||||
/// LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
|
||||
/// CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
|
||||
/// SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
|
||||
/// INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
|
||||
/// CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
|
||||
/// ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
|
||||
/// POSSIBILITY OF SUCH DAMAGE.
|
||||
///
|
||||
|
||||
#ifndef IO_H
|
||||
#define IO_H
|
||||
|
||||
#include "common.h"
|
||||
|
||||
namespace dwt_cuda {
|
||||
|
||||
/// Base for all IO classes - manages mirroring.
class DWTIO {
protected:
  /// Mirrors a coordinate back into the image in the way the DWT expects:
  /// the image is reflected about its first and last sample without
  /// repeating them (-1 maps to 1, sizeD maps to sizeD - 2). The reflection
  /// is applied as many times as needed, so the result is in range
  /// [0, sizeD) for any 'd', no matter how far outside the image it was.
  /// Coordinates already inside the image are left untouched.
  /// @param d      a position in the image (will be replaced by mirrored d)
  /// @param sizeD  size of the image along the dimension of 'd'; for sizes
  ///               of 1 or less the only sensible position, 0, is returned
  __device__ static void mirror(int & d, const int & sizeD) {
    if(sizeD <= 1) {
      // nothing to reflect about
      d = 0;
      return;
    }

    // Distance between the two reflection points. Each time the coordinate
    // travels this far outside the image, the direction of mirroring flips.
    const int span = sizeD - 1;

    if(d < 0) {
      // (-1 - d) rather than (-d): cannot overflow, even for the most
      // negative 'd'
      const int underflow = -1 - d;
      const int remainder = underflow % span;
      const bool flipped = ((underflow / span) & 1) != 0;
      d = flipped ? (sizeD - 2 - remainder) : (remainder + 1);
    } else if(d >= sizeD) {
      const int overflow = d - sizeD;
      const int remainder = overflow % span;
      const bool flipped = ((overflow / span) & 1) != 0;
      d = flipped ? (remainder + 1) : (sizeD - 2 - remainder);
    }
  }
};
|
||||
|
||||
/// Common base of the pixel loader and the pixel writer. Keeps the stride
/// between vertically adjacent pixels and the index marking the end of the
/// processed column, and computes the index of the first pixel.
/// @tparam T        type of image pixels
/// @tparam CHECKED  true = the end-of-column index is maintained,
///                  false = it is not needed and is set to 0
template <typename T, bool CHECKED> class VerticalDWTPixelIO : protected DWTIO {
protected:
  int end;     ///< index just below the last pixel of the column
  int stride;  ///< index increment leading to the next pixel of the column

  /// Sets 'stride' and 'end' for the column at 'firstX' and tells where the
  /// pixel [firstX, firstY] is stored.
  /// @param sizeX   width of the image
  /// @param sizeY   height of the image
  /// @param firstX  x-coordinate of first pixel to use
  /// @param firstY  y-coordinate of first pixel to use
  /// @return index of pixel at position [firstX, firstY] in the image
  __device__ int initialize(const int sizeX, const int sizeY, int firstX,
                            int firstY) {
    // one row down = one image width further
    stride = sizeX;

    // first index past the bottom of the column (only tracked when checked)
    if(CHECKED) {
      end = sizeY * sizeX + firstX;
    } else {
      end = 0;
    }

    return firstX + sizeX * firstY;
  }
};
|
||||
|
||||
/// Writes reverse transformed pixels of one column directly into the
/// output image, top to bottom.
/// @tparam T        type of output pixels
/// @tparam CHECKED  true = stop writing at the bottom of the image,
///                  false = write unconditionally
template <typename T, bool CHECKED>
class VerticalDWTPixelWriter : VerticalDWTPixelIO<T, CHECKED> {
private:
  int next;  ///< index of the next pixel to be written

public:
  /// Positions the writer at pixel [firstX, firstY]. If the column lies
  /// right of the image (firstX >= sizeX), all fields are zeroed instead,
  /// which makes a checked writer skip every write.
  /// @param sizeX   width of the image
  /// @param sizeY   height of the image
  /// @param firstX  x-coordinate of first pixel to write into
  /// @param firstY  y-coordinate of first pixel to write into
  __device__ void init(const int sizeX, const int sizeY, int firstX,
                       int firstY) {
    if(firstX >= sizeX) {
      this->end = 0;
      this->stride = 0;
      next = 0;
      return;
    }
    next = this->initialize(sizeX, sizeY, firstX, firstY);
  }

  /// Stores 'value' at the current position and moves one row down. A
  /// checked writer which has reached the end of its column does nothing.
  /// @param output  output image to write pixel into
  /// @param value   value of the pixel to be written
  __device__ void writeInto(T * const output, const T & value) {
    if(CHECKED && (next == this->end)) {
      return;
    }
    output[next] = value;
    next += this->stride;
  }
};
|
||||
|
||||
/// Loads pixels of one column of the input image, top to bottom, turning
/// back at the bottom of the image when checked.
/// @tparam T        type of image input pixels
/// @tparam CHECKED  true = mirror at the bottom of the image,
///                  false = never look at the image end
template <typename T, bool CHECKED>
class VerticalDWTPixelLoader : protected VerticalDWTPixelIO<const T, CHECKED> {
private:
  int last;  ///< index of last loaded pixel

public:
  // ---- accessors used for testing only ----
  __device__ int getlast() { return last; }
  __device__ int getend() { return this->end; }
  __device__ int getstride() { return this->stride; }
  __device__ void setend(int a) { this->end = a; }
  // -----------------------------------------

  /// Positions the loader so that the next loaded pixel is
  /// [firstX, firstY]; 'firstX' is mirrored into the image first.
  /// @param sizeX   width of the image
  /// @param sizeY   height of the image
  /// @param firstX  x-coordinate of first pixel to load
  /// @param firstY  y-coordinate of first pixel to load
  __device__ void init(const int sizeX, const int sizeY, int firstX,
                       int firstY) {
    // bring the column back into the image if it lies outside
    this->mirror(firstX, sizeX);

    // 'last' is the pixel one row (= sizeX items) above the first one,
    // as if it had just been loaded
    const int firstIndex = this->initialize(sizeX, sizeY, firstX, firstY);
    last = firstIndex - sizeX;
  }

  /// Zeroes all fields, so that the compiler has no reason to complain
  /// about uninitialized members.
  __device__ void clear() {
    this->last = 0;
    this->stride = 0;
    this->end = 0;
  }

  /// Moves to the following pixel of the column and returns it. When a
  /// checked loader steps onto the end-of-column index, it goes two rows
  /// back instead and keeps moving upwards from then on. A negative index
  /// is never read; 0 is returned for it.
  /// @param input  input image to load next pixel from
  /// @return next pixel from given image
  __device__ T loadFrom(const T * const input) {
    int index = last + this->stride;
    if(CHECKED && (index == this->end)) {
      // mirror at the bottom: undo the step, take one step up, reverse
      index -= 2 * this->stride;
      this->stride = -this->stride;
    }
    last = index;

    // positions above the image read as 0
    if(index < 0) {
      return 0;
    }
    return input[index];
  }
};
|
||||
|
||||
/// Common base of the band writer and the band loader. Computes the strides
/// and the first/end indices needed to walk one image column through
/// linearly stored bands.
/// @tparam T        type of band coefficients
/// @tparam CHECKED  true = the end-of-column index is maintained,
///                  false = it is not needed and is set to 0
template <typename T, bool CHECKED> class VerticalDWTBandIO : protected DWTIO {
protected:
  /// index of bottom neighbor of last pixel of loaded column
  int end;

  /// index increment leading from the highpass band to the lowpass one
  int strideHighToLow;

  /// index increment leading from the lowpass band to the highpass one
  int strideLowToHigh;

  /// Sets both strides and 'end' for the column at 'firstX' and tells where
  /// the item [firstX, firstY] is stored.
  /// @param imageSizeX  width of the image
  /// @param imageSizeY  height of the image
  /// @param firstX      x-coordinate of first pixel to use
  ///                    (odd = one of the right bands, even = left bands)
  /// @param firstY      y-coordinate of first pixel to use
  ///                    (odd = item reached through strideLowToHigh)
  /// @return index of first item specified by firstX and firstY
  __device__ int initialize(const int imageSizeX, const int imageSizeY,
                            int firstX, int firstY) {
    const bool inRightBands = (firstX & 1) != 0;

    // distance of two vertically neighboring items within one band: right
    // bands are imageSizeX / 2 wide, left bands get the odd column too
    const int verticalStride = inRightBands
                             ? imageSizeX / 2
                             : imageSizeX / 2 + (imageSizeX & 1);

    // index of the topmost item of the column
    int columnOffset = firstX / 2;
    if(inRightBands) {
      columnOffset += divRndUp(imageSizeX, 2) * divRndUp(imageSizeY, 2);
      strideLowToHigh = (imageSizeX * imageSizeY) / 2;
    } else {
      strideLowToHigh = divRndUp(imageSizeY, 2) * imageSizeX;
    }

    // the way back, advanced by one row
    strideHighToLow = verticalStride - strideLowToHigh;

    // index of the item that would follow the last row of the image
    if(CHECKED) {
      end = columnOffset
          + (imageSizeY / 2) * verticalStride
          + (imageSizeY & 1) * strideLowToHigh;
    } else {
      end = 0;
    }

    // index of the item at row 'firstY', computed the same way
    return columnOffset
         + (firstY / 2) * verticalStride
         + (firstY & 1) * strideLowToHigh;
  }
};
|
||||
|
||||
/// Loads coefficients of one image column directly from four consecutively
/// stored transformed bands, alternating between the low and high band.
/// @tparam T        type of input band coefficients
/// @tparam CHECKED  true = mirror at the bottom of the image,
///                  false = never look at the image end
template <typename T, bool CHECKED>
class VerticalDWTBandLoader : public VerticalDWTBandIO<const T, CHECKED> {
private:
  int last;  ///< index of last loaded pixel

  /// Advances by 'stride' and returns the coefficient found there. When a
  /// checked loader steps onto the end-of-column index, it moves back by
  /// both strides and from then on walks the column upwards. A negative
  /// index is never read; 0 is returned for it.
  /// @param input   input image to load next coefficient from
  /// @param stride  stride to use now (one of two loader's strides)
  /// @return loaded coefficient
  __device__ T updateAndLoad(const T * const input, const int & stride) {
    last += stride;

    if(CHECKED && (last == this->end)) {
      // step back to the previous (mirrored) item
      last -= (this->strideLowToHigh + this->strideHighToLow);

      // exchange the strides and flip their signs to move upwards
      const int reversedLowToHigh = -this->strideHighToLow;
      const int reversedHighToLow = -this->strideLowToHigh;
      this->strideLowToHigh = reversedLowToHigh;
      this->strideHighToLow = reversedHighToLow;
    }

    // positions before the start of the data read as 0
    if(last < 0) {
      return 0;
    }
    return input[last];
  }

public:
  /// Positions the loader so that the first load returns the item
  /// [firstX, firstY]; 'firstX' is mirrored into the image first.
  /// @param imageSizeX  width of the image
  /// @param imageSizeY  height of the image
  /// @param firstX      x-coordinate of first pixel to load
  ///                    (odd = one of the right bands, even = left bands)
  /// @param firstY      y-coordinate of first pixel to load
  ///                    (odd = first load must be loadHighFrom,
  ///                    even = first load must be loadLowFrom)
  __device__ void init(const int imageSizeX, const int imageSizeY, int firstX,
                       const int firstY) {
    this->mirror(firstX, imageSizeX);
    last = this->initialize(imageSizeX, imageSizeY, firstX, firstY);

    // Step back by the stride the first load is going to add, so that the
    // first load lands exactly on the first item.
    if(firstY & 1) {
      last -= this->strideLowToHigh;
    } else {
      last -= this->strideHighToLow;
    }
  }

  /// Zeroes all fields, so that the compiler has no reason to complain
  /// about uninitialized members.
  __device__ void clear() {
    this->last = 0;
    this->strideLowToHigh = 0;
    this->strideHighToLow = 0;
    this->end = 0;
  }

  /// Gets another coefficient from the lowpass band and advances the
  /// internal index. Call this method first if 'firstY' passed to init
  /// was even.
  /// @param input  input image to load next coefficient from
  /// @return next coefficient from the lowpass band of the given image
  __device__ T loadLowFrom(const T * const input) {
    return updateAndLoad(input, this->strideHighToLow);
  }

  /// Gets another coefficient from the highpass band and advances the
  /// internal index. Call this method first if 'firstY' passed to init
  /// was odd.
  /// @param input  input image to load next coefficient from
  /// @return next coefficient from the highpass band of the given image
  __device__ T loadHighFrom(const T * const input) {
    return updateAndLoad(input, this->strideLowToHigh);
  }
};
|
||||
|
||||
/// Directly saves coefficients into four transformed bands.
/// @tparam T        type of output band coefficients
/// @tparam CHECKED  true = be prepared to image boundary, false = don't care
template <typename T, bool CHECKED>
class VerticalDWTBandWriter : public VerticalDWTBandIO<T, CHECKED> {
private:
  int next; ///< index at which the next coefficient will be stored

  /// Stores one item at the current index and advances the index, unless the
  /// writer is CHECKED and the index has reached the 'end' marker - in that
  /// case nothing is written and the index stays where it is.
  /// @param output  output buffer
  /// @param item    item to put into the output
  /// @param stride  increment of the index to get to next output position
  /// @return value of the internal index after the call
  __device__ int saveAndUpdate(T *const output, const T &item,
                               const int &stride) {
    if ((!CHECKED) || (next != this->end)) {
      output[next] = item;
      next += stride;
    }
    return next;
  }

public:
  /// Initializes writer - sets output size and a position of first pixel.
  /// If firstX lies at or beyond imageSizeX, all fields are zeroed instead
  /// (so 'next' equals 'end' and a CHECKED writer never stores anything).
  /// @param imageSizeX  width of the image
  /// @param imageSizeY  height of the image
  /// @param firstX      x-coordinate of first pixel to write
  /// @param firstY      y-coordinate of first pixel to write
  __device__ void init(const int imageSizeX, const int imageSizeY,
                       const int firstX, const int firstY) {
    if (firstX < imageSizeX) {
      next = this->initialize(imageSizeX, imageSizeY, firstX, firstY);
    } else {
      clear();
    }
  }

  /// Sets all fields to zeros, for compiler not to complain about
  /// uninitialized stuff.
  __device__ void clear() {
    this->end = 0;
    this->strideHighToLow = 0;
    this->strideLowToHigh = 0;
    this->next = 0;
  }

  /// Writes a coefficient at the current position and advances the internal
  /// index by the low-to-high stride.
  /// Call this method first if position of first pixel passed to init
  /// was in lowpass band.
  /// @param output   output image
  /// @param primary  lowpass coefficient to save into the lowpass band
  /// @return value of the internal index after the call
  __device__ int writeLowInto(T *const output, const T &primary) {
    return saveAndUpdate(output, primary, this->strideLowToHigh);
  }

  /// Writes a coefficient at the current position and advances the internal
  /// index by the high-to-low stride.
  /// Call this method first if position of first pixel passed to init
  /// was in highpass band.
  /// @param output  output image
  /// @param other   highpass coefficient to save into the highpass band
  /// @return value of the internal index after the call
  __device__ int writeHighInto(T *const output, const T &other) {
    return saveAndUpdate(output, other, this->strideHighToLow);
  }

  // ---- read-only accessors for the writer's internal state ----

  /// @return index at which the next coefficient will be stored
  __device__ int getnext() const { return next; }

  /// @return value of the inherited 'end' marker
  __device__ int getend() const { return this->end; }

  /// @return current high-to-low stride
  __device__ int getstrideHighToLow() const { return this->strideHighToLow; }

  /// @return current low-to-high stride
  __device__ int getstrideLowToHigh() const { return this->strideLowToHigh; }
};
|
||||
|
||||
} // namespace dwt_cuda
|
||||
|
||||
#endif // IO_H
|
|
@ -1,360 +0,0 @@
|
|||
///
|
||||
/// @file rdwt53.cu
|
||||
/// @brief CUDA implementation of reverse 5/3 2D DWT.
|
||||
/// @author Martin Jirman (207962@mail.muni.cz)
|
||||
/// @date 2011-02-04 14:19
|
||||
///
|
||||
///
|
||||
/// Copyright (c) 2011 Martin Jirman
|
||||
/// All rights reserved.
|
||||
///
|
||||
/// Redistribution and use in source and binary forms, with or without
|
||||
/// modification, are permitted provided that the following conditions are met:
|
||||
///
|
||||
/// * Redistributions of source code must retain the above copyright
|
||||
/// notice, this list of conditions and the following disclaimer.
|
||||
/// * Redistributions in binary form must reproduce the above copyright
|
||||
/// notice, this list of conditions and the following disclaimer in the
|
||||
/// documentation and/or other materials provided with the distribution.
|
||||
///
|
||||
/// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
|
||||
/// AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
|
||||
/// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
|
||||
/// ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE
|
||||
/// LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
|
||||
/// CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
|
||||
/// SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
|
||||
/// INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
|
||||
/// CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
|
||||
/// ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
|
||||
/// POSSIBILITY OF SUCH DAMAGE.
|
||||
///
|
||||
|
||||
|
||||
#include "common.h"
|
||||
#include "transform_buffer.h"
|
||||
#include "io.h"
|
||||
|
||||
|
||||
namespace dwt_cuda {
|
||||
|
||||
|
||||
|
||||
/// Wraps the shared memory buffer and the algorithms needed for computing
/// the reverse 5/3 DWT using a sliding window and lifting schema.
/// @tparam WIN_SIZE_X  width of sliding window
/// @tparam WIN_SIZE_Y  height of sliding window
template <int WIN_SIZE_X, int WIN_SIZE_Y>
class RDWT53 {
private:

    /// Buffer type used for 5/3 DWT transforms
    /// (3 more rows than the window height).
    typedef TransformBuffer<int, WIN_SIZE_X, WIN_SIZE_Y + 3, 2> RDWT53Buffer;

    /// Buffer used for reverse 5/3 DWT.
    RDWT53Buffer buffer;

    /// Difference between indices of two vertically neighboring buffer items.
    enum { STRIDE = RDWT53Buffer::VERTICAL_STRIDE };


    /// Everything needed for loading one column of the input image.
    /// @tparam CHECKED  true if loader should check boundaries
    template <bool CHECKED>
    struct RDWT53Column {
        /// loader of coefficients of one input image column
        VerticalDWTBandLoader<int, CHECKED> loader;

        /// offset of the corresponding column in the transform buffer
        int offset;

        /// Gives all fields some value to avoid 'uninitialized' warnings.
        __device__ void clear() {
            loader.clear();
            offset = 0;
        }
    };


    /// 5/3 DWT reverse update operation
    /// (F.3, page 118, ITU-T Rec. T.800 final draft).
    /// Uses C++ integer division, i.e. the quotient truncates toward zero.
    struct Reverse53Update {
        __device__ void operator() (const int p, int & c, const int n) const {
            c -= (p + n + 2) / 4;
        }
    };


    /// 5/3 DWT reverse predict operation
    /// (F.4, page 118, ITU-T Rec. T.800 final draft).
    /// Uses C++ integer division, i.e. the quotient truncates toward zero.
    struct Reverse53Predict {
        __device__ void operator() (const int p, int & c, const int n) const {
            c += (p + n) / 2;
        }
    };


    /// Horizontal 5/3 RDWT on specified lines of the transform buffer.
    /// A barrier precedes the update pass, separates it from the predict
    /// pass and follows the predict pass.
    /// @param lines      number of lines to be transformed
    /// @param firstLine  index of the first line to be transformed
    __device__ void horizontalTransform(const int lines, const int firstLine) {
        __syncthreads();
        buffer.forEachHorizontalEven(firstLine, lines, Reverse53Update());
        __syncthreads();
        buffer.forEachHorizontalOdd(firstLine, lines, Reverse53Predict());
        __syncthreads();
    }


    /// Loads another WIN_SIZE_Y coefficients into the given column of the
    /// transform buffer (rows 3 and up), alternating low and high loads.
    /// @tparam CHECKED  true if loader should check image boundaries
    /// @param input  input coefficients to load from
    /// @param col    info about loaded column
    template <bool CHECKED>
    inline __device__ void loadWindowIntoColumn(const int * const input,
                                                RDWT53Column<CHECKED> & col) {
        for(int row = 3; row < (3 + WIN_SIZE_Y); row += 2) {
            const int low = col.loader.loadLowFrom(input);
            buffer[col.offset + row * STRIDE] = low;

            const int high = col.loader.loadHighFrom(input);
            buffer[col.offset + (row + 1) * STRIDE] = high;
        }
    }


    /// Fills rows 0 to 2 of one transform buffer column with input
    /// coefficients and initializes the loader of that column.
    /// @tparam CHECKED  true if loader should check image boundaries
    /// @param columnX  x coordinate of column in shared transform buffer
    /// @param input    input image
    /// @param sizeX    width of the input image
    /// @param sizeY    height of the input image
    /// @param column   (uninitialized) info about loaded column
    /// @param firstY   index of the first image row handled by this block
    template <bool CHECKED>
    __device__ void initColumn(const int columnX, const int * const input,
                               const int sizeX, const int sizeY,
                               RDWT53Column<CHECKED> & column,
                               const int firstY) {
        // x coordinate of the first coefficient to be loaded
        const int firstX = blockIdx.x * WIN_SIZE_X + columnX;

        // offset of the column with index 'columnX' in the transform buffer
        column.offset = buffer.getColumnOffset(columnX);

        if(blockIdx.y != 0) {
            // non-topmost block - regular loading, one row above firstY
            column.loader.init(sizeX, sizeY, firstX, firstY - 1);
            const int upperHigh = column.loader.loadHighFrom(input);
            const int low = column.loader.loadLowFrom(input);
            const int lowerHigh = column.loader.loadHighFrom(input);
            buffer[column.offset + 0 * STRIDE] = upperHigh;
            buffer[column.offset + 1 * STRIDE] = low;
            buffer[column.offset + 2 * STRIDE] = lowerHigh;
        } else {
            // topmost block - mirrored loading: the same high coefficient
            // is stored both above (row 0) and below (row 2) the low one
            column.loader.init(sizeX, sizeY, firstX, firstY);
            const int low = column.loader.loadLowFrom(input);
            const int high = column.loader.loadHighFrom(input);
            buffer[column.offset + 1 * STRIDE] = low;
            buffer[column.offset + 2 * STRIDE] = high;
            buffer[column.offset + 0 * STRIDE] = high;
        }
        // Now, the next coefficient, which will be loaded by loader, is #2.
    }


    /// Actual GPU 5/3 RDWT implementation.
    /// @tparam CHECKED_LOADS   true if boundaries must be checked when reading
    /// @tparam CHECKED_WRITES  true if boundaries must be checked when writing
    /// @param in        input image (5/3 transformed coefficients)
    /// @param out       output buffer (for reverse transformed image)
    /// @param sizeX     width of the output image
    /// @param sizeY     height of the output image
    /// @param winSteps  number of sliding window steps
    template<bool CHECKED_LOADS, bool CHECKED_WRITES>
    __device__ void transform(const int * const in, int * const out,
                              const int sizeX, const int sizeY,
                              const int winSteps) {
        // loading info: one central and one boundary column per thread
        RDWT53Column<CHECKED_LOADS> column, boundaryColumn;

        // index of first row to be transformed by this threadblock
        const int firstY = blockIdx.y * WIN_SIZE_Y * winSteps;

        // Threads #0, #1 and #2 additionally take care of one boundary
        // column each: column WIN_SIZE_X, column WIN_SIZE_X + 1 and
        // column -1. All other threads keep a cleared boundary column.
        boundaryColumn.clear();
        if(threadIdx.x < 3) {
            const int colId = threadIdx.x + ((threadIdx.x != 2) ? WIN_SIZE_X : -3);

            // sets up the buffer offset, the first 3 rows and the loader
            initColumn(colId, in, sizeX, sizeY, boundaryColumn, firstY);
        }

        // every thread sets up one central column
        initColumn(parityIdx<WIN_SIZE_X>(), in, sizeX, sizeY, column, firstY);

        // horizontally transform first 3 rows
        horizontalTransform(3, 0);

        // writer of output pixels of this thread's output column
        const int outX = blockIdx.x * WIN_SIZE_X + threadIdx.x;
        VerticalDWTPixelWriter<int, CHECKED_WRITES> writer;
        writer.init(sizeX, sizeY, outX, firstY);

        // offset of column (in transform buffer) saved by this thread
        const int outOffset = buffer.getColumnOffset(threadIdx.x);

        // (Each iteration assumes that the first 3 rows of the transform
        // buffer already hold horizontally transformed coefficients.)
        for(int step = 0; step < winSteps; step++) {
            // load another WIN_SIZE_Y lines of this thread's column
            loadWindowIntoColumn(in, column);

            // first 3 threads also load their boundary columns
            if(threadIdx.x < 3) {
                loadWindowIntoColumn(in, boundaryColumn);
            }

            // horizontally transform all newly loaded lines
            horizontalTransform(WIN_SIZE_Y, 3);

            // Remember the current values of the last 3 rows. They are only
            // horizontally transformed at this point and become the first
            // 3 rows of the next iteration.
            int carried[3];
            for(int k = 0; k < 3; k++) {
                carried[k] = buffer[outOffset + (WIN_SIZE_Y + k) * STRIDE];
            }

            // vertically transform this thread's column
            buffer.forEachVerticalOdd(outOffset, Reverse53Update());
            buffer.forEachVerticalEven(outOffset, Reverse53Predict());

            // Save the results of the current window: rows #1 to #WIN_SIZE_Y
            // of the transform buffer. The remaining rows only served as
            // a boundary for the vertical RDWT.
            for(int row = 1; row < (1 + WIN_SIZE_Y); row++) {
                writer.writeInto(out, buffer[outOffset + row * STRIDE]);
            }

            // put the remembered rows to the top for the next iteration
            for(int k = 0; k < 3; k++) {
                buffer[outOffset + k * STRIDE] = carried[k];
            }

            // Wait for all threads before new coefficients get loaded in
            // the next iteration (not to overwrite those not written yet).
            __syncthreads();
        }
    }


public:
    /// Main GPU 5/3 RDWT entry point.
    /// @param input   input image (5/3 transformed coefficients)
    /// @param output  output buffer (for reverse transformed image)
    /// @param sx      width of the output image
    /// @param sy      height of the output image
    /// @param steps   number of sliding window steps
    __device__ static void run(const int * const input, int * const output,
                               const int sx, const int sy, const int steps) {
        // instance (with its buffer) placed in shared memory
        __shared__ RDWT53<WIN_SIZE_X, WIN_SIZE_Y> rdwt53;

        // Limits of this threadblock's block of pixels; the added 1 is the
        // radius of impulse response of 5/3 RDWT.
        const int limitX = (blockIdx.x + 1) * WIN_SIZE_X + 1;
        const int limitY = (blockIdx.y + 1) * WIN_SIZE_Y * steps + 1;
        const bool nearRightBoundary = limitX >= sx;
        const bool nearBottomBoundary = limitY >= sy;

        // Pick the specialization matching the distance of this
        // threadblock's pixels from the image boundary.
        if(nearBottomBoundary) {
            // near bottom boundary => check both writing and reading
            rdwt53.transform<true, true>(input, output, sx, sy, steps);
        } else if(nearRightBoundary) {
            // near right boundary only => check writing only
            rdwt53.transform<false, true>(input, output, sx, sy, steps);
        } else {
            // no nearby boundary => check nothing
            rdwt53.transform<false, false>(input, output, sx, sy, steps);
        }
    }

}; // end of class RDWT53
|
||||
|
||||
|
||||
|
||||
/// Kernel wrapper of the GPU 5/3 RDWT: forwards all arguments to
/// RDWT53<WIN_SX, WIN_SY>::run.
/// @tparam WIN_SX  width of sliding window
/// @tparam WIN_SY  height of sliding window
/// @param in     input image (5/3 transformed coefficients)
/// @param out    output buffer (for reverse transformed image)
/// @param sx     width of the output image
/// @param sy     height of the output image
/// @param steps  number of sliding window steps
template <int WIN_SX, int WIN_SY>
__launch_bounds__(WIN_SX, CTMIN(SHM_SIZE/sizeof(RDWT53<WIN_SX, WIN_SY>), 8))
__global__ void rdwt53Kernel(const int * const in, int * const out,
                             const int sx, const int sy, const int steps) {
    typedef RDWT53<WIN_SX, WIN_SY> Transform;
    Transform::run(in, out, sx, sy, steps);
}
|
||||
|
||||
|
||||
|
||||
/// Computes the number of sliding window steps and the number of
/// threadblocks, then launches the 5/3 RDWT kernel with WIN_SX threads
/// per block.
/// @tparam WIN_SX  width of sliding window
/// @tparam WIN_SY  height of sliding window
/// @param in   input image
/// @param out  output buffer
/// @param sx   width of the input image
/// @param sy   height of the input image
template <int WIN_SX, int WIN_SY>
void launchRDWT53Kernel (int * in, int * out, const int sx, const int sy) {
    // number of steps of each sliding window
    const int winSteps = divRndUp(sy, 15 * WIN_SY);

    // grid: one block per WIN_SX columns and per (WIN_SY * winSteps) rows
    dim3 gridSize(divRndUp(sx, WIN_SX), divRndUp(sy, WIN_SY * winSteps));

    // transform this level; the launch is wrapped in the PERF_* macros
    PERF_BEGIN
    rdwt53Kernel<WIN_SX, WIN_SY><<<gridSize, WIN_SX>>>(in, out, sx, sy, winSteps);
    PERF_END("        RDWT53", sx, sy)
    CudaDWTTester::checkLastKernelCall("RDWT 5/3 kernel");
}
|
||||
|
||||
|
||||
|
||||
/// Reverse 5/3 2D DWT. See common rules (above) for more details.
/// Deeper levels are reverse transformed first (recursively) and their
/// result is copied from the output back into the input before this level
/// is processed.
/// @param in      Input DWT coefficients. Format described in common rules.
///                Will not be preserved (will be overwritten).
/// @param out     output buffer on GPU - will contain original image
///                in normalized range [-128, 127].
/// @param sizeX   width of input image (in pixels)
/// @param sizeY   height of input image (in pixels)
/// @param levels  number of recursive DWT levels
void rdwt53(int * in, int * out, int sizeX, int sizeY, int levels) {
    if(levels > 1) {
        // size of the LL band = size of the image at the next deeper level
        const int llSizeX = divRndUp(sizeX, 2);
        const int llSizeY = divRndUp(sizeY, 2);

        // reverse transform the deeper levels first ...
        rdwt53(in, out, llSizeX, llSizeY, levels - 1);

        // ... and move the reconstructed LL band from output back to input
        memCopy(in, out, llSizeX, llSizeY);
    }

    // pick the kernel width according to the width of the image
    if(sizeX < 480) {
        launchRDWT53Kernel<64, 8>(in, out, sizeX, sizeY);
    } else if(sizeX < 960) {
        launchRDWT53Kernel<128, 8>(in, out, sizeX, sizeY);
    } else {
        launchRDWT53Kernel<192, 8>(in, out, sizeX, sizeY);
    }
}
|
||||
|
||||
|
||||
} // end of namespace dwt_cuda
|
|
@ -1,363 +0,0 @@
|
|||
///
|
||||
/// @file rdwt97.cu
|
||||
/// @brief CUDA implementation of reverse 9/7 2D DWT.
|
||||
/// @author Martin Jirman (207962@mail.muni.cz)
|
||||
/// @date 2011-02-03 21:59
|
||||
///
|
||||
///
|
||||
/// Copyright (c) 2011 Martin Jirman
|
||||
/// All rights reserved.
|
||||
///
|
||||
/// Redistribution and use in source and binary forms, with or without
|
||||
/// modification, are permitted provided that the following conditions are met:
|
||||
///
|
||||
/// * Redistributions of source code must retain the above copyright
|
||||
/// notice, this list of conditions and the following disclaimer.
|
||||
/// * Redistributions in binary form must reproduce the above copyright
|
||||
/// notice, this list of conditions and the following disclaimer in the
|
||||
/// documentation and/or other materials provided with the distribution.
|
||||
///
|
||||
/// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
|
||||
/// AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
|
||||
/// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
|
||||
/// ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE
|
||||
/// LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
|
||||
/// CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
|
||||
/// SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
|
||||
/// INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
|
||||
/// CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
|
||||
/// ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
|
||||
/// POSSIBILITY OF SUCH DAMAGE.
|
||||
///
|
||||
|
||||
|
||||
#include "common.h"
|
||||
#include "transform_buffer.h"
|
||||
#include "io.h"
|
||||
|
||||
|
||||
namespace dwt_cuda {
|
||||
|
||||
|
||||
/// Wraps shared memory buffer and methods for computing 9/7 RDWT using
|
||||
/// lifting schema and sliding window.
|
||||
/// @tparam WIN_SIZE_X width of the sliding window
|
||||
/// @tparam WIN_SIZE_Y height of the sliding window
|
||||
template <int WIN_SIZE_X, int WIN_SIZE_Y>
|
||||
class RDWT97 {
|
||||
private:
|
||||
|
||||
/// Everything needed for loading one input column.
/// @tparam CHECKED  true if boundary should be checked,
///                  false if there is no near boundary
template <bool CHECKED>
struct RDWT97Column {
    /// loader of input coefficients of the given column
    VerticalDWTBandLoader<float, CHECKED> loader;

    /// offset of the loaded column in the transform buffer
    int offset;

    /// Gives all fields some value to avoid 'uninitialized' warnings.
    __device__ void clear() {
        offset = 0;
        loader.clear();
    }
};
|
||||
|
||||
|
||||
/// Shared memory buffer used for 9/7 DWT transforms.
|
||||
typedef TransformBuffer<float, WIN_SIZE_X, WIN_SIZE_Y + 7, 4> RDWT97Buffer;
|
||||
|
||||
/// Shared buffer used for reverse 9/7 DWT.
|
||||
RDWT97Buffer buffer;
|
||||
|
||||
/// Difference between indices of two vertical neighbors in buffer.
|
||||
enum { STRIDE = RDWT97Buffer::VERTICAL_STRIDE };
|
||||
|
||||
|
||||
/// Horizontal 9/7 RDWT on specified lines of the transform buffer:
/// a scaling pass followed by four lifting passes (even, odd, even, odd).
/// Every pass is preceded by a barrier and the last one is followed by one.
/// @param lines      number of lines to be transformed
/// @param firstLine  index of the first line to be transformed
__device__ void horizontalRDWT97(int lines, int firstLine) {
    // scaling pass
    __syncthreads();
    buffer.scaleHorizontal(scale97Mul, scale97Div, firstLine, lines);

    // second update / predict pair
    __syncthreads();
    buffer.forEachHorizontalEven(firstLine, lines, AddScaledSum(r97update2));
    __syncthreads();
    buffer.forEachHorizontalOdd(firstLine, lines, AddScaledSum(r97predict2));

    // first update / predict pair
    __syncthreads();
    buffer.forEachHorizontalEven(firstLine, lines, AddScaledSum(r97update1));
    __syncthreads();
    buffer.forEachHorizontalOdd(firstLine, lines, AddScaledSum(r97Predict1));

    __syncthreads();
}
|
||||
|
||||
|
||||
/// Fills rows 0 to 6 of one transform buffer column with 7 input
/// coefficients and initializes the loader of that column.
/// @tparam CHECKED  true if there are near image boundaries
/// @param colIndex  index of column in shared transform buffer
/// @param input     input image
/// @param sizeX     width of the input image
/// @param sizeY     height of the input image
/// @param column    (uninitialized) info about loading one column
/// @param firstY    index of first image row to be transformed
template <bool CHECKED>
__device__ void initColumn(const int colIndex, const float * const input,
                           const int sizeX, const int sizeY,
                           RDWT97Column<CHECKED> & column,
                           const int firstY) {
    // x coordinate of the first coefficient to be loaded
    const int firstX = blockIdx.x * WIN_SIZE_X + colIndex;

    // offset of the column with index 'colIndex' in the transform buffer
    column.offset = buffer.getColumnOffset(colIndex);

    if(blockIdx.y != 0) {
        // non-topmost block - regular loading starting 3 rows above firstY:
        // even buffer rows get high coefficients, odd rows get low ones
        column.loader.init(sizeX, sizeY, firstX, firstY - 3);
        for(int row = 0; row < 7; row++) {
            if(row & 1) {
                buffer[column.offset + row * STRIDE] = column.loader.loadLowFrom(input);
            } else {
                buffer[column.offset + row * STRIDE] = column.loader.loadHighFrom(input);
            }
        }
    } else {
        // topmost block - mirrored loading around row 3: every loaded
        // coefficient (except the central one) is stored at the same
        // distance above and below row 3
        column.loader.init(sizeX, sizeY, firstX, firstY);

        const float center = column.loader.loadLowFrom(input);
        buffer[column.offset + 3 * STRIDE] = center;

        const float dist1 = column.loader.loadHighFrom(input);
        buffer[column.offset + 2 * STRIDE] = dist1;
        buffer[column.offset + 4 * STRIDE] = dist1;

        const float dist2 = column.loader.loadLowFrom(input);
        buffer[column.offset + 1 * STRIDE] = dist2;
        buffer[column.offset + 5 * STRIDE] = dist2;

        const float dist3 = column.loader.loadHighFrom(input);
        buffer[column.offset + 0 * STRIDE] = dist3;
        buffer[column.offset + 6 * STRIDE] = dist3;
    }
    // Now, the next coefficient, which will be loaded by loader, is #4.
}
|
||||
|
||||
|
||||
/// Loads another WIN_SIZE_Y coefficients into the given column of the
/// transform buffer (rows 7 and up), alternating low and high loads.
/// @tparam CHECKED  true if there are near image boundaries
/// @param col    info about loaded column
/// @param input  buffer with input coefficients
template <bool CHECKED>
inline __device__ void loadWindowIntoColumn(RDWT97Column<CHECKED> & col,
                                            const float * const input) {
    for(int row = 7; row < (7 + WIN_SIZE_Y); row += 2) {
        const float low = col.loader.loadLowFrom(input);
        buffer[col.offset + row * STRIDE] = low;

        const float high = col.loader.loadHighFrom(input);
        buffer[col.offset + (row + 1) * STRIDE] = high;
    }
}
|
||||
|
||||
|
||||
/// Actual GPU 9/7 RDWT sliding window lifting schema implementation.
|
||||
/// @tparam CHECKED_LOADS true if loader should check boundaries
|
||||
/// @tparam CHECKED_WRITES true if boundaries should be taken into account
|
||||
/// when writing into output buffer
|
||||
/// @param in input image (9/7 transformed coefficients)
|
||||
/// @param out output buffer (for reverse transformed image)
|
||||
/// @param sizeX width of the output image
|
||||
/// @param sizeY height of the output image
|
||||
/// @param winSteps number of steps of sliding window
|
||||
template <bool CHECKED_LOADS, bool CHECKED_WRITES>
|
||||
__device__ void transform(const float * const in, float * const out,
|
||||
const int sizeX, const int sizeY,
|
||||
const int winSteps) {
|
||||
// info about one main column and one boundary column
|
||||
RDWT97Column<CHECKED_LOADS> column;
|
||||
RDWT97Column<CHECKED_LOADS> boundaryColumn;
|
||||
|
||||
// index of first image row to be transformed
|
||||
const int firstY = blockIdx.y * WIN_SIZE_Y * winSteps;
|
||||
|
||||
// initialize boundary columns
|
||||
boundaryColumn.clear();
|
||||
if(threadIdx.x < 7) {
|
||||
// each thread among first 7 ones gets index of one of boundary columns
|
||||
const int colId = threadIdx.x + ((threadIdx.x < 4) ? WIN_SIZE_X : -7);
|
||||
|
||||
// Thread initializes offset of the boundary column (in shared
|
||||
// buffer), first 7 pixels of the column and a loader for this column.
|
||||
initColumn(colId, in, sizeX, sizeY, boundaryColumn, firstY);
|
||||
}
|
||||
|
||||
// All threads initialize central columns.
|
||||
initColumn(parityIdx<WIN_SIZE_X>(), in, sizeX, sizeY, column, firstY);
|
||||
|
||||
// horizontally transform first 7 rows
|
||||
horizontalRDWT97(7, 0);
|
||||
|
||||
// writer of output pixels - initialize it
|
||||
const int outputX = blockIdx.x * WIN_SIZE_X + threadIdx.x;
|
||||
VerticalDWTPixelWriter<float, CHECKED_WRITES> writer;
|
||||
writer.init(sizeX, sizeY, outputX, firstY);
|
||||
|
||||
// offset of column (in transform buffer) saved by this thread
|
||||
const int outColumnOffset = buffer.getColumnOffset(threadIdx.x);
|
||||
|
||||
// (Each iteration assumes that first 7 rows of transform buffer are
|
||||
// already loaded with horizontally transformed pixels.)
|
||||
for(int w = 0; w < winSteps; w++) {
|
||||
// Load another WIN_SIZE_Y lines of this thread's column
|
||||
// into the transform buffer.
|
||||
loadWindowIntoColumn(column, in);
|
||||
|
||||
// possibly load boundary columns
|
||||
if(threadIdx.x < 7) {
|
||||
loadWindowIntoColumn(boundaryColumn, in);
|
||||
}
|
||||
|
||||
// horizontally transform all newly loaded lines
|
||||
horizontalRDWT97(WIN_SIZE_Y, 7);
|
||||
|
||||
// Using 7 registers, remember current values of last 7 rows
|
||||
// of transform buffer. These rows are transformed horizontally
|
||||
// only and will be used in next iteration.
|
||||
float last7Lines[7];
|
||||
for(int i = 0; i < 7; i++) {
|
||||
last7Lines[i] = buffer[outColumnOffset + (WIN_SIZE_Y + i) * STRIDE];
|
||||
}
|
||||
|
||||
// vertically transform all central columns
|
||||
buffer.scaleVertical(scale97Div, scale97Mul, outColumnOffset,
|
||||
WIN_SIZE_Y + 7, 0);
|
||||
buffer.forEachVerticalOdd(outColumnOffset, AddScaledSum(r97update2));
|
||||
buffer.forEachVerticalEven(outColumnOffset, AddScaledSum(r97predict2));
|
||||
buffer.forEachVerticalOdd(outColumnOffset, AddScaledSum(r97update1));
|
||||
buffer.forEachVerticalEven(outColumnOffset, AddScaledSum(r97Predict1));
|
||||
|
||||
// Save all results of current window. Results are in transform buffer
|
||||
// at rows from #3 to #(3 + WIN_SIZE_Y). Other rows are invalid now.
|
||||
// (They only served as a boundary for vertical RDWT.)
|
||||
for(int i = 3; i < (3 + WIN_SIZE_Y); i++) {
|
||||
writer.writeInto(out, buffer[outColumnOffset + i * STRIDE]);
|
||||
}
|
||||
|
||||
// Use last 7 remembered lines as first 7 lines for next iteration.
|
||||
// As expected, these lines are already horizontally transformed.
|
||||
for(int i = 0; i < 7; i++) {
|
||||
buffer[outColumnOffset + i * STRIDE] = last7Lines[i];
|
||||
}
|
||||
|
||||
// Wait for all writing threads before proceeding to loading new
|
||||
// coefficients in next iteration. (Not to overwrite those which
|
||||
// are not written yet.)
|
||||
__syncthreads();
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
public:
|
||||
/// Main GPU 9/7 RDWT entry point (executed by every thread of a threadblock).
/// Decides how close this threadblock's pixels are to the image boundary and
/// dispatches to the matching specialization of transform().
/// @param input   input image (9/7 transformed coefficients)
/// @param output  output buffer (for reverse transformed image)
/// @param sx      width of the output image
/// @param sy      height of the output image
/// @param steps   number of sliding window steps; forwarded to transform()
///                and used here to compute the vertical extent of the block
__device__ static void run(const float * const input, float * const output,
                           const int sx, const int sy, const int steps) {
  // the instance (including its buffer) is placed in shared memory
  __shared__ RDWT97<WIN_SIZE_X, WIN_SIZE_Y> rdwt97;

  // Exclusive end of this threadblock's block of pixels, extended by 3,
  // which is the radius of impulse response of 9/7 RDWT.
  const int blockEndX = (blockIdx.x + 1) * WIN_SIZE_X + 3;
  const int blockEndY = (blockIdx.y + 1) * WIN_SIZE_Y * steps + 3;

  // The bottom boundary takes precedence over the right one.
  if (blockEndY >= sy) {
    // near bottom boundary => check both writing and reading
    rdwt97.transform<true, true>(input, output, sx, sy, steps);
    return;
  }

  if (blockEndX >= sx) {
    // near right boundary only => check writing only
    rdwt97.transform<false, true>(input, output, sx, sy, steps);
    return;
  }

  // no nearby boundary => check nothing
  rdwt97.transform<false, false>(input, output, sx, sy, steps);
}
|
||||
|
||||
}; // end of class RDWT97
|
||||
|
||||
|
||||
|
||||
/// GPU kernel of the 9/7 RDWT. It only forwards its arguments to
/// RDWT97<WIN_SX, WIN_SY>::run().
/// Launched with WIN_SX threads per threadblock (first launch bound); the
/// second launch bound is derived from SHM_SIZE and the size of the RDWT97
/// instance, capped by 8 (presumably CTMIN is a compile-time minimum -
/// confirm against its definition).
/// @tparam WIN_SX  width of sliding window
/// @tparam WIN_SY  height of sliding window
/// @param in     input image (9/7 transformed coefficients)
/// @param out    output buffer (for reverse transformed image)
/// @param sx     width of the output image
/// @param sy     height of the output image
/// @param steps  number of sliding window steps
template <int WIN_SX, int WIN_SY>
__launch_bounds__(WIN_SX, CTMIN(SHM_SIZE/sizeof(RDWT97<WIN_SX, WIN_SY>), 8))
__global__ void rdwt97Kernel(const float * const in,
                             float * const out,
                             const int sx,
                             const int sy,
                             const int steps) {
  RDWT97<WIN_SX, WIN_SY>::run(in, out, sx, sy, steps);
}
|
||||
|
||||
|
||||
|
||||
/// Computes the number of sliding window steps and the number of
/// threadblocks, then launches the 9/7 RDWT kernel and checks the launch.
/// @tparam WIN_SX  width of sliding window
/// @tparam WIN_SY  height of sliding window
/// @param in   input image
/// @param out  output buffer
/// @param sx   width of the input image
/// @param sy   height of the input image
template <int WIN_SX, int WIN_SY>
void launchRDWT97Kernel (float * in, float * out, int sx, int sy) {
  // number of steps of each sliding window: image height divided (rounding
  // up) by 15 window heights
  const int stepCount = divRndUp(sy, 15 * WIN_SY);

  // grid: one column of threadblocks per WIN_SX pixels of width, one row of
  // threadblocks per (WIN_SY * stepCount) pixels of height
  const int blocksX = divRndUp(sx, WIN_SX);
  const int blocksY = divRndUp(sy, WIN_SY * stepCount);
  dim3 gSize(blocksX, blocksY);

  // launch the kernel (WIN_SX threads per threadblock) inside the
  // performance measurement macros
  PERF_BEGIN
  rdwt97Kernel<WIN_SX, WIN_SY><<<gSize, WIN_SX>>>(in, out, sx, sy, stepCount);
  PERF_END(" RDWT97", sx, sy)
  CudaDWTTester::checkLastKernelCall("RDWT 9/7 kernel");
}
|
||||
|
||||
|
||||
|
||||
/// Reverse 9/7 2D DWT. See common rules (dwt.h) for more details.
/// Deeper levels are reversed first (recursively) on the LL band, whose
/// result is copied back into the input before this level is reversed.
/// @param in      Input DWT coefficients. Format described in common rules.
///                Will not be preserved (will be overwritten).
/// @param out     output buffer on GPU - will contain original image
///                in normalized range [-0.5, 0.5].
/// @param sizeX   width of input image (in pixels)
/// @param sizeY   height of input image (in pixels)
/// @param levels  number of recursive DWT levels
void rdwt97(float * in, float * out, int sizeX, int sizeY, int levels) {
  if (levels > 1) {
    // size of the LL band of this level (half of the image, rounded up)
    const int halfX = divRndUp(sizeX, 2);
    const int halfY = divRndUp(sizeY, 2);

    // reverse transform the deeper levels first ...
    rdwt97(in, out, halfX, halfY, levels - 1);

    // ... and move the reconstructed LL band from output back into input
    memCopy(in, out, halfX, halfY);
  }

  // pick the kernel configuration according to the width of the image
  if (sizeX < 480) {
    launchRDWT97Kernel<64, 6>(in, out, sizeX, sizeY);
  } else if (sizeX < 960) {
    launchRDWT97Kernel<128, 6>(in, out, sizeX, sizeY);
  } else {
    launchRDWT97Kernel<192, 8>(in, out, sizeX, sizeY);
  }
}
|
||||
|
||||
|
||||
|
||||
} // end of namespace dwt_cuda
|
|
@ -1,338 +0,0 @@
|
|||
/// line 248 the index
|
||||
/// @file transform_buffer.h
|
||||
/// @brief Buffer with separated even and odd columns and related algorithms.
|
||||
/// @author Martin Jirman (207962@mail.muni.cz)
|
||||
/// @date 2011-01-20 18:33
|
||||
///
|
||||
///
|
||||
/// Copyright (c) 2011 Martin Jirman
|
||||
/// All rights reserved.
|
||||
///
|
||||
/// Redistribution and use in source and binary forms, with or without
|
||||
/// modification, are permitted provided that the following conditions are met:
|
||||
///
|
||||
/// * Redistributions of source code must retain the above copyright
|
||||
/// notice, this list of conditions and the following disclaimer.
|
||||
/// * Redistributions in binary form must reproduce the above copyright
|
||||
/// notice, this list of conditions and the following disclaimer in the
|
||||
/// documentation and/or other materials provided with the distribution.
|
||||
///
|
||||
/// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
|
||||
/// AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
|
||||
/// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
|
||||
/// ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE
|
||||
/// LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
|
||||
/// CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
|
||||
/// SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
|
||||
/// INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
|
||||
/// CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
|
||||
/// ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
|
||||
/// POSSIBILITY OF SUCH DAMAGE.
|
||||
///
|
||||
|
||||
#ifndef TRANSFORM_BUFFER_H
|
||||
#define TRANSFORM_BUFFER_H
|
||||
|
||||
namespace dwt_cuda {
|
||||
|
||||
/// Buffer (in shared memory of GPU) where block of input image is stored,
|
||||
/// but odd and even lines are separated. (Generates less bank conflicts when
|
||||
/// using lifting schema.) All operations expect SIZE_X threads.
|
||||
/// Also implements basic building blocks of lifting schema.
|
||||
/// @tparam SIZE_X width of the buffer excluding two boundaries (Also
|
||||
/// a number of threads participating on all operations.)
|
||||
/// Must be divisible by 4.
|
||||
/// @tparam SIZE_Y height of buffer (total number of lines)
|
||||
/// @tparam BOUNDARY_X number of extra pixels at the left and right side
|
||||
/// boundary is expected to be smaller than half SIZE_X
|
||||
/// Must be divisible by 2.
|
||||
template <typename T, int SIZE_X, int SIZE_Y, int BOUNDARY_X>
class TransformBuffer {
public:
  enum {
    /// difference between indices of two vertical neighbors
    VERTICAL_STRIDE = BOUNDARY_X + (SIZE_X / 2)
  };

private:
  enum {
    /// number of shared memory banks - needed for correct padding
#ifdef __CUDA_ARCH__
    SHM_BANKS = ((__CUDA_ARCH__ >= 200) ? 32 : 16),
#else
    SHM_BANKS = 16, // for host code only - can be anything, won't be used
#endif

    /// size of one of two buffers (odd or even)
    BUFFER_SIZE = VERTICAL_STRIDE * SIZE_Y,

    /// unused space between two buffers
    PADDING = SHM_BANKS - ((BUFFER_SIZE + SHM_BANKS / 2) % SHM_BANKS),

    /// offset of the odd columns buffer from the beginning of data buffer
    ODD_OFFSET = BUFFER_SIZE + PADDING
  };

  /// buffer for both even and odd columns:
  /// [even columns | padding | odd columns]
  T data[2 * BUFFER_SIZE + PADDING];

  /// Applies specified function to all central elements while also passing
  /// previous and next elements as parameters. Thread t handles the items
  /// at positions t, t + SIZE_X, t + 2 * SIZE_X, ... relative to the given
  /// offsets.
  /// @param count       count of central elements to apply function to
  /// @param prevOffset  offset of first central element's predecessor
  /// @param midOffset   offset of first central element
  /// @param nextOffset  offset of first central element's successor
  /// @param function    the function itself; called as
  ///                    function(previous, center, next) where center is
  ///                    passed by reference into the buffer
  template <typename FUNC>
  __device__ void horizontalStep(const int count, const int prevOffset,
                                 const int midOffset, const int nextOffset,
                                 const FUNC &function) {
    // number of unchecked iterations
    const int STEPS = count / SIZE_X;

    // items remaining after last unchecked iteration
    const int finalCount = count % SIZE_X;

    // offset of items processed in last (checked) iteration
    const int finalOffset = count - finalCount;

    // all threads perform fixed number of iterations ...
    for (int i = 0; i < STEPS; i++) {
      const T previous = data[prevOffset + i * SIZE_X + threadIdx.x];
      const T next = data[nextOffset + i * SIZE_X + threadIdx.x];
      T &center = data[midOffset + i * SIZE_X + threadIdx.x];
      function(previous, center, next);
    }

    // ... but not all threads participate on final iteration
    if (threadIdx.x < finalCount) {
      const T previous = data[prevOffset + finalOffset + threadIdx.x];
      const T next = data[nextOffset + finalOffset + threadIdx.x];
      T &center = data[midOffset + finalOffset + threadIdx.x];
      function(previous, center, next);
    }
  }

public:
  /// Debugging aid: prints index and value of every item of the underlying
  /// buffer (padding included). Values are printed with "%f".
  __device__ void getPrintData() {
    for (int i = 0; i < 2 * BUFFER_SIZE + PADDING; i++) {
      printf(" index: %d data: %f \n ", i, data[i]);
    }
  }

  /// Gets offset of the column with given index. BOUNDARY_X is added to the
  /// index first, so left boundary columns have negative indices and index
  /// 0 is the first column after the left boundary.
  /// @param columnIndex  index of column to get offset of
  /// @return offset of the first item of column with specified index
  __device__ int getColumnOffset(int columnIndex) {
    columnIndex += BOUNDARY_X;             // skip boundary
    return columnIndex / 2                 // select right column
           + (columnIndex & 1) * ODD_OFFSET; // select odd or even buffer
  }

  /// Provides access to data of the transform buffer.
  /// @param index  index of the item to work with
  /// @return reference to item at given index
  __device__ T &operator[](const int index) { return data[index]; }

  /// Applies specified function to all horizontally even elements in
  /// specified lines. (Including even elements in boundaries except
  /// first even element in first left boundary.) SIZE_X threads participate
  /// and synchronization is needed before result can be used.
  /// @param firstLine  index of first line
  /// @param numLines   count of lines
  /// @param func       function to be applied on all even elements
  ///                   parameters: previous (odd) element, the even
  ///                   element itself and finally next (odd) element
  template <typename FUNC>
  __device__ void forEachHorizontalEven(const int firstLine,
                                        const int numLines,
                                        const FUNC &func) {
    // number of even elements to apply function to
    const int count = numLines * VERTICAL_STRIDE - 1;
    // offset of first even element
    const int centerOffset = firstLine * VERTICAL_STRIDE + 1;
    // offset of odd predecessor of first even element
    const int prevOffset = firstLine * VERTICAL_STRIDE + ODD_OFFSET;
    // offset of odd successor of first even element
    const int nextOffset = prevOffset + 1;

    // call generic horizontal step function
    horizontalStep(count, prevOffset, centerOffset, nextOffset, func);
  }

  /// Applies given function to all horizontally odd elements in specified
  /// lines. (Including odd elements in boundaries except last odd element
  /// in last right boundary.) SIZE_X threads participate and synchronization
  /// is needed before result can be used.
  /// @param firstLine  index of first line
  /// @param numLines   count of lines
  /// @param func       function to be applied on all odd elements
  ///                   parameters: previous (even) element, the odd
  ///                   element itself and finally next (even) element
  template <typename FUNC>
  __device__ void forEachHorizontalOdd(const int firstLine,
                                       const int numLines,
                                       const FUNC &func) {
    // number of odd elements to apply function to
    const int count = numLines * VERTICAL_STRIDE - 1;
    // offset of even predecessor of first odd element
    const int prevOffset = firstLine * VERTICAL_STRIDE;
    // offset of first odd element
    const int centerOffset = prevOffset + ODD_OFFSET;
    // offset of even successor of first odd element
    const int nextOffset = prevOffset + 1;

    // call generic horizontal step function
    horizontalStep(count, prevOffset, centerOffset, nextOffset, func);
  }

  /// Applies specified function to all even elements (except element #0)
  /// of given column. Each thread takes care of one column, so there's
  /// no need for synchronization.
  /// @param columnOffset  offset of thread's column
  /// @param f             function to be applied on all even elements
  ///                      parameters: previous (odd) element, the even
  ///                      element itself and finally next (odd) element
  template <typename F>
  __device__ void forEachVerticalEven(const int columnOffset, const F &f) {
    if (SIZE_Y > 3) { // makes no sense otherwise
      const int steps = SIZE_Y / 2 - 1;
      for (int i = 0; i < steps; i++) {
        // rows 2, 4, 6, ...
        const int row = 2 + i * 2;
        const T prev = data[columnOffset + (row - 1) * VERTICAL_STRIDE];
        const T next = data[columnOffset + (row + 1) * VERTICAL_STRIDE];
        f(prev, data[columnOffset + row * VERTICAL_STRIDE], next);
      }
    }
  }

  /// Applies specified function to all odd elements of given column.
  /// Each thread takes care of one column, so there's no need for
  /// synchronization.
  /// @param columnOffset  offset of thread's column
  /// @param f             function to be applied on all odd elements
  ///                      parameters: previous (even) element, the odd
  ///                      element itself and finally next (even) element
  template <typename F>
  __device__ void forEachVerticalOdd(const int columnOffset, const F &f) {
    const int steps = (SIZE_Y - 1) / 2;
    for (int i = 0; i < steps; i++) {
      // rows 1, 3, 5, ...
      const int row = i * 2 + 1;
      const T prev = data[columnOffset + (row - 1) * VERTICAL_STRIDE];
      const T next = data[columnOffset + (row + 1) * VERTICAL_STRIDE];
      f(prev, data[columnOffset + row * VERTICAL_STRIDE], next);
    }
  }

  /// Scales elements at specified lines. Thread t scales the items at
  /// positions t, t + SIZE_X, ... of both the even and the odd buffer.
  /// @param evenScale  scaling factor for horizontally even elements
  /// @param oddScale   scaling factor for horizontally odd elements
  /// @param firstLine  index of first line to scale elements in
  /// @param numLines   number of lines, whose elements should be scaled
  __device__ void scaleHorizontal(const T evenScale, const T oddScale,
                                  const int firstLine, const int numLines) {
    const int offset = firstLine * VERTICAL_STRIDE;
    const int count = numLines * VERTICAL_STRIDE;
    const int steps = count / SIZE_X;
    const int finalCount = count % SIZE_X;
    const int finalOffset = count - finalCount;

    // run iterations, where all threads participate
    for (int i = 0; i < steps; i++) {
      data[threadIdx.x + i * SIZE_X + offset] *= evenScale;
      data[threadIdx.x + i * SIZE_X + offset + ODD_OFFSET] *= oddScale;
    }

    // some threads also finish remaining unscaled items
    if (threadIdx.x < finalCount) {
      data[threadIdx.x + finalOffset + offset] *= evenScale;
      data[threadIdx.x + finalOffset + offset + ODD_OFFSET] *= oddScale;
    }
  }

  /// Scales elements in specified column.
  /// @param evenScale     scaling factor for vertically even elements
  /// @param oddScale      scaling factor for vertically odd elements
  /// @param columnOffset  offset of the column to work with
  /// @param numLines      number of lines, whose elements should be scaled
  /// @param firstLine     index of first line to scale elements in
  __device__ void scaleVertical(const T evenScale, const T oddScale,
                                const int columnOffset, const int numLines,
                                const int firstLine) {
    for (int i = firstLine; i < (numLines + firstLine); i++) {
      if (i & 1) {
        data[columnOffset + i * VERTICAL_STRIDE] *= oddScale;
      } else {
        data[columnOffset + i * VERTICAL_STRIDE] *= evenScale;
      }
    }
  }

  // Accessors exposing the internal layout constants (added for testing).
  __device__ int getVERTICAL_STRIDE() { return VERTICAL_STRIDE; }
  __device__ int getSHM_BANKS() { return SHM_BANKS; }
  __device__ int getBuffersize() { return BUFFER_SIZE; }
  __device__ int getPADDING() { return PADDING; }
  __device__ int getODD_OFFSET() { return ODD_OFFSET; }

}; // end of class TransformBuffer
|
||||
|
||||
} // namespace dwt_cuda
|
||||
|
||||
#endif // TRANSFORM_BUFFER_H
|
|
@ -1,401 +0,0 @@
|
|||
/*
|
||||
* Copyright (c) 2009, Jiri Matela
|
||||
* All rights reserved.
|
||||
*
|
||||
* Redistribution and use in source and binary forms, with or without
|
||||
* modification, are permitted provided that the following conditions are met:
|
||||
*
|
||||
* * Redistributions of source code must retain the above copyright
|
||||
* notice, this list of conditions and the following disclaimer.
|
||||
* * Redistributions in binary form must reproduce the above copyright
|
||||
* notice, this list of conditions and the following disclaimer in the
|
||||
* documentation and/or other materials provided with the distribution.
|
||||
*
|
||||
* THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
|
||||
* AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
|
||||
* IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
|
||||
* ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE
|
||||
* LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
|
||||
* CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
|
||||
* SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
|
||||
* INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
|
||||
* CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
|
||||
* ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
|
||||
* POSSIBILITY OF SUCH DAMAGE.
|
||||
*/
|
||||
|
||||
#include <unistd.h>
|
||||
#include <error.h>
|
||||
#include <stdio.h>
|
||||
#include <stdlib.h>
|
||||
#include <fcntl.h>
|
||||
#include <errno.h>
|
||||
#include <string.h>
|
||||
#include <assert.h>
|
||||
#include <sys/time.h>
|
||||
#include <getopt.h>
|
||||
|
||||
#include "common.h"
|
||||
#include "components.h"
|
||||
#include "dwt.h"
|
||||
|
||||
/* Description of one DWT job: source/output files, the loaded source image
 * and the image / transform parameters. */
struct dwt {
    char *srcFilename;      /* name of the source image file */
    char *outFilename;      /* base name of the output file(s) */
    unsigned char *srcImg;  /* raw bytes of the source image */
    int pixWidth;           /* image width in pixels */
    int pixHeight;          /* image height in pixels */
    int components;         /* number of color components */
    int dwtLvls;            /* number of DWT levels */
};
|
||||
|
||||
/*
 * Loads raw image data from the file "../../data/dwt2d/<srcFilename>".
 *
 * srcFilename  file name relative to the data directory
 * srcImg       destination buffer; must have room for inputSize bytes
 * inputSize    number of bytes to read
 *
 * Reads until inputSize bytes were loaded or end of file is reached and
 * prints the number of bytes actually read. A file shorter than inputSize
 * is not treated as an error.
 *
 * Returns 0 on success, -1 if the path cannot be allocated or the file
 * cannot be opened or read (a diagnostic is printed via error()).
 */
int getImg(char * srcFilename, unsigned char *srcImg, int inputSize)
{
    static const char dataDir[] = "../../data/dwt2d/";

    /* build "<dataDir><srcFilename>" in a buffer owned by this function */
    const size_t pathLen = strlen(dataDir) + strlen(srcFilename) + 1;
    char *fullPath = (char *)malloc(pathLen);
    if (fullPath == NULL) {
        error(0, errno, "cannot allocate path for %s", srcFilename);
        return -1;
    }
    snprintf(fullPath, pathLen, "%s%s", dataDir, srcFilename);
    printf("Loading ipnput: %s\n", fullPath);

    int fd = open(fullPath, O_RDONLY);
    if (fd == -1) {
        error(0, errno, "cannot access %s", fullPath);
        free(fullPath);
        return -1;
    }

    /* read() may return fewer bytes than requested, so keep reading until
     * the buffer is full, the file ends or an error occurs */
    int status = 0;
    int total = 0;
    while (total < inputSize) {
        ssize_t got = read(fd, srcImg + total, (size_t)(inputSize - total));
        if (got < 0) {
            if (errno == EINTR) {
                continue;   /* interrupted by a signal - retry */
            }
            error(0, errno, "cannot read %s", fullPath);
            status = -1;
            break;
        }
        if (got == 0) {
            break;          /* end of file */
        }
        total += (int)got;
    }

    if (status == 0) {
        printf("precteno %d, inputsize %d\n", total, inputSize);
    }

    close(fd);
    free(fullPath);
    return status;
}
|
||||
|
||||
|
||||
/* Prints the command line help (synopsis and the list of options) to
 * standard output. */
void usage() {
    printf(
        "dwt [otpions] src_img.rgb <out_img.dwt>\n"
        "-d, --dimension\t\tdimensions of src img, e.g. 1920x1080\n"
        "-c, --components\t\tnumber of color components, default 3\n"
        "-b, --depth\t\t\tbit depth, default 8\n"
        "-l, --level\t\t\tDWT level, default 3\n"
        "-D, --device\t\t\tcuda device\n"
        "-f, --forward\t\t\tforward transform\n"
        "-r, --reverse\t\t\treverse transform\n"
        "-9, --97\t\t\t9/7 transform\n"
        "-5, --53\t\t\t5/3 transform\n"
        "-w --write-visual\t\twrite output in visual (tiled) fashion instead of the linear\n");
}
|
||||
|
||||
/// Allocates one component buffer on the device and fills it with zeros.
/// The CUDA error state is checked after the allocation and after the fill.
/// @param componentSize  size of the buffer in bytes
/// @return pointer to the device buffer
template <typename T>
static T *allocZeroedDeviceComponent(int componentSize)
{
    T *buffer;
    cudaMalloc((void**)&buffer, componentSize);
    cudaCheckError("Alloc device memory");
    cudaMemset(buffer, 0, componentSize);
    cudaCheckError("Memset device memory");
    return buffer;
}

/// Runs the DWT on the source image of the given job and writes the result
/// into file(s) derived from d->outFilename. Jobs with 3 components and
/// with 1 component are handled; for any other component count only the
/// shared buffers are allocated and released again.
/// @tparam T           type of one sample of a component
/// @param d            job description (source image, size, levels, ...)
/// @param forward      forwarded to nStage2dDWT (selects transform direction)
/// @param writeVisual  nonzero => write with writeNStage2DDWT instead of
///                     writeLinear
template <typename T>
void processDWT(struct dwt *d, int forward, int writeVisual)
{
    int componentSize = d->pixWidth*d->pixHeight*sizeof(T);

    // buffers shared by both variants: output of the first component and
    // a backup buffer handed to nStage2dDWT
    T *c_r_out = allocZeroedDeviceComponent<T>(componentSize);
    T *backup = allocZeroedDeviceComponent<T>(componentSize);

    if (d->components == 3) {
        /* Two more output buffers for G and B */
        T *c_g_out = allocZeroedDeviceComponent<T>(componentSize);
        T *c_b_out = allocZeroedDeviceComponent<T>(componentSize);

        /* Input buffers of the three components */
        T *c_r = allocZeroedDeviceComponent<T>(componentSize);
        T *c_g = allocZeroedDeviceComponent<T>(componentSize);
        T *c_b = allocZeroedDeviceComponent<T>(componentSize);

        /* Split the source image into components */
        rgbToComponents(c_r, c_g, c_b, d->srcImg, d->pixWidth, d->pixHeight);

        /* Compute DWT of each component */
        nStage2dDWT(c_r, c_r_out, backup, d->pixWidth, d->pixHeight, d->dwtLvls, forward);
        nStage2dDWT(c_g, c_g_out, backup, d->pixWidth, d->pixHeight, d->dwtLvls, forward);
        nStage2dDWT(c_b, c_b_out, backup, d->pixWidth, d->pixHeight, d->dwtLvls, forward);

        /* The first component is always stored (linear order) */
        writeLinear(c_r_out, d->pixWidth, d->pixHeight, d->outFilename, ".r");

        /* All components are stored only when OUTPUT is defined */
#ifdef OUTPUT
        if (writeVisual) {
            writeNStage2DDWT(c_r_out, d->pixWidth, d->pixHeight, d->dwtLvls, d->outFilename, ".r");
            writeNStage2DDWT(c_g_out, d->pixWidth, d->pixHeight, d->dwtLvls, d->outFilename, ".g");
            writeNStage2DDWT(c_b_out, d->pixWidth, d->pixHeight, d->dwtLvls, d->outFilename, ".b");
        } else {
            writeLinear(c_r_out, d->pixWidth, d->pixHeight, d->outFilename, ".r");
            writeLinear(c_g_out, d->pixWidth, d->pixHeight, d->outFilename, ".g");
            writeLinear(c_b_out, d->pixWidth, d->pixHeight, d->outFilename, ".b");
        }
#endif

        /* Release buffers owned by this branch */
        cudaFree(c_r);
        cudaCheckError("Cuda free");
        cudaFree(c_g);
        cudaCheckError("Cuda free");
        cudaFree(c_b);
        cudaCheckError("Cuda free");
        cudaFree(c_g_out);
        cudaCheckError("Cuda free");
        cudaFree(c_b_out);
        cudaCheckError("Cuda free");
    }
    else if (d->components == 1) {
        /* Input buffer of the only component */
        T *c_r = allocZeroedDeviceComponent<T>(componentSize);

        bwToComponent(c_r, d->srcImg, d->pixWidth, d->pixHeight);

        /* Compute DWT */
        nStage2dDWT(c_r, c_r_out, backup, d->pixWidth, d->pixHeight, d->dwtLvls, forward);

        /* Store DWT to file (not guarded by OUTPUT in this branch) */
        if (writeVisual) {
            writeNStage2DDWT(c_r_out, d->pixWidth, d->pixHeight, d->dwtLvls, d->outFilename, ".out");
        } else {
            writeLinear(c_r_out, d->pixWidth, d->pixHeight, d->outFilename, ".lin.out");
        }

        cudaFree(c_r);
        cudaCheckError("Cuda free");
    }

    /* Release the shared buffers */
    cudaFree(c_r_out);
    cudaCheckError("Cuda free device");
    cudaFree(backup);
    cudaCheckError("Cuda free device");
}
|
||||
|
||||
int main(int argc, char **argv)
|
||||
{
|
||||
int optindex = 0;
|
||||
char ch;
|
||||
struct option longopts[] = {
|
||||
{"dimension", required_argument, 0, 'd'}, //dimensions of src img
|
||||
{"components", required_argument, 0, 'c'}, //numger of components of src img
|
||||
{"depth", required_argument, 0, 'b'}, //bit depth of src img
|
||||
{"level", required_argument, 0, 'l'}, //level of dwt
|
||||
{"device", required_argument, 0, 'D'}, //cuda device
|
||||
{"forward", no_argument, 0, 'f'}, //forward transform
|
||||
{"reverse", no_argument, 0, 'r'}, //reverse transform
|
||||
{"97", no_argument, 0, '9'}, //9/7 transform
|
||||
{"53", no_argument, 0, '5' }, //5/3transform
|
||||
{"write-visual",no_argument, 0, 'w' }, //write output (subbands) in visual (tiled) order instead of linear
|
||||
{"help", no_argument, 0, 'h'}
|
||||
};
|
||||
|
||||
int pixWidth = 0; //<real pixWidth
|
||||
int pixHeight = 0; //<real pixHeight
|
||||
int compCount = 3; //number of components; 3 for RGB or YUV, 4 for RGBA
|
||||
int bitDepth = 8;
|
||||
int dwtLvls = 3; //default numuber of DWT levels
|
||||
int device = 0;
|
||||
int forward = 1; //forward transform
|
||||
int dwt97 = 1; //1=dwt9/7, 0=dwt5/3 transform
|
||||
int writeVisual = 0; //write output (subbands) in visual (tiled) order instead of linear
|
||||
char * pos;
|
||||
|
||||
while ((ch = getopt_long(argc, argv, "d:c:b:l:D:fr95wh", longopts, &optindex)) != -1) {
|
||||
switch (ch) {
|
||||
case 'd':
|
||||
pixWidth = atoi(optarg);
|
||||
pos = strstr(optarg, "x");
|
||||
if (pos == NULL || pixWidth == 0 || (strlen(pos) >= strlen(optarg))) {
|
||||
usage();
|
||||
return -1;
|
||||
}
|
||||
pixHeight = atoi(pos+1);
|
||||
break;
|
||||
case 'c':
|
||||
compCount = atoi(optarg);
|
||||
break;
|
||||
case 'b':
|
||||
bitDepth = atoi(optarg);
|
||||
break;
|
||||
case 'l':
|
||||
dwtLvls = atoi(optarg);
|
||||
break;
|
||||
case 'D':
|
||||
device = atoi(optarg);
|
||||
break;
|
||||
case 'f':
|
||||
forward = 1;
|
||||
break;
|
||||
case 'r':
|
||||
forward = 0;
|
||||
break;
|
||||
case '9':
|
||||
dwt97 = 1;
|
||||
break;
|
||||
case '5':
|
||||
dwt97 = 0;
|
||||
break;
|
||||
case 'w':
|
||||
writeVisual = 1;
|
||||
break;
|
||||
case 'h':
|
||||
usage();
|
||||
return 0;
|
||||
case '?':
|
||||
return -1;
|
||||
default :
|
||||
usage();
|
||||
return -1;
|
||||
}
|
||||
}
|
||||
argc -= optind;
|
||||
argv += optind;
|
||||
|
||||
if (argc == 0) { // at least one filename is expected
|
||||
printf("Please supply src file name\n");
|
||||
usage();
|
||||
return -1;
|
||||
}
|
||||
|
||||
if (pixWidth <= 0 || pixHeight <=0) {
|
||||
printf("Wrong or missing dimensions\n");
|
||||
usage();
|
||||
return -1;
|
||||
}
|
||||
|
||||
if (forward == 0) {
|
||||
writeVisual = 0; //do not write visual when RDWT
|
||||
}
|
||||
|
||||
// device init
|
||||
int devCount;
|
||||
cudaSetDevice(0);
|
||||
cudaGetDeviceCount(&devCount);
|
||||
cudaCheckError("Get device count");
|
||||
if (devCount == 0) {
|
||||
printf("No CUDA enabled device\n");
|
||||
return -1;
|
||||
}
|
||||
if (device < 0 || device > devCount -1) {
|
||||
printf("Selected device %d is out of bound. Devices on your system are in range %d - %d\n",
|
||||
device, 0, devCount -1);
|
||||
return -1;
|
||||
}
|
||||
cudaDeviceProp devProp;
|
||||
cudaGetDeviceProperties(&devProp, device);
|
||||
cudaCheckError("Get device properties");
|
||||
// if (devProp.major < 1) {
|
||||
// printf("Device %d does not support CUDA\n", device);
|
||||
// return -1;
|
||||
// }
|
||||
printf("Using device %d: %s\n", device, devProp.name);
|
||||
cudaSetDevice(device);
|
||||
cudaCheckError("Set selected device");
|
||||
|
||||
struct dwt *d;
|
||||
d = (struct dwt *)malloc(sizeof(struct dwt));
|
||||
d->srcImg = NULL;
|
||||
d->pixWidth = pixWidth;
|
||||
d->pixHeight = pixHeight;
|
||||
d->components = compCount;
|
||||
d->dwtLvls = dwtLvls;
|
||||
|
||||
// file names
|
||||
d->srcFilename = (char *)malloc(strlen(argv[0]));
|
||||
strcpy(d->srcFilename, argv[0]);
|
||||
if (argc == 1) { // only one filename supplyed
|
||||
d->outFilename = (char *)malloc(strlen(d->srcFilename)+4);
|
||||
strcpy(d->outFilename, d->srcFilename);
|
||||
strcpy(d->outFilename+strlen(d->srcFilename), ".dwt");
|
||||
} else {
|
||||
d->outFilename = strdup(argv[1]);
|
||||
}
|
||||
|
||||
//Input review
|
||||
printf("Source file:\t\t%s\n", d->srcFilename);
|
||||
printf(" Dimensions:\t\t%dx%d\n", pixWidth, pixHeight);
|
||||
printf(" Components count:\t%d\n", compCount);
|
||||
printf(" Bit depth:\t\t%d\n", bitDepth);
|
||||
printf(" DWT levels:\t\t%d\n", dwtLvls);
|
||||
printf(" Forward transform:\t%d\n", forward);
|
||||
printf(" 9/7 transform:\t\t%d\n", dwt97);
|
||||
|
||||
//data sizes
|
||||
int inputSize = pixWidth*pixHeight*compCount; //<amount of data (in bytes) to proccess
|
||||
|
||||
//load img source image
|
||||
cudaMallocHost((void **)&d->srcImg, inputSize);
|
||||
cudaCheckError("Alloc host memory");
|
||||
if (getImg(d->srcFilename, d->srcImg, inputSize) == -1)
|
||||
return -1;
|
||||
|
||||
/* DWT */
|
||||
if (forward == 1) {
|
||||
if(dwt97 == 1 )
|
||||
processDWT<float>(d, forward, writeVisual);
|
||||
else // 5/3
|
||||
processDWT<int>(d, forward, writeVisual);
|
||||
}
|
||||
else { // reverse
|
||||
if(dwt97 == 1 )
|
||||
processDWT<float>(d, forward, writeVisual);
|
||||
else // 5/3
|
||||
processDWT<int>(d, forward, writeVisual);
|
||||
}
|
||||
|
||||
//writeComponent(r_cuda, pixWidth, pixHeight, srcFilename, ".g");
|
||||
//writeComponent(g_wave_cuda, 512000, ".g");
|
||||
//writeComponent(g_cuda, componentSize, ".g");
|
||||
//writeComponent(b_wave_cuda, componentSize, ".b");
|
||||
cudaFreeHost(d->srcImg);
|
||||
cudaCheckError("Cuda free host");
|
||||
|
||||
return 0;
|
||||
}
|
|
@ -1,8 +0,0 @@
|
|||
# Runs ./dwt2d once: forward (-f) 5/3 (-5) transform with 3 levels (-l 3) on
# 4.bmp declared as 4x4, writing the result to z.dwt.
# The commented-out lines are alternative invocations kept for reference.
./dwt2d 4.bmp z.dwt -d 4x4 -f -5 -l 3
# ./dwt2d 8.bmp -d 8x8 -f -5 -l 3
# ./dwt2d 16.bmp -d 16x16 -f -5 -l 3
# ./dwt2d 64.bmp -d 64x64 -f -5 -l 3

# ./dwt2d 192.bmp -d 192x192 -f -5 -l 3
# ls
# ./dwt2d rgb.bmp -d 1024x1024 -f -5 -l 3
|
|
@ -1,7 +0,0 @@
|
|||
# Runs ./dwt2d once: reverse (-r) 5/3 (-5) transform with 3 levels (-l 3) on
# 4.bmp declared as 4x4; no output name is given, so dwt2d derives it.
# Every other line is a commented-out alternative invocation.
# ./dwt2d 192.bmp -d 192x192 -f -5 -l 3
# ls
# ./dwt2d rgb.bmp -d 1024x1024 -f -5 -l 3
# ./dwt2d 16.bmp -d 16x16 -f -9 -l 3\
./dwt2d 4.bmp -d 4x4 -r -5 -l 3
# ./dwt2d 4.bmp -d 4x4 -r -9 -l 3
# ./dwt2d 8.bmp -d 8x8 -f -9 -l 3
|
|
@ -1,14 +0,0 @@
|
|||
# Runs ./nvcc_dwt2d once: forward (-f) 5/3 (-5) transform with 3 levels (-l 3)
# on 4.bmp declared as 4x4.
# Every other line is a commented-out alternative invocation.
# ./nvcc_dwt2d 192.bmp -d 192x192 -f -5 -l 3
# ls
# ./nvcc_dwt2d rgb.bmp -d 1024x1024 -f -5 -l 3
# ./nvcc_dwt2d 4.bmp -d 4x4 -f -9 -l 3
./nvcc_dwt2d 4.bmp -d 4x4 -f -5 -l 3
# ./nvcc_dwt2d 8.bmp -d 8x8 -f -9 -l 3
# ./nvcc_dwt2d 16.bmp -d 16x16 -f -5 -l 3
# ./nvcc_dwt2d 16.bmp -d 16x16 -r -5 -l 3
# ./nvcc_dwt2d 16.bmp -d 16x16 -f -9 -l 3
# ./nvcc_dwt2d 4.bmp -d 4x4 -r -9 -l 3
# ./nvcc_dwt2d 64.bmp -d 64x64 -f -5 -l 3
# ./nvcc_dwt2d 192.bmp -d 192x192 -f -5 -l 3
# ls
# ./nvcc_dwt2d rgb.bmp -d 1024x1024 -f -5 -l 3
|
|
@ -1,51 +0,0 @@
|
|||
#!/bin/bash
# Builds the dwt2d binary from the .cu sources in four stages:
#   1. clang++ compiles every .cu file; -save-temps leaves the per-module
#      device (*-cuda-nvptx64-nvidia-cuda-sm_50.bc) and host
#      (*-host-x86_64-unknown-linux-gnu.bc) bitcode files behind.
#   2. kernelTranslator / hostTranslator are run over those bitcode files.
#   3. llc turns each translated bitcode file into a PIC object file.
#   4. g++ links the objects against libx86Runtime and libthreadPool.
#
# The shebang has to be the very first line of the file to take effect.
# Stop at the first failing step instead of cascading into a link error.
set -e

TRANSLATORS=../../build/compilation
DEVICE_SUFFIX=cuda-nvptx64-nvidia-cuda-sm_50
HOST_SUFFIX=host-x86_64-unknown-linux-gnu

# Modules that contribute both device and host code.
MODULES="common components dwt fdwt53 fdwt97 rdwt53 rdwt97"

clang++ -I. -I/include -fno-strict-aliasing dwt_cuda/fdwt53.cu dwt_cuda/fdwt97.cu dwt_cuda/common.cu dwt_cuda/rdwt97.cu dwt_cuda/rdwt53.cu components.cu dwt.cu main.cu -c --cuda-path=/usr/local/cuda-10.1 --cuda-gpu-arch=sm_50 -L/usr/local/cuda-10.1/lib64 -lcudart_static -ldl -lrt -pthread -save-temps -v

export LD_LIBRARY_PATH=../../build/runtime:../../build/runtime/threadPool:$LD_LIBRARY_PATH

for module in $MODULES; do
    "$TRANSLATORS/kernelTranslator" "$module-$DEVICE_SUFFIX.bc" "$module.bc"
    "$TRANSLATORS/hostTranslator" "$module-$HOST_SUFFIX.bc" "${module}_host.bc"
done
# main.cu only goes through the host translator; its output is host.bc.
"$TRANSLATORS/hostTranslator" "main-$HOST_SUFFIX.bc" host.bc

for module in $MODULES; do
    llc --relocation-model=pic --filetype=obj "$module.bc"
    llc --relocation-model=pic --filetype=obj "${module}_host.bc"
done
llc --relocation-model=pic --filetype=obj host.bc

g++ -g -Wall -L../../build/runtime -L../../build/runtime/threadPool -o dwt2d -fPIC -no-pie common.o components.o dwt.o fdwt53.o fdwt97.o rdwt97.o rdwt53.o host.o common_host.o components_host.o dwt_host.o fdwt53_host.o fdwt97_host.o rdwt97_host.o rdwt53_host.o -lc -lx86Runtime -lthreadPool -lpthread
|
|
@ -1,9 +0,0 @@
|
|||
# Builds nvcc_dwt2d: each .cu file is compiled to <file>.cu.o with nvcc
# (sm_50, -O2, -fno-strict-aliasing passed to the host compiler), then the
# objects are linked with g++ against libcudart.
NVCC=/usr/local/cuda/bin/nvcc
NVCC_FLAGS="-arch sm_50 -I. -I/include -O2 --compiler-options -fno-strict-aliasing"

OBJECTS=""
for src in main.cu dwt.cu components.cu dwt_cuda/fdwt53.cu dwt_cuda/fdwt97.cu dwt_cuda/common.cu dwt_cuda/rdwt97.cu dwt_cuda/rdwt53.cu; do
    # NVCC_FLAGS is left unquoted on purpose so it splits into separate flags.
    $NVCC $NVCC_FLAGS -c "$src" -o "$src.o"
    OBJECTS="$OBJECTS $src.o"
done

# OBJECTS is likewise unquoted so every object becomes its own argument.
g++ -fPIC -o nvcc_dwt2d $OBJECTS -L/usr/local/cuda/lib64 -lcudart
|
|
@ -1,396 +0,0 @@
|
|||
; ModuleID = 'gaussian-cuda-nvptx64-nvidia-cuda-sm_61.bc'
|
||||
source_filename = "gaussian.cu"
|
||||
target datalayout = "e-i64:64-i128:128-v16:16-v32:32-n16:32:64"
|
||||
target triple = "nvptx64-nvidia-cuda"
|
||||
|
||||
%struct.__cuda_builtin_threadIdx_t = type { i8 }
|
||||
%struct.__cuda_builtin_blockIdx_t = type { i8 }
|
||||
%struct.__cuda_builtin_blockDim_t = type { i8 }
|
||||
%struct.cudaFuncAttributes = type { i64, i64, i64, i32, i32, i32, i32, i32, i32, i32 }
|
||||
|
||||
$_ZN26__cuda_builtin_threadIdx_t17__fetch_builtin_xEv = comdat any
|
||||
|
||||
$_ZN25__cuda_builtin_blockIdx_t17__fetch_builtin_xEv = comdat any
|
||||
|
||||
$_ZN25__cuda_builtin_blockDim_t17__fetch_builtin_xEv = comdat any
|
||||
|
||||
$_ZN26__cuda_builtin_threadIdx_t17__fetch_builtin_yEv = comdat any
|
||||
|
||||
$_ZN25__cuda_builtin_blockIdx_t17__fetch_builtin_yEv = comdat any
|
||||
|
||||
$_ZN25__cuda_builtin_blockDim_t17__fetch_builtin_yEv = comdat any
|
||||
|
||||
@threadIdx = extern_weak dso_local addrspace(1) global %struct.__cuda_builtin_threadIdx_t, align 1
|
||||
@blockIdx = extern_weak dso_local addrspace(1) global %struct.__cuda_builtin_blockIdx_t, align 1
|
||||
@blockDim = extern_weak dso_local addrspace(1) global %struct.__cuda_builtin_blockDim_t, align 1
|
||||
|
||||
; Weak definition of cudaMalloc in the device module: it only spills its two
; arguments to stack slots and returns the constant 999. Nothing is allocated.
; Function Attrs: convergent noinline nounwind optnone
define weak dso_local i32 @cudaMalloc(i8** %p, i64 %s) #0 {
entry:
  %p.addr = alloca i8**, align 8
  %s.addr = alloca i64, align 8
  store i8** %p, i8*** %p.addr, align 8
  store i64 %s, i64* %s.addr, align 8
  ret i32 999
}
|
||||
|
||||
; Weak definition of cudaFuncGetAttributes in the device module: spills both
; arguments and returns the constant 999; the attributes struct is not written.
; Function Attrs: convergent noinline nounwind optnone
define weak dso_local i32 @cudaFuncGetAttributes(%struct.cudaFuncAttributes* %p, i8* %c) #0 {
entry:
  %p.addr = alloca %struct.cudaFuncAttributes*, align 8
  %c.addr = alloca i8*, align 8
  store %struct.cudaFuncAttributes* %p, %struct.cudaFuncAttributes** %p.addr, align 8
  store i8* %c, i8** %c.addr, align 8
  ret i32 999
}
|
||||
|
||||
; Weak definition of cudaDeviceGetAttribute in the device module: spills its
; three arguments and returns the constant 999; *value is not written.
; Function Attrs: convergent noinline nounwind optnone
define weak dso_local i32 @cudaDeviceGetAttribute(i32* %value, i32 %attr, i32 %device) #0 {
entry:
  %value.addr = alloca i32*, align 8
  %attr.addr = alloca i32, align 4
  %device.addr = alloca i32, align 4
  store i32* %value, i32** %value.addr, align 8
  store i32 %attr, i32* %attr.addr, align 4
  store i32 %device, i32* %device.addr, align 4
  ret i32 999
}
|
||||
|
||||
; Weak definition of cudaGetDevice in the device module: spills its argument
; and returns the constant 999; *device is not written.
; Function Attrs: convergent noinline nounwind optnone
define weak dso_local i32 @cudaGetDevice(i32* %device) #0 {
entry:
  %device.addr = alloca i32*, align 8
  store i32* %device, i32** %device.addr, align 8
  ret i32 999
}
|
||||
|
||||
; Weak definition of cudaOccupancyMaxActiveBlocksPerMultiprocessor in the
; device module: spills its four arguments and returns the constant 999;
; *numBlocks is not written.
; Function Attrs: convergent noinline nounwind optnone
define weak dso_local i32 @cudaOccupancyMaxActiveBlocksPerMultiprocessor(i32* %numBlocks, i8* %func, i32 %blockSize, i64 %dynamicSmemSize) #0 {
entry:
  %numBlocks.addr = alloca i32*, align 8
  %func.addr = alloca i8*, align 8
  %blockSize.addr = alloca i32, align 4
  %dynamicSmemSize.addr = alloca i64, align 8
  store i32* %numBlocks, i32** %numBlocks.addr, align 8
  store i8* %func, i8** %func.addr, align 8
  store i32 %blockSize, i32* %blockSize.addr, align 4
  store i64 %dynamicSmemSize, i64* %dynamicSmemSize.addr, align 8
  ret i32 999
}
|
||||
|
||||
; Weak definition of cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags in
; the device module: spills its five arguments and returns the constant 999;
; *numBlocks is not written.
; Function Attrs: convergent noinline nounwind optnone
define weak dso_local i32 @cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags(i32* %numBlocks, i8* %func, i32 %blockSize, i64 %dynamicSmemSize, i32 %flags) #0 {
entry:
  %numBlocks.addr = alloca i32*, align 8
  %func.addr = alloca i8*, align 8
  %blockSize.addr = alloca i32, align 4
  %dynamicSmemSize.addr = alloca i64, align 8
  %flags.addr = alloca i32, align 4
  store i32* %numBlocks, i32** %numBlocks.addr, align 8
  store i8* %func, i8** %func.addr, align 8
  store i32 %blockSize, i32* %blockSize.addr, align 4
  store i64 %dynamicSmemSize, i64* %dynamicSmemSize.addr, align 8
  store i32 %flags, i32* %flags.addr, align 4
  ret i32 999
}
|
||||
|
||||
; Kernel Fan1(float *m_cuda, float *a_cuda, int Size, int t), unoptimized.
; With gid = threadIdx.x + blockIdx.x * blockDim.x and row = gid + t + 1:
;   if (gid >= Size - 1 - t) return;          (unsigned comparison)
;   m_cuda[Size * row + t] = a_cuda[Size * row + t] / a_cuda[Size * t + t];
; Function Attrs: convergent noinline nounwind optnone
define dso_local void @_Z4Fan1PfS_ii(float* %m_cuda, float* %a_cuda, i32 %Size, i32 %t) #0 {
entry:
  %m_cuda.addr = alloca float*, align 8
  %a_cuda.addr = alloca float*, align 8
  %Size.addr = alloca i32, align 4
  %t.addr = alloca i32, align 4
  store float* %m_cuda, float** %m_cuda.addr, align 8
  store float* %a_cuda, float** %a_cuda.addr, align 8
  store i32 %Size, i32* %Size.addr, align 4
  store i32 %t, i32* %t.addr, align 4
  ; gid = threadIdx.x + blockIdx.x * blockDim.x
  %call = call i32 @_ZN26__cuda_builtin_threadIdx_t17__fetch_builtin_xEv() #3
  %call1 = call i32 @_ZN25__cuda_builtin_blockIdx_t17__fetch_builtin_xEv() #3
  %call2 = call i32 @_ZN25__cuda_builtin_blockDim_t17__fetch_builtin_xEv() #3
  %mul = mul i32 %call1, %call2
  %add = add i32 %call, %mul
  ; bounds check against Size - 1 - t
  %0 = load i32, i32* %Size.addr, align 4
  %sub = sub nsw i32 %0, 1
  %1 = load i32, i32* %t.addr, align 4
  %sub3 = sub nsw i32 %sub, %1
  %cmp = icmp uge i32 %add, %sub3
  br i1 %cmp, label %if.then, label %if.end

if.then: ; preds = %entry
  br label %return

if.end: ; preds = %entry
  ; numerator: a_cuda[Size * (gid + t + 1) + t]
  %2 = load float*, float** %a_cuda.addr, align 8
  %3 = load i32, i32* %Size.addr, align 4
  %call4 = call i32 @_ZN25__cuda_builtin_blockDim_t17__fetch_builtin_xEv() #3
  %call5 = call i32 @_ZN25__cuda_builtin_blockIdx_t17__fetch_builtin_xEv() #3
  %mul6 = mul i32 %call4, %call5
  %call7 = call i32 @_ZN26__cuda_builtin_threadIdx_t17__fetch_builtin_xEv() #3
  %add8 = add i32 %mul6, %call7
  %4 = load i32, i32* %t.addr, align 4
  %add9 = add i32 %add8, %4
  %add10 = add i32 %add9, 1
  %mul11 = mul i32 %3, %add10
  %idx.ext = zext i32 %mul11 to i64
  %add.ptr = getelementptr inbounds float, float* %2, i64 %idx.ext
  %5 = load i32, i32* %t.addr, align 4
  %idx.ext12 = sext i32 %5 to i64
  %add.ptr13 = getelementptr inbounds float, float* %add.ptr, i64 %idx.ext12
  %6 = load float, float* %add.ptr13, align 4
  ; denominator: a_cuda[Size * t + t]
  %7 = load float*, float** %a_cuda.addr, align 8
  %8 = load i32, i32* %Size.addr, align 4
  %9 = load i32, i32* %t.addr, align 4
  %mul14 = mul nsw i32 %8, %9
  %idx.ext15 = sext i32 %mul14 to i64
  %add.ptr16 = getelementptr inbounds float, float* %7, i64 %idx.ext15
  %10 = load i32, i32* %t.addr, align 4
  %idx.ext17 = sext i32 %10 to i64
  %add.ptr18 = getelementptr inbounds float, float* %add.ptr16, i64 %idx.ext17
  %11 = load float, float* %add.ptr18, align 4
  %div = fdiv float %6, %11
  ; destination: m_cuda[Size * (gid + t + 1) + t]
  %12 = load float*, float** %m_cuda.addr, align 8
  %13 = load i32, i32* %Size.addr, align 4
  %call19 = call i32 @_ZN25__cuda_builtin_blockDim_t17__fetch_builtin_xEv() #3
  %call20 = call i32 @_ZN25__cuda_builtin_blockIdx_t17__fetch_builtin_xEv() #3
  %mul21 = mul i32 %call19, %call20
  %call22 = call i32 @_ZN26__cuda_builtin_threadIdx_t17__fetch_builtin_xEv() #3
  %add23 = add i32 %mul21, %call22
  %14 = load i32, i32* %t.addr, align 4
  %add24 = add i32 %add23, %14
  %add25 = add i32 %add24, 1
  %mul26 = mul i32 %13, %add25
  %idx.ext27 = zext i32 %mul26 to i64
  %add.ptr28 = getelementptr inbounds float, float* %12, i64 %idx.ext27
  %15 = load i32, i32* %t.addr, align 4
  %idx.ext29 = sext i32 %15 to i64
  %add.ptr30 = getelementptr inbounds float, float* %add.ptr28, i64 %idx.ext29
  store float %div, float* %add.ptr30, align 4
  br label %return

return: ; preds = %if.end, %if.then
  ret void
}
|
||||
|
||||
; threadIdx.x accessor: returns the value of the PTX %tid.x special register.
; Function Attrs: alwaysinline convergent nounwind
define linkonce_odr dso_local i32 @_ZN26__cuda_builtin_threadIdx_t17__fetch_builtin_xEv() #1 comdat align 2 {
entry:
  %0 = call i32 @llvm.nvvm.read.ptx.sreg.tid.x()
  ret i32 %0
}
|
||||
|
||||
; blockIdx.x accessor: returns the value of the PTX %ctaid.x special register.
; Function Attrs: alwaysinline convergent nounwind
define linkonce_odr dso_local i32 @_ZN25__cuda_builtin_blockIdx_t17__fetch_builtin_xEv() #1 comdat align 2 {
entry:
  %0 = call i32 @llvm.nvvm.read.ptx.sreg.ctaid.x()
  ret i32 %0
}
|
||||
|
||||
; blockDim.x accessor: returns the value of the PTX %ntid.x special register.
; Function Attrs: alwaysinline convergent nounwind
define linkonce_odr dso_local i32 @_ZN25__cuda_builtin_blockDim_t17__fetch_builtin_xEv() #1 comdat align 2 {
entry:
  %0 = call i32 @llvm.nvvm.read.ptx.sreg.ntid.x()
  ret i32 %0
}
|
||||
|
||||
; Kernel Fan2(float *m_cuda, float *a_cuda, float *b_cuda, int Size, int j1,
; int t), unoptimized. With xidx / yidx the global thread index along x / y:
;   if (xidx >= Size - 1 - t) return;         (unsigned comparison)
;   if (yidx >= Size - t) return;             (unsigned comparison)
;   a_cuda[Size * (xidx + 1 + t) + (yidx + t)] -=
;       m_cuda[Size * (xidx + 1 + t) + t] * a_cuda[Size * t + (yidx + t)];
;   if (yidx == 0)
;     b_cuda[xidx + 1 + t] -=
;         m_cuda[Size * (xidx + 1 + t) + (yidx + t)] * b_cuda[t];
; %j1 is stored to its stack slot but never loaded again.
; Function Attrs: convergent noinline nounwind optnone
define dso_local void @_Z4Fan2PfS_S_iii(float* %m_cuda, float* %a_cuda, float* %b_cuda, i32 %Size, i32 %j1, i32 %t) #0 {
entry:
  %m_cuda.addr = alloca float*, align 8
  %a_cuda.addr = alloca float*, align 8
  %b_cuda.addr = alloca float*, align 8
  %Size.addr = alloca i32, align 4
  %j1.addr = alloca i32, align 4
  %t.addr = alloca i32, align 4
  %xidx = alloca i32, align 4
  %yidx = alloca i32, align 4
  store float* %m_cuda, float** %m_cuda.addr, align 8
  store float* %a_cuda, float** %a_cuda.addr, align 8
  store float* %b_cuda, float** %b_cuda.addr, align 8
  store i32 %Size, i32* %Size.addr, align 4
  store i32 %j1, i32* %j1.addr, align 4
  store i32 %t, i32* %t.addr, align 4
  ; x bounds check: threadIdx.x + blockIdx.x * blockDim.x >= Size - 1 - t
  %call = call i32 @_ZN26__cuda_builtin_threadIdx_t17__fetch_builtin_xEv() #3
  %call1 = call i32 @_ZN25__cuda_builtin_blockIdx_t17__fetch_builtin_xEv() #3
  %call2 = call i32 @_ZN25__cuda_builtin_blockDim_t17__fetch_builtin_xEv() #3
  %mul = mul i32 %call1, %call2
  %add = add i32 %call, %mul
  %0 = load i32, i32* %Size.addr, align 4
  %sub = sub nsw i32 %0, 1
  %1 = load i32, i32* %t.addr, align 4
  %sub3 = sub nsw i32 %sub, %1
  %cmp = icmp uge i32 %add, %sub3
  br i1 %cmp, label %if.then, label %if.end

if.then: ; preds = %entry
  br label %if.end58

if.end: ; preds = %entry
  ; y bounds check: threadIdx.y + blockIdx.y * blockDim.y >= Size - t
  %call4 = call i32 @_ZN26__cuda_builtin_threadIdx_t17__fetch_builtin_yEv() #3
  %call5 = call i32 @_ZN25__cuda_builtin_blockIdx_t17__fetch_builtin_yEv() #3
  %call6 = call i32 @_ZN25__cuda_builtin_blockDim_t17__fetch_builtin_yEv() #3
  %mul7 = mul i32 %call5, %call6
  %add8 = add i32 %call4, %mul7
  %2 = load i32, i32* %Size.addr, align 4
  %3 = load i32, i32* %t.addr, align 4
  %sub9 = sub nsw i32 %2, %3
  %cmp10 = icmp uge i32 %add8, %sub9
  br i1 %cmp10, label %if.then11, label %if.end12

if.then11: ; preds = %if.end
  br label %if.end58

if.end12: ; preds = %if.end
  ; xidx = blockIdx.x * blockDim.x + threadIdx.x
  %call13 = call i32 @_ZN25__cuda_builtin_blockIdx_t17__fetch_builtin_xEv() #3
  %call14 = call i32 @_ZN25__cuda_builtin_blockDim_t17__fetch_builtin_xEv() #3
  %mul15 = mul i32 %call13, %call14
  %call16 = call i32 @_ZN26__cuda_builtin_threadIdx_t17__fetch_builtin_xEv() #3
  %add17 = add i32 %mul15, %call16
  store i32 %add17, i32* %xidx, align 4
  ; yidx = blockIdx.y * blockDim.y + threadIdx.y
  %call18 = call i32 @_ZN25__cuda_builtin_blockIdx_t17__fetch_builtin_yEv() #3
  %call19 = call i32 @_ZN25__cuda_builtin_blockDim_t17__fetch_builtin_yEv() #3
  %mul20 = mul i32 %call18, %call19
  %call21 = call i32 @_ZN26__cuda_builtin_threadIdx_t17__fetch_builtin_yEv() #3
  %add22 = add i32 %mul20, %call21
  store i32 %add22, i32* %yidx, align 4
  ; multiplier: m_cuda[Size * (xidx + 1 + t) + t]
  %4 = load float*, float** %m_cuda.addr, align 8
  %5 = load i32, i32* %Size.addr, align 4
  %6 = load i32, i32* %xidx, align 4
  %add23 = add nsw i32 %6, 1
  %7 = load i32, i32* %t.addr, align 4
  %add24 = add nsw i32 %add23, %7
  %mul25 = mul nsw i32 %5, %add24
  %8 = load i32, i32* %t.addr, align 4
  %add26 = add nsw i32 %mul25, %8
  %idxprom = sext i32 %add26 to i64
  %arrayidx = getelementptr inbounds float, float* %4, i64 %idxprom
  %9 = load float, float* %arrayidx, align 4
  ; row t element: a_cuda[Size * t + (yidx + t)]
  %10 = load float*, float** %a_cuda.addr, align 8
  %11 = load i32, i32* %Size.addr, align 4
  %12 = load i32, i32* %t.addr, align 4
  %mul27 = mul nsw i32 %11, %12
  %13 = load i32, i32* %yidx, align 4
  %14 = load i32, i32* %t.addr, align 4
  %add28 = add nsw i32 %13, %14
  %add29 = add nsw i32 %mul27, %add28
  %idxprom30 = sext i32 %add29 to i64
  %arrayidx31 = getelementptr inbounds float, float* %10, i64 %idxprom30
  %15 = load float, float* %arrayidx31, align 4
  %mul32 = fmul contract float %9, %15
  ; updated element: a_cuda[Size * (xidx + 1 + t) + (yidx + t)] -= product
  %16 = load float*, float** %a_cuda.addr, align 8
  %17 = load i32, i32* %Size.addr, align 4
  %18 = load i32, i32* %xidx, align 4
  %add33 = add nsw i32 %18, 1
  %19 = load i32, i32* %t.addr, align 4
  %add34 = add nsw i32 %add33, %19
  %mul35 = mul nsw i32 %17, %add34
  %20 = load i32, i32* %yidx, align 4
  %21 = load i32, i32* %t.addr, align 4
  %add36 = add nsw i32 %20, %21
  %add37 = add nsw i32 %mul35, %add36
  %idxprom38 = sext i32 %add37 to i64
  %arrayidx39 = getelementptr inbounds float, float* %16, i64 %idxprom38
  %22 = load float, float* %arrayidx39, align 4
  %sub40 = fsub contract float %22, %mul32
  store float %sub40, float* %arrayidx39, align 4
  ; only threads with yidx == 0 go on to update b_cuda
  %23 = load i32, i32* %yidx, align 4
  %cmp41 = icmp eq i32 %23, 0
  br i1 %cmp41, label %if.then42, label %if.end58

if.then42: ; preds = %if.end12
  ; multiplier: m_cuda[Size * (xidx + 1 + t) + (yidx + t)]
  %24 = load float*, float** %m_cuda.addr, align 8
  %25 = load i32, i32* %Size.addr, align 4
  %26 = load i32, i32* %xidx, align 4
  %add43 = add nsw i32 %26, 1
  %27 = load i32, i32* %t.addr, align 4
  %add44 = add nsw i32 %add43, %27
  %mul45 = mul nsw i32 %25, %add44
  %28 = load i32, i32* %yidx, align 4
  %29 = load i32, i32* %t.addr, align 4
  %add46 = add nsw i32 %28, %29
  %add47 = add nsw i32 %mul45, %add46
  %idxprom48 = sext i32 %add47 to i64
  %arrayidx49 = getelementptr inbounds float, float* %24, i64 %idxprom48
  %30 = load float, float* %arrayidx49, align 4
  ; b_cuda[t]
  %31 = load float*, float** %b_cuda.addr, align 8
  %32 = load i32, i32* %t.addr, align 4
  %idxprom50 = sext i32 %32 to i64
  %arrayidx51 = getelementptr inbounds float, float* %31, i64 %idxprom50
  %33 = load float, float* %arrayidx51, align 4
  %mul52 = fmul contract float %30, %33
  ; b_cuda[xidx + 1 + t] -= product
  %34 = load float*, float** %b_cuda.addr, align 8
  %35 = load i32, i32* %xidx, align 4
  %add53 = add nsw i32 %35, 1
  %36 = load i32, i32* %t.addr, align 4
  %add54 = add nsw i32 %add53, %36
  %idxprom55 = sext i32 %add54 to i64
  %arrayidx56 = getelementptr inbounds float, float* %34, i64 %idxprom55
  %37 = load float, float* %arrayidx56, align 4
  %sub57 = fsub contract float %37, %mul52
  store float %sub57, float* %arrayidx56, align 4
  br label %if.end58

if.end58: ; preds = %if.then, %if.then11, %if.then42, %if.end12
  ret void
}
|
||||
|
||||
; threadIdx.y accessor: returns the value of the PTX %tid.y special register.
; Function Attrs: alwaysinline convergent nounwind
define linkonce_odr dso_local i32 @_ZN26__cuda_builtin_threadIdx_t17__fetch_builtin_yEv() #1 comdat align 2 {
entry:
  %0 = call i32 @llvm.nvvm.read.ptx.sreg.tid.y()
  ret i32 %0
}
|
||||
|
||||
; blockIdx.y accessor: returns the value of the PTX %ctaid.y special register.
; Function Attrs: alwaysinline convergent nounwind
define linkonce_odr dso_local i32 @_ZN25__cuda_builtin_blockIdx_t17__fetch_builtin_yEv() #1 comdat align 2 {
entry:
  %0 = call i32 @llvm.nvvm.read.ptx.sreg.ctaid.y()
  ret i32 %0
}
|
||||
|
||||
; blockDim.y accessor: returns the value of the PTX %ntid.y special register.
; Function Attrs: alwaysinline convergent nounwind
define linkonce_odr dso_local i32 @_ZN25__cuda_builtin_blockDim_t17__fetch_builtin_yEv() #1 comdat align 2 {
entry:
  %0 = call i32 @llvm.nvvm.read.ptx.sreg.ntid.y()
  ret i32 %0
}
|
||||
|
||||
; Function Attrs: nounwind readnone
|
||||
declare i32 @llvm.nvvm.read.ptx.sreg.tid.x() #2
|
||||
|
||||
; Function Attrs: nounwind readnone
|
||||
declare i32 @llvm.nvvm.read.ptx.sreg.ctaid.x() #2
|
||||
|
||||
; Function Attrs: nounwind readnone
|
||||
declare i32 @llvm.nvvm.read.ptx.sreg.ntid.x() #2
|
||||
|
||||
; Function Attrs: nounwind readnone
|
||||
declare i32 @llvm.nvvm.read.ptx.sreg.tid.y() #2
|
||||
|
||||
; Function Attrs: nounwind readnone
|
||||
declare i32 @llvm.nvvm.read.ptx.sreg.ctaid.y() #2
|
||||
|
||||
; Function Attrs: nounwind readnone
|
||||
declare i32 @llvm.nvvm.read.ptx.sreg.ntid.y() #2
|
||||
|
||||
attributes #0 = { convergent noinline nounwind optnone "correctly-rounded-divide-sqrt-fp-math"="false" "disable-tail-calls"="false" "frame-pointer"="all" "less-precise-fpmad"="false" "min-legal-vector-width"="0" "no-infs-fp-math"="false" "no-jump-tables"="false" "no-nans-fp-math"="false" "no-signed-zeros-fp-math"="false" "no-trapping-math"="false" "stack-protector-buffer-size"="8" "target-cpu"="sm_61" "target-features"="+ptx64,+sm_61" "unsafe-fp-math"="false" "use-soft-float"="false" }
|
||||
attributes #1 = { alwaysinline convergent nounwind "correctly-rounded-divide-sqrt-fp-math"="false" "disable-tail-calls"="false" "frame-pointer"="all" "less-precise-fpmad"="false" "min-legal-vector-width"="0" "no-infs-fp-math"="false" "no-jump-tables"="false" "no-nans-fp-math"="false" "no-signed-zeros-fp-math"="false" "no-trapping-math"="false" "stack-protector-buffer-size"="8" "target-cpu"="sm_61" "target-features"="+ptx64,+sm_61" "unsafe-fp-math"="false" "use-soft-float"="false" }
|
||||
attributes #2 = { nounwind readnone }
|
||||
attributes #3 = { convergent nounwind }
|
||||
|
||||
!llvm.module.flags = !{!0, !1, !2}
|
||||
!nvvm.annotations = !{!3, !4, !5, !6, !5, !7, !7, !7, !7, !8, !8, !7}
|
||||
!llvm.ident = !{!9}
|
||||
!nvvmir.version = !{!10}
|
||||
|
||||
!0 = !{i32 2, !"SDK Version", [2 x i32] [i32 10, i32 1]}
|
||||
!1 = !{i32 1, !"wchar_size", i32 4}
|
||||
!2 = !{i32 4, !"nvvm-reflect-ftz", i32 0}
|
||||
!3 = !{void (float*, float*, i32, i32)* @_Z4Fan1PfS_ii, !"kernel", i32 1}
|
||||
!4 = !{void (float*, float*, float*, i32, i32, i32)* @_Z4Fan2PfS_S_iii, !"kernel", i32 1}
|
||||
!5 = !{null, !"align", i32 8}
|
||||
!6 = !{null, !"align", i32 8, !"align", i32 65544, !"align", i32 131080}
|
||||
!7 = !{null, !"align", i32 16}
|
||||
!8 = !{null, !"align", i32 16, !"align", i32 65552, !"align", i32 131088}
|
||||
!9 = !{!"clang version 10.0.1 (https://github.com/llvm/llvm-project.git ef32c611aa214dea855364efd7ba451ec5ec3f74)"}
|
||||
!10 = !{i32 1, i32 4}
|
File diff suppressed because one or more lines are too long
|
@ -1,522 +0,0 @@
|
|||
/*-----------------------------------------------------------
|
||||
** gaussian.cu -- The program is to solve a linear system Ax = b
|
||||
** by using Gaussian Elimination. The algorithm on page 101
|
||||
** ("Foundations of Parallel Programming") is used.
|
||||
** The sequential version is gaussian.c. This parallel
|
||||
** implementation converts three independent for() loops
|
||||
** into three Fans. Use the data file ge_3.dat to verify
|
||||
** the correctness of the output.
|
||||
**
|
||||
** Written by Andreas Kura, 02/15/95
|
||||
** Modified by Chong-wei Xu, 04/20/95
|
||||
** Modified by Chris Gregg for CUDA, 07/20/2009
|
||||
**-----------------------------------------------------------
|
||||
*/
|
||||
#include "cuda_runtime.h"
|
||||
#include <math.h>
|
||||
#include <stdio.h>
|
||||
#include <stdlib.h>
|
||||
#include <string.h>
|
||||
#include <sys/time.h>
|
||||
|
||||
#ifdef TIMING
|
||||
#include "timing.h"
|
||||
#endif
|
||||
|
||||
#ifdef RD_WG_SIZE_0_0
|
||||
#define MAXBLOCKSIZE RD_WG_SIZE_0_0
|
||||
#elif defined(RD_WG_SIZE_0)
|
||||
#define MAXBLOCKSIZE RD_WG_SIZE_0
|
||||
#elif defined(RD_WG_SIZE)
|
||||
#define MAXBLOCKSIZE RD_WG_SIZE
|
||||
#else
|
||||
#define MAXBLOCKSIZE 512
|
||||
#endif
|
||||
|
||||
// 2D defines. Go from specific to general
|
||||
#ifdef RD_WG_SIZE_1_0
|
||||
#define BLOCK_SIZE_XY RD_WG_SIZE_1_0
|
||||
#elif defined(RD_WG_SIZE_1)
|
||||
#define BLOCK_SIZE_XY RD_WG_SIZE_1
|
||||
#elif defined(RD_WG_SIZE)
|
||||
#define BLOCK_SIZE_XY RD_WG_SIZE
|
||||
#else
|
||||
#define BLOCK_SIZE_XY 1
|
||||
#endif
|
||||
|
||||
#ifdef TIMING
|
||||
struct timeval tv;
|
||||
struct timeval tv_total_start, tv_total_end;
|
||||
struct timeval tv_h2d_start, tv_h2d_end;
|
||||
struct timeval tv_d2h_start, tv_d2h_end;
|
||||
struct timeval tv_kernel_start, tv_kernel_end;
|
||||
struct timeval tv_mem_alloc_start, tv_mem_alloc_end;
|
||||
struct timeval tv_close_start, tv_close_end;
|
||||
float init_time = 0, mem_alloc_time = 0, h2d_time = 0, kernel_time = 0,
|
||||
d2h_time = 0, close_time = 0, total_time = 0;
|
||||
#endif
|
||||
|
||||
int Size;
|
||||
float *a, *b, *finalVec;
|
||||
float *m;
|
||||
|
||||
FILE *fp;
|
||||
|
||||
void InitProblemOnce(char *filename);
|
||||
void InitPerRun();
|
||||
void ForwardSub();
|
||||
void BackSub();
|
||||
__global__ void Fan1(float *m, float *a, int Size, int t);
|
||||
__global__ void Fan2(float *m, float *a, float *b, int Size, int j1, int t);
|
||||
void InitMat(float *ary, int nrow, int ncol);
|
||||
void InitAry(float *ary, int ary_size);
|
||||
void PrintMat(float *ary, int nrow, int ncolumn);
|
||||
void PrintAry(float *ary, int ary_size);
|
||||
void PrintDeviceProperties();
|
||||
void checkCUDAError(const char *msg);
|
||||
|
||||
unsigned int totalKernelTime = 0;
|
||||
|
||||
// create both matrix and right hand side, Ke Wang 2013/08/12 11:51:06
|
||||
void create_matrix(float *m, int size) {
|
||||
int i, j;
|
||||
float lamda = -0.01;
|
||||
float coe[2 * size - 1];
|
||||
float coe_i = 0.0;
|
||||
|
||||
for (i = 0; i < size; i++) {
|
||||
coe_i = 10 * exp(lamda * i);
|
||||
j = size - 1 + i;
|
||||
coe[j] = coe_i;
|
||||
j = size - 1 - i;
|
||||
coe[j] = coe_i;
|
||||
}
|
||||
|
||||
for (i = 0; i < size; i++) {
|
||||
for (j = 0; j < size; j++) {
|
||||
m[i * size + j] = coe[size - 1 - i + j];
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
/*------------------------------------------------------
 ** main() -- Parse the command line, load or generate
 ** the linear system, run the GPU forward elimination
 ** and the host back substitution, and report timings.
 **
 ** Options:
 **   -s size      generate a size x size matrix and an
 **                all-ones right-hand side
 **   -f filename  read the system from a file
 **   -q           quiet: skip printing the matrices and
 **                the solution
 **------------------------------------------------------
 */
int main(int argc, char *argv[]) {
  printf("WG size of kernel 1 = %d, WG size of kernel 2= %d X %d\n",
         MAXBLOCKSIZE, BLOCK_SIZE_XY, BLOCK_SIZE_XY);
  int verbose = 1;
  int i, j;
  char flag;
  if (argc < 2) {
    printf("Usage: gaussian -f filename / -s size [-q]\n\n");
    printf("-q (quiet) suppresses printing the matrix and result values.\n");
    printf("-f (filename) path of input file\n");
    printf(
        "-s (size) size of matrix. Create matrix and rhs in this program \n");
    printf(
        "The first line of the file contains the dimension of the matrix, n.");
    printf("The second line of the file is a newline.\n");
    printf("The next n lines contain n tab separated values for the matrix.");
    printf("The next line of the file is a newline.\n");
    printf("The next line of the file is a 1xn vector with tab separated "
           "values.\n");
    printf("The next line of the file is a newline. (optional)\n");
    printf("The final line of the file is the pre-computed solution. "
           "(optional)\n");
    printf("Example: matrix4.txt:\n");
    printf("4\n");
    printf("\n");
    printf("-0.6 -0.5 0.7 0.3\n");
    printf("-0.3 -0.9 0.3 0.7\n");
    printf("-0.4 -0.5 -0.3 -0.8\n");
    printf("0.0 -0.1 0.2 0.9\n");
    printf("\n");
    printf("-0.85 -0.68 0.24 -0.53\n");
    printf("\n");
    printf("0.7 0.0 -0.4 -0.5\n");
    exit(0);
  }

  cudaSetDevice(0);

  PrintDeviceProperties();

  for (i = 1; i < argc; i++) {
    if (argv[i][0] == '-') { // flag
      flag = argv[i][1];
      switch (flag) {
      case 's': // generate the system internally
        // The size is the next argument; make sure it exists.
        if (i + 1 >= argc) {
          fprintf(stderr, "Option -s requires a size argument\n");
          exit(EXIT_FAILURE);
        }
        i++;
        Size = atoi(argv[i]);
        if (Size <= 0) {
          fprintf(stderr, "Invalid matrix size: %s\n", argv[i]);
          exit(EXIT_FAILURE);
        }
        printf("Create matrix internally in parse, size = %d \n", Size);

        a = (float *)malloc((size_t)Size * Size * sizeof(float));
        b = (float *)malloc((size_t)Size * sizeof(float));
        m = (float *)malloc((size_t)Size * Size * sizeof(float));
        if (a == NULL || b == NULL || m == NULL) {
          fprintf(stderr, "Out of memory for a system of size %d\n", Size);
          exit(EXIT_FAILURE);
        }

        create_matrix(a, Size);
        for (j = 0; j < Size; j++)
          b[j] = 1.0;
        break;
      case 'f': // read the system from a file
        if (i + 1 >= argc) {
          fprintf(stderr, "Option -f requires a file name\n");
          exit(EXIT_FAILURE);
        }
        i++;
        printf("Read file from %s \n", argv[i]);
        InitProblemOnce(argv[i]);
        break;
      case 'q': // quiet
        verbose = 0;
        break;
      default:
        fprintf(stderr, "Ignoring unknown option %s\n", argv[i]);
        break;
      }
    }
  }

  // Neither -s nor -f was given: there is no system to solve.
  if (a == NULL || b == NULL || m == NULL) {
    fprintf(stderr, "No input given: use -f filename or -s size\n");
    exit(EXIT_FAILURE);
  }

  InitPerRun();
  // begin timing
  struct timeval time_start;
  gettimeofday(&time_start, NULL);

  // run kernels
  ForwardSub();

  // end timing
  struct timeval time_end;
  gettimeofday(&time_end, NULL);
  unsigned int time_total = (time_end.tv_sec * 1000000 + time_end.tv_usec) -
                            (time_start.tv_sec * 1000000 + time_start.tv_usec);

  if (verbose) {
    printf("Matrix m is: \n");
    PrintMat(m, Size, Size);

    printf("Matrix a is: \n");
    PrintMat(a, Size, Size);

    printf("Array b is: \n");
    PrintAry(b, Size);
  }
  BackSub();
  if (verbose) {
    printf("The final solution is: \n");
    PrintAry(finalVec, Size);
  }
  printf("\nTime total (including memory transfers)\t%f sec\n",
         time_total * 1e-6);
  printf("Time for CUDA kernels:\t%f sec\n", totalKernelTime * 1e-6);

  free(m);
  free(a);
  free(b);
  free(finalVec); // allocated by BackSub()

#ifdef TIMING
  printf("Exec: %f\n", kernel_time);
#endif
  return 0;
}
|
||||
/*------------------------------------------------------
 ** PrintDeviceProperties() -- Print the number of CUDA
 ** devices and, for each device whose properties can be
 ** queried, a summary of those properties. For a device
 ** whose query fails, the CUDA error string is printed
 ** instead.
 **-----------------------------------------------------
 */
void PrintDeviceProperties() {
  cudaDeviceProp deviceProp;
  int nDevCount = 0;

  cudaGetDeviceCount(&nDevCount);
  printf("Total Device found: %d", nDevCount);
  for (int nDeviceIdx = 0; nDeviceIdx < nDevCount; ++nDeviceIdx) {
    memset(&deviceProp, 0, sizeof(deviceProp));
    if (cudaSuccess == cudaGetDeviceProperties(&deviceProp, nDeviceIdx)) {
      printf("\nDevice Name \t\t - %s ", deviceProp.name);
      printf("\n**************************************");
      // The memory sizes are size_t, hence %zu.
      printf("\nTotal Global Memory\t\t\t - %zu KB",
             deviceProp.totalGlobalMem / 1024);
      printf("\nShared memory available per block \t - %zu KB",
             deviceProp.sharedMemPerBlock / 1024);
      printf("\nNumber of registers per thread block \t - %d",
             deviceProp.regsPerBlock);
      printf("\nWarp size in threads \t\t\t - %d", deviceProp.warpSize);
      printf("\nMemory Pitch \t\t\t\t - %zu bytes", deviceProp.memPitch);
      printf("\nMaximum threads per block \t\t - %d",
             deviceProp.maxThreadsPerBlock);
      printf("\nMaximum Thread Dimension (block) \t - %d %d %d",
             deviceProp.maxThreadsDim[0], deviceProp.maxThreadsDim[1],
             deviceProp.maxThreadsDim[2]);
      printf("\nMaximum Thread Dimension (grid) \t - %d %d %d",
             deviceProp.maxGridSize[0], deviceProp.maxGridSize[1],
             deviceProp.maxGridSize[2]);
      printf("\nTotal constant memory \t\t\t - %zu bytes",
             deviceProp.totalConstMem);
      printf("\nCUDA ver \t\t\t\t - %d.%d", deviceProp.major, deviceProp.minor);
      printf("\nClock rate \t\t\t\t - %d KHz", deviceProp.clockRate);
      printf("\nTexture Alignment \t\t\t - %zu bytes",
             deviceProp.textureAlignment);
      printf("\nDevice Overlap \t\t\t\t - %s",
             deviceProp.deviceOverlap ? "Allowed" : "Not Allowed");
      printf("\nNumber of Multi processors \t\t - %d\n\n",
             deviceProp.multiProcessorCount);
    } else
      printf("\n%s", cudaGetErrorString(cudaGetLastError()));
  }
}
|
||||
|
||||
/*------------------------------------------------------
|
||||
** InitProblemOnce -- Initialize all of matrices and
|
||||
** vectors by opening a data file specified by the user.
|
||||
**
|
||||
** We used dynamic array *a, *b, and *m to allocate
|
||||
** the memory storages.
|
||||
**------------------------------------------------------
|
||||
*/
|
||||
void InitProblemOnce(char *filename) {
|
||||
// char *filename = argv[1];
|
||||
|
||||
// printf("Enter the data file name: ");
|
||||
// scanf("%s", filename);
|
||||
printf("The file name is: %s\n", filename);
|
||||
|
||||
fp = fopen(filename, "r");
|
||||
|
||||
fscanf(fp, "%d", &Size);
|
||||
|
||||
a = (float *)malloc(Size * Size * sizeof(float));
|
||||
|
||||
InitMat(a, Size, Size);
|
||||
printf("The input matrix a is:\n");
|
||||
PrintMat(a, Size, Size);
|
||||
b = (float *)malloc(Size * sizeof(float));
|
||||
|
||||
InitAry(b, Size);
|
||||
printf("The input array b is:\n");
|
||||
PrintAry(b, Size);
|
||||
|
||||
m = (float *)malloc(Size * Size * sizeof(float));
|
||||
}
|
||||
|
||||
/*------------------------------------------------------
|
||||
** InitPerRun() -- Initialize the contents of the
|
||||
** multipier matrix **m
|
||||
**------------------------------------------------------
|
||||
*/
|
||||
void InitPerRun() {
|
||||
int i;
|
||||
for (i = 0; i < Size * Size; i++)
|
||||
*(m + i) = 0.0;
|
||||
}
|
||||
|
||||
/*-------------------------------------------------------
 ** Fan1() -- Compute column t of the multiplier matrix.
 **
 ** For elimination step t, the thread with global index
 ** i (0 <= i < Size-1-t) handles row i+t+1 and stores
 **   m[row][t] = a[row][t] / a[t][t]
 ** Threads with a larger index return without doing
 ** anything. Both matrices are row-major with row
 ** length Size.
 **-------------------------------------------------------
 */
__global__ void Fan1(float *m_cuda, float *a_cuda, int Size, int t) {
  const unsigned int tid = threadIdx.x + blockIdx.x * blockDim.x;

  // Nothing to do for threads beyond the last row below the pivot.
  if (tid >= Size - 1 - t)
    return;

  const unsigned int row = tid + t + 1;
  m_cuda[Size * row + t] = a_cuda[Size * row + t] / a_cuda[Size * t + t];
}
|
||||
|
||||
/*-------------------------------------------------------
 ** Fan2() -- Apply elimination step t to a and b.
 **
 ** The thread at global position (x, y), with
 ** x < Size-1-t and y < Size-t, updates the element at
 ** row = x+1+t, col = y+t:
 **   a[row][col] -= m[row][t] * a[t][col]
 ** Threads with y == 0 additionally update the
 ** right-hand side:
 **   b[row] -= m[row][t] * b[t]
 ** The parameter j1 is not used.
 **-------------------------------------------------------
 */
__global__ void Fan2(float *m_cuda, float *a_cuda, float *b_cuda, int Size,
                     int j1, int t) {
  const unsigned int gx = threadIdx.x + blockIdx.x * blockDim.x;
  const unsigned int gy = threadIdx.y + blockIdx.y * blockDim.y;

  // Discard threads that fall outside the active sub-matrix.
  if (gx >= Size - 1 - t)
    return;
  if (gy >= Size - t)
    return;

  const int xidx = gx;
  const int yidx = gy;
  const int row = xidx + 1 + t;
  const int col = yidx + t;

  a_cuda[Size * row + col] -= m_cuda[Size * row + t] * a_cuda[Size * t + col];

  // One thread per row (the one on the pivot column) updates b as well.
  if (yidx == 0) {
    b_cuda[row] -= m_cuda[Size * row + col] * b_cuda[t];
  }
}
|
||||
|
||||
/*------------------------------------------------------
 ** ForwardSub() -- Forward elimination on the GPU.
 **
 ** Copies m, a and b to the device, runs Fan1 and Fan2
 ** once per elimination step t = 0 .. Size-2, then
 ** copies the three buffers back and frees the device
 ** memory. The wall-clock time of the kernel loop is
 ** stored in totalKernelTime (microseconds). Any CUDA
 ** error terminates the program via checkCUDAError().
 **------------------------------------------------------
 */
void ForwardSub() {
  int t;
  float *m_cuda, *a_cuda, *b_cuda;
  // Byte counts are computed in size_t so large Size values cannot
  // overflow int arithmetic.
  const size_t mat_bytes = (size_t)Size * Size * sizeof(float);
  const size_t vec_bytes = (size_t)Size * sizeof(float);

  // allocate memory on GPU
  cudaMalloc((void **)&m_cuda, mat_bytes);
  checkCUDAError("cudaMalloc m");
  cudaMalloc((void **)&a_cuda, mat_bytes);
  checkCUDAError("cudaMalloc a");
  cudaMalloc((void **)&b_cuda, vec_bytes);
  checkCUDAError("cudaMalloc b");

  // copy memory to GPU
  cudaMemcpy(m_cuda, m, mat_bytes, cudaMemcpyHostToDevice);
  cudaMemcpy(a_cuda, a, mat_bytes, cudaMemcpyHostToDevice);
  cudaMemcpy(b_cuda, b, vec_bytes, cudaMemcpyHostToDevice);
  checkCUDAError("cudaMemcpy host to device");

  int block_size, grid_size;

  block_size = MAXBLOCKSIZE;
  grid_size = (Size + block_size - 1) / block_size; // ceil(Size / block_size)
  printf("1d grid size: %d\n", grid_size);

  dim3 dimBlock(block_size);
  dim3 dimGrid(grid_size);

  int blockSize2d, gridSize2d;
  blockSize2d = BLOCK_SIZE_XY;
  gridSize2d = (Size + blockSize2d - 1) / blockSize2d; // ceil division

  dim3 dimBlockXY(blockSize2d, blockSize2d);

  printf("BlockXY: %d \n", blockSize2d);
  dim3 dimGridXY(gridSize2d, gridSize2d);

#ifdef TIMING
  gettimeofday(&tv_kernel_start, NULL);
#endif
  printf("first grid size: %d second: %d\n", grid_size, gridSize2d);
  // begin timing kernels
  struct timeval time_start;
  gettimeofday(&time_start, NULL);
  for (t = 0; t < (Size - 1); t++) {
    Fan1<<<dimGrid, dimBlock>>>(m_cuda, a_cuda, Size, t);
    cudaDeviceSynchronize();
    checkCUDAError("Fan1");
    Fan2<<<dimGridXY, dimBlockXY>>>(m_cuda, a_cuda, b_cuda, Size, Size - t, t);
    cudaDeviceSynchronize();
    checkCUDAError("Fan2");
  }
  // end timing kernels
  struct timeval time_end;
  gettimeofday(&time_end, NULL);
  totalKernelTime = (time_end.tv_sec * 1000000 + time_end.tv_usec) -
                    (time_start.tv_sec * 1000000 + time_start.tv_usec);

#ifdef TIMING
  tvsub(&time_end, &tv_kernel_start, &tv);
  kernel_time += tv.tv_sec * 1000.0 + (float)tv.tv_usec / 1000.0;
#endif

  // copy memory back to CPU
  cudaMemcpy(m, m_cuda, mat_bytes, cudaMemcpyDeviceToHost);
  cudaMemcpy(a, a_cuda, mat_bytes, cudaMemcpyDeviceToHost);
  cudaMemcpy(b, b_cuda, vec_bytes, cudaMemcpyDeviceToHost);
  checkCUDAError("cudaMemcpy device to host");
  cudaFree(m_cuda);
  cudaFree(a_cuda);
  cudaFree(b_cuda);
}
|
||||
|
||||
/*------------------------------------------------------
|
||||
** BackSub() -- Backward substitution
|
||||
**------------------------------------------------------
|
||||
*/
|
||||
|
||||
void BackSub() {
|
||||
// create a new vector to hold the final answer
|
||||
finalVec = (float *)malloc(Size * sizeof(float));
|
||||
// solve "bottom up"
|
||||
int i, j;
|
||||
for (i = 0; i < Size; i++) {
|
||||
finalVec[Size - i - 1] = b[Size - i - 1];
|
||||
for (j = 0; j < i; j++) {
|
||||
finalVec[Size - i - 1] -= *(a + Size * (Size - i - 1) + (Size - j - 1)) *
|
||||
finalVec[Size - j - 1];
|
||||
}
|
||||
finalVec[Size - i - 1] =
|
||||
finalVec[Size - i - 1] / *(a + Size * (Size - i - 1) + (Size - i - 1));
|
||||
}
|
||||
}
|
||||
|
||||
void InitMat(float *ary, int nrow, int ncol) {
|
||||
int i, j;
|
||||
|
||||
for (i = 0; i < nrow; i++) {
|
||||
for (j = 0; j < ncol; j++) {
|
||||
fscanf(fp, "%f", ary + Size * i + j);
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
/*------------------------------------------------------
|
||||
** PrintMat() -- Print the contents of the matrix
|
||||
**------------------------------------------------------
|
||||
*/
|
||||
void PrintMat(float *ary, int nrow, int ncol) {
|
||||
return;
|
||||
int i, j;
|
||||
|
||||
for (i = 0; i < nrow; i++) {
|
||||
for (j = 0; j < ncol; j++) {
|
||||
printf("%8.2f ", *(ary + Size * i + j));
|
||||
}
|
||||
printf("\n");
|
||||
}
|
||||
printf("\n");
|
||||
}
|
||||
|
||||
/*------------------------------------------------------
|
||||
** InitAry() -- Initialize the array (vector) by reading
|
||||
** data from the data file
|
||||
**------------------------------------------------------
|
||||
*/
|
||||
void InitAry(float *ary, int ary_size) {
|
||||
int i;
|
||||
|
||||
for (i = 0; i < ary_size; i++) {
|
||||
fscanf(fp, "%f", &ary[i]);
|
||||
}
|
||||
}
|
||||
|
||||
/*------------------------------------------------------
 ** PrintAry() -- Print the ary_size elements of the
 ** vector on one line, two decimals each, followed by
 ** an empty line.
 **------------------------------------------------------
 */
void PrintAry(float *ary, int ary_size) {
  int k = 0;

  while (k < ary_size) {
    printf("%.2f ", ary[k]);
    k++;
  }
  printf("\n\n");
}
|
||||
/*------------------------------------------------------
 ** checkCUDAError() -- Fetch the last CUDA error; if
 ** there is one, print it to stderr prefixed with msg
 ** and terminate the program.
 **------------------------------------------------------
 */
void checkCUDAError(const char *msg) {
  const cudaError_t status = cudaGetLastError();

  if (status == cudaSuccess)
    return;

  fprintf(stderr, "Cuda error: %s: %s.\n", msg, cudaGetErrorString(status));
  exit(EXIT_FAILURE);
}
|
|
@ -1,23 +0,0 @@
|
|||
#!/bin/bash
# Build the translated gaussian benchmark, run it on matrix4.txt and check
# that the expected solution vector appears in the output.
set -e

# Assemble the textual LLVM IR into bitcode.
llvm-as gaussian-cuda-nvptx64-nvidia-cuda-sm_61.ll
llvm-as gaussian-host-x86_64-unknown-linux-gnu.ll

# Run the kernel and host translators on the bitcode.
../../build/compilation/kernelTranslator gaussian-cuda-nvptx64-nvidia-cuda-sm_61.bc kernel.bc
../../build/compilation/hostTranslator gaussian-host-x86_64-unknown-linux-gnu.bc host.bc

# Compile both modules to position-independent object files.
llc --relocation-model=pic --filetype=obj kernel.bc
llc --relocation-model=pic --filetype=obj host.bc

# Link against the runtime and thread-pool libraries.
g++ -Wall -L../../build/runtime \
  -L../../build/runtime/threadPool \
  -o gaussian -fPIC -no-pie host.o kernel.o -lc -lx86Runtime -lthreadPool -lpthread

export LD_LIBRARY_PATH="../../build/runtime:../../build/runtime/threadPool:$LD_LIBRARY_PATH"

# Overwrite the log so that output left over from an earlier run cannot
# satisfy the check below.
./gaussian -f ../../rodinia-data/gaussian/matrix4.txt > res.log

# -F: match the expected solution as a literal string, not as a regex.
if grep -qF -- "0.70 0.00 -0.40 -0.50" res.log; then
  echo "Pass"
else
  echo "Error result"
  exit 1
fi
|
File diff suppressed because it is too large
Load Diff
|
@ -1,317 +0,0 @@
|
|||
#ifdef __cplusplus
|
||||
extern "C" {
|
||||
#endif
|
||||
|
||||
/*
|
||||
* avilib.h
|
||||
*
|
||||
* Copyright (C) Thomas Östreich - June 2001
|
||||
* multiple audio track support Copyright (C) 2002 Thomas Östreich
|
||||
*
|
||||
* Original code:
|
||||
* Copyright (C) 1999 Rainer Johanni <Rainer@Johanni.de>
|
||||
*
|
||||
* This file is part of transcode, a linux video stream processing tool
|
||||
*
|
||||
* transcode is free software; you can redistribute it and/or modify
|
||||
* it under the terms of the GNU General Public License as published by
|
||||
* the Free Software Foundation; either version 2, or (at your option)
|
||||
* any later version.
|
||||
*
|
||||
* transcode is distributed in the hope that it will be useful,
|
||||
* but WITHOUT ANY WARRANTY; without even the implied warranty of
|
||||
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
|
||||
* GNU General Public License for more details.
|
||||
*
|
||||
* You should have received a copy of the GNU General Public License
|
||||
* along with GNU Make; see the file COPYING. If not, write to
|
||||
* the Free Software Foundation, 675 Mass Ave, Cambridge, MA 02139, USA.
|
||||
*
|
||||
*/
|
||||
|
||||
#include <fcntl.h>
|
||||
#include <stdio.h>
|
||||
#include <sys/stat.h>
|
||||
#include <sys/types.h>
|
||||
#include <unistd.h>
|
||||
// #include <windows.h>
|
||||
#include <errno.h>
|
||||
#include <inttypes.h>
|
||||
#include <limits.h>
|
||||
#include <stdlib.h>
|
||||
#include <string.h>
|
||||
|
||||
#ifndef AVILIB_H
|
||||
#define AVILIB_H
|
||||
|
||||
#define AVI_MAX_TRACKS 8
|
||||
|
||||
/*
 * One entry of the video index.
 * NOTE(review): the field meanings are not documented here; presumably
 * key is a keyframe flag, pos a position in the file and len a chunk
 * length -- confirm against the avilib implementation.
 */
typedef struct {
  unsigned long key;
  unsigned long pos;
  unsigned long len;
} video_index_entry;
|
||||
|
||||
/*
 * One entry of an audio track's index.
 * NOTE(review): the field meanings are not documented here; presumably
 * pos is a position in the file, len a chunk length and tot a running
 * byte total -- confirm against the avilib implementation.
 */
typedef struct {
  unsigned long pos;
  unsigned long len;
  unsigned long tot;
} audio_index_entry;
|
||||
|
||||
typedef struct track_s {
|
||||
|
||||
long a_fmt; /* Audio format, see #defines below */
|
||||
long a_chans; /* Audio channels, 0 for no audio */
|
||||
long a_rate; /* Rate in Hz */
|
||||
long a_bits; /* bits per audio sample */
|
||||
long mp3rate; /* mp3 bitrate kbs*/
|
||||
|
||||
long audio_strn; /* Audio stream number */
|
||||
long audio_bytes; /* Total number of bytes of audio data */
|
||||
long audio_chunks; /* Chunks of audio data in the file */
|
||||
|
||||
char audio_tag[4]; /* Tag of audio data */
|
||||
long audio_posc; /* Audio position: chunk */
|
||||
long audio_posb; /* Audio position: byte within chunk */
|
||||
|
||||
long a_codech_off; /* absolut offset of audio codec information */
|
||||
long a_codecf_off; /* absolut offset of audio codec information */
|
||||
|
||||
audio_index_entry *audio_index;
|
||||
|
||||
} track_t;
|
||||
|
||||
typedef struct {
|
||||
|
||||
long fdes; /* File descriptor of AVI file */
|
||||
long mode; /* 0 for reading, 1 for writing */
|
||||
|
||||
long width; /* Width of a video frame */
|
||||
long height; /* Height of a video frame */
|
||||
double fps; /* Frames per second */
|
||||
char compressor[8]; /* Type of compressor, 4 bytes + padding for 0 byte */
|
||||
char compressor2[8]; /* Type of compressor, 4 bytes + padding for 0 byte */
|
||||
long video_strn; /* Video stream number */
|
||||
long video_frames; /* Number of video frames */
|
||||
char video_tag[4]; /* Tag of video data */
|
||||
long video_pos; /* Number of next frame to be read
|
||||
(if index present) */
|
||||
|
||||
unsigned long max_len; /* maximum video chunk present */
|
||||
|
||||
track_t track[AVI_MAX_TRACKS]; // up to AVI_MAX_TRACKS audio tracks supported
|
||||
|
||||
unsigned long pos; /* position in file */
|
||||
long n_idx; /* number of index entries actually filled */
|
||||
long max_idx; /* number of index entries actually allocated */
|
||||
|
||||
long v_codech_off; /* absolut offset of video codec (strh) info */
|
||||
long v_codecf_off; /* absolut offset of video codec (strf) info */
|
||||
|
||||
unsigned char (*idx)[16]; /* index entries (AVI idx1 tag) */
|
||||
video_index_entry *video_index;
|
||||
|
||||
unsigned long last_pos; /* Position of last frame written */
|
||||
unsigned long last_len; /* Length of last frame written */
|
||||
int must_use_index; /* Flag if frames are duplicated */
|
||||
unsigned long movi_start;
|
||||
|
||||
int anum; // total number of audio tracks
|
||||
int aptr; // current audio working track
|
||||
|
||||
} avi_t;
|
||||
|
||||
/* File open modes */
#define AVI_MODE_WRITE 0
#define AVI_MODE_READ 1

/* The error codes delivered by avi_open_input_file */

/* The write of the data would exceed the maximum size of the AVI file.
   This is more a warning than an error since the file may be closed
   safely. */
#define AVI_ERR_SIZELIM 1

/* Error opening the AVI file - wrong path name or file not
   readable/writable. */
#define AVI_ERR_OPEN 2

/* Error reading from AVI file. */
#define AVI_ERR_READ 3

/* Error writing to AVI file, disk full ??? */
#define AVI_ERR_WRITE 4

/* Could not write index to AVI file during close, file may still be
   usable. */
#define AVI_ERR_WRITE_INDEX 5

/* Could not write header to AVI file or not truncate the file during
   close, file is most probably corrupted. */
#define AVI_ERR_CLOSE 6

/* Operation not permitted: trying to read from a file open for writing
   or vice versa. */
#define AVI_ERR_NOT_PERM 7

/* malloc failed. */
#define AVI_ERR_NO_MEM 8

/* Not an AVI file. */
#define AVI_ERR_NO_AVI 9

/* AVI file has no header list, corrupted ??? */
#define AVI_ERR_NO_HDRL 10

/* AVI file has no MOVI list, corrupted ??? */
#define AVI_ERR_NO_MOVI 11

/* AVI file contains no video data. */
#define AVI_ERR_NO_VIDS 12

/* The file has been opened with getIndex==0, but an operation has been
   performed that needs an index. */
#define AVI_ERR_NO_IDX 13
|
||||
|
||||
/* Possible audio formats. The whole table is skipped when
   WAVE_FORMAT_PCM is already defined. */

#ifndef WAVE_FORMAT_PCM
#define WAVE_FORMAT_UNKNOWN        (0x0000)
#define WAVE_FORMAT_PCM            (0x0001)
#define WAVE_FORMAT_ADPCM          (0x0002)
#define WAVE_FORMAT_IBM_CVSD       (0x0005)
#define WAVE_FORMAT_ALAW           (0x0006)
#define WAVE_FORMAT_MULAW          (0x0007)
#define WAVE_FORMAT_OKI_ADPCM      (0x0010)
#define WAVE_FORMAT_DVI_ADPCM      (0x0011)
#define WAVE_FORMAT_DIGISTD        (0x0015)
#define WAVE_FORMAT_DIGIFIX        (0x0016)
#define WAVE_FORMAT_YAMAHA_ADPCM   (0x0020)
#define WAVE_FORMAT_DSP_TRUESPEECH (0x0022)
#define WAVE_FORMAT_GSM610         (0x0031)
#define IBM_FORMAT_MULAW           (0x0101)
#define IBM_FORMAT_ALAW            (0x0102)
#define IBM_FORMAT_ADPCM           (0x0103)
#endif
|
||||
|
||||
avi_t *AVI_open_output_file(char *filename);
|
||||
void AVI_set_video(avi_t *AVI, int width, int height, double fps,
|
||||
char *compressor);
|
||||
void AVI_set_audio(avi_t *AVI, int channels, long rate, int bits, int format,
|
||||
long mp3rate);
|
||||
int AVI_write_frame(avi_t *AVI, char *data, long bytes, int keyframe);
|
||||
int AVI_dup_frame(avi_t *AVI);
|
||||
int AVI_write_audio(avi_t *AVI, char *data, long bytes);
|
||||
int AVI_append_audio(avi_t *AVI, char *data, long bytes);
|
||||
long AVI_bytes_remain(avi_t *AVI);
|
||||
int AVI_close(avi_t *AVI);
|
||||
long AVI_bytes_written(avi_t *AVI);
|
||||
|
||||
avi_t *AVI_open_input_file(char *filename, int getIndex);
|
||||
avi_t *AVI_open_fd(int fd, int getIndex);
|
||||
int avi_parse_input_file(avi_t *AVI, int getIndex);
|
||||
long AVI_audio_mp3rate(avi_t *AVI);
|
||||
long AVI_video_frames(avi_t *AVI);
|
||||
int AVI_video_width(avi_t *AVI);
|
||||
int AVI_video_height(avi_t *AVI);
|
||||
double AVI_frame_rate(avi_t *AVI);
|
||||
char *AVI_video_compressor(avi_t *AVI);
|
||||
|
||||
int AVI_audio_channels(avi_t *AVI);
|
||||
int AVI_audio_bits(avi_t *AVI);
|
||||
int AVI_audio_format(avi_t *AVI);
|
||||
long AVI_audio_rate(avi_t *AVI);
|
||||
long AVI_audio_bytes(avi_t *AVI);
|
||||
long AVI_audio_chunks(avi_t *AVI);
|
||||
|
||||
long AVI_max_video_chunk(avi_t *AVI);
|
||||
|
||||
long AVI_frame_size(avi_t *AVI, long frame);
|
||||
long AVI_audio_size(avi_t *AVI, long frame);
|
||||
int AVI_seek_start(avi_t *AVI);
|
||||
int AVI_set_video_position(avi_t *AVI, long frame);
|
||||
long AVI_get_video_position(avi_t *AVI, long frame);
|
||||
long AVI_read_frame(avi_t *AVI, char *vidbuf, int *keyframe);
|
||||
|
||||
int AVI_set_audio_position(avi_t *AVI, long byte);
|
||||
int AVI_set_audio_bitrate(avi_t *AVI, long bitrate);
|
||||
|
||||
long AVI_read_audio(avi_t *AVI, char *audbuf, long bytes);
|
||||
|
||||
long AVI_audio_codech_offset(avi_t *AVI);
|
||||
long AVI_audio_codecf_offset(avi_t *AVI);
|
||||
long AVI_video_codech_offset(avi_t *AVI);
|
||||
long AVI_video_codecf_offset(avi_t *AVI);
|
||||
|
||||
int AVI_read_data(avi_t *AVI, char *vidbuf, long max_vidbuf, char *audbuf,
|
||||
long max_audbuf, long *len);
|
||||
|
||||
void AVI_print_error(char *str);
|
||||
char *AVI_strerror();
|
||||
char *AVI_syserror();
|
||||
|
||||
int AVI_scan(char *name);
|
||||
int AVI_dump(char *name, int mode);
|
||||
|
||||
char *AVI_codec2str(short cc);
|
||||
int AVI_file_check(char *import_file);
|
||||
|
||||
void AVI_info(avi_t *avifile);
|
||||
uint64_t AVI_max_size();
|
||||
int avi_update_header(avi_t *AVI);
|
||||
|
||||
int AVI_set_audio_track(avi_t *AVI, int track);
|
||||
int AVI_get_audio_track(avi_t *AVI);
|
||||
int AVI_audio_tracks(avi_t *AVI);
|
||||
|
||||
/*
 * RIFF container header: "RIFF" tag, length, "WAVE" tag.
 * NOTE(review): unsigned long is 8 bytes on LP64 targets -- confirm this
 * struct is never read or written as a raw on-disk header.
 */
struct riff_struct {
  unsigned char id[4]; /* RIFF */
  unsigned long len;
  unsigned char wave_id[4]; /* WAVE */
};
|
||||
|
||||
/* Generic chunk header: a four-byte id followed by a length. */
struct chunk_struct {
  unsigned char id[4];
  unsigned long len;
};
|
||||
|
||||
/* Audio format description shared by the wave formats (see wave_header). */
struct common_struct {
  unsigned short wFormatTag;
  unsigned short wChannels;
  unsigned long dwSamplesPerSec;
  unsigned long dwAvgBytesPerSec;
  unsigned short wBlockAlign;
  unsigned short wBitsPerSample; /* Only for PCM */
};
|
||||
|
||||
struct wave_header {
|
||||
struct riff_struct riff;
|
||||
struct chunk_struct format;
|
||||
struct common_struct common;
|
||||
struct chunk_struct data;
|
||||
};
|
||||
|
||||
/*
 * Stream header record made of twelve long fields.
 * NOTE(review): the field names follow the AVI stream header (strh)
 * layout, but long is 8 bytes on LP64 targets -- confirm before using
 * this struct for raw file I/O.
 */
struct AVIStreamHeader {
  long fccType;
  long fccHandler;
  long dwFlags;
  long dwPriority;
  long dwInitialFrames;
  long dwScale;
  long dwRate;
  long dwStart;
  long dwLength;
  long dwSuggestedBufferSize;
  long dwQuality;
  long dwSampleSize;
};
|
||||
|
||||
#endif
|
||||
|
||||
#ifdef __cplusplus
|
||||
}
|
||||
#endif
|
|
@ -1,130 +0,0 @@
|
|||
// #ifdef __cplusplus
|
||||
// extern "C" {
|
||||
// #endif
|
||||
|
||||
//===============================================================================================================================================================================================================
|
||||
// DEFINE / INCLUDE
|
||||
//===============================================================================================================================================================================================================
|
||||
#include "avimod.h"
|
||||
|
||||
//===============================================================================================================================================================================================================
|
||||
// FUNCTIONS
|
||||
//===============================================================================================================================================================================================================
|
||||
|
||||
// Flips the specified image and crops it to the specified dimensions
|
||||
// If scaled == true, all values are scaled to the range [0.0, 1.0
|
||||
fp *chop_flip_image(char *image, int height, int width, int cropped, int scaled,
|
||||
int converted) {
|
||||
|
||||
// fixed dimensions for cropping or not cropping, square vertices starting
|
||||
// from initial point in top left corner going down and right
|
||||
int top;
|
||||
int bottom;
|
||||
int left;
|
||||
int right;
|
||||
if (cropped == 1) {
|
||||
top = 0;
|
||||
bottom = 0;
|
||||
left = 0;
|
||||
right = 0;
|
||||
} else {
|
||||
top = 0;
|
||||
bottom = height - 1;
|
||||
left = 0;
|
||||
right = width - 1;
|
||||
}
|
||||
|
||||
// dimensions of new cropped image
|
||||
int height_new = bottom - top + 1;
|
||||
int width_new = right - left + 1;
|
||||
|
||||
// counters
|
||||
int i, j;
|
||||
|
||||
// allocate memory for cropped/flipped frame
|
||||
fp *result = (fp *)malloc(height_new * width_new * sizeof(fp));
|
||||
|
||||
// crop/flip and scale frame
|
||||
fp temp;
|
||||
if (scaled) {
|
||||
fp scale = 1.0 / 255.0;
|
||||
for (i = 0; i < height_new; i++) { // rows
|
||||
for (j = 0; j < width_new; j++) { // colums
|
||||
temp =
|
||||
(fp)image[((height - 1 - (i + top)) * width) + (j + left)] * scale;
|
||||
if (temp < 0) {
|
||||
result[i * width_new + j] = temp + 256;
|
||||
} else {
|
||||
result[i * width_new + j] = temp;
|
||||
}
|
||||
}
|
||||
}
|
||||
} else {
|
||||
for (i = 0; i < height_new; i++) { // rows
|
||||
for (j = 0; j < width_new; j++) { // colums
|
||||
temp = (fp)image[((height - 1 - (i + top)) * width) + (j + left)];
|
||||
if (temp < 0) {
|
||||
result[i * width_new + j] = temp + 256;
|
||||
} else {
|
||||
result[i * width_new + j] = temp;
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
// convert storage method (from row-major to column-major)
|
||||
fp *result_converted = (fp *)malloc(height_new * width_new * sizeof(fp));
|
||||
if (converted == 1) {
|
||||
for (i = 0; i < width_new; i++) { // rows
|
||||
for (j = 0; j < height_new; j++) { // colums
|
||||
result_converted[i * height_new + j] = result[j * width_new + i];
|
||||
}
|
||||
}
|
||||
} else {
|
||||
result_converted = result;
|
||||
}
|
||||
free(result);
|
||||
|
||||
// return
|
||||
return result_converted;
|
||||
}
|
||||
|
||||
// Returns the specified frame from the specified video file
|
||||
// If cropped == true, the frame is cropped to pre-determined dimensions
|
||||
// (hardcoded to the boundaries of the blood vessel in the test video)
|
||||
// If scaled == true, all values are scaled to the range [0.0, 1.0]
|
||||
fp *get_frame(avi_t *cell_file, int frame_num, int cropped, int scaled,
|
||||
int converted) {
|
||||
|
||||
// variable
|
||||
int dummy;
|
||||
int width = AVI_video_width(cell_file);
|
||||
int height = AVI_video_height(cell_file);
|
||||
int status;
|
||||
|
||||
// There are 600 frames in this file (i.e. frame_num = 600 causes an error)
|
||||
AVI_set_video_position(cell_file, frame_num);
|
||||
|
||||
// Read in the frame from the AVI
|
||||
char *image_buf = (char *)malloc(width * height * sizeof(char));
|
||||
status = AVI_read_frame(cell_file, image_buf, &dummy);
|
||||
if (status == -1) {
|
||||
AVI_print_error((char *)"Error with AVI_read_frame");
|
||||
exit(-1);
|
||||
}
|
||||
|
||||
// The image is read in upside-down, so we need to flip it
|
||||
fp *image_chopped;
|
||||
image_chopped =
|
||||
chop_flip_image(image_buf, height, width, cropped, scaled, converted);
|
||||
|
||||
// free image buffer
|
||||
free(image_buf);
|
||||
|
||||
// return
|
||||
return image_chopped;
|
||||
}
|
||||
|
||||
// #ifdef __cplusplus
|
||||
// }
|
||||
// #endif
|
|
@ -1,24 +0,0 @@
|
|||
// Frame-extraction helpers built on top of avilib.
#ifndef AVIMOD_H
#define AVIMOD_H

#ifdef __cplusplus
extern "C" {
#endif

//===============================================================================================================================================================================================================
// DEFINE / INCLUDE
//===============================================================================================================================================================================================================

// Floating-point type used for all frame buffers returned by this module.
#define fp float

#include "avilib.h"

//===============================================================================================================================================================================================================
// FUNCTION PROTOTYPES
//===============================================================================================================================================================================================================

// Flips the raw `image` (height x width bytes) vertically and converts it to
// fp values. If `scaled` is non-zero each value is multiplied by 1/255; if
// `converted` is 1 the result is rewritten from row-major to column-major
// order. `cropped` selects cropping to pre-determined dimensions.
fp *chop_flip_image(char *image, int height, int width, int cropped, int scaled,
                    int converted);

// Reads frame `frame_num` from `cell_file` and returns it processed by
// chop_flip_image() with the given `cropped` / `scaled` / `converted` options.
// Exits the process if the frame cannot be read.
fp *get_frame(avi_t *cell_file, int frame_num, int cropped, int scaled,
              int converted);

#ifdef __cplusplus
}
#endif

#endif // AVIMOD_H
|
|
@ -1,396 +0,0 @@
|
|||
//===============================================================================================================================================================================================================
|
||||
//===============================================================================================================================================================================================================
|
||||
// DEFINE / INCLUDE
|
||||
//===============================================================================================================================================================================================================
|
||||
//===============================================================================================================================================================================================================
|
||||
|
||||
// Floating-point type used for frame data and all intermediate buffers.
#define fp float

// NUMBER_THREADS is the work-group size that main() reports at start-up.
// It can be overridden at build time; the most specific RD_WG_SIZE_* macro
// that is defined wins, otherwise the default of 256 is used.
#if defined(RD_WG_SIZE_0_0)
#define NUMBER_THREADS RD_WG_SIZE_0_0
#elif defined(RD_WG_SIZE_0)
#define NUMBER_THREADS RD_WG_SIZE_0
#elif defined(RD_WG_SIZE)
#define NUMBER_THREADS RD_WG_SIZE
#else
#define NUMBER_THREADS 256
#endif

// Number of tracked points on each contour.
#define ENDO_POINTS 20
#define EPI_POINTS 31

// Total number of tracked points; derived from the two counts above so the
// three values cannot drift apart (evaluates to 51, as before).
#define ALL_POINTS (ENDO_POINTS + EPI_POINTS)
|
||||
|
||||
//===============================================================================================================================================================================================================
|
||||
//===============================================================================================================================================================================================================
|
||||
// PARAMS_COMMON_CHANGE STRUCT
|
||||
//===============================================================================================================================================================================================================
|
||||
//===============================================================================================================================================================================================================
|
||||
|
||||
typedef struct params_common_change {
|
||||
|
||||
//======================================================================================================================================================
|
||||
// FRAME
|
||||
//======================================================================================================================================================
|
||||
|
||||
fp *d_frame;
|
||||
int frame_no;
|
||||
|
||||
} params_common_change;
|
||||
|
||||
//===============================================================================================================================================================================================================
|
||||
//===============================================================================================================================================================================================================
|
||||
// PARAMS_COMMON STRUCTURE
|
||||
//===============================================================================================================================================================================================================
|
||||
//===============================================================================================================================================================================================================
|
||||
|
||||
typedef struct params_common {
|
||||
|
||||
//======================================================================================================================================================
|
||||
// HARDCODED INPUTS FROM MATLAB
|
||||
//======================================================================================================================================================
|
||||
|
||||
//====================================================================================================
|
||||
// CONSTANTS
|
||||
//====================================================================================================
|
||||
|
||||
int sSize;
|
||||
int tSize;
|
||||
int maxMove;
|
||||
fp alpha;
|
||||
|
||||
//====================================================================================================
|
||||
// FRAME
|
||||
//====================================================================================================
|
||||
|
||||
int no_frames;
|
||||
int frame_rows;
|
||||
int frame_cols;
|
||||
int frame_elem;
|
||||
int frame_mem;
|
||||
|
||||
//====================================================================================================
|
||||
// ENDO POINTS
|
||||
//====================================================================================================
|
||||
|
||||
int endoPoints;
|
||||
int endo_mem;
|
||||
|
||||
int *endoRow;
|
||||
int *endoCol;
|
||||
int *tEndoRowLoc;
|
||||
int *tEndoColLoc;
|
||||
|
||||
int *d_endoRow;
|
||||
int *d_endoCol;
|
||||
int *d_tEndoRowLoc;
|
||||
int *d_tEndoColLoc;
|
||||
|
||||
fp *d_endoT;
|
||||
|
||||
//====================================================================================================
|
||||
// EPI POINTS
|
||||
//====================================================================================================
|
||||
int epiPoints;
|
||||
int epi_mem;
|
||||
|
||||
int *epiRow;
|
||||
int *epiCol;
|
||||
int *tEpiRowLoc;
|
||||
int *tEpiColLoc;
|
||||
|
||||
int *d_epiRow;
|
||||
int *d_epiCol;
|
||||
int *d_tEpiRowLoc;
|
||||
int *d_tEpiColLoc;
|
||||
|
||||
fp *d_epiT;
|
||||
|
||||
//====================================================================================================
|
||||
// ALL POINTS
|
||||
//====================================================================================================
|
||||
|
||||
int allPoints;
|
||||
|
||||
//======================================================================================================================================================
|
||||
// RIGHT TEMPLATE FROM TEMPLATE ARRAY
|
||||
//======================================================================================================================================================
|
||||
|
||||
int in_rows;
|
||||
int in_cols;
|
||||
int in_elem;
|
||||
int in_mem;
|
||||
|
||||
//======================================================================================================================================================
|
||||
// AREA AROUND POINT FROM FRAME
|
||||
//======================================================================================================================================================
|
||||
|
||||
int in2_rows;
|
||||
int in2_cols;
|
||||
int in2_elem;
|
||||
int in2_mem;
|
||||
|
||||
//======================================================================================================================================================
|
||||
// CONVOLUTION
|
||||
//======================================================================================================================================================
|
||||
|
||||
int conv_rows;
|
||||
int conv_cols;
|
||||
int conv_elem;
|
||||
int conv_mem;
|
||||
int ioffset;
|
||||
int joffset;
|
||||
|
||||
//======================================================================================================================================================
|
||||
// CUMULATIVE SUM 1
|
||||
//======================================================================================================================================================
|
||||
|
||||
//====================================================================================================
|
||||
// PAD ARRAY, VERTICAL CUMULATIVE SUM
|
||||
//====================================================================================================
|
||||
|
||||
int in2_pad_add_rows;
|
||||
int in2_pad_add_cols;
|
||||
int in2_pad_cumv_rows;
|
||||
int in2_pad_cumv_cols;
|
||||
int in2_pad_cumv_elem;
|
||||
int in2_pad_cumv_mem;
|
||||
|
||||
//====================================================================================================
|
||||
// SELECTION
|
||||
//====================================================================================================
|
||||
|
||||
int in2_pad_cumv_sel_rows;
|
||||
int in2_pad_cumv_sel_cols;
|
||||
int in2_pad_cumv_sel_elem;
|
||||
int in2_pad_cumv_sel_mem;
|
||||
int in2_pad_cumv_sel_rowlow;
|
||||
int in2_pad_cumv_sel_rowhig;
|
||||
int in2_pad_cumv_sel_collow;
|
||||
int in2_pad_cumv_sel_colhig;
|
||||
|
||||
//====================================================================================================
|
||||
// SELECTION 2, SUBTRACTION, HORIZONTAL CUMULATIVE SUM
|
||||
//====================================================================================================
|
||||
|
||||
int in2_pad_cumv_sel2_rowlow;
|
||||
int in2_pad_cumv_sel2_rowhig;
|
||||
int in2_pad_cumv_sel2_collow;
|
||||
int in2_pad_cumv_sel2_colhig;
|
||||
int in2_sub_cumh_rows;
|
||||
int in2_sub_cumh_cols;
|
||||
int in2_sub_cumh_elem;
|
||||
int in2_sub_cumh_mem;
|
||||
|
||||
//====================================================================================================
|
||||
// SELECTION
|
||||
//====================================================================================================
|
||||
|
||||
int in2_sub_cumh_sel_rows;
|
||||
int in2_sub_cumh_sel_cols;
|
||||
int in2_sub_cumh_sel_elem;
|
||||
int in2_sub_cumh_sel_mem;
|
||||
int in2_sub_cumh_sel_rowlow;
|
||||
int in2_sub_cumh_sel_rowhig;
|
||||
int in2_sub_cumh_sel_collow;
|
||||
int in2_sub_cumh_sel_colhig;
|
||||
|
||||
//====================================================================================================
|
||||
// SELECTION 2, SUBTRACTION
|
||||
//====================================================================================================
|
||||
|
||||
int in2_sub_cumh_sel2_rowlow;
|
||||
int in2_sub_cumh_sel2_rowhig;
|
||||
int in2_sub_cumh_sel2_collow;
|
||||
int in2_sub_cumh_sel2_colhig;
|
||||
int in2_sub2_rows;
|
||||
int in2_sub2_cols;
|
||||
int in2_sub2_elem;
|
||||
int in2_sub2_mem;
|
||||
|
||||
//======================================================================================================================================================
|
||||
// CUMULATIVE SUM 2
|
||||
//======================================================================================================================================================
|
||||
|
||||
//====================================================================================================
|
||||
// MULTIPLICATION
|
||||
//====================================================================================================
|
||||
|
||||
int in2_sqr_rows;
|
||||
int in2_sqr_cols;
|
||||
int in2_sqr_elem;
|
||||
int in2_sqr_mem;
|
||||
|
||||
//====================================================================================================
|
||||
// SELECTION 2, SUBTRACTION
|
||||
//====================================================================================================
|
||||
|
||||
int in2_sqr_sub2_rows;
|
||||
int in2_sqr_sub2_cols;
|
||||
int in2_sqr_sub2_elem;
|
||||
int in2_sqr_sub2_mem;
|
||||
|
||||
//======================================================================================================================================================
|
||||
// FINAL
|
||||
//======================================================================================================================================================
|
||||
|
||||
int in_sqr_rows;
|
||||
int in_sqr_cols;
|
||||
int in_sqr_elem;
|
||||
int in_sqr_mem;
|
||||
|
||||
//======================================================================================================================================================
|
||||
// TEMPLATE MASK CREATE
|
||||
//======================================================================================================================================================
|
||||
|
||||
int tMask_rows;
|
||||
int tMask_cols;
|
||||
int tMask_elem;
|
||||
int tMask_mem;
|
||||
|
||||
//======================================================================================================================================================
|
||||
// POINT MASK INITIALIZE
|
||||
//======================================================================================================================================================
|
||||
|
||||
int mask_rows;
|
||||
int mask_cols;
|
||||
int mask_elem;
|
||||
int mask_mem;
|
||||
|
||||
//======================================================================================================================================================
|
||||
// MASK CONVOLUTION
|
||||
//======================================================================================================================================================
|
||||
|
||||
int mask_conv_rows;
|
||||
int mask_conv_cols;
|
||||
int mask_conv_elem;
|
||||
int mask_conv_mem;
|
||||
int mask_conv_ioffset;
|
||||
int mask_conv_joffset;
|
||||
|
||||
} params_common;
|
||||
|
||||
//===============================================================================================================================================================================================================
|
||||
//===============================================================================================================================================================================================================
|
||||
// PARAMS_UNIQUE STRUCTURE
|
||||
//===============================================================================================================================================================================================================
|
||||
//===============================================================================================================================================================================================================
|
||||
|
||||
typedef struct params_unique {
|
||||
|
||||
//======================================================================================================================================================
|
||||
// POINT NUMBER
|
||||
//======================================================================================================================================================
|
||||
|
||||
int *d_Row;
|
||||
int *d_Col;
|
||||
int *d_tRowLoc;
|
||||
int *d_tColLoc;
|
||||
fp *d_T;
|
||||
|
||||
//======================================================================================================================================================
|
||||
// POINT NUMBER
|
||||
//======================================================================================================================================================
|
||||
|
||||
int point_no;
|
||||
|
||||
//======================================================================================================================================================
|
||||
// RIGHT TEMPLATE FROM TEMPLATE ARRAY
|
||||
//======================================================================================================================================================
|
||||
|
||||
int in_pointer;
|
||||
|
||||
//======================================================================================================================================================
|
||||
// AREA AROUND POINT FROM FRAME
|
||||
//======================================================================================================================================================
|
||||
|
||||
fp *d_in2;
|
||||
|
||||
//======================================================================================================================================================
|
||||
// CONVOLUTION
|
||||
//======================================================================================================================================================
|
||||
|
||||
fp *d_conv;
|
||||
fp *d_in_mod;
|
||||
|
||||
//======================================================================================================================================================
|
||||
// CUMULATIVE SUM
|
||||
//======================================================================================================================================================
|
||||
|
||||
//====================================================================================================
|
||||
// PAD ARRAY, VERTICAL CUMULATIVE SUM
|
||||
//====================================================================================================
|
||||
|
||||
fp *d_in2_pad_cumv;
|
||||
|
||||
//====================================================================================================
|
||||
// SELECTION
|
||||
//====================================================================================================
|
||||
|
||||
fp *d_in2_pad_cumv_sel;
|
||||
|
||||
//====================================================================================================
|
||||
// SELECTION 2, SUBTRACTION, HORIZONTAL CUMULATIVE SUM
|
||||
//====================================================================================================
|
||||
|
||||
fp *d_in2_sub_cumh;
|
||||
|
||||
//====================================================================================================
|
||||
// SELECTION
|
||||
//====================================================================================================
|
||||
|
||||
fp *d_in2_sub_cumh_sel;
|
||||
|
||||
//====================================================================================================
|
||||
// SELECTION 2, SUBTRACTION
|
||||
//====================================================================================================
|
||||
|
||||
fp *d_in2_sub2;
|
||||
|
||||
//======================================================================================================================================================
|
||||
// CUMULATIVE SUM 2
|
||||
//======================================================================================================================================================
|
||||
|
||||
//====================================================================================================
|
||||
// MULTIPLICATION
|
||||
//====================================================================================================
|
||||
|
||||
fp *d_in2_sqr;
|
||||
|
||||
//====================================================================================================
|
||||
// SELECTION 2, SUBTRACTION
|
||||
//====================================================================================================
|
||||
|
||||
fp *d_in2_sqr_sub2;
|
||||
|
||||
//======================================================================================================================================================
|
||||
// FINAL
|
||||
//======================================================================================================================================================
|
||||
|
||||
fp *d_in_sqr;
|
||||
|
||||
//======================================================================================================================================================
|
||||
// TEMPLATE MASK
|
||||
//======================================================================================================================================================
|
||||
|
||||
fp *d_tMask;
|
||||
|
||||
//======================================================================================================================================================
|
||||
// POINT MASK INITIALIZE
|
||||
//======================================================================================================================================================
|
||||
|
||||
fp *d_mask;
|
||||
|
||||
//======================================================================================================================================================
|
||||
// MASK CONVOLUTION
|
||||
//======================================================================================================================================================
|
||||
|
||||
fp *d_mask_conv;
|
||||
|
||||
} params_unique;
|
||||
|
||||
//===============================================================================================================================================================================================================
|
||||
//===============================================================================================================================================================================================================
|
||||
// END OF STRUCTURE
|
||||
//===============================================================================================================================================================================================================
|
||||
//===============================================================================================================================================================================================================
|
File diff suppressed because it is too large
Load Diff
|
@ -1,795 +0,0 @@
|
|||
//===============================================================================================================================================================================================================
|
||||
//===============================================================================================================================================================================================================
|
||||
// DEFINE / INCLUDE
|
||||
//===============================================================================================================================================================================================================
|
||||
//===============================================================================================================================================================================================================
|
||||
|
||||
//======================================================================================================================================================
|
||||
// LIBRARIES
|
||||
//======================================================================================================================================================
|
||||
|
||||
#include <math.h>
|
||||
#include <stdlib.h>
|
||||
#include <string.h>
|
||||
|
||||
#include <avilib.h>
|
||||
#include <avimod.h>
|
||||
#include <cuda.h>
|
||||
|
||||
//======================================================================================================================================================
|
||||
// STRUCTURES, GLOBAL STRUCTURE VARIABLES
|
||||
//======================================================================================================================================================
|
||||
|
||||
#include "define.c"
|
||||
|
||||
// Global parameter objects. Each one exists twice: a plain global that
// main() fills in, and a __constant__-qualified twin with a d_ prefix
// (presumably the device-side copy -- TODO confirm where it is uploaded).

// Per-frame parameters.
params_common_change common_change;
|
||||
__constant__ params_common_change d_common_change;
|
||||
|
||||
// Parameters shared by all points.
params_common common;
|
||||
__constant__ params_common d_common;
|
||||
|
||||
// Per-point parameters, one entry per tracked point.
params_unique unique[ALL_POINTS]; // the size cannot be determined dynamically,
|
||||
// so the array is sized for more points than are usually needed
|
||||
__constant__ params_unique d_unique[ALL_POINTS];
|
||||
|
||||
//======================================================================================================================================================
|
||||
// KERNEL CODE
|
||||
//======================================================================================================================================================
|
||||
|
||||
#include "kernel.cu"
|
||||
|
||||
// WRITE DATA FUNCTION
|
||||
//===============================================================================================================================================================================================================200
|
||||
|
||||
// Writes `points` tab-terminated integers for frame `frame` to `fid`.
// The value of point i is read from values[frame + i * stride].
static void write_point_row(FILE *fid, const int *values, int points,
                            int frame, int stride) {
  for (int i = 0; i < points; i++) {
    fprintf(fid, "%d\t", values[frame + i * stride]);
  }
}

// Writes the tracking results to the text file `filename`.
//
// Parameters:
//   filename          path of the output file (created or truncated)
//   frameNo           total number of frames in the AVI; also the stride
//                     between consecutive points in the input arrays
//   frames_processed  number of frames to write (frames 0 .. n-1)
//   endoPoints        number of endo points
//   input_a, input_b  endo arrays, indexed as [frame + point * frameNo]
//                     (presumably row and column locations -- TODO confirm)
//   epiPoints         number of epi points
//   input_2a, input_2b  epi arrays, indexed the same way
//
// The file starts with a four-line summary, followed for every frame by an
// "--endo--" section (one line per endo array) and an "--epi--" section
// (one line per epi array). If the file cannot be opened a message is
// printed and nothing is written.
void write_data(char *filename, int frameNo, int frames_processed,
                int endoPoints, int *input_a, int *input_b, int epiPoints,
                int *input_2a, int *input_2b) {

  //================================================================================80
  // OPEN FILE FOR WRITING
  //================================================================================80

  FILE *fid = fopen(filename, "w+");
  if (fid == NULL) {
    printf("The file was not opened for writing\n");
    return;
  }

  //================================================================================80
  // WRITE VALUES TO THE FILE
  //================================================================================80

  fprintf(fid, "Total AVI Frames: %d\n", frameNo);
  fprintf(fid, "Frames Processed: %d\n", frames_processed);
  fprintf(fid, "endoPoints: %d\n", endoPoints);
  fprintf(fid, "epiPoints: %d", epiPoints);

  for (int j = 0; j < frames_processed; j++) {
    fprintf(fid, "\n---Frame %d---", j);

    // The section headers take no arguments; the original passed a stray
    // `j` that no conversion specifier consumed.
    fprintf(fid, "\n--endo--\n");
    write_point_row(fid, input_a, endoPoints, j, frameNo);
    fprintf(fid, "\n");
    write_point_row(fid, input_b, endoPoints, j, frameNo);

    fprintf(fid, "\n--epi--\n");
    write_point_row(fid, input_2a, epiPoints, j, frameNo);
    fprintf(fid, "\n");
    write_point_row(fid, input_2b, epiPoints, j, frameNo);
  }

  //================================================================================80
  // CLOSE FILE
  //================================================================================80

  // fclose() flushes buffered output, so a failure here means data was lost.
  if (fclose(fid) != 0) {
    printf("The file was not closed properly\n");
  }
}
|
||||
|
||||
//===============================================================================================================================================================================================================
|
||||
//===============================================================================================================================================================================================================
|
||||
// MAIN FUNCTION
|
||||
//===============================================================================================================================================================================================================
|
||||
//===============================================================================================================================================================================================================
|
||||
// Return a heap copy of the `count` ints in `values`, or NULL when malloc
// fails. The caller owns the result and releases it with free().
static int *dup_int_table(const int *values, int count) {
  int *copy = (int *)malloc(sizeof(int) * count);
  if (copy != NULL) {
    for (int k = 0; k < count; k++) {
      copy[k] = values[k];
    }
  }
  return copy;
}

// Host driver for the heartwall tracker.
//
// Usage: heartwall <inputfile> <num of frames>
//
// Opens the AVI file, fills the global `common` / `unique[]` parameter
// structures (built-in start coordinates, buffer geometry, device buffers),
// copies them to the device symbols, runs `kernel` once per requested frame
// and copies the tracked locations back. When OUTPUT is defined the result is
// written to "result.txt".
//
// Returns 0 on success and -1 when the input file cannot be opened, the frame
// count argument is invalid or a host allocation fails; a wrong argument count
// terminates with exit(1). CUDA return codes are not checked, as before.
int main(int argc, char *argv[]) {

  // Hard-coded start coordinates (labelled "HARDCODED INPUTS FROM MATLAB" in
  // the original source), one entry per tracked point.
  static const int endo_row_init[] = {369, 400, 429, 452, 476, 486, 479,
                                      458, 433, 404, 374, 346, 318, 294,
                                      277, 269, 275, 287, 311, 339};
  static const int endo_col_init[] = {408, 406, 397, 383, 354, 322, 294,
                                      270, 250, 237, 235, 241, 254, 273,
                                      300, 328, 356, 383, 401, 411};
  static const int epi_row_init[] = {390, 419, 448, 474, 501, 519, 535, 542,
                                     543, 538, 528, 511, 491, 466, 438, 406,
                                     376, 347, 318, 291, 275, 259, 256, 252,
                                     252, 257, 266, 283, 305, 331, 360};
  static const int epi_col_init[] = {457, 454, 446, 431, 411, 388, 361, 331,
                                     301, 273, 243, 218, 196, 178, 166, 157,
                                     155, 165, 177, 197, 218, 248, 276, 304,
                                     333, 361, 391, 415, 434, 448, 455};
  const int endo_init_count =
      (int)(sizeof(endo_row_init) / sizeof(endo_row_init[0]));
  const int epi_init_count =
      (int)(sizeof(epi_row_init) / sizeof(epi_row_init[0]));

  cudaSetDevice(0);
  printf("WG size of kernel = %d \n", NUMBER_THREADS);

  // CUDA kernel execution parameters
  dim3 threads;
  dim3 blocks;

  // frames
  int frames_processed;
  char *video_file_name;
  avi_t *frames;
  fp *frame;

  //====================================================================
  // INPUT FILE AND ARGUMENTS
  //====================================================================

  if (argc != 3) {
    printf("ERROR: usage: heartwall <inputfile> <num of frames>\n");
    exit(1);
  }

  // open movie file
  // NOTE(review): the AVI handle is never closed in this function; add the
  // library's close call at the end if one is available.
  video_file_name = argv[1];
  frames = (avi_t *)AVI_open_input_file(video_file_name, 1);
  if (frames == NULL) {
    AVI_print_error((char *)"Error with AVI_open_input_file");
    return -1;
  }

  // frame geometry
  common.no_frames = AVI_video_frames(frames);
  common.frame_rows = AVI_video_height(frames);
  common.frame_cols = AVI_video_width(frames);
  common.frame_elem = common.frame_rows * common.frame_cols;
  common.frame_mem = sizeof(fp) * common.frame_elem;

  // Validate the frame count before any device memory is allocated. strtol
  // (unlike atoi) lets us reject an argument that contains no number at all.
  char *parse_end = NULL;
  long requested = strtol(argv[2], &parse_end, 10);
  if (parse_end == argv[2] || requested < 0 ||
      requested > common.no_frames) {
    printf("ERROR: %s is an incorrect number of frames specified, select in "
           "the range of 0-%d\n",
           argv[2], common.no_frames);
    return -1; // an invalid argument is a failure, not a success
  }
  frames_processed = (int)requested;

  // device copy of the current frame
  cudaMalloc((void **)&common_change.d_frame, common.frame_mem);

  //====================================================================
  // CONSTANTS
  //====================================================================

  common.sSize = 40;
  common.tSize = 25;
  common.maxMove = 10;
  common.alpha = 0.87;

  // malloc(0) may legitimately return NULL, so the per-frame location arrays
  // are only required to be non-NULL when the video has frames.
  const int tracking_needed = common.no_frames > 0;

  //====================================================================
  // ENDO POINTS
  //====================================================================

  common.endoPoints = ENDO_POINTS;
  common.endo_mem = sizeof(int) * common.endoPoints;
  if (common.endoPoints != endo_init_count) {
    printf("ERROR: ENDO_POINTS is %d but %d built-in endo coordinates are "
           "defined\n",
           common.endoPoints, endo_init_count);
    return -1;
  }

  common.endoRow = dup_int_table(endo_row_init, endo_init_count);
  common.endoCol = dup_int_table(endo_col_init, endo_init_count);
  common.tEndoRowLoc = (int *)malloc(common.endo_mem * common.no_frames);
  common.tEndoColLoc = (int *)malloc(common.endo_mem * common.no_frames);
  if (common.endoRow == NULL || common.endoCol == NULL ||
      (tracking_needed &&
       (common.tEndoRowLoc == NULL || common.tEndoColLoc == NULL))) {
    printf("ERROR: out of host memory while allocating endo point arrays\n");
    return -1;
  }

  cudaMalloc((void **)&common.d_endoRow, common.endo_mem);
  cudaMemcpy(common.d_endoRow, common.endoRow, common.endo_mem,
             cudaMemcpyHostToDevice);
  cudaMalloc((void **)&common.d_endoCol, common.endo_mem);
  cudaMemcpy(common.d_endoCol, common.endoCol, common.endo_mem,
             cudaMemcpyHostToDevice);
  cudaMalloc((void **)&common.d_tEndoRowLoc,
             common.endo_mem * common.no_frames);
  cudaMalloc((void **)&common.d_tEndoColLoc,
             common.endo_mem * common.no_frames);

  //====================================================================
  // EPI POINTS
  //====================================================================

  common.epiPoints = EPI_POINTS;
  common.epi_mem = sizeof(int) * common.epiPoints;
  if (common.epiPoints != epi_init_count) {
    printf("ERROR: EPI_POINTS is %d but %d built-in epi coordinates are "
           "defined\n",
           common.epiPoints, epi_init_count);
    return -1;
  }

  common.epiRow = dup_int_table(epi_row_init, epi_init_count);
  common.epiCol = dup_int_table(epi_col_init, epi_init_count);
  common.tEpiRowLoc = (int *)malloc(common.epi_mem * common.no_frames);
  common.tEpiColLoc = (int *)malloc(common.epi_mem * common.no_frames);
  if (common.epiRow == NULL || common.epiCol == NULL ||
      (tracking_needed &&
       (common.tEpiRowLoc == NULL || common.tEpiColLoc == NULL))) {
    printf("ERROR: out of host memory while allocating epi point arrays\n");
    return -1;
  }

  cudaMalloc((void **)&common.d_epiRow, common.epi_mem);
  cudaMemcpy(common.d_epiRow, common.epiRow, common.epi_mem,
             cudaMemcpyHostToDevice);
  cudaMalloc((void **)&common.d_epiCol, common.epi_mem);
  cudaMemcpy(common.d_epiCol, common.epiCol, common.epi_mem,
             cudaMemcpyHostToDevice);
  cudaMalloc((void **)&common.d_tEpiRowLoc, common.epi_mem * common.no_frames);
  cudaMalloc((void **)&common.d_tEpiColLoc, common.epi_mem * common.no_frames);

  //====================================================================
  // ALL POINTS
  //====================================================================

  common.allPoints = ALL_POINTS;

  //====================================================================
  // TEMPLATE SIZES AND TEMPLATE ARRAYS
  //====================================================================

  common.in_rows = common.tSize + 1 + common.tSize;
  common.in_cols = common.in_rows;
  common.in_elem = common.in_rows * common.in_cols;
  common.in_mem = sizeof(fp) * common.in_elem;

  // one template per point, stored back to back
  cudaMalloc((void **)&common.d_endoT, common.in_mem * common.endoPoints);
  cudaMalloc((void **)&common.d_epiT, common.in_mem * common.epiPoints);

  //====================================================================
  // PER-POINT PARAMETERS: endo points first, then epi points
  //====================================================================

  for (int i = 0; i < common.endoPoints; i++) {
    unique[i].point_no = i;
    unique[i].d_Row = common.d_endoRow;
    unique[i].d_Col = common.d_endoCol;
    unique[i].d_tRowLoc = common.d_tEndoRowLoc;
    unique[i].d_tColLoc = common.d_tEndoColLoc;
    unique[i].d_T = common.d_endoT;
  }
  for (int i = common.endoPoints; i < common.allPoints; i++) {
    unique[i].point_no = i - common.endoPoints;
    unique[i].d_Row = common.d_epiRow;
    unique[i].d_Col = common.d_epiCol;
    unique[i].d_tRowLoc = common.d_tEpiRowLoc;
    unique[i].d_tColLoc = common.d_tEpiColLoc;
    unique[i].d_T = common.d_epiT;
  }

  // element offset of each point's template inside its template array
  for (int i = 0; i < common.allPoints; i++) {
    unique[i].in_pointer = unique[i].point_no * common.in_elem;
  }

  //====================================================================
  // AREA AROUND POINT FROM FRAME
  //====================================================================

  common.in2_rows = 2 * common.sSize + 1;
  common.in2_cols = 2 * common.sSize + 1;
  common.in2_elem = common.in2_rows * common.in2_cols;
  common.in2_mem = sizeof(float) * common.in2_elem;

  //====================================================================
  // CONVOLUTION
  //====================================================================

  common.conv_rows = common.in_rows + common.in2_rows - 1;
  common.conv_cols = common.in_cols + common.in2_cols - 1;
  common.conv_elem = common.conv_rows * common.conv_cols;
  common.conv_mem = sizeof(float) * common.conv_elem;
  common.ioffset = 0;
  common.joffset = 0;

  //====================================================================
  // CUMULATIVE SUM: PADDING OF ARRAY, VERTICAL CUMULATIVE SUM
  //====================================================================

  common.in2_pad_add_rows = common.in_rows;
  common.in2_pad_add_cols = common.in_cols;

  common.in2_pad_cumv_rows = common.in2_rows + 2 * common.in2_pad_add_rows;
  common.in2_pad_cumv_cols = common.in2_cols + 2 * common.in2_pad_add_cols;
  common.in2_pad_cumv_elem =
      common.in2_pad_cumv_rows * common.in2_pad_cumv_cols;
  common.in2_pad_cumv_mem = sizeof(float) * common.in2_pad_cumv_elem;

  //====================================================================
  // CUMULATIVE SUM: SELECTION
  //====================================================================

  common.in2_pad_cumv_sel_rowlow = 1 + common.in_rows; // (1 to n+1)
  common.in2_pad_cumv_sel_rowhig = common.in2_pad_cumv_rows - 1;
  common.in2_pad_cumv_sel_collow = 1;
  common.in2_pad_cumv_sel_colhig = common.in2_pad_cumv_cols;
  common.in2_pad_cumv_sel_rows =
      common.in2_pad_cumv_sel_rowhig - common.in2_pad_cumv_sel_rowlow + 1;
  common.in2_pad_cumv_sel_cols =
      common.in2_pad_cumv_sel_colhig - common.in2_pad_cumv_sel_collow + 1;
  common.in2_pad_cumv_sel_elem =
      common.in2_pad_cumv_sel_rows * common.in2_pad_cumv_sel_cols;
  common.in2_pad_cumv_sel_mem = sizeof(float) * common.in2_pad_cumv_sel_elem;

  //====================================================================
  // CUMULATIVE SUM: SELECTION 2, SUBTRACTION, HORIZONTAL CUMULATIVE SUM
  //====================================================================

  common.in2_pad_cumv_sel2_rowlow = 1;
  common.in2_pad_cumv_sel2_rowhig =
      common.in2_pad_cumv_rows - common.in_rows - 1;
  common.in2_pad_cumv_sel2_collow = 1;
  common.in2_pad_cumv_sel2_colhig = common.in2_pad_cumv_cols;
  common.in2_sub_cumh_rows =
      common.in2_pad_cumv_sel2_rowhig - common.in2_pad_cumv_sel2_rowlow + 1;
  common.in2_sub_cumh_cols =
      common.in2_pad_cumv_sel2_colhig - common.in2_pad_cumv_sel2_collow + 1;
  common.in2_sub_cumh_elem =
      common.in2_sub_cumh_rows * common.in2_sub_cumh_cols;
  common.in2_sub_cumh_mem = sizeof(float) * common.in2_sub_cumh_elem;

  //====================================================================
  // CUMULATIVE SUM: SELECTION (HORIZONTAL)
  //====================================================================

  common.in2_sub_cumh_sel_rowlow = 1;
  common.in2_sub_cumh_sel_rowhig = common.in2_sub_cumh_rows;
  common.in2_sub_cumh_sel_collow = 1 + common.in_cols;
  common.in2_sub_cumh_sel_colhig = common.in2_sub_cumh_cols - 1;
  common.in2_sub_cumh_sel_rows =
      common.in2_sub_cumh_sel_rowhig - common.in2_sub_cumh_sel_rowlow + 1;
  common.in2_sub_cumh_sel_cols =
      common.in2_sub_cumh_sel_colhig - common.in2_sub_cumh_sel_collow + 1;
  common.in2_sub_cumh_sel_elem =
      common.in2_sub_cumh_sel_rows * common.in2_sub_cumh_sel_cols;
  common.in2_sub_cumh_sel_mem = sizeof(float) * common.in2_sub_cumh_sel_elem;

  //====================================================================
  // CUMULATIVE SUM: SELECTION 2, SUBTRACTION
  //====================================================================

  common.in2_sub_cumh_sel2_rowlow = 1;
  common.in2_sub_cumh_sel2_rowhig = common.in2_sub_cumh_rows;
  common.in2_sub_cumh_sel2_collow = 1;
  common.in2_sub_cumh_sel2_colhig =
      common.in2_sub_cumh_cols - common.in_cols - 1;
  common.in2_sub2_rows =
      common.in2_sub_cumh_sel2_rowhig - common.in2_sub_cumh_sel2_rowlow + 1;
  common.in2_sub2_cols =
      common.in2_sub_cumh_sel2_colhig - common.in2_sub_cumh_sel2_collow + 1;
  common.in2_sub2_elem = common.in2_sub2_rows * common.in2_sub2_cols;
  common.in2_sub2_mem = sizeof(float) * common.in2_sub2_elem;

  //====================================================================
  // CUMULATIVE SUM 2: MULTIPLICATION
  //====================================================================

  common.in2_sqr_rows = common.in2_rows;
  common.in2_sqr_cols = common.in2_cols;
  common.in2_sqr_elem = common.in2_elem;
  common.in2_sqr_mem = common.in2_mem;

  //====================================================================
  // CUMULATIVE SUM 2: SELECTION 2, SUBTRACTION
  //====================================================================

  common.in2_sqr_sub2_rows = common.in2_sub2_rows;
  common.in2_sqr_sub2_cols = common.in2_sub2_cols;
  common.in2_sqr_sub2_elem = common.in2_sub2_elem;
  common.in2_sqr_sub2_mem = common.in2_sub2_mem;

  //====================================================================
  // FINAL
  //====================================================================

  common.in_sqr_rows = common.in_rows;
  common.in_sqr_cols = common.in_cols;
  common.in_sqr_elem = common.in_elem;
  common.in_sqr_mem = common.in_mem;

  //====================================================================
  // TEMPLATE MASK CREATE
  //====================================================================

  common.tMask_rows = common.in_rows + (common.sSize + 1 + common.sSize) - 1;
  common.tMask_cols = common.tMask_rows;
  common.tMask_elem = common.tMask_rows * common.tMask_cols;
  common.tMask_mem = sizeof(float) * common.tMask_elem;

  //====================================================================
  // POINT MASK INITIALIZE
  //====================================================================

  common.mask_rows = common.maxMove;
  common.mask_cols = common.mask_rows;
  common.mask_elem = common.mask_rows * common.mask_cols;
  common.mask_mem = sizeof(float) * common.mask_elem;

  //====================================================================
  // MASK CONVOLUTION
  //====================================================================

  common.mask_conv_rows = common.tMask_rows;
  common.mask_conv_cols = common.tMask_cols;
  common.mask_conv_elem = common.mask_conv_rows * common.mask_conv_cols;
  common.mask_conv_mem = sizeof(float) * common.mask_conv_elem;

  // offsets are (size - 1) / 2 rounded up
  common.mask_conv_ioffset = (common.mask_rows - 1) / 2;
  if ((common.mask_rows - 1) % 2 > 0) {
    common.mask_conv_ioffset = common.mask_conv_ioffset + 1;
  }
  common.mask_conv_joffset = (common.mask_cols - 1) / 2;
  if ((common.mask_cols - 1) % 2 > 0) {
    common.mask_conv_joffset = common.mask_conv_joffset + 1;
  }

  //====================================================================
  // PER-POINT DEVICE WORK BUFFERS
  //====================================================================

  // Every size above is final at this point, so all scratch buffers of a
  // point are allocated together.
  for (int i = 0; i < common.allPoints; i++) {
    cudaMalloc((void **)&unique[i].d_in2, common.in2_mem);
    cudaMalloc((void **)&unique[i].d_conv, common.conv_mem);
    cudaMalloc((void **)&unique[i].d_in2_pad_cumv, common.in2_pad_cumv_mem);
    cudaMalloc((void **)&unique[i].d_in2_pad_cumv_sel,
               common.in2_pad_cumv_sel_mem);
    cudaMalloc((void **)&unique[i].d_in2_sub_cumh, common.in2_sub_cumh_mem);
    cudaMalloc((void **)&unique[i].d_in2_sub_cumh_sel,
               common.in2_sub_cumh_sel_mem);
    cudaMalloc((void **)&unique[i].d_in2_sub2, common.in2_sub2_mem);
    cudaMalloc((void **)&unique[i].d_in2_sqr, common.in2_sqr_mem);
    cudaMalloc((void **)&unique[i].d_in2_sqr_sub2, common.in2_sqr_sub2_mem);
    cudaMalloc((void **)&unique[i].d_in_sqr, common.in_sqr_mem);
    cudaMalloc((void **)&unique[i].d_tMask, common.tMask_mem);
    cudaMalloc((void **)&unique[i].d_mask_conv, common.mask_conv_mem);
  }

  //====================================================================
  // KERNEL: THREAD BLOCK
  //====================================================================

  // All operations within the kernel use the same max size of threads; the
  // block size is set to the size appropriate for the largest operation (on
  // the padded matrix), the others use subsets of that.
  threads.x = NUMBER_THREADS; // number of threads in the block
  threads.y = 1;
  blocks.x = common.allPoints; // number of blocks in the grid
  blocks.y = 1;

  //====================================================================
  // KERNEL: COPY ARGUMENTS
  //====================================================================

  cudaMemcpyToSymbol(d_common, &common, sizeof(params_common));
  cudaMemcpyToSymbol(d_unique, &unique, sizeof(params_unique) * ALL_POINTS);

  //====================================================================
  // PRINT FRAME PROGRESS START
  //====================================================================

  printf("frame progress: ");
  fflush(NULL);

  //====================================================================
  // LAUNCH
  //====================================================================

  for (common_change.frame_no = 0; common_change.frame_no < frames_processed;
       common_change.frame_no++) {
    printf("get frame\n");
    // fetch the current frame from the video file
    frame = get_frame(frames,                 // pointer to video file
                      common_change.frame_no, // frame to be returned
                      0,                      // cropped?
                      0,                      // scaled?
                      1);                     // converted
    printf("memcpy\n");
    // copy frame to GPU memory
    cudaMemcpy(common_change.d_frame, frame, common.frame_mem,
               cudaMemcpyHostToDevice);
    printf("toSymbol\n");
    cudaMemcpyToSymbol(d_common_change, &common_change,
                       sizeof(params_common_change));

    // Launch with the configuration computed above. The previous hard-coded
    // <<<1, 32>>> left `blocks`/`threads` unused even though per-point buffers
    // and results exist for all points.
    printf("launch\n");
    kernel<<<blocks, threads>>>();
    cudaDeviceSynchronize();
    printf("return\n");

    // free frame after each loop iteration, since AVI library allocates
    // memory for every frame fetched
    printf("free\n");
    free(frame);

    // print frame progress
    printf("%d ", common_change.frame_no);
    fflush(NULL);
  }

  //====================================================================
  // PRINT FRAME PROGRESS END
  //====================================================================

  printf("\n");
  fflush(NULL);

  //====================================================================
  // OUTPUT
  //====================================================================

  cudaMemcpy(common.tEndoRowLoc, common.d_tEndoRowLoc,
             common.endo_mem * common.no_frames, cudaMemcpyDeviceToHost);
  cudaMemcpy(common.tEndoColLoc, common.d_tEndoColLoc,
             common.endo_mem * common.no_frames, cudaMemcpyDeviceToHost);

  cudaMemcpy(common.tEpiRowLoc, common.d_tEpiRowLoc,
             common.epi_mem * common.no_frames, cudaMemcpyDeviceToHost);
  cudaMemcpy(common.tEpiColLoc, common.d_tEpiColLoc,
             common.epi_mem * common.no_frames, cudaMemcpyDeviceToHost);

#ifdef OUTPUT

  // dump data to file
  write_data("result.txt", common.no_frames, frames_processed,
             common.endoPoints, common.tEndoRowLoc, common.tEndoColLoc,
             common.epiPoints, common.tEpiRowLoc, common.tEpiColLoc);

#endif

  //====================================================================
  // DEALLOCATION: COMMON
  //====================================================================

  // frame
  cudaFree(common_change.d_frame);

  // endo points
  free(common.endoRow);
  free(common.endoCol);
  free(common.tEndoRowLoc);
  free(common.tEndoColLoc);

  cudaFree(common.d_endoRow);
  cudaFree(common.d_endoCol);
  cudaFree(common.d_tEndoRowLoc);
  cudaFree(common.d_tEndoColLoc);

  cudaFree(common.d_endoT);

  // epi points
  free(common.epiRow);
  free(common.epiCol);
  free(common.tEpiRowLoc);
  free(common.tEpiColLoc);

  cudaFree(common.d_epiRow);
  cudaFree(common.d_epiCol);
  cudaFree(common.d_tEpiRowLoc);
  cudaFree(common.d_tEpiColLoc);

  cudaFree(common.d_epiT);

  //====================================================================
  // DEALLOCATION: PER-POINT BUFFERS
  //====================================================================

  for (int i = 0; i < common.allPoints; i++) {
    cudaFree(unique[i].d_in2);

    cudaFree(unique[i].d_conv);
    cudaFree(unique[i].d_in2_pad_cumv);
    cudaFree(unique[i].d_in2_pad_cumv_sel);
    cudaFree(unique[i].d_in2_sub_cumh);
    cudaFree(unique[i].d_in2_sub_cumh_sel);
    cudaFree(unique[i].d_in2_sub2);
    cudaFree(unique[i].d_in2_sqr);
    cudaFree(unique[i].d_in2_sqr_sub2);
    cudaFree(unique[i].d_in_sqr);

    cudaFree(unique[i].d_tMask);
    cudaFree(unique[i].d_mask_conv);
  }

  return 0;
}
|
||||
|
||||
//===============================================================================================================================================================================================================
|
||||
//===============================================================================================================================================================================================================
|
||||
// MAIN FUNCTION
|
||||
//===============================================================================================================================================================================================================
|
||||
//===============================================================================================================================================================================================================
|
|
@ -1,17 +0,0 @@
|
|||
#!/bin/bash
# Build and run the heartwall example:
#   1. build the AVI helper objects,
#   2. compile main.cu with clang++, keeping the intermediate bitcode
#      (-save-temps),
#   3. run the kernel/host translators on the device and host bitcode,
#   4. lower both translated modules to object files with llc,
#   5. link against the x86Runtime and threadPool libraries,
#   6. run the binary on the sample video for 20 frames.
#
# The locations below default to the values that used to be hard-coded and can
# be overridden from the environment.

# Stop at the first failing step instead of continuing with stale artifacts.
set -e

REPO_ROOT="${REPO_ROOT:-/home/robinhan/repo/open_source_template}"
CUDA_ROOT="${CUDA_ROOT:-/usr/local/cuda-10.1}"
GPU_ARCH="${GPU_ARCH:-sm_61}"

make -C AVI

clang++ -DOUTPUT main.cu -I./AVI --cuda-path="${CUDA_ROOT}" --cuda-gpu-arch="${GPU_ARCH}" -L"${CUDA_ROOT}/lib64" -lcudart_static -ldl -lrt -pthread -save-temps -v

"${REPO_ROOT}/build/compilation/kernelTranslator" "main-cuda-nvptx64-nvidia-cuda-${GPU_ARCH}.bc" kernel.bc
"${REPO_ROOT}/build/compilation/hostTranslator" main-host-x86_64-unknown-linux-gnu.bc host.bc

llc --relocation-model=pic --filetype=obj kernel.bc
llc --relocation-model=pic --filetype=obj host.bc

g++ -Wall -L"${REPO_ROOT}/build/runtime" -L"${REPO_ROOT}/build/runtime/threadPool" -o heartwall -fPIC -no-pie host.o kernel.o ./AVI/avilib.o ./AVI/avimod.o -lc -lx86Runtime -lthreadPool -lpthread

./heartwall "${REPO_ROOT}/runtime/examples/rodinia-data/heartwall/test.avi" 20
|
|
@ -1,5 +0,0 @@
|
|||
////////////////////////////////////////////////////////////////////////////////
|
||||
// Set Device
|
||||
////////////////////////////////////////////////////////////////////////////////
|
||||
|
||||
// Select CUDA device 0 through cudaSetDevice. The status code returned by the
// runtime is not inspected.
void setdevice(void) {
  const int device_index = 0;
  cudaSetDevice(device_index);
}
|
|
@ -1,719 +0,0 @@
|
|||
; Module header of the device-side bitcode dump for hotspot.cu (target triple
; nvptx64-nvidia-cuda, module id names sm_61). Declares the builtin index
; wrapper types, their accessor comdats, and the module's global variables.
; ModuleID = 'hotspot-cuda-nvptx64-nvidia-cuda-sm_61.bc'
|
||||
source_filename = "hotspot.cu"
|
||||
target datalayout = "e-i64:64-i128:128-v16:16-v32:32-n16:32:64"
|
||||
target triple = "nvptx64-nvidia-cuda"
|
||||
|
||||
; One-byte placeholder structs for the blockIdx / threadIdx builtins, plus the
; attribute record taken by @cudaFuncGetAttributes.
%struct.__cuda_builtin_blockIdx_t = type { i8 }
|
||||
%struct.__cuda_builtin_threadIdx_t = type { i8 }
|
||||
%struct.cudaFuncAttributes = type { i64, i64, i64, i32, i32, i32, i32, i32, i32, i32 }
|
||||
|
||||
; Comdat groups for the x / y accessors of blockIdx and threadIdx.
$_ZN25__cuda_builtin_blockIdx_t17__fetch_builtin_xEv = comdat any
|
||||
|
||||
$_ZN25__cuda_builtin_blockIdx_t17__fetch_builtin_yEv = comdat any
|
||||
|
||||
$_ZN26__cuda_builtin_threadIdx_t17__fetch_builtin_xEv = comdat any
|
||||
|
||||
$_ZN26__cuda_builtin_threadIdx_t17__fetch_builtin_yEv = comdat any
|
||||
|
||||
; Three 16 x 16 float tiles (temp_on_cuda, power_on_cuda, temp_t) scoped to the
; calculate_temp kernel, placed in addrspace(3).
; NOTE(review): addrspace(3) is presumably NVPTX shared memory — confirm
; against the NVPTX back-end documentation.
@_ZZ14calculate_tempiPfS_S_iiiiffffffE12temp_on_cuda = internal addrspace(3) global [16 x [16 x float]] undef, align 4
|
||||
@_ZZ14calculate_tempiPfS_S_iiiiffffffE13power_on_cuda = internal addrspace(3) global [16 x [16 x float]] undef, align 4
|
||||
@_ZZ14calculate_tempiPfS_S_iiiiffffffE6temp_t = internal addrspace(3) global [16 x [16 x float]] undef, align 4
|
||||
; Externally provided (extern_weak) builtin index objects in addrspace(1).
@blockIdx = extern_weak dso_local addrspace(1) global %struct.__cuda_builtin_blockIdx_t, align 1
|
||||
@threadIdx = extern_weak dso_local addrspace(1) global %struct.__cuda_builtin_threadIdx_t, align 1
|
||||
|
||||
; Function Attrs: convergent noinline nounwind optnone
; Weak stub definition of cudaMalloc in the device module: both arguments are
; only spilled to stack slots and the body unconditionally returns 999.
|
||||
define weak dso_local i32 @cudaMalloc(i8** %p, i64 %s) #0 {
|
||||
entry:
|
||||
%p.addr = alloca i8**, align 8
|
||||
%s.addr = alloca i64, align 8
|
||||
store i8** %p, i8*** %p.addr, align 8
|
||||
store i64 %s, i64* %s.addr, align 8
|
||||
ret i32 999
|
||||
}
|
||||
|
||||
; Function Attrs: convergent noinline nounwind optnone
; Weak stub definition of cudaFuncGetAttributes: the two pointer arguments are
; spilled to stack slots, never dereferenced, and the body returns 999.
|
||||
define weak dso_local i32 @cudaFuncGetAttributes(%struct.cudaFuncAttributes* %p, i8* %c) #0 {
|
||||
entry:
|
||||
%p.addr = alloca %struct.cudaFuncAttributes*, align 8
|
||||
%c.addr = alloca i8*, align 8
|
||||
store %struct.cudaFuncAttributes* %p, %struct.cudaFuncAttributes** %p.addr, align 8
|
||||
store i8* %c, i8** %c.addr, align 8
|
||||
ret i32 999
|
||||
}
|
||||
|
||||
; Function Attrs: convergent noinline nounwind optnone
; Weak stub definition of cudaDeviceGetAttribute: the three arguments are
; spilled to stack slots, %value is never written through, and the body
; returns 999.
|
||||
define weak dso_local i32 @cudaDeviceGetAttribute(i32* %value, i32 %attr, i32 %device) #0 {
|
||||
entry:
|
||||
%value.addr = alloca i32*, align 8
|
||||
%attr.addr = alloca i32, align 4
|
||||
%device.addr = alloca i32, align 4
|
||||
store i32* %value, i32** %value.addr, align 8
|
||||
store i32 %attr, i32* %attr.addr, align 4
|
||||
store i32 %device, i32* %device.addr, align 4
|
||||
ret i32 999
|
||||
}
|
||||
|
||||
; Function Attrs: convergent noinline nounwind optnone
; Weak stub definition of cudaGetDevice: the %device pointer is spilled to a
; stack slot, never written through, and the body returns 999.
|
||||
define weak dso_local i32 @cudaGetDevice(i32* %device) #0 {
|
||||
entry:
|
||||
%device.addr = alloca i32*, align 8
|
||||
store i32* %device, i32** %device.addr, align 8
|
||||
ret i32 999
|
||||
}
|
||||
|
||||
; Function Attrs: convergent noinline nounwind optnone
; Weak stub definition of cudaOccupancyMaxActiveBlocksPerMultiprocessor: the
; four arguments are spilled to stack slots, %numBlocks is never written
; through, and the body returns 999.
|
||||
define weak dso_local i32 @cudaOccupancyMaxActiveBlocksPerMultiprocessor(i32* %numBlocks, i8* %func, i32 %blockSize, i64 %dynamicSmemSize) #0 {
|
||||
entry:
|
||||
%numBlocks.addr = alloca i32*, align 8
|
||||
%func.addr = alloca i8*, align 8
|
||||
%blockSize.addr = alloca i32, align 4
|
||||
%dynamicSmemSize.addr = alloca i64, align 8
|
||||
store i32* %numBlocks, i32** %numBlocks.addr, align 8
|
||||
store i8* %func, i8** %func.addr, align 8
|
||||
store i32 %blockSize, i32* %blockSize.addr, align 4
|
||||
store i64 %dynamicSmemSize, i64* %dynamicSmemSize.addr, align 8
|
||||
ret i32 999
|
||||
}
|
||||
|
||||
; Function Attrs: convergent noinline nounwind optnone
; Weak stub definition of cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags:
; the five arguments are spilled to stack slots, %numBlocks is never written
; through, and the body returns 999.
|
||||
define weak dso_local i32 @cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags(i32* %numBlocks, i8* %func, i32 %blockSize, i64 %dynamicSmemSize, i32 %flags) #0 {
|
||||
entry:
|
||||
%numBlocks.addr = alloca i32*, align 8
|
||||
%func.addr = alloca i8*, align 8
|
||||
%blockSize.addr = alloca i32, align 4
|
||||
%dynamicSmemSize.addr = alloca i64, align 8
|
||||
%flags.addr = alloca i32, align 4
|
||||
store i32* %numBlocks, i32** %numBlocks.addr, align 8
|
||||
store i8* %func, i8** %func.addr, align 8
|
||||
store i32 %blockSize, i32* %blockSize.addr, align 4
|
||||
store i64 %dynamicSmemSize, i64* %dynamicSmemSize.addr, align 8
|
||||
store i32 %flags, i32* %flags.addr, align 4
|
||||
ret i32 999
|
||||
}
|
||||
|
||||
; Function Attrs: convergent noinline nounwind optnone
|
||||
define dso_local void @_Z14calculate_tempiPfS_S_iiiiffffff(i32 %iteration, float* %power, float* %temp_src, float* %temp_dst, i32 %grid_cols, i32 %grid_rows, i32 %border_cols, i32 %border_rows, float %Cap, float %Rx, float %Ry, float %Rz, float %step, float %time_elapsed) #0 {
|
||||
entry:
|
||||
%iteration.addr = alloca i32, align 4
|
||||
%power.addr = alloca float*, align 8
|
||||
%temp_src.addr = alloca float*, align 8
|
||||
%temp_dst.addr = alloca float*, align 8
|
||||
%grid_cols.addr = alloca i32, align 4
|
||||
%grid_rows.addr = alloca i32, align 4
|
||||
%border_cols.addr = alloca i32, align 4
|
||||
%border_rows.addr = alloca i32, align 4
|
||||
%Cap.addr = alloca float, align 4
|
||||
%Rx.addr = alloca float, align 4
|
||||
%Ry.addr = alloca float, align 4
|
||||
%Rz.addr = alloca float, align 4
|
||||
%step.addr = alloca float, align 4
|
||||
%time_elapsed.addr = alloca float, align 4
|
||||
%amb_temp = alloca float, align 4
|
||||
%step_div_Cap = alloca float, align 4
|
||||
%Rx_1 = alloca float, align 4
|
||||
%Ry_1 = alloca float, align 4
|
||||
%Rz_1 = alloca float, align 4
|
||||
%bx = alloca i32, align 4
|
||||
%by = alloca i32, align 4
|
||||
%tx = alloca i32, align 4
|
||||
%ty = alloca i32, align 4
|
||||
%small_block_rows = alloca i32, align 4
|
||||
%small_block_cols = alloca i32, align 4
|
||||
%blkY = alloca i32, align 4
|
||||
%blkX = alloca i32, align 4
|
||||
%blkYmax = alloca i32, align 4
|
||||
%blkXmax = alloca i32, align 4
|
||||
%yidx = alloca i32, align 4
|
||||
%xidx = alloca i32, align 4
|
||||
%loadYidx = alloca i32, align 4
|
||||
%loadXidx = alloca i32, align 4
|
||||
%index = alloca i32, align 4
|
||||
%validYmin = alloca i32, align 4
|
||||
%validYmax = alloca i32, align 4
|
||||
%validXmin = alloca i32, align 4
|
||||
%validXmax = alloca i32, align 4
|
||||
%N = alloca i32, align 4
|
||||
%S = alloca i32, align 4
|
||||
%W = alloca i32, align 4
|
||||
%E = alloca i32, align 4
|
||||
%computed = alloca i8, align 1
|
||||
%i = alloca i32, align 4
|
||||
store i32 %iteration, i32* %iteration.addr, align 4
|
||||
store float* %power, float** %power.addr, align 8
|
||||
store float* %temp_src, float** %temp_src.addr, align 8
|
||||
store float* %temp_dst, float** %temp_dst.addr, align 8
|
||||
store i32 %grid_cols, i32* %grid_cols.addr, align 4
|
||||
store i32 %grid_rows, i32* %grid_rows.addr, align 4
|
||||
store i32 %border_cols, i32* %border_cols.addr, align 4
|
||||
store i32 %border_rows, i32* %border_rows.addr, align 4
|
||||
store float %Cap, float* %Cap.addr, align 4
|
||||
store float %Rx, float* %Rx.addr, align 4
|
||||
store float %Ry, float* %Ry.addr, align 4
|
||||
store float %Rz, float* %Rz.addr, align 4
|
||||
store float %step, float* %step.addr, align 4
|
||||
store float %time_elapsed, float* %time_elapsed.addr, align 4
|
||||
store float 8.000000e+01, float* %amb_temp, align 4
|
||||
%call = call i32 @_ZN25__cuda_builtin_blockIdx_t17__fetch_builtin_xEv() #2
|
||||
store i32 %call, i32* %bx, align 4
|
||||
%call1 = call i32 @_ZN25__cuda_builtin_blockIdx_t17__fetch_builtin_yEv() #2
|
||||
store i32 %call1, i32* %by, align 4
|
||||
%call2 = call i32 @_ZN26__cuda_builtin_threadIdx_t17__fetch_builtin_xEv() #2
|
||||
store i32 %call2, i32* %tx, align 4
|
||||
%call3 = call i32 @_ZN26__cuda_builtin_threadIdx_t17__fetch_builtin_yEv() #2
|
||||
store i32 %call3, i32* %ty, align 4
|
||||
%0 = load float, float* %step.addr, align 4
|
||||
%1 = load float, float* %Cap.addr, align 4
|
||||
%div = fdiv float %0, %1
|
||||
store float %div, float* %step_div_Cap, align 4
|
||||
%2 = load float, float* %Rx.addr, align 4
|
||||
%div4 = fdiv float 1.000000e+00, %2
|
||||
store float %div4, float* %Rx_1, align 4
|
||||
%3 = load float, float* %Ry.addr, align 4
|
||||
%div5 = fdiv float 1.000000e+00, %3
|
||||
store float %div5, float* %Ry_1, align 4
|
||||
%4 = load float, float* %Rz.addr, align 4
|
||||
%div6 = fdiv float 1.000000e+00, %4
|
||||
store float %div6, float* %Rz_1, align 4
|
||||
%5 = load i32, i32* %iteration.addr, align 4
|
||||
%mul = mul nsw i32 %5, 2
|
||||
%sub = sub nsw i32 16, %mul
|
||||
store i32 %sub, i32* %small_block_rows, align 4
|
||||
%6 = load i32, i32* %iteration.addr, align 4
|
||||
%mul7 = mul nsw i32 %6, 2
|
||||
%sub8 = sub nsw i32 16, %mul7
|
||||
store i32 %sub8, i32* %small_block_cols, align 4
|
||||
%7 = load i32, i32* %small_block_rows, align 4
|
||||
%8 = load i32, i32* %by, align 4
|
||||
%mul9 = mul nsw i32 %7, %8
|
||||
%9 = load i32, i32* %border_rows.addr, align 4
|
||||
%sub10 = sub nsw i32 %mul9, %9
|
||||
store i32 %sub10, i32* %blkY, align 4
|
||||
%10 = load i32, i32* %small_block_cols, align 4
|
||||
%11 = load i32, i32* %bx, align 4
|
||||
%mul11 = mul nsw i32 %10, %11
|
||||
%12 = load i32, i32* %border_cols.addr, align 4
|
||||
%sub12 = sub nsw i32 %mul11, %12
|
||||
store i32 %sub12, i32* %blkX, align 4
|
||||
%13 = load i32, i32* %blkY, align 4
|
||||
%add = add nsw i32 %13, 16
|
||||
%sub13 = sub nsw i32 %add, 1
|
||||
store i32 %sub13, i32* %blkYmax, align 4
|
||||
%14 = load i32, i32* %blkX, align 4
|
||||
%add14 = add nsw i32 %14, 16
|
||||
%sub15 = sub nsw i32 %add14, 1
|
||||
store i32 %sub15, i32* %blkXmax, align 4
|
||||
%15 = load i32, i32* %blkY, align 4
|
||||
%16 = load i32, i32* %ty, align 4
|
||||
%add16 = add nsw i32 %15, %16
|
||||
store i32 %add16, i32* %yidx, align 4
|
||||
%17 = load i32, i32* %blkX, align 4
|
||||
%18 = load i32, i32* %tx, align 4
|
||||
%add17 = add nsw i32 %17, %18
|
||||
store i32 %add17, i32* %xidx, align 4
|
||||
%19 = load i32, i32* %yidx, align 4
|
||||
store i32 %19, i32* %loadYidx, align 4
|
||||
%20 = load i32, i32* %xidx, align 4
|
||||
store i32 %20, i32* %loadXidx, align 4
|
||||
%21 = load i32, i32* %grid_cols.addr, align 4
|
||||
%22 = load i32, i32* %loadYidx, align 4
|
||||
%mul18 = mul nsw i32 %21, %22
|
||||
%23 = load i32, i32* %loadXidx, align 4
|
||||
%add19 = add nsw i32 %mul18, %23
|
||||
store i32 %add19, i32* %index, align 4
|
||||
%24 = load i32, i32* %loadYidx, align 4
|
||||
%cmp = icmp sge i32 %24, 0
|
||||
br i1 %cmp, label %land.lhs.true, label %if.end
|
||||
|
||||
land.lhs.true: ; preds = %entry
|
||||
%25 = load i32, i32* %loadYidx, align 4
|
||||
%26 = load i32, i32* %grid_rows.addr, align 4
|
||||
%sub20 = sub nsw i32 %26, 1
|
||||
%cmp21 = icmp sle i32 %25, %sub20
|
||||
br i1 %cmp21, label %land.lhs.true22, label %if.end
|
||||
|
||||
land.lhs.true22: ; preds = %land.lhs.true
|
||||
%27 = load i32, i32* %loadXidx, align 4
|
||||
%cmp23 = icmp sge i32 %27, 0
|
||||
br i1 %cmp23, label %land.lhs.true24, label %if.end
|
||||
|
||||
land.lhs.true24: ; preds = %land.lhs.true22
|
||||
%28 = load i32, i32* %loadXidx, align 4
|
||||
%29 = load i32, i32* %grid_cols.addr, align 4
|
||||
%sub25 = sub nsw i32 %29, 1
|
||||
%cmp26 = icmp sle i32 %28, %sub25
|
||||
br i1 %cmp26, label %if.then, label %if.end
|
||||
|
||||
if.then: ; preds = %land.lhs.true24
|
||||
%30 = load float*, float** %temp_src.addr, align 8
|
||||
%31 = load i32, i32* %index, align 4
|
||||
%idxprom = sext i32 %31 to i64
|
||||
%arrayidx = getelementptr inbounds float, float* %30, i64 %idxprom
|
||||
%32 = load float, float* %arrayidx, align 4
|
||||
%33 = load i32, i32* %ty, align 4
|
||||
%idxprom27 = sext i32 %33 to i64
|
||||
%arrayidx28 = getelementptr inbounds [16 x [16 x float]], [16 x [16 x float]]* addrspacecast ([16 x [16 x float]] addrspace(3)* @_ZZ14calculate_tempiPfS_S_iiiiffffffE12temp_on_cuda to [16 x [16 x float]]*), i64 0, i64 %idxprom27
|
||||
%34 = load i32, i32* %tx, align 4
|
||||
%idxprom29 = sext i32 %34 to i64
|
||||
%arrayidx30 = getelementptr inbounds [16 x float], [16 x float]* %arrayidx28, i64 0, i64 %idxprom29
|
||||
store float %32, float* %arrayidx30, align 4
|
||||
%35 = load float*, float** %power.addr, align 8
|
||||
%36 = load i32, i32* %index, align 4
|
||||
%idxprom31 = sext i32 %36 to i64
|
||||
%arrayidx32 = getelementptr inbounds float, float* %35, i64 %idxprom31
|
||||
%37 = load float, float* %arrayidx32, align 4
|
||||
%38 = load i32, i32* %ty, align 4
|
||||
%idxprom33 = sext i32 %38 to i64
|
||||
%arrayidx34 = getelementptr inbounds [16 x [16 x float]], [16 x [16 x float]]* addrspacecast ([16 x [16 x float]] addrspace(3)* @_ZZ14calculate_tempiPfS_S_iiiiffffffE13power_on_cuda to [16 x [16 x float]]*), i64 0, i64 %idxprom33
|
||||
%39 = load i32, i32* %tx, align 4
|
||||
%idxprom35 = sext i32 %39 to i64
|
||||
%arrayidx36 = getelementptr inbounds [16 x float], [16 x float]* %arrayidx34, i64 0, i64 %idxprom35
|
||||
store float %37, float* %arrayidx36, align 4
|
||||
br label %if.end
|
||||
|
||||
if.end: ; preds = %if.then, %land.lhs.true24, %land.lhs.true22, %land.lhs.true, %entry
|
||||
call void @llvm.nvvm.barrier0()
|
||||
%40 = load i32, i32* %blkY, align 4
|
||||
%cmp37 = icmp slt i32 %40, 0
|
||||
br i1 %cmp37, label %cond.true, label %cond.false
|
||||
|
||||
cond.true: ; preds = %if.end
|
||||
%41 = load i32, i32* %blkY, align 4
|
||||
%sub38 = sub nsw i32 0, %41
|
||||
br label %cond.end
|
||||
|
||||
cond.false: ; preds = %if.end
|
||||
br label %cond.end
|
||||
|
||||
cond.end: ; preds = %cond.false, %cond.true
|
||||
%cond = phi i32 [ %sub38, %cond.true ], [ 0, %cond.false ]
|
||||
store i32 %cond, i32* %validYmin, align 4
|
||||
%42 = load i32, i32* %blkYmax, align 4
|
||||
%43 = load i32, i32* %grid_rows.addr, align 4
|
||||
%sub39 = sub nsw i32 %43, 1
|
||||
%cmp40 = icmp sgt i32 %42, %sub39
|
||||
br i1 %cmp40, label %cond.true41, label %cond.false45
|
||||
|
||||
cond.true41: ; preds = %cond.end
|
||||
%44 = load i32, i32* %blkYmax, align 4
|
||||
%45 = load i32, i32* %grid_rows.addr, align 4
|
||||
%sub42 = sub nsw i32 %44, %45
|
||||
%add43 = add nsw i32 %sub42, 1
|
||||
%sub44 = sub nsw i32 15, %add43
|
||||
br label %cond.end46
|
||||
|
||||
cond.false45: ; preds = %cond.end
|
||||
br label %cond.end46
|
||||
|
||||
cond.end46: ; preds = %cond.false45, %cond.true41
|
||||
%cond47 = phi i32 [ %sub44, %cond.true41 ], [ 15, %cond.false45 ]
|
||||
store i32 %cond47, i32* %validYmax, align 4
|
||||
%46 = load i32, i32* %blkX, align 4
|
||||
%cmp48 = icmp slt i32 %46, 0
|
||||
br i1 %cmp48, label %cond.true49, label %cond.false51
|
||||
|
||||
cond.true49: ; preds = %cond.end46
|
||||
%47 = load i32, i32* %blkX, align 4
|
||||
%sub50 = sub nsw i32 0, %47
|
||||
br label %cond.end52
|
||||
|
||||
cond.false51: ; preds = %cond.end46
|
||||
br label %cond.end52
|
||||
|
||||
cond.end52: ; preds = %cond.false51, %cond.true49
|
||||
%cond53 = phi i32 [ %sub50, %cond.true49 ], [ 0, %cond.false51 ]
|
||||
store i32 %cond53, i32* %validXmin, align 4
|
||||
%48 = load i32, i32* %blkXmax, align 4
|
||||
%49 = load i32, i32* %grid_cols.addr, align 4
|
||||
%sub54 = sub nsw i32 %49, 1
|
||||
%cmp55 = icmp sgt i32 %48, %sub54
|
||||
br i1 %cmp55, label %cond.true56, label %cond.false60
|
||||
|
||||
cond.true56: ; preds = %cond.end52
|
||||
%50 = load i32, i32* %blkXmax, align 4
|
||||
%51 = load i32, i32* %grid_cols.addr, align 4
|
||||
%sub57 = sub nsw i32 %50, %51
|
||||
%add58 = add nsw i32 %sub57, 1
|
||||
%sub59 = sub nsw i32 15, %add58
|
||||
br label %cond.end61
|
||||
|
||||
cond.false60: ; preds = %cond.end52
|
||||
br label %cond.end61
|
||||
|
||||
cond.end61: ; preds = %cond.false60, %cond.true56
|
||||
%cond62 = phi i32 [ %sub59, %cond.true56 ], [ 15, %cond.false60 ]
|
||||
store i32 %cond62, i32* %validXmax, align 4
|
||||
%52 = load i32, i32* %ty, align 4
|
||||
%sub63 = sub nsw i32 %52, 1
|
||||
store i32 %sub63, i32* %N, align 4
|
||||
%53 = load i32, i32* %ty, align 4
|
||||
%add64 = add nsw i32 %53, 1
|
||||
store i32 %add64, i32* %S, align 4
|
||||
%54 = load i32, i32* %tx, align 4
|
||||
%sub65 = sub nsw i32 %54, 1
|
||||
store i32 %sub65, i32* %W, align 4
|
||||
%55 = load i32, i32* %tx, align 4
|
||||
%add66 = add nsw i32 %55, 1
|
||||
store i32 %add66, i32* %E, align 4
|
||||
%56 = load i32, i32* %N, align 4
|
||||
%57 = load i32, i32* %validYmin, align 4
|
||||
%cmp67 = icmp slt i32 %56, %57
|
||||
br i1 %cmp67, label %cond.true68, label %cond.false69
|
||||
|
||||
cond.true68: ; preds = %cond.end61
|
||||
%58 = load i32, i32* %validYmin, align 4
|
||||
br label %cond.end70
|
||||
|
||||
cond.false69: ; preds = %cond.end61
|
||||
%59 = load i32, i32* %N, align 4
|
||||
br label %cond.end70
|
||||
|
||||
cond.end70: ; preds = %cond.false69, %cond.true68
|
||||
%cond71 = phi i32 [ %58, %cond.true68 ], [ %59, %cond.false69 ]
|
||||
store i32 %cond71, i32* %N, align 4
|
||||
%60 = load i32, i32* %S, align 4
|
||||
%61 = load i32, i32* %validYmax, align 4
|
||||
%cmp72 = icmp sgt i32 %60, %61
|
||||
br i1 %cmp72, label %cond.true73, label %cond.false74
|
||||
|
||||
cond.true73: ; preds = %cond.end70
|
||||
%62 = load i32, i32* %validYmax, align 4
|
||||
br label %cond.end75
|
||||
|
||||
cond.false74: ; preds = %cond.end70
|
||||
%63 = load i32, i32* %S, align 4
|
||||
br label %cond.end75
|
||||
|
||||
cond.end75: ; preds = %cond.false74, %cond.true73
|
||||
%cond76 = phi i32 [ %62, %cond.true73 ], [ %63, %cond.false74 ]
|
||||
store i32 %cond76, i32* %S, align 4
|
||||
%64 = load i32, i32* %W, align 4
|
||||
%65 = load i32, i32* %validXmin, align 4
|
||||
%cmp77 = icmp slt i32 %64, %65
|
||||
br i1 %cmp77, label %cond.true78, label %cond.false79
|
||||
|
||||
cond.true78: ; preds = %cond.end75
|
||||
%66 = load i32, i32* %validXmin, align 4
|
||||
br label %cond.end80
|
||||
|
||||
cond.false79: ; preds = %cond.end75
|
||||
%67 = load i32, i32* %W, align 4
|
||||
br label %cond.end80
|
||||
|
||||
cond.end80: ; preds = %cond.false79, %cond.true78
|
||||
%cond81 = phi i32 [ %66, %cond.true78 ], [ %67, %cond.false79 ]
|
||||
store i32 %cond81, i32* %W, align 4
|
||||
%68 = load i32, i32* %E, align 4
|
||||
%69 = load i32, i32* %validXmax, align 4
|
||||
%cmp82 = icmp sgt i32 %68, %69
|
||||
br i1 %cmp82, label %cond.true83, label %cond.false84
|
||||
|
||||
cond.true83: ; preds = %cond.end80
|
||||
%70 = load i32, i32* %validXmax, align 4
|
||||
br label %cond.end85
|
||||
|
||||
cond.false84: ; preds = %cond.end80
|
||||
%71 = load i32, i32* %E, align 4
|
||||
br label %cond.end85
|
||||
|
||||
cond.end85: ; preds = %cond.false84, %cond.true83
|
||||
%cond86 = phi i32 [ %70, %cond.true83 ], [ %71, %cond.false84 ]
|
||||
store i32 %cond86, i32* %E, align 4
|
||||
store i32 0, i32* %i, align 4
|
||||
br label %for.cond
|
||||
|
||||
for.cond: ; preds = %for.inc, %cond.end85
|
||||
%72 = load i32, i32* %i, align 4
|
||||
%73 = load i32, i32* %iteration.addr, align 4
|
||||
%cmp87 = icmp slt i32 %72, %73
|
||||
br i1 %cmp87, label %for.body, label %for.end
|
||||
|
||||
for.body: ; preds = %for.cond
|
||||
store i8 0, i8* %computed, align 1
|
||||
%74 = load i32, i32* %tx, align 4
|
||||
%75 = load i32, i32* %i, align 4
|
||||
%add88 = add nsw i32 %75, 1
|
||||
%cmp89 = icmp sge i32 %74, %add88
|
||||
br i1 %cmp89, label %land.lhs.true90, label %if.end175
|
||||
|
||||
land.lhs.true90: ; preds = %for.body
|
||||
%76 = load i32, i32* %tx, align 4
|
||||
%77 = load i32, i32* %i, align 4
|
||||
%sub91 = sub nsw i32 16, %77
|
||||
%sub92 = sub nsw i32 %sub91, 2
|
||||
%cmp93 = icmp sle i32 %76, %sub92
|
||||
br i1 %cmp93, label %land.lhs.true94, label %if.end175
|
||||
|
||||
land.lhs.true94: ; preds = %land.lhs.true90
|
||||
%78 = load i32, i32* %ty, align 4
|
||||
%79 = load i32, i32* %i, align 4
|
||||
%add95 = add nsw i32 %79, 1
|
||||
%cmp96 = icmp sge i32 %78, %add95
|
||||
br i1 %cmp96, label %land.lhs.true97, label %if.end175
|
||||
|
||||
land.lhs.true97: ; preds = %land.lhs.true94
|
||||
%80 = load i32, i32* %ty, align 4
|
||||
%81 = load i32, i32* %i, align 4
|
||||
%sub98 = sub nsw i32 16, %81
|
||||
%sub99 = sub nsw i32 %sub98, 2
|
||||
%cmp100 = icmp sle i32 %80, %sub99
|
||||
br i1 %cmp100, label %land.lhs.true101, label %if.end175
|
||||
|
||||
land.lhs.true101: ; preds = %land.lhs.true97
|
||||
%82 = load i32, i32* %tx, align 4
|
||||
%83 = load i32, i32* %validXmin, align 4
|
||||
%cmp102 = icmp sge i32 %82, %83
|
||||
br i1 %cmp102, label %land.lhs.true103, label %if.end175
|
||||
|
||||
land.lhs.true103: ; preds = %land.lhs.true101
|
||||
%84 = load i32, i32* %tx, align 4
|
||||
%85 = load i32, i32* %validXmax, align 4
|
||||
%cmp104 = icmp sle i32 %84, %85
|
||||
br i1 %cmp104, label %land.lhs.true105, label %if.end175
|
||||
|
||||
land.lhs.true105: ; preds = %land.lhs.true103
|
||||
%86 = load i32, i32* %ty, align 4
|
||||
%87 = load i32, i32* %validYmin, align 4
|
||||
%cmp106 = icmp sge i32 %86, %87
|
||||
br i1 %cmp106, label %land.lhs.true107, label %if.end175
|
||||
|
||||
land.lhs.true107: ; preds = %land.lhs.true105
|
||||
%88 = load i32, i32* %ty, align 4
|
||||
%89 = load i32, i32* %validYmax, align 4
|
||||
%cmp108 = icmp sle i32 %88, %89
|
||||
br i1 %cmp108, label %if.then109, label %if.end175
|
||||
|
||||
if.then109: ; preds = %land.lhs.true107
|
||||
store i8 1, i8* %computed, align 1
|
||||
%90 = load i32, i32* %ty, align 4
|
||||
%idxprom110 = sext i32 %90 to i64
|
||||
%arrayidx111 = getelementptr inbounds [16 x [16 x float]], [16 x [16 x float]]* addrspacecast ([16 x [16 x float]] addrspace(3)* @_ZZ14calculate_tempiPfS_S_iiiiffffffE12temp_on_cuda to [16 x [16 x float]]*), i64 0, i64 %idxprom110
|
||||
%91 = load i32, i32* %tx, align 4
|
||||
%idxprom112 = sext i32 %91 to i64
|
||||
%arrayidx113 = getelementptr inbounds [16 x float], [16 x float]* %arrayidx111, i64 0, i64 %idxprom112
|
||||
%92 = load float, float* %arrayidx113, align 4
|
||||
%conv = fpext float %92 to double
|
||||
%93 = load float, float* %step_div_Cap, align 4
|
||||
%conv114 = fpext float %93 to double
|
||||
%94 = load i32, i32* %ty, align 4
|
||||
%idxprom115 = sext i32 %94 to i64
|
||||
%arrayidx116 = getelementptr inbounds [16 x [16 x float]], [16 x [16 x float]]* addrspacecast ([16 x [16 x float]] addrspace(3)* @_ZZ14calculate_tempiPfS_S_iiiiffffffE13power_on_cuda to [16 x [16 x float]]*), i64 0, i64 %idxprom115
|
||||
%95 = load i32, i32* %tx, align 4
|
||||
%idxprom117 = sext i32 %95 to i64
|
||||
%arrayidx118 = getelementptr inbounds [16 x float], [16 x float]* %arrayidx116, i64 0, i64 %idxprom117
|
||||
%96 = load float, float* %arrayidx118, align 4
|
||||
%conv119 = fpext float %96 to double
|
||||
%97 = load i32, i32* %S, align 4
|
||||
%idxprom120 = sext i32 %97 to i64
|
||||
%arrayidx121 = getelementptr inbounds [16 x [16 x float]], [16 x [16 x float]]* addrspacecast ([16 x [16 x float]] addrspace(3)* @_ZZ14calculate_tempiPfS_S_iiiiffffffE12temp_on_cuda to [16 x [16 x float]]*), i64 0, i64 %idxprom120
|
||||
%98 = load i32, i32* %tx, align 4
|
||||
%idxprom122 = sext i32 %98 to i64
|
||||
%arrayidx123 = getelementptr inbounds [16 x float], [16 x float]* %arrayidx121, i64 0, i64 %idxprom122
|
||||
%99 = load float, float* %arrayidx123, align 4
|
||||
%100 = load i32, i32* %N, align 4
|
||||
%idxprom124 = sext i32 %100 to i64
|
||||
%arrayidx125 = getelementptr inbounds [16 x [16 x float]], [16 x [16 x float]]* addrspacecast ([16 x [16 x float]] addrspace(3)* @_ZZ14calculate_tempiPfS_S_iiiiffffffE12temp_on_cuda to [16 x [16 x float]]*), i64 0, i64 %idxprom124
|
||||
%101 = load i32, i32* %tx, align 4
|
||||
%idxprom126 = sext i32 %101 to i64
|
||||
%arrayidx127 = getelementptr inbounds [16 x float], [16 x float]* %arrayidx125, i64 0, i64 %idxprom126
|
||||
%102 = load float, float* %arrayidx127, align 4
|
||||
%add128 = fadd contract float %99, %102
|
||||
%conv129 = fpext float %add128 to double
|
||||
%103 = load i32, i32* %ty, align 4
|
||||
%idxprom130 = sext i32 %103 to i64
|
||||
%arrayidx131 = getelementptr inbounds [16 x [16 x float]], [16 x [16 x float]]* addrspacecast ([16 x [16 x float]] addrspace(3)* @_ZZ14calculate_tempiPfS_S_iiiiffffffE12temp_on_cuda to [16 x [16 x float]]*), i64 0, i64 %idxprom130
|
||||
%104 = load i32, i32* %tx, align 4
|
||||
%idxprom132 = sext i32 %104 to i64
|
||||
%arrayidx133 = getelementptr inbounds [16 x float], [16 x float]* %arrayidx131, i64 0, i64 %idxprom132
|
||||
%105 = load float, float* %arrayidx133, align 4
|
||||
%conv134 = fpext float %105 to double
|
||||
%mul135 = fmul contract double 2.000000e+00, %conv134
|
||||
%sub136 = fsub contract double %conv129, %mul135
|
||||
%106 = load float, float* %Ry_1, align 4
|
||||
%conv137 = fpext float %106 to double
|
||||
%mul138 = fmul contract double %sub136, %conv137
|
||||
%add139 = fadd contract double %conv119, %mul138
|
||||
%107 = load i32, i32* %ty, align 4
|
||||
%idxprom140 = sext i32 %107 to i64
|
||||
%arrayidx141 = getelementptr inbounds [16 x [16 x float]], [16 x [16 x float]]* addrspacecast ([16 x [16 x float]] addrspace(3)* @_ZZ14calculate_tempiPfS_S_iiiiffffffE12temp_on_cuda to [16 x [16 x float]]*), i64 0, i64 %idxprom140
|
||||
%108 = load i32, i32* %E, align 4
|
||||
%idxprom142 = sext i32 %108 to i64
|
||||
%arrayidx143 = getelementptr inbounds [16 x float], [16 x float]* %arrayidx141, i64 0, i64 %idxprom142
|
||||
%109 = load float, float* %arrayidx143, align 4
|
||||
%110 = load i32, i32* %ty, align 4
|
||||
%idxprom144 = sext i32 %110 to i64
|
||||
%arrayidx145 = getelementptr inbounds [16 x [16 x float]], [16 x [16 x float]]* addrspacecast ([16 x [16 x float]] addrspace(3)* @_ZZ14calculate_tempiPfS_S_iiiiffffffE12temp_on_cuda to [16 x [16 x float]]*), i64 0, i64 %idxprom144
|
||||
%111 = load i32, i32* %W, align 4
|
||||
%idxprom146 = sext i32 %111 to i64
|
||||
%arrayidx147 = getelementptr inbounds [16 x float], [16 x float]* %arrayidx145, i64 0, i64 %idxprom146
|
||||
%112 = load float, float* %arrayidx147, align 4
|
||||
%add148 = fadd contract float %109, %112
|
||||
%conv149 = fpext float %add148 to double
|
||||
%113 = load i32, i32* %ty, align 4
|
||||
%idxprom150 = sext i32 %113 to i64
|
||||
%arrayidx151 = getelementptr inbounds [16 x [16 x float]], [16 x [16 x float]]* addrspacecast ([16 x [16 x float]] addrspace(3)* @_ZZ14calculate_tempiPfS_S_iiiiffffffE12temp_on_cuda to [16 x [16 x float]]*), i64 0, i64 %idxprom150
|
||||
%114 = load i32, i32* %tx, align 4
|
||||
%idxprom152 = sext i32 %114 to i64
|
||||
%arrayidx153 = getelementptr inbounds [16 x float], [16 x float]* %arrayidx151, i64 0, i64 %idxprom152
|
||||
%115 = load float, float* %arrayidx153, align 4
|
||||
%conv154 = fpext float %115 to double
|
||||
%mul155 = fmul contract double 2.000000e+00, %conv154
|
||||
%sub156 = fsub contract double %conv149, %mul155
|
||||
%116 = load float, float* %Rx_1, align 4
|
||||
%conv157 = fpext float %116 to double
|
||||
%mul158 = fmul contract double %sub156, %conv157
|
||||
%add159 = fadd contract double %add139, %mul158
|
||||
%117 = load float, float* %amb_temp, align 4
|
||||
%118 = load i32, i32* %ty, align 4
|
||||
%idxprom160 = sext i32 %118 to i64
|
||||
%arrayidx161 = getelementptr inbounds [16 x [16 x float]], [16 x [16 x float]]* addrspacecast ([16 x [16 x float]] addrspace(3)* @_ZZ14calculate_tempiPfS_S_iiiiffffffE12temp_on_cuda to [16 x [16 x float]]*), i64 0, i64 %idxprom160
|
||||
%119 = load i32, i32* %tx, align 4
|
||||
%idxprom162 = sext i32 %119 to i64
|
||||
%arrayidx163 = getelementptr inbounds [16 x float], [16 x float]* %arrayidx161, i64 0, i64 %idxprom162
|
||||
%120 = load float, float* %arrayidx163, align 4
|
||||
%sub164 = fsub contract float %117, %120
|
||||
%121 = load float, float* %Rz_1, align 4
|
||||
%mul165 = fmul contract float %sub164, %121
|
||||
%conv166 = fpext float %mul165 to double
|
||||
%add167 = fadd contract double %add159, %conv166
|
||||
%mul168 = fmul contract double %conv114, %add167
|
||||
%add169 = fadd contract double %conv, %mul168
|
||||
%conv170 = fptrunc double %add169 to float
|
||||
%122 = load i32, i32* %ty, align 4
|
||||
%idxprom171 = sext i32 %122 to i64
|
||||
%arrayidx172 = getelementptr inbounds [16 x [16 x float]], [16 x [16 x float]]* addrspacecast ([16 x [16 x float]] addrspace(3)* @_ZZ14calculate_tempiPfS_S_iiiiffffffE6temp_t to [16 x [16 x float]]*), i64 0, i64 %idxprom171
|
||||
%123 = load i32, i32* %tx, align 4
|
||||
%idxprom173 = sext i32 %123 to i64
|
||||
%arrayidx174 = getelementptr inbounds [16 x float], [16 x float]* %arrayidx172, i64 0, i64 %idxprom173
|
||||
store float %conv170, float* %arrayidx174, align 4
|
||||
br label %if.end175
|
||||
|
||||
if.end175: ; preds = %if.then109, %land.lhs.true107, %land.lhs.true105, %land.lhs.true103, %land.lhs.true101, %land.lhs.true97, %land.lhs.true94, %land.lhs.true90, %for.body
|
||||
call void @llvm.nvvm.barrier0()
|
||||
%124 = load i32, i32* %i, align 4
|
||||
%125 = load i32, i32* %iteration.addr, align 4
|
||||
%sub176 = sub nsw i32 %125, 1
|
||||
%cmp177 = icmp eq i32 %124, %sub176
|
||||
br i1 %cmp177, label %if.then178, label %if.end179
|
||||
|
||||
if.then178: ; preds = %if.end175
|
||||
br label %for.end
|
||||
|
||||
if.end179: ; preds = %if.end175
|
||||
%126 = load i8, i8* %computed, align 1
|
||||
%tobool = trunc i8 %126 to i1
|
||||
br i1 %tobool, label %if.then180, label %if.end189
|
||||
|
||||
if.then180: ; preds = %if.end179
|
||||
%127 = load i32, i32* %ty, align 4
|
||||
%idxprom181 = sext i32 %127 to i64
|
||||
%arrayidx182 = getelementptr inbounds [16 x [16 x float]], [16 x [16 x float]]* addrspacecast ([16 x [16 x float]] addrspace(3)* @_ZZ14calculate_tempiPfS_S_iiiiffffffE6temp_t to [16 x [16 x float]]*), i64 0, i64 %idxprom181
|
||||
%128 = load i32, i32* %tx, align 4
|
||||
%idxprom183 = sext i32 %128 to i64
|
||||
%arrayidx184 = getelementptr inbounds [16 x float], [16 x float]* %arrayidx182, i64 0, i64 %idxprom183
|
||||
%129 = load float, float* %arrayidx184, align 4
|
||||
%130 = load i32, i32* %ty, align 4
|
||||
%idxprom185 = sext i32 %130 to i64
|
||||
%arrayidx186 = getelementptr inbounds [16 x [16 x float]], [16 x [16 x float]]* addrspacecast ([16 x [16 x float]] addrspace(3)* @_ZZ14calculate_tempiPfS_S_iiiiffffffE12temp_on_cuda to [16 x [16 x float]]*), i64 0, i64 %idxprom185
|
||||
%131 = load i32, i32* %tx, align 4
|
||||
%idxprom187 = sext i32 %131 to i64
|
||||
%arrayidx188 = getelementptr inbounds [16 x float], [16 x float]* %arrayidx186, i64 0, i64 %idxprom187
|
||||
store float %129, float* %arrayidx188, align 4
|
||||
br label %if.end189
|
||||
|
||||
if.end189: ; preds = %if.then180, %if.end179
|
||||
call void @llvm.nvvm.barrier0()
|
||||
br label %for.inc
|
||||
|
||||
for.inc: ; preds = %if.end189
|
||||
%132 = load i32, i32* %i, align 4
|
||||
%inc = add nsw i32 %132, 1
|
||||
store i32 %inc, i32* %i, align 4
|
||||
br label %for.cond
|
||||
|
||||
for.end: ; preds = %if.then178, %for.cond
|
||||
%133 = load i8, i8* %computed, align 1
|
||||
%tobool190 = trunc i8 %133 to i1
|
||||
br i1 %tobool190, label %if.then191, label %if.end198
|
||||
|
||||
if.then191: ; preds = %for.end
|
||||
%134 = load i32, i32* %ty, align 4
|
||||
%idxprom192 = sext i32 %134 to i64
|
||||
%arrayidx193 = getelementptr inbounds [16 x [16 x float]], [16 x [16 x float]]* addrspacecast ([16 x [16 x float]] addrspace(3)* @_ZZ14calculate_tempiPfS_S_iiiiffffffE6temp_t to [16 x [16 x float]]*), i64 0, i64 %idxprom192
|
||||
%135 = load i32, i32* %tx, align 4
|
||||
%idxprom194 = sext i32 %135 to i64
|
||||
%arrayidx195 = getelementptr inbounds [16 x float], [16 x float]* %arrayidx193, i64 0, i64 %idxprom194
|
||||
%136 = load float, float* %arrayidx195, align 4
|
||||
%137 = load float*, float** %temp_dst.addr, align 8
|
||||
%138 = load i32, i32* %index, align 4
|
||||
%idxprom196 = sext i32 %138 to i64
|
||||
%arrayidx197 = getelementptr inbounds float, float* %137, i64 %idxprom196
|
||||
store float %136, float* %arrayidx197, align 4
|
||||
br label %if.end198
|
||||
|
||||
if.end198: ; preds = %if.then191, %for.end
|
||||
ret void
|
||||
}
|
||||
|
||||
; Function Attrs: alwaysinline convergent nounwind
; Accessor that returns the result of the llvm.nvvm.read.ptx.sreg.ctaid.x
; intrinsic unchanged.
|
||||
define linkonce_odr dso_local i32 @_ZN25__cuda_builtin_blockIdx_t17__fetch_builtin_xEv() #1 comdat align 2 {
|
||||
entry:
|
||||
%0 = call i32 @llvm.nvvm.read.ptx.sreg.ctaid.x()
|
||||
ret i32 %0
|
||||
}
|
||||
|
||||
; Function Attrs: alwaysinline convergent nounwind
; Accessor that returns the result of the llvm.nvvm.read.ptx.sreg.ctaid.y
; intrinsic unchanged.
|
||||
define linkonce_odr dso_local i32 @_ZN25__cuda_builtin_blockIdx_t17__fetch_builtin_yEv() #1 comdat align 2 {
|
||||
entry:
|
||||
%0 = call i32 @llvm.nvvm.read.ptx.sreg.ctaid.y()
|
||||
ret i32 %0
|
||||
}
|
||||
|
||||
; Function Attrs: alwaysinline convergent nounwind
; Accessor that returns the result of the llvm.nvvm.read.ptx.sreg.tid.x
; intrinsic unchanged.
|
||||
define linkonce_odr dso_local i32 @_ZN26__cuda_builtin_threadIdx_t17__fetch_builtin_xEv() #1 comdat align 2 {
|
||||
entry:
|
||||
%0 = call i32 @llvm.nvvm.read.ptx.sreg.tid.x()
|
||||
ret i32 %0
|
||||
}
|
||||
|
||||
; Function Attrs: alwaysinline convergent nounwind
; Accessor that returns the result of the llvm.nvvm.read.ptx.sreg.tid.y
; intrinsic unchanged.
|
||||
define linkonce_odr dso_local i32 @_ZN26__cuda_builtin_threadIdx_t17__fetch_builtin_yEv() #1 comdat align 2 {
|
||||
entry:
|
||||
%0 = call i32 @llvm.nvvm.read.ptx.sreg.tid.y()
|
||||
ret i32 %0
|
||||
}
|
||||
|
||||
; Function Attrs: convergent nounwind
|
||||
declare void @llvm.nvvm.barrier0() #2
|
||||
|
||||
; Function Attrs: nounwind readnone
|
||||
declare i32 @llvm.nvvm.read.ptx.sreg.ctaid.x() #3
|
||||
|
||||
; Function Attrs: nounwind readnone
|
||||
declare i32 @llvm.nvvm.read.ptx.sreg.ctaid.y() #3
|
||||
|
||||
; Function Attrs: nounwind readnone
|
||||
declare i32 @llvm.nvvm.read.ptx.sreg.tid.x() #3
|
||||
|
||||
; Function Attrs: nounwind readnone
|
||||
declare i32 @llvm.nvvm.read.ptx.sreg.tid.y() #3
|
||||
|
||||
attributes #0 = { convergent noinline nounwind optnone "correctly-rounded-divide-sqrt-fp-math"="false" "disable-tail-calls"="false" "frame-pointer"="all" "less-precise-fpmad"="false" "min-legal-vector-width"="0" "no-infs-fp-math"="false" "no-jump-tables"="false" "no-nans-fp-math"="false" "no-signed-zeros-fp-math"="false" "no-trapping-math"="false" "stack-protector-buffer-size"="8" "target-cpu"="sm_61" "target-features"="+ptx64,+sm_61" "unsafe-fp-math"="false" "use-soft-float"="false" }
|
||||
attributes #1 = { alwaysinline convergent nounwind "correctly-rounded-divide-sqrt-fp-math"="false" "disable-tail-calls"="false" "frame-pointer"="all" "less-precise-fpmad"="false" "min-legal-vector-width"="0" "no-infs-fp-math"="false" "no-jump-tables"="false" "no-nans-fp-math"="false" "no-signed-zeros-fp-math"="false" "no-trapping-math"="false" "stack-protector-buffer-size"="8" "target-cpu"="sm_61" "target-features"="+ptx64,+sm_61" "unsafe-fp-math"="false" "use-soft-float"="false" }
|
||||
attributes #2 = { convergent nounwind }
|
||||
attributes #3 = { nounwind readnone }
|
||||
|
||||
!llvm.module.flags = !{!0, !1, !2}
|
||||
!nvvm.annotations = !{!3, !4, !5, !4, !6, !6, !6, !6, !7, !7, !6}
|
||||
!llvm.ident = !{!8}
|
||||
!nvvmir.version = !{!9}
|
||||
|
||||
!0 = !{i32 2, !"SDK Version", [2 x i32] [i32 10, i32 1]}
|
||||
!1 = !{i32 1, !"wchar_size", i32 4}
|
||||
!2 = !{i32 4, !"nvvm-reflect-ftz", i32 0}
|
||||
!3 = !{void (i32, float*, float*, float*, i32, i32, i32, i32, float, float, float, float, float, float)* @_Z14calculate_tempiPfS_S_iiiiffffff, !"kernel", i32 1}
|
||||
!4 = !{null, !"align", i32 8}
|
||||
!5 = !{null, !"align", i32 8, !"align", i32 65544, !"align", i32 131080}
|
||||
!6 = !{null, !"align", i32 16}
|
||||
!7 = !{null, !"align", i32 16, !"align", i32 65552, !"align", i32 131088}
|
||||
!8 = !{!"clang version 10.0.1 (https://github.com/llvm/llvm-project.git ef32c611aa214dea855364efd7ba451ec5ec3f74)"}
|
||||
!9 = !{i32 1, i32 4}
|
File diff suppressed because one or more lines are too long
|
@ -1,353 +0,0 @@
|
|||
#include <assert.h>
|
||||
#include <stdio.h>
|
||||
#include <stdlib.h>
|
||||
#include <time.h>
|
||||
|
||||
#ifdef RD_WG_SIZE_0_0
|
||||
#define BLOCK_SIZE RD_WG_SIZE_0_0
|
||||
#elif defined(RD_WG_SIZE_0)
|
||||
#define BLOCK_SIZE RD_WG_SIZE_0
|
||||
#elif defined(RD_WG_SIZE)
|
||||
#define BLOCK_SIZE RD_WG_SIZE
|
||||
#else
|
||||
#define BLOCK_SIZE 16
|
||||
#endif
|
||||
|
||||
#define STR_SIZE 256
|
||||
|
||||
/* maximum power density possible (say 300W for a 10mm x 10mm chip) */
|
||||
#define MAX_PD (3.0e6)
|
||||
/* required precision in degrees */
|
||||
#define PRECISION 0.001
|
||||
#define SPEC_HEAT_SI 1.75e6
|
||||
#define K_SI 100
|
||||
/* capacitance fitting factor */
|
||||
#define FACTOR_CHIP 0.5
|
||||
|
||||
/* chip parameters */
|
||||
float t_chip = 0.0005;
|
||||
float chip_height = 0.016;
|
||||
float chip_width = 0.016;
|
||||
/* ambient temperature, assuming no package at all */
|
||||
float amb_temp = 80.0;
|
||||
|
||||
void run(int argc, char **argv);
|
||||
|
||||
/*
 * Timer macros. pin_stats_reset/pin_stats_pause forward to startCycle and
 * stopCycle, which are not defined in this file and must be supplied by
 * whoever uses these macros.
 */
#define pin_stats_reset() startCycle()
#define pin_stats_pause(cycles) stopCycle(cycles)
/*
 * Prints the cycle count. "%Lu" is not a standard conversion for integers
 * (the L modifier belongs to long double), so the value is converted to
 * unsigned long long and printed with "%llu".
 */
#define pin_stats_dump(cycles)                                                 \
  printf("timer: %llu\n", (unsigned long long)(cycles))
|
||||
|
||||
/*
 * Reports a problem on stderr as "error: <s>" followed by a newline.
 * Despite its name this function returns normally; a caller that cannot
 * continue has to stop on its own.
 */
void fatal(char *s) {
  static const char prefix[] = "error: ";

  fprintf(stderr, "%s%s\n", prefix, s);
}
|
||||
|
||||
/*
 * Writes a grid_rows x grid_cols grid, stored row by row in vect, to a text
 * file. Each cell becomes one line of the form "<index>\t<value>\n", where
 * index counts cells from 0 in row-major order and value is printed with %g.
 *
 * vect      - grid values, at least grid_rows * grid_cols floats
 * grid_rows - number of rows to write
 * grid_cols - number of columns to write
 * file      - path of the output file (created or truncated)
 *
 * If the file cannot be opened, "The file was not opened" is printed to
 * stdout and nothing is written. (Previously the NULL stream was still
 * passed to fputs/fclose.)
 */
void writeoutput(float *vect, int grid_rows, int grid_cols, char *file) {
  FILE *fp = fopen(file, "w");

  if (fp == NULL) {
    printf("The file was not opened\n");
    return;
  }

  for (int i = 0; i < grid_rows; i++) {
    for (int j = 0; j < grid_cols; j++) {
      /* The running cell number equals the row-major offset. */
      int index = i * grid_cols + j;

      /* Format straight into the stream; no fixed-size staging buffer. */
      fprintf(fp, "%d\t%g\n", index, vect[index]);
    }
  }

  fclose(fp);
}
|
||||
|
||||
void readinput(float *vect, int grid_rows, int grid_cols, char *file) {
|
||||
|
||||
int i, j;
|
||||
FILE *fp;
|
||||
char str[STR_SIZE];
|
||||
float val;
|
||||
|
||||
if ((fp = fopen(file, "r")) == 0)
|
||||
printf("The file was not opened\n");
|
||||
|
||||
for (i = 0; i <= grid_rows - 1; i++)
|
||||
for (j = 0; j <= grid_cols - 1; j++) {
|
||||
fgets(str, STR_SIZE, fp);
|
||||
if (feof(fp))
|
||||
fatal("not enough lines in file");
|
||||
// if ((sscanf(str, "%d%f", &index, &val) != 2) || (index !=
|
||||
// ((i-1)*(grid_cols-2)+j-1)))
|
||||
if ((sscanf(str, "%f", &val) != 1))
|
||||
fatal("invalid file format");
|
||||
vect[i * grid_cols + j] = val;
|
||||
}
|
||||
|
||||
fclose(fp);
|
||||
}
|
||||
|
||||
/* Non-zero when x lies in the closed interval [min, max]. */
#define IN_RANGE(x, min, max) ((x) >= (min) && (x) <= (max))
/*
 * Clamps the lvalue x into [min, max] by assigning to it, and yields the
 * clamped value. Every argument is parenthesized and the expansion is wrapped
 * so the macro can be used inside a larger expression. x is evaluated more
 * than once, so it must not have side effects.
 */
#define CLAMP_RANGE(x, min, max)                                               \
  ((x) = ((x) < (min)) ? (min) : (((x) > (max)) ? (max) : (x)))
/* Smaller of a and b; each argument may be evaluated twice. */
#define MIN(a, b) ((a) <= (b) ? (a) : (b))
|
||||
|
||||
/*
 * Advances the temperature grid by `iteration` time steps for one tile.
 *
 * Each thread block stages a BLOCK_SIZE x BLOCK_SIZE tile of temp_src and
 * power in shared memory, then applies the update formula `iteration` times.
 * On step i only threads whose tile coordinates lie in
 * [i + 1, BLOCK_SIZE - i - 2] (and inside the grid) compute, so the region
 * shrinks by one cell per side each step. Threads that computed on the last
 * step store their value to temp_dst.
 *
 * The tile origin is (small_block * blockIdx - border), so neighbouring
 * blocks overlap by the border width.
 *
 * time_elapsed is accepted but not used by this kernel.
 */
__global__ void calculate_temp(int iteration,   // number of iteration
                               float *power,    // power input
                               float *temp_src, // temperature input (read)
                               float *temp_dst, // temperature output (written)
                               int grid_cols,   // Col of grid
                               int grid_rows,   // Row of grid
                               int border_cols, // border offset
                               int border_rows, // border offset
                               float Cap,       // Capacitance
                               float Rx, float Ry, float Rz, float step,
                               float time_elapsed) {

  __shared__ float temp_on_cuda[BLOCK_SIZE][BLOCK_SIZE];
  __shared__ float power_on_cuda[BLOCK_SIZE][BLOCK_SIZE];
  __shared__ float temp_t[BLOCK_SIZE]
                         [BLOCK_SIZE]; // saving temporary temperature result

  // Kernel-local ambient temperature (same value as the file-scope amb_temp).
  float amb_temp = 80.0;
  float step_div_Cap;
  float Rx_1, Ry_1, Rz_1;

  int bx = blockIdx.x;
  int by = blockIdx.y;

  int tx = threadIdx.x;
  int ty = threadIdx.y;

  step_div_Cap = step / Cap;

  Rx_1 = 1 / Rx;
  Ry_1 = 1 / Ry;
  Rz_1 = 1 / Rz;

  // each block finally computes result for a small block
  // after N iterations.
  // it is the non-overlapping small blocks that cover
  // all the input data

  // calculate the small block size
  int small_block_rows = BLOCK_SIZE - iteration * 2; // EXPAND_RATE
  int small_block_cols = BLOCK_SIZE - iteration * 2; // EXPAND_RATE

  // calculate the boundary for the block according to
  // the boundary of its small block
  int blkY = small_block_rows * by - border_rows;
  int blkX = small_block_cols * bx - border_cols;
  int blkYmax = blkY + BLOCK_SIZE - 1;
  int blkXmax = blkX + BLOCK_SIZE - 1;

  // calculate the global thread coordination
  int yidx = blkY + ty;
  int xidx = blkX + tx;

  // load data if it is within the valid input range
  int loadYidx = yidx, loadXidx = xidx;
  int index = grid_cols * loadYidx + loadXidx;

  if (IN_RANGE(loadYidx, 0, grid_rows - 1) &&
      IN_RANGE(loadXidx, 0, grid_cols - 1)) {
    // Load the temperature data from global memory to shared memory
    temp_on_cuda[ty][tx] = temp_src[index];
    // Load the power data from global memory to shared memory
    power_on_cuda[ty][tx] = power[index];
  }
  __syncthreads();

  // effective range within this block that falls within
  // the valid range of the input data
  // used to rule out computation outside the boundary.
  int validYmin = (blkY < 0) ? -blkY : 0;
  int validYmax = (blkYmax > grid_rows - 1)
                      ? BLOCK_SIZE - 1 - (blkYmax - grid_rows + 1)
                      : BLOCK_SIZE - 1;
  int validXmin = (blkX < 0) ? -blkX : 0;
  int validXmax = (blkXmax > grid_cols - 1)
                      ? BLOCK_SIZE - 1 - (blkXmax - grid_cols + 1)
                      : BLOCK_SIZE - 1;

  // Neighbour coordinates inside the tile, clamped to the valid range so a
  // cell on the grid edge uses itself as the missing neighbour.
  int N = ty - 1;
  int S = ty + 1;
  int W = tx - 1;
  int E = tx + 1;

  N = (N < validYmin) ? validYmin : N;
  S = (S > validYmax) ? validYmax : S;
  W = (W < validXmin) ? validXmin : W;
  E = (E > validXmax) ? validXmax : E;

  // Initialized so the final store below is well defined even when the loop
  // body never runs (iteration <= 0).
  bool computed = false;
  for (int i = 0; i < iteration; i++) {
    computed = false;
    if (IN_RANGE(tx, i + 1, BLOCK_SIZE - i - 2) &&
        IN_RANGE(ty, i + 1, BLOCK_SIZE - i - 2) &&
        IN_RANGE(tx, validXmin, validXmax) &&
        IN_RANGE(ty, validYmin, validYmax)) {
      computed = true;
      // The 2.0 literals are double, so these sub-expressions are evaluated
      // in double precision; left unchanged to keep the results identical.
      temp_t[ty][tx] =
          temp_on_cuda[ty][tx] +
          step_div_Cap * (power_on_cuda[ty][tx] +
                          (temp_on_cuda[S][tx] + temp_on_cuda[N][tx] -
                           2.0 * temp_on_cuda[ty][tx]) *
                              Ry_1 +
                          (temp_on_cuda[ty][E] + temp_on_cuda[ty][W] -
                           2.0 * temp_on_cuda[ty][tx]) *
                              Rx_1 +
                          (amb_temp - temp_on_cuda[ty][tx]) * Rz_1);
    }
    __syncthreads();
    if (i == iteration - 1)
      break;
    if (computed) // Assign the computation range
      temp_on_cuda[ty][tx] = temp_t[ty][tx];
    __syncthreads();
  }

  // update the global memory
  // after the last iteration, only threads coordinated within the
  // small block perform the calculation and switch on ``computed''
  if (computed) {
    temp_dst[index] = temp_t[ty][tx];
  }
}
|
||||
|
||||
/*
 * Runs the transient simulation for total_iterations time steps by launching
 * calculate_temp repeatedly, num_iterations steps per launch (fewer on the
 * last launch if total_iterations is not a multiple).
 *
 * MatrixPower       - device buffer with the power values
 * MatrixTemp        - two device temperature buffers used alternately as
 *                     source and destination; the first launch reads
 *                     MatrixTemp[0]
 * col, row          - grid dimensions
 * total_iterations  - number of time steps to simulate
 * num_iterations    - time steps per kernel launch; must be positive or the
 *                     loop does not advance
 * blockCols/Rows    - launch grid dimensions
 * borderCols/Rows   - border offsets forwarded to the kernel
 *
 * Returns the index (0 or 1) of the MatrixTemp buffer written by the last
 * launch.
 *
 * The step counter is an int: a float counter cannot represent every integer
 * beyond 2^24 and would stop advancing for very long runs.
 */
int compute_tran_temp(float *MatrixPower, float *MatrixTemp[2], int col,
                      int row, int total_iterations, int num_iterations,
                      int blockCols, int blockRows, int borderCols,
                      int borderRows) {
  dim3 dimBlock(BLOCK_SIZE, BLOCK_SIZE);
  dim3 dimGrid(blockCols, blockRows);

  float grid_height = chip_height / row;
  float grid_width = chip_width / col;

  float Cap = FACTOR_CHIP * SPEC_HEAT_SI * t_chip * grid_width * grid_height;
  float Rx = grid_width / (2.0 * K_SI * t_chip * grid_height);
  float Ry = grid_height / (2.0 * K_SI * t_chip * grid_width);
  float Rz = t_chip / (K_SI * grid_height * grid_width);

  float max_slope = MAX_PD / (FACTOR_CHIP * t_chip * SPEC_HEAT_SI);
  float step = PRECISION / max_slope;
  float time_elapsed = 0.001;

  int src = 1, dst = 0;

  for (int t = 0; t < total_iterations; t += num_iterations) {
    // Swap roles: the buffer written by the previous launch is read now.
    int temp = src;
    src = dst;
    dst = temp;

    calculate_temp<<<dimGrid, dimBlock>>>(
        MIN(num_iterations, total_iterations - t), MatrixPower, MatrixTemp[src],
        MatrixTemp[dst], col, row, borderCols, borderRows, Cap, Rx, Ry, Rz,
        step, time_elapsed);

    // Report a failed synchronization instead of silently continuing.
    cudaError_t status = cudaDeviceSynchronize();
    if (status != cudaSuccess) {
      fprintf(stderr, "error: cudaDeviceSynchronize failed (code %d)\n",
              (int)status);
      break;
    }
  }
  return dst;
}
|
||||
|
||||
/*
 * Prints the command-line synopsis and a description of every argument to
 * stderr, then terminates the process with exit status 1.
 *
 * argc - unused
 * argv - argv[0] is printed as the program name
 */
void usage(int argc, char **argv) {
  static const char *const argument_help[] = {
      "\t<grid_rows/grid_cols> - number of rows/cols in the grid "
      "(positive integer)\n",
      "\t<pyramid_height> - pyramid heigh(positive integer)\n",
      "\t<sim_time> - number of iterations\n",
      "\t<temp_file> - name of the file containing the initial "
      "temperature values of each cell\n",
      "\t<power_file> - name of the file containing the dissipated "
      "power values of each cell\n",
      "\t<output_file> - name of the output file\n",
  };
  const size_t help_count = sizeof argument_help / sizeof argument_help[0];

  fprintf(stderr,
          "Usage: %s <grid_rows/grid_cols> <pyramid_height> <sim_time> "
          "<temp_file> <power_file> <output_file>\n",
          argv[0]);

  /* None of the help lines contains a conversion, so fputs prints them
   * exactly as the format-less fprintf calls would. */
  for (size_t k = 0; k < help_count; k++)
    fputs(argument_help[k], stderr);

  exit(1);
}
|
||||
|
||||
/*
 * Program entry point: selects CUDA device 0, prints the configured work
 * group size and hands the command line to run(). Always returns
 * EXIT_SUCCESS when run() returns.
 */
int main(int argc, char **argv) {
  const int device_id = 0;

  cudaSetDevice(device_id);
  printf("WG size of kernel = %d X %d\n", BLOCK_SIZE, BLOCK_SIZE);

  run(argc, argv);

  return EXIT_SUCCESS;
}
|
||||
|
||||
/*
 * Parses the command line, loads the temperature and power grids, runs the
 * simulation on the device and writes the resulting temperatures.
 *
 * Expected arguments (argc must be 7):
 *   argv[1] grid size, used for both rows and columns
 *   argv[2] pyramid height (time steps per kernel launch)
 *   argv[3] total number of time steps
 *   argv[4] temperature input file
 *   argv[5] power input file
 *   argv[6] output file
 * Invalid arguments end in usage(), which exits.
 *
 * Changes relative to the previous version:
 *  - a pyramid height that leaves no interior cells per block
 *    (BLOCK_SIZE - 2 * pyramid_height <= 0) is rejected instead of causing a
 *    division by zero when the block counts are computed;
 *  - a failed host allocation now stops the program instead of continuing
 *    with NULL buffers;
 *  - the two host input buffers are released before returning.
 */
void run(int argc, char **argv) {
  int size;
  int grid_rows, grid_cols;
  float *FilesavingTemp, *FilesavingPower, *MatrixOut;
  char *tfile, *pfile, *ofile;

  int total_iterations = 60;
  int pyramid_height = 1; // number of iterations

  if (argc != 7)
    usage(argc, argv);
  if ((grid_rows = atoi(argv[1])) <= 0 || (grid_cols = atoi(argv[1])) <= 0 ||
      (pyramid_height = atoi(argv[2])) <= 0 ||
      (total_iterations = atoi(argv[3])) <= 0)
    usage(argc, argv);

  tfile = argv[4];
  pfile = argv[5];
  ofile = argv[6];

  size = grid_rows * grid_cols;

  /* --------------- pyramid parameters --------------- */
  // add one iteration will extend the pyramid base by 2 per each borderline
#define EXPAND_RATE 2
  int borderCols = (pyramid_height)*EXPAND_RATE / 2;
  int borderRows = (pyramid_height)*EXPAND_RATE / 2;
  int smallBlockCol = BLOCK_SIZE - (pyramid_height)*EXPAND_RATE;
  int smallBlockRow = BLOCK_SIZE - (pyramid_height)*EXPAND_RATE;

  // The block counts below divide by the small block size.
  if (smallBlockCol <= 0 || smallBlockRow <= 0) {
    fatal("pyramid_height is too large for BLOCK_SIZE");
    exit(EXIT_FAILURE);
  }

  int blockCols =
      grid_cols / smallBlockCol + ((grid_cols % smallBlockCol == 0) ? 0 : 1);
  int blockRows =
      grid_rows / smallBlockRow + ((grid_rows % smallBlockRow == 0) ? 0 : 1);

  FilesavingTemp = (float *)malloc(size * sizeof(float));
  FilesavingPower = (float *)malloc(size * sizeof(float));
  MatrixOut = (float *)calloc(size, sizeof(float));

  if (!FilesavingPower || !FilesavingTemp || !MatrixOut) {
    fatal("unable to allocate memory");
    free(FilesavingTemp);
    free(FilesavingPower);
    free(MatrixOut);
    exit(EXIT_FAILURE);
  }

  printf("pyramidHeight: %d\ngridSize: [%d, %d]\nborder:[%d, "
         "%d]\nblockGrid:[%d, %d]\ntargetBlock:[%d, %d]\n",
         pyramid_height, grid_cols, grid_rows, borderCols, borderRows,
         blockCols, blockRows, smallBlockCol, smallBlockRow);

  readinput(FilesavingTemp, grid_rows, grid_cols, tfile);
  readinput(FilesavingPower, grid_rows, grid_cols, pfile);

  // Device buffers: two temperature grids (used alternately) and the power
  // grid. NOTE(review): the cudaMalloc/cudaMemcpy results are not checked.
  float *MatrixTemp[2], *MatrixPower;
  cudaMalloc((void **)&MatrixTemp[0], sizeof(float) * size);
  cudaMalloc((void **)&MatrixTemp[1], sizeof(float) * size);
  cudaMemcpy(MatrixTemp[0], FilesavingTemp, sizeof(float) * size,
             cudaMemcpyHostToDevice);

  cudaMalloc((void **)&MatrixPower, sizeof(float) * size);
  cudaMemcpy(MatrixPower, FilesavingPower, sizeof(float) * size,
             cudaMemcpyHostToDevice);
  printf("Start computing the transient temperature\n");
  int ret = compute_tran_temp(MatrixPower, MatrixTemp, grid_cols, grid_rows,
                              total_iterations, pyramid_height, blockCols,
                              blockRows, borderCols, borderRows);
  printf("Ending simulation\n");
  // ret selects the buffer that holds the final temperatures.
  cudaMemcpy(MatrixOut, MatrixTemp[ret], sizeof(float) * size,
             cudaMemcpyDeviceToHost);

  writeoutput(MatrixOut, grid_rows, grid_cols, ofile);

  cudaFree(MatrixPower);
  cudaFree(MatrixTemp[0]);
  cudaFree(MatrixTemp[1]);
  free(FilesavingTemp);
  free(FilesavingPower);
  free(MatrixOut);
}
|
|
@ -1,21 +0,0 @@
|
|||
#!/bin/bash
# Builds the hotspot example from its LLVM IR files, runs it on the 512x512
# Rodinia data set and checks the output for a known value.
# Any failing command aborts the script.
set -e

# Assemble the textual IR and pass it through the project's translators.
llvm-as hotspot-cuda-nvptx64-nvidia-cuda-sm_61.ll
llvm-as hotspot-host-x86_64-unknown-linux-gnu.ll
../../build/compilation/kernelTranslator hotspot-cuda-nvptx64-nvidia-cuda-sm_61.bc kernel.bc
../../build/compilation/hostTranslator hotspot-host-x86_64-unknown-linux-gnu.bc host.bc

# Compile both translated modules to position-independent object files.
llc --relocation-model=pic --filetype=obj kernel.bc
llc --relocation-model=pic --filetype=obj host.bc

# Link against the project runtime libraries.
g++ -Wall -L../../build/runtime -L../../build/runtime/threadPool \
  -o hotspot -fPIC -no-pie host.o kernel.o -lc -lx86Runtime -lthreadPool -lpthread

# Append the previous LD_LIBRARY_PATH only when it is non-empty; an
# unconditional ":$LD_LIBRARY_PATH" leaves a trailing ':' that adds the
# current directory to the library search path.
export LD_LIBRARY_PATH="../../build/runtime:../../build/runtime/threadPool${LD_LIBRARY_PATH:+:$LD_LIBRARY_PATH}"

./hotspot 512 2 2 ../../rodinia-data/hotspot/temp_512 ../../rodinia-data/hotspot/power_512 output.out

# -F matches the reference value literally ('.' is not a wildcard).
if head output.out | grep -qF "323.829"; then
  echo "Pass"
else
  echo "Error result"
  exit 1
fi
|
|
@ -1,587 +0,0 @@
|
|||
; ModuleID = '3D-cuda-nvptx64-nvidia-cuda-sm_61.bc'
|
||||
source_filename = "3D.cu"
|
||||
target datalayout = "e-i64:64-i128:128-v16:16-v32:32-n16:32:64"
|
||||
target triple = "nvptx64-nvidia-cuda"
|
||||
|
||||
%struct.__cuda_builtin_blockDim_t = type { i8 }
|
||||
%struct.__cuda_builtin_blockIdx_t = type { i8 }
|
||||
%struct.__cuda_builtin_threadIdx_t = type { i8 }
|
||||
%struct.cudaFuncAttributes = type { i64, i64, i64, i32, i32, i32, i32, i32, i32, i32 }
|
||||
|
||||
$_ZN25__cuda_builtin_blockDim_t17__fetch_builtin_xEv = comdat any
|
||||
|
||||
$_ZN25__cuda_builtin_blockIdx_t17__fetch_builtin_xEv = comdat any
|
||||
|
||||
$_ZN26__cuda_builtin_threadIdx_t17__fetch_builtin_xEv = comdat any
|
||||
|
||||
$_ZN25__cuda_builtin_blockDim_t17__fetch_builtin_yEv = comdat any
|
||||
|
||||
$_ZN25__cuda_builtin_blockIdx_t17__fetch_builtin_yEv = comdat any
|
||||
|
||||
$_ZN26__cuda_builtin_threadIdx_t17__fetch_builtin_yEv = comdat any
|
||||
|
||||
@blockDim = extern_weak dso_local addrspace(1) global %struct.__cuda_builtin_blockDim_t, align 1
|
||||
@blockIdx = extern_weak dso_local addrspace(1) global %struct.__cuda_builtin_blockIdx_t, align 1
|
||||
@threadIdx = extern_weak dso_local addrspace(1) global %struct.__cuda_builtin_threadIdx_t, align 1
|
||||
|
||||
; Function Attrs: convergent noinline nounwind optnone
|
||||
define weak dso_local i32 @cudaMalloc(i8** %p, i64 %s) #0 {
|
||||
entry:
|
||||
%p.addr = alloca i8**, align 8
|
||||
%s.addr = alloca i64, align 8
|
||||
store i8** %p, i8*** %p.addr, align 8
|
||||
store i64 %s, i64* %s.addr, align 8
|
||||
ret i32 999
|
||||
}
|
||||
|
||||
; Function Attrs: convergent noinline nounwind optnone
|
||||
define weak dso_local i32 @cudaFuncGetAttributes(%struct.cudaFuncAttributes* %p, i8* %c) #0 {
|
||||
entry:
|
||||
%p.addr = alloca %struct.cudaFuncAttributes*, align 8
|
||||
%c.addr = alloca i8*, align 8
|
||||
store %struct.cudaFuncAttributes* %p, %struct.cudaFuncAttributes** %p.addr, align 8
|
||||
store i8* %c, i8** %c.addr, align 8
|
||||
ret i32 999
|
||||
}
|
||||
|
||||
; Function Attrs: convergent noinline nounwind optnone
|
||||
define weak dso_local i32 @cudaDeviceGetAttribute(i32* %value, i32 %attr, i32 %device) #0 {
|
||||
entry:
|
||||
%value.addr = alloca i32*, align 8
|
||||
%attr.addr = alloca i32, align 4
|
||||
%device.addr = alloca i32, align 4
|
||||
store i32* %value, i32** %value.addr, align 8
|
||||
store i32 %attr, i32* %attr.addr, align 4
|
||||
store i32 %device, i32* %device.addr, align 4
|
||||
ret i32 999
|
||||
}
|
||||
|
||||
; Function Attrs: convergent noinline nounwind optnone
|
||||
define weak dso_local i32 @cudaGetDevice(i32* %device) #0 {
|
||||
entry:
|
||||
%device.addr = alloca i32*, align 8
|
||||
store i32* %device, i32** %device.addr, align 8
|
||||
ret i32 999
|
||||
}
|
||||
|
||||
; Function Attrs: convergent noinline nounwind optnone
|
||||
define weak dso_local i32 @cudaOccupancyMaxActiveBlocksPerMultiprocessor(i32* %numBlocks, i8* %func, i32 %blockSize, i64 %dynamicSmemSize) #0 {
|
||||
entry:
|
||||
%numBlocks.addr = alloca i32*, align 8
|
||||
%func.addr = alloca i8*, align 8
|
||||
%blockSize.addr = alloca i32, align 4
|
||||
%dynamicSmemSize.addr = alloca i64, align 8
|
||||
store i32* %numBlocks, i32** %numBlocks.addr, align 8
|
||||
store i8* %func, i8** %func.addr, align 8
|
||||
store i32 %blockSize, i32* %blockSize.addr, align 4
|
||||
store i64 %dynamicSmemSize, i64* %dynamicSmemSize.addr, align 8
|
||||
ret i32 999
|
||||
}
|
||||
|
||||
; Function Attrs: convergent noinline nounwind optnone
|
||||
define weak dso_local i32 @cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags(i32* %numBlocks, i8* %func, i32 %blockSize, i64 %dynamicSmemSize, i32 %flags) #0 {
|
||||
entry:
|
||||
%numBlocks.addr = alloca i32*, align 8
|
||||
%func.addr = alloca i8*, align 8
|
||||
%blockSize.addr = alloca i32, align 4
|
||||
%dynamicSmemSize.addr = alloca i64, align 8
|
||||
%flags.addr = alloca i32, align 4
|
||||
store i32* %numBlocks, i32** %numBlocks.addr, align 8
|
||||
store i8* %func, i8** %func.addr, align 8
|
||||
store i32 %blockSize, i32* %blockSize.addr, align 4
|
||||
store i64 %dynamicSmemSize, i64* %dynamicSmemSize.addr, align 8
|
||||
store i32 %flags, i32* %flags.addr, align 4
|
||||
ret i32 999
|
||||
}
|
||||
|
||||
; Function Attrs: convergent noinline nounwind optnone
|
||||
define dso_local void @_Z11hotspotOpt1PfS_S_fiiifffffff(float* %p, float* %tIn, float* %tOut, float %sdc, i32 %nx, i32 %ny, i32 %nz, float %ce, float %cw, float %cn, float %cs, float %ct, float %cb, float %cc) #0 {
|
||||
entry:
|
||||
%p.addr = alloca float*, align 8
|
||||
%tIn.addr = alloca float*, align 8
|
||||
%tOut.addr = alloca float*, align 8
|
||||
%sdc.addr = alloca float, align 4
|
||||
%nx.addr = alloca i32, align 4
|
||||
%ny.addr = alloca i32, align 4
|
||||
%nz.addr = alloca i32, align 4
|
||||
%ce.addr = alloca float, align 4
|
||||
%cw.addr = alloca float, align 4
|
||||
%cn.addr = alloca float, align 4
|
||||
%cs.addr = alloca float, align 4
|
||||
%ct.addr = alloca float, align 4
|
||||
%cb.addr = alloca float, align 4
|
||||
%cc.addr = alloca float, align 4
|
||||
%amb_temp = alloca float, align 4
|
||||
%i = alloca i32, align 4
|
||||
%j = alloca i32, align 4
|
||||
%c = alloca i32, align 4
|
||||
%xy = alloca i32, align 4
|
||||
%W = alloca i32, align 4
|
||||
%E = alloca i32, align 4
|
||||
%N = alloca i32, align 4
|
||||
%S = alloca i32, align 4
|
||||
%temp1 = alloca float, align 4
|
||||
%temp2 = alloca float, align 4
|
||||
%temp3 = alloca float, align 4
|
||||
%k = alloca i32, align 4
|
||||
store float* %p, float** %p.addr, align 8
|
||||
store float* %tIn, float** %tIn.addr, align 8
|
||||
store float* %tOut, float** %tOut.addr, align 8
|
||||
store float %sdc, float* %sdc.addr, align 4
|
||||
store i32 %nx, i32* %nx.addr, align 4
|
||||
store i32 %ny, i32* %ny.addr, align 4
|
||||
store i32 %nz, i32* %nz.addr, align 4
|
||||
store float %ce, float* %ce.addr, align 4
|
||||
store float %cw, float* %cw.addr, align 4
|
||||
store float %cn, float* %cn.addr, align 4
|
||||
store float %cs, float* %cs.addr, align 4
|
||||
store float %ct, float* %ct.addr, align 4
|
||||
store float %cb, float* %cb.addr, align 4
|
||||
store float %cc, float* %cc.addr, align 4
|
||||
store float 8.000000e+01, float* %amb_temp, align 4
|
||||
%call = call i32 @_ZN25__cuda_builtin_blockDim_t17__fetch_builtin_xEv() #3
|
||||
%call1 = call i32 @_ZN25__cuda_builtin_blockIdx_t17__fetch_builtin_xEv() #3
|
||||
%mul = mul i32 %call, %call1
|
||||
%call2 = call i32 @_ZN26__cuda_builtin_threadIdx_t17__fetch_builtin_xEv() #3
|
||||
%add = add i32 %mul, %call2
|
||||
store i32 %add, i32* %i, align 4
|
||||
%call3 = call i32 @_ZN25__cuda_builtin_blockDim_t17__fetch_builtin_yEv() #3
|
||||
%call4 = call i32 @_ZN25__cuda_builtin_blockIdx_t17__fetch_builtin_yEv() #3
|
||||
%mul5 = mul i32 %call3, %call4
|
||||
%call6 = call i32 @_ZN26__cuda_builtin_threadIdx_t17__fetch_builtin_yEv() #3
|
||||
%add7 = add i32 %mul5, %call6
|
||||
store i32 %add7, i32* %j, align 4
|
||||
%0 = load i32, i32* %i, align 4
|
||||
%1 = load i32, i32* %j, align 4
|
||||
%2 = load i32, i32* %nx.addr, align 4
|
||||
%mul8 = mul nsw i32 %1, %2
|
||||
%add9 = add nsw i32 %0, %mul8
|
||||
store i32 %add9, i32* %c, align 4
|
||||
%3 = load i32, i32* %nx.addr, align 4
|
||||
%4 = load i32, i32* %ny.addr, align 4
|
||||
%mul10 = mul nsw i32 %3, %4
|
||||
store i32 %mul10, i32* %xy, align 4
|
||||
%5 = load i32, i32* %i, align 4
|
||||
%cmp = icmp eq i32 %5, 0
|
||||
br i1 %cmp, label %cond.true, label %cond.false
|
||||
|
||||
cond.true: ; preds = %entry
|
||||
%6 = load i32, i32* %c, align 4
|
||||
br label %cond.end
|
||||
|
||||
cond.false: ; preds = %entry
|
||||
%7 = load i32, i32* %c, align 4
|
||||
%sub = sub nsw i32 %7, 1
|
||||
br label %cond.end
|
||||
|
||||
cond.end: ; preds = %cond.false, %cond.true
|
||||
%cond = phi i32 [ %6, %cond.true ], [ %sub, %cond.false ]
|
||||
store i32 %cond, i32* %W, align 4
|
||||
%8 = load i32, i32* %i, align 4
|
||||
%9 = load i32, i32* %nx.addr, align 4
|
||||
%sub11 = sub nsw i32 %9, 1
|
||||
%cmp12 = icmp eq i32 %8, %sub11
|
||||
br i1 %cmp12, label %cond.true13, label %cond.false14
|
||||
|
||||
cond.true13: ; preds = %cond.end
|
||||
%10 = load i32, i32* %c, align 4
|
||||
br label %cond.end16
|
||||
|
||||
cond.false14: ; preds = %cond.end
|
||||
%11 = load i32, i32* %c, align 4
|
||||
%add15 = add nsw i32 %11, 1
|
||||
br label %cond.end16
|
||||
|
||||
cond.end16: ; preds = %cond.false14, %cond.true13
|
||||
%cond17 = phi i32 [ %10, %cond.true13 ], [ %add15, %cond.false14 ]
|
||||
store i32 %cond17, i32* %E, align 4
|
||||
%12 = load i32, i32* %j, align 4
|
||||
%cmp18 = icmp eq i32 %12, 0
|
||||
br i1 %cmp18, label %cond.true19, label %cond.false20
|
||||
|
||||
cond.true19: ; preds = %cond.end16
|
||||
%13 = load i32, i32* %c, align 4
|
||||
br label %cond.end22
|
||||
|
||||
cond.false20: ; preds = %cond.end16
|
||||
%14 = load i32, i32* %c, align 4
|
||||
%15 = load i32, i32* %nx.addr, align 4
|
||||
%sub21 = sub nsw i32 %14, %15
|
||||
br label %cond.end22
|
||||
|
||||
cond.end22: ; preds = %cond.false20, %cond.true19
|
||||
%cond23 = phi i32 [ %13, %cond.true19 ], [ %sub21, %cond.false20 ]
|
||||
store i32 %cond23, i32* %N, align 4
|
||||
%16 = load i32, i32* %j, align 4
|
||||
%17 = load i32, i32* %ny.addr, align 4
|
||||
%sub24 = sub nsw i32 %17, 1
|
||||
%cmp25 = icmp eq i32 %16, %sub24
|
||||
br i1 %cmp25, label %cond.true26, label %cond.false27
|
||||
|
||||
cond.true26: ; preds = %cond.end22
|
||||
%18 = load i32, i32* %c, align 4
|
||||
br label %cond.end29
|
||||
|
||||
cond.false27: ; preds = %cond.end22
|
||||
%19 = load i32, i32* %c, align 4
|
||||
%20 = load i32, i32* %nx.addr, align 4
|
||||
%add28 = add nsw i32 %19, %20
|
||||
br label %cond.end29
|
||||
|
||||
cond.end29: ; preds = %cond.false27, %cond.true26
|
||||
%cond30 = phi i32 [ %18, %cond.true26 ], [ %add28, %cond.false27 ]
|
||||
store i32 %cond30, i32* %S, align 4
|
||||
%21 = load float*, float** %tIn.addr, align 8
|
||||
%22 = load i32, i32* %c, align 4
|
||||
%idxprom = sext i32 %22 to i64
|
||||
%arrayidx = getelementptr inbounds float, float* %21, i64 %idxprom
|
||||
%23 = load float, float* %arrayidx, align 4
|
||||
store float %23, float* %temp2, align 4
|
||||
store float %23, float* %temp1, align 4
|
||||
%24 = load float*, float** %tIn.addr, align 8
|
||||
%25 = load i32, i32* %c, align 4
|
||||
%26 = load i32, i32* %xy, align 4
|
||||
%add31 = add nsw i32 %25, %26
|
||||
%idxprom32 = sext i32 %add31 to i64
|
||||
%arrayidx33 = getelementptr inbounds float, float* %24, i64 %idxprom32
|
||||
%27 = load float, float* %arrayidx33, align 4
|
||||
store float %27, float* %temp3, align 4
|
||||
%28 = load float, float* %cc.addr, align 4
|
||||
%29 = load float, float* %temp2, align 4
|
||||
%mul34 = fmul contract float %28, %29
|
||||
%30 = load float, float* %cw.addr, align 4
|
||||
%31 = load float*, float** %tIn.addr, align 8
|
||||
%32 = load i32, i32* %W, align 4
|
||||
%idxprom35 = sext i32 %32 to i64
|
||||
%arrayidx36 = getelementptr inbounds float, float* %31, i64 %idxprom35
|
||||
%33 = load float, float* %arrayidx36, align 4
|
||||
%mul37 = fmul contract float %30, %33
|
||||
%add38 = fadd contract float %mul34, %mul37
|
||||
%34 = load float, float* %ce.addr, align 4
|
||||
%35 = load float*, float** %tIn.addr, align 8
|
||||
%36 = load i32, i32* %E, align 4
|
||||
%idxprom39 = sext i32 %36 to i64
|
||||
%arrayidx40 = getelementptr inbounds float, float* %35, i64 %idxprom39
|
||||
%37 = load float, float* %arrayidx40, align 4
|
||||
%mul41 = fmul contract float %34, %37
|
||||
%add42 = fadd contract float %add38, %mul41
|
||||
%38 = load float, float* %cs.addr, align 4
|
||||
%39 = load float*, float** %tIn.addr, align 8
|
||||
%40 = load i32, i32* %S, align 4
|
||||
%idxprom43 = sext i32 %40 to i64
|
||||
%arrayidx44 = getelementptr inbounds float, float* %39, i64 %idxprom43
|
||||
%41 = load float, float* %arrayidx44, align 4
|
||||
%mul45 = fmul contract float %38, %41
|
||||
%add46 = fadd contract float %add42, %mul45
|
||||
%42 = load float, float* %cn.addr, align 4
|
||||
%43 = load float*, float** %tIn.addr, align 8
|
||||
%44 = load i32, i32* %N, align 4
|
||||
%idxprom47 = sext i32 %44 to i64
|
||||
%arrayidx48 = getelementptr inbounds float, float* %43, i64 %idxprom47
|
||||
%45 = load float, float* %arrayidx48, align 4
|
||||
%mul49 = fmul contract float %42, %45
|
||||
%add50 = fadd contract float %add46, %mul49
|
||||
%46 = load float, float* %cb.addr, align 4
|
||||
%47 = load float, float* %temp1, align 4
|
||||
%mul51 = fmul contract float %46, %47
|
||||
%add52 = fadd contract float %add50, %mul51
|
||||
%48 = load float, float* %ct.addr, align 4
|
||||
%49 = load float, float* %temp3, align 4
|
||||
%mul53 = fmul contract float %48, %49
|
||||
%add54 = fadd contract float %add52, %mul53
|
||||
%50 = load float, float* %sdc.addr, align 4
|
||||
%51 = load float*, float** %p.addr, align 8
|
||||
%52 = load i32, i32* %c, align 4
|
||||
%idxprom55 = sext i32 %52 to i64
|
||||
%arrayidx56 = getelementptr inbounds float, float* %51, i64 %idxprom55
|
||||
%53 = load float, float* %arrayidx56, align 4
|
||||
%mul57 = fmul contract float %50, %53
|
||||
%add58 = fadd contract float %add54, %mul57
|
||||
%54 = load float, float* %ct.addr, align 4
|
||||
%55 = load float, float* %amb_temp, align 4
|
||||
%mul59 = fmul contract float %54, %55
|
||||
%add60 = fadd contract float %add58, %mul59
|
||||
%56 = load float*, float** %tOut.addr, align 8
|
||||
%57 = load i32, i32* %c, align 4
|
||||
%idxprom61 = sext i32 %57 to i64
|
||||
%arrayidx62 = getelementptr inbounds float, float* %56, i64 %idxprom61
|
||||
store float %add60, float* %arrayidx62, align 4
|
||||
%58 = load i32, i32* %xy, align 4
|
||||
%59 = load i32, i32* %c, align 4
|
||||
%add63 = add nsw i32 %59, %58
|
||||
store i32 %add63, i32* %c, align 4
|
||||
%60 = load i32, i32* %xy, align 4
|
||||
%61 = load i32, i32* %W, align 4
|
||||
%add64 = add nsw i32 %61, %60
|
||||
store i32 %add64, i32* %W, align 4
|
||||
%62 = load i32, i32* %xy, align 4
|
||||
%63 = load i32, i32* %E, align 4
|
||||
%add65 = add nsw i32 %63, %62
|
||||
store i32 %add65, i32* %E, align 4
|
||||
%64 = load i32, i32* %xy, align 4
|
||||
%65 = load i32, i32* %N, align 4
|
||||
%add66 = add nsw i32 %65, %64
|
||||
store i32 %add66, i32* %N, align 4
|
||||
%66 = load i32, i32* %xy, align 4
|
||||
%67 = load i32, i32* %S, align 4
|
||||
%add67 = add nsw i32 %67, %66
|
||||
store i32 %add67, i32* %S, align 4
|
||||
store i32 1, i32* %k, align 4
|
||||
br label %for.cond
|
||||
|
||||
for.cond: ; preds = %for.inc, %cond.end29
|
||||
%68 = load i32, i32* %k, align 4
|
||||
%69 = load i32, i32* %nz.addr, align 4
|
||||
%sub68 = sub nsw i32 %69, 1
|
||||
%cmp69 = icmp slt i32 %68, %sub68
|
||||
br i1 %cmp69, label %for.body, label %for.end
|
||||
|
||||
for.body: ; preds = %for.cond
|
||||
%70 = load float, float* %temp2, align 4
|
||||
store float %70, float* %temp1, align 4
|
||||
%71 = load float, float* %temp3, align 4
|
||||
store float %71, float* %temp2, align 4
|
||||
%72 = load float*, float** %tIn.addr, align 8
|
||||
%73 = load i32, i32* %c, align 4
|
||||
%74 = load i32, i32* %xy, align 4
|
||||
%add70 = add nsw i32 %73, %74
|
||||
%idxprom71 = sext i32 %add70 to i64
|
||||
%arrayidx72 = getelementptr inbounds float, float* %72, i64 %idxprom71
|
||||
%75 = load float, float* %arrayidx72, align 4
|
||||
store float %75, float* %temp3, align 4
|
||||
%76 = load float, float* %cc.addr, align 4
|
||||
%77 = load float, float* %temp2, align 4
|
||||
%mul73 = fmul contract float %76, %77
|
||||
%78 = load float, float* %cw.addr, align 4
|
||||
%79 = load float*, float** %tIn.addr, align 8
|
||||
%80 = load i32, i32* %W, align 4
|
||||
%idxprom74 = sext i32 %80 to i64
|
||||
%arrayidx75 = getelementptr inbounds float, float* %79, i64 %idxprom74
|
||||
%81 = load float, float* %arrayidx75, align 4
|
||||
%mul76 = fmul contract float %78, %81
|
||||
%add77 = fadd contract float %mul73, %mul76
|
||||
%82 = load float, float* %ce.addr, align 4
|
||||
%83 = load float*, float** %tIn.addr, align 8
|
||||
%84 = load i32, i32* %E, align 4
|
||||
%idxprom78 = sext i32 %84 to i64
|
||||
%arrayidx79 = getelementptr inbounds float, float* %83, i64 %idxprom78
|
||||
%85 = load float, float* %arrayidx79, align 4
|
||||
%mul80 = fmul contract float %82, %85
|
||||
%add81 = fadd contract float %add77, %mul80
|
||||
%86 = load float, float* %cs.addr, align 4
|
||||
%87 = load float*, float** %tIn.addr, align 8
|
||||
%88 = load i32, i32* %S, align 4
|
||||
%idxprom82 = sext i32 %88 to i64
|
||||
%arrayidx83 = getelementptr inbounds float, float* %87, i64 %idxprom82
|
||||
%89 = load float, float* %arrayidx83, align 4
|
||||
%mul84 = fmul contract float %86, %89
|
||||
%add85 = fadd contract float %add81, %mul84
|
||||
%90 = load float, float* %cn.addr, align 4
|
||||
%91 = load float*, float** %tIn.addr, align 8
|
||||
%92 = load i32, i32* %N, align 4
|
||||
%idxprom86 = sext i32 %92 to i64
|
||||
%arrayidx87 = getelementptr inbounds float, float* %91, i64 %idxprom86
|
||||
%93 = load float, float* %arrayidx87, align 4
|
||||
%mul88 = fmul contract float %90, %93
|
||||
%add89 = fadd contract float %add85, %mul88
|
||||
%94 = load float, float* %cb.addr, align 4
|
||||
%95 = load float, float* %temp1, align 4
|
||||
%mul90 = fmul contract float %94, %95
|
||||
%add91 = fadd contract float %add89, %mul90
|
||||
%96 = load float, float* %ct.addr, align 4
|
||||
%97 = load float, float* %temp3, align 4
|
||||
%mul92 = fmul contract float %96, %97
|
||||
%add93 = fadd contract float %add91, %mul92
|
||||
%98 = load float, float* %sdc.addr, align 4
|
||||
%99 = load float*, float** %p.addr, align 8
|
||||
%100 = load i32, i32* %c, align 4
|
||||
%idxprom94 = sext i32 %100 to i64
|
||||
%arrayidx95 = getelementptr inbounds float, float* %99, i64 %idxprom94
|
||||
%101 = load float, float* %arrayidx95, align 4
|
||||
%mul96 = fmul contract float %98, %101
|
||||
%add97 = fadd contract float %add93, %mul96
|
||||
%102 = load float, float* %ct.addr, align 4
|
||||
%103 = load float, float* %amb_temp, align 4
|
||||
%mul98 = fmul contract float %102, %103
|
||||
%add99 = fadd contract float %add97, %mul98
|
||||
%104 = load float*, float** %tOut.addr, align 8
|
||||
%105 = load i32, i32* %c, align 4
|
||||
%idxprom100 = sext i32 %105 to i64
|
||||
%arrayidx101 = getelementptr inbounds float, float* %104, i64 %idxprom100
|
||||
store float %add99, float* %arrayidx101, align 4
|
||||
%106 = load i32, i32* %xy, align 4
|
||||
%107 = load i32, i32* %c, align 4
|
||||
%add102 = add nsw i32 %107, %106
|
||||
store i32 %add102, i32* %c, align 4
|
||||
%108 = load i32, i32* %xy, align 4
|
||||
%109 = load i32, i32* %W, align 4
|
||||
%add103 = add nsw i32 %109, %108
|
||||
store i32 %add103, i32* %W, align 4
|
||||
%110 = load i32, i32* %xy, align 4
|
||||
%111 = load i32, i32* %E, align 4
|
||||
%add104 = add nsw i32 %111, %110
|
||||
store i32 %add104, i32* %E, align 4
|
||||
%112 = load i32, i32* %xy, align 4
|
||||
%113 = load i32, i32* %N, align 4
|
||||
%add105 = add nsw i32 %113, %112
|
||||
store i32 %add105, i32* %N, align 4
|
||||
%114 = load i32, i32* %xy, align 4
|
||||
%115 = load i32, i32* %S, align 4
|
||||
%add106 = add nsw i32 %115, %114
|
||||
store i32 %add106, i32* %S, align 4
|
||||
br label %for.inc
|
||||
|
||||
for.inc: ; preds = %for.body
|
||||
%116 = load i32, i32* %k, align 4
|
||||
%inc = add nsw i32 %116, 1
|
||||
store i32 %inc, i32* %k, align 4
|
||||
br label %for.cond
|
||||
|
||||
for.end: ; preds = %for.cond
|
||||
%117 = load float, float* %temp2, align 4
|
||||
store float %117, float* %temp1, align 4
|
||||
%118 = load float, float* %temp3, align 4
|
||||
store float %118, float* %temp2, align 4
|
||||
%119 = load float, float* %cc.addr, align 4
|
||||
%120 = load float, float* %temp2, align 4
|
||||
%mul107 = fmul contract float %119, %120
|
||||
%121 = load float, float* %cw.addr, align 4
|
||||
%122 = load float*, float** %tIn.addr, align 8
|
||||
%123 = load i32, i32* %W, align 4
|
||||
%idxprom108 = sext i32 %123 to i64
|
||||
%arrayidx109 = getelementptr inbounds float, float* %122, i64 %idxprom108
|
||||
%124 = load float, float* %arrayidx109, align 4
|
||||
%mul110 = fmul contract float %121, %124
|
||||
%add111 = fadd contract float %mul107, %mul110
|
||||
%125 = load float, float* %ce.addr, align 4
|
||||
%126 = load float*, float** %tIn.addr, align 8
|
||||
%127 = load i32, i32* %E, align 4
|
||||
%idxprom112 = sext i32 %127 to i64
|
||||
%arrayidx113 = getelementptr inbounds float, float* %126, i64 %idxprom112
|
||||
%128 = load float, float* %arrayidx113, align 4
|
||||
%mul114 = fmul contract float %125, %128
|
||||
%add115 = fadd contract float %add111, %mul114
|
||||
%129 = load float, float* %cs.addr, align 4
|
||||
%130 = load float*, float** %tIn.addr, align 8
|
||||
%131 = load i32, i32* %S, align 4
|
||||
%idxprom116 = sext i32 %131 to i64
|
||||
%arrayidx117 = getelementptr inbounds float, float* %130, i64 %idxprom116
|
||||
%132 = load float, float* %arrayidx117, align 4
|
||||
%mul118 = fmul contract float %129, %132
|
||||
%add119 = fadd contract float %add115, %mul118
|
||||
%133 = load float, float* %cn.addr, align 4
|
||||
%134 = load float*, float** %tIn.addr, align 8
|
||||
%135 = load i32, i32* %N, align 4
|
||||
%idxprom120 = sext i32 %135 to i64
|
||||
%arrayidx121 = getelementptr inbounds float, float* %134, i64 %idxprom120
|
||||
%136 = load float, float* %arrayidx121, align 4
|
||||
%mul122 = fmul contract float %133, %136
|
||||
%add123 = fadd contract float %add119, %mul122
|
||||
%137 = load float, float* %cb.addr, align 4
|
||||
%138 = load float, float* %temp1, align 4
|
||||
%mul124 = fmul contract float %137, %138
|
||||
%add125 = fadd contract float %add123, %mul124
|
||||
%139 = load float, float* %ct.addr, align 4
|
||||
%140 = load float, float* %temp3, align 4
|
||||
%mul126 = fmul contract float %139, %140
|
||||
%add127 = fadd contract float %add125, %mul126
|
||||
%141 = load float, float* %sdc.addr, align 4
|
||||
%142 = load float*, float** %p.addr, align 8
|
||||
%143 = load i32, i32* %c, align 4
|
||||
%idxprom128 = sext i32 %143 to i64
|
||||
%arrayidx129 = getelementptr inbounds float, float* %142, i64 %idxprom128
|
||||
%144 = load float, float* %arrayidx129, align 4
|
||||
%mul130 = fmul contract float %141, %144
|
||||
%add131 = fadd contract float %add127, %mul130
|
||||
%145 = load float, float* %ct.addr, align 4
|
||||
%146 = load float, float* %amb_temp, align 4
|
||||
%mul132 = fmul contract float %145, %146
|
||||
%add133 = fadd contract float %add131, %mul132
|
||||
%147 = load float*, float** %tOut.addr, align 8
|
||||
%148 = load i32, i32* %c, align 4
|
||||
%idxprom134 = sext i32 %148 to i64
|
||||
%arrayidx135 = getelementptr inbounds float, float* %147, i64 %idxprom134
|
||||
store float %add133, float* %arrayidx135, align 4
|
||||
ret void
|
||||
}
|
||||
|
||||
; Function Attrs: alwaysinline convergent nounwind
|
||||
; Accessor __cuda_builtin_blockDim_t::__fetch_builtin_x() (demangled name):
; takes no arguments and returns the i32 produced by the
; llvm.nvvm.read.ptx.sreg.ntid.x intrinsic.
define linkonce_odr dso_local i32 @_ZN25__cuda_builtin_blockDim_t17__fetch_builtin_xEv() #1 comdat align 2 {
|
||||
entry:
|
||||
%0 = call i32 @llvm.nvvm.read.ptx.sreg.ntid.x()
|
||||
ret i32 %0
|
||||
}
|
||||
|
||||
; Function Attrs: alwaysinline convergent nounwind
|
||||
; Accessor __cuda_builtin_blockIdx_t::__fetch_builtin_x() (demangled name):
; takes no arguments and returns the i32 produced by the
; llvm.nvvm.read.ptx.sreg.ctaid.x intrinsic.
define linkonce_odr dso_local i32 @_ZN25__cuda_builtin_blockIdx_t17__fetch_builtin_xEv() #1 comdat align 2 {
|
||||
entry:
|
||||
%0 = call i32 @llvm.nvvm.read.ptx.sreg.ctaid.x()
|
||||
ret i32 %0
|
||||
}
|
||||
|
||||
; Function Attrs: alwaysinline convergent nounwind
|
||||
; Accessor __cuda_builtin_threadIdx_t::__fetch_builtin_x() (demangled name):
; takes no arguments and returns the i32 produced by the
; llvm.nvvm.read.ptx.sreg.tid.x intrinsic.
define linkonce_odr dso_local i32 @_ZN26__cuda_builtin_threadIdx_t17__fetch_builtin_xEv() #1 comdat align 2 {
|
||||
entry:
|
||||
%0 = call i32 @llvm.nvvm.read.ptx.sreg.tid.x()
|
||||
ret i32 %0
|
||||
}
|
||||
|
||||
; Function Attrs: alwaysinline convergent nounwind
|
||||
; Accessor __cuda_builtin_blockDim_t::__fetch_builtin_y() (demangled name):
; takes no arguments and returns the i32 produced by the
; llvm.nvvm.read.ptx.sreg.ntid.y intrinsic.
define linkonce_odr dso_local i32 @_ZN25__cuda_builtin_blockDim_t17__fetch_builtin_yEv() #1 comdat align 2 {
|
||||
entry:
|
||||
%0 = call i32 @llvm.nvvm.read.ptx.sreg.ntid.y()
|
||||
ret i32 %0
|
||||
}
|
||||
|
||||
; Function Attrs: alwaysinline convergent nounwind
|
||||
; Accessor __cuda_builtin_blockIdx_t::__fetch_builtin_y() (demangled name):
; takes no arguments and returns the i32 produced by the
; llvm.nvvm.read.ptx.sreg.ctaid.y intrinsic.
define linkonce_odr dso_local i32 @_ZN25__cuda_builtin_blockIdx_t17__fetch_builtin_yEv() #1 comdat align 2 {
|
||||
entry:
|
||||
%0 = call i32 @llvm.nvvm.read.ptx.sreg.ctaid.y()
|
||||
ret i32 %0
|
||||
}
|
||||
|
||||
; Function Attrs: alwaysinline convergent nounwind
|
||||
; Accessor __cuda_builtin_threadIdx_t::__fetch_builtin_y() (demangled name):
; takes no arguments and returns the i32 produced by the
; llvm.nvvm.read.ptx.sreg.tid.y intrinsic.
define linkonce_odr dso_local i32 @_ZN26__cuda_builtin_threadIdx_t17__fetch_builtin_yEv() #1 comdat align 2 {
|
||||
entry:
|
||||
%0 = call i32 @llvm.nvvm.read.ptx.sreg.tid.y()
|
||||
ret i32 %0
|
||||
}
|
||||
|
||||
; Function Attrs: nounwind readnone
|
||||
declare i32 @llvm.nvvm.read.ptx.sreg.ntid.x() #2
|
||||
|
||||
; Function Attrs: nounwind readnone
|
||||
declare i32 @llvm.nvvm.read.ptx.sreg.ctaid.x() #2
|
||||
|
||||
; Function Attrs: nounwind readnone
|
||||
declare i32 @llvm.nvvm.read.ptx.sreg.tid.x() #2
|
||||
|
||||
; Function Attrs: nounwind readnone
|
||||
declare i32 @llvm.nvvm.read.ptx.sreg.ntid.y() #2
|
||||
|
||||
; Function Attrs: nounwind readnone
|
||||
declare i32 @llvm.nvvm.read.ptx.sreg.ctaid.y() #2
|
||||
|
||||
; Function Attrs: nounwind readnone
|
||||
declare i32 @llvm.nvvm.read.ptx.sreg.tid.y() #2
|
||||
|
||||
attributes #0 = { convergent noinline nounwind optnone "correctly-rounded-divide-sqrt-fp-math"="false" "disable-tail-calls"="false" "frame-pointer"="all" "less-precise-fpmad"="false" "min-legal-vector-width"="0" "no-infs-fp-math"="false" "no-jump-tables"="false" "no-nans-fp-math"="false" "no-signed-zeros-fp-math"="false" "no-trapping-math"="false" "stack-protector-buffer-size"="8" "target-cpu"="sm_61" "target-features"="+ptx64,+sm_61" "unsafe-fp-math"="false" "use-soft-float"="false" }
|
||||
attributes #1 = { alwaysinline convergent nounwind "correctly-rounded-divide-sqrt-fp-math"="false" "disable-tail-calls"="false" "frame-pointer"="all" "less-precise-fpmad"="false" "min-legal-vector-width"="0" "no-infs-fp-math"="false" "no-jump-tables"="false" "no-nans-fp-math"="false" "no-signed-zeros-fp-math"="false" "no-trapping-math"="false" "stack-protector-buffer-size"="8" "target-cpu"="sm_61" "target-features"="+ptx64,+sm_61" "unsafe-fp-math"="false" "use-soft-float"="false" }
|
||||
attributes #2 = { nounwind readnone }
|
||||
attributes #3 = { convergent nounwind }
|
||||
|
||||
!llvm.module.flags = !{!0, !1, !2}
|
||||
!nvvm.annotations = !{!3, !4, !5, !4, !6, !6, !6, !6, !7, !7, !6}
|
||||
!llvm.ident = !{!8}
|
||||
!nvvmir.version = !{!9}
|
||||
|
||||
!0 = !{i32 2, !"SDK Version", [2 x i32] [i32 10, i32 1]}
|
||||
!1 = !{i32 1, !"wchar_size", i32 4}
|
||||
!2 = !{i32 4, !"nvvm-reflect-ftz", i32 0}
|
||||
!3 = !{void (float*, float*, float*, float, i32, i32, i32, float, float, float, float, float, float, float)* @_Z11hotspotOpt1PfS_S_fiiifffffff, !"kernel", i32 1}
|
||||
!4 = !{null, !"align", i32 8}
|
||||
!5 = !{null, !"align", i32 8, !"align", i32 65544, !"align", i32 131080}
|
||||
!6 = !{null, !"align", i32 16}
|
||||
!7 = !{null, !"align", i32 16, !"align", i32 65552, !"align", i32 131088}
|
||||
!8 = !{!"clang version 10.0.1 (https://github.com/llvm/llvm-project.git ef32c611aa214dea855364efd7ba451ec5ec3f74)"}
|
||||
!9 = !{i32 1, i32 4}
|
File diff suppressed because one or more lines are too long
|
@ -1,205 +0,0 @@
|
|||
#include <assert.h>
|
||||
#include <math.h>
|
||||
#include <stdio.h>
|
||||
#include <stdlib.h>
|
||||
#include <sys/time.h>
|
||||
#include <time.h>
|
||||
|
||||
#define BLOCK_SIZE 16
|
||||
#define STR_SIZE 256
|
||||
|
||||
#define block_x_ 128
|
||||
#define block_y_ 2
|
||||
#define block_z_ 1
|
||||
#define MAX_PD (3.0e6)
|
||||
/* required precision in degrees */
|
||||
#define PRECISION 0.001
|
||||
#define SPEC_HEAT_SI 1.75e6
|
||||
#define K_SI 100
|
||||
/* capacitance fitting factor */
|
||||
#define FACTOR_CHIP 0.5
|
||||
|
||||
#include "opt1.cu"
|
||||
|
||||
/* chip parameters */
|
||||
float t_chip = 0.0005;
|
||||
float chip_height = 0.016;
|
||||
float chip_width = 0.016; /* ambient temperature, assuming no package at all
|
||||
*/
|
||||
float amb_temp = 80.0;
|
||||
|
||||
/*
 * Report an error on stderr as "Error: <s>" followed by a newline.
 *
 * Despite its name this routine only prints; it returns to the caller and
 * does not terminate the program.
 */
void fatal(const char *s) {
  const char *message = s;

  fprintf(stderr, "Error: %s\n", message);
}
|
||||
|
||||
void readinput(float *vect, int grid_rows, int grid_cols, int layers,
|
||||
char *file) {
|
||||
int i, j, k;
|
||||
FILE *fp;
|
||||
char str[STR_SIZE];
|
||||
float val;
|
||||
|
||||
if ((fp = fopen(file, "r")) == 0)
|
||||
fatal("The file was not opened");
|
||||
|
||||
for (i = 0; i <= grid_rows - 1; i++)
|
||||
for (j = 0; j <= grid_cols - 1; j++)
|
||||
for (k = 0; k <= layers - 1; k++) {
|
||||
if (fgets(str, STR_SIZE, fp) == NULL)
|
||||
fatal("Error reading file\n");
|
||||
if (feof(fp))
|
||||
fatal("not enough lines in file");
|
||||
if ((sscanf(str, "%f", &val) != 1))
|
||||
fatal("invalid file format");
|
||||
vect[i * grid_cols + j + k * grid_rows * grid_cols] = val;
|
||||
}
|
||||
|
||||
fclose(fp);
|
||||
}
|
||||
|
||||
/*
 * Write the grid `vect` to the text file `file`, one cell per line, as
 * "<index>\t<value>\n" where <index> counts lines from 0.
 *
 * Cells are emitted in (row, column, layer) order; the value for (i, j, k) is
 * read from vect[i * grid_cols + j + k * grid_rows * grid_cols].
 *
 * If the file cannot be opened a message is printed on stdout and nothing is
 * written.
 */
void writeoutput(float *vect, int grid_rows, int grid_cols, int layers,
                 char *file) {
  FILE *fp = fopen(file, "w");
  if (fp == NULL) {
    printf("The file was not opened\n");
    return; /* fputs()/fclose() on a NULL stream is undefined behaviour */
  }

  int index = 0;
  for (int i = 0; i < grid_rows; i++) {
    for (int j = 0; j < grid_cols; j++) {
      for (int k = 0; k < layers; k++) {
        fprintf(fp, "%d\t%g\n", index,
                vect[i * grid_cols + j + k * grid_rows * grid_cols]);
        index++;
      }
    }
  }

  fclose(fp);
}
|
||||
|
||||
void computeTempCPU(float *pIn, float *tIn, float *tOut, int nx, int ny, int nz,
|
||||
float Cap, float Rx, float Ry, float Rz, float dt,
|
||||
int numiter) {
|
||||
float ce, cw, cn, cs, ct, cb, cc;
|
||||
float stepDivCap = dt / Cap;
|
||||
ce = cw = stepDivCap / Rx;
|
||||
cn = cs = stepDivCap / Ry;
|
||||
ct = cb = stepDivCap / Rz;
|
||||
|
||||
cc = 1.0 - (2.0 * ce + 2.0 * cn + 3.0 * ct);
|
||||
|
||||
int c, w, e, n, s, b, t;
|
||||
int x, y, z;
|
||||
int i = 0;
|
||||
do {
|
||||
for (z = 0; z < nz; z++)
|
||||
for (y = 0; y < ny; y++)
|
||||
for (x = 0; x < nx; x++) {
|
||||
c = x + y * nx + z * nx * ny;
|
||||
|
||||
w = (x == 0) ? c : c - 1;
|
||||
e = (x == nx - 1) ? c : c + 1;
|
||||
n = (y == 0) ? c : c - nx;
|
||||
s = (y == ny - 1) ? c : c + nx;
|
||||
b = (z == 0) ? c : c - nx * ny;
|
||||
t = (z == nz - 1) ? c : c + nx * ny;
|
||||
|
||||
tOut[c] = tIn[c] * cc + tIn[n] * cn + tIn[s] * cs + tIn[e] * ce +
|
||||
tIn[w] * cw + tIn[t] * ct + tIn[b] * cb +
|
||||
(dt / Cap) * pIn[c] + ct * amb_temp;
|
||||
}
|
||||
float *temp = tIn;
|
||||
tIn = tOut;
|
||||
tOut = temp;
|
||||
i++;
|
||||
} while (i < numiter);
|
||||
}
|
||||
|
||||
float accuracy(float *arr1, float *arr2, int len) {
|
||||
float err = 0.0;
|
||||
int i;
|
||||
for (i = 0; i < len; i++) {
|
||||
err += (arr1[i] - arr2[i]) * (arr1[i] - arr2[i]);
|
||||
}
|
||||
|
||||
return (float)sqrt(err / len);
|
||||
}
|
||||
|
||||
/*
 * Print the command-line synopsis and a description of every argument on
 * stderr, then terminate the process with exit status 1.
 *
 * argc is accepted for signature compatibility only; argv[0] is used as the
 * program name in the synopsis.
 */
void usage(int argc, char **argv) {
  (void)argc;

  fprintf(stderr,
          "Usage: %s <rows/cols> <layers> <iterations> <powerFile> <tempFile> "
          "<outputFile>\n",
          argv[0]);
  fprintf(
      stderr,
      "\t<rows/cols>  - number of rows/cols in the grid (positive integer)\n");
  fprintf(stderr,
          "\t<layers>  - number of layers in the grid (positive integer)\n");

  /* Placeholder names below match the synopsis line exactly. */
  fprintf(stderr, "\t<iterations> - number of iterations\n");
  fprintf(stderr, "\t<powerFile>  - name of the file containing the initial "
                  "power values of each cell\n");
  fprintf(stderr, "\t<tempFile>  - name of the file containing the initial "
                  "temperature values of each cell\n");
  fprintf(stderr, "\t<outputFile> - output file\n");
  exit(1);
}
|
||||
|
||||
int main(int argc, char **argv) {
|
||||
cudaSetDevice(0);
|
||||
if (argc != 7) {
|
||||
usage(argc, argv);
|
||||
}
|
||||
|
||||
char *pfile, *tfile, *ofile;
|
||||
int iterations = atoi(argv[3]);
|
||||
|
||||
pfile = argv[4];
|
||||
tfile = argv[5];
|
||||
ofile = argv[6];
|
||||
int numCols = atoi(argv[1]);
|
||||
int numRows = atoi(argv[1]);
|
||||
int layers = atoi(argv[2]);
|
||||
|
||||
/* calculating parameters*/
|
||||
|
||||
float dx = chip_height / numRows;
|
||||
float dy = chip_width / numCols;
|
||||
float dz = t_chip / layers;
|
||||
|
||||
float Cap = FACTOR_CHIP * SPEC_HEAT_SI * t_chip * dx * dy;
|
||||
float Rx = dy / (2.0 * K_SI * t_chip * dx);
|
||||
float Ry = dx / (2.0 * K_SI * t_chip * dy);
|
||||
float Rz = dz / (K_SI * dx * dy);
|
||||
|
||||
float max_slope = MAX_PD / (FACTOR_CHIP * t_chip * SPEC_HEAT_SI);
|
||||
float dt = PRECISION / max_slope;
|
||||
|
||||
float *powerIn, *tempOut, *tempIn, *tempCopy;
|
||||
int size = numCols * numRows * layers;
|
||||
|
||||
powerIn = (float *)calloc(size, sizeof(float));
|
||||
tempCopy = (float *)malloc(size * sizeof(float));
|
||||
tempIn = (float *)calloc(size, sizeof(float));
|
||||
tempOut = (float *)calloc(size, sizeof(float));
|
||||
float *answer = (float *)calloc(size, sizeof(float));
|
||||
|
||||
readinput(powerIn, numRows, numCols, layers, pfile);
|
||||
readinput(tempIn, numRows, numCols, layers, tfile);
|
||||
|
||||
memcpy(tempCopy, tempIn, size * sizeof(float));
|
||||
|
||||
hotspot_opt1(powerIn, tempIn, tempOut, numCols, numRows, layers, Cap, Rx, Ry,
|
||||
Rz, dt, iterations);
|
||||
|
||||
computeTempCPU(powerIn, tempCopy, answer, numCols, numRows, layers, Cap, Rx,
|
||||
Ry, Rz, dt, iterations);
|
||||
|
||||
float acc = accuracy(tempOut, answer, numRows * numCols * layers);
|
||||
printf("Accuracy: %e\n", acc);
|
||||
writeoutput(tempOut, numRows, numCols, layers, ofile);
|
||||
free(tempIn);
|
||||
free(tempOut);
|
||||
free(powerIn);
|
||||
return 0;
|
||||
}
|
|
@ -1,22 +0,0 @@
|
|||
#!/bin/bash
# Build and run the hotspot3D example, then check its output.
#
# Steps: assemble the kernel/host LLVM IR to bitcode, pass both through the
# project's kernelTranslator/hostTranslator tools, compile the results to
# object files, link against the runtime libraries, run the binary on the
# 512x8 data set and look for the expected value in the first output lines.
set -e
llvm-as 3D-cuda-nvptx64-nvidia-cuda-sm_61.ll
llvm-as 3D-host-x86_64-unknown-linux-gnu.ll
../../build/compilation/kernelTranslator 3D-cuda-nvptx64-nvidia-cuda-sm_61.bc kernel.bc
../../build/compilation/hostTranslator 3D-host-x86_64-unknown-linux-gnu.bc host.bc

llc --relocation-model=pic --filetype=obj  kernel.bc
llc --relocation-model=pic --filetype=obj  host.bc

g++ -g -Wall -L../../build/runtime -L../../build/runtime/threadPool -o 3D \
    -fPIC -no-pie host.o kernel.o -lc -lx86Runtime -lthreadPool -lpthread

# Append the previous value only when there is one: an empty trailing entry
# would put the current directory on the library search path.
export LD_LIBRARY_PATH=../../build/runtime:../../build/runtime/threadPool${LD_LIBRARY_PATH:+:$LD_LIBRARY_PATH}
./3D 512 8 100 ../../rodinia-data/hotspot3D/power_512x8 ../../rodinia-data/hotspot3D/temp_512x8 output.out

# -F: match the expected value literally ('.' must not act as a wildcard).
if head output.out | grep -qF "334.017"; then
  echo "Pass"
else
  echo "Error result"
  exit 1
fi
|
|
@ -1,24 +0,0 @@
|
|||
#ifndef _COMPARISON_HELPERS_H_
|
||||
#define _COMPARISON_HELPERS_H_
|
||||
#include <stdio.h>
|
||||
// Compare the first `size` elements of `data1` and `data2`.
//
// Prints one "Diff" line per mismatching position, then a PASS/FAIL summary.
// Returns 0 when every element matches; on any mismatch the process is
// terminated with exit(1), so the function never returns a failure code.
//
// NOTE: mismatching values are printed as integers (converted to long long),
// which is exact for integral T of up to 64 bits; floating-point values are
// shown truncated.
template <typename T>
__inline int compare_vectors(T *data1, T *data2, unsigned int size) {
  printf("Comparing vectors: \n");
  bool match = true;
  for (unsigned int i = 0; i < size; i++) {
    if (data1[i] != data2[i]) {
      match = false;
      // %u matches the unsigned index; the second value comes from data2.
      printf("Diff: data1[%u]=%lld, data2[%u]=%lld.\n", i,
             (long long)data1[i], i, (long long)data2[i]);
    }
  }

  if (!match) {
    printf("FAIL! vectors are NOT matching!\n");
    exit(1);
  }

  printf("PASS! vectors are matching!\n");
  return 0;
}
|
||||
|
||||
#endif
|
|
@ -1,116 +0,0 @@
|
|||
#include "stdafx.h"
|
||||
|
||||
#include "cpuencode.h"
|
||||
#include "print_helpers.h"
|
||||
|
||||
using namespace std;
|
||||
|
||||
#if 1
|
||||
|
||||
// The max. codeword length for each byte symbol is 32-bits
|
||||
|
||||
// Variable-length encode `num_elements` 32-bit words from `indata`.
//
// Every input word is split into four bytes, most significant byte first.
// Each byte `b` is replaced by the low `codewordlens[b]` bits of
// `codewords[b]`, and the codewords are packed back to back into `outdata`,
// filling each 32-bit output word from its most significant bit downwards.
//
// @param indata        input words
// @param num_elements  number of words in indata
// @param outdata       output bitstream; the word following the last
//                      completely filled word is always zeroed, so the buffer
//                      needs room for one word more than the packed data
// @param outsize       receives the encoded size in bytes, rounded up to a
//                      whole byte
// @param codewords     codeword for each of the 256 byte values
// @param codewordlens  length in bits (0..32) of each codeword
extern "C" void cpu_vlc_encode(unsigned int *indata, unsigned int num_elements,
                               unsigned int *outdata, unsigned int *outsize,
                               unsigned int *codewords,
                               unsigned int *codewordlens) {
  unsigned int *bitstreamPt = outdata; /* output word currently being filled */
  *bitstreamPt = 0x00000000U;
  unsigned int startbit = 0; /* bits already used in *bitstreamPt */
  unsigned int totalBytes = 0;

  for (unsigned int k = 0; k < num_elements; k++) {
    unsigned int val32 = indata[k];

    for (unsigned int i = 0; i < 4; i++) {
      unsigned char symbol = (unsigned char)(val32 >> (8 * (3 - i)));
      unsigned int cw32 = codewords[symbol];
      unsigned int numbits = codewordlens[symbol];

      while (numbits > 0) {
        unsigned int freebits = 32 - startbit;
        unsigned int writebits = (numbits < freebits) ? numbits : freebits;
        unsigned int mask32;

        if (numbits == writebits) {
          // The rest of the codeword fits: keep only its low `numbits` bits
          // and move them up against the bits already written. A full-width
          // mask is built without shifting by 32, which would be undefined.
          unsigned int lowbits =
              (numbits >= 32) ? 0xFFFFFFFFU : ((1U << numbits) - 1U);
          mask32 = (cw32 & lowbits) << (freebits - numbits);
        } else {
          // Only the top `writebits` bits fit: shift out the ones that do not.
          mask32 = cw32 >> (numbits - writebits);
        }

        *bitstreamPt = (*bitstreamPt) | mask32;
        numbits = numbits - writebits;
        startbit = (startbit + writebits) % 32;
        if (startbit == 0) {
          // Current word is full: advance and start the next one clean.
          bitstreamPt++;
          *bitstreamPt = 0x00000000;
          totalBytes += 4;
        }
      }
    }
  }

  totalBytes += (startbit / 8) +
                ((startbit % 8 == 0) ? 0 : 1); // return aligned to 8-bits
  *outsize = totalBytes;
}
|
||||
|
||||
//////////////////////////////////////////////////////////////////////
|
||||
/// ALTERNATIVE CODER
|
||||
/// ASSUMPTION: The max. length of 4 combined codewords can be 2x the original
/// data, i.e. 64 bits
|
||||
///////////////////////////////////////////////////////////////////////
|
||||
|
||||
#else
|
||||
|
||||
// Alternative coder: variable-length encode `num_elements` 32-bit words.
//
// The four codewords of one input word (bytes taken most significant first)
// are first concatenated into a 64-bit accumulator and then written to the
// bitstream in at most 32-bit pieces, filling each output word from its most
// significant bit downwards. The combined length of the four codewords must
// not exceed 64 bits.
//
// @param indata        input words
// @param num_elements  number of words in indata
// @param outdata       output bitstream; the word following the last
//                      completely filled word is always zeroed, so the buffer
//                      needs room for one word more than the packed data
// @param outsize       receives the encoded size in bytes, rounded up to a
//                      whole byte
// @param codewords     codeword for each of the 256 byte values
// @param codewordlens  length in bits of each codeword
extern "C" void cpu_vlc_encode(unsigned int *indata, unsigned int num_elements,
                               unsigned int *outdata, unsigned int *outsize,
                               unsigned int *codewords,
                               unsigned int *codewordlens) {
  unsigned int *bitstreamPt = outdata; /* output word currently being filled */
  // assume memset is done.
  *bitstreamPt = 0x00000000U;
  unsigned int startbit = 0; /* bits already used in *bitstreamPt */
  unsigned int totalBytes = 0;

  for (unsigned int k = 0; k < num_elements; k++) {
    unsigned long long cw64 = 0;
    unsigned int val32 = indata[k];
    unsigned int numbits = 0;

    // Concatenate the four codewords of this word, first byte on top.
    for (unsigned int i = 0; i < 4; i++) {
      unsigned char symbol = (unsigned char)(val32 >> (8 * (3 - i)));
      cw64 = (cw64 << codewordlens[symbol]) | codewords[symbol];
      numbits += codewordlens[symbol];
    }

    while (numbits > 0) {
      unsigned int freebits = 32 - startbit;
      unsigned int writebits = (numbits < freebits) ? numbits : freebits;
      unsigned int mask32;

      if (numbits == writebits) {
        // Everything left fits into the current word.
        unsigned int temp32 = (unsigned int)cw64;
        mask32 = temp32 << (freebits - numbits);
      } else {
        // Write the top `writebits` bits and keep the rest for the next
        // round. The mask is built in 64 bits: 32 or more bits can remain,
        // which an int-sized shift cannot represent.
        unsigned int remaining = numbits - writebits;
        mask32 = (unsigned int)(cw64 >> remaining);
        cw64 = cw64 & ((1ULL << remaining) - 1ULL);
      }

      *bitstreamPt = (*bitstreamPt) | mask32;
      numbits = numbits - writebits;
      startbit = (startbit + writebits) % 32;
      if (startbit == 0) {
        // Current word is full: advance and start the next one clean.
        bitstreamPt++;
        *bitstreamPt = 0x00000000;
        totalBytes += 4;
      }
    }
  }

  totalBytes += (startbit / 8) +
                ((startbit % 8 == 0) ? 0 : 1); // return aligned to 8-bits
  *outsize = totalBytes;
}
|
||||
#endif
|
|
@ -1,8 +0,0 @@
|
|||
#ifndef _CE_H_
|
||||
#define _CE_H_
|
||||
|
||||
// CPU variable-length encoder.
//
// Splits each of the num_elements words of indata into four bytes (most
// significant first), replaces every byte b by the codewordlens[b]-bit
// codeword codewords[b], and packs the codewords back to back into outdata.
// The encoded size in bytes, rounded up to a whole byte, is stored in
// *outsize.
extern "C" void cpu_vlc_encode(unsigned int *indata, unsigned int num_elements,
|
||||
unsigned int *outdata, unsigned int *outsize,
|
||||
unsigned int *codewords,
|
||||
unsigned int *codewordlens);
|
||||
#endif
|
|
@ -1,20 +0,0 @@
|
|||
#ifndef __CUDA_HELPERS__
|
||||
#define __CUDA_HELPERS__
|
||||
#include <stdio.h>
|
||||
/************************************************************************/
|
||||
/* Init CUDA */
|
||||
/************************************************************************/
|
||||
#if __DEVICE_EMULATION__
|
||||
|
||||
// Variant compiled when __DEVICE_EMULATION__ is set: performs no setup and
// always reports success.
bool InitCUDA(void) {
  const bool initialized = true;
  return initialized;
}
|
||||
|
||||
#else
|
||||
// Select CUDA device 0 for the calling thread.
//
// Returns true and prints "CUDA initialized." when the device was selected;
// prints a failure message and returns false when cudaSetDevice() reports an
// error.
bool InitCUDA(void) {
  // cudaSetDevice() returns cudaSuccess, whose value is 0, on success.
  if (cudaSetDevice(0) != 0) {
    printf("CUDA initialization failed.\n");
    return false;
  }

  printf("CUDA initialized.\n");
  return true;
}
|
||||
#endif
|
||||
#endif
|
|
@ -1,931 +0,0 @@
|
|||
/*
|
||||
* Copyright 1993-2010 NVIDIA Corporation. All rights reserved.
|
||||
*
|
||||
* Please refer to the NVIDIA end user license agreement (EULA) associated
|
||||
* with this source code for terms and conditions that govern your use of
|
||||
* this software. Any use, reproduction, disclosure, or distribution of
|
||||
* this software and related documentation outside the terms of the EULA
|
||||
* is strictly prohibited.
|
||||
*
|
||||
*/
|
||||
|
||||
/*
|
||||
* Copyright 1993-2010 NVIDIA Corporation. All rights reserved.
|
||||
*
|
||||
* Please refer to the NVIDIA end user license agreement (EULA) associated
|
||||
* with this source code for terms and conditions that govern your use of
|
||||
* this software. Any use, reproduction, disclosure, or distribution of
|
||||
* this software and related documentation outside the terms of the EULA
|
||||
* is strictly prohibited.
|
||||
*
|
||||
*/
|
||||
|
||||
/* CUda UTility Library */
|
||||
|
||||
#ifndef _CUTIL_H_
|
||||
#define _CUTIL_H_
|
||||
|
||||
#ifdef _WIN32
|
||||
#pragma warning(disable : 4996) // disable deprecated warning
|
||||
#endif
|
||||
|
||||
#include <stdio.h>
|
||||
#include <stdlib.h>
|
||||
|
||||
// helper typedefs for building DLL
|
||||
#ifdef _WIN32
|
||||
#ifdef BUILD_DLL
|
||||
#define DLL_MAPPING __declspec(dllexport)
|
||||
#else
|
||||
#define DLL_MAPPING __declspec(dllimport)
|
||||
#endif
|
||||
#else
|
||||
#define DLL_MAPPING
|
||||
#endif
|
||||
|
||||
#ifdef _WIN32
|
||||
#define CUTIL_API __stdcall
|
||||
#else
|
||||
#define CUTIL_API
|
||||
#endif
|
||||
|
||||
////////////////////////////////////////////////////////////////////////////
|
||||
//! CUT bool type
|
||||
////////////////////////////////////////////////////////////////////////////
|
||||
enum CUTBoolean { CUTFalse = 0, CUTTrue = 1 };
|
||||
|
||||
////////////////////////////////////////////////////////////////////////////
|
||||
//! Deallocate memory allocated within Cutil
|
||||
//! @param pointer to memory
|
||||
////////////////////////////////////////////////////////////////////////////
|
||||
DLL_MAPPING
|
||||
void CUTIL_API cutFree(void *ptr);
|
||||
|
||||
////////////////////////////////////////////////////////////////////////////
|
||||
//! Helper for bank conflict checking (should only be used with the
|
||||
//! CUT_BANK_CHECKER macro)
|
||||
//! @param tidx thread id in x dimension of block
|
||||
//! @param tidy thread id in y dimension of block
|
||||
//! @param tidz thread id in z dimension of block
|
||||
//! @param bdimx block size in x dimension
|
||||
//! @param bdimy block size in y dimension
|
||||
//! @param bdimz block size in z dimension
|
||||
//! @param file name of the source file where the access takes place
|
||||
//! @param line line in the source file where the access takes place
|
||||
//! @param aname name of the array which is accessed
|
||||
//! @param index index into the array
|
||||
////////////////////////////////////////////////////////////////////////////
|
||||
DLL_MAPPING
|
||||
void CUTIL_API cutCheckBankAccess(unsigned int tidx, unsigned int tidy,
|
||||
unsigned int tidz, unsigned int bdimx,
|
||||
unsigned int bdimy, unsigned int bdimz,
|
||||
const char *file, const int line,
|
||||
const char *aname, const int index);
|
||||
|
||||
////////////////////////////////////////////////////////////////////////////
|
||||
//! Find the path for a filename
|
||||
//! @return the path if succeeded, otherwise 0
|
||||
//! @param filename name of the file
|
||||
//! @param executablePath optional absolute path of the executable
|
||||
////////////////////////////////////////////////////////////////////////////
|
||||
DLL_MAPPING
|
||||
char *CUTIL_API cutFindFilePath(const char *filename,
|
||||
const char *executablePath);
|
||||
|
||||
////////////////////////////////////////////////////////////////////////////
|
||||
//! Read file \filename containing single precision floating point data
|
||||
//! @return CUTTrue if reading the file succeeded, otherwise false
|
||||
//! @param filename name of the source file
|
||||
//! @param data uninitialized pointer, returned initialized and pointing to
|
||||
//! the data read
|
||||
//! @param len number of data elements in data, -1 on error
|
||||
//! @note If a NULL pointer is passed to this function and it is
|
||||
//! initialized within Cutil then cutFree() has to be used to
|
||||
//! deallocate the memory
|
||||
////////////////////////////////////////////////////////////////////////////
|
||||
DLL_MAPPING
|
||||
CUTBoolean CUTIL_API cutReadFilef(const char *filename, float **data,
|
||||
unsigned int *len, bool verbose = false);
|
||||
|
||||
////////////////////////////////////////////////////////////////////////////
|
||||
//! Read file \filename containing double precision floating point data
|
||||
//! @return CUTTrue if reading the file succeeded, otherwise false
|
||||
//! @param filename name of the source file
|
||||
//! @param data uninitialized pointer, returned initialized and pointing to
|
||||
//! the data read
|
||||
//! @param len number of data elements in data, -1 on error
|
||||
//! @note If a NULL pointer is passed to this function and it is
|
||||
//! initialized within Cutil then cutFree() has to be used to
|
||||
//! deallocate the memory
|
||||
////////////////////////////////////////////////////////////////////////////
|
||||
DLL_MAPPING
|
||||
CUTBoolean CUTIL_API cutReadFiled(const char *filename, double **data,
|
||||
unsigned int *len, bool verbose = false);
|
||||
|
||||
////////////////////////////////////////////////////////////////////////////
|
||||
//! Read file \filename containing integer data
|
||||
//! @return CUTTrue if reading the file succeeded, otherwise false
|
||||
//! @param filename name of the source file
|
||||
//! @param data uninitialized pointer, returned initialized and pointing to
|
||||
//! the data read
|
||||
//! @param len number of data elements in data, -1 on error
|
||||
//! @note If a NULL pointer is passed to this function and it is
|
||||
//! initialized within Cutil then cutFree() has to be used to
|
||||
//! deallocate the memory
|
||||
////////////////////////////////////////////////////////////////////////////
|
||||
DLL_MAPPING
|
||||
CUTBoolean CUTIL_API cutReadFilei(const char *filename, int **data,
|
||||
unsigned int *len, bool verbose = false);
|
||||
|
||||
////////////////////////////////////////////////////////////////////////////
|
||||
//! Read file \filename containing unsigned integer data
|
||||
//! @return CUTTrue if reading the file succeeded, otherwise false
|
||||
//! @param filename name of the source file
|
||||
//! @param data uninitialized pointer, returned initialized and pointing to
|
||||
//! the data read
|
||||
//! @param len number of data elements in data, -1 on error
|
||||
//! @note If a NULL pointer is passed to this function and it is
|
||||
//! initialized within Cutil then cutFree() has to be used to
|
||||
//! deallocate the memory
|
||||
////////////////////////////////////////////////////////////////////////////
|
||||
DLL_MAPPING
|
||||
CUTBoolean CUTIL_API cutReadFileui(const char *filename, unsigned int **data,
|
||||
unsigned int *len, bool verbose = false);
|
||||
|
||||
////////////////////////////////////////////////////////////////////////////
|
||||
//! Read file \filename containing char / byte data
|
||||
//! @return CUTTrue if reading the file succeeded, otherwise false
|
||||
//! @param filename name of the source file
|
||||
//! @param data uninitialized pointer, returned initialized and pointing to
|
||||
//! the data read
|
||||
//! @param len number of data elements in data, -1 on error
|
||||
//! @note If a NULL pointer is passed to this function and it is
|
||||
//! initialized within Cutil then cutFree() has to be used to
|
||||
//! deallocate the memory
|
||||
////////////////////////////////////////////////////////////////////////////
|
||||
DLL_MAPPING
|
||||
CUTBoolean CUTIL_API cutReadFileb(const char *filename, char **data,
|
||||
unsigned int *len, bool verbose = false);
|
||||
|
||||
////////////////////////////////////////////////////////////////////////////
|
||||
//! Read file \filename containing unsigned char / byte data
|
||||
//! @return CUTTrue if reading the file succeeded, otherwise false
|
||||
//! @param filename name of the source file
|
||||
//! @param data uninitialized pointer, returned initialized and pointing to
|
||||
//! the data read
|
||||
//! @param len number of data elements in data, -1 on error
|
||||
//! @note If a NULL pointer is passed to this function and it is
|
||||
//! initialized within Cutil then cutFree() has to be used to
|
||||
//! deallocate the memory
|
||||
////////////////////////////////////////////////////////////////////////////
|
||||
DLL_MAPPING
|
||||
CUTBoolean CUTIL_API cutReadFileub(const char *filename, unsigned char **data,
|
||||
unsigned int *len, bool verbose = false);
|
||||
|
||||
////////////////////////////////////////////////////////////////////////////
|
||||
//! Write a data file \filename containing single precision floating point
|
||||
//! data
|
||||
//! @return CUTTrue if writing the file succeeded, otherwise false
|
||||
//! @param filename name of the file to write
|
||||
//! @param data pointer to data to write
|
||||
//! @param len  number of data elements in data
|
||||
//! @param epsilon epsilon for comparison
|
||||
////////////////////////////////////////////////////////////////////////////
|
||||
DLL_MAPPING
|
||||
CUTBoolean CUTIL_API cutWriteFilef(const char *filename, const float *data,
|
||||
unsigned int len, const float epsilon,
|
||||
bool verbose = false);
|
||||
|
||||
////////////////////////////////////////////////////////////////////////////
|
||||
//! Write a data file \filename containing double precision floating point
|
||||
//! data
|
||||
//! @return CUTTrue if writing the file succeeded, otherwise false
|
||||
//! @param filename name of the file to write
|
||||
//! @param data pointer to data to write
|
||||
//! @param len number of data elements in data, -1 on error
|
||||
//! @param epsilon epsilon for comparison
|
||||
////////////////////////////////////////////////////////////////////////////
|
||||
DLL_MAPPING
|
||||
CUTBoolean CUTIL_API cutWriteFiled(const char *filename, const float *data,
|
||||
unsigned int len, const double epsilon,
|
||||
bool verbose = false);
|
||||
|
||||
////////////////////////////////////////////////////////////////////////////
|
||||
//! Write a data file \filename containing integer data
|
||||
//! @return CUTTrue if writing the file succeeded, otherwise false
|
||||
//! @param filename name of the file to write
|
||||
//! @param data pointer to data to write
|
||||
//! @param len number of data elements in data, -1 on error
|
||||
////////////////////////////////////////////////////////////////////////////
|
||||
DLL_MAPPING
|
||||
CUTBoolean CUTIL_API cutWriteFilei(const char *filename, const int *data,
|
||||
unsigned int len, bool verbose = false);
|
||||
|
||||
////////////////////////////////////////////////////////////////////////////
|
||||
//! Write a data file \filename containing unsigned integer data
|
||||
//! @return CUTTrue if writing the file succeeded, otherwise false
|
||||
//! @param filename name of the file to write
|
||||
//! @param data pointer to data to write
|
||||
//! @param len number of data elements in data, -1 on error
|
||||
////////////////////////////////////////////////////////////////////////////
|
||||
DLL_MAPPING
|
||||
CUTBoolean CUTIL_API cutWriteFileui(const char *filename,
|
||||
const unsigned int *data, unsigned int len,
|
||||
bool verbose = false);
|
||||
|
||||
////////////////////////////////////////////////////////////////////////////
|
||||
//! Write a data file \filename containing char / byte data
|
||||
//! @return CUTTrue if writing the file succeeded, otherwise false
|
||||
//! @param filename name of the file to write
|
||||
//! @param data pointer to data to write
|
||||
//! @param len number of data elements in data, -1 on error
|
||||
////////////////////////////////////////////////////////////////////////////
|
||||
DLL_MAPPING
|
||||
CUTBoolean CUTIL_API cutWriteFileb(const char *filename, const char *data,
|
||||
unsigned int len, bool verbose = false);
|
||||
|
||||
////////////////////////////////////////////////////////////////////////////
|
||||
//! Write a data file \filename containing unsigned char / byte data
|
||||
//! @return CUTTrue if writing the file succeeded, otherwise false
|
||||
//! @param filename name of the file to write
|
||||
//! @param data pointer to data to write
|
||||
//! @param len number of data elements in data, -1 on error
|
||||
////////////////////////////////////////////////////////////////////////////
|
||||
DLL_MAPPING
|
||||
CUTBoolean CUTIL_API cutWriteFileub(const char *filename,
|
||||
const unsigned char *data, unsigned int len,
|
||||
bool verbose = false);
|
||||
|
||||
////////////////////////////////////////////////////////////////////////////
|
||||
//! Load PGM image file (with unsigned char as data element type)
|
||||
//! @return CUTTrue if reading the file succeeded, otherwise false
|
||||
//! @param file name of the image file
|
||||
//! @param data handle to the data read
|
||||
//! @param w width of the image
|
||||
//! @param h height of the image
|
||||
//! @note If a NULL pointer is passed to this function and it is
|
||||
//! initialized within Cutil then cutFree() has to be used to
|
||||
//! deallocate the memory
|
||||
////////////////////////////////////////////////////////////////////////////
|
||||
DLL_MAPPING
|
||||
CUTBoolean CUTIL_API cutLoadPGMub(const char *file, unsigned char **data,
|
||||
unsigned int *w, unsigned int *h);
|
||||
|
||||
////////////////////////////////////////////////////////////////////////////
|
||||
//! Load PPM image file (with unsigned char as data element type)
|
||||
//! @return CUTTrue if reading the file succeeded, otherwise false
|
||||
//! @param file name of the image file
|
||||
//! @param data handle to the data read
|
||||
//! @param w width of the image
|
||||
//! @param h height of the image
|
||||
////////////////////////////////////////////////////////////////////////////
|
||||
DLL_MAPPING
|
||||
CUTBoolean CUTIL_API cutLoadPPMub(const char *file, unsigned char **data,
|
||||
unsigned int *w, unsigned int *h);
|
||||
|
||||
////////////////////////////////////////////////////////////////////////////
|
||||
//! Load PPM image file (with unsigned char as data element type), padding
|
||||
//! 4th component
|
||||
//! @return CUTTrue if reading the file succeeded, otherwise false
|
||||
//! @param file name of the image file
|
||||
//! @param data handle to the data read
|
||||
//! @param w width of the image
|
||||
//! @param h height of the image
|
||||
////////////////////////////////////////////////////////////////////////////
|
||||
DLL_MAPPING
|
||||
CUTBoolean CUTIL_API cutLoadPPM4ub(const char *file, unsigned char **data,
|
||||
unsigned int *w, unsigned int *h);
|
||||
|
||||
////////////////////////////////////////////////////////////////////////////
|
||||
//! Load PGM image file (with unsigned int as data element type)
|
||||
//! @return CUTTrue if reading the file succeeded, otherwise false
|
||||
//! @param file name of the image file
|
||||
//! @param data handle to the data read
|
||||
//! @param w width of the image
|
||||
//! @param h height of the image
|
||||
//! @note If a NULL pointer is passed to this function and it is
|
||||
//! initialized within Cutil then cutFree() has to be used to
|
||||
//! deallocate the memory
|
||||
////////////////////////////////////////////////////////////////////////////
|
||||
DLL_MAPPING
|
||||
CUTBoolean CUTIL_API cutLoadPGMi(const char *file, unsigned int **data,
|
||||
unsigned int *w, unsigned int *h);
|
||||
|
||||
////////////////////////////////////////////////////////////////////////////
|
||||
//! Load PGM image file (with unsigned short as data element type)
|
||||
//! @return CUTTrue if reading the file succeeded, otherwise false
|
||||
//! @param file name of the image file
|
||||
//! @param data handle to the data read
|
||||
//! @param w width of the image
|
||||
//! @param h height of the image
|
||||
//! @note If a NULL pointer is passed to this function and it is
|
||||
//! initialized within Cutil then cutFree() has to be used to
|
||||
//! deallocate the memory
|
||||
////////////////////////////////////////////////////////////////////////////
|
||||
DLL_MAPPING
|
||||
CUTBoolean CUTIL_API cutLoadPGMs(const char *file, unsigned short **data,
|
||||
unsigned int *w, unsigned int *h);
|
||||
|
||||
////////////////////////////////////////////////////////////////////////////
|
||||
//! Load PGM image file (with float as data element type)
|
||||
//! @param file name of the image file
|
||||
//! @param data handle to the data read
|
||||
//! @param w width of the image
|
||||
//! @param h height of the image
|
||||
//! @note If a NULL pointer is passed to this function and it is
|
||||
//! initialized within Cutil then cutFree() has to be used to
|
||||
//! deallocate the memory
|
||||
////////////////////////////////////////////////////////////////////////////
|
||||
DLL_MAPPING
|
||||
CUTBoolean CUTIL_API cutLoadPGMf(const char *file, float **data,
|
||||
unsigned int *w, unsigned int *h);
|
||||
|
||||
////////////////////////////////////////////////////////////////////////////
|
||||
//! Save PGM image file (with unsigned char as data element type)
|
||||
//! @param file name of the image file
|
||||
//! @param data handle to the data read
|
||||
//! @param w width of the image
|
||||
//! @param h height of the image
|
||||
////////////////////////////////////////////////////////////////////////////
|
||||
DLL_MAPPING
|
||||
CUTBoolean CUTIL_API cutSavePGMub(const char *file, unsigned char *data,
|
||||
unsigned int w, unsigned int h);
|
||||
|
||||
////////////////////////////////////////////////////////////////////////////
|
||||
//! Save PPM image file (with unsigned char as data element type)
|
||||
//! @param file name of the image file
|
||||
//! @param data handle to the data read
|
||||
//! @param w width of the image
|
||||
//! @param h height of the image
|
||||
////////////////////////////////////////////////////////////////////////////
|
||||
DLL_MAPPING
|
||||
CUTBoolean CUTIL_API cutSavePPMub(const char *file, unsigned char *data,
|
||||
unsigned int w, unsigned int h);
|
||||
|
||||
////////////////////////////////////////////////////////////////////////////
|
||||
//! Save PPM image file (with unsigned char as data element type, padded to
|
||||
//! 4 bytes)
|
||||
//! @param file name of the image file
|
||||
//! @param data handle to the data read
|
||||
//! @param w width of the image
|
||||
//! @param h height of the image
|
||||
////////////////////////////////////////////////////////////////////////////
|
||||
DLL_MAPPING
|
||||
CUTBoolean CUTIL_API cutSavePPM4ub(const char *file, unsigned char *data,
|
||||
unsigned int w, unsigned int h);
|
||||
|
||||
////////////////////////////////////////////////////////////////////////////
|
||||
//! Save PGM image file (with unsigned int as data element type)
|
||||
//! @param file name of the image file
|
||||
//! @param data handle to the data read
|
||||
//! @param w width of the image
|
||||
//! @param h height of the image
|
||||
////////////////////////////////////////////////////////////////////////////
|
||||
DLL_MAPPING
|
||||
CUTBoolean CUTIL_API cutSavePGMi(const char *file, unsigned int *data,
|
||||
unsigned int w, unsigned int h);
|
||||
|
||||
////////////////////////////////////////////////////////////////////////////
|
||||
//! Save PGM image file (with unsigned short as data element type)
|
||||
//! @param file name of the image file
|
||||
//! @param data handle to the data read
|
||||
//! @param w width of the image
|
||||
//! @param h height of the image
|
||||
////////////////////////////////////////////////////////////////////////////
|
||||
DLL_MAPPING
|
||||
CUTBoolean CUTIL_API cutSavePGMs(const char *file, unsigned short *data,
|
||||
unsigned int w, unsigned int h);
|
||||
|
||||
////////////////////////////////////////////////////////////////////////////
|
||||
//! Save PGM image file (with float as data element type)
|
||||
//! @param file name of the image file
|
||||
//! @param data handle to the data read
|
||||
//! @param w width of the image
|
||||
//! @param h height of the image
|
||||
////////////////////////////////////////////////////////////////////////////
|
||||
DLL_MAPPING
|
||||
CUTBoolean CUTIL_API cutSavePGMf(const char *file, float *data, unsigned int w,
|
||||
unsigned int h);
|
||||
|
||||
////////////////////////////////////////////////////////////////////////////
|
||||
// Command line arguments: General notes
|
||||
// * All command line arguments begin with '--' followed by the token;
|
||||
// token and value are separated by '='; example --samples=50
|
||||
// * Arrays have the form --model=[one.obj,two.obj,three.obj]
|
||||
// (without whitespaces)
|
||||
////////////////////////////////////////////////////////////////////////////
|
||||
|
||||
////////////////////////////////////////////////////////////////////////////
|
||||
//! Check if command line argument \a flag_name is given
|
||||
//! @return CUTTrue if command line argument \a flag_name has been given,
|
||||
//! otherwise 0
|
||||
//! @param argc argc as passed to main()
|
||||
//! @param argv argv as passed to main()
|
||||
//! @param flag_name name of command line flag
|
||||
////////////////////////////////////////////////////////////////////////////
|
||||
DLL_MAPPING
|
||||
CUTBoolean CUTIL_API cutCheckCmdLineFlag(const int argc, const char **argv,
|
||||
const char *flag_name);
|
||||
|
||||
////////////////////////////////////////////////////////////////////////////
|
||||
//! Get the value of a command line argument of type int
|
||||
//! @return CUTTrue if command line argument \a arg_name has been given and
|
||||
//! is of the requested type, otherwise CUTFalse
|
||||
//! @param argc argc as passed to main()
|
||||
//! @param argv argv as passed to main()
|
||||
//! @param arg_name name of the command line argument
|
||||
//! @param val value of the command line argument
|
||||
////////////////////////////////////////////////////////////////////////////
|
||||
DLL_MAPPING
|
||||
CUTBoolean CUTIL_API cutGetCmdLineArgumenti(const int argc, const char **argv,
|
||||
const char *arg_name, int *val);
|
||||
|
||||
////////////////////////////////////////////////////////////////////////////
|
||||
//! Get the value of a command line argument of type float
|
||||
//! @return CUTTrue if command line argument \a arg_name has been given and
|
||||
//! is of the requested type, otherwise CUTFalse
|
||||
//! @param argc argc as passed to main()
|
||||
//! @param argv argv as passed to main()
|
||||
//! @param arg_name name of the command line argument
|
||||
//! @param val value of the command line argument
|
||||
////////////////////////////////////////////////////////////////////////////
|
||||
DLL_MAPPING
|
||||
CUTBoolean CUTIL_API cutGetCmdLineArgumentf(const int argc, const char **argv,
|
||||
const char *arg_name, float *val);
|
||||
|
||||
////////////////////////////////////////////////////////////////////////////
|
||||
//! Get the value of a command line argument of type string
|
||||
//! @return CUTTrue if command line argument \a arg_name has been given and
|
||||
//! is of the requested type, otherwise CUTFalse
|
||||
//! @param argc argc as passed to main()
|
||||
//! @param argv argv as passed to main()
|
||||
//! @param arg_name name of the command line argument
|
||||
//! @param val value of the command line argument
|
||||
////////////////////////////////////////////////////////////////////////////
|
||||
DLL_MAPPING
|
||||
CUTBoolean CUTIL_API cutGetCmdLineArgumentstr(const int argc, const char **argv,
|
||||
const char *arg_name, char **val);
|
||||
|
||||
////////////////////////////////////////////////////////////////////////////
|
||||
//! Get the value of a command line argument list whose elements are strings
|
||||
//! @return CUTTrue if command line argument \a arg_name has been given and
|
||||
//! is of the requested type, otherwise CUTFalse
|
||||
//! @param argc argc as passed to main()
|
||||
//! @param argv argv as passed to main()
|
||||
//! @param arg_name name of the command line argument
|
||||
//! @param val command line argument list
|
||||
//! @param len length of the list / number of elements
|
||||
////////////////////////////////////////////////////////////////////////////
|
||||
DLL_MAPPING
|
||||
CUTBoolean CUTIL_API cutGetCmdLineArgumentListstr(const int argc,
|
||||
const char **argv,
|
||||
const char *arg_name,
|
||||
char **val,
|
||||
unsigned int *len);
|
||||
|
||||
////////////////////////////////////////////////////////////////////////////
|
||||
//! Extended assert
|
||||
//! @return CUTTrue if the condition \a val holds, otherwise CUTFalse
|
||||
//! @param val condition to test
|
||||
//! @param file __FILE__ macro
|
||||
//! @param line __LINE__ macro
|
||||
//! @note This function should be used via the CONDITION(val) macro
|
||||
////////////////////////////////////////////////////////////////////////////
|
||||
DLL_MAPPING
|
||||
CUTBoolean CUTIL_API cutCheckCondition(int val, const char *file,
|
||||
const int line);
|
||||
|
||||
////////////////////////////////////////////////////////////////////////////
|
||||
//! Compare two float arrays
|
||||
//! @return CUTTrue if \a reference and \a data are identical,
|
||||
//! otherwise CUTFalse
|
||||
//! @param reference handle to the reference data / gold image
|
||||
//! @param data handle to the computed data
|
||||
//! @param len number of elements in reference and data
|
||||
////////////////////////////////////////////////////////////////////////////
|
||||
DLL_MAPPING
|
||||
CUTBoolean CUTIL_API cutComparef(const float *reference, const float *data,
|
||||
const unsigned int len);
|
||||
|
||||
////////////////////////////////////////////////////////////////////////////
|
||||
//! Compare two integer arrays
|
||||
//! @return CUTTrue if \a reference and \a data are identical,
|
||||
//! otherwise CUTFalse
|
||||
//! @param reference handle to the reference data / gold image
|
||||
//! @param data handle to the computed data
|
||||
//! @param len number of elements in reference and data
|
||||
////////////////////////////////////////////////////////////////////////////
|
||||
DLL_MAPPING
|
||||
CUTBoolean CUTIL_API cutComparei(const int *reference, const int *data,
|
||||
const unsigned int len);
|
||||
|
||||
////////////////////////////////////////////////////////////////////////////////
|
||||
//! Compare two unsigned integer arrays, with epsilon and threshold
|
||||
//! @return CUTTrue if \a reference and \a data are identical,
|
||||
//! otherwise CUTFalse
|
||||
//! @param reference handle to the reference data / gold image
|
||||
//! @param data handle to the computed data
|
||||
//! @param len number of elements in reference and data
|
||||
//! @param threshold tolerance % # of comparison errors (0.15f = 15%)
|
||||
////////////////////////////////////////////////////////////////////////////////
|
||||
DLL_MAPPING
|
||||
CUTBoolean CUTIL_API cutCompareuit(const unsigned int *reference,
|
||||
const unsigned int *data,
|
||||
const unsigned int len, const float epsilon,
|
||||
const float threshold);
|
||||
|
||||
////////////////////////////////////////////////////////////////////////////
|
||||
//! Compare two unsigned char arrays
|
||||
//! @return CUTTrue if \a reference and \a data are identical,
|
||||
//! otherwise CUTFalse
|
||||
//! @param reference handle to the reference data / gold image
|
||||
//! @param data handle to the computed data
|
||||
//! @param len number of elements in reference and data
|
||||
////////////////////////////////////////////////////////////////////////////
|
||||
DLL_MAPPING
|
||||
CUTBoolean CUTIL_API cutCompareub(const unsigned char *reference,
|
||||
const unsigned char *data,
|
||||
const unsigned int len);
|
||||
|
||||
////////////////////////////////////////////////////////////////////////////////
|
||||
//! Compare two unsigned char arrays with a tolerance for # of byte errors
|
||||
//! @return CUTTrue if \a reference and \a data are identical,
|
||||
//! otherwise CUTFalse
|
||||
//! @param reference handle to the reference data / gold image
|
||||
//! @param data handle to the computed data
|
||||
//! @param len number of elements in reference and data
|
||||
//! @param epsilon epsilon to use for the comparison
|
||||
//! @param threshold tolerance % # of comparison errors (0.15f = 15%)
|
||||
////////////////////////////////////////////////////////////////////////////////
|
||||
DLL_MAPPING
|
||||
CUTBoolean CUTIL_API cutCompareubt(const unsigned char *reference,
|
||||
const unsigned char *data,
|
||||
const unsigned int len, const float epsilon,
|
||||
const float threshold);
|
||||
|
||||
////////////////////////////////////////////////////////////////////////////////
|
||||
//! Compare two unsigned char arrays with an epsilon tolerance for equality
|
||||
//! @return CUTTrue if \a reference and \a data are identical,
|
||||
//! otherwise CUTFalse
|
||||
//! @param reference handle to the reference data / gold image
|
||||
//! @param data handle to the computed data
|
||||
//! @param len number of elements in reference and data
|
||||
//! @param epsilon epsilon to use for the comparison
|
||||
////////////////////////////////////////////////////////////////////////////////
|
||||
DLL_MAPPING
|
||||
CUTBoolean CUTIL_API cutCompareube(const unsigned char *reference,
|
||||
const unsigned char *data,
|
||||
const unsigned int len, const float epsilon);
|
||||
|
||||
////////////////////////////////////////////////////////////////////////////
|
||||
//! Compare two float arrays with an epsilon tolerance for equality
|
||||
//! @return CUTTrue if \a reference and \a data are identical,
|
||||
//! otherwise CUTFalse
|
||||
//! @param reference handle to the reference data / gold image
|
||||
//! @param data handle to the computed data
|
||||
//! @param len number of elements in reference and data
|
||||
//! @param epsilon epsilon to use for the comparison
|
||||
////////////////////////////////////////////////////////////////////////////
|
||||
DLL_MAPPING
|
||||
CUTBoolean CUTIL_API cutComparefe(const float *reference, const float *data,
|
||||
const unsigned int len, const float epsilon);
|
||||
|
||||
////////////////////////////////////////////////////////////////////////////////
|
||||
//! Compare two float arrays with an epsilon tolerance for equality and a
|
||||
//! threshold for # pixel errors
|
||||
//! @return CUTTrue if \a reference and \a data are identical,
|
||||
//! otherwise CUTFalse
|
||||
//! @param reference handle to the reference data / gold image
|
||||
//! @param data handle to the computed data
|
||||
//! @param len number of elements in reference and data
|
||||
//! @param epsilon epsilon to use for the comparison
|
||||
////////////////////////////////////////////////////////////////////////////////
|
||||
DLL_MAPPING
|
||||
CUTBoolean CUTIL_API cutComparefet(const float *reference, const float *data,
|
||||
const unsigned int len, const float epsilon,
|
||||
const float threshold);
|
||||
|
||||
////////////////////////////////////////////////////////////////////////////
|
||||
//! Compare two float arrays using L2-norm with an epsilon tolerance for
|
||||
//! equality
|
||||
//! @return CUTTrue if \a reference and \a data are identical,
|
||||
//! otherwise CUTFalse
|
||||
//! @param reference handle to the reference data / gold image
|
||||
//! @param data handle to the computed data
|
||||
//! @param len number of elements in reference and data
|
||||
//! @param epsilon epsilon to use for the comparison
|
||||
////////////////////////////////////////////////////////////////////////////
|
||||
DLL_MAPPING
|
||||
CUTBoolean CUTIL_API cutCompareL2fe(const float *reference, const float *data,
|
||||
const unsigned int len,
|
||||
const float epsilon);
|
||||
|
||||
////////////////////////////////////////////////////////////////////////////////
|
||||
//! Compare two PPM image files with an epsilon tolerance for equality
|
||||
//! @return CUTTrue if \a reference and \a data are identical,
|
||||
//! otherwise CUTFalse
|
||||
//! @param src_file filename for the image to be compared
|
||||
//! @param ref_file filename for the reference data / gold image
|
||||
//! @param epsilon epsilon to use for the comparison
|
||||
//! @param threshold threshold of pixels that can still mismatch to pass (i.e.
|
||||
//! 0.15f = 15% must pass) @param verboseErrors output details of image mismatch
|
||||
//! to std::err
|
||||
////////////////////////////////////////////////////////////////////////////////
|
||||
DLL_MAPPING
|
||||
CUTBoolean CUTIL_API cutComparePPM(const char *src_file, const char *ref_file,
|
||||
const float epsilon, const float threshold,
|
||||
bool verboseErrors = false);
|
||||
|
||||
////////////////////////////////////////////////////////////////////////////
|
||||
//! Timer functionality
|
||||
|
||||
////////////////////////////////////////////////////////////////////////////
|
||||
//! Create a new timer
|
||||
//! @return CUTTrue if a timer has been created, otherwise false
|
||||
//! @param name name of the new timer, 0 if the creation failed
|
||||
////////////////////////////////////////////////////////////////////////////
|
||||
DLL_MAPPING
|
||||
CUTBoolean CUTIL_API cutCreateTimer(unsigned int *name);
|
||||
|
||||
////////////////////////////////////////////////////////////////////////////
|
||||
//! Delete a timer
|
||||
//! @return CUTTrue if a timer has been deleted, otherwise false
|
||||
//! @param name of the timer to delete
|
||||
////////////////////////////////////////////////////////////////////////////
|
||||
DLL_MAPPING
|
||||
CUTBoolean CUTIL_API cutDeleteTimer(unsigned int name);
|
||||
|
||||
////////////////////////////////////////////////////////////////////////////
|
||||
//! Start the timer with name \a name
|
||||
//! @param name name of the timer to start
|
||||
////////////////////////////////////////////////////////////////////////////
|
||||
DLL_MAPPING
|
||||
CUTBoolean CUTIL_API cutStartTimer(const unsigned int name);
|
||||
|
||||
////////////////////////////////////////////////////////////////////////////
|
||||
//! Stop the timer with name \a name. Does not reset.
|
||||
//! @param name name of the timer to stop
|
||||
////////////////////////////////////////////////////////////////////////////
|
||||
DLL_MAPPING
|
||||
CUTBoolean CUTIL_API cutStopTimer(const unsigned int name);
|
||||
|
||||
////////////////////////////////////////////////////////////////////////////
|
||||
//! Resets the timer's counter.
|
||||
//! @param name name of the timer to reset.
|
||||
////////////////////////////////////////////////////////////////////////////
|
||||
DLL_MAPPING
|
||||
CUTBoolean CUTIL_API cutResetTimer(const unsigned int name);
|
||||
|
||||
////////////////////////////////////////////////////////////////////////////
|
||||
//! Returns total execution time in milliseconds for the timer over all
|
||||
//! runs since the last reset or timer creation.
|
||||
//! @param name name of the timer to return the time of
|
||||
////////////////////////////////////////////////////////////////////////////
|
||||
DLL_MAPPING
|
||||
float CUTIL_API cutGetTimerValue(const unsigned int name);
|
||||
|
||||
////////////////////////////////////////////////////////////////////////////
|
||||
//! Return the average time in milliseconds for timer execution as the
|
||||
//! total time for the timer divided by the number of completed (stopped)
|
||||
//! runs the timer has made.
|
||||
//! Excludes the current running time if the timer is currently running.
|
||||
//! @param name name of the timer to return the time of
|
||||
////////////////////////////////////////////////////////////////////////////
|
||||
DLL_MAPPING
|
||||
float CUTIL_API cutGetAverageTimerValue(const unsigned int name);
|
||||
|
||||
////////////////////////////////////////////////////////////////////////////
|
||||
//! Macros
|
||||
|
||||
//! Synchronization call selected by toolkit version: cudaDeviceSynchronize()
//! when CUDART_VERSION >= 4000, cudaThreadSynchronize() otherwise.
//! Note that the expansion carries its own trailing semicolon.
#if CUDART_VERSION < 4000
#define CUT_DEVICE_SYNCHRONIZE() cudaThreadSynchronize();
#else
#define CUT_DEVICE_SYNCHRONIZE() cudaDeviceSynchronize();
#endif
|
||||
|
||||
//! Reset/teardown call selected by toolkit version: cudaDeviceReset() when
//! CUDART_VERSION >= 4000, cudaThreadExit() otherwise.
//! Note that the expansion carries its own trailing semicolon.
#if CUDART_VERSION < 4000
#define CUT_DEVICE_RESET() cudaThreadExit();
#else
#define CUT_DEVICE_RESET() cudaDeviceReset();
#endif
|
||||
|
||||
//! CUT_BANK_CHECKER(array, index): element access for the CUTIL bank checker.
//!  - _DEBUG builds with __DEVICE_EMULATION__ nonzero: calls
//!    cutCheckBankAccess() with the thread index, block dimensions, source
//!    location, the stringized array expression and the index, then yields
//!    the element through the comma operator.
//!  - every other build: expands to a plain subscript.
//! The array argument is parenthesized so that pointer expressions such as
//! CUT_BANK_CHECKER(p + 1, i) subscript the whole expression rather than its
//! last operand. The result remains an lvalue.
#ifdef _DEBUG
#if __DEVICE_EMULATION__
// Interface for bank conflict checker
#define CUT_BANK_CHECKER(array, index) \
  (cutCheckBankAccess(threadIdx.x, threadIdx.y, threadIdx.z, blockDim.x, \
                      blockDim.y, blockDim.z, __FILE__, __LINE__, #array, \
                      index), \
   (array)[index])
#else
#define CUT_BANK_CHECKER(array, index) (array)[index]
#endif
#else
#define CUT_BANK_CHECKER(array, index) (array)[index]
#endif
|
||||
|
||||
//! Run a CUDA driver API call and abort on failure.
//! The call is evaluated once; any result other than CUDA_SUCCESS is printed
//! to stderr (result code in hex, file, line) and the process exits with
//! EXIT_FAILURE.
#define CU_SAFE_CALL_NO_SYNC(call) \
  { \
    CUresult cuSafeCallStatus_ = call; \
    if (cuSafeCallStatus_ != CUDA_SUCCESS) { \
      fprintf(stderr, "Cuda driver error %x in file '%s' in line %i.\n", \
              cuSafeCallStatus_, __FILE__, __LINE__); \
      exit(EXIT_FAILURE); \
    } \
  }

//! Same check as CU_SAFE_CALL_NO_SYNC; the expansion already ends in ';'.
#define CU_SAFE_CALL(call) CU_SAFE_CALL_NO_SYNC(call);
|
||||
|
||||
//! Call cuCtxSynchronize() and abort on failure.
//! Any result other than CUDA_SUCCESS is printed to stderr (result code in
//! hex, file, line) and the process exits with EXIT_FAILURE.
#define CU_SAFE_CTX_SYNC() \
  { \
    CUresult cuCtxSyncStatus_ = cuCtxSynchronize(); \
    if (cuCtxSyncStatus_ != CUDA_SUCCESS) { \
      fprintf(stderr, "Cuda driver error %x in file '%s' in line %i.\n", \
              cuCtxSyncStatus_, __FILE__, __LINE__); \
      exit(EXIT_FAILURE); \
    } \
  }
|
||||
|
||||
//! Run a CUDA runtime API call and abort on failure.
//! The call is evaluated once; any result other than cudaSuccess is reported
//! on stderr (file, line, cudaGetErrorString() text) and the process exits
//! with EXIT_FAILURE.
#define CUDA_SAFE_CALL_NO_SYNC(call) \
  { \
    cudaError cudaSafeCallStatus_ = call; \
    if (cudaSafeCallStatus_ != cudaSuccess) { \
      fprintf(stderr, "Cuda error in file '%s' in line %i : %s.\n", __FILE__, \
              __LINE__, cudaGetErrorString(cudaSafeCallStatus_)); \
      exit(EXIT_FAILURE); \
    } \
  }

//! Same check as CUDA_SAFE_CALL_NO_SYNC; the expansion already ends in ';'.
#define CUDA_SAFE_CALL(call) CUDA_SAFE_CALL_NO_SYNC(call);
|
||||
|
||||
//! Synchronize through CUT_DEVICE_SYNCHRONIZE() and report a failure.
//! Unlike the *_SAFE_CALL macros this one only prints the error (file, line,
//! cudaGetErrorString() text) to stderr; it does not terminate the process.
#define CUDA_SAFE_THREAD_SYNC() \
  { \
    cudaError cudaThreadSyncStatus_ = CUT_DEVICE_SYNCHRONIZE(); \
    if (cudaThreadSyncStatus_ != cudaSuccess) { \
      fprintf(stderr, "Cuda error in file '%s' in line %i : %s.\n", __FILE__, \
              __LINE__, cudaGetErrorString(cudaThreadSyncStatus_)); \
    } \
  }
|
||||
|
||||
//! Run a CUFFT call and abort on failure.
//! The call is evaluated once; any result other than CUFFT_SUCCESS is
//! reported on stderr (file, line) and the process exits with EXIT_FAILURE.
#define CUFFT_SAFE_CALL(call) \
  { \
    cufftResult cufftSafeCallStatus_ = call; \
    if (cufftSafeCallStatus_ != CUFFT_SUCCESS) { \
      fprintf(stderr, "CUFFT error in file '%s' in line %i.\n", __FILE__, \
              __LINE__); \
      exit(EXIT_FAILURE); \
    } \
  }
|
||||
|
||||
//! Run a CUTIL call and abort unless it yields CUTTrue.
//! On any other value the file and line are printed to stderr and the
//! process exits with EXIT_FAILURE.
//! The argument is parenthesized so that expressions with operators of lower
//! precedence than '!=' (e.g. `a & b`, `c ? x : y`) are compared as a whole.
//! NOTE: the expansion is a bare `if` statement (kept for compatibility with
//! call sites that omit the trailing semicolon); do not place it directly in
//! front of an `else`.
#define CUT_SAFE_CALL(call) \
  if (CUTTrue != (call)) { \
    fprintf(stderr, "Cut error in file '%s' in line %i.\n", __FILE__, \
            __LINE__); \
    exit(EXIT_FAILURE); \
  }
|
||||
|
||||
//! Check for a pending CUDA error.
//! Reads cudaGetLastError(); if it is not cudaSuccess, prints errorMessage
//! together with file, line and cudaGetErrorString() text to stderr and exits
//! with EXIT_FAILURE.
//! In _DEBUG builds the macro additionally synchronizes through
//! CUT_DEVICE_SYNCHRONIZE() and applies the same check to that result.
#ifndef _DEBUG
#define CUT_CHECK_ERROR(errorMessage) \
  { \
    cudaError_t cutCheckStatus_ = cudaGetLastError(); \
    if (cutCheckStatus_ != cudaSuccess) { \
      fprintf(stderr, "Cuda error: %s in file '%s' in line %i : %s.\n", \
              errorMessage, __FILE__, __LINE__, \
              cudaGetErrorString(cutCheckStatus_)); \
      exit(EXIT_FAILURE); \
    } \
  }
#else
#define CUT_CHECK_ERROR(errorMessage) \
  { \
    cudaError_t cutCheckStatus_ = cudaGetLastError(); \
    if (cutCheckStatus_ != cudaSuccess) { \
      fprintf(stderr, "Cuda error: %s in file '%s' in line %i : %s.\n", \
              errorMessage, __FILE__, __LINE__, \
              cudaGetErrorString(cutCheckStatus_)); \
      exit(EXIT_FAILURE); \
    } \
    cutCheckStatus_ = CUT_DEVICE_SYNCHRONIZE(); \
    if (cutCheckStatus_ != cudaSuccess) { \
      fprintf(stderr, "Cuda error: %s in file '%s' in line %i : %s.\n", \
              errorMessage, __FILE__, __LINE__, \
              cudaGetErrorString(cutCheckStatus_)); \
      exit(EXIT_FAILURE); \
    } \
  }
#endif
|
||||
|
||||
//! Check for malloc error.
//! Evaluates mallocCall once; if the result is null/zero, prints the file and
//! line to stderr and exits with EXIT_FAILURE.
//! The body is a proper do { ... } while (0) statement (the leading `do` was
//! missing, which left a block followed by a stray empty loop). The trailing
//! semicolon is kept so that existing call sites that omit their own
//! semicolon continue to compile.
#define CUT_SAFE_MALLOC(mallocCall) \
  do { \
    if (!(mallocCall)) { \
      fprintf(stderr, "Host malloc failure in file '%s' in line %i\n", \
              __FILE__, __LINE__); \
      exit(EXIT_FAILURE); \
    } \
  } while (0);
|
||||
|
||||
//! Check if condition is true (flexible assert).
//! Passes val with the current file and line to cutCheckCondition(); when
//! that returns CUTFalse the process exits with EXIT_FAILURE.
//! NOTE: the expansion is a bare `if` statement.
#define CUT_CONDITION(val) \
  if (cutCheckCondition(val, __FILE__, __LINE__) == CUTFalse) { \
    exit(EXIT_FAILURE); \
  }
|
||||
|
||||
#if __DEVICE_EMULATION__

//! When __DEVICE_EMULATION__ is nonzero the macro expands to nothing.
#define CUT_DEVICE_INIT(ARGC, ARGV)

#else

//! Select the CUDA device to use (runtime API).
//!  1. Queries the device count; exits with EXIT_FAILURE if it is zero.
//!  2. Reads the "device" command line argument through
//!     cutGetCmdLineArgumenti() (default 0) and clamps it to
//!     [0, deviceCount - 1].
//!  3. Fetches the device properties and, unless the "quiet" flag is given,
//!     prints the chosen device index and name to stderr.
//!  4. Calls cudaSetDevice() on the chosen index.
//! Every CUDA call is wrapped in CUDA_SAFE_CALL / CUDA_SAFE_CALL_NO_SYNC.
#define CUT_DEVICE_INIT(ARGC, ARGV) \
  { \
    int deviceCount; \
    CUDA_SAFE_CALL_NO_SYNC(cudaGetDeviceCount(&deviceCount)); \
    if (0 == deviceCount) { \
      fprintf(stderr, "cutil error: no devices supporting CUDA.\n"); \
      exit(EXIT_FAILURE); \
    } \
    int dev = 0; \
    cutGetCmdLineArgumenti(ARGC, (const char **)ARGV, "device", &dev); \
    if (dev < 0) { \
      dev = 0; \
    } \
    if (dev >= deviceCount) { \
      dev = deviceCount - 1; \
    } \
    cudaDeviceProp deviceProp; \
    CUDA_SAFE_CALL_NO_SYNC(cudaGetDeviceProperties(&deviceProp, dev)); \
    if (CUTFalse == cutCheckCmdLineFlag(ARGC, (const char **)ARGV, "quiet")) { \
      fprintf(stderr, "Using device %d: %s\n", dev, deviceProp.name); \
    } \
    CUDA_SAFE_CALL(cudaSetDevice(dev)); \
  }
|
||||
|
||||
//! Check for CUDA context lost
|
||||
/* Two-stage error check:
 *  1. report and exit if cudaGetLastError() returns anything but cudaSuccess;
 *  2. run CUT_DEVICE_SYNCHRONIZE() and report and exit if that fails too.
 * Both failures print errorMessage, the call site and the CUDA error string
 * to stderr before exiting with EXIT_FAILURE. */
#define CUDA_CHECK_CTX_LOST(errorMessage)                                      \
  {                                                                            \
    cudaError_t err = cudaGetLastError();                                      \
    if (err != cudaSuccess) {                                                  \
      fprintf(stderr, "Cuda error: %s in file '%s' in line %i : %s.\n",        \
              errorMessage, __FILE__, __LINE__, cudaGetErrorString(err));      \
      exit(EXIT_FAILURE);                                                      \
    }                                                                          \
    err = CUT_DEVICE_SYNCHRONIZE();                                            \
    if (err != cudaSuccess) {                                                  \
      fprintf(stderr, "Cuda error: %s in file '%s' in line %i : %s.\n",        \
              errorMessage, __FILE__, __LINE__, cudaGetErrorString(err));      \
      exit(EXIT_FAILURE);                                                      \
    }                                                                          \
  }
|
||||
|
||||
//! Check for CUDA context lost
|
||||
/* Same shape as CUDA_CHECK_CTX_LOST, but the first test compares the runtime
 * error against CUDA_ERROR_INVALID_CONTEXT and exits whenever the two DIFFER.
 * NOTE(review): as written this also exits when the last error is
 * cudaSuccess, and it compares a cudaError_t with a driver-API constant;
 * this looks unintended, so confirm before relying on this macro.
 * The second stage exits when CUT_DEVICE_SYNCHRONIZE() does not return
 * cudaSuccess. */
#define CU_CHECK_CTX_LOST(errorMessage)                                        \
  {                                                                            \
    cudaError_t err = cudaGetLastError();                                      \
    if (err != CUDA_ERROR_INVALID_CONTEXT) {                                   \
      fprintf(stderr, "Cuda error: %s in file '%s' in line %i : %s.\n",        \
              errorMessage, __FILE__, __LINE__, cudaGetErrorString(err));      \
      exit(EXIT_FAILURE);                                                      \
    }                                                                          \
    err = CUT_DEVICE_SYNCHRONIZE();                                            \
    if (err != cudaSuccess) {                                                  \
      fprintf(stderr, "Cuda error: %s in file '%s' in line %i : %s.\n",        \
              errorMessage, __FILE__, __LINE__, cudaGetErrorString(err));      \
      exit(EXIT_FAILURE);                                                      \
    }                                                                          \
  }
|
||||
|
||||
#endif
|
||||
|
||||
/* Driver-API counterpart of CUT_DEVICE_INIT: stores the selected device
 * handle in cuDevice.
 *  - cuInit(0) is attempted; the device count is queried only when it
 *    succeeds, so an init failure is reported as "no devices";
 *  - the "device" option is clamped into [0, deviceCount - 1];
 *  - the device name is printed to stderr unless "quiet" is given.
 * The return value of cuDeviceGetName is not checked. */
#define CUT_DEVICE_INIT_DRV(cuDevice, ARGC, ARGV)                              \
  {                                                                            \
    cuDevice = 0;                                                              \
    int deviceCount = 0;                                                       \
    CUresult err = cuInit(0);                                                  \
    if (err == CUDA_SUCCESS) {                                                 \
      CU_SAFE_CALL_NO_SYNC(cuDeviceGetCount(&deviceCount));                    \
    }                                                                          \
    if (deviceCount == 0) {                                                    \
      fprintf(stderr, "cutil error: no devices supporting CUDA\n");            \
      exit(EXIT_FAILURE);                                                      \
    }                                                                          \
    int dev = 0;                                                               \
    cutGetCmdLineArgumenti(ARGC, (const char **)ARGV, "device", &dev);         \
    if (dev < 0) {                                                             \
      dev = 0;                                                                 \
    }                                                                          \
    if (dev > deviceCount - 1) {                                               \
      dev = deviceCount - 1;                                                   \
    }                                                                          \
    CU_SAFE_CALL_NO_SYNC(cuDeviceGet(&cuDevice, dev));                         \
    char name[100];                                                            \
    cuDeviceGetName(name, 100, cuDevice);                                      \
    if (cutCheckCmdLineFlag(ARGC, (const char **)ARGV, "quiet") == CUTFalse) { \
      fprintf(stderr, "Using device %d: %s\n", dev, name);                     \
    }                                                                          \
  }
|
||||
|
||||
/* Terminate the program with EXIT_SUCCESS. Unless the "noprompt" flag is
 * found by cutCheckCmdLineFlag, first print a prompt, flush stdout/stderr
 * and wait for one character on stdin.
 * NOTE: expands to two statements (an `if` followed by `exit`), so it must
 * not be used as the body of an unbraced conditional. */
#define CUT_EXIT(argc, argv)                                                   \
  if (!cutCheckCmdLineFlag(argc, (const char **)argv, "noprompt")) {           \
    printf("\nPress ENTER to exit...\n");                                      \
    fflush(stdout);                                                            \
    fflush(stderr);                                                            \
    getchar();                                                                 \
  }                                                                            \
  exit(EXIT_SUCCESS);
|
||||
|
||||
#endif // #ifndef _CUTIL_H_
|
|
@ -1,104 +0,0 @@
|
|||
/*
|
||||
* Copyright 1993-2010 NVIDIA Corporation. All rights reserved.
|
||||
*
|
||||
* NVIDIA Corporation and its licensors retain all intellectual property and *
|
||||
* proprietary rights in and to this software and related documentation. Any
|
||||
* use, reproduction, disclosure, or distribution of this software and related
|
||||
* documentation without an express license agreement from NVIDIA Corporation is
|
||||
* strictly prohibited.
|
||||
*
|
||||
* Please refer to the applicable NVIDIA end user license agreement (EULA)
|
||||
* associated with this source code for terms and conditions that govern
|
||||
* your use of this NVIDIA software.
|
||||
*
|
||||
*/
|
||||
|
||||
#include <iostream>
|
||||
#include <stdio.h>
|
||||
|
||||
// Forward the result of a CUDA runtime call, together with the call site,
// to gpuAssert().
#define CHECK(ans)                                                             \
  {                                                                            \
    gpuAssert((ans), __FILE__, __LINE__);                                      \
  }
|
||||
// Report a failed CUDA runtime call.
//   code  - status returned by the call
//   file  - source file of the call site
//   line  - source line of the call site
//   abort - when true (default) the process exits with `code` as exit status
// Nothing happens when code is cudaSuccess.
inline void gpuAssert(cudaError_t code, const char *file, int line,
                      bool abort = true) {
  if (code == cudaSuccess)
    return;

  fprintf(stderr, "GPUassert: %s %s %d\n", cudaGetErrorString(code), file,
          line);
  if (abort)
    exit(code);
}
|
||||
|
||||
using namespace std;
|
||||
|
||||
#define SIZE (100 * 1024 * 1024)
|
||||
|
||||
// Accumulate a 256-bin byte histogram of buffer[0 .. size) into histo.
//   buffer - input bytes
//   size   - number of bytes to read from buffer
//   histo  - 256 counters; this kernel only ever adds to them
// Each block builds a private histogram in shared memory and then adds it to
// histo with atomics. The bin loops stride by blockDim.x, so the kernel is
// correct for any 1-D block size (the previous version silently required
// exactly 256 threads per block). Indices are `long` to match `size`.
__global__ void histo_kernel(unsigned char *buffer, long size,
                             unsigned int *histo) {
  __shared__ unsigned int temp[256];

  // Clear the block-private bins.
  for (unsigned int bin = threadIdx.x; bin < 256; bin += blockDim.x)
    temp[bin] = 0;
  __syncthreads();

  // Every thread walks the input with a stride of the total thread count.
  long i = threadIdx.x + (long)blockIdx.x * blockDim.x;
  const long stride = (long)blockDim.x * gridDim.x;
  while (i < size) {
    atomicAdd(&temp[buffer[i]], 1);
    i += stride;
  }
  __syncthreads();

  // Fold the block-private bins into the global histogram.
  for (unsigned int bin = threadIdx.x; bin < 256; bin += blockDim.x)
    atomicAdd(&(histo[bin]), temp[bin]);
}
|
||||
|
||||
int runHisto(char *file, unsigned int *freq, unsigned int memSize,
|
||||
unsigned int *source) {
|
||||
|
||||
FILE *f = fopen(file, "rb");
|
||||
if (!f) {
|
||||
perror(file);
|
||||
exit(1);
|
||||
}
|
||||
fseek(f, 0, SEEK_SET);
|
||||
size_t result = fread(source, 1, memSize, f);
|
||||
if (result != memSize)
|
||||
fputs("Cannot read input file", stderr);
|
||||
|
||||
fclose(f);
|
||||
|
||||
unsigned char *buffer = (unsigned char *)source;
|
||||
|
||||
int blocks = 2;
|
||||
|
||||
// allocate memory on the GPU for the file's data
|
||||
int partSize = memSize / 32;
|
||||
int totalNum = memSize / sizeof(unsigned int);
|
||||
int partialNum = partSize / sizeof(unsigned int);
|
||||
|
||||
unsigned char *dev_buffer0;
|
||||
unsigned char *dev_buffer1;
|
||||
unsigned int *dev_histo;
|
||||
cudaMalloc((void **)&dev_buffer0, partSize);
|
||||
cudaMalloc((void **)&dev_buffer1, partSize);
|
||||
cudaMalloc((void **)&dev_histo, 256 * sizeof(int));
|
||||
cudaMemset(dev_histo, 0, 256 * sizeof(int));
|
||||
|
||||
for (int i = 0; i < totalNum; i += partialNum * 2) {
|
||||
CHECK(
|
||||
cudaMemcpy(dev_buffer0, buffer + i, partSize, cudaMemcpyHostToDevice));
|
||||
CHECK(cudaMemcpy(dev_buffer1, buffer + i + partialNum, partSize,
|
||||
cudaMemcpyHostToDevice));
|
||||
|
||||
// kernel launch - 2x the number of mps gave best timing
|
||||
histo_kernel<<<blocks * 2, 256>>>(dev_buffer0, partSize, dev_histo);
|
||||
cudaDeviceSynchronize();
|
||||
histo_kernel<<<blocks * 2, 256>>>(dev_buffer1, partSize, dev_histo);
|
||||
cudaDeviceSynchronize();
|
||||
}
|
||||
cudaMemcpy(freq, dev_histo, 256 * sizeof(int), cudaMemcpyDeviceToHost);
|
||||
|
||||
cudaFree(dev_histo);
|
||||
cudaFree(dev_buffer0);
|
||||
cudaFree(dev_buffer1);
|
||||
return 0;
|
||||
}
|
|
@ -1,90 +0,0 @@
|
|||
#include "stdio.h"
|
||||
#include <algorithm>
|
||||
#include <climits> // for CHAR_BIT
|
||||
#include <iostream>
|
||||
#include <iterator>
|
||||
#include <map>
|
||||
#include <math.h>
|
||||
#include <queue>
|
||||
|
||||
using namespace std;
|
||||
|
||||
const int UniqueSymbols = 1 << CHAR_BIT;
|
||||
// Write the lowest `numbits` bits of `val` to stdout, most significant of
// those bits first, as the characters '0' and '1'.
void printBits(unsigned int val, int numbits) {
  int bit = numbits;
  while (--bit >= 0) {
    const unsigned int one = (val >> bit) & 1;
    putchar('0' + one);
  }
}
|
||||
|
||||
typedef vector<bool> HuffCode;
|
||||
typedef map<unsigned char, HuffCode> HuffCodeMap;
|
||||
|
||||
// Common base of all Huffman tree nodes. It only carries the frequency `f`;
// the constructor is protected so that only derived node types can be
// created, and the destructor is virtual so that a tree can be deleted
// through an INode pointer.
class INode {
public:
  const int f; // frequency of this node

  virtual ~INode() {}

protected:
  INode(int f) : f(f) {}
};
|
||||
|
||||
class InternalNode : public INode {
|
||||
public:
|
||||
INode *const left;
|
||||
INode *const right;
|
||||
|
||||
InternalNode(INode *c0, INode *c1)
|
||||
: INode(c0->f + c1->f), left(c0), right(c1) {}
|
||||
~InternalNode() {
|
||||
delete left;
|
||||
delete right;
|
||||
}
|
||||
};
|
||||
|
||||
class LeafNode : public INode {
|
||||
public:
|
||||
const char c;
|
||||
|
||||
LeafNode(int f, char c) : INode(f), c(c) {}
|
||||
};
|
||||
|
||||
struct NodeCmp {
|
||||
bool operator()(const INode *lhs, const INode *rhs) const {
|
||||
return lhs->f > rhs->f;
|
||||
}
|
||||
};
|
||||
|
||||
// Build a Huffman tree from a table of symbol frequencies.
//   frequencies - one counter per symbol; symbols with a zero count get no
//                 leaf
// Returns the root of a newly allocated tree (the caller deletes it), or NULL
// when every frequency is zero. Previously the all-zero case called top() on
// an empty priority queue, which is undefined behaviour.
INode *BuildTree(unsigned int (&frequencies)[UniqueSymbols]) {
  std::priority_queue<INode *, std::vector<INode *>, NodeCmp> trees;

  for (int i = 0; i < UniqueSymbols; ++i) {
    if (frequencies[i] != 0)
      trees.push(new LeafNode(frequencies[i], (char)i));
  }

  if (trees.empty())
    return NULL;

  // Repeatedly merge the two least frequent sub-trees.
  while (trees.size() > 1) {
    INode *childR = trees.top();
    trees.pop();

    INode *childL = trees.top();
    trees.pop();

    INode *parent = new InternalNode(childR, childL);
    trees.push(parent);
  }
  return trees.top();
}
|
||||
|
||||
void GenerateCodes(const INode *node, const HuffCode &prefix,
|
||||
HuffCodeMap &outCodes) {
|
||||
if (const LeafNode *lf = dynamic_cast<const LeafNode *>(node)) {
|
||||
outCodes[lf->c] = prefix;
|
||||
} else if (const InternalNode *in =
|
||||
dynamic_cast<const InternalNode *>(node)) {
|
||||
HuffCode leftPrefix = prefix;
|
||||
leftPrefix.push_back(false);
|
||||
GenerateCodes(in->left, leftPrefix, outCodes);
|
||||
|
||||
HuffCode rightPrefix = prefix;
|
||||
rightPrefix.push_back(true);
|
||||
GenerateCodes(in->right, rightPrefix, outCodes);
|
||||
}
|
||||
}
|
|
@ -1,65 +0,0 @@
|
|||
#ifndef _LOADTESTDATA_H_
|
||||
#define _LOADTESTDATA_H_
|
||||
|
||||
//#include "testdatagen.h"
|
||||
#include "hist.cu"
|
||||
#include "huffTree.h"
|
||||
|
||||
// Derive the problem size for one test run.
//   file_name         - input file, or NULL for a synthetic size
//   num_block_threads - threads per block
//   num_blocks        - in: requested block count (used when file_name is
//                       NULL); out: block count derived from the file size
//   num_elements      - out: number of symbols
//   mem_size          - out: size of the data in bytes
//   symbol_type_size  - size of one symbol in bytes
// Without a file: num_elements = num_blocks * num_block_threads.
// With a file: mem_size is the file length and the other values follow from
// it (integer division; a partial last block is not counted).
// Exits the process when the file cannot be opened or its length cannot be
// determined. Previously a failing fseek/ftell was silently turned into a
// huge mem_size.
inline void initParams(char *file_name, uint num_block_threads,
                       uint &num_blocks, uint &num_elements, uint &mem_size,
                       uint symbol_type_size) {
  if (file_name == NULL) {
    num_elements = num_blocks * num_block_threads;
    mem_size = num_elements * symbol_type_size;
    return;
  }

  FILE *f = fopen(file_name, "rb");
  if (!f) {
    perror(file_name);
    exit(1);
  }

  long file_len = -1;
  if (fseek(f, 0, SEEK_END) == 0)
    file_len = ftell(f);
  fclose(f);

  if (file_len < 0) {
    perror(file_name);
    exit(1);
  }

  mem_size = (uint)file_len;
  num_elements = mem_size / symbol_type_size;
  // todo add check if we need 1 more block!
  num_blocks = num_elements / num_block_threads;
}
|
||||
|
||||
inline void loadData(char *file_name, uint *sourceData, uint *codewords,
|
||||
uint *codewordlens, uint num_elements, uint mem_size,
|
||||
double &H) {
|
||||
if (file_name == NULL) {
|
||||
printf("No input file\n");
|
||||
exit(-1);
|
||||
} else {
|
||||
unsigned int freqs[UniqueSymbols] = {0};
|
||||
runHisto(file_name, freqs, mem_size, sourceData);
|
||||
INode *root = BuildTree(freqs);
|
||||
|
||||
HuffCodeMap codes;
|
||||
GenerateCodes(root, HuffCode(), codes);
|
||||
delete root;
|
||||
|
||||
for (HuffCodeMap::const_iterator it = codes.begin(); it != codes.end();
|
||||
++it) {
|
||||
unsigned int count = distance(it->second.begin(), it->second.end());
|
||||
for (int i = 0; i < count; i++)
|
||||
if (it->second[i])
|
||||
codewords[(unsigned int)(it->first)] +=
|
||||
(uint)pow(2.0f, (int)count - i - 1);
|
||||
codewordlens[(unsigned int)(it->first)] = count;
|
||||
}
|
||||
|
||||
H = 0.0;
|
||||
for (unsigned int i = 0; i < 256; i++)
|
||||
if (freqs[i] > 0) {
|
||||
double p = (double)freqs[i] / (double)mem_size;
|
||||
H += p * log(p) / log(2.0);
|
||||
}
|
||||
H = -H;
|
||||
printf("\n%s, %u bytes, entropy %f\n\n", file_name, mem_size, H);
|
||||
}
|
||||
}
|
||||
|
||||
#endif
|
File diff suppressed because it is too large
Load Diff
File diff suppressed because one or more lines are too long
|
@ -1,225 +0,0 @@
|
|||
/*
|
||||
* PAVLE - Parallel Variable-Length Encoder for CUDA. Main file.
|
||||
*
|
||||
* Copyright (C) 2009 Ana Balevic <ana.balevic@gmail.com>
|
||||
* All rights reserved.
|
||||
*
|
||||
* This program is free software; you can redistribute it and/or modify it under
|
||||
* the terms of the MIT License. Read the full licence:
|
||||
* http://www.opensource.org/licenses/mit-license.php
|
||||
*
|
||||
* If you find this program useful, please contact me and reference PAVLE home
|
||||
* page in your work.
|
||||
*
|
||||
*/
|
||||
|
||||
#include "comparison_helpers.h"
|
||||
#include "cuda_helpers.h"
|
||||
#include "load_data.h"
|
||||
#include "print_helpers.h"
|
||||
#include "stats_logger.h"
|
||||
#include "stdafx.h"
|
||||
#include <cuda_runtime.h>
|
||||
#include <sys/time.h>
|
||||
|
||||
//#include "vlc_kernel_gm32.cu"
|
||||
//#include "vlc_kernel_sm32.cu"
|
||||
#include "vlc_kernel_sm64huff.cu"
|
||||
//#include "vlc_kernel_dpt.cu"
|
||||
//#include "vlc_kernel_dptt.cu"
|
||||
//#include "scan_kernel.cu"
|
||||
#include "cpuencode.h"
|
||||
#include "pack_kernels.cu"
|
||||
#include "scan.cu"
|
||||
|
||||
// Current wall-clock time from gettimeofday(), in microseconds.
// The seconds are widened to long long BEFORE the multiplication so the
// product cannot overflow where time_t is only 32 bits wide.
long long get_time() {
  struct timeval tv;
  gettimeofday(&tv, NULL);
  return (long long)tv.tv_sec * 1000000LL + (long long)tv.tv_usec;
}
|
||||
void runVLCTest(char *file_name, uint num_block_threads, uint num_blocks = 1);
|
||||
|
||||
extern "C" void cpu_vlc_encode(unsigned int *indata, unsigned int num_elements,
|
||||
unsigned int *outdata, unsigned int *outsize,
|
||||
unsigned int *codewords,
|
||||
unsigned int *codewordlens);
|
||||
|
||||
int main(int argc, char *argv[]) {
|
||||
if (!InitCUDA()) {
|
||||
return 0;
|
||||
}
|
||||
unsigned int num_block_threads = 256;
|
||||
if (argc > 1)
|
||||
for (int i = 1; i < argc; i++)
|
||||
runVLCTest(argv[i], num_block_threads);
|
||||
else {
|
||||
runVLCTest(NULL, num_block_threads, 1024);
|
||||
}
|
||||
return 0;
|
||||
}
|
||||
|
||||
void runVLCTest(char *file_name, uint num_block_threads, uint num_blocks) {
|
||||
printf("CUDA! Starting VLC Tests!\n");
|
||||
unsigned int
|
||||
num_elements; // uint num_elements = num_blocks * num_block_threads;
|
||||
unsigned int mem_size; // uint mem_size = num_elements * sizeof(int);
|
||||
unsigned int symbol_type_size = sizeof(int);
|
||||
//////// LOAD DATA ///////////////
|
||||
double H; // entropy
|
||||
initParams(file_name, num_block_threads, num_blocks, num_elements, mem_size,
|
||||
symbol_type_size);
|
||||
printf("Parameters: num_elements: %d, num_blocks: %d, num_block_threads: "
|
||||
"%d\n----------------------------\n",
|
||||
num_elements, num_blocks, num_block_threads);
|
||||
////////LOAD DATA ///////////////
|
||||
uint *sourceData = (uint *)malloc(mem_size);
|
||||
uint *destData = (uint *)malloc(mem_size);
|
||||
uint *crefData = (uint *)malloc(mem_size);
|
||||
|
||||
uint *codewords = (uint *)malloc(NUM_SYMBOLS * symbol_type_size);
|
||||
uint *codewordlens = (uint *)malloc(NUM_SYMBOLS * symbol_type_size);
|
||||
|
||||
uint *cw32 = (uint *)malloc(mem_size);
|
||||
uint *cw32len = (uint *)malloc(mem_size);
|
||||
uint *cw32idx = (uint *)malloc(mem_size);
|
||||
|
||||
uint *cindex2 = (uint *)malloc(num_blocks * sizeof(int));
|
||||
|
||||
memset(sourceData, 0, mem_size);
|
||||
memset(destData, 0, mem_size);
|
||||
memset(crefData, 0, mem_size);
|
||||
memset(cw32, 0, mem_size);
|
||||
memset(cw32len, 0, mem_size);
|
||||
memset(cw32idx, 0, mem_size);
|
||||
memset(codewords, 0, NUM_SYMBOLS * symbol_type_size);
|
||||
memset(codewordlens, 0, NUM_SYMBOLS * symbol_type_size);
|
||||
memset(cindex2, 0, num_blocks * sizeof(int));
|
||||
//////// LOAD DATA ///////////////
|
||||
loadData(file_name, sourceData, codewords, codewordlens, num_elements,
|
||||
mem_size, H);
|
||||
|
||||
//////// LOAD DATA ///////////////
|
||||
|
||||
unsigned int *d_sourceData, *d_destData, *d_destDataPacked;
|
||||
unsigned int *d_codewords, *d_codewordlens;
|
||||
unsigned int *d_cw32, *d_cw32len, *d_cw32idx, *d_cindex, *d_cindex2;
|
||||
|
||||
CUDA_SAFE_CALL(cudaMalloc((void **)&d_sourceData, mem_size));
|
||||
CUDA_SAFE_CALL(cudaMalloc((void **)&d_destData, mem_size));
|
||||
CUDA_SAFE_CALL(cudaMalloc((void **)&d_destDataPacked, mem_size));
|
||||
|
||||
CUDA_SAFE_CALL(
|
||||
cudaMalloc((void **)&d_codewords, NUM_SYMBOLS * symbol_type_size));
|
||||
CUDA_SAFE_CALL(
|
||||
cudaMalloc((void **)&d_codewordlens, NUM_SYMBOLS * symbol_type_size));
|
||||
|
||||
CUDA_SAFE_CALL(cudaMalloc((void **)&d_cw32, mem_size));
|
||||
CUDA_SAFE_CALL(cudaMalloc((void **)&d_cw32len, mem_size));
|
||||
CUDA_SAFE_CALL(cudaMalloc((void **)&d_cw32idx, mem_size));
|
||||
|
||||
CUDA_SAFE_CALL(
|
||||
cudaMalloc((void **)&d_cindex, num_blocks * sizeof(unsigned int)));
|
||||
CUDA_SAFE_CALL(
|
||||
cudaMalloc((void **)&d_cindex2, num_blocks * sizeof(unsigned int)));
|
||||
// printf("source data\n");
|
||||
// for (int i = 0; i < 200; i++) {
|
||||
// printf("%d ", sourceData[i]);
|
||||
// }
|
||||
// printf("\n");
|
||||
// printf("codewords\n");
|
||||
// for (int i = 0; i < 200; i++) {
|
||||
// printf("%d ", codewords[i]);
|
||||
// }
|
||||
// printf("\n");
|
||||
// printf("codeword lens\n");
|
||||
// for (int i = 0; i < 200; i++) {
|
||||
// printf("%d ", codewordlens[i]);
|
||||
// }
|
||||
// printf("\n");
|
||||
// return;
|
||||
CUDA_SAFE_CALL(
|
||||
cudaMemcpy(d_sourceData, sourceData, mem_size, cudaMemcpyHostToDevice));
|
||||
CUDA_SAFE_CALL(cudaMemcpy(d_codewords, codewords,
|
||||
NUM_SYMBOLS * symbol_type_size,
|
||||
cudaMemcpyHostToDevice));
|
||||
CUDA_SAFE_CALL(cudaMemcpy(d_codewordlens, codewordlens,
|
||||
NUM_SYMBOLS * symbol_type_size,
|
||||
cudaMemcpyHostToDevice));
|
||||
CUDA_SAFE_CALL(
|
||||
cudaMemcpy(d_destData, destData, mem_size, cudaMemcpyHostToDevice));
|
||||
|
||||
dim3 grid_size(num_blocks, 1, 1);
|
||||
dim3 block_size(num_block_threads, 1, 1);
|
||||
unsigned int sm_size;
|
||||
|
||||
unsigned int NT = 10; // number of runs for each execution time
|
||||
|
||||
//////////////////* CPU ENCODER *///////////////////////////////////
|
||||
unsigned int refbytesize;
|
||||
long long timer = get_time();
|
||||
cpu_vlc_encode((unsigned int *)sourceData, num_elements,
|
||||
(unsigned int *)crefData, &refbytesize, codewords,
|
||||
codewordlens);
|
||||
float msec = (float)((get_time() - timer) / 1000.0);
|
||||
printf("CPU Encoding time (CPU): %f (ms)\n", msec);
|
||||
printf("CPU Encoded to %d [B]\n", refbytesize);
|
||||
unsigned int num_ints = refbytesize / 4 + ((refbytesize % 4 == 0) ? 0 : 1);
|
||||
//////////////////* END CPU *///////////////////////////////////
|
||||
|
||||
//////////////////* SM64HUFF KERNEL *///////////////////////////////////
|
||||
grid_size.x = num_blocks;
|
||||
block_size.x = num_block_threads;
|
||||
sm_size = block_size.x * sizeof(unsigned int);
|
||||
#ifdef CACHECWLUT
|
||||
sm_size = 2 * NUM_SYMBOLS * sizeof(int) + block_size.x * sizeof(unsigned int);
|
||||
#endif
|
||||
|
||||
for (int i = 0; i < NT; i++) {
|
||||
vlc_encode_kernel_sm64huff<<<grid_size, block_size>>>(
|
||||
d_sourceData, d_codewords, d_codewordlens,
|
||||
#ifdef TESTING
|
||||
d_cw32, d_cw32len, d_cw32idx,
|
||||
#endif
|
||||
d_destData, d_cindex); // testedOK2
|
||||
cudaThreadSynchronize();
|
||||
}
|
||||
// //////////////////* END KERNEL *///////////////////////////////////
|
||||
|
||||
#ifdef TESTING
|
||||
unsigned int num_scan_elements = grid_size.x;
|
||||
preallocBlockSums(num_scan_elements);
|
||||
cudaMemset(d_destDataPacked, 0, mem_size);
|
||||
printf("Num_blocks to be passed to scan is %d.\n", num_scan_elements);
|
||||
prescanArray(d_cindex2, d_cindex, num_scan_elements);
|
||||
pack2<<<num_scan_elements / 32, 32>>>(
|
||||
(unsigned int *)d_destData, d_cindex, d_cindex2,
|
||||
(unsigned int *)d_destDataPacked, num_elements / num_scan_elements);
|
||||
cudaThreadSynchronize();
|
||||
CUT_CHECK_ERROR("Pack2 Kernel execution failed\n");
|
||||
deallocBlockSums();
|
||||
// return;
|
||||
|
||||
CUDA_SAFE_CALL(
|
||||
cudaMemcpy(destData, d_destDataPacked, mem_size, cudaMemcpyDeviceToHost));
|
||||
compare_vectors((unsigned int *)crefData, (unsigned int *)destData, num_ints);
|
||||
#endif
|
||||
|
||||
free(sourceData);
|
||||
free(destData);
|
||||
free(codewords);
|
||||
free(codewordlens);
|
||||
free(cw32);
|
||||
free(cw32len);
|
||||
free(crefData);
|
||||
CUDA_SAFE_CALL(cudaFree(d_sourceData));
|
||||
CUDA_SAFE_CALL(cudaFree(d_destData));
|
||||
CUDA_SAFE_CALL(cudaFree(d_destDataPacked));
|
||||
CUDA_SAFE_CALL(cudaFree(d_codewords));
|
||||
CUDA_SAFE_CALL(cudaFree(d_codewordlens));
|
||||
CUDA_SAFE_CALL(cudaFree(d_cw32));
|
||||
CUDA_SAFE_CALL(cudaFree(d_cw32len));
|
||||
CUDA_SAFE_CALL(cudaFree(d_cw32idx));
|
||||
CUDA_SAFE_CALL(cudaFree(d_cindex));
|
||||
CUDA_SAFE_CALL(cudaFree(d_cindex2));
|
||||
free(cindex2);
|
||||
}
|
|
@ -1,62 +0,0 @@
|
|||
/*
|
||||
* Copyright Ana Balevic, 2008-2009. All rights reserved.
|
||||
*/
|
||||
#ifndef _PABIO_KERNEL2_H_
|
||||
#define _PABIO_KERNEL2_H_
|
||||
|
||||
#include "parameters.h"
|
||||
|
||||
/* PARALLEL PUT BITS IMPLEMENTATION (CUDA1.1+ compatible)
|
||||
* Set numbits in the destination word out[kc] starting from the position startbit
|
||||
* Implementation comments:
|
||||
 * Second atomic operation actually sets these bits to the value stored in the codeword; the other bits are left untouched
|
||||
 * First atomic operation is a necessary preparation - we change only the bits that will be affected by the codeword to be written to 1s
|
||||
* in order for set bits to work with using atomicand.
|
||||
* TODOs: benchmark performance 1) gm atomics vs sm atomics; 2) memset at init time vs. atomicOr
|
||||
*/
|
||||
/* Write the low `numbits` bits of `codeword` into out[kc], starting at bit
 * position `startbit` counted from the most significant bit.
 *   out      - destination word array
 *   kc       - index of the destination word
 *   startbit - first bit position inside the word (0 = MSB)
 *   numbits  - number of bits to write; precondition startbit+numbits <= 32
 *   codeword - value to write, right-aligned
 * Unless MEMSET0 is defined the target bit range is cleared with atomicAnd
 * first; the value is then merged with atomicOr.
 *
 * Fixes: a request for zero bits returns immediately (it used to shift by up
 * to 32, which is undefined), and the mask for numbits == 32 is built
 * without evaluating 1 << 32. */
__device__ void static put_bits_atomic2(unsigned int *out, unsigned int kc,
                                        unsigned int startbit,
                                        unsigned int numbits,
                                        unsigned int codeword) {
  if (numbits == 0)
    return; // nothing to write

  const unsigned int restbits = 32 - startbit - numbits;

  /* 1. Prepare the memory location */
#ifndef MEMSET0 // Not needed if the destination memory is already all 0s
  unsigned int mask = (numbits >= 32) ? 0xFFFFFFFFu : ((1u << numbits) - 1u);
  mask <<= restbits;          // move the run of 1s onto the target bits
  atomicAnd(&out[kc], ~mask); // clear only the target bits
#endif

  /* 2. Write the codeword */
  atomicOr(&out[kc], codeword << restbits);
}
|
||||
|
||||
|
||||
|
||||
/* PARALLEL PUT BITS IMPLEMENTATION (CUDA1.1+ compatible)
|
||||
 * Checks if the part of the word to be written matches the whole memory location, and if yes, avoids using the atomics.
|
||||
* Experience: no benefits, even a bit slower on CUDA.
|
||||
*/
|
||||
/* Variant of put_bits_atomic2 that stores the word with a plain assignment
 * when the codeword covers the complete 32-bit word (startbit == 0 and no
 * remaining bits), and uses atomicOr otherwise.
 *   out      - destination word array
 *   kc       - index of the destination word
 *   startbit - first bit position inside the word (0 = MSB)
 *   numbits  - number of bits to write; precondition startbit+numbits <= 32
 *   codeword - value to write, right-aligned
 *
 * Fixes: a request for zero bits returns immediately (it used to shift by up
 * to 32, which is undefined), and the mask for numbits == 32 is built
 * without evaluating 1 << 32. */
__device__ void static put_bits_atomic2a(unsigned int *out, unsigned int kc,
                                         unsigned int startbit,
                                         unsigned int numbits,
                                         unsigned int codeword) {
  if (numbits == 0)
    return; // nothing to write

  unsigned int cw32 = codeword;
  const unsigned int restbits = 32 - startbit - numbits;

  /* 1. Prepare the memory location */
#ifndef MEMSET0 // Not needed if the destination memory is already all 0s
  unsigned int mask = (numbits >= 32) ? 0xFFFFFFFFu : ((1u << numbits) - 1u);
  mask <<= restbits;          // move the run of 1s onto the target bits
  atomicAnd(&out[kc], ~mask); // clear only the target bits
#endif

  /* 2. Write the codeword */
  if (startbit == 0 && restbits == 0) {
    out[kc] = cw32; // whole word is ours
  } else {
    cw32 = cw32 << restbits;
    atomicOr(&out[kc], cw32);
  }
}
|
||||
#endif //ifndef _PABIO_KERNEL2_H_
|
|
@ -1,43 +0,0 @@
|
|||
#ifndef _PACK_KERNELS_H_
|
||||
#define _PACK_KERNELS_H_
|
||||
#include "parameters.h"
|
||||
|
||||
// Copy one thread's bit stream from srcData into the packed output dstData.
//   srcData - source words; thread `tid` reads starting at word
//             tid * original_num_block_elements
//   cindex  - cindex[tid] is the length of that stream in bits
//   cindex2 - cindex2[tid] is the destination position in bits
//   dstData - packed destination words
//   original_num_block_elements - source words reserved per thread
// The first and last destination words are merged with atomicOr; the words in
// between are written with plain stores.
__global__ static void pack2(unsigned int *srcData, unsigned int *cindex,
                             unsigned int *cindex2, unsigned int *dstData,
                             unsigned int original_num_block_elements) {
  const unsigned int tid = blockDim.x * blockIdx.x + threadIdx.x;

  // source index
  const unsigned int offset = tid * original_num_block_elements; // DPB,
  const unsigned int bitsize = cindex[tid];

  // destination index
  const unsigned int pos = cindex2[tid];
  const unsigned int dword = pos / 32;
  const unsigned int bit = pos % 32;

  unsigned int word = srcData[offset]; // first source word
  unsigned int carry = word >> bit;    // the part that fits the first slot
  atomicOr(&dstData[dword], carry);    // fill up this initial location
  // bits that spill into the next destination word
  carry = (bit == 0) ? 0 : (word << (32 - bit));

  unsigned int i = 1;
  while (i < bitsize / 32) { // complete destination words, plain stores
    word = srcData[offset + i];
    carry |= word >> bit;
    dstData[dword + i] = carry;
    carry = (bit == 0) ? 0 : (word << (32 - bit));
    i++;
  }

  // Flush the pending bits, unless bit is 0 and bitsize is a multiple of 32.
  if (bit != 0 || bitsize % 32 != 0)
    atomicOr(&dstData[dword + i], carry);

  // Trailing partial source word.
  if (bitsize % 32 != 0) {
    word = srcData[offset + i];
    atomicOr(&dstData[dword + i], word >> bit);
    atomicOr(&dstData[dword + i + 1], (bit == 0) ? 0 : (word << (32 - bit)));
  }
}
|
||||
|
||||
#endif
|
|
@ -1,27 +0,0 @@
|
|||
#ifndef _PARAMS_H_
|
||||
#define _PARAMS_H_
|
||||
|
||||
typedef unsigned int uint;
|
||||
typedef unsigned char uint8;
|
||||
|
||||
#define BENCH 0
|
||||
/* 0 - MEASURE TIME, NO TESTING
|
||||
** 1 - TEST
|
||||
** 2 - TEST & VERBOSE
|
||||
*/
|
||||
#define TESTING
|
||||
|
||||
#define DPT 4 // data (dwords) per thread
|
||||
|
||||
#define CACHECWLUT // MAX DPT = 8
|
||||
//#define CACHESRCDATA // MAX DPT = 4
|
||||
|
||||
#define SMATOMICS
|
||||
|
||||
#define MEMSET0
|
||||
|
||||
#define MAX_SM_BLOCK_SIZE_GPU 16384 // B
|
||||
|
||||
#define NUM_SYMBOLS 256 // fixed to 256.
|
||||
|
||||
#endif
|
|
@ -1,217 +0,0 @@
|
|||
#ifndef _PRINT_HELPERS_H_
|
||||
#define _PRINT_HELPERS_H_
|
||||
|
||||
#include "parameters.h"
|
||||
#include <stdio.h>
|
||||
|
||||
// Dump `num_ints` words of `data` to the text file `filename`, one word per
// line as 32 binary digits, most significant bit first.
// When the file cannot be opened the error is reported with perror() and
// nothing is written (previously the NULL stream was used).
__inline void printdbg_data_bin(const char *filename, unsigned int *data,
                                unsigned int num_ints) {
  FILE *dump = fopen(filename, "wt");
  if (dump == NULL) {
    perror(filename);
    return;
  }
  for (unsigned int i = 0; i < num_ints; i++) {
    for (unsigned int mask = 0x80000000u; mask != 0; mask >>= 1)
      fputc((data[i] & mask) ? '1' : '0', dump);
    fputc('\n', dump);
  }
  fclose(dump);
}
|
||||
// Dump `num_ints` words of `data` to the text file `filename` as
// "<index>: <value>" lines.
// Values are unsigned and are therefore printed with %u (they used to be
// printed with %d, showing large values as negative). When the file cannot be
// opened the error is reported with perror() and nothing is written.
__inline void printdbg_data_int(const char *filename, unsigned int *data,
                                unsigned int num_ints) {
  FILE *dump = fopen(filename, "wt");
  if (dump == NULL) {
    perror(filename);
    return;
  }
  for (unsigned int i = 0; i < num_ints; i++)
    fprintf(dump, "%u: %u\n", i, data[i]);
  fclose(dump);
}
|
||||
|
||||
// Write one line per element to the open stream `gpudump`: the bit position
// cw32idx[i], the word index and start bit derived from it, the codeword
// length cw32len[i], and the codeword cw32[i] as cw32len[i] binary digits
// (most significant of those bits first).
// The digits are produced with per-bit shifts below 32, so a length of 0 no
// longer triggers a shift by 32 (undefined); for lengths above 32 the extra
// leading positions are printed as '0'. Unsigned values are printed with %u.
__inline void printdbg_gpu_data_detailed(FILE *gpudump, unsigned int *cw32,
                                         unsigned int *cw32len,
                                         unsigned int *cw32idx,
                                         unsigned int num_elements) {
  for (unsigned int i = 0; i < num_elements; i++) {
    fprintf(gpudump, "bp: %u, kc: %u, startbit: %u, cwlen: %u, cw:\t\t",
            cw32idx[i], cw32idx[i] / 32, cw32idx[i] % 32, cw32len[i]);
    // print codeword:
    for (unsigned int j = cw32len[i]; j-- > 0;) {
      const unsigned int one = (j < 32) ? ((cw32[i] >> j) & 1u) : 0u;
      fputc(one ? '1' : '0', gpudump);
    }
    fputc('\n', gpudump);
  }
}
|
||||
|
||||
// Same report as printdbg_gpu_data_detailed, written to the text file
// `filename`: per element the bit position, word index, start bit, codeword
// length and the codeword as binary digits.
// When the file cannot be opened the error is reported with perror() and
// nothing is written. The digits are produced with per-bit shifts below 32,
// so a length of 0 no longer triggers a shift by 32 (undefined). Unsigned
// values are printed with %u.
__inline void printdbg_gpu_data_detailed2(const char *filename,
                                          unsigned int *cw32,
                                          unsigned int *cw32len,
                                          unsigned int *cw32idx,
                                          unsigned int num_elements) {
  FILE *gpudump = fopen(filename, "wt");
  if (gpudump == NULL) {
    perror(filename);
    return;
  }
  for (unsigned int i = 0; i < num_elements; i++) {
    fprintf(gpudump, "bp: %u, kc: %u, startbit: %u, cwlen: %u, cw:\t\t",
            cw32idx[i], cw32idx[i] / 32, cw32idx[i] % 32, cw32len[i]);
    // print codeword:
    for (unsigned int j = cw32len[i]; j-- > 0;) {
      const unsigned int one = (j < 32) ? ((cw32[i] >> j) & 1u) : 0u;
      fputc(one ? '1' : '0', gpudump);
    }
    fputc('\n', gpudump);
  }
  fclose(gpudump);
}
|
||||
|
||||
/************************************************************************/
|
||||
/* BIT PRINTS */
|
||||
/************************************************************************/
|
||||
// Print the 8 bits of `number` to stdout, most significant bit first,
// followed by a single space.
__inline void printBits(unsigned char number) {
  for (int bit = 7; bit >= 0; --bit)
    putchar(((number >> bit) & 1) ? '1' : '0');
  putchar(' ');
}
|
||||
// Print the 32 bits of `number` to stdout, most significant bit first,
// followed by a newline.
__inline void print32Bits(unsigned int number) {
  for (int bit = 31; bit >= 0; --bit)
    putchar(((number >> bit) & 1u) ? '1' : '0');
  putchar('\n');
}
|
||||
// Print a 32-character ruler line to stdout: '|' at the 1-based column
// `marker`, '.' in every other column, then a newline. A marker outside
// 1..32 yields a line of dots only.
__inline void print32BitsM(unsigned int marker) {
  for (unsigned int column = 1; column <= 32; column++)
    putchar(column == marker ? '|' : '.');
  putchar('\n');
}
|
||||
__inline void print_array_char_as_bits(unsigned char *a, unsigned int len) {
|
||||
|
||||
printf(
|
||||
" ========================= Printing vector =======================\n");
|
||||
printf("Total number of elements is %d\n", len);
|
||||
for (unsigned int i = 0; i < len; i++) {
|
||||
printf("a[%d]=%d \t", i, a[i]);
|
||||
printBits(a[i]);
|
||||
printf("\n");
|
||||
}
|
||||
printf("\n");
|
||||
printf(
|
||||
" ==================================================================\n");
|
||||
}
|
||||
|
||||
__inline void print_array_ints_as_bits(unsigned int *a, unsigned int len) {
|
||||
|
||||
printf(
|
||||
" ========================= Printing vector =======================\n");
|
||||
for (unsigned int i = 0; i < len; i++) {
|
||||
print32Bits(a[i]);
|
||||
printf("\n");
|
||||
}
|
||||
printf("\n");
|
||||
printf(
|
||||
" ==================================================================\n");
|
||||
}
|
||||
|
||||
__inline void print_compare_array_ints_as_bits(unsigned int *a, unsigned int *b,
|
||||
unsigned int len) {
|
||||
|
||||
printf(
|
||||
" ========================= Printing vector =======================\n");
|
||||
for (unsigned int i = 0; i < len; i++) {
|
||||
print32Bits(a[i]);
|
||||
print32Bits(b[i]);
|
||||
printf("\n");
|
||||
}
|
||||
printf("\n");
|
||||
printf(
|
||||
" ==================================================================\n");
|
||||
}
|
||||
|
||||
// Print `len` words of `a` to stdout in "%#X" format, tab separated, between
// two banner lines.
__inline void print_array_in_hex(unsigned int *a, unsigned int len) {
  printf(
      " ========================= Printing vector =======================\n");

  for (const unsigned int *p = a; p != a + len; ++p)
    printf("%#X\t", *p);

  printf("\n");
  printf(
      " ==================================================================\n");
}
|
||||
|
||||
/************************************************************************/
|
||||
/* ARRAY PRINTS */
|
||||
/***********************************************************************/
|
||||
|
||||
// Print `len` elements of `a` to stdout as "a[i]=value" entries between two
// banner lines. Every element is formatted with %d, whatever T is.
template <typename T> __inline void print_array(T *a, unsigned int len) {
  printf(
      " ========================= Printing vector =======================\n");
  printf("Total number of elements is %d\n", len);

  unsigned int idx = 0;
  while (idx < len) {
    printf("a[%d]=%d \t", idx, a[idx]);
    ++idx;
  }

  printf("\n");
  printf(
      " ==================================================================\n");
}
|
||||
|
||||
// Print `rle_len` run-length pairs to stdout as "(symbol,count)" entries,
// taking the symbol from rle_symbols[k] and the count from rle_counts[k].
// Both values are formatted with %d, whatever ST and CT are.
template <typename ST, typename CT>
__inline void print_rled_arrays(ST *rle_symbols, CT *rle_counts,
                                unsigned int rle_len) {
  printf(" ========================= Printing RLE vector "
         "=======================\n");
  printf(" Total number of RL Pairs is %d\n", rle_len);

  unsigned int k = 0;
  while (k < rle_len) {
    ST symbol = rle_symbols[k];
    CT count = rle_counts[k];
    printf("(%d,%d) ,\t", symbol, count);
    ++k;
  }
  printf("\n");
}
|
||||
|
||||
// Print `rle_len` packed run-length pairs to stdout as "(symbol,count)"
// entries. Each word of `rle` holds the symbol in its upper 16 bits and the
// count in its lower 16 bits.
__inline void print_packed_rle_array(unsigned int *rle, unsigned int rle_len) {
  printf(" ========================= Printing RLE vector "
         "=======================\n");
  printf(" Total number of RL Pairs is %d\n", rle_len);

  unsigned int k = 0;
  while (k < rle_len) {
    const unsigned int packed = rle[k];
    const unsigned short symbol = (unsigned short)(packed >> 16); // upper half
    const unsigned short count = (unsigned short)(packed & 0xFFFFu); // lower half
    printf("(%d,%d) ,\t", symbol, count);
    ++k;
  }
  printf("\n");
}
|
||||
|
||||
#endif // _PRINT_HELPERS_H_
|
|
@ -1,20 +0,0 @@
|
|||
#!/bin/bash
# Builds the Huffman example through the kernel/host translators and runs it.
# Paths are relative, so run this from the example's own directory.
set -e

# Reference only: clang invocation kept from the original script (its
# -save-temps flag emits intermediate files such as the .ll inputs below).
# clang++ main_test_cu.cu --cuda-path=/usr/local/cuda-10.1 --cuda-gpu-arch=sm_61 -L/usr/local/cuda-10.1/lib64 -lcudart_static -ldl -lrt -pthread -save-temps -v

# Host-only helper and the two IR files -> LLVM bitcode.
clang -c -emit-llvm cpuencode.cpp
llvm-as main_test_cu-cuda-nvptx64-nvidia-cuda-sm_61.ll
llvm-as main_test_cu-host-x86_64-unknown-linux-gnu.ll

# Translate device and host bitcode.
../../build/compilation/kernelTranslator main_test_cu-cuda-nvptx64-nvidia-cuda-sm_61.bc kernel.bc
../../build/compilation/hostTranslator main_test_cu-host-x86_64-unknown-linux-gnu.bc host.bc

# Bitcode -> position-independent object files.
llc --relocation-model=pic --filetype=obj kernel.bc
llc --relocation-model=pic --filetype=obj host.bc
llc --relocation-model=pic --filetype=obj cpuencode.bc

g++ -Wall -L../../build/runtime \
  -L../../build/runtime/threadPool -o pavle \
  -fPIC -no-pie host.o kernel.o cpuencode.o -lc -lx86Runtime -lthreadPool -lpthread

# Append the previous value only when it is non-empty: a trailing ':' would
# add an empty entry, which the dynamic loader treats as the current directory.
export LD_LIBRARY_PATH="../../build/runtime:../../build/runtime/threadPool${LD_LIBRARY_PATH:+:$LD_LIBRARY_PATH}"
./pavle ../../rodinia-data/huffman/test1024_H2.206587175259.in
|
|
@ -1,216 +0,0 @@
|
|||
/*
|
||||
* Copyright 1993-2006 NVIDIA Corporation. All rights reserved.
|
||||
*
|
||||
* NOTICE TO USER:
|
||||
*
|
||||
* This source code is subject to NVIDIA ownership rights under U.S. and
|
||||
* international Copyright laws.
|
||||
*
|
||||
* NVIDIA MAKES NO REPRESENTATION ABOUT THE SUITABILITY OF THIS SOURCE
|
||||
* CODE FOR ANY PURPOSE. IT IS PROVIDED "AS IS" WITHOUT EXPRESS OR
|
||||
* IMPLIED WARRANTY OF ANY KIND. NVIDIA DISCLAIMS ALL WARRANTIES WITH
|
||||
* REGARD TO THIS SOURCE CODE, INCLUDING ALL IMPLIED WARRANTIES OF
|
||||
* MERCHANTABILITY, NONINFRINGEMENT, AND FITNESS FOR A PARTICULAR PURPOSE.
|
||||
* IN NO EVENT SHALL NVIDIA BE LIABLE FOR ANY SPECIAL, INDIRECT, INCIDENTAL,
|
||||
* OR CONSEQUENTIAL DAMAGES, OR ANY DAMAGES WHATSOEVER RESULTING FROM LOSS
|
||||
* OF USE, DATA OR PROFITS, WHETHER IN AN ACTION OF CONTRACT, NEGLIGENCE
|
||||
* OR OTHER TORTIOUS ACTION, ARISING OUT OF OR IN CONNECTION WITH THE USE
|
||||
* OR PERFORMANCE OF THIS SOURCE CODE.
|
||||
*
|
||||
* U.S. Government End Users. This source code is a "commercial item" as
|
||||
* that term is defined at 48 C.F.R. 2.101 (OCT 1995), consisting of
|
||||
* "commercial computer software" and "commercial computer software
|
||||
* documentation" as such terms are used in 48 C.F.R. 12.212 (SEPT 1995)
|
||||
* and is provided to the U.S. Government only as a commercial end item.
|
||||
* Consistent with 48 C.F.R.12.212 and 48 C.F.R. 227.7202-1 through
|
||||
* 227.7202-4 (JUNE 1995), all U.S. Government End Users acquire the
|
||||
* source code with only those rights set forth herein.
|
||||
*/
|
||||
|
||||
#ifndef _PRESCAN_CU_
|
||||
#define _PRESCAN_CU_
|
||||
|
||||
// includes, kernels
|
||||
#include "cutil.h"
|
||||
#include "scanLargeArray_kernel.cu"
|
||||
#include <assert.h>
|
||||
#include <stdio.h>
|
||||
|
||||
#define max(a, b) (a > b ? a : b)
|
||||
// True when n has at most one bit set; note that 0 is therefore reported as
// a power of two as well.
inline bool isPowerOfTwo(int n) {
  const int lowestSetBitCleared = n & (n - 1);
  return lowestSetBitCleared == 0;
}
|
||||
|
||||
// Returns the largest power of two that is <= n. n must be >= 1.
//
// The value is converted to double, not float: a float has a 24-bit
// significand, so an n just below a power of two (above 2^24) would be
// rounded up to that power and the result would exceed n. A double holds
// every 32-bit int exactly.
inline int floorPow2(int n) {
#ifdef WIN32
  // method 2: logb yields the unbiased binary exponent
  return 1 << (int)logb((double)n);
#else
  // method 1: frexp reports n as m * 2^exponent with m in [0.5, 1)
  int exponent;
  frexp((double)n, &exponent);
  return 1 << (exponent - 1);
#endif
}
|
||||
|
||||
#define BLOCK_SIZE 256
|
||||
|
||||
static unsigned int **g_scanBlockSums;
|
||||
static unsigned int g_numEltsAllocated = 0;
|
||||
static unsigned int g_numLevelsAllocated = 0;
|
||||
|
||||
static void preallocBlockSums(unsigned int maxNumElements) {
|
||||
assert(g_numEltsAllocated == 0); // shouldn't be called
|
||||
|
||||
g_numEltsAllocated = maxNumElements;
|
||||
|
||||
unsigned int blockSize = BLOCK_SIZE; // max size of the thread blocks
|
||||
unsigned int numElts = maxNumElements;
|
||||
int level = 0;
|
||||
|
||||
do {
|
||||
unsigned int numBlocks =
|
||||
max(1, (int)ceil((float)numElts / (2.f * blockSize)));
|
||||
if (numBlocks > 1)
|
||||
level++;
|
||||
numElts = numBlocks;
|
||||
} while (numElts > 1);
|
||||
|
||||
g_scanBlockSums = (unsigned int **)malloc(level * sizeof(unsigned int *));
|
||||
g_numLevelsAllocated = level;
|
||||
numElts = maxNumElements;
|
||||
level = 0;
|
||||
|
||||
do {
|
||||
unsigned int numBlocks =
|
||||
max(1, (int)ceil((float)numElts / (2.f * blockSize)));
|
||||
if (numBlocks > 1)
|
||||
CUDA_SAFE_CALL(cudaMalloc((void **)&g_scanBlockSums[level++],
|
||||
numBlocks * sizeof(unsigned int)));
|
||||
numElts = numBlocks;
|
||||
} while (numElts > 1);
|
||||
|
||||
CUT_CHECK_ERROR("preallocBlockSums");
|
||||
}
|
||||
|
||||
static void deallocBlockSums() {
|
||||
for (unsigned int i = 0; i < g_numLevelsAllocated; i++) {
|
||||
cudaFree(g_scanBlockSums[i]);
|
||||
}
|
||||
|
||||
CUT_CHECK_ERROR("deallocBlockSums");
|
||||
|
||||
free((void **)g_scanBlockSums);
|
||||
|
||||
g_scanBlockSums = 0;
|
||||
g_numEltsAllocated = 0;
|
||||
g_numLevelsAllocated = 0;
|
||||
}
|
||||
|
||||
static void prescanArrayRecursive(unsigned int *outArray,
|
||||
const unsigned int *inArray, int numElements,
|
||||
int level) {
|
||||
unsigned int blockSize = BLOCK_SIZE; // max size of the thread blocks
|
||||
unsigned int numBlocks =
|
||||
max(1, (int)ceil((float)numElements / (2.f * blockSize)));
|
||||
unsigned int numThreads;
|
||||
|
||||
if (numBlocks > 1)
|
||||
numThreads = blockSize;
|
||||
else if (isPowerOfTwo(numElements))
|
||||
numThreads = numElements / 2;
|
||||
else
|
||||
numThreads = floorPow2(numElements);
|
||||
|
||||
unsigned int numEltsPerBlock = numThreads * 2;
|
||||
|
||||
// if this is a non-power-of-2 array, the last block will be non-full
|
||||
// compute the smallest power of 2 able to compute its scan.
|
||||
unsigned int numEltsLastBlock =
|
||||
numElements - (numBlocks - 1) * numEltsPerBlock;
|
||||
unsigned int numThreadsLastBlock = max(1, numEltsLastBlock / 2);
|
||||
unsigned int np2LastBlock = 0;
|
||||
unsigned int sharedMemLastBlock = 0;
|
||||
|
||||
if (numEltsLastBlock != numEltsPerBlock) {
|
||||
np2LastBlock = 1;
|
||||
|
||||
if (!isPowerOfTwo(numEltsLastBlock))
|
||||
numThreadsLastBlock = floorPow2(numEltsLastBlock);
|
||||
|
||||
unsigned int extraSpace = (2 * numThreadsLastBlock) / NUM_BANKS;
|
||||
sharedMemLastBlock =
|
||||
sizeof(unsigned int) * (2 * numThreadsLastBlock + extraSpace);
|
||||
}
|
||||
|
||||
// padding space is used to avoid shared memory bank conflicts
|
||||
unsigned int extraSpace = numEltsPerBlock / NUM_BANKS;
|
||||
unsigned int sharedMemSize =
|
||||
sizeof(unsigned int) * (numEltsPerBlock + extraSpace);
|
||||
|
||||
#ifdef DEBUG
|
||||
if (numBlocks > 1) {
|
||||
assert(g_numEltsAllocated >= numElements);
|
||||
}
|
||||
#endif
|
||||
|
||||
// setup execution parameters
|
||||
// if NP2, we process the last block separately
|
||||
dim3 grid(max(1, numBlocks - np2LastBlock), 1, 1);
|
||||
dim3 threads(numThreads, 1, 1);
|
||||
|
||||
// make sure there are no CUDA errors before we start
|
||||
CUT_CHECK_ERROR("prescanArrayRecursive before kernels");
|
||||
|
||||
// execute the scan
|
||||
if (numBlocks > 1) {
|
||||
prescan<true, false><<<grid, threads>>>(
|
||||
outArray, inArray, g_scanBlockSums[level], numThreads * 2, 0, 0);
|
||||
cudaThreadSynchronize();
|
||||
CUT_CHECK_ERROR("prescanWithBlockSums");
|
||||
if (np2LastBlock) {
|
||||
prescan<true, true><<<1, numThreadsLastBlock>>>(
|
||||
outArray, inArray, g_scanBlockSums[level], numEltsLastBlock,
|
||||
numBlocks - 1, numElements - numEltsLastBlock);
|
||||
cudaThreadSynchronize();
|
||||
CUT_CHECK_ERROR("prescanNP2WithBlockSums");
|
||||
}
|
||||
|
||||
// After scanning all the sub-blocks, we are mostly done. But now we
|
||||
// need to take all of the last values of the sub-blocks and scan those.
|
||||
// This will give us a new value that must be sdded to each block to
|
||||
// get the final results.
|
||||
// recursive (CPU) call
|
||||
prescanArrayRecursive(g_scanBlockSums[level], g_scanBlockSums[level],
|
||||
numBlocks, level + 1);
|
||||
|
||||
uniformAdd<<<grid, threads>>>(outArray, g_scanBlockSums[level],
|
||||
numElements - numEltsLastBlock, 0, 0);
|
||||
cudaThreadSynchronize();
|
||||
CUT_CHECK_ERROR("uniformAdd");
|
||||
if (np2LastBlock) {
|
||||
uniformAdd<<<1, numThreadsLastBlock>>>(outArray, g_scanBlockSums[level],
|
||||
numEltsLastBlock, numBlocks - 1,
|
||||
numElements - numEltsLastBlock);
|
||||
cudaThreadSynchronize();
|
||||
CUT_CHECK_ERROR("uniformAdd");
|
||||
}
|
||||
} else if (isPowerOfTwo(numElements)) {
|
||||
prescan<false, false>
|
||||
<<<grid, threads>>>(outArray, inArray, 0, numThreads * 2, 0, 0);
|
||||
cudaThreadSynchronize();
|
||||
CUT_CHECK_ERROR("prescan");
|
||||
} else {
|
||||
prescan<false, true>
|
||||
<<<grid, threads>>>(outArray, inArray, 0, numElements, 0, 0);
|
||||
cudaThreadSynchronize();
|
||||
CUT_CHECK_ERROR("prescanNP2");
|
||||
}
|
||||
}
|
||||
|
||||
// Entry point of the scan: starts the recursive implementation at its top
// level (level 0).
static void prescanArray(unsigned int *outArray, unsigned int *inArray,
                         int numElements) {
  const int topLevel = 0;
  prescanArrayRecursive(outArray, inArray, numElements, topLevel);
}
|
||||
|
||||
#endif // _PRESCAN_CU_
|
|
@ -1,237 +0,0 @@
|
|||
/*
|
||||
* Copyright 1993-2006 NVIDIA Corporation. All rights reserved.
|
||||
*
|
||||
* NOTICE TO USER:
|
||||
*
|
||||
* This source code is subject to NVIDIA ownership rights under U.S. and
|
||||
* international Copyright laws.
|
||||
*
|
||||
* NVIDIA MAKES NO REPRESENTATION ABOUT THE SUITABILITY OF THIS SOURCE
|
||||
* CODE FOR ANY PURPOSE. IT IS PROVIDED "AS IS" WITHOUT EXPRESS OR
|
||||
* IMPLIED WARRANTY OF ANY KIND. NVIDIA DISCLAIMS ALL WARRANTIES WITH
|
||||
* REGARD TO THIS SOURCE CODE, INCLUDING ALL IMPLIED WARRANTIES OF
|
||||
* MERCHANTABILITY, NONINFRINGEMENT, AND FITNESS FOR A PARTICULAR PURPOSE.
|
||||
* IN NO EVENT SHALL NVIDIA BE LIABLE FOR ANY SPECIAL, INDIRECT, INCIDENTAL,
|
||||
* OR CONSEQUENTIAL DAMAGES, OR ANY DAMAGES WHATSOEVER RESULTING FROM LOSS
|
||||
* OF USE, DATA OR PROFITS, WHETHER IN AN ACTION OF CONTRACT, NEGLIGENCE
|
||||
* OR OTHER TORTIOUS ACTION, ARISING OUT OF OR IN CONNECTION WITH THE USE
|
||||
* OR PERFORMANCE OF THIS SOURCE CODE.
|
||||
*
|
||||
* U.S. Government End Users. This source code is a "commercial item" as
|
||||
* that term is defined at 48 C.F.R. 2.101 (OCT 1995), consisting of
|
||||
* "commercial computer software" and "commercial computer software
|
||||
* documentation" as such terms are used in 48 C.F.R. 12.212 (SEPT 1995)
|
||||
* and is provided to the U.S. Government only as a commercial end item.
|
||||
* Consistent with 48 C.F.R.12.212 and 48 C.F.R. 227.7202-1 through
|
||||
* 227.7202-4 (JUNE 1995), all U.S. Government End Users acquire the
|
||||
* source code with only those rights set forth herein.
|
||||
*/
|
||||
|
||||
#ifndef _SCAN_BEST_KERNEL_CU_
|
||||
#define _SCAN_BEST_KERNEL_CU_
|
||||
|
||||
// Define this to more rigorously avoid bank conflicts,
|
||||
// even at the lower (root) levels of the tree
|
||||
// Note that due to the higher addressing overhead, performance
|
||||
// is lower with ZERO_BANK_CONFLICTS enabled. It is provided
|
||||
// as an example.
|
||||
//#define ZERO_BANK_CONFLICTS
|
||||
|
||||
// 16 banks on G80
|
||||
#define NUM_BANKS 16
|
||||
#define LOG_NUM_BANKS 4
|
||||
|
||||
#ifdef ZERO_BANK_CONFLICTS
|
||||
// Padding for a shared-memory index: one extra slot per NUM_BANKS elements
// plus one more per NUM_BANKS^2 elements. Each shift is parenthesized because
// '+' binds tighter than '>>'; without the parentheses the expression parsed
// as index >> (LOG_NUM_BANKS + index) >> (2 * LOG_NUM_BANKS).
#define CONFLICT_FREE_OFFSET(index)                                            \
  (((index) >> LOG_NUM_BANKS) + ((index) >> (2 * LOG_NUM_BANKS)))
|
||||
#else
|
||||
#define CONFLICT_FREE_OFFSET(index) ((index) >> LOG_NUM_BANKS)
|
||||
#endif
|
||||
|
||||
///////////////////////////////////////////////////////////////////////////////
|
||||
// Work-efficient compute implementation of scan, one thread per 2 elements
|
||||
// Work-efficient: O(log(n)) steps, and O(n) adds.
|
||||
// Also shared storage efficient: Uses n + n/NUM_BANKS shared memory -- no
|
||||
// ping-ponging Also avoids most bank conflicts using single-element offsets
|
||||
// every NUM_BANKS elements.
|
||||
//
|
||||
// In addition, If ZERO_BANK_CONFLICTS is defined, uses
|
||||
// n + n/NUM_BANKS + n/(NUM_BANKS*NUM_BANKS)
|
||||
// shared memory. If ZERO_BANK_CONFLICTS is defined, avoids ALL bank conflicts
|
||||
// using single-element offsets every NUM_BANKS elements, plus additional
|
||||
// single-element offsets after every NUM_BANKS^2 elements.
|
||||
//
|
||||
// Uses a balanced tree type algorithm. See Blelloch, 1990 "Prefix Sums
|
||||
// and Their Applications", or Prins and Chatterjee PRAM course notes:
|
||||
// http://www.cs.unc.edu/~prins/Classes/203/Handouts/pram.pdf
|
||||
//
|
||||
// This work-efficient version is based on the algorithm presented in Guy
|
||||
// Blelloch's excellent paper "Prefix sums and their applications".
|
||||
// http://www-2.cs.cmu.edu/afs/cs.cmu.edu/project/scandal/public/papers/CMU-CS-90-190.html
|
||||
//
|
||||
// Pro: Work Efficient, very few bank conflicts (or zero if ZERO_BANK_CONFLICTS
|
||||
// is defined) Con: More instructions to compute bank-conflict-free shared
|
||||
// memory addressing, and slightly more shared memory storage used.
|
||||
//
|
||||
|
||||
template <bool isNP2>
|
||||
__device__ static void
|
||||
loadSharedChunkFromMem(unsigned int *s_data, const unsigned int *g_idata, int n,
|
||||
int baseIndex, int &ai, int &bi, int &mem_ai,
|
||||
int &mem_bi, int &bankOffsetA, int &bankOffsetB) {
|
||||
int thid = threadIdx.x;
|
||||
mem_ai = baseIndex + threadIdx.x;
|
||||
mem_bi = mem_ai + blockDim.x;
|
||||
|
||||
ai = thid;
|
||||
bi = thid + blockDim.x;
|
||||
|
||||
// compute spacing to avoid bank conflicts
|
||||
bankOffsetA = CONFLICT_FREE_OFFSET(ai);
|
||||
bankOffsetB = CONFLICT_FREE_OFFSET(bi);
|
||||
|
||||
// Cache the computational window in shared memory
|
||||
// pad values beyond n with zeros
|
||||
s_data[ai + bankOffsetA] = g_idata[mem_ai];
|
||||
|
||||
if (isNP2) // compile-time decision
|
||||
{
|
||||
s_data[bi + bankOffsetB] = (bi < n) ? g_idata[mem_bi] : 0;
|
||||
} else {
|
||||
s_data[bi + bankOffsetB] = g_idata[mem_bi];
|
||||
}
|
||||
}
|
||||
|
||||
template <bool isNP2>
|
||||
__device__ static void
|
||||
storeSharedChunkToMem(unsigned int *g_odata, const unsigned int *s_data, int n,
|
||||
int ai, int bi, int mem_ai, int mem_bi, int bankOffsetA,
|
||||
int bankOffsetB) {
|
||||
__syncthreads();
|
||||
|
||||
// write results to global memory
|
||||
g_odata[mem_ai] = s_data[ai + bankOffsetA];
|
||||
if (isNP2) // compile-time decision
|
||||
{
|
||||
if (bi < n)
|
||||
g_odata[mem_bi] = s_data[bi + bankOffsetB];
|
||||
} else {
|
||||
g_odata[mem_bi] = s_data[bi + bankOffsetB];
|
||||
}
|
||||
}
|
||||
|
||||
template <bool storeSum>
|
||||
__device__ static void clearLastElement(unsigned int *s_data,
|
||||
unsigned int *g_blockSums,
|
||||
int blockIndex) {
|
||||
if (threadIdx.x == 0) {
|
||||
int index = (blockDim.x << 1) - 1;
|
||||
index += CONFLICT_FREE_OFFSET(index);
|
||||
|
||||
if (storeSum) // compile-time decision
|
||||
{
|
||||
// write this block's total sum to the corresponding index in the
|
||||
// blockSums array
|
||||
g_blockSums[blockIndex] = s_data[index];
|
||||
}
|
||||
|
||||
// zero the last element in the scan so it will propagate back to the front
|
||||
s_data[index] = 0;
|
||||
}
|
||||
}
|
||||
|
||||
// Up-sweep of the scan: builds partial sums in place in s_data, halving the
// number of active threads and doubling the stride on every step, with a
// block-wide barrier before each step. Returns the stride reached after the
// final step.
__device__ static unsigned int buildSum(unsigned int *s_data) {
  const unsigned int tid = threadIdx.x;
  unsigned int stride = 1;

  int active = blockDim.x;
  while (active > 0) {
    __syncthreads();

    if (tid < active) {
      int i = __mul24(__mul24(2, stride), tid);

      int ai = i + stride - 1;
      int bi = ai + stride;

      // apply bank padding to both positions
      ai += CONFLICT_FREE_OFFSET(ai);
      bi += CONFLICT_FREE_OFFSET(bi);

      s_data[bi] += s_data[ai];
    }

    stride *= 2;
    active >>= 1;
  }

  return stride;
}
|
||||
|
||||
// Down-sweep of the scan: starting from the stride returned by the up-sweep,
// halves the stride on every step while doubling the number of active
// threads. Each active thread moves the right value into the left slot and
// adds the old left value to the right slot. A block-wide barrier precedes
// each step.
__device__ static void scanRootToLeaves(unsigned int *s_data,
                                        unsigned int stride) {
  const unsigned int tid = threadIdx.x;

  int active = 1;
  while (active <= blockDim.x) {
    stride >>= 1;

    __syncthreads();

    if (tid < active) {
      int i = __mul24(__mul24(2, stride), tid);

      int ai = i + stride - 1;
      int bi = ai + stride;

      // apply bank padding to both positions
      ai += CONFLICT_FREE_OFFSET(ai);
      bi += CONFLICT_FREE_OFFSET(bi);

      const unsigned int oldLeft = s_data[ai];
      s_data[ai] = s_data[bi];
      s_data[bi] += oldLeft;
    }

    active *= 2;
  }
}
|
||||
|
||||
template <bool storeSum>
|
||||
__device__ static void prescanBlock(unsigned int *data, int blockIndex,
|
||||
unsigned int *blockSums) {
|
||||
int stride = buildSum(data); // build the sum in place up the tree
|
||||
clearLastElement<storeSum>(data, blockSums,
|
||||
(blockIndex == 0) ? blockIdx.x : blockIndex);
|
||||
scanRootToLeaves(data, stride); // traverse down tree to build the scan
|
||||
}
|
||||
|
||||
template <bool storeSum, bool isNP2>
|
||||
__global__ static void
|
||||
prescan(unsigned int *g_odata, const unsigned int *g_idata,
|
||||
unsigned int *g_blockSums, int n, int blockIndex, int baseIndex) {
|
||||
int ai, bi, mem_ai, mem_bi, bankOffsetA, bankOffsetB;
|
||||
__shared__ unsigned int s_data[3072];
|
||||
|
||||
// load data into shared memory
|
||||
loadSharedChunkFromMem<isNP2>(
|
||||
s_data, g_idata, n,
|
||||
(baseIndex == 0) ? __mul24(blockIdx.x, (blockDim.x << 1)) : baseIndex, ai,
|
||||
bi, mem_ai, mem_bi, bankOffsetA, bankOffsetB);
|
||||
// scan the data in each block
|
||||
prescanBlock<storeSum>(s_data, blockIndex, g_blockSums);
|
||||
// write results to device memory
|
||||
storeSharedChunkToMem<isNP2>(g_odata, s_data, n, ai, bi, mem_ai, mem_bi,
|
||||
bankOffsetA, bankOffsetB);
|
||||
}
|
||||
|
||||
// Adds one per-block value from `uniforms` to the two elements of g_data that
// each thread is responsible for. Thread 0 fetches
// uniforms[blockIdx.x + blockOffset] into shared memory; after a barrier every
// thread adds it to g_data[address] and, when threadIdx.x + blockDim.x < n,
// to g_data[address + blockDim.x].
//
// The second update is now skipped entirely when it is out of range. The old
// code multiplied the addend by the condition, which still performed a
// read-modify-write on an element beyond the n handled by this launch.
__global__ static void uniformAdd(unsigned int *g_data, unsigned int *uniforms,
                                  int n, int blockOffset, int baseIndex) {
  __shared__ unsigned int uni;
  if (threadIdx.x == 0)
    uni = uniforms[blockIdx.x + blockOffset];

  unsigned int address =
      __mul24(blockIdx.x, (blockDim.x << 1)) + baseIndex + threadIdx.x;

  __syncthreads();

  // note two adds per thread
  g_data[address] += uni;
  if (threadIdx.x + blockDim.x < n) {
    g_data[address + blockDim.x] += uni;
  }
}
|
||||
|
||||
#endif // #ifndef _SCAN_BEST_KERNEL_CU_
|
|
@ -1,43 +0,0 @@
|
|||
/*
|
||||
* Copyright 2009 Tjark Bringewat. All rights reserved.
|
||||
*/
|
||||
|
||||
#include "stats_logger.h"
|
||||
#include "stdafx.h"
|
||||
#include <cstdio>
|
||||
#include <map>
|
||||
#include <sstream>
|
||||
|
||||
// Maps "<graphname>__<seriesname>" to the series number chosen the first time
// that series was logged; the number becomes part of the file name.
std::map<std::string, unsigned int> filenames;

// Appends one "x y" data point to the text file of a graph series.
//
// The file is named "<graphname>__<number>_<seriesname>.txt". On the first
// successful call for a series the file is created (truncated) and a header
// describing the axes is written before the data point, and `seriesnumber` is
// remembered for that series. Later calls reuse the remembered number and
// open the file in append mode.
//
// If the file cannot be opened, a message is printed to stderr and nothing is
// recorded, so a later call for the same series still writes the header.
void LogStats(const char *graphname, const char *seriesname, float xValue,
              float yValue, const char *xAxisQuantity,
              const char *yAxisQuantity, const char *xAxisUnit,
              const char *yAxisUnit, const char *xAxisScaleType,
              const char *yAxisScaleType, unsigned int seriesnumber,
              const char *description) {
  std::ostringstream keyStream;
  keyStream << graphname << "__" << seriesname;
  const std::string seriesKey = keyStream.str();

  const std::map<std::string, unsigned int>::iterator known =
      filenames.find(seriesKey);
  const bool isNewSeries = (known == filenames.end());
  const unsigned int fileNumber = isNewSeries ? seriesnumber : known->second;

  std::ostringstream pathStream;
  pathStream << graphname << "__" << fileNumber << "_" << seriesname << ".txt";
  const std::string path = pathStream.str();

  FILE *f = fopen(path.c_str(), isNewSeries ? "wt" : "at");
  if (f == NULL) {
    fprintf(stderr, "LogStats: cannot open '%s' for writing\n", path.c_str());
    return;
  }

  if (isNewSeries) {
    // register the series only once its file exists
    filenames[seriesKey] = seriesnumber;

    fprintf(f, "SERIES_NAME\n%s\n", seriesname);
    fprintf(f, "X_AXIS_QUANTITY\n%s\n", xAxisQuantity);
    fprintf(f, "Y_AXIS_QUANTITY\n%s\n", yAxisQuantity);
    fprintf(f, "X_AXIS_UNIT\n%s\n", xAxisUnit);
    fprintf(f, "Y_AXIS_UNIT\n%s\n", yAxisUnit);
    fprintf(f, "X_AXIS_SCALE_TYPE\n%s\n", xAxisScaleType);
    fprintf(f, "Y_AXIS_SCALE_TYPE\n%s\n", yAxisScaleType);
    fprintf(f, "DESCRIPTION\n%s\n", description);
    fprintf(f, "__DATA__\n");
  }

  fprintf(f, "%f %f\n", xValue, yValue);

  // fclose flushes buffered output, so a failure here means lost data
  if (fclose(f) != 0) {
    fprintf(stderr, "LogStats: error while closing '%s'\n", path.c_str());
  }
}
|
|
@ -1,45 +0,0 @@
|
|||
/*
|
||||
* Copyright Tjark Bringewat. All rights reserved.
|
||||
*/
|
||||
|
||||
#ifndef _STATS_LOGGER_H_
|
||||
#define _STATS_LOGGER_H_
|
||||
|
||||
#include <cstring>
|
||||
#pragma warning(disable : 4996)
|
||||
|
||||
extern "C" void
|
||||
LogStats(const char *graphname, const char *seriesname, float xValue,
|
||||
float yValue, const char *xAxisQuantity, const char *yAxisQuantity,
|
||||
const char *xAxisUnit = "", const char *yAxisUnit = "",
|
||||
const char *xAxisScaleType = "lin", const char *yAxisScaleType = "lin",
|
||||
unsigned int seriesnumber = 0, const char *description = "");
|
||||
|
||||
inline void LogStats2(
|
||||
const char *graph, // Groups several functions into one graph. Only appears
|
||||
// in the file name.
|
||||
const char *function, // Name of the particular function. Appears in file
|
||||
// name and legend.
|
||||
float yValue, float xValue, const char *yAxisName = "Time",
|
||||
const char *yAxisUnit = "ms", const char *xAxisName = "Data size",
|
||||
const char *xAxisUnit = "MB",
|
||||
const char *yAxisScaleType = "lin", // Can be lin or log for linear or
|
||||
// logarithmic scale, respectively.
|
||||
const char *xAxisScaleType = "log",
|
||||
unsigned int fId =
|
||||
0, // Determines the order in which different functions are plotted to a
|
||||
// common graph. Only appears in the file name.
|
||||
const char *description = "") {
|
||||
LogStats(graph, function, xValue, yValue, xAxisName, yAxisName, xAxisUnit,
|
||||
yAxisUnit, xAxisScaleType, yAxisScaleType, fId, description);
|
||||
if (strcmp(xAxisUnit, "MB") == 0 && strcmp(yAxisUnit, "ms") == 0) {
|
||||
char buffer[100];
|
||||
strcpy(buffer, graph);
|
||||
strcat(buffer, "_datarate");
|
||||
LogStats(buffer, function, xValue, (xValue * 1000.0f) / (yValue * 1024.0f),
|
||||
xAxisName, "Data rate", xAxisUnit, "GB/s", xAxisScaleType,
|
||||
yAxisScaleType, fId, description);
|
||||
}
|
||||
}
|
||||
|
||||
#endif
|
|
@ -1,11 +0,0 @@
|
|||
#pragma once
|
||||
|
||||
#include "cutil.h"
|
||||
#include <iostream>
|
||||
#include <malloc.h>
|
||||
#include <math.h>
|
||||
#include <memory.h>
|
||||
#include <stdio.h>
|
||||
#include <stdlib.h>
|
||||
#include <string.h>
|
||||
#include <time.h>
|
|
@ -1,83 +0,0 @@
|
|||
#ifndef _TESTDATA_GEN_H_
|
||||
#define _TESTDATA_GEN_H_
|
||||
|
||||
#include "parameters.h"
|
||||
|
||||
// Fills `data` with num_blocks identical blocks of num_block_threads elements.
// A block repeats the eight-value pattern 1, 2, 3, 3, 3, 4, 4, 5.
//
// The pattern is written element by element, so a block length that is not a
// multiple of 8 simply ends mid-pattern. The previous version always wrote
// eight values per step and could run past the end of the block.
template <typename T>
__inline__ void generateRLETestData(T *data, unsigned int num_blocks,
                                    unsigned int num_block_threads) {
  static const int pattern[8] = {1, 2, 3, 3, 3, 4, 4, 5};
  unsigned int i, j;

  /* generate first block */
  for (i = 0; i < num_block_threads; i++) {
    data[i] = pattern[i % 8];
  }

  /* copy contents of the first block to all other blocks (for testing only) */
  for (j = 1; j < num_blocks; j++)
    for (i = 0; i < num_block_threads; i++)
      *(data + j * num_block_threads + i) = data[i];
}
|
||||
|
||||
// Fills the first block of `data` with runs of avg_run_len equal values (run
// r holds the value r) for as many complete runs as fit into
// num_block_threads elements, then copies the whole first block into each of
// the remaining num_blocks - 1 blocks. Elements of the first block after the
// last complete run are not written here. avg_run_len must be non-zero.
template <typename T>
__inline__ void generateRLETestDataLongRuns1(T *data, unsigned int num_blocks,
                                             unsigned int num_block_threads,
                                             unsigned int avg_run_len) {
  /* first block: element `pos` belongs to run pos / avg_run_len */
  const unsigned int complete_runs = num_block_threads / avg_run_len;
  const unsigned int filled = complete_runs * avg_run_len;
  for (unsigned int pos = 0; pos < filled; ++pos) {
    data[pos] = pos / avg_run_len;
  }

  /* replicate the first block into all other blocks (for testing only) */
  for (unsigned int block = 1; block < num_blocks; ++block) {
    const unsigned int base = block * num_block_threads;
    for (unsigned int pos = 0; pos < num_block_threads; ++pos) {
      data[base + pos] = data[pos];
    }
  }
}
|
||||
|
||||
// VLE TEST DATA VER2.0
|
||||
|
||||
// for testing only: generates codewords of the following lengths: 1, 2, 3, 4,
|
||||
// 4, 5, 6, 7
|
||||
// and dummy codewords: 1, 10, 100, 1000, 1000, 10000, 100000, 1000000
|
||||
// equals 0x01, 0x02, 0x4, 0x8, 0x8, 0x10, 0x20, 0x40
|
||||
// num_symbols =256. Must be multiple of 8.
|
||||
// Builds a dummy codeword table for testing. Symbols are handled in groups of
// eight whose codeword lengths are 1, 2, 3, 4, 4, 5, 6, 7 bits; the codeword
// of a symbol with length L is a single 1 bit followed by L - 1 zero bits.
// Only complete groups of eight receive a length, while a codeword is derived
// for every symbol, so num_symbols is expected to be a multiple of 8.
inline void generateCodewords(unsigned int *codewords,
                              unsigned int *codewordlens,
                              unsigned int num_symbols) {
  // first half of a group: lengths 1..4, second half: lengths 4..7
  static const unsigned int group_lengths[8] = {1, 2, 3, 4, 4, 5, 6, 7};

  /* Generate codeword lengths */
  const unsigned int groups = num_symbols / 8;
  for (unsigned int group = 0; group < groups; ++group) {
    for (unsigned int slot = 0; slot < 8; ++slot) {
      codewordlens[group * 8 + slot] = group_lengths[slot];
    }
  }

  /* Generate codewords */
  for (unsigned int symbol = 0; symbol < num_symbols; ++symbol) {
    const unsigned int numbits = codewordlens[symbol];
    codewords[symbol] = 0x01 << (numbits - 1);
  }
}
|
||||
|
||||
// Fills data[0 .. num_elements) with pseudo-random symbol indices in the
// range [0, num_symbols), drawn from rand().
//
// `codewords` and `codewordlens` are accepted for interface compatibility but
// are not used.
//
// The scaling is done in double: RAND_MAX + 1 overflows int on platforms where
// RAND_MAX equals INT_MAX, and a float cannot hold large rand() values
// exactly, which could round the ratio up to 1.0 and yield num_symbols itself.
inline void generateData(unsigned int *data, unsigned int num_elements,
                         unsigned int *codewords, unsigned int *codewordlens,
                         unsigned int num_symbols) {
  (void)codewords;
  (void)codewordlens;

  const double range = (double)RAND_MAX + 1.0;
  for (unsigned int i = 0; i < num_elements; i++) {
    // rand() / range lies in [0, 1), so the product stays below num_symbols
    data[i] = (unsigned int)(((double)rand() / range) * num_symbols);
  }
}
|
||||
|
||||
#endif
|
|
@ -1,160 +0,0 @@
|
|||
#ifndef _VLC_SM64HUFF_KERNEL_H_
|
||||
#define _VLC_SM64HUFF_KERNEL_H_
|
||||
|
||||
#include "pabio_kernels_v2.cu"
|
||||
#include "parameters.h"
|
||||
#include <cstdio>
|
||||
|
||||
#ifdef SMATOMICS
|
||||
|
||||
/* HUFFMAN-FRIENDLY PAVLE
|
||||
CHARACTERISTICS:
|
||||
1. CACHE CW_LUT INTO SM, LOAD AS 2 INT ARRAYS
|
||||
2. PARALLEL PREFIX SUM
|
||||
3. PARALLEL BIT I/O USING SHARED-MEMORY ATOMIC OPERATIONS (COMPATIBLE WITH
|
||||
CUDA1.3+)
|
||||
|
||||
NOTES & ASSUMPTIONS:
|
||||
- HUFFMAN-CODING FRIENDLY, SUPPORTS CODEWORDS OF 2X SIZE OF ORIGINAL
|
||||
SYMBOLS (BYTES). - NUMBER OF THREADS PER BLOCK IS 256; IF YOU WANT TO PLAY
|
||||
WITH DIFFERENT NUMBERS, THE CW CACHING SHOULD BE MODIFIED (SEE DPT* KERNELS)
|
||||
- SM usage: 1x size of the input data (REUSE) + size of CWLUT
|
||||
TURN ON CACHING FOR HIGH ENTROPY DATA!
|
||||
*/
|
||||
|
||||
// Variable-length encodes one 32-bit word per thread and packs the resulting
// bit strings of a block into consecutive 32-bit output words.
//
// Steps, as performed below:
//   1. Each thread looks up the codeword and length of the four bytes of its
//      input word (most significant byte first) and concatenates them into
//      cw64; codewordlen is the total bit count.
//   2. An in-place exclusive prefix sum over the per-thread bit counts gives
//      every thread the bit position where its output starts. The last thread
//      stores the block's total bit count in outidx[blockIdx.x].
//   3. Each thread ORs its bits into up to three consecutive words of the
//      shared array with atomicOr.
//   4. Threads with k <= kcmax copy shared word k to out[kn].
//
// With CACHECWLUT defined, the lookup tables are first copied into shared
// memory, one entry per thread.
// NOTE(review): that copy indexes the tables by threadIdx.x, so it appears to
// assume blockDim.x == NUM_SYMBOLS - confirm against the launch code.
//
// Changes from the previous version:
//   - Bit masks use unsigned arithmetic, and the 32-bit case of Part 2 no
//     longer evaluates 1 << 32 (shifting by the full width of the type is
//     undefined); the truncating cast already keeps exactly 32 bits there.
//   - The prefix-sum indices are unsigned int instead of unsigned char, so
//     they are not truncated for index values above 255.
__global__ static void
vlc_encode_kernel_sm64huff(unsigned int *data, const unsigned int *gm_codewords,
                           const unsigned int *gm_codewordlens,
#ifdef TESTING
                           unsigned int *cw32, unsigned int *cw32len,
                           unsigned int *cw32idx,
#endif
                           unsigned int *out, unsigned int *outidx) {

  unsigned int kn = blockIdx.x * blockDim.x + threadIdx.x;
  unsigned int k = threadIdx.x;
  unsigned int kc, startbit, wrbits;

  unsigned long long cw64 = 0;
  unsigned int val32, codewordlen = 0;
  unsigned char tmpbyte, tmpcwlen;
  unsigned int tmpcw32;

  __shared__ unsigned int sm[3072];
  __shared__ unsigned int kcmax;

#ifdef CACHECWLUT
  unsigned int *codewords = (unsigned int *)sm;
  unsigned int *codewordlens = (unsigned int *)(sm + NUM_SYMBOLS);
  unsigned int *as = (unsigned int *)(sm + 2 * NUM_SYMBOLS);

  /* Load the codewords and the original data */
  codewords[k] = gm_codewords[k];
  codewordlens[k] = gm_codewordlens[k];
  val32 = data[kn];
  __syncthreads();
  for (unsigned int i = 0; i < 4; i++) {
    tmpbyte = (unsigned char)(val32 >> ((3 - i) * 8));
    tmpcw32 = codewords[tmpbyte];
    tmpcwlen = codewordlens[tmpbyte];
    cw64 = (cw64 << tmpcwlen) | tmpcw32;
    codewordlen += tmpcwlen;
  }
#else
  unsigned int *as = (unsigned int *)sm;
  val32 = data[kn];
  for (unsigned int i = 0; i < 4; i++) {
    tmpbyte = (unsigned char)(val32 >> ((3 - i) * 8));
    tmpcw32 = gm_codewords[tmpbyte];
    tmpcwlen = gm_codewordlens[tmpbyte];
    cw64 = (cw64 << tmpcwlen) | tmpcw32;
    codewordlen += tmpcwlen;
  }
#endif
  as[k] = codewordlen;
  __syncthreads();

  /* Prefix sum of codeword lengths (denoted in bits) [inplace implementation]
   */
  unsigned int offset = 1;

  /* Build the sum in place up the tree */
  for (unsigned int d = (blockDim.x) >> 1; d > 0; d >>= 1) {
    __syncthreads();
    if (k < d) {
      unsigned int ai = offset * (2 * k + 1) - 1;
      unsigned int bi = offset * (2 * k + 2) - 1;
      as[bi] += as[ai];
    }
    offset *= 2;
  }

  /* scan back down the tree */
  /* clear the last element */
  if (k == 0)
    as[blockDim.x - 1] = 0;

  // traverse down the tree building the scan in place
  for (unsigned int d = 1; d < blockDim.x; d *= 2) {
    offset >>= 1;
    __syncthreads();
    if (k < d) {
      unsigned int ai = offset * (2 * k + 1) - 1;
      unsigned int bi = offset * (2 * k + 2) - 1;
      unsigned int t = as[ai];
      as[ai] = as[bi];
      as[bi] += t;
    }
  }
  __syncthreads();

  // the last thread knows the block's total number of output bits
  if (k == blockDim.x - 1) {
    outidx[blockIdx.x] = as[k] + codewordlen;
    kcmax = (as[k] + codewordlen) / 32;
  }

  /* Write the codes */
  kc = as[k] / 32;       // first output word touched by this thread
  startbit = as[k] % 32; // bit offset inside that word
  as[k] = 0U;            // the array is reused as the output buffer
  __syncthreads();

  /* Part 1: fill the remainder of word kc */
  wrbits = codewordlen > (32 - startbit) ? (32 - startbit) : codewordlen;
  tmpcw32 = (unsigned int)(cw64 >> (codewordlen - wrbits));
  // shift left in case it's shorter than the available space
  atomicOr(&as[kc], tmpcw32 << (32 - startbit - wrbits));
  codewordlen -= wrbits;

  /* Part 2: up to one full word into kc + 1 */
  if (codewordlen) {
    wrbits = codewordlen > 32 ? 32 : codewordlen;
    const unsigned int mask =
        (wrbits < 32) ? ((1u << wrbits) - 1u) : 0xFFFFFFFFu;
    tmpcw32 = (unsigned int)(cw64 >> (codewordlen - wrbits)) & mask;
    atomicOr(&as[kc + 1], tmpcw32 << (32 - wrbits));
    codewordlen -= wrbits;
  }

  /* Part 3: whatever is left goes into kc + 2 */
  if (codewordlen) {
    tmpcw32 = (unsigned int)(cw64 & ((1u << codewordlen) - 1u));
    atomicOr(&as[kc + 2], tmpcw32 << (32 - codewordlen));
  }

  __syncthreads();

  if (k <= kcmax)
    out[kn] = as[k];
}
|
||||
//////////////////////////////////////////////////////////////////////////////
|
||||
#endif
|
||||
|
||||
#endif
|
Some files were not shown because too many files have changed in this diff Show More
Loading…
Reference in New Issue