Merge branch 'cupbop:master' into master
This commit is contained in:
commit
21f298524e
|
@ -27,10 +27,10 @@ Currently, CuPBoP support serveral CPU backends, including x86, AArch64, and RIS
|
||||||
export CuPBoP_PATH=`pwd`
|
export CuPBoP_PATH=`pwd`
|
||||||
export LD_LIBRARY_PATH=$CuPBoP_PATH/build/runtime:$CuPBoP_PATH/build/runtime/threadPool:$LD_LIBRARY_PATH
|
export LD_LIBRARY_PATH=$CuPBoP_PATH/build/runtime:$CuPBoP_PATH/build/runtime/threadPool:$LD_LIBRARY_PATH
|
||||||
```
|
```
|
||||||
If you are using boson, you can pre-installed llvm 10.0.0
|
|
||||||
|
|
||||||
LLVM_PATH=/opt/llvm-10.0.0
|
If you are using boson, you can pre-installed llvm 10.0.0\
|
||||||
export PATH=$LLVM_PATH/bin:$PATH
|
`LLVM_PATH=/opt/llvm-10.0.0`\
|
||||||
|
`export PATH=$LLVM_PATH/bin:$PATH`
|
||||||
|
|
||||||
2. As CuPBoP relies on CUDA structures, we need to download the CUDA header file
|
2. As CuPBoP relies on CUDA structures, we need to download the CUDA header file
|
||||||
|
|
||||||
|
|
|
@ -272,13 +272,12 @@ void AddContextSaveRestore(llvm::Instruction *instruction,
|
||||||
std::vector<Instruction *> uses;
|
std::vector<Instruction *> uses;
|
||||||
Function *f2 = instruction->getParent()->getParent();
|
Function *f2 = instruction->getParent()->getParent();
|
||||||
|
|
||||||
|
|
||||||
for (Instruction::use_iterator ui = instruction->use_begin(),
|
for (Instruction::use_iterator ui = instruction->use_begin(),
|
||||||
ue = instruction->use_end();
|
ue = instruction->use_end();
|
||||||
ui != ue; ++ui) {
|
ui != ue; ++ui) {
|
||||||
llvm::Instruction *user = cast<Instruction>(ui->getUser());
|
llvm::Instruction *user = cast<Instruction>(ui->getUser());
|
||||||
Function *f1 = user->getParent()->getParent();
|
Function *f1 = user->getParent()->getParent();
|
||||||
if(f2->getName() != f1->getName()) {
|
if (f2->getName() != f1->getName()) {
|
||||||
continue;
|
continue;
|
||||||
}
|
}
|
||||||
if (user == NULL)
|
if (user == NULL)
|
||||||
|
|
|
@ -89,11 +89,12 @@ void mem_share2global(llvm::Module *M) {
|
||||||
} else if (element_type->isStructTy()) {
|
} else if (element_type->isStructTy()) {
|
||||||
auto undef = llvm::UndefValue::get(element_type);
|
auto undef = llvm::UndefValue::get(element_type);
|
||||||
llvm::GlobalVariable *global_memory = new llvm::GlobalVariable(
|
llvm::GlobalVariable *global_memory = new llvm::GlobalVariable(
|
||||||
*M, element_type, false, llvm::GlobalValue::ExternalLinkage, undef,
|
*M, element_type, false, llvm::GlobalValue::ExternalLinkage,
|
||||||
new_name, NULL, llvm::GlobalValue::GeneralDynamicTLSModel, 0,
|
undef, new_name, NULL,
|
||||||
false);
|
llvm::GlobalValue::GeneralDynamicTLSModel, 0, false);
|
||||||
global_memory->setDSOLocal(true);
|
global_memory->setDSOLocal(true);
|
||||||
Comdat * comdat = M->getOrInsertComdat(StringRef(share_memory->getName()));
|
Comdat *comdat =
|
||||||
|
M->getOrInsertComdat(StringRef(share_memory->getName()));
|
||||||
comdat->setSelectionKind(Comdat::SelectionKind::Any);
|
comdat->setSelectionKind(Comdat::SelectionKind::Any);
|
||||||
global_memory->setComdat(comdat);
|
global_memory->setComdat(comdat);
|
||||||
global_memory->setLinkage(llvm::GlobalValue::LinkOnceODRLinkage);
|
global_memory->setLinkage(llvm::GlobalValue::LinkOnceODRLinkage);
|
||||||
|
@ -103,7 +104,6 @@ void mem_share2global(llvm::Module *M) {
|
||||||
std::pair<GlobalVariable *, GlobalVariable *>(share_memory,
|
std::pair<GlobalVariable *, GlobalVariable *>(share_memory,
|
||||||
global_memory));
|
global_memory));
|
||||||
|
|
||||||
|
|
||||||
} else {
|
} else {
|
||||||
assert(0 && "The required Share Memory Type is not supported\n");
|
assert(0 && "The required Share Memory Type is not supported\n");
|
||||||
}
|
}
|
||||||
|
|
|
@ -27,9 +27,9 @@
|
||||||
#ifndef _COMMON_H
|
#ifndef _COMMON_H
|
||||||
#define _COMMON_H
|
#define _COMMON_H
|
||||||
|
|
||||||
//24-bit multiplication is faster on G80,
|
// 24-bit multiplication is faster on G80,
|
||||||
//but we must be sure to multiply integers
|
// but we must be sure to multiply integers
|
||||||
//only within [-8M, 8M - 1] range
|
// only within [-8M, 8M - 1] range
|
||||||
#define IMUL(a, b) __mul24(a, b)
|
#define IMUL(a, b) __mul24(a, b)
|
||||||
|
|
||||||
////cuda timing macros
|
////cuda timing macros
|
||||||
|
@ -42,21 +42,23 @@
|
||||||
// cudaEventSynchronize(cstop); \
|
// cudaEventSynchronize(cstop); \
|
||||||
// cudaEventElapsedTime(&elapsedTime, cstart, cstop)
|
// cudaEventElapsedTime(&elapsedTime, cstart, cstop)
|
||||||
|
|
||||||
//divide and round up macro
|
// divide and round up macro
|
||||||
#define DIVANDRND(a, b) ((((a) % (b)) != 0) ? ((a) / (b) + 1) : ((a) / (b)))
|
#define DIVANDRND(a, b) ((((a) % (b)) != 0) ? ((a) / (b) + 1) : ((a) / (b)))
|
||||||
|
|
||||||
# define cudaCheckError( msg ) { \
|
#define cudaCheckError(msg) \
|
||||||
|
{ \
|
||||||
cudaError_t err = cudaGetLastError(); \
|
cudaError_t err = cudaGetLastError(); \
|
||||||
if( cudaSuccess != err) { \
|
if (cudaSuccess != err) { \
|
||||||
fprintf(stderr, "%s: %i: %s: %s.\n", \
|
fprintf(stderr, "%s: %i: %s: %s.\n", __FILE__, __LINE__, msg, \
|
||||||
__FILE__, __LINE__, msg, cudaGetErrorString( err) ); \
|
cudaGetErrorString(err)); \
|
||||||
exit(-1); \
|
exit(-1); \
|
||||||
} }
|
} \
|
||||||
|
|
||||||
# define cudaCheckAsyncError( msg ) { \
|
|
||||||
cudaThreadSynchronize(); \
|
|
||||||
cudaCheckError( msg ); \
|
|
||||||
}
|
}
|
||||||
|
|
||||||
|
#define cudaCheckAsyncError(msg) \
|
||||||
|
{ \
|
||||||
|
cudaThreadSynchronize(); \
|
||||||
|
cudaCheckError(msg); \
|
||||||
|
}
|
||||||
|
|
||||||
#endif
|
#endif
|
||||||
|
|
|
@ -28,11 +28,12 @@
|
||||||
#define _COMPONENTS_H
|
#define _COMPONENTS_H
|
||||||
|
|
||||||
/* Separate compoents of source 8bit RGB image */
|
/* Separate compoents of source 8bit RGB image */
|
||||||
template<typename T>
|
template <typename T>
|
||||||
void rgbToComponents(T *d_r, T *d_g, T *d_b, unsigned char * src, int width, int height);
|
void rgbToComponents(T *d_r, T *d_g, T *d_b, unsigned char *src, int width,
|
||||||
|
int height);
|
||||||
|
|
||||||
/* Copy a 8bit source image data into a color compoment of type T */
|
/* Copy a 8bit source image data into a color compoment of type T */
|
||||||
template<typename T>
|
template <typename T>
|
||||||
void bwToComponent(T *d_c, unsigned char * src, int width, int height);
|
void bwToComponent(T *d_c, unsigned char *src, int width, int height);
|
||||||
|
|
||||||
#endif
|
#endif
|
||||||
|
|
|
@ -27,14 +27,15 @@
|
||||||
#ifndef _DWT_H
|
#ifndef _DWT_H
|
||||||
#define _DWT_H
|
#define _DWT_H
|
||||||
|
|
||||||
template<typename T>
|
template <typename T>
|
||||||
int nStage2dDWT(T *in, T *out, T * backup, int pixWidth, int pixHeight, int stages, bool forward);
|
int nStage2dDWT(T *in, T *out, T *backup, int pixWidth, int pixHeight,
|
||||||
|
int stages, bool forward);
|
||||||
|
|
||||||
template<typename T>
|
template <typename T>
|
||||||
int writeNStage2DDWT(T *component_cuda, int width, int height,
|
int writeNStage2DDWT(T *component_cuda, int width, int height, int stages,
|
||||||
int stages, const char * filename, const char * suffix);
|
const char *filename, const char *suffix);
|
||||||
template<typename T>
|
template <typename T>
|
||||||
int writeLinear(T *component_cuda, int width, int height,
|
int writeLinear(T *component_cuda, int width, int height, const char *filename,
|
||||||
const char * filename, const char * suffix);
|
const char *suffix);
|
||||||
|
|
||||||
#endif
|
#endif
|
||||||
|
|
|
@ -29,169 +29,146 @@
|
||||||
/// POSSIBILITY OF SUCH DAMAGE.
|
/// POSSIBILITY OF SUCH DAMAGE.
|
||||||
///
|
///
|
||||||
|
|
||||||
|
|
||||||
#ifndef DWT_COMMON_H
|
#ifndef DWT_COMMON_H
|
||||||
#define DWT_COMMON_H
|
#define DWT_COMMON_H
|
||||||
|
|
||||||
|
|
||||||
#include <cstdio>
|
|
||||||
#include <algorithm>
|
#include <algorithm>
|
||||||
|
#include <cstdio>
|
||||||
#include <vector>
|
#include <vector>
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
// compile time minimum macro
|
// compile time minimum macro
|
||||||
#define CTMIN(a,b) (((a) < (b)) ? (a) : (b))
|
#define CTMIN(a, b) (((a) < (b)) ? (a) : (b))
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
// performance testing macros
|
// performance testing macros
|
||||||
#if defined(GPU_DWT_TESTING)
|
#if defined(GPU_DWT_TESTING)
|
||||||
#define PERF_BEGIN \
|
#define PERF_BEGIN \
|
||||||
{ \
|
{ \
|
||||||
dwt_cuda::CudaDWTTester PERF_TESTER; \
|
dwt_cuda::CudaDWTTester PERF_TESTER; \
|
||||||
for(int PERF_N = PERF_TESTER.getNumIterations(); PERF_N--; ) \
|
for (int PERF_N = PERF_TESTER.getNumIterations(); PERF_N--;) { \
|
||||||
{ \
|
|
||||||
PERF_TESTER.beginTestIteration();
|
PERF_TESTER.beginTestIteration();
|
||||||
|
|
||||||
#define PERF_END(PERF_NAME, PERF_W, PERF_H) \
|
#define PERF_END(PERF_NAME, PERF_W, PERF_H) \
|
||||||
PERF_TESTER.endTestIteration(); \
|
PERF_TESTER.endTestIteration(); \
|
||||||
} \
|
} \
|
||||||
PERF_TESTER.showPerformance(PERF_NAME, PERF_W, PERF_H); \
|
PERF_TESTER.showPerformance(PERF_NAME, PERF_W, PERF_H); \
|
||||||
}
|
}
|
||||||
#else // GPU_DWT_TESTING
|
#else // GPU_DWT_TESTING
|
||||||
#define PERF_BEGIN
|
#define PERF_BEGIN
|
||||||
#define PERF_END(PERF_NAME, PERF_W, PERF_H)
|
#define PERF_END(PERF_NAME, PERF_W, PERF_H)
|
||||||
#endif // GPU_DWT_TESTING
|
#endif // GPU_DWT_TESTING
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
namespace dwt_cuda {
|
namespace dwt_cuda {
|
||||||
|
|
||||||
|
/// Divide and round up.
|
||||||
/// Divide and round up.
|
template <typename T>
|
||||||
template <typename T>
|
__device__ __host__ inline T divRndUp(const T &n, const T &d) {
|
||||||
__device__ __host__ inline T divRndUp(const T & n, const T & d) {
|
|
||||||
return (n / d) + ((n % d) ? 1 : 0);
|
return (n / d) + ((n % d) ? 1 : 0);
|
||||||
}
|
}
|
||||||
|
|
||||||
|
// 9/7 forward DWT lifting schema coefficients
|
||||||
|
const float f97Predict1 = -1.586134342; ///< forward 9/7 predict 1
|
||||||
|
const float f97Update1 = -0.05298011854; ///< forward 9/7 update 1
|
||||||
|
const float f97Predict2 = 0.8829110762; ///< forward 9/7 predict 2
|
||||||
|
const float f97Update2 = 0.4435068522; ///< forward 9/7 update 2
|
||||||
|
|
||||||
// 9/7 forward DWT lifting schema coefficients
|
// 9/7 reverse DWT lifting schema coefficients
|
||||||
const float f97Predict1 = -1.586134342; ///< forward 9/7 predict 1
|
const float r97update2 = -f97Update2; ///< undo 9/7 update 2
|
||||||
const float f97Update1 = -0.05298011854; ///< forward 9/7 update 1
|
const float r97predict2 = -f97Predict2; ///< undo 9/7 predict 2
|
||||||
const float f97Predict2 = 0.8829110762; ///< forward 9/7 predict 2
|
const float r97update1 = -f97Update1; ///< undo 9/7 update 1
|
||||||
const float f97Update2 = 0.4435068522; ///< forward 9/7 update 2
|
const float r97Predict1 = -f97Predict1; ///< undo 9/7 predict 1
|
||||||
|
|
||||||
|
// FDWT 9/7 scaling coefficients
|
||||||
|
const float scale97Mul = 1.23017410491400f;
|
||||||
|
const float scale97Div = 1.0 / scale97Mul;
|
||||||
|
|
||||||
// 9/7 reverse DWT lifting schema coefficients
|
// 5/3 forward DWT lifting schema coefficients
|
||||||
const float r97update2 = -f97Update2; ///< undo 9/7 update 2
|
const float forward53Predict = -0.5f; /// forward 5/3 predict
|
||||||
const float r97predict2 = -f97Predict2; ///< undo 9/7 predict 2
|
const float forward53Update = 0.25f; /// forward 5/3 update
|
||||||
const float r97update1 = -f97Update1; ///< undo 9/7 update 1
|
|
||||||
const float r97Predict1 = -f97Predict1; ///< undo 9/7 predict 1
|
|
||||||
|
|
||||||
// FDWT 9/7 scaling coefficients
|
// 5/3 forward DWT lifting schema coefficients
|
||||||
const float scale97Mul = 1.23017410491400f;
|
const float reverse53Update = -forward53Update; /// undo 5/3 update
|
||||||
const float scale97Div = 1.0 / scale97Mul;
|
const float reverse53Predict = -forward53Predict; /// undo 5/3 predict
|
||||||
|
|
||||||
|
/// Functor which adds scaled sum of neighbors to given central pixel.
|
||||||
// 5/3 forward DWT lifting schema coefficients
|
struct AddScaledSum {
|
||||||
const float forward53Predict = -0.5f; /// forward 5/3 predict
|
|
||||||
const float forward53Update = 0.25f; /// forward 5/3 update
|
|
||||||
|
|
||||||
// 5/3 forward DWT lifting schema coefficients
|
|
||||||
const float reverse53Update = -forward53Update; /// undo 5/3 update
|
|
||||||
const float reverse53Predict = -forward53Predict; /// undo 5/3 predict
|
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
/// Functor which adds scaled sum of neighbors to given central pixel.
|
|
||||||
struct AddScaledSum {
|
|
||||||
const float scale; // scale of neighbors
|
const float scale; // scale of neighbors
|
||||||
__device__ AddScaledSum(const float scale) : scale(scale) {}
|
__device__ AddScaledSum(const float scale) : scale(scale) {}
|
||||||
__device__ void operator()(const float p, float & c, const float n) const {
|
__device__ void operator()(const float p, float &c, const float n) const {
|
||||||
|
|
||||||
// if(threadIdx.x == 0) {
|
// if(threadIdx.x == 0) {
|
||||||
|
|
||||||
// printf("scale %f, p %f c %f n %f , result: %f\n", scale, p, c, n, scale * (p + n) );
|
// printf("scale %f, p %f c %f n %f , result: %f\n", scale, p, c, n,
|
||||||
|
// scale * (p + n) );
|
||||||
|
|
||||||
// }
|
// }
|
||||||
|
|
||||||
c += scale * (p + n);
|
c += scale * (p + n);
|
||||||
}
|
}
|
||||||
};
|
};
|
||||||
|
|
||||||
|
/// Returns index ranging from 0 to num threads, such that first half
|
||||||
|
/// of threads get even indices and others get odd indices. Each thread
|
||||||
/// Returns index ranging from 0 to num threads, such that first half
|
/// gets different index.
|
||||||
/// of threads get even indices and others get odd indices. Each thread
|
/// Example: (for 8 threads) threadIdx.x: 0 1 2 3 4 5 6 7
|
||||||
/// gets different index.
|
/// parityIdx: 0 2 4 6 1 3 5 7
|
||||||
/// Example: (for 8 threads) threadIdx.x: 0 1 2 3 4 5 6 7
|
/// @tparam THREADS total count of participating threads
|
||||||
/// parityIdx: 0 2 4 6 1 3 5 7
|
/// @return parity-separated index of thread
|
||||||
/// @tparam THREADS total count of participating threads
|
template <int THREADS> __device__ inline int parityIdx() {
|
||||||
/// @return parity-separated index of thread
|
|
||||||
template <int THREADS>
|
|
||||||
__device__ inline int parityIdx() {
|
|
||||||
return (threadIdx.x * 2) - (THREADS - 1) * (threadIdx.x / (THREADS / 2));
|
return (threadIdx.x * 2) - (THREADS - 1) * (threadIdx.x / (THREADS / 2));
|
||||||
}
|
}
|
||||||
|
|
||||||
|
/// size of shared memory
|
||||||
|
#if defined(__CUDA_ARCH__) && (__CUDA_ARCH__ >= 200)
|
||||||
|
const int SHM_SIZE = 48 * 1024;
|
||||||
|
#else
|
||||||
|
const int SHM_SIZE = 16 * 1024;
|
||||||
|
#endif
|
||||||
|
|
||||||
|
/// Perrformance and return code tester.
|
||||||
/// size of shared memory
|
class CudaDWTTester {
|
||||||
#if defined(__CUDA_ARCH__) && (__CUDA_ARCH__ >= 200)
|
private:
|
||||||
const int SHM_SIZE = 48 * 1024;
|
|
||||||
#else
|
|
||||||
const int SHM_SIZE = 16 * 1024;
|
|
||||||
#endif
|
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
/// Perrformance and return code tester.
|
|
||||||
class CudaDWTTester {
|
|
||||||
private:
|
|
||||||
static bool testRunning; ///< true if any test is currently running
|
static bool testRunning; ///< true if any test is currently running
|
||||||
cudaEvent_t beginEvent; ///< begin CUDA event
|
cudaEvent_t beginEvent; ///< begin CUDA event
|
||||||
cudaEvent_t endEvent; ///< end CUDA event
|
cudaEvent_t endEvent; ///< end CUDA event
|
||||||
std::vector<float> times; ///< collected times
|
std::vector<float> times; ///< collected times
|
||||||
const bool disabled; ///< true if this object is disabled
|
const bool disabled; ///< true if this object is disabled
|
||||||
public:
|
public:
|
||||||
/// Checks CUDA related error.
|
/// Checks CUDA related error.
|
||||||
/// @param status return code to be checked
|
/// @param status return code to be checked
|
||||||
/// @param message message to be shown if there was an error
|
/// @param message message to be shown if there was an error
|
||||||
/// @return true if there was no error, false otherwise
|
/// @return true if there was no error, false otherwise
|
||||||
static bool check(const cudaError_t & status, const char * message) {
|
static bool check(const cudaError_t &status, const char *message) {
|
||||||
#if defined(GPU_DWT_TESTING)
|
#if defined(GPU_DWT_TESTING)
|
||||||
if((!testRunning) && status != cudaSuccess) {
|
if ((!testRunning) && status != cudaSuccess) {
|
||||||
const char * errorString = cudaGetErrorString(status);
|
const char *errorString = cudaGetErrorString(status);
|
||||||
fprintf(stderr, "CUDA ERROR: '%s': %s\n", message, errorString);
|
fprintf(stderr, "CUDA ERROR: '%s': %s\n", message, errorString);
|
||||||
fflush(stderr);
|
fflush(stderr);
|
||||||
return false;
|
return false;
|
||||||
}
|
}
|
||||||
#endif // GPU_DWT_TESTING
|
#endif // GPU_DWT_TESTING
|
||||||
return true;
|
return true;
|
||||||
}
|
}
|
||||||
|
|
||||||
/// Checks last kernel call for errors.
|
/// Checks last kernel call for errors.
|
||||||
/// @param message description of the kernel call
|
/// @param message description of the kernel call
|
||||||
/// @return true if there was no error, false otherwise
|
/// @return true if there was no error, false otherwise
|
||||||
static bool checkLastKernelCall(const char * message) {
|
static bool checkLastKernelCall(const char *message) {
|
||||||
#if defined(GPU_DWT_TESTING)
|
#if defined(GPU_DWT_TESTING)
|
||||||
return testRunning ? true : check(cudaThreadSynchronize(), message);
|
return testRunning ? true : check(cudaThreadSynchronize(), message);
|
||||||
#else // GPU_DWT_TESTING
|
#else // GPU_DWT_TESTING
|
||||||
return true;
|
return true;
|
||||||
#endif // GPU_DWT_TESTING
|
#endif // GPU_DWT_TESTING
|
||||||
}
|
}
|
||||||
|
|
||||||
/// Initializes DWT tester for time measurement
|
/// Initializes DWT tester for time measurement
|
||||||
CudaDWTTester() : disabled(testRunning) {}
|
CudaDWTTester() : disabled(testRunning) {}
|
||||||
|
|
||||||
/// Gets rpefered number of iterations
|
/// Gets rpefered number of iterations
|
||||||
int getNumIterations() {
|
int getNumIterations() { return disabled ? 1 : 31; }
|
||||||
return disabled ? 1 : 31;
|
|
||||||
}
|
|
||||||
|
|
||||||
/// Starts one test iteration.
|
/// Starts one test iteration.
|
||||||
void beginTestIteration() {
|
void beginTestIteration() {
|
||||||
if(!disabled) {
|
if (!disabled) {
|
||||||
cudaEventCreate(&beginEvent);
|
cudaEventCreate(&beginEvent);
|
||||||
cudaEventCreate(&endEvent);
|
cudaEventCreate(&endEvent);
|
||||||
cudaEventRecord(beginEvent, 0);
|
cudaEventRecord(beginEvent, 0);
|
||||||
|
@ -201,7 +178,7 @@ namespace dwt_cuda {
|
||||||
|
|
||||||
/// Ends on etest iteration.
|
/// Ends on etest iteration.
|
||||||
void endTestIteration() {
|
void endTestIteration() {
|
||||||
if(!disabled) {
|
if (!disabled) {
|
||||||
float time;
|
float time;
|
||||||
testRunning = false;
|
testRunning = false;
|
||||||
cudaEventRecord(endEvent, 0);
|
cudaEventRecord(endEvent, 0);
|
||||||
|
@ -217,45 +194,39 @@ namespace dwt_cuda {
|
||||||
/// @param name name of processing method
|
/// @param name name of processing method
|
||||||
/// @param sizeX width of processed image
|
/// @param sizeX width of processed image
|
||||||
/// @param sizeY height of processed image
|
/// @param sizeY height of processed image
|
||||||
void showPerformance(const char * name, const int sizeX, const int sizeY) {
|
void showPerformance(const char *name, const int sizeX, const int sizeY) {
|
||||||
if(!disabled) {
|
if (!disabled) {
|
||||||
// compute mean and median
|
// compute mean and median
|
||||||
std::sort(times.begin(), times.end());
|
std::sort(times.begin(), times.end());
|
||||||
double sum = 0;
|
double sum = 0;
|
||||||
for(int i = times.size(); i--; ) {
|
for (int i = times.size(); i--;) {
|
||||||
sum += times[i];
|
sum += times[i];
|
||||||
}
|
}
|
||||||
const double median = (times[times.size() / 2]
|
const double median =
|
||||||
+ times[(times.size() - 1) / 2]) * 0.5f;
|
(times[times.size() / 2] + times[(times.size() - 1) / 2]) * 0.5f;
|
||||||
printf(" %s: %7.3f ms (mean) %7.3f ms (median) %7.3f ms (max) "
|
printf(" %s: %7.3f ms (mean) %7.3f ms (median) %7.3f ms (max) "
|
||||||
"(%d x %d)\n", name, (sum / times.size()), median,
|
"(%d x %d)\n",
|
||||||
times[times.size() - 1], sizeX, sizeY);
|
name, (sum / times.size()), median, times[times.size() - 1], sizeX,
|
||||||
|
sizeY);
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
};
|
};
|
||||||
|
|
||||||
|
/// Simple cudaMemcpy wrapped in performance tester.
|
||||||
|
/// @param dest destination bufer
|
||||||
/// Simple cudaMemcpy wrapped in performance tester.
|
/// @param src source buffer
|
||||||
/// @param dest destination bufer
|
/// @param sx width of copied image
|
||||||
/// @param src source buffer
|
/// @param sy height of copied image
|
||||||
/// @param sx width of copied image
|
template <typename T>
|
||||||
/// @param sy height of copied image
|
inline void memCopy(T *const dest, const T *const src, const size_t sx,
|
||||||
template <typename T>
|
const size_t sy) {
|
||||||
inline void memCopy(T * const dest, const T * const src,
|
|
||||||
const size_t sx, const size_t sy) {
|
|
||||||
cudaError_t status;
|
cudaError_t status;
|
||||||
PERF_BEGIN
|
PERF_BEGIN
|
||||||
status = cudaMemcpy(dest, src, sx*sy*sizeof(T), cudaMemcpyDeviceToDevice);
|
status = cudaMemcpy(dest, src, sx * sy * sizeof(T), cudaMemcpyDeviceToDevice);
|
||||||
PERF_END(" memcpy", sx, sy)
|
PERF_END(" memcpy", sx, sy)
|
||||||
CudaDWTTester::check(status, "memcpy device > device");
|
CudaDWTTester::check(status, "memcpy device > device");
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
} // end of namespace dwt_cuda
|
} // end of namespace dwt_cuda
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
#endif // DWT_COMMON_CUDA_H
|
#endif // DWT_COMMON_CUDA_H
|
||||||
|
|
||||||
|
|
|
@ -58,55 +58,46 @@
|
||||||
#ifndef DWT_CUDA_H
|
#ifndef DWT_CUDA_H
|
||||||
#define DWT_CUDA_H
|
#define DWT_CUDA_H
|
||||||
|
|
||||||
|
|
||||||
namespace dwt_cuda {
|
namespace dwt_cuda {
|
||||||
|
|
||||||
|
/// Forward 5/3 2D DWT. See common rules (above) for more details.
|
||||||
|
/// @param in Expected to be normalized into range [-128, 127].
|
||||||
|
/// Will not be preserved (will be overwritten).
|
||||||
|
/// @param out output buffer on GPU
|
||||||
|
/// @param sizeX width of input image (in pixels)
|
||||||
|
/// @param sizeY height of input image (in pixels)
|
||||||
|
/// @param levels number of recursive DWT levels
|
||||||
|
void fdwt53(int *in, int *out, int sizeX, int sizeY, int levels);
|
||||||
|
|
||||||
/// Forward 5/3 2D DWT. See common rules (above) for more details.
|
/// Reverse 5/3 2D DWT. See common rules (above) for more details.
|
||||||
/// @param in Expected to be normalized into range [-128, 127].
|
/// @param in Input DWT coefficients. Format described in common rules.
|
||||||
/// Will not be preserved (will be overwritten).
|
/// Will not be preserved (will be overwritten).
|
||||||
/// @param out output buffer on GPU
|
/// @param out output buffer on GPU - will contain original image
|
||||||
/// @param sizeX width of input image (in pixels)
|
/// in normalized range [-128, 127].
|
||||||
/// @param sizeY height of input image (in pixels)
|
/// @param sizeX width of input image (in pixels)
|
||||||
/// @param levels number of recursive DWT levels
|
/// @param sizeY height of input image (in pixels)
|
||||||
void fdwt53(int * in, int * out, int sizeX, int sizeY, int levels);
|
/// @param levels number of recursive DWT levels
|
||||||
|
void rdwt53(int *in, int *out, int sizeX, int sizeY, int levels);
|
||||||
|
|
||||||
|
/// Forward 9/7 2D DWT. See common rules (above) for more details.
|
||||||
|
/// @param in Input DWT coefficients. Should be normalized (in range
|
||||||
|
/// [-0.5, 0.5]). Will not be preserved (will be overwritten).
|
||||||
|
/// @param out output buffer on GPU - format specified in common rules
|
||||||
|
/// @param sizeX width of input image (in pixels)
|
||||||
|
/// @param sizeY height of input image (in pixels)
|
||||||
|
/// @param levels number of recursive DWT levels
|
||||||
|
void fdwt97(float *in, float *out, int sizeX, int sizeY, int levels);
|
||||||
|
|
||||||
/// Reverse 5/3 2D DWT. See common rules (above) for more details.
|
/// Reverse 9/7 2D DWT. See common rules (above) for more details.
|
||||||
/// @param in Input DWT coefficients. Format described in common rules.
|
/// @param in Input DWT coefficients. Format described in common rules.
|
||||||
/// Will not be preserved (will be overwritten).
|
/// Will not be preserved (will be overwritten).
|
||||||
/// @param out output buffer on GPU - will contain original image
|
/// @param out output buffer on GPU - will contain original image
|
||||||
/// in normalized range [-128, 127].
|
/// in normalized range [-0.5, 0.5].
|
||||||
/// @param sizeX width of input image (in pixels)
|
/// @param sizeX width of input image (in pixels)
|
||||||
/// @param sizeY height of input image (in pixels)
|
/// @param sizeY height of input image (in pixels)
|
||||||
/// @param levels number of recursive DWT levels
|
/// @param levels number of recursive DWT levels
|
||||||
void rdwt53(int * in, int * out, int sizeX, int sizeY, int levels);
|
void rdwt97(float *in, float *out, int sizeX, int sizeY, int levels);
|
||||||
|
|
||||||
|
|
||||||
/// Forward 9/7 2D DWT. See common rules (above) for more details.
|
|
||||||
/// @param in Input DWT coefficients. Should be normalized (in range
|
|
||||||
/// [-0.5, 0.5]). Will not be preserved (will be overwritten).
|
|
||||||
/// @param out output buffer on GPU - format specified in common rules
|
|
||||||
/// @param sizeX width of input image (in pixels)
|
|
||||||
/// @param sizeY height of input image (in pixels)
|
|
||||||
/// @param levels number of recursive DWT levels
|
|
||||||
void fdwt97(float * in, float * out, int sizeX, int sizeY, int levels);
|
|
||||||
|
|
||||||
|
|
||||||
/// Reverse 9/7 2D DWT. See common rules (above) for more details.
|
|
||||||
/// @param in Input DWT coefficients. Format described in common rules.
|
|
||||||
/// Will not be preserved (will be overwritten).
|
|
||||||
/// @param out output buffer on GPU - will contain original image
|
|
||||||
/// in normalized range [-0.5, 0.5].
|
|
||||||
/// @param sizeX width of input image (in pixels)
|
|
||||||
/// @param sizeY height of input image (in pixels)
|
|
||||||
/// @param levels number of recursive DWT levels
|
|
||||||
void rdwt97(float * in, float * out, int sizeX, int sizeY, int levels);
|
|
||||||
|
|
||||||
|
|
||||||
} // namespace dwt_cuda
|
} // namespace dwt_cuda
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
#endif // DWT_CUDA_H
|
#endif // DWT_CUDA_H
|
||||||
|
|
||||||
|
|
|
@ -30,64 +30,59 @@
|
||||||
/// POSSIBILITY OF SUCH DAMAGE.
|
/// POSSIBILITY OF SUCH DAMAGE.
|
||||||
///
|
///
|
||||||
|
|
||||||
|
|
||||||
#ifndef IO_H
|
#ifndef IO_H
|
||||||
#define IO_H
|
#define IO_H
|
||||||
|
|
||||||
|
|
||||||
#include "common.h"
|
#include "common.h"
|
||||||
|
|
||||||
namespace dwt_cuda {
|
namespace dwt_cuda {
|
||||||
|
|
||||||
|
/// Base for all IO classes - manages mirroring.
|
||||||
/// Base for all IO classes - manages mirroring.
|
class DWTIO {
|
||||||
class DWTIO {
|
protected:
|
||||||
protected:
|
|
||||||
/// Handles mirroring of image at edges in a DWT correct way.
|
/// Handles mirroring of image at edges in a DWT correct way.
|
||||||
/// @param d a position in the image (will be replaced by mirrored d)
|
/// @param d a position in the image (will be replaced by mirrored d)
|
||||||
/// @param sizeD size of the image along the dimension of 'd'
|
/// @param sizeD size of the image along the dimension of 'd'
|
||||||
__device__ static void mirror(int & d, const int & sizeD) {
|
__device__ static void mirror(int &d, const int &sizeD) {
|
||||||
// TODO: enable multiple mirroring:
|
// TODO: enable multiple mirroring:
|
||||||
// if(sizeD > 1) {
|
// if(sizeD > 1) {
|
||||||
// if(d < 0) {
|
// if(d < 0) {
|
||||||
// const int underflow = -1 - d;
|
// const int underflow = -1 - d;
|
||||||
// const int phase = (underflow / (sizeD - 1)) & 1;
|
// const int phase = (underflow / (sizeD - 1)) & 1;
|
||||||
// const int remainder = underflow % (sizeD - 1);
|
// const int remainder = underflow % (sizeD - 1);
|
||||||
// if(phase == 0) {
|
// if(phase == 0) {
|
||||||
// d = remainder + 1;
|
// d = remainder + 1;
|
||||||
// } else {
|
// } else {
|
||||||
// d = sizeD - 2 - remainder;
|
// d = sizeD - 2 - remainder;
|
||||||
// }
|
// }
|
||||||
// } else if(d >= sizeD) {
|
// } else if(d >= sizeD) {
|
||||||
// const int overflow = d - sizeD;
|
// const int overflow = d - sizeD;
|
||||||
// const int phase = (overflow / (sizeD - 1)) & 1;
|
// const int phase = (overflow / (sizeD - 1)) & 1;
|
||||||
// const int remainder = overflow % (sizeD - 1);
|
// const int remainder = overflow % (sizeD - 1);
|
||||||
// if(phase == 0) {
|
// if(phase == 0) {
|
||||||
// d = sizeD - 2 - remainder;
|
// d = sizeD - 2 - remainder;
|
||||||
// } else {
|
// } else {
|
||||||
// d = remainder + 1;
|
// d = remainder + 1;
|
||||||
// }
|
// }
|
||||||
// }
|
// }
|
||||||
// } else {
|
// } else {
|
||||||
// d = 0;
|
// d = 0;
|
||||||
// }
|
// }
|
||||||
//for test the mirror's use Feb 17
|
// for test the mirror's use Feb 17
|
||||||
if(d >= sizeD) {
|
if (d >= sizeD) {
|
||||||
d = 2 * sizeD - 2 - d;
|
d = 2 * sizeD - 2 - d;
|
||||||
} else if(d < 0) {
|
} else if (d < 0) {
|
||||||
d = -d;
|
d = -d;
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
};
|
};
|
||||||
|
|
||||||
|
/// Base class for pixel loader and writer - manages computing start index,
|
||||||
/// Base class for pixel loader and writer - manages computing start index,
|
/// stride and end of image for loading column of pixels.
|
||||||
/// stride and end of image for loading column of pixels.
|
/// @tparam T type of image pixels
|
||||||
/// @tparam T type of image pixels
|
/// @tparam CHECKED true = be prepared to image boundary, false = don't care
|
||||||
/// @tparam CHECKED true = be prepared to image boundary, false = don't care
|
template <typename T, bool CHECKED> class VerticalDWTPixelIO : protected DWTIO {
|
||||||
template <typename T, bool CHECKED>
|
protected:
|
||||||
class VerticalDWTPixelIO : protected DWTIO {
|
|
||||||
protected:
|
|
||||||
int end; ///< index of bottom neightbor of last pixel of column
|
int end; ///< index of bottom neightbor of last pixel of column
|
||||||
int stride; ///< increment of pointer to get to next pixel
|
int stride; ///< increment of pointer to get to next pixel
|
||||||
|
|
||||||
|
@ -97,34 +92,32 @@ namespace dwt_cuda {
|
||||||
/// @param firstX x-coordinate of first pixel to use
|
/// @param firstX x-coordinate of first pixel to use
|
||||||
/// @param firstY y-coordinate of first pixel to use
|
/// @param firstY y-coordinate of first pixel to use
|
||||||
/// @return index of pixel at position [x, y] in the image
|
/// @return index of pixel at position [x, y] in the image
|
||||||
__device__ int initialize(const int sizeX, const int sizeY,
|
__device__ int initialize(const int sizeX, const int sizeY, int firstX,
|
||||||
int firstX, int firstY) {
|
int firstY) {
|
||||||
// initialize all pointers and stride
|
// initialize all pointers and stride
|
||||||
end = CHECKED ? (sizeY * sizeX + firstX) : 0;
|
end = CHECKED ? (sizeY * sizeX + firstX) : 0;
|
||||||
stride = sizeX;
|
stride = sizeX;
|
||||||
return firstX + sizeX * firstY;
|
return firstX + sizeX * firstY;
|
||||||
}
|
}
|
||||||
};
|
};
|
||||||
|
|
||||||
|
/// Writes reverse transformed pixels directly into output image.
|
||||||
|
/// @tparam T type of output pixels
|
||||||
/// Writes reverse transformed pixels directly into output image.
|
/// @tparam CHECKED true = be prepared to image boundary, false = don't care
|
||||||
/// @tparam T type of output pixels
|
template <typename T, bool CHECKED>
|
||||||
/// @tparam CHECKED true = be prepared to image boundary, false = don't care
|
class VerticalDWTPixelWriter : VerticalDWTPixelIO<T, CHECKED> {
|
||||||
template <typename T, bool CHECKED>
|
private:
|
||||||
class VerticalDWTPixelWriter : VerticalDWTPixelIO<T, CHECKED> {
|
|
||||||
private:
|
|
||||||
int next; // index of the next pixel to be loaded
|
int next; // index of the next pixel to be loaded
|
||||||
|
|
||||||
public:
|
public:
|
||||||
/// Initializes writer - sets output buffer and a position of first pixel.
|
/// Initializes writer - sets output buffer and a position of first pixel.
|
||||||
/// @param sizeX width of the image
|
/// @param sizeX width of the image
|
||||||
/// @param sizeY height of the image
|
/// @param sizeY height of the image
|
||||||
/// @param firstX x-coordinate of first pixel to write into
|
/// @param firstX x-coordinate of first pixel to write into
|
||||||
/// @param firstY y-coordinate of first pixel to write into
|
/// @param firstY y-coordinate of first pixel to write into
|
||||||
__device__ void init(const int sizeX, const int sizeY,
|
__device__ void init(const int sizeX, const int sizeY, int firstX,
|
||||||
int firstX, int firstY) {
|
int firstY) {
|
||||||
if(firstX < sizeX) {
|
if (firstX < sizeX) {
|
||||||
next = this->initialize(sizeX, sizeY, firstX, firstY);
|
next = this->initialize(sizeX, sizeY, firstX, firstY);
|
||||||
} else {
|
} else {
|
||||||
this->end = 0;
|
this->end = 0;
|
||||||
|
@ -137,57 +130,42 @@ namespace dwt_cuda {
|
||||||
/// correctly handling mirroring.
|
/// correctly handling mirroring.
|
||||||
/// @param output output image to write pixel into
|
/// @param output output image to write pixel into
|
||||||
/// @param value value of the pixel to be written
|
/// @param value value of the pixel to be written
|
||||||
__device__ void writeInto(T * const output, const T & value) {
|
__device__ void writeInto(T *const output, const T &value) {
|
||||||
if((!CHECKED) || (next != this->end)) {
|
if ((!CHECKED) || (next != this->end)) {
|
||||||
output[next] = value;
|
output[next] = value;
|
||||||
next += this->stride;
|
next += this->stride;
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
};
|
};
|
||||||
|
|
||||||
|
/// Loads pixels from input image.
|
||||||
|
/// @tparam T type of image input pixels
|
||||||
/// Loads pixels from input image.
|
/// @tparam CHECKED true = be prepared to image boundary, false = don't care
|
||||||
/// @tparam T type of image input pixels
|
template <typename T, bool CHECKED>
|
||||||
/// @tparam CHECKED true = be prepared to image boundary, false = don't care
|
class VerticalDWTPixelLoader : protected VerticalDWTPixelIO<const T, CHECKED> {
|
||||||
template <typename T, bool CHECKED>
|
private:
|
||||||
class VerticalDWTPixelLoader
|
|
||||||
: protected VerticalDWTPixelIO<const T, CHECKED> {
|
|
||||||
private:
|
|
||||||
int last; ///< index of last loaded pixel
|
int last; ///< index of last loaded pixel
|
||||||
public:
|
public:
|
||||||
|
|
||||||
|
|
||||||
//******************* FOR TEST **********************
|
//******************* FOR TEST **********************
|
||||||
__device__ int getlast(){
|
__device__ int getlast() { return last; }
|
||||||
return last;
|
__device__ int getend() { return this->end; }
|
||||||
}
|
__device__ int getstride() { return this->stride; }
|
||||||
__device__ int getend(){
|
__device__ void setend(int a) { this->end = a; }
|
||||||
return this->end;
|
|
||||||
}
|
|
||||||
__device__ int getstride(){
|
|
||||||
return this->stride;
|
|
||||||
}
|
|
||||||
__device__ void setend(int a){
|
|
||||||
this->end=a;
|
|
||||||
}
|
|
||||||
//******************* FOR TEST **********************
|
//******************* FOR TEST **********************
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
/// Initializes loader - sets input size and a position of first pixel.
|
/// Initializes loader - sets input size and a position of first pixel.
|
||||||
/// @param sizeX width of the image
|
/// @param sizeX width of the image
|
||||||
/// @param sizeY height of the image
|
/// @param sizeY height of the image
|
||||||
/// @param firstX x-coordinate of first pixel to load
|
/// @param firstX x-coordinate of first pixel to load
|
||||||
/// @param firstY y-coordinate of first pixel to load
|
/// @param firstY y-coordinate of first pixel to load
|
||||||
__device__ void init(const int sizeX, const int sizeY,
|
__device__ void init(const int sizeX, const int sizeY, int firstX,
|
||||||
int firstX, int firstY) {
|
int firstY) {
|
||||||
// correctly mirror x coordinate
|
// correctly mirror x coordinate
|
||||||
this->mirror(firstX, sizeX);
|
this->mirror(firstX, sizeX);
|
||||||
|
|
||||||
// 'last' always points to already loaded pixel (subtract sizeX = stride)
|
// 'last' always points to already loaded pixel (subtract sizeX = stride)
|
||||||
last = this->initialize(sizeX, sizeY, firstX, firstY) - sizeX;
|
last = this->initialize(sizeX, sizeY, firstX, firstY) - sizeX;
|
||||||
//last = (FirstX + sizeX * FirstY) - sizeX
|
// last = (FirstX + sizeX * FirstY) - sizeX
|
||||||
}
|
}
|
||||||
|
|
||||||
/// Sets all fields to zeros, for compiler not to complain about
|
/// Sets all fields to zeros, for compiler not to complain about
|
||||||
|
@ -201,15 +179,16 @@ namespace dwt_cuda {
|
||||||
/// Gets another pixel and advancees internal pointer to following one.
|
/// Gets another pixel and advancees internal pointer to following one.
|
||||||
/// @param input input image to load next pixel from
|
/// @param input input image to load next pixel from
|
||||||
/// @return next pixel from given image
|
/// @return next pixel from given image
|
||||||
__device__ T loadFrom(const T * const input) {
|
__device__ T loadFrom(const T *const input) {
|
||||||
last += this->stride;
|
last += this->stride;
|
||||||
if(CHECKED && (last == this->end)) {
|
if (CHECKED && (last == this->end)) {
|
||||||
last -= 2 * this->stride;
|
last -= 2 * this->stride;
|
||||||
this->stride = -this->stride; // reverse loader's direction
|
this->stride = -this->stride; // reverse loader's direction
|
||||||
}
|
}
|
||||||
// avoid reading from negative indices if loader is checked
|
// avoid reading from negative indices if loader is checked
|
||||||
// return (CHECKED && (last < 0)) ? 0 : input[last]; // TODO: use this checked variant later
|
// return (CHECKED && (last < 0)) ? 0 : input[last]; // TODO: use this
|
||||||
if(last < 0 ) {
|
// checked variant later
|
||||||
|
if (last < 0) {
|
||||||
return 0;
|
return 0;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
@ -218,17 +197,14 @@ namespace dwt_cuda {
|
||||||
// return last;
|
// return last;
|
||||||
// return this->stride;
|
// return this->stride;
|
||||||
}
|
}
|
||||||
};
|
};
|
||||||
|
|
||||||
|
/// Base for band write and loader. Manages computing strides and pointers
|
||||||
|
/// to first and last pixels in a linearly-stored-bands correct way.
|
||||||
/// Base for band write and loader. Manages computing strides and pointers
|
/// @tparam T type of band coefficients
|
||||||
/// to first and last pixels in a linearly-stored-bands correct way.
|
/// @tparam CHECKED true = be prepared to image boundary, false = don't care
|
||||||
/// @tparam T type of band coefficients
|
template <typename T, bool CHECKED> class VerticalDWTBandIO : protected DWTIO {
|
||||||
/// @tparam CHECKED true = be prepared to image boundary, false = don't care
|
protected:
|
||||||
template <typename T, bool CHECKED>
|
|
||||||
class VerticalDWTBandIO : protected DWTIO {
|
|
||||||
protected:
|
|
||||||
/// index of bottom neighbor of last pixel of loaded column
|
/// index of bottom neighbor of last pixel of loaded column
|
||||||
int end;
|
int end;
|
||||||
|
|
||||||
|
@ -256,7 +232,7 @@ namespace dwt_cuda {
|
||||||
int verticalStride;
|
int verticalStride;
|
||||||
|
|
||||||
// resolve index of first pixel according to horizontal parity
|
// resolve index of first pixel according to horizontal parity
|
||||||
if(firstX & 1) {
|
if (firstX & 1) {
|
||||||
// first pixel in one of right bands
|
// first pixel in one of right bands
|
||||||
verticalStride = imageSizeX / 2;
|
verticalStride = imageSizeX / 2;
|
||||||
columnOffset += divRndUp(imageSizeX, 2) * divRndUp(imageSizeY, 2);
|
columnOffset += divRndUp(imageSizeX, 2) * divRndUp(imageSizeY, 2);
|
||||||
|
@ -271,7 +247,7 @@ namespace dwt_cuda {
|
||||||
strideHighToLow = verticalStride - strideLowToHigh;
|
strideHighToLow = verticalStride - strideLowToHigh;
|
||||||
|
|
||||||
// compute index of coefficient which indicates end of image
|
// compute index of coefficient which indicates end of image
|
||||||
if(CHECKED) {
|
if (CHECKED) {
|
||||||
end = columnOffset // right column
|
end = columnOffset // right column
|
||||||
+ (imageSizeY / 2) * verticalStride // right row
|
+ (imageSizeY / 2) * verticalStride // right row
|
||||||
+ (imageSizeY & 1) * strideLowToHigh; // possibly in high band
|
+ (imageSizeY & 1) * strideLowToHigh; // possibly in high band
|
||||||
|
@ -279,29 +255,24 @@ namespace dwt_cuda {
|
||||||
end = 0;
|
end = 0;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
||||||
//***********for test**************
|
//***********for test**************
|
||||||
// end = CHECKED;
|
// end = CHECKED;
|
||||||
//***********for test**************
|
//***********for test**************
|
||||||
|
|
||||||
|
|
||||||
// finally, return index of the first item
|
// finally, return index of the first item
|
||||||
return columnOffset // right column
|
return columnOffset // right column
|
||||||
+ (firstY / 2) * verticalStride // right row
|
+ (firstY / 2) * verticalStride // right row
|
||||||
+ (firstY & 1) * strideLowToHigh; // possibly in high band
|
+ (firstY & 1) * strideLowToHigh; // possibly in high band
|
||||||
}
|
}
|
||||||
};
|
};
|
||||||
|
|
||||||
|
/// Directly loads coefficients from four consecutively stored transformed
|
||||||
|
/// bands.
|
||||||
|
/// @tparam T type of input band coefficients
|
||||||
/// Directly loads coefficients from four consecutively stored transformed
|
/// @tparam CHECKED true = be prepared to image boundary, false = don't care
|
||||||
/// bands.
|
template <typename T, bool CHECKED>
|
||||||
/// @tparam T type of input band coefficients
|
class VerticalDWTBandLoader : public VerticalDWTBandIO<const T, CHECKED> {
|
||||||
/// @tparam CHECKED true = be prepared to image boundary, false = don't care
|
private:
|
||||||
template <typename T, bool CHECKED>
|
|
||||||
class VerticalDWTBandLoader : public VerticalDWTBandIO<const T, CHECKED> {
|
|
||||||
private:
|
|
||||||
int last; ///< index of last loaded pixel
|
int last; ///< index of last loaded pixel
|
||||||
|
|
||||||
/// Checks internal index and possibly reverses direction of loader.
|
/// Checks internal index and possibly reverses direction of loader.
|
||||||
|
@ -309,9 +280,9 @@ namespace dwt_cuda {
|
||||||
/// @param input input image to load next coefficient from
|
/// @param input input image to load next coefficient from
|
||||||
/// @param stride stride to use now (one of two loader's strides)
|
/// @param stride stride to use now (one of two loader's strides)
|
||||||
/// @return loaded coefficient
|
/// @return loaded coefficient
|
||||||
__device__ T updateAndLoad(const T * const input, const int & stride) {
|
__device__ T updateAndLoad(const T *const input, const int &stride) {
|
||||||
last += stride;
|
last += stride;
|
||||||
if(CHECKED && (last == this->end)) {
|
if (CHECKED && (last == this->end)) {
|
||||||
// undo last two updates of index (to get to previous mirrored item)
|
// undo last two updates of index (to get to previous mirrored item)
|
||||||
last -= (this->strideLowToHigh + this->strideHighToLow);
|
last -= (this->strideLowToHigh + this->strideHighToLow);
|
||||||
|
|
||||||
|
@ -320,15 +291,16 @@ namespace dwt_cuda {
|
||||||
this->strideLowToHigh = -this->strideHighToLow;
|
this->strideLowToHigh = -this->strideHighToLow;
|
||||||
this->strideHighToLow = -temp;
|
this->strideHighToLow = -temp;
|
||||||
}
|
}
|
||||||
if(last < 0 ) {
|
if (last < 0) {
|
||||||
return 0;
|
return 0;
|
||||||
}
|
}
|
||||||
// avoid reading from negative indices if loader is checked
|
// avoid reading from negative indices if loader is checked
|
||||||
// return (CHECKED && (last < 0)) ? 0 : input[last]; // TODO: use this checked variant later
|
// return (CHECKED && (last < 0)) ? 0 : input[last]; // TODO: use this
|
||||||
|
// checked variant later
|
||||||
return input[last];
|
return input[last];
|
||||||
}
|
}
|
||||||
public:
|
|
||||||
|
|
||||||
|
public:
|
||||||
/// Initializes loader - sets input size and a position of first pixel.
|
/// Initializes loader - sets input size and a position of first pixel.
|
||||||
/// @param imageSizeX width of the image
|
/// @param imageSizeX width of the image
|
||||||
/// @param imageSizeY height of the image
|
/// @param imageSizeY height of the image
|
||||||
|
@ -336,8 +308,8 @@ namespace dwt_cuda {
|
||||||
/// (Parity determines vertically low or high band.)
|
/// (Parity determines vertically low or high band.)
|
||||||
/// @param firstY y-coordinate of first pixel to load
|
/// @param firstY y-coordinate of first pixel to load
|
||||||
/// (Parity determines horizontally low or high band.)
|
/// (Parity determines horizontally low or high band.)
|
||||||
__device__ void init(const int imageSizeX, const int imageSizeY,
|
__device__ void init(const int imageSizeX, const int imageSizeY, int firstX,
|
||||||
int firstX, const int firstY) {
|
const int firstY) {
|
||||||
this->mirror(firstX, imageSizeX);
|
this->mirror(firstX, imageSizeX);
|
||||||
last = this->initialize(imageSizeX, imageSizeY, firstX, firstY);
|
last = this->initialize(imageSizeX, imageSizeY, firstX, firstY);
|
||||||
|
|
||||||
|
@ -359,7 +331,7 @@ namespace dwt_cuda {
|
||||||
/// was in high band.
|
/// was in high band.
|
||||||
/// @param input input image to load next coefficient from
|
/// @param input input image to load next coefficient from
|
||||||
/// @return next coefficient from the lowpass band of the given image
|
/// @return next coefficient from the lowpass band of the given image
|
||||||
__device__ T loadLowFrom(const T * const input) {
|
__device__ T loadLowFrom(const T *const input) {
|
||||||
return updateAndLoad(input, this->strideHighToLow);
|
return updateAndLoad(input, this->strideHighToLow);
|
||||||
}
|
}
|
||||||
|
|
||||||
|
@ -368,21 +340,17 @@ namespace dwt_cuda {
|
||||||
/// was in high band.
|
/// was in high band.
|
||||||
/// @param input input image to load next coefficient from
|
/// @param input input image to load next coefficient from
|
||||||
/// @return next coefficient from the highbass band of the given image
|
/// @return next coefficient from the highbass band of the given image
|
||||||
__device__ T loadHighFrom(const T * const input) {
|
__device__ T loadHighFrom(const T *const input) {
|
||||||
return updateAndLoad(input, this->strideLowToHigh);
|
return updateAndLoad(input, this->strideLowToHigh);
|
||||||
}
|
}
|
||||||
|
};
|
||||||
|
|
||||||
};
|
/// Directly saves coefficients into four transformed bands.
|
||||||
|
/// @tparam T type of output band coefficients
|
||||||
|
/// @tparam CHECKED true = be prepared to image boundary, false = don't care
|
||||||
|
template <typename T, bool CHECKED>
|
||||||
|
class VerticalDWTBandWriter : public VerticalDWTBandIO<T, CHECKED> {
|
||||||
/// Directly saves coefficients into four transformed bands.
|
private:
|
||||||
/// @tparam T type of output band coefficients
|
|
||||||
/// @tparam CHECKED true = be prepared to image boundary, false = don't care
|
|
||||||
template <typename T, bool CHECKED>
|
|
||||||
class VerticalDWTBandWriter : public VerticalDWTBandIO<T, CHECKED> {
|
|
||||||
private:
|
|
||||||
int next; ///< index of last loaded pixel
|
int next; ///< index of last loaded pixel
|
||||||
|
|
||||||
/// Checks internal index and possibly stops the writer.
|
/// Checks internal index and possibly stops the writer.
|
||||||
|
@ -390,25 +358,26 @@ namespace dwt_cuda {
|
||||||
/// @param output output buffer
|
/// @param output output buffer
|
||||||
/// @param item item to put into the output
|
/// @param item item to put into the output
|
||||||
/// @param stride increment of the pointer to get to next output index
|
/// @param stride increment of the pointer to get to next output index
|
||||||
__device__ int saveAndUpdate(T * const output, const T & item,
|
__device__ int saveAndUpdate(T *const output, const T &item,
|
||||||
const int & stride) {
|
const int &stride) {
|
||||||
// if(blockIdx.x == 0 && blockIdx.y == 11 && threadIdx.x == 0){ //test, Mar 20
|
// if(blockIdx.x == 0 && blockIdx.y == 11 && threadIdx.x == 0){
|
||||||
if((!CHECKED) || (next != this->end)) {
|
////test, Mar 20
|
||||||
|
if ((!CHECKED) || (next != this->end)) {
|
||||||
// if(next == 4) {
|
// if(next == 4) {
|
||||||
// printf(" next: %d stride: %d val: %f \n", next, stride, item );
|
// printf(" next: %d stride: %d val: %f \n", next, stride, item );
|
||||||
// }
|
// }
|
||||||
output[next] = item;
|
output[next] = item;
|
||||||
next += stride;
|
next += stride;
|
||||||
}
|
}
|
||||||
// }
|
// }
|
||||||
// if((!CHECKED) || (next != this->end)) { //the real one
|
// if((!CHECKED) || (next != this->end)) { //the real one
|
||||||
// output[next] = item;
|
// output[next] = item;
|
||||||
// next += stride; //stride has been test
|
// next += stride; //stride has been test
|
||||||
// }
|
// }
|
||||||
return next;
|
return next;
|
||||||
}
|
}
|
||||||
public:
|
|
||||||
|
|
||||||
|
public:
|
||||||
/// Initializes writer - sets output size and a position of first pixel.
|
/// Initializes writer - sets output size and a position of first pixel.
|
||||||
/// @param output output image
|
/// @param output output image
|
||||||
/// @param imageSizeX width of the image
|
/// @param imageSizeX width of the image
|
||||||
|
@ -441,7 +410,7 @@ namespace dwt_cuda {
|
||||||
/// was in lowpass band.
|
/// was in lowpass band.
|
||||||
/// @param output output image
|
/// @param output output image
|
||||||
/// @param low lowpass coefficient to save into the lowpass band
|
/// @param low lowpass coefficient to save into the lowpass band
|
||||||
__device__ int writeLowInto(T * const output, const T & primary) {
|
__device__ int writeLowInto(T *const output, const T &primary) {
|
||||||
return saveAndUpdate(output, primary, this->strideLowToHigh);
|
return saveAndUpdate(output, primary, this->strideLowToHigh);
|
||||||
}
|
}
|
||||||
|
|
||||||
|
@ -450,34 +419,22 @@ namespace dwt_cuda {
|
||||||
/// was in highpass band.
|
/// was in highpass band.
|
||||||
/// @param output output image
|
/// @param output output image
|
||||||
/// @param high highpass coefficient to save into the highpass band
|
/// @param high highpass coefficient to save into the highpass band
|
||||||
__device__ int writeHighInto(T * const output, const T & other) {
|
__device__ int writeHighInto(T *const output, const T &other) {
|
||||||
return saveAndUpdate(output, other, this->strideHighToLow);
|
return saveAndUpdate(output, other, this->strideHighToLow);
|
||||||
}
|
}
|
||||||
|
|
||||||
//*******Add three functions to get private values*******
|
//*******Add three functions to get private values*******
|
||||||
__device__ int getnext(){
|
__device__ int getnext() { return next; }
|
||||||
return next;
|
|
||||||
}
|
|
||||||
|
|
||||||
__device__ int getend(){
|
__device__ int getend() { return this->end; }
|
||||||
return this->end;
|
|
||||||
}
|
|
||||||
|
|
||||||
__device__ int getstrideHighToLow(){
|
__device__ int getstrideHighToLow() { return this->strideHighToLow; }
|
||||||
return this->strideHighToLow;
|
|
||||||
}
|
|
||||||
|
|
||||||
__device__ int getstrideLowToHigh(){
|
__device__ int getstrideLowToHigh() { return this->strideLowToHigh; }
|
||||||
return this->strideLowToHigh;
|
|
||||||
}
|
|
||||||
|
|
||||||
//*******Add three functions to get private values*******
|
//*******Add three functions to get private values*******
|
||||||
};
|
};
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
} // namespace dwt_cuda
|
} // namespace dwt_cuda
|
||||||
|
|
||||||
|
|
||||||
#endif // IO_H
|
#endif // IO_H
|
||||||
|
|
||||||
|
|
|
@ -30,41 +30,38 @@
|
||||||
/// POSSIBILITY OF SUCH DAMAGE.
|
/// POSSIBILITY OF SUCH DAMAGE.
|
||||||
///
|
///
|
||||||
|
|
||||||
|
|
||||||
#ifndef TRANSFORM_BUFFER_H
|
#ifndef TRANSFORM_BUFFER_H
|
||||||
#define TRANSFORM_BUFFER_H
|
#define TRANSFORM_BUFFER_H
|
||||||
|
|
||||||
|
|
||||||
namespace dwt_cuda {
|
namespace dwt_cuda {
|
||||||
|
|
||||||
|
/// Buffer (in shared memory of GPU) where block of input image is stored,
|
||||||
/// Buffer (in shared memory of GPU) where block of input image is stored,
|
/// but odd and even lines are separated. (Generates less bank conflicts when
|
||||||
/// but odd and even lines are separated. (Generates less bank conflicts when
|
/// using lifting schema.) All operations expect SIZE_X threads.
|
||||||
/// using lifting schema.) All operations expect SIZE_X threads.
|
/// Also implements basic building blocks of lifting schema.
|
||||||
/// Also implements basic building blocks of lifting schema.
|
/// @tparam SIZE_X width of the buffer excluding two boundaries (Also
|
||||||
/// @tparam SIZE_X width of the buffer excluding two boundaries (Also
|
/// a number of threads participating on all operations.)
|
||||||
/// a number of threads participating on all operations.)
|
/// Must be divisible by 4.
|
||||||
/// Must be divisible by 4.
|
/// @tparam SIZE_Y height of buffer (total number of lines)
|
||||||
/// @tparam SIZE_Y height of buffer (total number of lines)
|
/// @tparam BOUNDARY_X number of extra pixels at the left and right side
|
||||||
/// @tparam BOUNDARY_X number of extra pixels at the left and right side
|
/// boundary is expected to be smaller than half SIZE_X
|
||||||
/// boundary is expected to be smaller than half SIZE_X
|
/// Must be divisible by 2.
|
||||||
/// Must be divisible by 2.
|
template <typename T, int SIZE_X, int SIZE_Y, int BOUNDARY_X>
|
||||||
template <typename T, int SIZE_X, int SIZE_Y, int BOUNDARY_X>
|
class TransformBuffer {
|
||||||
class TransformBuffer {
|
public:
|
||||||
public:
|
|
||||||
enum {
|
enum {
|
||||||
/// difference between pointers to two vertical neigbors
|
/// difference between pointers to two vertical neigbors
|
||||||
VERTICAL_STRIDE = BOUNDARY_X + (SIZE_X / 2)
|
VERTICAL_STRIDE = BOUNDARY_X + (SIZE_X / 2)
|
||||||
};
|
};
|
||||||
|
|
||||||
private:
|
private:
|
||||||
enum {
|
enum {
|
||||||
/// number of shared memory banks - needed for correct padding
|
/// number of shared memory banks - needed for correct padding
|
||||||
#ifdef __CUDA_ARCH__
|
#ifdef __CUDA_ARCH__
|
||||||
SHM_BANKS = ((__CUDA_ARCH__ >= 200) ? 32 : 16),
|
SHM_BANKS = ((__CUDA_ARCH__ >= 200) ? 32 : 16),
|
||||||
#else
|
#else
|
||||||
SHM_BANKS = 16, // for host code only - can be anything, won't be used
|
SHM_BANKS = 16, // for host code only - can be anything, won't be used
|
||||||
#endif
|
#endif
|
||||||
|
|
||||||
/// size of one of two buffers (odd or even)
|
/// size of one of two buffers (odd or even)
|
||||||
BUFFER_SIZE = VERTICAL_STRIDE * SIZE_Y,
|
BUFFER_SIZE = VERTICAL_STRIDE * SIZE_Y,
|
||||||
|
@ -79,8 +76,6 @@ namespace dwt_cuda {
|
||||||
/// buffer for both even and odd columns
|
/// buffer for both even and odd columns
|
||||||
T data[2 * BUFFER_SIZE + PADDING];
|
T data[2 * BUFFER_SIZE + PADDING];
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
/// Applies specified function to all central elements while also passing
|
/// Applies specified function to all central elements while also passing
|
||||||
/// previous and next elements as parameters.
|
/// previous and next elements as parameters.
|
||||||
/// @param count count of central elements to apply function to
|
/// @param count count of central elements to apply function to
|
||||||
|
@ -91,7 +86,7 @@ namespace dwt_cuda {
|
||||||
template <typename FUNC>
|
template <typename FUNC>
|
||||||
__device__ void horizontalStep(const int count, const int prevOffset,
|
__device__ void horizontalStep(const int count, const int prevOffset,
|
||||||
const int midOffset, const int nextOffset,
|
const int midOffset, const int nextOffset,
|
||||||
const FUNC & function) {
|
const FUNC &function) {
|
||||||
// number of unchecked iterations
|
// number of unchecked iterations
|
||||||
const int STEPS = count / SIZE_X;
|
const int STEPS = count / SIZE_X;
|
||||||
|
|
||||||
|
@ -102,37 +97,34 @@ namespace dwt_cuda {
|
||||||
const int finalOffset = count - finalCount;
|
const int finalOffset = count - finalCount;
|
||||||
|
|
||||||
// all threads perform fixed number of iterations ...
|
// all threads perform fixed number of iterations ...
|
||||||
for(int i = 0; i < STEPS; i++) {
|
for (int i = 0; i < STEPS; i++) {
|
||||||
// for(int i = 0; i < 3; i++) {
|
// for(int i = 0; i < 3; i++) {
|
||||||
const T previous = data[prevOffset + i * SIZE_X + threadIdx.x];
|
const T previous = data[prevOffset + i * SIZE_X + threadIdx.x];
|
||||||
const T next = data[nextOffset + i * SIZE_X + threadIdx.x];
|
const T next = data[nextOffset + i * SIZE_X + threadIdx.x];
|
||||||
T & center = data[midOffset + i * SIZE_X + threadIdx.x];
|
T ¢er = data[midOffset + i * SIZE_X + threadIdx.x];
|
||||||
// function(previous, center, (nextOffset + i*SIZE_X+threadIdx.x));
|
// function(previous, center, (nextOffset + i*SIZE_X+threadIdx.x));
|
||||||
function(previous, center, next);// the real one
|
function(previous, center, next); // the real one
|
||||||
}
|
}
|
||||||
|
|
||||||
// ... but not all threads participate on final iteration
|
// ... but not all threads participate on final iteration
|
||||||
if(threadIdx.x < finalCount) {
|
if (threadIdx.x < finalCount) {
|
||||||
const T previous = data[prevOffset + finalOffset + threadIdx.x];
|
const T previous = data[prevOffset + finalOffset + threadIdx.x];
|
||||||
const T next = data[nextOffset + finalOffset + threadIdx.x];
|
const T next = data[nextOffset + finalOffset + threadIdx.x];
|
||||||
T & center = data[midOffset + finalOffset + threadIdx.x];
|
T ¢er = data[midOffset + finalOffset + threadIdx.x];
|
||||||
// function(previous, center, (nextOffset+finalOffset+threadIdx.x));
|
// function(previous, center, (nextOffset+finalOffset+threadIdx.x));
|
||||||
// kaixi
|
// kaixi
|
||||||
function(previous, center, next);//the real one
|
function(previous, center, next); // the real one
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
public:
|
public:
|
||||||
|
|
||||||
__device__ void getPrintData() {
|
__device__ void getPrintData() {
|
||||||
//
|
//
|
||||||
for(int i = 0 ; i< 2 * BUFFER_SIZE + PADDING ; i++) {
|
for (int i = 0; i < 2 * BUFFER_SIZE + PADDING; i++) {
|
||||||
printf(" index: %d data: %f \n ", i ,data[i]);
|
printf(" index: %d data: %f \n ", i, data[i]);
|
||||||
}
|
}
|
||||||
|
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
||||||
/// Gets offset of the column with given index. Central columns have
|
/// Gets offset of the column with given index. Central columns have
|
||||||
/// indices from 0 to NUM_LINES - 1, left boundary columns have negative
|
/// indices from 0 to NUM_LINES - 1, left boundary columns have negative
|
||||||
/// indices and right boundary columns indices start with NUM_LINES.
|
/// indices and right boundary columns indices start with NUM_LINES.
|
||||||
|
@ -144,14 +136,10 @@ namespace dwt_cuda {
|
||||||
+ (columnIndex & 1) * ODD_OFFSET; // select odd or even buffer
|
+ (columnIndex & 1) * ODD_OFFSET; // select odd or even buffer
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
||||||
/// Provides access to data of the transform buffer.
|
/// Provides access to data of the transform buffer.
|
||||||
/// @param index index of the item to work with
|
/// @param index index of the item to work with
|
||||||
/// @return reference to item at given index
|
/// @return reference to item at given index
|
||||||
__device__ T & operator[] (const int index) {
|
__device__ T &operator[](const int index) { return data[index]; }
|
||||||
return data[index];
|
|
||||||
}
|
|
||||||
|
|
||||||
|
|
||||||
/// Applies specified function to all horizontally even elements in
|
/// Applies specified function to all horizontally even elements in
|
||||||
/// specified lines. (Including even elements in boundaries except
|
/// specified lines. (Including even elements in boundaries except
|
||||||
|
@ -163,9 +151,8 @@ namespace dwt_cuda {
|
||||||
/// parameters: previous (odd) element, the even
|
/// parameters: previous (odd) element, the even
|
||||||
/// element itself and finally next (odd) element
|
/// element itself and finally next (odd) element
|
||||||
template <typename FUNC>
|
template <typename FUNC>
|
||||||
__device__ void forEachHorizontalEven(const int firstLine,
|
__device__ void forEachHorizontalEven(const int firstLine, const int numLines,
|
||||||
const int numLines,
|
const FUNC &func) {
|
||||||
const FUNC & func) {
|
|
||||||
// number of even elemens to apply function to
|
// number of even elemens to apply function to
|
||||||
const int count = numLines * VERTICAL_STRIDE - 1;
|
const int count = numLines * VERTICAL_STRIDE - 1;
|
||||||
// offset of first even element
|
// offset of first even element
|
||||||
|
@ -177,14 +164,14 @@ namespace dwt_cuda {
|
||||||
|
|
||||||
// if(threadIdx.x == 0) {
|
// if(threadIdx.x == 0) {
|
||||||
|
|
||||||
// printf("forEachHorizontalEven count %d, centerOffset %d prevOffset %d nextOffset %d \n", count, centerOffset, prevOffset, nextOffset);
|
// printf("forEachHorizontalEven count %d, centerOffset %d prevOffset %d
|
||||||
|
// nextOffset %d \n", count, centerOffset, prevOffset, nextOffset);
|
||||||
// }
|
// }
|
||||||
|
|
||||||
// call generic horizontal step function
|
// call generic horizontal step function
|
||||||
horizontalStep(count, prevOffset, centerOffset, nextOffset, func);
|
horizontalStep(count, prevOffset, centerOffset, nextOffset, func);
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
||||||
/// Applies given function to all horizontally odd elements in specified
|
/// Applies given function to all horizontally odd elements in specified
|
||||||
/// lines. (Including odd elements in boundaries except last odd element
|
/// lines. (Including odd elements in boundaries except last odd element
|
||||||
/// in last right boundary.) SIZE_X threads participate and synchronization
|
/// in last right boundary.) SIZE_X threads participate and synchronization
|
||||||
|
@ -195,9 +182,8 @@ namespace dwt_cuda {
|
||||||
/// parameters: previous (even) element, the odd
|
/// parameters: previous (even) element, the odd
|
||||||
/// element itself and finally next (even) element
|
/// element itself and finally next (even) element
|
||||||
template <typename FUNC>
|
template <typename FUNC>
|
||||||
__device__ void forEachHorizontalOdd(const int firstLine,
|
__device__ void forEachHorizontalOdd(const int firstLine, const int numLines,
|
||||||
const int numLines,
|
const FUNC &func) {
|
||||||
const FUNC & func) {
|
|
||||||
// numbet of odd elements to apply function to
|
// numbet of odd elements to apply function to
|
||||||
const int count = numLines * VERTICAL_STRIDE - 1;
|
const int count = numLines * VERTICAL_STRIDE - 1;
|
||||||
// offset of even predecessor of first odd element
|
// offset of even predecessor of first odd element
|
||||||
|
@ -208,15 +194,14 @@ namespace dwt_cuda {
|
||||||
const int nextOffset = prevOffset + 1;
|
const int nextOffset = prevOffset + 1;
|
||||||
|
|
||||||
// if(threadIdx.x == 0) {
|
// if(threadIdx.x == 0) {
|
||||||
// printf("forEachHorizontalOdd count %d, centerOffset %d prevOffset %d nextOffset %d \n", count, centerOffset, prevOffset, nextOffset);
|
// printf("forEachHorizontalOdd count %d, centerOffset %d prevOffset %d
|
||||||
|
// nextOffset %d \n", count, centerOffset, prevOffset, nextOffset);
|
||||||
// }
|
// }
|
||||||
|
|
||||||
|
|
||||||
// call generic horizontal step function
|
// call generic horizontal step function
|
||||||
horizontalStep(count, prevOffset, centerOffset, nextOffset, func);
|
horizontalStep(count, prevOffset, centerOffset, nextOffset, func);
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
||||||
/// Applies specified function to all even elements (except element #0)
|
/// Applies specified function to all even elements (except element #0)
|
||||||
/// of given column. Each thread takes care of one column, so there's
|
/// of given column. Each thread takes care of one column, so there's
|
||||||
/// no need for synchronization.
|
/// no need for synchronization.
|
||||||
|
@ -225,30 +210,28 @@ namespace dwt_cuda {
|
||||||
/// parameters: previous (odd) element, the even
|
/// parameters: previous (odd) element, the even
|
||||||
/// element itself and finally next (odd) element
|
/// element itself and finally next (odd) element
|
||||||
template <typename F>
|
template <typename F>
|
||||||
__device__ void forEachVerticalEven(const int columnOffset, const F & f) {
|
__device__ void forEachVerticalEven(const int columnOffset, const F &f) {
|
||||||
if(SIZE_Y > 3) { // makes no sense otherwise
|
if (SIZE_Y > 3) { // makes no sense otherwise
|
||||||
const int steps = SIZE_Y / 2 - 1;
|
const int steps = SIZE_Y / 2 - 1;
|
||||||
for(int i = 0; i < steps; i++) {
|
for (int i = 0; i < steps; i++) {
|
||||||
const int row = 2 + i * 2;
|
const int row = 2 + i * 2;
|
||||||
const T prev = data[columnOffset + (row - 1) * VERTICAL_STRIDE];
|
const T prev = data[columnOffset + (row - 1) * VERTICAL_STRIDE];
|
||||||
const T next = data[columnOffset + (row + 1) * VERTICAL_STRIDE];
|
const T next = data[columnOffset + (row + 1) * VERTICAL_STRIDE];
|
||||||
f(prev, data[columnOffset + row * VERTICAL_STRIDE] , next);
|
f(prev, data[columnOffset + row * VERTICAL_STRIDE], next);
|
||||||
|
|
||||||
//--------------- FOR TEST -----------------
|
//--------------- FOR TEST -----------------
|
||||||
/* __syncthreads();
|
/* __syncthreads();
|
||||||
if ((blockIdx.x * blockDim.x + threadIdx.x) == 0){
|
if ((blockIdx.x * blockDim.x + threadIdx.x) == 0){
|
||||||
diffOut[2500]++;
|
diffOut[2500]++;
|
||||||
diffOut[diffOut[2500]] = 2;//data[columnOffset + row * VERTICAL_STRIDE];
|
diffOut[diffOut[2500]] = 2;//data[columnOffset +
|
||||||
|
row * VERTICAL_STRIDE];
|
||||||
}
|
}
|
||||||
__syncthreads();
|
__syncthreads();
|
||||||
*/ //--------------- FOR TEST -----------------
|
*/ //--------------- FOR TEST -----------------
|
||||||
|
|
||||||
|
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
||||||
/// Applies specified function to all odd elements of given column.
|
/// Applies specified function to all odd elements of given column.
|
||||||
/// Each thread takes care of one column, so there's no need for
|
/// Each thread takes care of one column, so there's no need for
|
||||||
/// synchronization.
|
/// synchronization.
|
||||||
|
@ -257,30 +240,28 @@ namespace dwt_cuda {
|
||||||
/// parameters: previous (even) element, the odd
|
/// parameters: previous (even) element, the odd
|
||||||
/// element itself and finally next (even) element
|
/// element itself and finally next (even) element
|
||||||
template <typename F>
|
template <typename F>
|
||||||
__device__ void forEachVerticalOdd(const int columnOffset, const F & f) {
|
__device__ void forEachVerticalOdd(const int columnOffset, const F &f) {
|
||||||
const int steps = (SIZE_Y - 1) / 2;
|
const int steps = (SIZE_Y - 1) / 2;
|
||||||
for(int i = 0; i < steps; i++) {
|
for (int i = 0; i < steps; i++) {
|
||||||
const int row = i * 2 + 1;
|
const int row = i * 2 + 1;
|
||||||
const T prev = data[columnOffset + (row - 1) * VERTICAL_STRIDE];
|
const T prev = data[columnOffset + (row - 1) * VERTICAL_STRIDE];
|
||||||
const T next = data[columnOffset + (row + 1) * VERTICAL_STRIDE];
|
const T next = data[columnOffset + (row + 1) * VERTICAL_STRIDE];
|
||||||
|
|
||||||
f(prev, data[columnOffset + row * VERTICAL_STRIDE], next);
|
f(prev, data[columnOffset + row * VERTICAL_STRIDE], next);
|
||||||
|
|
||||||
|
|
||||||
//--------------- FOR TEST -----------------
|
//--------------- FOR TEST -----------------
|
||||||
/* __syncthreads();
|
/* __syncthreads();
|
||||||
if ((blockIdx.x * blockDim.x + threadIdx.x) == 0){
|
if ((blockIdx.x * blockDim.x + threadIdx.x) == 0){
|
||||||
diffOut[2500]++;
|
diffOut[2500]++;
|
||||||
diffOut[diffOut[2500]] = 1; //data[columnOffset + row * VERTICAL_STRIDE];
|
diffOut[diffOut[2500]] = 1; //data[columnOffset +
|
||||||
|
row * VERTICAL_STRIDE];
|
||||||
}
|
}
|
||||||
|
|
||||||
__syncthreads();
|
__syncthreads();
|
||||||
*/ //--------------- FOR TEST -----------------
|
*/ //--------------- FOR TEST -----------------
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
/// Scales elements at specified lines.
|
/// Scales elements at specified lines.
|
||||||
/// @param evenScale scaling factor for horizontally even elements
|
/// @param evenScale scaling factor for horizontally even elements
|
||||||
/// @param oddScale scaling factor for horizontally odd elements
|
/// @param oddScale scaling factor for horizontally odd elements
|
||||||
|
@ -294,10 +275,12 @@ namespace dwt_cuda {
|
||||||
const int finalCount = count % SIZE_X;
|
const int finalCount = count % SIZE_X;
|
||||||
const int finalOffset = count - finalCount;
|
const int finalOffset = count - finalCount;
|
||||||
|
|
||||||
// printf("scaleHorizontal sizeX: %d offset %d, count, %d, steps, %d, finalCount %d, finalOffset %d \n", SIZE_X, offset, count, steps, finalCount, finalOffset);
|
// printf("scaleHorizontal sizeX: %d offset %d, count, %d, steps, %d,
|
||||||
|
// finalCount %d, finalOffset %d \n", SIZE_X, offset, count, steps,
|
||||||
|
// finalCount, finalOffset);
|
||||||
|
|
||||||
// run iterations, whete all threads participate
|
// run iterations, whete all threads participate
|
||||||
for(int i = 0; i < steps; i++) {
|
for (int i = 0; i < steps; i++) {
|
||||||
data[threadIdx.x + i * SIZE_X + offset] *= evenScale;
|
data[threadIdx.x + i * SIZE_X + offset] *= evenScale;
|
||||||
// if(threadIdx.x + i * SIZE_X + offset == 531) {
|
// if(threadIdx.x + i * SIZE_X + offset == 531) {
|
||||||
// printf("threadidx 531: %d \n", threadIdx.x);
|
// printf("threadidx 531: %d \n", threadIdx.x);
|
||||||
|
@ -309,7 +292,7 @@ namespace dwt_cuda {
|
||||||
}
|
}
|
||||||
|
|
||||||
// some threads also finish remaining unscaled items
|
// some threads also finish remaining unscaled items
|
||||||
if(threadIdx.x < finalCount) {
|
if (threadIdx.x < finalCount) {
|
||||||
data[threadIdx.x + finalOffset + offset] *= evenScale;
|
data[threadIdx.x + finalOffset + offset] *= evenScale;
|
||||||
// if(threadIdx.x + finalOffset + offset == 531) {
|
// if(threadIdx.x + finalOffset + offset == 531) {
|
||||||
// printf("threadidx 531: %d \n", threadIdx.x);
|
// printf("threadidx 531: %d \n", threadIdx.x);
|
||||||
|
@ -319,10 +302,8 @@ namespace dwt_cuda {
|
||||||
// }
|
// }
|
||||||
data[threadIdx.x + finalOffset + offset + ODD_OFFSET] *= oddScale;
|
data[threadIdx.x + finalOffset + offset + ODD_OFFSET] *= oddScale;
|
||||||
}
|
}
|
||||||
|
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
||||||
/// Scales elements in specified column.
|
/// Scales elements in specified column.
|
||||||
/// @param evenScale scaling factor for vertically even elements
|
/// @param evenScale scaling factor for vertically even elements
|
||||||
/// @param oddScale scaling factor for vertically odd elements
|
/// @param oddScale scaling factor for vertically odd elements
|
||||||
|
@ -332,8 +313,8 @@ namespace dwt_cuda {
|
||||||
__device__ void scaleVertical(const T evenScale, const T oddScale,
|
__device__ void scaleVertical(const T evenScale, const T oddScale,
|
||||||
const int columnOffset, const int numLines,
|
const int columnOffset, const int numLines,
|
||||||
const int firstLine) {
|
const int firstLine) {
|
||||||
for(int i = firstLine; i < (numLines + firstLine); i++) {
|
for (int i = firstLine; i < (numLines + firstLine); i++) {
|
||||||
if(i & 1) {
|
if (i & 1) {
|
||||||
data[columnOffset + i * VERTICAL_STRIDE] *= oddScale;
|
data[columnOffset + i * VERTICAL_STRIDE] *= oddScale;
|
||||||
} else {
|
} else {
|
||||||
data[columnOffset + i * VERTICAL_STRIDE] *= evenScale;
|
data[columnOffset + i * VERTICAL_STRIDE] *= evenScale;
|
||||||
|
@ -341,33 +322,17 @@ namespace dwt_cuda {
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
||||||
//****************For Test(Feb23), test inter parameters*************
|
//****************For Test(Feb23), test inter parameters*************
|
||||||
__device__ int getVERTICAL_STRIDE(){
|
__device__ int getVERTICAL_STRIDE() { return VERTICAL_STRIDE; }
|
||||||
return VERTICAL_STRIDE;
|
__device__ int getSHM_BANKS() { return SHM_BANKS; }
|
||||||
}
|
__device__ int getBuffersize() { return BUFFER_SIZE; }
|
||||||
__device__ int getSHM_BANKS(){
|
__device__ int getPADDING() { return PADDING; }
|
||||||
return SHM_BANKS;
|
__device__ int getODD_OFFSET() { return ODD_OFFSET; }
|
||||||
}
|
|
||||||
__device__ int getBuffersize(){
|
|
||||||
return BUFFER_SIZE;
|
|
||||||
}
|
|
||||||
__device__ int getPADDING(){
|
|
||||||
return PADDING;
|
|
||||||
}
|
|
||||||
__device__ int getODD_OFFSET(){
|
|
||||||
return ODD_OFFSET;
|
|
||||||
}
|
|
||||||
|
|
||||||
|
|
||||||
//****************For Test(Feb23), test inter parameters*************
|
//****************For Test(Feb23), test inter parameters*************
|
||||||
|
|
||||||
|
}; // end of class TransformBuffer
|
||||||
}; // end of class TransformBuffer
|
|
||||||
|
|
||||||
|
|
||||||
} // namespace dwt_cuda
|
} // namespace dwt_cuda
|
||||||
|
|
||||||
|
|
||||||
#endif // TRANSFORM_BUFFER_H
|
#endif // TRANSFORM_BUFFER_H
|
||||||
|
|
||||||
|
|
|
@ -5,4 +5,3 @@
|
||||||
./dwt2d 4.bmp -d 4x4 -r -5 -l 3
|
./dwt2d 4.bmp -d 4x4 -r -5 -l 3
|
||||||
# ./dwt2d 4.bmp -d 4x4 -r -9 -l 3
|
# ./dwt2d 4.bmp -d 4x4 -r -9 -l 3
|
||||||
# ./dwt2d 8.bmp -d 8x8 -f -9 -l 3
|
# ./dwt2d 8.bmp -d 8x8 -f -9 -l 3
|
||||||
|
|
||||||
|
|
|
@ -7,12 +7,3 @@
|
||||||
/usr/local/cuda/bin/nvcc -arch sm_50 -I. -I/include -O2 --compiler-options -fno-strict-aliasing -c dwt_cuda/rdwt97.cu -o dwt_cuda/rdwt97.cu.o
|
/usr/local/cuda/bin/nvcc -arch sm_50 -I. -I/include -O2 --compiler-options -fno-strict-aliasing -c dwt_cuda/rdwt97.cu -o dwt_cuda/rdwt97.cu.o
|
||||||
/usr/local/cuda/bin/nvcc -arch sm_50 -I. -I/include -O2 --compiler-options -fno-strict-aliasing -c dwt_cuda/rdwt53.cu -o dwt_cuda/rdwt53.cu.o
|
/usr/local/cuda/bin/nvcc -arch sm_50 -I. -I/include -O2 --compiler-options -fno-strict-aliasing -c dwt_cuda/rdwt53.cu -o dwt_cuda/rdwt53.cu.o
|
||||||
g++ -fPIC -o nvcc_dwt2d main.cu.o dwt.cu.o components.cu.o dwt_cuda/fdwt53.cu.o dwt_cuda/fdwt97.cu.o dwt_cuda/common.cu.o dwt_cuda/rdwt97.cu.o dwt_cuda/rdwt53.cu.o -L/usr/local/cuda/lib64 -lcudart
|
g++ -fPIC -o nvcc_dwt2d main.cu.o dwt.cu.o components.cu.o dwt_cuda/fdwt53.cu.o dwt_cuda/fdwt97.cu.o dwt_cuda/common.cu.o dwt_cuda/rdwt97.cu.o dwt_cuda/rdwt53.cu.o -L/usr/local/cuda/lib64 -lcudart
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
|
|
@ -1,38 +1,36 @@
|
||||||
#include <stdio.h>
|
#include <stdio.h>
|
||||||
|
|
||||||
__global__
|
__global__ void saxpy(int n, float a, float *x, float *y) {
|
||||||
void saxpy(int n, float a, float *x, float *y)
|
int i = blockIdx.x * blockDim.x + threadIdx.x;
|
||||||
{
|
if (i < n)
|
||||||
int i = blockIdx.x*blockDim.x + threadIdx.x;
|
y[i] = a * x[i] + y[i];
|
||||||
if (i < n) y[i] = a*x[i] + y[i];
|
|
||||||
}
|
}
|
||||||
|
|
||||||
int main(void)
|
int main(void) {
|
||||||
{
|
int N = 1 << 20;
|
||||||
int N = 1<<20;
|
|
||||||
float *x, *y, *d_x, *d_y;
|
float *x, *y, *d_x, *d_y;
|
||||||
x = (float*)malloc(N*sizeof(float));
|
x = (float *)malloc(N * sizeof(float));
|
||||||
y = (float*)malloc(N*sizeof(float));
|
y = (float *)malloc(N * sizeof(float));
|
||||||
|
|
||||||
cudaMalloc(&d_x, N*sizeof(float));
|
cudaMalloc(&d_x, N * sizeof(float));
|
||||||
cudaMalloc(&d_y, N*sizeof(float));
|
cudaMalloc(&d_y, N * sizeof(float));
|
||||||
|
|
||||||
for (int i = 0; i < N; i++) {
|
for (int i = 0; i < N; i++) {
|
||||||
x[i] = 1.0f;
|
x[i] = 1.0f;
|
||||||
y[i] = 2.0f;
|
y[i] = 2.0f;
|
||||||
}
|
}
|
||||||
|
|
||||||
cudaMemcpy(d_x, x, N*sizeof(float), cudaMemcpyHostToDevice);
|
cudaMemcpy(d_x, x, N * sizeof(float), cudaMemcpyHostToDevice);
|
||||||
cudaMemcpy(d_y, y, N*sizeof(float), cudaMemcpyHostToDevice);
|
cudaMemcpy(d_y, y, N * sizeof(float), cudaMemcpyHostToDevice);
|
||||||
|
|
||||||
// Perform SAXPY on 1M elements
|
// Perform SAXPY on 1M elements
|
||||||
// saxpy<<<(N+255)/256, 256>>>(N, 2.0f, d_x, d_y);
|
// saxpy<<<(N+255)/256, 256>>>(N, 2.0f, d_x, d_y);
|
||||||
|
|
||||||
cudaMemcpy(y, d_y, N*sizeof(float), cudaMemcpyDeviceToHost);
|
cudaMemcpy(y, d_y, N * sizeof(float), cudaMemcpyDeviceToHost);
|
||||||
|
|
||||||
float maxError = 0.0f;
|
float maxError = 0.0f;
|
||||||
for (int i = 0; i < N; i++)
|
for (int i = 0; i < N; i++)
|
||||||
maxError = max(maxError, abs(y[i]-4.0f));
|
maxError = max(maxError, abs(y[i] - 4.0f));
|
||||||
printf("Max error: %f\n", maxError);
|
printf("Max error: %f\n", maxError);
|
||||||
|
|
||||||
cudaFree(d_x);
|
cudaFree(d_x);
|
||||||
|
|
|
@ -1,38 +1,35 @@
|
||||||
#include <stdio.h>
|
#include <stdio.h>
|
||||||
|
|
||||||
__global__
|
__global__ void saxpy(void) {
|
||||||
void saxpy(void)
|
int i = blockIdx.x * blockDim.x + threadIdx.x;
|
||||||
{
|
|
||||||
int i = blockIdx.x*blockDim.x + threadIdx.x;
|
|
||||||
printf("block_id:%d thread_id:%d \n", i)
|
printf("block_id:%d thread_id:%d \n", i)
|
||||||
}
|
}
|
||||||
|
|
||||||
int main(void)
|
int main(void) {
|
||||||
{
|
int N = 1 << 20;
|
||||||
int N = 1<<20;
|
|
||||||
float *x, *y, *d_x, *d_y;
|
float *x, *y, *d_x, *d_y;
|
||||||
x = (float*)malloc(N*sizeof(float));
|
x = (float *)malloc(N * sizeof(float));
|
||||||
y = (float*)malloc(N*sizeof(float));
|
y = (float *)malloc(N * sizeof(float));
|
||||||
|
|
||||||
cudaMalloc(&d_x, N*sizeof(float));
|
cudaMalloc(&d_x, N * sizeof(float));
|
||||||
cudaMalloc(&d_y, N*sizeof(float));
|
cudaMalloc(&d_y, N * sizeof(float));
|
||||||
|
|
||||||
for (int i = 0; i < N; i++) {
|
for (int i = 0; i < N; i++) {
|
||||||
x[i] = 1.0f;
|
x[i] = 1.0f;
|
||||||
y[i] = 2.0f;
|
y[i] = 2.0f;
|
||||||
}
|
}
|
||||||
|
|
||||||
cudaMemcpy(d_x, x, N*sizeof(float), cudaMemcpyHostToDevice);
|
cudaMemcpy(d_x, x, N * sizeof(float), cudaMemcpyHostToDevice);
|
||||||
cudaMemcpy(d_y, y, N*sizeof(float), cudaMemcpyHostToDevice);
|
cudaMemcpy(d_y, y, N * sizeof(float), cudaMemcpyHostToDevice);
|
||||||
|
|
||||||
// Perform SAXPY on 1M elements
|
// Perform SAXPY on 1M elements
|
||||||
saxpy<<<(1,1)>>>;
|
saxpy<<<(1, 1)>>>;
|
||||||
|
|
||||||
cudaMemcpy(y, d_y, N*sizeof(float), cudaMemcpyDeviceToHost);
|
cudaMemcpy(y, d_y, N * sizeof(float), cudaMemcpyDeviceToHost);
|
||||||
|
|
||||||
float maxError = 0.0f;
|
float maxError = 0.0f;
|
||||||
for (int i = 0; i < N; i++)
|
for (int i = 0; i < N; i++)
|
||||||
maxError = max(maxError, abs(y[i]-4.0f));
|
maxError = max(maxError, abs(y[i] - 4.0f));
|
||||||
printf("Max error: %f\n", maxError);
|
printf("Max error: %f\n", maxError);
|
||||||
|
|
||||||
cudaFree(d_x);
|
cudaFree(d_x);
|
||||||
|
|
|
@ -1,37 +1,32 @@
|
||||||
#include <stdio.h>
|
#include <stdio.h>
|
||||||
|
|
||||||
__global__
|
__global__ void saxpy(int N) { printf("hello!: %d\n", N); }
|
||||||
void saxpy(int N)
|
|
||||||
{
|
|
||||||
printf("hello!: %d\n", N);
|
|
||||||
}
|
|
||||||
|
|
||||||
int main(void)
|
int main(void) {
|
||||||
{
|
int N = 1 << 20;
|
||||||
int N = 1<<20;
|
|
||||||
float *x, *y, *d_x, *d_y;
|
float *x, *y, *d_x, *d_y;
|
||||||
x = (float*)malloc(N*sizeof(float));
|
x = (float *)malloc(N * sizeof(float));
|
||||||
y = (float*)malloc(N*sizeof(float));
|
y = (float *)malloc(N * sizeof(float));
|
||||||
|
|
||||||
cudaMalloc(&d_x, N*sizeof(float));
|
cudaMalloc(&d_x, N * sizeof(float));
|
||||||
cudaMalloc(&d_y, N*sizeof(float));
|
cudaMalloc(&d_y, N * sizeof(float));
|
||||||
|
|
||||||
for (int i = 0; i < N; i++) {
|
for (int i = 0; i < N; i++) {
|
||||||
x[i] = 1.0f;
|
x[i] = 1.0f;
|
||||||
y[i] = 2.0f;
|
y[i] = 2.0f;
|
||||||
}
|
}
|
||||||
|
|
||||||
cudaMemcpy(d_x, x, N*sizeof(float), cudaMemcpyHostToDevice);
|
cudaMemcpy(d_x, x, N * sizeof(float), cudaMemcpyHostToDevice);
|
||||||
cudaMemcpy(d_y, y, N*sizeof(float), cudaMemcpyHostToDevice);
|
cudaMemcpy(d_y, y, N * sizeof(float), cudaMemcpyHostToDevice);
|
||||||
|
|
||||||
// Perform SAXPY on 1M elements
|
// Perform SAXPY on 1M elements
|
||||||
saxpy<<<(1,1)>>>(N);
|
saxpy<<<(1, 1)>>>(N);
|
||||||
|
|
||||||
cudaMemcpy(y, d_y, N*sizeof(float), cudaMemcpyDeviceToHost);
|
cudaMemcpy(y, d_y, N * sizeof(float), cudaMemcpyDeviceToHost);
|
||||||
|
|
||||||
float maxError = 0.0f;
|
float maxError = 0.0f;
|
||||||
for (int i = 0; i < N; i++)
|
for (int i = 0; i < N; i++)
|
||||||
maxError = max(maxError, abs(y[i]-4.0f));
|
maxError = max(maxError, abs(y[i] - 4.0f));
|
||||||
printf("Max error: %f\n", maxError);
|
printf("Max error: %f\n", maxError);
|
||||||
|
|
||||||
cudaFree(d_x);
|
cudaFree(d_x);
|
||||||
|
|
|
@ -1,37 +1,32 @@
|
||||||
#include <stdio.h>
|
#include <stdio.h>
|
||||||
|
|
||||||
__global__
|
__global__ void saxpy(void) { printf("hello!\n"); }
|
||||||
void saxpy(void)
|
|
||||||
{
|
|
||||||
printf("hello!\n");
|
|
||||||
}
|
|
||||||
|
|
||||||
int main(void)
|
int main(void) {
|
||||||
{
|
int N = 1 << 20;
|
||||||
int N = 1<<20;
|
|
||||||
float *x, *y, *d_x, *d_y;
|
float *x, *y, *d_x, *d_y;
|
||||||
x = (float*)malloc(N*sizeof(float));
|
x = (float *)malloc(N * sizeof(float));
|
||||||
y = (float*)malloc(N*sizeof(float));
|
y = (float *)malloc(N * sizeof(float));
|
||||||
|
|
||||||
cudaMalloc(&d_x, N*sizeof(float));
|
cudaMalloc(&d_x, N * sizeof(float));
|
||||||
cudaMalloc(&d_y, N*sizeof(float));
|
cudaMalloc(&d_y, N * sizeof(float));
|
||||||
|
|
||||||
for (int i = 0; i < N; i++) {
|
for (int i = 0; i < N; i++) {
|
||||||
x[i] = 1.0f;
|
x[i] = 1.0f;
|
||||||
y[i] = 2.0f;
|
y[i] = 2.0f;
|
||||||
}
|
}
|
||||||
|
|
||||||
cudaMemcpy(d_x, x, N*sizeof(float), cudaMemcpyHostToDevice);
|
cudaMemcpy(d_x, x, N * sizeof(float), cudaMemcpyHostToDevice);
|
||||||
cudaMemcpy(d_y, y, N*sizeof(float), cudaMemcpyHostToDevice);
|
cudaMemcpy(d_y, y, N * sizeof(float), cudaMemcpyHostToDevice);
|
||||||
|
|
||||||
// Perform SAXPY on 1M elements
|
// Perform SAXPY on 1M elements
|
||||||
saxpy<<<(1,1)>>>;
|
saxpy<<<(1, 1)>>>;
|
||||||
|
|
||||||
cudaMemcpy(y, d_y, N*sizeof(float), cudaMemcpyDeviceToHost);
|
cudaMemcpy(y, d_y, N * sizeof(float), cudaMemcpyDeviceToHost);
|
||||||
|
|
||||||
float maxError = 0.0f;
|
float maxError = 0.0f;
|
||||||
for (int i = 0; i < N; i++)
|
for (int i = 0; i < N; i++)
|
||||||
maxError = max(maxError, abs(y[i]-4.0f));
|
maxError = max(maxError, abs(y[i] - 4.0f));
|
||||||
printf("Max error: %f\n", maxError);
|
printf("Max error: %f\n", maxError);
|
||||||
|
|
||||||
cudaFree(d_x);
|
cudaFree(d_x);
|
||||||
|
|
Loading…
Reference in New Issue