Merge branch 'cupbop:master' into master

This commit is contained in:
Jun Chen 2022-05-24 21:16:37 -04:00 committed by GitHub
commit 21f298524e
No known key found for this signature in database
GPG Key ID: 4AEE18F83AFDEB23
27 changed files with 1246 additions and 1384 deletions

View File

@ -27,10 +27,10 @@ Currently, CuPBoP support serveral CPU backends, including x86, AArch64, and RIS
export CuPBoP_PATH=`pwd` export CuPBoP_PATH=`pwd`
export LD_LIBRARY_PATH=$CuPBoP_PATH/build/runtime:$CuPBoP_PATH/build/runtime/threadPool:$LD_LIBRARY_PATH export LD_LIBRARY_PATH=$CuPBoP_PATH/build/runtime:$CuPBoP_PATH/build/runtime/threadPool:$LD_LIBRARY_PATH
``` ```
If you are using boson, you can use the pre-installed LLVM 10.0.0
LLVM_PATH=/opt/llvm-10.0.0 If you are using boson, you can use the pre-installed LLVM 10.0.0\
export PATH=$LLVM_PATH/bin:$PATH `LLVM_PATH=/opt/llvm-10.0.0`\
`export PATH=$LLVM_PATH/bin:$PATH`
2. As CuPBoP relies on CUDA structures, we need to download the CUDA header file 2. As CuPBoP relies on CUDA structures, we need to download the CUDA header file

View File

@ -272,7 +272,6 @@ void AddContextSaveRestore(llvm::Instruction *instruction,
std::vector<Instruction *> uses; std::vector<Instruction *> uses;
Function *f2 = instruction->getParent()->getParent(); Function *f2 = instruction->getParent()->getParent();
for (Instruction::use_iterator ui = instruction->use_begin(), for (Instruction::use_iterator ui = instruction->use_begin(),
ue = instruction->use_end(); ue = instruction->use_end();
ui != ue; ++ui) { ui != ue; ++ui) {

View File

@ -89,11 +89,12 @@ void mem_share2global(llvm::Module *M) {
} else if (element_type->isStructTy()) { } else if (element_type->isStructTy()) {
auto undef = llvm::UndefValue::get(element_type); auto undef = llvm::UndefValue::get(element_type);
llvm::GlobalVariable *global_memory = new llvm::GlobalVariable( llvm::GlobalVariable *global_memory = new llvm::GlobalVariable(
*M, element_type, false, llvm::GlobalValue::ExternalLinkage, undef, *M, element_type, false, llvm::GlobalValue::ExternalLinkage,
new_name, NULL, llvm::GlobalValue::GeneralDynamicTLSModel, 0, undef, new_name, NULL,
false); llvm::GlobalValue::GeneralDynamicTLSModel, 0, false);
global_memory->setDSOLocal(true); global_memory->setDSOLocal(true);
Comdat * comdat = M->getOrInsertComdat(StringRef(share_memory->getName())); Comdat *comdat =
M->getOrInsertComdat(StringRef(share_memory->getName()));
comdat->setSelectionKind(Comdat::SelectionKind::Any); comdat->setSelectionKind(Comdat::SelectionKind::Any);
global_memory->setComdat(comdat); global_memory->setComdat(comdat);
global_memory->setLinkage(llvm::GlobalValue::LinkOnceODRLinkage); global_memory->setLinkage(llvm::GlobalValue::LinkOnceODRLinkage);
@ -103,7 +104,6 @@ void mem_share2global(llvm::Module *M) {
std::pair<GlobalVariable *, GlobalVariable *>(share_memory, std::pair<GlobalVariable *, GlobalVariable *>(share_memory,
global_memory)); global_memory));
} else { } else {
assert(0 && "The required Share Memory Type is not supported\n"); assert(0 && "The required Share Memory Type is not supported\n");
} }

14
examples/dwt2d/common.h Executable file → Normal file
View File

@ -45,18 +45,20 @@
// divide and round up macro // divide and round up macro
#define DIVANDRND(a, b) ((((a) % (b)) != 0) ? ((a) / (b) + 1) : ((a) / (b))) #define DIVANDRND(a, b) ((((a) % (b)) != 0) ? ((a) / (b) + 1) : ((a) / (b)))
# define cudaCheckError( msg ) { \ #define cudaCheckError(msg) \
{ \
cudaError_t err = cudaGetLastError(); \ cudaError_t err = cudaGetLastError(); \
if (cudaSuccess != err) { \ if (cudaSuccess != err) { \
fprintf(stderr, "%s: %i: %s: %s.\n", \ fprintf(stderr, "%s: %i: %s: %s.\n", __FILE__, __LINE__, msg, \
__FILE__, __LINE__, msg, cudaGetErrorString( err) ); \ cudaGetErrorString(err)); \
exit(-1); \ exit(-1); \
} } } \
}
# define cudaCheckAsyncError( msg ) { \ #define cudaCheckAsyncError(msg) \
{ \
cudaThreadSynchronize(); \ cudaThreadSynchronize(); \
cudaCheckError(msg); \ cudaCheckError(msg); \
} }
#endif #endif

3
examples/dwt2d/components.h Executable file → Normal file
View File

@ -29,7 +29,8 @@
/* Separate components of source 8bit RGB image */ /* Separate components of source 8bit RGB image */
template <typename T> template <typename T>
void rgbToComponents(T *d_r, T *d_g, T *d_b, unsigned char * src, int width, int height); void rgbToComponents(T *d_r, T *d_g, T *d_b, unsigned char *src, int width,
int height);
/* Copy 8bit source image data into a color component of type T */ /* Copy 8bit source image data into a color component of type T */
template <typename T> template <typename T>

11
examples/dwt2d/dwt.h Executable file → Normal file
View File

@ -28,13 +28,14 @@
#define _DWT_H #define _DWT_H
template <typename T> template <typename T>
int nStage2dDWT(T *in, T *out, T * backup, int pixWidth, int pixHeight, int stages, bool forward); int nStage2dDWT(T *in, T *out, T *backup, int pixWidth, int pixHeight,
int stages, bool forward);
template <typename T> template <typename T>
int writeNStage2DDWT(T *component_cuda, int width, int height, int writeNStage2DDWT(T *component_cuda, int width, int height, int stages,
int stages, const char * filename, const char * suffix);
template<typename T>
int writeLinear(T *component_cuda, int width, int height,
const char *filename, const char *suffix); const char *filename, const char *suffix);
template <typename T>
int writeLinear(T *component_cuda, int width, int height, const char *filename,
const char *suffix);
#endif #endif

55
examples/dwt2d/dwt_cuda/common.h Executable file → Normal file
View File

@ -29,29 +29,22 @@
/// POSSIBILITY OF SUCH DAMAGE. /// POSSIBILITY OF SUCH DAMAGE.
/// ///
#ifndef DWT_COMMON_H #ifndef DWT_COMMON_H
#define DWT_COMMON_H #define DWT_COMMON_H
#include <cstdio>
#include <algorithm> #include <algorithm>
#include <cstdio>
#include <vector> #include <vector>
// compile time minimum macro // compile time minimum macro
#define CTMIN(a, b) (((a) < (b)) ? (a) : (b)) #define CTMIN(a, b) (((a) < (b)) ? (a) : (b))
// performance testing macros // performance testing macros
#if defined(GPU_DWT_TESTING) #if defined(GPU_DWT_TESTING)
#define PERF_BEGIN \ #define PERF_BEGIN \
{ \ { \
dwt_cuda::CudaDWTTester PERF_TESTER; \ dwt_cuda::CudaDWTTester PERF_TESTER; \
for(int PERF_N = PERF_TESTER.getNumIterations(); PERF_N--; ) \ for (int PERF_N = PERF_TESTER.getNumIterations(); PERF_N--;) { \
{ \
PERF_TESTER.beginTestIteration(); PERF_TESTER.beginTestIteration();
#define PERF_END(PERF_NAME, PERF_W, PERF_H) \ #define PERF_END(PERF_NAME, PERF_W, PERF_H) \
@ -64,25 +57,20 @@
#define PERF_END(PERF_NAME, PERF_W, PERF_H) #define PERF_END(PERF_NAME, PERF_W, PERF_H)
#endif // GPU_DWT_TESTING #endif // GPU_DWT_TESTING
namespace dwt_cuda { namespace dwt_cuda {
/// Divide and round up. /// Divide and round up.
template <typename T> template <typename T>
__device__ __host__ inline T divRndUp(const T &n, const T &d) { __device__ __host__ inline T divRndUp(const T &n, const T &d) {
return (n / d) + ((n % d) ? 1 : 0); return (n / d) + ((n % d) ? 1 : 0);
} }
// 9/7 forward DWT lifting schema coefficients // 9/7 forward DWT lifting schema coefficients
const float f97Predict1 = -1.586134342; ///< forward 9/7 predict 1 const float f97Predict1 = -1.586134342; ///< forward 9/7 predict 1
const float f97Update1 = -0.05298011854; ///< forward 9/7 update 1 const float f97Update1 = -0.05298011854; ///< forward 9/7 update 1
const float f97Predict2 = 0.8829110762; ///< forward 9/7 predict 2 const float f97Predict2 = 0.8829110762; ///< forward 9/7 predict 2
const float f97Update2 = 0.4435068522; ///< forward 9/7 update 2 const float f97Update2 = 0.4435068522; ///< forward 9/7 update 2
// 9/7 reverse DWT lifting schema coefficients // 9/7 reverse DWT lifting schema coefficients
const float r97update2 = -f97Update2; ///< undo 9/7 update 2 const float r97update2 = -f97Update2; ///< undo 9/7 update 2
const float r97predict2 = -f97Predict2; ///< undo 9/7 predict 2 const float r97predict2 = -f97Predict2; ///< undo 9/7 predict 2
@ -93,7 +81,6 @@ namespace dwt_cuda {
const float scale97Mul = 1.23017410491400f; const float scale97Mul = 1.23017410491400f;
const float scale97Div = 1.0 / scale97Mul; const float scale97Div = 1.0 / scale97Mul;
// 5/3 forward DWT lifting schema coefficients // 5/3 forward DWT lifting schema coefficients
const float forward53Predict = -0.5f; /// forward 5/3 predict const float forward53Predict = -0.5f; /// forward 5/3 predict
const float forward53Update = 0.25f; /// forward 5/3 update const float forward53Update = 0.25f; /// forward 5/3 update
@ -102,8 +89,6 @@ namespace dwt_cuda {
const float reverse53Update = -forward53Update; /// undo 5/3 update const float reverse53Update = -forward53Update; /// undo 5/3 update
const float reverse53Predict = -forward53Predict; /// undo 5/3 predict const float reverse53Predict = -forward53Predict; /// undo 5/3 predict
/// Functor which adds scaled sum of neighbors to given central pixel. /// Functor which adds scaled sum of neighbors to given central pixel.
struct AddScaledSum { struct AddScaledSum {
const float scale; // scale of neighbors const float scale; // scale of neighbors
@ -112,7 +97,8 @@ namespace dwt_cuda {
// if(threadIdx.x == 0) { // if(threadIdx.x == 0) {
// printf("scale %f, p %f c %f n %f , result: %f\n", scale, p, c, n, scale * (p + n) ); // printf("scale %f, p %f c %f n %f , result: %f\n", scale, p, c, n,
// scale * (p + n) );
// } // }
@ -120,8 +106,6 @@ namespace dwt_cuda {
} }
}; };
/// Returns index ranging from 0 to num threads, such that first half /// Returns index ranging from 0 to num threads, such that first half
/// of threads get even indices and others get odd indices. Each thread /// of threads get even indices and others get odd indices. Each thread
/// gets different index. /// gets different index.
@ -129,13 +113,10 @@ namespace dwt_cuda {
/// parityIdx: 0 2 4 6 1 3 5 7 /// parityIdx: 0 2 4 6 1 3 5 7
/// @tparam THREADS total count of participating threads /// @tparam THREADS total count of participating threads
/// @return parity-separated index of thread /// @return parity-separated index of thread
template <int THREADS> template <int THREADS> __device__ inline int parityIdx() {
__device__ inline int parityIdx() {
return (threadIdx.x * 2) - (THREADS - 1) * (threadIdx.x / (THREADS / 2)); return (threadIdx.x * 2) - (THREADS - 1) * (threadIdx.x / (THREADS / 2));
} }
/// size of shared memory /// size of shared memory
#if defined(__CUDA_ARCH__) && (__CUDA_ARCH__ >= 200) #if defined(__CUDA_ARCH__) && (__CUDA_ARCH__ >= 200)
const int SHM_SIZE = 48 * 1024; const int SHM_SIZE = 48 * 1024;
@ -143,8 +124,6 @@ namespace dwt_cuda {
const int SHM_SIZE = 16 * 1024; const int SHM_SIZE = 16 * 1024;
#endif #endif
/// Performance and return code tester. /// Performance and return code tester.
class CudaDWTTester { class CudaDWTTester {
private: private:
@ -185,9 +164,7 @@ namespace dwt_cuda {
CudaDWTTester() : disabled(testRunning) {} CudaDWTTester() : disabled(testRunning) {}
/// Gets preferred number of iterations /// Gets preferred number of iterations
int getNumIterations() { int getNumIterations() { return disabled ? 1 : 31; }
return disabled ? 1 : 31;
}
/// Starts one test iteration. /// Starts one test iteration.
void beginTestIteration() { void beginTestIteration() {
@ -225,25 +202,24 @@ namespace dwt_cuda {
for (int i = times.size(); i--;) { for (int i = times.size(); i--;) {
sum += times[i]; sum += times[i];
} }
const double median = (times[times.size() / 2] const double median =
+ times[(times.size() - 1) / 2]) * 0.5f; (times[times.size() / 2] + times[(times.size() - 1) / 2]) * 0.5f;
printf(" %s: %7.3f ms (mean) %7.3f ms (median) %7.3f ms (max) " printf(" %s: %7.3f ms (mean) %7.3f ms (median) %7.3f ms (max) "
"(%d x %d)\n", name, (sum / times.size()), median, "(%d x %d)\n",
times[times.size() - 1], sizeX, sizeY); name, (sum / times.size()), median, times[times.size() - 1], sizeX,
sizeY);
} }
} }
}; };
/// Simple cudaMemcpy wrapped in performance tester. /// Simple cudaMemcpy wrapped in performance tester.
/// @param dest destination buffer /// @param dest destination buffer
/// @param src source buffer /// @param src source buffer
/// @param sx width of copied image /// @param sx width of copied image
/// @param sy height of copied image /// @param sy height of copied image
template <typename T> template <typename T>
inline void memCopy(T * const dest, const T * const src, inline void memCopy(T *const dest, const T *const src, const size_t sx,
const size_t sx, const size_t sy) { const size_t sy) {
cudaError_t status; cudaError_t status;
PERF_BEGIN PERF_BEGIN
status = cudaMemcpy(dest, src, sx * sy * sizeof(T), cudaMemcpyDeviceToDevice); status = cudaMemcpy(dest, src, sx * sy * sizeof(T), cudaMemcpyDeviceToDevice);
@ -251,11 +227,6 @@ namespace dwt_cuda {
CudaDWTTester::check(status, "memcpy device > device"); CudaDWTTester::check(status, "memcpy device > device");
} }
} // end of namespace dwt_cuda } // end of namespace dwt_cuda
#endif // DWT_COMMON_CUDA_H #endif // DWT_COMMON_CUDA_H

9
examples/dwt2d/dwt_cuda/dwt.h Executable file → Normal file
View File

@ -58,10 +58,8 @@
#ifndef DWT_CUDA_H #ifndef DWT_CUDA_H
#define DWT_CUDA_H #define DWT_CUDA_H
namespace dwt_cuda { namespace dwt_cuda {
/// Forward 5/3 2D DWT. See common rules (above) for more details. /// Forward 5/3 2D DWT. See common rules (above) for more details.
/// @param in Expected to be normalized into range [-128, 127]. /// @param in Expected to be normalized into range [-128, 127].
/// Will not be preserved (will be overwritten). /// Will not be preserved (will be overwritten).
@ -71,7 +69,6 @@ namespace dwt_cuda {
/// @param levels number of recursive DWT levels /// @param levels number of recursive DWT levels
void fdwt53(int *in, int *out, int sizeX, int sizeY, int levels); void fdwt53(int *in, int *out, int sizeX, int sizeY, int levels);
/// Reverse 5/3 2D DWT. See common rules (above) for more details. /// Reverse 5/3 2D DWT. See common rules (above) for more details.
/// @param in Input DWT coefficients. Format described in common rules. /// @param in Input DWT coefficients. Format described in common rules.
/// Will not be preserved (will be overwritten). /// Will not be preserved (will be overwritten).
@ -82,7 +79,6 @@ namespace dwt_cuda {
/// @param levels number of recursive DWT levels /// @param levels number of recursive DWT levels
void rdwt53(int *in, int *out, int sizeX, int sizeY, int levels); void rdwt53(int *in, int *out, int sizeX, int sizeY, int levels);
/// Forward 9/7 2D DWT. See common rules (above) for more details. /// Forward 9/7 2D DWT. See common rules (above) for more details.
/// @param in Input DWT coefficients. Should be normalized (in range /// @param in Input DWT coefficients. Should be normalized (in range
/// [-0.5, 0.5]). Will not be preserved (will be overwritten). /// [-0.5, 0.5]). Will not be preserved (will be overwritten).
@ -92,7 +88,6 @@ namespace dwt_cuda {
/// @param levels number of recursive DWT levels /// @param levels number of recursive DWT levels
void fdwt97(float *in, float *out, int sizeX, int sizeY, int levels); void fdwt97(float *in, float *out, int sizeX, int sizeY, int levels);
/// Reverse 9/7 2D DWT. See common rules (above) for more details. /// Reverse 9/7 2D DWT. See common rules (above) for more details.
/// @param in Input DWT coefficients. Format described in common rules. /// @param in Input DWT coefficients. Format described in common rules.
/// Will not be preserved (will be overwritten). /// Will not be preserved (will be overwritten).
@ -103,10 +98,6 @@ namespace dwt_cuda {
/// @param levels number of recursive DWT levels /// @param levels number of recursive DWT levels
void rdwt97(float *in, float *out, int sizeX, int sizeY, int levels); void rdwt97(float *in, float *out, int sizeX, int sizeY, int levels);
} // namespace dwt_cuda } // namespace dwt_cuda
#endif // DWT_CUDA_H #endif // DWT_CUDA_H

97
examples/dwt2d/dwt_cuda/io.h Executable file → Normal file
View File

@ -30,16 +30,13 @@
/// POSSIBILITY OF SUCH DAMAGE. /// POSSIBILITY OF SUCH DAMAGE.
/// ///
#ifndef IO_H #ifndef IO_H
#define IO_H #define IO_H
#include "common.h" #include "common.h"
namespace dwt_cuda { namespace dwt_cuda {
/// Base for all IO classes - manages mirroring. /// Base for all IO classes - manages mirroring.
class DWTIO { class DWTIO {
protected: protected:
@ -80,13 +77,11 @@ namespace dwt_cuda {
} }
}; };
/// Base class for pixel loader and writer - manages computing start index, /// Base class for pixel loader and writer - manages computing start index,
/// stride and end of image for loading column of pixels. /// stride and end of image for loading column of pixels.
/// @tparam T type of image pixels /// @tparam T type of image pixels
/// @tparam CHECKED true = be prepared to image boundary, false = don't care /// @tparam CHECKED true = be prepared to image boundary, false = don't care
template <typename T, bool CHECKED> template <typename T, bool CHECKED> class VerticalDWTPixelIO : protected DWTIO {
class VerticalDWTPixelIO : protected DWTIO {
protected: protected:
int end; ///< index of bottom neightbor of last pixel of column int end; ///< index of bottom neightbor of last pixel of column
int stride; ///< increment of pointer to get to next pixel int stride; ///< increment of pointer to get to next pixel
@ -97,8 +92,8 @@ namespace dwt_cuda {
/// @param firstX x-coordinate of first pixel to use /// @param firstX x-coordinate of first pixel to use
/// @param firstY y-coordinate of first pixel to use /// @param firstY y-coordinate of first pixel to use
/// @return index of pixel at position [x, y] in the image /// @return index of pixel at position [x, y] in the image
__device__ int initialize(const int sizeX, const int sizeY, __device__ int initialize(const int sizeX, const int sizeY, int firstX,
int firstX, int firstY) { int firstY) {
// initialize all pointers and stride // initialize all pointers and stride
end = CHECKED ? (sizeY * sizeX + firstX) : 0; end = CHECKED ? (sizeY * sizeX + firstX) : 0;
stride = sizeX; stride = sizeX;
@ -106,8 +101,6 @@ namespace dwt_cuda {
} }
}; };
/// Writes reverse transformed pixels directly into output image. /// Writes reverse transformed pixels directly into output image.
/// @tparam T type of output pixels /// @tparam T type of output pixels
/// @tparam CHECKED true = be prepared to image boundary, false = don't care /// @tparam CHECKED true = be prepared to image boundary, false = don't care
@ -122,8 +115,8 @@ namespace dwt_cuda {
/// @param sizeY height of the image /// @param sizeY height of the image
/// @param firstX x-coordinate of first pixel to write into /// @param firstX x-coordinate of first pixel to write into
/// @param firstY y-coordinate of first pixel to write into /// @param firstY y-coordinate of first pixel to write into
__device__ void init(const int sizeX, const int sizeY, __device__ void init(const int sizeX, const int sizeY, int firstX,
int firstX, int firstY) { int firstY) {
if (firstX < sizeX) { if (firstX < sizeX) {
next = this->initialize(sizeX, sizeY, firstX, firstY); next = this->initialize(sizeX, sizeY, firstX, firstY);
} else { } else {
@ -145,43 +138,28 @@ namespace dwt_cuda {
} }
}; };
/// Loads pixels from input image. /// Loads pixels from input image.
/// @tparam T type of image input pixels /// @tparam T type of image input pixels
/// @tparam CHECKED true = be prepared to image boundary, false = don't care /// @tparam CHECKED true = be prepared to image boundary, false = don't care
template <typename T, bool CHECKED> template <typename T, bool CHECKED>
class VerticalDWTPixelLoader class VerticalDWTPixelLoader : protected VerticalDWTPixelIO<const T, CHECKED> {
: protected VerticalDWTPixelIO<const T, CHECKED> {
private: private:
int last; ///< index of last loaded pixel int last; ///< index of last loaded pixel
public: public:
//******************* FOR TEST ********************** //******************* FOR TEST **********************
__device__ int getlast(){ __device__ int getlast() { return last; }
return last; __device__ int getend() { return this->end; }
} __device__ int getstride() { return this->stride; }
__device__ int getend(){ __device__ void setend(int a) { this->end = a; }
return this->end;
}
__device__ int getstride(){
return this->stride;
}
__device__ void setend(int a){
this->end=a;
}
//******************* FOR TEST ********************** //******************* FOR TEST **********************
/// Initializes loader - sets input size and a position of first pixel. /// Initializes loader - sets input size and a position of first pixel.
/// @param sizeX width of the image /// @param sizeX width of the image
/// @param sizeY height of the image /// @param sizeY height of the image
/// @param firstX x-coordinate of first pixel to load /// @param firstX x-coordinate of first pixel to load
/// @param firstY y-coordinate of first pixel to load /// @param firstY y-coordinate of first pixel to load
__device__ void init(const int sizeX, const int sizeY, __device__ void init(const int sizeX, const int sizeY, int firstX,
int firstX, int firstY) { int firstY) {
// correctly mirror x coordinate // correctly mirror x coordinate
this->mirror(firstX, sizeX); this->mirror(firstX, sizeX);
@ -208,7 +186,8 @@ namespace dwt_cuda {
this->stride = -this->stride; // reverse loader's direction this->stride = -this->stride; // reverse loader's direction
} }
// avoid reading from negative indices if loader is checked // avoid reading from negative indices if loader is checked
// return (CHECKED && (last < 0)) ? 0 : input[last]; // TODO: use this checked variant later // return (CHECKED && (last < 0)) ? 0 : input[last]; // TODO: use this
// checked variant later
if (last < 0) { if (last < 0) {
return 0; return 0;
} }
@ -220,14 +199,11 @@ namespace dwt_cuda {
} }
}; };
/// Base for band write and loader. Manages computing strides and pointers /// Base for band write and loader. Manages computing strides and pointers
/// to first and last pixels in a linearly-stored-bands correct way. /// to first and last pixels in a linearly-stored-bands correct way.
/// @tparam T type of band coefficients /// @tparam T type of band coefficients
/// @tparam CHECKED true = be prepared to image boundary, false = don't care /// @tparam CHECKED true = be prepared to image boundary, false = don't care
template <typename T, bool CHECKED> template <typename T, bool CHECKED> class VerticalDWTBandIO : protected DWTIO {
class VerticalDWTBandIO : protected DWTIO {
protected: protected:
/// index of bottom neighbor of last pixel of loaded column /// index of bottom neighbor of last pixel of loaded column
int end; int end;
@ -279,12 +255,10 @@ namespace dwt_cuda {
end = 0; end = 0;
} }
//***********for test************** //***********for test**************
// end = CHECKED; // end = CHECKED;
//***********for test************** //***********for test**************
// finally, return index of the first item // finally, return index of the first item
return columnOffset // right column return columnOffset // right column
+ (firstY / 2) * verticalStride // right row + (firstY / 2) * verticalStride // right row
@ -292,9 +266,6 @@ namespace dwt_cuda {
} }
}; };
/// Directly loads coefficients from four consecutively stored transformed /// Directly loads coefficients from four consecutively stored transformed
/// bands. /// bands.
/// @tparam T type of input band coefficients /// @tparam T type of input band coefficients
@ -324,11 +295,12 @@ namespace dwt_cuda {
return 0; return 0;
} }
// avoid reading from negative indices if loader is checked // avoid reading from negative indices if loader is checked
// return (CHECKED && (last < 0)) ? 0 : input[last]; // TODO: use this checked variant later // return (CHECKED && (last < 0)) ? 0 : input[last]; // TODO: use this
// checked variant later
return input[last]; return input[last];
} }
public:
public:
/// Initializes loader - sets input size and a position of first pixel. /// Initializes loader - sets input size and a position of first pixel.
/// @param imageSizeX width of the image /// @param imageSizeX width of the image
/// @param imageSizeY height of the image /// @param imageSizeY height of the image
@ -336,8 +308,8 @@ namespace dwt_cuda {
/// (Parity determines vertically low or high band.) /// (Parity determines vertically low or high band.)
/// @param firstY y-coordinate of first pixel to load /// @param firstY y-coordinate of first pixel to load
/// (Parity determines horizontally low or high band.) /// (Parity determines horizontally low or high band.)
__device__ void init(const int imageSizeX, const int imageSizeY, __device__ void init(const int imageSizeX, const int imageSizeY, int firstX,
int firstX, const int firstY) { const int firstY) {
this->mirror(firstX, imageSizeX); this->mirror(firstX, imageSizeX);
last = this->initialize(imageSizeX, imageSizeY, firstX, firstY); last = this->initialize(imageSizeX, imageSizeY, firstX, firstY);
@ -371,12 +343,8 @@ namespace dwt_cuda {
__device__ T loadHighFrom(const T *const input) { __device__ T loadHighFrom(const T *const input) {
return updateAndLoad(input, this->strideLowToHigh); return updateAndLoad(input, this->strideLowToHigh);
} }
}; };
/// Directly saves coefficients into four transformed bands. /// Directly saves coefficients into four transformed bands.
/// @tparam T type of output band coefficients /// @tparam T type of output band coefficients
/// @tparam CHECKED true = be prepared to image boundary, false = don't care /// @tparam CHECKED true = be prepared to image boundary, false = don't care
@ -392,7 +360,8 @@ namespace dwt_cuda {
/// @param stride increment of the pointer to get to next output index /// @param stride increment of the pointer to get to next output index
__device__ int saveAndUpdate(T *const output, const T &item, __device__ int saveAndUpdate(T *const output, const T &item,
const int &stride) { const int &stride) {
// if(blockIdx.x == 0 && blockIdx.y == 11 && threadIdx.x == 0){ //test, Mar 20 // if(blockIdx.x == 0 && blockIdx.y == 11 && threadIdx.x == 0){
////test, Mar 20
if ((!CHECKED) || (next != this->end)) { if ((!CHECKED) || (next != this->end)) {
// if(next == 4) { // if(next == 4) {
// printf(" next: %d stride: %d val: %f \n", next, stride, item ); // printf(" next: %d stride: %d val: %f \n", next, stride, item );
@ -407,8 +376,8 @@ namespace dwt_cuda {
// } // }
return next; return next;
} }
public:
public:
/// Initializes writer - sets output size and a position of first pixel. /// Initializes writer - sets output size and a position of first pixel.
/// @param output output image /// @param output output image
/// @param imageSizeX width of the image /// @param imageSizeX width of the image
@ -455,29 +424,17 @@ namespace dwt_cuda {
} }
//*******Add three functions to get private values******* //*******Add three functions to get private values*******
__device__ int getnext(){ __device__ int getnext() { return next; }
return next;
}
__device__ int getend(){ __device__ int getend() { return this->end; }
return this->end;
}
__device__ int getstrideHighToLow(){ __device__ int getstrideHighToLow() { return this->strideHighToLow; }
return this->strideHighToLow;
}
__device__ int getstrideLowToHigh(){ __device__ int getstrideLowToHigh() { return this->strideLowToHigh; }
return this->strideLowToHigh;
}
//*******Add three functions to get private values******* //*******Add three functions to get private values*******
}; };
} // namespace dwt_cuda } // namespace dwt_cuda
#endif // IO_H #endif // IO_H

75
examples/dwt2d/dwt_cuda/transform_buffer.h Executable file → Normal file
View File

@ -30,14 +30,11 @@
/// POSSIBILITY OF SUCH DAMAGE. /// POSSIBILITY OF SUCH DAMAGE.
/// ///
#ifndef TRANSFORM_BUFFER_H #ifndef TRANSFORM_BUFFER_H
#define TRANSFORM_BUFFER_H #define TRANSFORM_BUFFER_H
namespace dwt_cuda { namespace dwt_cuda {
/// Buffer (in shared memory of GPU) where block of input image is stored, /// Buffer (in shared memory of GPU) where block of input image is stored,
/// but odd and even lines are separated. (Generates less bank conflicts when /// but odd and even lines are separated. (Generates less bank conflicts when
/// using lifting schema.) All operations expect SIZE_X threads. /// using lifting schema.) All operations expect SIZE_X threads.
@ -79,8 +76,6 @@ namespace dwt_cuda {
/// buffer for both even and odd columns /// buffer for both even and odd columns
T data[2 * BUFFER_SIZE + PADDING]; T data[2 * BUFFER_SIZE + PADDING];
/// Applies specified function to all central elements while also passing /// Applies specified function to all central elements while also passing
/// previous and next elements as parameters. /// previous and next elements as parameters.
/// @param count count of central elements to apply function to /// @param count count of central elements to apply function to
@ -123,16 +118,13 @@ namespace dwt_cuda {
} }
public: public:
__device__ void getPrintData() { __device__ void getPrintData() {
// //
for (int i = 0; i < 2 * BUFFER_SIZE + PADDING; i++) { for (int i = 0; i < 2 * BUFFER_SIZE + PADDING; i++) {
printf(" index: %d data: %f \n ", i, data[i]); printf(" index: %d data: %f \n ", i, data[i]);
} }
} }
/// Gets offset of the column with given index. Central columns have /// Gets offset of the column with given index. Central columns have
/// indices from 0 to NUM_LINES - 1, left boundary columns have negative /// indices from 0 to NUM_LINES - 1, left boundary columns have negative
/// indices and right boundary columns indices start with NUM_LINES. /// indices and right boundary columns indices start with NUM_LINES.
@ -144,14 +136,10 @@ namespace dwt_cuda {
+ (columnIndex & 1) * ODD_OFFSET; // select odd or even buffer + (columnIndex & 1) * ODD_OFFSET; // select odd or even buffer
} }
/// Provides access to data of the transform buffer. /// Provides access to data of the transform buffer.
/// @param index index of the item to work with /// @param index index of the item to work with
/// @return reference to item at given index /// @return reference to item at given index
__device__ T & operator[] (const int index) { __device__ T &operator[](const int index) { return data[index]; }
return data[index];
}
/// Applies specified function to all horizontally even elements in /// Applies specified function to all horizontally even elements in
/// specified lines. (Including even elements in boundaries except /// specified lines. (Including even elements in boundaries except
@ -163,8 +151,7 @@ namespace dwt_cuda {
/// parameters: previous (odd) element, the even /// parameters: previous (odd) element, the even
/// element itself and finally next (odd) element /// element itself and finally next (odd) element
template <typename FUNC> template <typename FUNC>
__device__ void forEachHorizontalEven(const int firstLine, __device__ void forEachHorizontalEven(const int firstLine, const int numLines,
const int numLines,
const FUNC &func) { const FUNC &func) {
// number of even elements to apply function to // number of even elements to apply function to
const int count = numLines * VERTICAL_STRIDE - 1; const int count = numLines * VERTICAL_STRIDE - 1;
@ -177,14 +164,14 @@ namespace dwt_cuda {
// if(threadIdx.x == 0) { // if(threadIdx.x == 0) {
// printf("forEachHorizontalEven count %d, centerOffset %d prevOffset %d nextOffset %d \n", count, centerOffset, prevOffset, nextOffset); // printf("forEachHorizontalEven count %d, centerOffset %d prevOffset %d
// nextOffset %d \n", count, centerOffset, prevOffset, nextOffset);
// } // }
// call generic horizontal step function // call generic horizontal step function
horizontalStep(count, prevOffset, centerOffset, nextOffset, func); horizontalStep(count, prevOffset, centerOffset, nextOffset, func);
} }
/// Applies given function to all horizontally odd elements in specified /// Applies given function to all horizontally odd elements in specified
/// lines. (Including odd elements in boundaries except last odd element /// lines. (Including odd elements in boundaries except last odd element
/// in last right boundary.) SIZE_X threads participate and synchronization /// in last right boundary.) SIZE_X threads participate and synchronization
@ -195,8 +182,7 @@ namespace dwt_cuda {
/// parameters: previous (even) element, the odd /// parameters: previous (even) element, the odd
/// element itself and finally next (even) element /// element itself and finally next (even) element
template <typename FUNC> template <typename FUNC>
__device__ void forEachHorizontalOdd(const int firstLine, __device__ void forEachHorizontalOdd(const int firstLine, const int numLines,
const int numLines,
const FUNC &func) { const FUNC &func) {
// numbet of odd elements to apply function to // numbet of odd elements to apply function to
const int count = numLines * VERTICAL_STRIDE - 1; const int count = numLines * VERTICAL_STRIDE - 1;
@ -208,15 +194,14 @@ namespace dwt_cuda {
const int nextOffset = prevOffset + 1; const int nextOffset = prevOffset + 1;
// if(threadIdx.x == 0) { // if(threadIdx.x == 0) {
// printf("forEachHorizontalOdd count %d, centerOffset %d prevOffset %d nextOffset %d \n", count, centerOffset, prevOffset, nextOffset); // printf("forEachHorizontalOdd count %d, centerOffset %d prevOffset %d
// nextOffset %d \n", count, centerOffset, prevOffset, nextOffset);
// } // }
// call generic horizontal step function // call generic horizontal step function
horizontalStep(count, prevOffset, centerOffset, nextOffset, func); horizontalStep(count, prevOffset, centerOffset, nextOffset, func);
} }
/// Applies specified function to all even elements (except element #0) /// Applies specified function to all even elements (except element #0)
/// of given column. Each thread takes care of one column, so there's /// of given column. Each thread takes care of one column, so there's
/// no need for synchronization. /// no need for synchronization.
@ -238,17 +223,15 @@ namespace dwt_cuda {
/* __syncthreads(); /* __syncthreads();
if ((blockIdx.x * blockDim.x + threadIdx.x) == 0){ if ((blockIdx.x * blockDim.x + threadIdx.x) == 0){
diffOut[2500]++; diffOut[2500]++;
diffOut[diffOut[2500]] = 2;//data[columnOffset + row * VERTICAL_STRIDE]; diffOut[diffOut[2500]] = 2;//data[columnOffset +
row * VERTICAL_STRIDE];
} }
__syncthreads(); __syncthreads();
*/ //--------------- FOR TEST ----------------- */ //--------------- FOR TEST -----------------
} }
} }
} }
/// Applies specified function to all odd elements of given column. /// Applies specified function to all odd elements of given column.
/// Each thread takes care of one column, so there's no need for /// Each thread takes care of one column, so there's no need for
/// synchronization. /// synchronization.
@ -266,12 +249,12 @@ namespace dwt_cuda {
f(prev, data[columnOffset + row * VERTICAL_STRIDE], next); f(prev, data[columnOffset + row * VERTICAL_STRIDE], next);
//--------------- FOR TEST ----------------- //--------------- FOR TEST -----------------
/* __syncthreads(); /* __syncthreads();
if ((blockIdx.x * blockDim.x + threadIdx.x) == 0){ if ((blockIdx.x * blockDim.x + threadIdx.x) == 0){
diffOut[2500]++; diffOut[2500]++;
diffOut[diffOut[2500]] = 1; //data[columnOffset + row * VERTICAL_STRIDE]; diffOut[diffOut[2500]] = 1; //data[columnOffset +
row * VERTICAL_STRIDE];
} }
__syncthreads(); __syncthreads();
@ -279,8 +262,6 @@ namespace dwt_cuda {
} }
} }
/// Scales elements at specified lines. /// Scales elements at specified lines.
/// @param evenScale scaling factor for horizontally even elements /// @param evenScale scaling factor for horizontally even elements
/// @param oddScale scaling factor for horizontally odd elements /// @param oddScale scaling factor for horizontally odd elements
@ -294,7 +275,9 @@ namespace dwt_cuda {
const int finalCount = count % SIZE_X; const int finalCount = count % SIZE_X;
const int finalOffset = count - finalCount; const int finalOffset = count - finalCount;
// printf("scaleHorizontal sizeX: %d offset %d, count, %d, steps, %d, finalCount %d, finalOffset %d \n", SIZE_X, offset, count, steps, finalCount, finalOffset); // printf("scaleHorizontal sizeX: %d offset %d, count, %d, steps, %d,
// finalCount %d, finalOffset %d \n", SIZE_X, offset, count, steps,
// finalCount, finalOffset);
// run iterations, whete all threads participate // run iterations, whete all threads participate
for (int i = 0; i < steps; i++) { for (int i = 0; i < steps; i++) {
@ -319,10 +302,8 @@ namespace dwt_cuda {
// } // }
data[threadIdx.x + finalOffset + offset + ODD_OFFSET] *= oddScale; data[threadIdx.x + finalOffset + offset + ODD_OFFSET] *= oddScale;
} }
} }
/// Scales elements in specified column. /// Scales elements in specified column.
/// @param evenScale scaling factor for vertically even elements /// @param evenScale scaling factor for vertically even elements
/// @param oddScale scaling factor for vertically odd elements /// @param oddScale scaling factor for vertically odd elements
@ -341,33 +322,17 @@ namespace dwt_cuda {
} }
} }
//****************For Test(Feb23), test inter parameters*************
__device__ int getVERTICAL_STRIDE() { return VERTICAL_STRIDE; }
__device__ int getSHM_BANKS() { return SHM_BANKS; }
__device__ int getBuffersize() { return BUFFER_SIZE; }
__device__ int getPADDING() { return PADDING; }
__device__ int getODD_OFFSET() { return ODD_OFFSET; }
//****************For Test(Feb23), test inter parameters************* //****************For Test(Feb23), test inter parameters*************
__device__ int getVERTICAL_STRIDE(){
return VERTICAL_STRIDE;
}
__device__ int getSHM_BANKS(){
return SHM_BANKS;
}
__device__ int getBuffersize(){
return BUFFER_SIZE;
}
__device__ int getPADDING(){
return PADDING;
}
__device__ int getODD_OFFSET(){
return ODD_OFFSET;
}
//****************For Test(Feb23), test inter parameters*************
}; // end of class TransformBuffer }; // end of class TransformBuffer
} // namespace dwt_cuda } // namespace dwt_cuda
#endif // TRANSFORM_BUFFER_H #endif // TRANSFORM_BUFFER_H

View File

@ -5,4 +5,3 @@
./dwt2d 4.bmp -d 4x4 -r -5 -l 3 ./dwt2d 4.bmp -d 4x4 -r -5 -l 3
# ./dwt2d 4.bmp -d 4x4 -r -9 -l 3 # ./dwt2d 4.bmp -d 4x4 -r -9 -l 3
# ./dwt2d 8.bmp -d 8x8 -f -9 -l 3 # ./dwt2d 8.bmp -d 8x8 -f -9 -l 3

View File

@ -7,12 +7,3 @@
/usr/local/cuda/bin/nvcc -arch sm_50 -I. -I/include -O2 --compiler-options -fno-strict-aliasing -c dwt_cuda/rdwt97.cu -o dwt_cuda/rdwt97.cu.o /usr/local/cuda/bin/nvcc -arch sm_50 -I. -I/include -O2 --compiler-options -fno-strict-aliasing -c dwt_cuda/rdwt97.cu -o dwt_cuda/rdwt97.cu.o
/usr/local/cuda/bin/nvcc -arch sm_50 -I. -I/include -O2 --compiler-options -fno-strict-aliasing -c dwt_cuda/rdwt53.cu -o dwt_cuda/rdwt53.cu.o /usr/local/cuda/bin/nvcc -arch sm_50 -I. -I/include -O2 --compiler-options -fno-strict-aliasing -c dwt_cuda/rdwt53.cu -o dwt_cuda/rdwt53.cu.o
g++ -fPIC -o nvcc_dwt2d main.cu.o dwt.cu.o components.cu.o dwt_cuda/fdwt53.cu.o dwt_cuda/fdwt97.cu.o dwt_cuda/common.cu.o dwt_cuda/rdwt97.cu.o dwt_cuda/rdwt53.cu.o -L/usr/local/cuda/lib64 -lcudart g++ -fPIC -o nvcc_dwt2d main.cu.o dwt.cu.o components.cu.o dwt_cuda/fdwt53.cu.o dwt_cuda/fdwt97.cu.o dwt_cuda/common.cu.o dwt_cuda/rdwt97.cu.o dwt_cuda/rdwt53.cu.o -L/usr/local/cuda/lib64 -lcudart

View File

@ -1,14 +1,12 @@
#include <stdio.h> #include <stdio.h>
__global__ __global__ void saxpy(int n, float a, float *x, float *y) {
void saxpy(int n, float a, float *x, float *y)
{
int i = blockIdx.x * blockDim.x + threadIdx.x; int i = blockIdx.x * blockDim.x + threadIdx.x;
if (i < n) y[i] = a*x[i] + y[i]; if (i < n)
y[i] = a * x[i] + y[i];
} }
int main(void) int main(void) {
{
int N = 1 << 20; int N = 1 << 20;
float *x, *y, *d_x, *d_y; float *x, *y, *d_x, *d_y;
x = (float *)malloc(N * sizeof(float)); x = (float *)malloc(N * sizeof(float));

View File

@ -1,14 +1,11 @@
#include <stdio.h> #include <stdio.h>
__global__ __global__ void saxpy(void) {
void saxpy(void)
{
int i = blockIdx.x * blockDim.x + threadIdx.x; int i = blockIdx.x * blockDim.x + threadIdx.x;
printf("block_id:%d thread_id:%d \n", i) printf("block_id:%d thread_id:%d \n", i)
} }
int main(void) int main(void) {
{
int N = 1 << 20; int N = 1 << 20;
float *x, *y, *d_x, *d_y; float *x, *y, *d_x, *d_y;
x = (float *)malloc(N * sizeof(float)); x = (float *)malloc(N * sizeof(float));

View File

@ -1,13 +1,8 @@
#include <stdio.h> #include <stdio.h>
__global__ __global__ void saxpy(int N) { printf("hello!: %d\n", N); }
void saxpy(int N)
{
printf("hello!: %d\n", N);
}
int main(void) int main(void) {
{
int N = 1 << 20; int N = 1 << 20;
float *x, *y, *d_x, *d_y; float *x, *y, *d_x, *d_y;
x = (float *)malloc(N * sizeof(float)); x = (float *)malloc(N * sizeof(float));

View File

@ -1,13 +1,8 @@
#include <stdio.h> #include <stdio.h>
__global__ __global__ void saxpy(void) { printf("hello!\n"); }
void saxpy(void)
{
printf("hello!\n");
}
int main(void) int main(void) {
{
int N = 1 << 20; int N = 1 << 20;
float *x, *y, *d_x, *d_y; float *x, *y, *d_x, *d_y;
x = (float *)malloc(N * sizeof(float)); x = (float *)malloc(N * sizeof(float));