///
/// @file    common.h
/// @author  Martin Jirman (207962@mail.muni.cz)
/// @brief   Common stuff for all CUDA dwt functions.
/// @date    2011-01-20 14:19
///
/// Copyright (c) 2011 Martin Jirman
/// All rights reserved.
///
/// Redistribution and use in source and binary forms, with or without
/// modification, are permitted provided that the following conditions are met:
///
///     * Redistributions of source code must retain the above copyright
///       notice, this list of conditions and the following disclaimer.
///     * Redistributions in binary form must reproduce the above copyright
///       notice, this list of conditions and the following disclaimer in the
///       documentation and/or other materials provided with the distribution.
///
/// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
/// AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
/// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
/// ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE
/// LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
/// CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
/// SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
/// INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
/// CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
/// ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
/// POSSIBILITY OF SUCH DAMAGE.
///

#ifndef DWT_COMMON_H
#define DWT_COMMON_H

#include <cstdio>
#include <algorithm>
#include <vector>

// compile time minimum macro
#define CTMIN(a,b) (((a) < (b)) ? (a) : (b))

// performance testing macros
#if defined(GPU_DWT_TESTING)
  #define PERF_BEGIN                                                          \
  {                                                                           \
    dwt_cuda::CudaDWTTester PERF_TESTER;                                      \
    for(int PERF_N = PERF_TESTER.getNumIterations(); PERF_N--; )              \
    {                                                                         \
      PERF_TESTER.beginTestIteration();

  #define PERF_END(PERF_NAME, PERF_W, PERF_H)                                 \
      PERF_TESTER.endTestIteration();                                         \
    }                                                                         \
    PERF_TESTER.showPerformance(PERF_NAME, PERF_W, PERF_H);                   \
  }
#else // GPU_DWT_TESTING
  #define PERF_BEGIN
  #define PERF_END(PERF_NAME, PERF_W, PERF_H)
#endif // GPU_DWT_TESTING

namespace dwt_cuda {

  /// Divide and round up.
  template <typename T>
  __device__ __host__ inline T divRndUp(const T & n, const T & d) {
    return (n / d) + ((n % d) ? 1 : 0);
  }

  // 9/7 forward DWT lifting scheme coefficients
  const float f97Predict1 = -1.586134342;   ///< forward 9/7 predict 1
  const float f97Update1 = -0.05298011854;  ///< forward 9/7 update 1
  const float f97Predict2 = 0.8829110762;   ///< forward 9/7 predict 2
  const float f97Update2 = 0.4435068522;    ///< forward 9/7 update 2

  // 9/7 reverse DWT lifting scheme coefficients
  const float r97update2 = -f97Update2;     ///< undo 9/7 update 2
  const float r97predict2 = -f97Predict2;   ///< undo 9/7 predict 2
  const float r97update1 = -f97Update1;     ///< undo 9/7 update 1
  const float r97Predict1 = -f97Predict1;   ///< undo 9/7 predict 1

  // FDWT 9/7 scaling coefficients
  const float scale97Mul = 1.23017410491400f;
  const float scale97Div = 1.0 / scale97Mul;

  // 5/3 forward DWT lifting scheme coefficients
  const float forward53Predict = -0.5f;  ///< forward 5/3 predict
  const float forward53Update = 0.25f;   ///< forward 5/3 update

  // 5/3 reverse DWT lifting scheme coefficients
  const float reverse53Update = -forward53Update;    ///< undo 5/3 update
  const float reverse53Predict = -forward53Predict;  ///< undo 5/3 predict
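  // Illustrative sketch only (an assumption, not part of the original
  // interface): one forward 5/3 lifting step using the coefficients above.
  // The function name and parameters are hypothetical; `even` is sample 2i,
  // `odd` is sample 2i+1, `evenRight` is sample 2i+2 and `oddLeft` is the
  // already-predicted sample 2i-1.
  __device__ __host__ inline void lift53ForwardSketch(float & even,
                                                      float & odd,
                                                      const float evenRight,
                                                      const float oddLeft) {
    odd  += forward53Predict * (even + evenRight);  // predict: high-pass value
    even += forward53Update * (oddLeft + odd);      // update: low-pass value
  }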
  /// Functor which adds scaled sum of neighbors to given central pixel.
  struct AddScaledSum {
    const float scale;  // scale of neighbors
    __device__ AddScaledSum(const float scale) : scale(scale) {}
    __device__ void operator()(const float p, float & c, const float n) const {
      // if(threadIdx.x == 0) {
      //   printf("scale %f, p %f c %f n %f , result: %f\n",
      //          scale, p, c, n, scale * (p + n));
      // }
      c += scale * (p + n);
    }
  };

  /// Returns index ranging from 0 to num threads, such that first half
  /// of threads get even indices and others get odd indices. Each thread
  /// gets a different index.
  /// Example: (for 8 threads)   threadIdx.x:  0 1 2 3 4 5 6 7
  ///                            parityIdx:    0 2 4 6 1 3 5 7
  /// @tparam THREADS  total count of participating threads
  /// @return parity-separated index of thread
  template <int THREADS>
  __device__ inline int parityIdx() {
    return (threadIdx.x * 2) - (THREADS - 1) * (threadIdx.x / (THREADS / 2));
  }

  /// size of shared memory
  #if defined(__CUDA_ARCH__) && (__CUDA_ARCH__ >= 200)
    const int SHM_SIZE = 48 * 1024;
  #else
    const int SHM_SIZE = 16 * 1024;
  #endif

  /// Performance and return code tester.
  class CudaDWTTester {
  private:
    static bool testRunning;    ///< true if any test is currently running
    cudaEvent_t beginEvent;     ///< begin CUDA event
    cudaEvent_t endEvent;       ///< end CUDA event
    std::vector<float> times;   ///< collected times
    const bool disabled;        ///< true if this object is disabled
  public:
    /// Checks CUDA related error.
    /// @param status   return code to be checked
    /// @param message  message to be shown if there was an error
    /// @return true if there was no error, false otherwise
    static bool check(const cudaError_t & status, const char * message) {
      #if defined(GPU_DWT_TESTING)
        if((!testRunning) && status != cudaSuccess) {
          const char * errorString = cudaGetErrorString(status);
          fprintf(stderr, "CUDA ERROR: '%s': %s\n", message, errorString);
          fflush(stderr);
          return false;
        }
      #endif // GPU_DWT_TESTING
      return true;
    }

    /// Checks last kernel call for errors.
    /// @param message  description of the kernel call
    /// @return true if there was no error, false otherwise
    static bool checkLastKernelCall(const char * message) {
      #if defined(GPU_DWT_TESTING)
        return testRunning ? true : check(cudaThreadSynchronize(), message);
      #else // GPU_DWT_TESTING
        return true;
      #endif // GPU_DWT_TESTING
    }

    /// Initializes DWT tester for time measurement.
    CudaDWTTester() : disabled(testRunning) {}

    /// Gets preferred number of iterations.
    int getNumIterations() {
      return disabled ? 1 : 31;
    }

    /// Starts one test iteration.
    void beginTestIteration() {
      if(!disabled) {
        cudaEventCreate(&beginEvent);
        cudaEventCreate(&endEvent);
        cudaEventRecord(beginEvent, 0);
        testRunning = true;
      }
    }

    /// Ends one test iteration.
    void endTestIteration() {
      if(!disabled) {
        float time;
        testRunning = false;
        cudaEventRecord(endEvent, 0);
        cudaEventSynchronize(endEvent);
        cudaEventElapsedTime(&time, beginEvent, endEvent);
        cudaEventDestroy(beginEvent);
        cudaEventDestroy(endEvent);
        times.push_back(time);
      }
    }

    /// Shows brief info about all iterations.
    /// @param name   name of processing method
    /// @param sizeX  width of processed image
    /// @param sizeY  height of processed image
    void showPerformance(const char * name, const int sizeX, const int sizeY) {
      if(!disabled) {
        // compute mean and median
        std::sort(times.begin(), times.end());
        double sum = 0;
        for(int i = times.size(); i--; ) {
          sum += times[i];
        }
        const double median = (times[times.size() / 2]
                               + times[(times.size() - 1) / 2]) * 0.5f;
        printf(" %s: %7.3f ms (mean) %7.3f ms (median) %7.3f ms (max) "
               "(%d x %d)\n", name, (sum / times.size()), median,
               times[times.size() - 1], sizeX, sizeY);
      }
    }
  };
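  // Usage sketch (an assumption, not taken from the original header): the
  // PERF_* macros and the tester above are typically wrapped around a kernel
  // launch like this, where `someDwtKernel`, `grid` and `block` are
  // hypothetical:
  //
  //   PERF_BEGIN
  //   someDwtKernel<<<grid, block>>>(in, out, sizeX, sizeY);
  //   PERF_END(" someDwtKernel", sizeX, sizeY)
  //   CudaDWTTester::checkLastKernelCall("someDwtKernel");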
  /// Simple cudaMemcpy wrapped in performance tester.
  /// @param dest  destination buffer
  /// @param src   source buffer
  /// @param sx    width of copied image
  /// @param sy    height of copied image
  template <typename T>
  inline void memCopy(T * const dest, const T * const src,
                      const size_t sx, const size_t sy) {
    cudaError_t status;
    PERF_BEGIN
    status = cudaMemcpy(dest, src, sx*sy*sizeof(T), cudaMemcpyDeviceToDevice);
    PERF_END(" memcpy", sx, sy)
    CudaDWTTester::check(status, "memcpy device > device");
  }

} // end of namespace dwt_cuda

#endif // DWT_COMMON_H