scan.hpp 9.0 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258
  1. /*M///////////////////////////////////////////////////////////////////////////////////////
  2. //
  3. // IMPORTANT: READ BEFORE DOWNLOADING, COPYING, INSTALLING OR USING.
  4. //
  5. // By downloading, copying, installing or using the software you agree to this license.
  6. // If you do not agree to this license, do not download, install,
  7. // copy or use the software.
  8. //
  9. //
  10. // License Agreement
  11. // For Open Source Computer Vision Library
  12. //
  13. // Copyright (C) 2000-2008, Intel Corporation, all rights reserved.
  14. // Copyright (C) 2009, Willow Garage Inc., all rights reserved.
  15. // Third party copyrights are property of their respective owners.
  16. //
  17. // Redistribution and use in source and binary forms, with or without modification,
  18. // are permitted provided that the following conditions are met:
  19. //
  20. // * Redistribution's of source code must retain the above copyright notice,
  21. // this list of conditions and the following disclaimer.
  22. //
  23. // * Redistribution's in binary form must reproduce the above copyright notice,
  24. // this list of conditions and the following disclaimer in the documentation
  25. // and/or other materials provided with the distribution.
  26. //
  27. // * The name of the copyright holders may not be used to endorse or promote products
  28. // derived from this software without specific prior written permission.
  29. //
  30. // This software is provided by the copyright holders and contributors "as is" and
  31. // any express or implied warranties, including, but not limited to, the implied
  32. // warranties of merchantability and fitness for a particular purpose are disclaimed.
  33. // In no event shall the Intel Corporation or contributors be liable for any direct,
  34. // indirect, incidental, special, exemplary, or consequential damages
  35. // (including, but not limited to, procurement of substitute goods or services;
  36. // loss of use, data, or profits; or business interruption) however caused
  37. // and on any theory of liability, whether in contract, strict liability,
  38. // or tort (including negligence or otherwise) arising in any way out of
  39. // the use of this software, even if advised of the possibility of such damage.
  40. //
  41. //M*/
  42. #ifndef OPENCV_CUDA_SCAN_HPP
  43. #define OPENCV_CUDA_SCAN_HPP
  44. #include "opencv2/core/cuda/common.hpp"
  45. #include "opencv2/core/cuda/utility.hpp"
  46. #include "opencv2/core/cuda/warp.hpp"
  47. #include "opencv2/core/cuda/warp_shuffle.hpp"
  48. /** @file
  49. * @deprecated Use @ref cudev instead.
  50. */
  51. //! @cond IGNORED
  52. namespace cv { namespace cuda { namespace device
  53. {
  54. enum ScanKind { EXCLUSIVE = 0, INCLUSIVE = 1 };
  55. template <ScanKind Kind, typename T, typename F> struct WarpScan
  56. {
  57. __device__ __forceinline__ WarpScan() {}
  58. __device__ __forceinline__ WarpScan(const WarpScan& other) { CV_UNUSED(other); }
  59. __device__ __forceinline__ T operator()( volatile T *ptr , const unsigned int idx)
  60. {
  61. const unsigned int lane = idx & 31;
  62. F op;
  63. if ( lane >= 1) ptr [idx ] = op(ptr [idx - 1], ptr [idx]);
  64. if ( lane >= 2) ptr [idx ] = op(ptr [idx - 2], ptr [idx]);
  65. if ( lane >= 4) ptr [idx ] = op(ptr [idx - 4], ptr [idx]);
  66. if ( lane >= 8) ptr [idx ] = op(ptr [idx - 8], ptr [idx]);
  67. if ( lane >= 16) ptr [idx ] = op(ptr [idx - 16], ptr [idx]);
  68. if( Kind == INCLUSIVE )
  69. return ptr [idx];
  70. else
  71. return (lane > 0) ? ptr [idx - 1] : 0;
  72. }
  73. __device__ __forceinline__ unsigned int index(const unsigned int tid)
  74. {
  75. return tid;
  76. }
  77. __device__ __forceinline__ void init(volatile T *ptr){}
  78. static const int warp_offset = 0;
  79. typedef WarpScan<INCLUSIVE, T, F> merge;
  80. };
  81. template <ScanKind Kind , typename T, typename F> struct WarpScanNoComp
  82. {
  83. __device__ __forceinline__ WarpScanNoComp() {}
  84. __device__ __forceinline__ WarpScanNoComp(const WarpScanNoComp& other) { CV_UNUSED(other); }
  85. __device__ __forceinline__ T operator()( volatile T *ptr , const unsigned int idx)
  86. {
  87. const unsigned int lane = threadIdx.x & 31;
  88. F op;
  89. ptr [idx ] = op(ptr [idx - 1], ptr [idx]);
  90. ptr [idx ] = op(ptr [idx - 2], ptr [idx]);
  91. ptr [idx ] = op(ptr [idx - 4], ptr [idx]);
  92. ptr [idx ] = op(ptr [idx - 8], ptr [idx]);
  93. ptr [idx ] = op(ptr [idx - 16], ptr [idx]);
  94. if( Kind == INCLUSIVE )
  95. return ptr [idx];
  96. else
  97. return (lane > 0) ? ptr [idx - 1] : 0;
  98. }
  99. __device__ __forceinline__ unsigned int index(const unsigned int tid)
  100. {
  101. return (tid >> warp_log) * warp_smem_stride + 16 + (tid & warp_mask);
  102. }
  103. __device__ __forceinline__ void init(volatile T *ptr)
  104. {
  105. ptr[threadIdx.x] = 0;
  106. }
  107. static const int warp_smem_stride = 32 + 16 + 1;
  108. static const int warp_offset = 16;
  109. static const int warp_log = 5;
  110. static const int warp_mask = 31;
  111. typedef WarpScanNoComp<INCLUSIVE, T, F> merge;
  112. };
  113. template <ScanKind Kind , typename T, typename Sc, typename F> struct BlockScan
  114. {
  115. __device__ __forceinline__ BlockScan() {}
  116. __device__ __forceinline__ BlockScan(const BlockScan& other) { CV_UNUSED(other); }
  117. __device__ __forceinline__ T operator()(volatile T *ptr)
  118. {
  119. const unsigned int tid = threadIdx.x;
  120. const unsigned int lane = tid & warp_mask;
  121. const unsigned int warp = tid >> warp_log;
  122. Sc scan;
  123. typename Sc::merge merge_scan;
  124. const unsigned int idx = scan.index(tid);
  125. T val = scan(ptr, idx);
  126. __syncthreads ();
  127. if( warp == 0)
  128. scan.init(ptr);
  129. __syncthreads ();
  130. if( lane == 31 )
  131. ptr [scan.warp_offset + warp ] = (Kind == INCLUSIVE) ? val : ptr [idx];
  132. __syncthreads ();
  133. if( warp == 0 )
  134. merge_scan(ptr, idx);
  135. __syncthreads();
  136. if ( warp > 0)
  137. val = ptr [scan.warp_offset + warp - 1] + val;
  138. __syncthreads ();
  139. ptr[idx] = val;
  140. __syncthreads ();
  141. return val ;
  142. }
  143. static const int warp_log = 5;
  144. static const int warp_mask = 31;
  145. };
  146. template <typename T>
  147. __device__ T warpScanInclusive(T idata, volatile T* s_Data, unsigned int tid)
  148. {
  149. #if __CUDA_ARCH__ >= 300
  150. const unsigned int laneId = cv::cuda::device::Warp::laneId();
  151. // scan on shuffl functions
  152. #pragma unroll
  153. for (int i = 1; i <= (OPENCV_CUDA_WARP_SIZE / 2); i *= 2)
  154. {
  155. const T n = cv::cuda::device::shfl_up(idata, i);
  156. if (laneId >= i)
  157. idata += n;
  158. }
  159. return idata;
  160. #else
  161. unsigned int pos = 2 * tid - (tid & (OPENCV_CUDA_WARP_SIZE - 1));
  162. s_Data[pos] = 0;
  163. pos += OPENCV_CUDA_WARP_SIZE;
  164. s_Data[pos] = idata;
  165. s_Data[pos] += s_Data[pos - 1];
  166. s_Data[pos] += s_Data[pos - 2];
  167. s_Data[pos] += s_Data[pos - 4];
  168. s_Data[pos] += s_Data[pos - 8];
  169. s_Data[pos] += s_Data[pos - 16];
  170. return s_Data[pos];
  171. #endif
  172. }
  173. template <typename T>
  174. __device__ __forceinline__ T warpScanExclusive(T idata, volatile T* s_Data, unsigned int tid)
  175. {
  176. return warpScanInclusive(idata, s_Data, tid) - idata;
  177. }
  178. template <int tiNumScanThreads, typename T>
  179. __device__ T blockScanInclusive(T idata, volatile T* s_Data, unsigned int tid)
  180. {
  181. if (tiNumScanThreads > OPENCV_CUDA_WARP_SIZE)
  182. {
  183. //Bottom-level inclusive warp scan
  184. T warpResult = warpScanInclusive(idata, s_Data, tid);
  185. //Save top elements of each warp for exclusive warp scan
  186. //sync to wait for warp scans to complete (because s_Data is being overwritten)
  187. __syncthreads();
  188. if ((tid & (OPENCV_CUDA_WARP_SIZE - 1)) == (OPENCV_CUDA_WARP_SIZE - 1))
  189. {
  190. s_Data[tid >> OPENCV_CUDA_LOG_WARP_SIZE] = warpResult;
  191. }
  192. //wait for warp scans to complete
  193. __syncthreads();
  194. if (tid < (tiNumScanThreads / OPENCV_CUDA_WARP_SIZE) )
  195. {
  196. //grab top warp elements
  197. T val = s_Data[tid];
  198. //calculate exclusive scan and write back to shared memory
  199. s_Data[tid] = warpScanExclusive(val, s_Data, tid);
  200. }
  201. //return updated warp scans with exclusive scan results
  202. __syncthreads();
  203. return warpResult + s_Data[tid >> OPENCV_CUDA_LOG_WARP_SIZE];
  204. }
  205. else
  206. {
  207. return warpScanInclusive(idata, s_Data, tid);
  208. }
  209. }
  210. }}}
  211. //! @endcond
  212. #endif // OPENCV_CUDA_SCAN_HPP