block.hpp

/*M///////////////////////////////////////////////////////////////////////////////////////
//
//  IMPORTANT: READ BEFORE DOWNLOADING, COPYING, INSTALLING OR USING.
//
//  By downloading, copying, installing or using the software you agree to this license.
//  If you do not agree to this license, do not download, install,
//  copy or use the software.
//
//
//                           License Agreement
//                For Open Source Computer Vision Library
//
// Copyright (C) 2000-2008, Intel Corporation, all rights reserved.
// Copyright (C) 2009, Willow Garage Inc., all rights reserved.
// Third party copyrights are property of their respective owners.
//
// Redistribution and use in source and binary forms, with or without modification,
// are permitted provided that the following conditions are met:
//
//   * Redistribution's of source code must retain the above copyright notice,
//     this list of conditions and the following disclaimer.
//
//   * Redistribution's in binary form must reproduce the above copyright notice,
//     this list of conditions and the following disclaimer in the documentation
//     and/or other materials provided with the distribution.
//
//   * The name of the copyright holders may not be used to endorse or promote products
//     derived from this software without specific prior written permission.
//
// This software is provided by the copyright holders and contributors "as is" and
// any express or implied warranties, including, but not limited to, the implied
// warranties of merchantability and fitness for a particular purpose are disclaimed.
// In no event shall the Intel Corporation or contributors be liable for any direct,
// indirect, incidental, special, exemplary, or consequential damages
// (including, but not limited to, procurement of substitute goods or services;
// loss of use, data, or profits; or business interruption) however caused
// and on any theory of liability, whether in contract, strict liability,
// or tort (including negligence or otherwise) arising in any way out of
// the use of this software, even if advised of the possibility of such damage.
//
//M*/

#ifndef OPENCV_CUDA_DEVICE_BLOCK_HPP
#define OPENCV_CUDA_DEVICE_BLOCK_HPP

/** @file
 * @deprecated Use @ref cudev instead.
 */

//! @cond IGNORED

namespace cv { namespace cuda { namespace device
{
    struct Block
    {
        // Index of the current block within the (1D) grid.
        static __device__ __forceinline__ unsigned int id()
        {
            return blockIdx.x;
        }

        // Total number of threads in the block (product of all block dimensions).
        static __device__ __forceinline__ unsigned int stride()
        {
            return blockDim.x * blockDim.y * blockDim.z;
        }

        // Barrier across all threads of the block.
        static __device__ __forceinline__ void sync()
        {
            __syncthreads();
        }

        // Linear (flattened) index of the current thread within its block.
        static __device__ __forceinline__ int flattenedThreadId()
        {
            return threadIdx.z * blockDim.x * blockDim.y + threadIdx.y * blockDim.x + threadIdx.x;
        }
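
        // The helpers above implement the usual block-stride loop: each thread starts at its
        // flattened id and advances by the total thread count of the block. A minimal sketch
        // (illustrative only, not part of the original header; `data`, `n` and `process` are
        // placeholders):
        //
        //   for (int i = Block::flattenedThreadId(); i < n; i += Block::stride())
        //       process(data[i]);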

        // Fills [beg, end) with `value`, the work being split across all threads of the block.
        template<typename It, typename T>
        static __device__ __forceinline__ void fill(It beg, It end, const T& value)
        {
            int STRIDE = stride();
            It t = beg + flattenedThreadId();

            for(; t < end; t += STRIDE)
                *t = value;
        }
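
        // A minimal usage sketch for fill() (illustrative only; the kernel name and launch
        // configuration are assumptions):
        //
        //   __global__ void zero_kernel(float* buf, int n)
        //   {
        //       Block::fill(buf, buf + n, 0.f);   // every thread of the block writes its share
        //   }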

        // Writes an increasing sequence (value, value + 1, ...) into [beg, end), split across
        // the block -- the same idea as std::iota ("yota" is the historical name).
        template<typename OutIt, typename T>
        static __device__ __forceinline__ void yota(OutIt beg, OutIt end, T value)
        {
            int STRIDE = stride();
            int tid = flattenedThreadId();
            value += tid;

            for(OutIt t = beg + tid; t < end; t += STRIDE, value += STRIDE)
                *t = value;
        }
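
        // A minimal usage sketch for yota() (illustrative only; `indices` and `n` are placeholders):
        //
        //   __global__ void index_kernel(int* indices, int n)
        //   {
        //       Block::yota(indices, indices + n, 0);   // indices[i] == i afterwards
        //   }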

        // Copies [beg, end) to out, split across the block.
        template<typename InIt, typename OutIt>
        static __device__ __forceinline__ void copy(InIt beg, InIt end, OutIt out)
        {
            int STRIDE = stride();
            InIt t = beg + flattenedThreadId();
            OutIt o = out + (t - beg);

            for(; t < end; t += STRIDE, o += STRIDE)
                *o = *t;
        }

        // Applies the unary operation `op` to every element of [beg, end), writing the result to out.
        template<typename InIt, typename OutIt, class UnOp>
        static __device__ __forceinline__ void transform(InIt beg, InIt end, OutIt out, UnOp op)
        {
            int STRIDE = stride();
            InIt t = beg + flattenedThreadId();
            OutIt o = out + (t - beg);

            for(; t < end; t += STRIDE, o += STRIDE)
                *o = op(*t);
        }
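
        // A minimal usage sketch for copy() / the unary transform() (illustrative only; the
        // functor `Sqr` is a placeholder for any device-callable unary op):
        //
        //   struct Sqr { __device__ float operator()(float x) const { return x * x; } };
        //
        //   __global__ void square_kernel(const float* src, float* dst, int n)
        //   {
        //       Block::transform(src, src + n, dst, Sqr());
        //   }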

        // Applies the binary operation `op` to element pairs from [beg1, end1) and beg2,
        // writing the result to out.
        template<typename InIt1, typename InIt2, typename OutIt, class BinOp>
        static __device__ __forceinline__ void transform(InIt1 beg1, InIt1 end1, InIt2 beg2, OutIt out, BinOp op)
        {
            int STRIDE = stride();
            InIt1 t1 = beg1 + flattenedThreadId();
            InIt2 t2 = beg2 + flattenedThreadId();
            OutIt o = out + (t1 - beg1);

            for(; t1 < end1; t1 += STRIDE, t2 += STRIDE, o += STRIDE)
                *o = op(*t1, *t2);
        }
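
        // A minimal usage sketch for the binary transform() (illustrative only; `Plus` is a
        // placeholder functor -- OpenCV's functional.hpp provides similar ones):
        //
        //   struct Plus { __device__ float operator()(float a, float b) const { return a + b; } };
        //
        //   __global__ void add_kernel(const float* a, const float* b, float* dst, int n)
        //   {
        //       Block::transform(a, a + n, b, dst, Plus());
        //   }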

        // Tree reduction over a block-shared buffer of CTA_SIZE elements; the reduced value ends
        // up in buffer[0]. Each thread must have stored its contribution in buffer[tid] (and
        // synced) before the call. The final intra-warp steps rely on the legacy warp-synchronous
        // assumption, hence the volatile buffer; no final barrier is issued.
        template<int CTA_SIZE, typename T, class BinOp>
        static __device__ __forceinline__ void reduce(volatile T* buffer, BinOp op)
        {
            int tid = flattenedThreadId();
            T val = buffer[tid];

            if (CTA_SIZE >= 1024) { if (tid < 512) buffer[tid] = val = op(val, buffer[tid + 512]); __syncthreads(); }
            if (CTA_SIZE >=  512) { if (tid < 256) buffer[tid] = val = op(val, buffer[tid + 256]); __syncthreads(); }
            if (CTA_SIZE >=  256) { if (tid < 128) buffer[tid] = val = op(val, buffer[tid + 128]); __syncthreads(); }
            if (CTA_SIZE >=  128) { if (tid <  64) buffer[tid] = val = op(val, buffer[tid +  64]); __syncthreads(); }

            if (tid < 32)
            {
                if (CTA_SIZE >= 64) { buffer[tid] = val = op(val, buffer[tid + 32]); }
                if (CTA_SIZE >= 32) { buffer[tid] = val = op(val, buffer[tid + 16]); }
                if (CTA_SIZE >= 16) { buffer[tid] = val = op(val, buffer[tid +  8]); }
                if (CTA_SIZE >=  8) { buffer[tid] = val = op(val, buffer[tid +  4]); }
                if (CTA_SIZE >=  4) { buffer[tid] = val = op(val, buffer[tid +  2]); }
                if (CTA_SIZE >=  2) { buffer[tid] = val = op(val, buffer[tid +  1]); }
            }
        }
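
        // A minimal usage sketch for reduce<CTA_SIZE>() (illustrative only; assumes a 1D block of
        // exactly 256 threads, and `Plus` is a placeholder binary functor as in the sketch above):
        //
        //   __global__ void block_sum_kernel(const float* in, float* out)
        //   {
        //       __shared__ float smem[256];
        //       const int tid = Block::flattenedThreadId();
        //       smem[tid] = in[blockIdx.x * 256 + tid];
        //       Block::sync();
        //       Block::reduce<256>(smem, Plus());
        //       if (tid == 0)
        //           out[blockIdx.x] = smem[0];   // thread 0 wrote smem[0] itself, so no extra sync needed here
        //   }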

        // Same tree reduction, but each thread first stores `init` as its contribution and every
        // thread returns the reduced value (a final barrier makes buffer[0] visible to all).
        template<int CTA_SIZE, typename T, class BinOp>
        static __device__ __forceinline__ T reduce(volatile T* buffer, T init, BinOp op)
        {
            int tid = flattenedThreadId();
            T val = buffer[tid] = init;
            __syncthreads();

            if (CTA_SIZE >= 1024) { if (tid < 512) buffer[tid] = val = op(val, buffer[tid + 512]); __syncthreads(); }
            if (CTA_SIZE >=  512) { if (tid < 256) buffer[tid] = val = op(val, buffer[tid + 256]); __syncthreads(); }
            if (CTA_SIZE >=  256) { if (tid < 128) buffer[tid] = val = op(val, buffer[tid + 128]); __syncthreads(); }
            if (CTA_SIZE >=  128) { if (tid <  64) buffer[tid] = val = op(val, buffer[tid +  64]); __syncthreads(); }

            if (tid < 32)
            {
                if (CTA_SIZE >= 64) { buffer[tid] = val = op(val, buffer[tid + 32]); }
                if (CTA_SIZE >= 32) { buffer[tid] = val = op(val, buffer[tid + 16]); }
                if (CTA_SIZE >= 16) { buffer[tid] = val = op(val, buffer[tid +  8]); }
                if (CTA_SIZE >=  8) { buffer[tid] = val = op(val, buffer[tid +  4]); }
                if (CTA_SIZE >=  4) { buffer[tid] = val = op(val, buffer[tid +  2]); }
                if (CTA_SIZE >=  2) { buffer[tid] = val = op(val, buffer[tid +  1]); }
            }

            __syncthreads();
            return buffer[0];
        }
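
        // A minimal usage sketch for the value-returning reduce<CTA_SIZE>() (illustrative only;
        // same assumptions as above -- 1D block of 256 threads, `Plus` is a placeholder functor):
        //
        //   __global__ void block_total_kernel(const float* in, float* out)
        //   {
        //       __shared__ float smem[256];
        //       const float my_val = in[blockIdx.x * 256 + Block::flattenedThreadId()];
        //       const float total  = Block::reduce<256>(smem, my_val, Plus());   // every thread gets `total`
        //       if (Block::flattenedThreadId() == 0)
        //           out[blockIdx.x] = total;
        //   }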

        // Reduction over the first n elements of data (n may exceed the block size);
        // the result ends up in data[0].
        template <typename T, class BinOp>
        static __device__ __forceinline__ void reduce_n(T* data, unsigned int n, BinOp op)
        {
            int ftid = flattenedThreadId();
            int sft = stride();

            // First fold the tail so that at most stride() elements remain.
            if (sft < n)
            {
                for (unsigned int i = sft + ftid; i < n; i += sft)
                    data[ftid] = op(data[ftid], data[i]);

                __syncthreads();
                n = sft;
            }

            // Then halve the remaining range until a single element is left.
            while (n > 1)
            {
                unsigned int half = n/2;

                if (ftid < half)
                    data[ftid] = op(data[ftid], data[n - ftid - 1]);

                __syncthreads();
                n = n - half;
            }
        }
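
        // A minimal usage sketch for reduce_n() (illustrative only; `data` must be visible to the
        // whole block, e.g. dynamic shared memory sized n * sizeof(float) at launch, and `Plus`
        // is a placeholder binary functor):
        //
        //   __global__ void partial_sum_kernel(const float* in, float* out, unsigned int n)
        //   {
        //       extern __shared__ float smem[];
        //       for (unsigned int i = Block::flattenedThreadId(); i < n; i += Block::stride())
        //           smem[i] = in[i];
        //       Block::sync();
        //       Block::reduce_n(smem, n, Plus());
        //       if (Block::flattenedThreadId() == 0)
        //           out[0] = smem[0];
        //   }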
    };
}}}

//! @endcond

#endif /* OPENCV_CUDA_DEVICE_BLOCK_HPP */