/*M///////////////////////////////////////////////////////////////////////////////////////
//
//  IMPORTANT: READ BEFORE DOWNLOADING, COPYING, INSTALLING OR USING.
//
//  By downloading, copying, installing or using the software you agree to this license.
//  If you do not agree to this license, do not download, install,
//  copy or use the software.
//
//
//                          License Agreement
//                For Open Source Computer Vision Library
//
// Copyright (C) 2000-2008, Intel Corporation, all rights reserved.
// Copyright (C) 2009, Willow Garage Inc., all rights reserved.
// Third party copyrights are property of their respective owners.
//
// Redistribution and use in source and binary forms, with or without modification,
// are permitted provided that the following conditions are met:
//
//   * Redistribution's of source code must retain the above copyright notice,
//     this list of conditions and the following disclaimer.
//
//   * Redistribution's in binary form must reproduce the above copyright notice,
//     this list of conditions and the following disclaimer in the documentation
//     and/or other materials provided with the distribution.
//
//   * The name of the copyright holders may not be used to endorse or promote products
//     derived from this software without specific prior written permission.
//
// This software is provided by the copyright holders and contributors "as is" and
// any express or implied warranties, including, but not limited to, the implied
// warranties of merchantability and fitness for a particular purpose are disclaimed.
// In no event shall the Intel Corporation or contributors be liable for any direct,
// indirect, incidental, special, exemplary, or consequential damages
// (including, but not limited to, procurement of substitute goods or services;
// loss of use, data, or profits; or business interruption) however caused
// and on any theory of liability, whether in contract, strict liability,
// or tort (including negligence or otherwise) arising in any way out of
// the use of this software, even if advised of the possibility of such damage.
//
//M*/
#ifndef OPENCV_CUDA_EMULATION_HPP_
#define OPENCV_CUDA_EMULATION_HPP_

#include "common.hpp"
#include "warp_reduce.hpp"

/** @file
 * @deprecated Use @ref cudev instead.
 */

//! @cond IGNORED

namespace cv { namespace cuda { namespace device
{
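    // Emulation provides software fallbacks for intrinsics and atomic operations
    // that are missing on older compute capabilities (e.g. __ballot on sm_1x,
    // shared-memory atomics before sm_12, double-precision atomics before sm_13),
    // so the same device code can be compiled for every target architecture.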
    struct Emulation
    {
        static __device__ __forceinline__ int syncthreadsOr(int pred)
        {
#if defined (__CUDA_ARCH__) && (__CUDA_ARCH__ < 200)
            // just a compilation stub: __syncthreads_or() requires sm_20
            return 0;
#else
            return __syncthreads_or(pred);
#endif
        }
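
        // Ballot() emulates the __ballot() warp-vote intrinsic, which requires
        // sm_20: on older hardware each thread publishes its predicate as a
        // distinct bit (1 << lane) in shared memory, and warp_reduce() combines
        // the per-lane words into the 32-bit warp mask (the contributed bits are
        // disjoint, so a sum reduction and a bitwise OR give the same result).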
        template<int CTA_SIZE>
        static __forceinline__ __device__ int Ballot(int predicate)
        {
#if defined (__CUDA_ARCH__) && (__CUDA_ARCH__ >= 200)
            return __ballot(predicate);
#else
            __shared__ volatile int cta_buffer[CTA_SIZE];

            int tid = threadIdx.x;
            cta_buffer[tid] = predicate ? (1 << (tid & 31)) : 0;
            return warp_reduce(cta_buffer);
#endif
        }
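
        // smem emulates shared-memory atomics, which appeared with sm_12. The
        // fallback tags every write with the lane id in the high 5 bits of the
        // word (TAG_MASK keeps the remaining low bits as the payload) and
        // retries until the thread's own tagged value is the one actually
        // stored, so each contending lane eventually gets its update applied.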
        struct smem
        {
            enum { TAG_MASK = (1U << ( (sizeof(unsigned int) << 3) - 5U)) - 1U };

            template<typename T>
            static __device__ __forceinline__ T atomicInc(T* address, T val)
            {
#if defined (__CUDA_ARCH__) && (__CUDA_ARCH__ < 120)
                T count;
                unsigned int tag = threadIdx.x << ( (sizeof(unsigned int) << 3) - 5U);
                do
                {
                    count = *address & TAG_MASK;
                    count = tag | (count + 1);
                    *address = count;
                } while (*address != count);

                return (count & TAG_MASK) - 1;
#else
                return ::atomicInc(address, val);
#endif
            }

            template<typename T>
            static __device__ __forceinline__ T atomicAdd(T* address, T val)
            {
#if defined (__CUDA_ARCH__) && (__CUDA_ARCH__ < 120)
                T count;
                unsigned int tag = threadIdx.x << ( (sizeof(unsigned int) << 3) - 5U);
                do
                {
                    count = *address & TAG_MASK;
                    count = tag | (count + val);
                    *address = count;
                } while (*address != count);

                return (count & TAG_MASK) - val;
#else
                return ::atomicAdd(address, val);
#endif
            }
            template<typename T>
            static __device__ __forceinline__ T atomicMin(T* address, T val)
            {
#if defined (__CUDA_ARCH__) && (__CUDA_ARCH__ < 120)
                T count = ::min(*address, val);
                do
                {
                    *address = count;
                } while (*address > count);

                return count;
#else
                return ::atomicMin(address, val);
#endif
            }
        }; // struct smem
        struct glob
        {
            static __device__ __forceinline__ int atomicAdd(int* address, int val)
            {
                return ::atomicAdd(address, val);
            }
            static __device__ __forceinline__ unsigned int atomicAdd(unsigned int* address, unsigned int val)
            {
                return ::atomicAdd(address, val);
            }
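
            // Hardware atomicAdd on float in global memory requires sm_20, so
            // below that it is emulated with an atomicCAS loop: reinterpret the
            // word as int, compute the new value, and retry the compare-and-swap
            // until no other thread has modified the word in the meantime.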
            static __device__ __forceinline__ float atomicAdd(float* address, float val)
            {
#if __CUDA_ARCH__ >= 200
                return ::atomicAdd(address, val);
#else
                int* address_as_i = (int*) address;
                int old = *address_as_i, assumed;
                do {
                    assumed = old;
                    old = ::atomicCAS(address_as_i, assumed,
                        __float_as_int(val + __int_as_float(assumed)));
                } while (assumed != old);
                return __int_as_float(old);
#endif
            }
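
            // The double overload uses the same CAS pattern on a 64-bit word;
            // it requires at least sm_13, the first architecture with double
            // precision support. Below that it is only a stub returning 0.0.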
            static __device__ __forceinline__ double atomicAdd(double* address, double val)
            {
#if __CUDA_ARCH__ >= 130
                unsigned long long int* address_as_ull = (unsigned long long int*) address;
                unsigned long long int old = *address_as_ull, assumed;
                do {
                    assumed = old;
                    old = ::atomicCAS(address_as_ull, assumed,
                        __double_as_longlong(val + __longlong_as_double(assumed)));
                } while (assumed != old);
                return __longlong_as_double(old);
#else
                CV_UNUSED(address);
                CV_UNUSED(val);
                return 0.0;
#endif
            }
            static __device__ __forceinline__ int atomicMin(int* address, int val)
            {
                return ::atomicMin(address, val);
            }
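
            // Floating-point min/max have no hardware atomics on these targets,
            // so the float and double overloads below reuse the CAS-loop pattern
            // with ::fminf/::fmin (and ::fmaxf/::fmax for atomicMax); on
            // architectures without the required atomicCAS they are stubs that
            // return 0.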
            static __device__ __forceinline__ float atomicMin(float* address, float val)
            {
#if __CUDA_ARCH__ >= 120
                int* address_as_i = (int*) address;
                int old = *address_as_i, assumed;
                do {
                    assumed = old;
                    old = ::atomicCAS(address_as_i, assumed,
                        __float_as_int(::fminf(val, __int_as_float(assumed))));
                } while (assumed != old);
                return __int_as_float(old);
#else
                CV_UNUSED(address);
                CV_UNUSED(val);
                return 0.0f;
#endif
            }
            static __device__ __forceinline__ double atomicMin(double* address, double val)
            {
#if __CUDA_ARCH__ >= 130
                unsigned long long int* address_as_ull = (unsigned long long int*) address;
                unsigned long long int old = *address_as_ull, assumed;
                do {
                    assumed = old;
                    old = ::atomicCAS(address_as_ull, assumed,
                        __double_as_longlong(::fmin(val, __longlong_as_double(assumed))));
                } while (assumed != old);
                return __longlong_as_double(old);
#else
                CV_UNUSED(address);
                CV_UNUSED(val);
                return 0.0;
#endif
            }
            static __device__ __forceinline__ int atomicMax(int* address, int val)
            {
                return ::atomicMax(address, val);
            }
            static __device__ __forceinline__ float atomicMax(float* address, float val)
            {
#if __CUDA_ARCH__ >= 120
                int* address_as_i = (int*) address;
                int old = *address_as_i, assumed;
                do {
                    assumed = old;
                    old = ::atomicCAS(address_as_i, assumed,
                        __float_as_int(::fmaxf(val, __int_as_float(assumed))));
                } while (assumed != old);
                return __int_as_float(old);
#else
                CV_UNUSED(address);
                CV_UNUSED(val);
                return 0.0f;
#endif
            }
            static __device__ __forceinline__ double atomicMax(double* address, double val)
            {
#if __CUDA_ARCH__ >= 130
                unsigned long long int* address_as_ull = (unsigned long long int*) address;
                unsigned long long int old = *address_as_ull, assumed;
                do {
                    assumed = old;
                    old = ::atomicCAS(address_as_ull, assumed,
                        __double_as_longlong(::fmax(val, __longlong_as_double(assumed))));
                } while (assumed != old);
                return __longlong_as_double(old);
#else
                CV_UNUSED(address);
                CV_UNUSED(val);
                return 0.0;
#endif
            }
        }; // struct glob
    }; // struct Emulation
}}} // namespace cv { namespace cuda { namespace device
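
// A minimal usage sketch (illustrative only; the kernel below is hypothetical
// and not part of this header): summing a float array with
// Emulation::glob::atomicAdd, which maps to the hardware instruction on sm_20+
// and to the atomicCAS loop above on older targets.
//
//     __global__ void sumKernel(const float* src, int n, float* sum)
//     {
//         int i = blockIdx.x * blockDim.x + threadIdx.x;
//         if (i < n)
//             cv::cuda::device::Emulation::glob::atomicAdd(sum, src[i]);
//     }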
//! @endcond

#endif /* OPENCV_CUDA_EMULATION_HPP_ */