type_shim.h 9.8 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344345346347348349350351352353354355356357358359360361362363364365366367368369370371372373374375376377378379380381382383384385386387388389390391392393394395396397398399400401402403404405406407408409410411412413414415416417418419
  1. #include <ATen/ATen.h>
  2. #include "compat.h"
// Forward/backward compatibility hack around
  4. // https://github.com/pytorch/pytorch/commit/3aeb78079bcd68282fe9117088e138b77318e288
  5. // pending more future-proof guidance from upstream.
  6. // struct TypeShim
  7. // {
  8. // const at::Type& payload;
  9. // TypeShim(const at::Type& type) : payload(type) {}
  10. // // Enable trivial conversion to a const at::Type& for pre-3aeb78
  11. // operator const at::Type&(){ return payload; };
  12. // // Enable dispatch switch statements to take *this directly for post-3aeb78
  13. // //operator at::ScalarType(){ return payload.; };
  14. // };
  15. #define DISPATCH_FLOAT_AND_HALF(TYPE, LEVEL, NAME, ...) \
  16. switch(TYPE) \
  17. { \
  18. case at::ScalarType::Float: \
  19. { \
  20. using scalar_t_##LEVEL = float; \
  21. __VA_ARGS__; \
  22. break; \
  23. } \
  24. case at::ScalarType::Half: \
  25. { \
  26. using scalar_t_##LEVEL = at::Half; \
  27. __VA_ARGS__; \
  28. break; \
  29. } \
  30. default: \
  31. AT_ERROR(#NAME, " not implemented for '", toString(TYPE), "'"); \
  32. }
  33. #define DISPATCH_FLOAT_HALF_AND_BFLOAT(TYPE, LEVEL, NAME, ...) \
  34. switch(TYPE) \
  35. { \
  36. case at::ScalarType::Float: \
  37. { \
  38. using scalar_t_##LEVEL = float; \
  39. __VA_ARGS__; \
  40. break; \
  41. } \
  42. case at::ScalarType::Half: \
  43. { \
  44. using scalar_t_##LEVEL = at::Half; \
  45. __VA_ARGS__; \
  46. break; \
  47. } \
  48. case at::ScalarType::BFloat16: \
  49. { \
  50. using scalar_t_##LEVEL = at::BFloat16; \
  51. __VA_ARGS__; \
  52. break; \
  53. } \
  54. default: \
  55. AT_ERROR(#NAME, " not implemented for '", toString(TYPE), "'"); \
  56. }
  57. #define DISPATCH_FLOAT_HALF_AND_BYTE(TYPE, LEVEL, NAME, ...) \
  58. switch(TYPE) \
  59. { \
  60. case at::ScalarType::Float: \
  61. { \
  62. using scalar_t_##LEVEL = float; \
  63. __VA_ARGS__; \
  64. break; \
  65. } \
  66. case at::ScalarType::Half: \
  67. { \
  68. using scalar_t_##LEVEL = at::Half; \
  69. __VA_ARGS__; \
  70. break; \
  71. } \
  72. case at::ScalarType::Byte: \
  73. { \
  74. using scalar_t_##LEVEL = uint8_t; \
  75. __VA_ARGS__; \
  76. break; \
  77. } \
  78. default: \
  79. AT_ERROR(#NAME, " not implemented for '", toString(TYPE), "'"); \
  80. }
  81. #define DISPATCH_DOUBLE_FLOAT_AND_HALF(TYPE, LEVEL, NAME, ...) \
  82. switch(TYPE) \
  83. { \
  84. case at::ScalarType::Double: \
  85. { \
  86. using scalar_t_##LEVEL = double; \
  87. __VA_ARGS__; \
  88. break; \
  89. } \
  90. case at::ScalarType::Float: \
  91. { \
  92. using scalar_t_##LEVEL = float; \
  93. __VA_ARGS__; \
  94. break; \
  95. } \
  96. case at::ScalarType::Half: \
  97. { \
  98. using scalar_t_##LEVEL = at::Half; \
  99. __VA_ARGS__; \
  100. break; \
  101. } \
  102. default: \
  103. AT_ERROR(#NAME, " not implemented for '", toString(TYPE), "'"); \
  104. }
  105. #define DISPATCH_DOUBLE_FLOAT_HALF_AND_BFLOAT(TYPE, LEVEL, NAME, ...) \
  106. switch(TYPE) \
  107. { \
  108. case at::ScalarType::Double: \
  109. { \
  110. using scalar_t_##LEVEL = double; \
  111. __VA_ARGS__; \
  112. break; \
  113. } \
  114. case at::ScalarType::Float: \
  115. { \
  116. using scalar_t_##LEVEL = float; \
  117. __VA_ARGS__; \
  118. break; \
  119. } \
  120. case at::ScalarType::Half: \
  121. { \
  122. using scalar_t_##LEVEL = at::Half; \
  123. __VA_ARGS__; \
  124. break; \
  125. } \
  126. case at::ScalarType::BFloat16: \
  127. { \
  128. using scalar_t_##LEVEL = at::BFloat16; \
  129. __VA_ARGS__; \
  130. break; \
  131. } \
  132. default: \
  133. AT_ERROR(#NAME, " not implemented for '", toString(TYPE), "'"); \
  134. }
  135. #define DISPATCH_DOUBLE_AND_FLOAT(TYPE, LEVEL, NAME, ...) \
  136. switch(TYPE) \
  137. { \
  138. case at::ScalarType::Double: \
  139. { \
  140. using scalar_t_##LEVEL = double; \
  141. __VA_ARGS__; \
  142. break; \
  143. } \
  144. case at::ScalarType::Float: \
  145. { \
  146. using scalar_t_##LEVEL = float; \
  147. __VA_ARGS__; \
  148. break; \
  149. } \
  150. default: \
  151. AT_ERROR(#NAME, " not implemented for '", toString(TYPE), "'"); \
  152. }
  153. #define DISPATCH_HALF_AND_BFLOAT(TYPE, NAME, ...) \
  154. switch(TYPE) \
  155. { \
  156. case at::ScalarType::Half: \
  157. { \
  158. using scalar_t = at::Half; \
  159. __VA_ARGS__; \
  160. break; \
  161. } \
  162. case at::ScalarType::BFloat16: \
  163. { \
  164. using scalar_t = at::BFloat16; \
  165. __VA_ARGS__; \
  166. break; \
  167. } \
  168. default: \
  169. AT_ERROR(#NAME, " not implemented for '", toString(TYPE), "'"); \
  170. }
  171. #define DISPATCH_FLOAT_HALF_AND_BFLOAT_INOUT_TYPES(TYPEIN, TYPEOUT, NAME, ...) \
  172. switch(TYPEIN) \
  173. { \
  174. case at::ScalarType::Float: \
  175. { \
  176. using scalar_t_in = float; \
  177. switch(TYPEOUT) \
  178. { \
  179. case at::ScalarType::Float: \
  180. { \
  181. using scalar_t_out = float; \
  182. __VA_ARGS__; \
  183. break; \
  184. } \
  185. case at::ScalarType::Half: \
  186. { \
  187. using scalar_t_out = at::Half; \
  188. __VA_ARGS__; \
  189. break; \
  190. } \
  191. case at::ScalarType::BFloat16: \
  192. { \
  193. using scalar_t_out = at::BFloat16; \
  194. __VA_ARGS__; \
  195. break; \
  196. } \
  197. default: \
  198. AT_ERROR(#NAME, " not implemented for '", toString(TYPEOUT), "'"); \
  199. } \
  200. break; \
  201. } \
  202. case at::ScalarType::Half: \
  203. { \
  204. using scalar_t_in = at::Half; \
  205. using scalar_t_out = at::Half; \
  206. __VA_ARGS__; \
  207. break; \
  208. } \
  209. case at::ScalarType::BFloat16: \
  210. { \
  211. using scalar_t_in = at::BFloat16; \
  212. using scalar_t_out = at::BFloat16; \
  213. __VA_ARGS__; \
  214. break; \
  215. } \
  216. default: \
  217. AT_ERROR(#NAME, " not implemented for '", toString(TYPEIN), "'"); \
  218. }
  219. #define DISPATCH_DOUBLE_FLOAT_HALF_AND_BFLOAT_INOUT_TYPES(TYPEIN, TYPEOUT, NAME, ...) \
  220. switch(TYPEIN) \
  221. { \
  222. case at::ScalarType::Double: \
  223. { \
  224. using scalar_t_in = double; \
  225. switch(TYPEOUT) \
  226. { \
  227. case at::ScalarType::Double: \
  228. { \
  229. using scalar_t_out = double; \
  230. __VA_ARGS__; \
  231. break; \
  232. } \
  233. case at::ScalarType::Float: \
  234. { \
  235. using scalar_t_out = float; \
  236. __VA_ARGS__; \
  237. break; \
  238. } \
  239. case at::ScalarType::Half: \
  240. { \
  241. using scalar_t_out = at::Half; \
  242. __VA_ARGS__; \
  243. break; \
  244. } \
  245. case at::ScalarType::BFloat16: \
  246. { \
  247. using scalar_t_out = at::BFloat16; \
  248. __VA_ARGS__; \
  249. break; \
  250. } \
  251. default: \
  252. AT_ERROR(#NAME, " not implemented for '", toString(TYPEOUT), "'"); \
  253. } \
  254. break; \
  255. } \
  256. case at::ScalarType::Float: \
  257. { \
  258. using scalar_t_in = float; \
  259. switch(TYPEOUT) \
  260. { \
  261. case at::ScalarType::Float: \
  262. { \
  263. using scalar_t_out = float; \
  264. __VA_ARGS__; \
  265. break; \
  266. } \
  267. case at::ScalarType::Half: \
  268. { \
  269. using scalar_t_out = at::Half; \
  270. __VA_ARGS__; \
  271. break; \
  272. } \
  273. case at::ScalarType::BFloat16: \
  274. { \
  275. using scalar_t_out = at::BFloat16; \
  276. __VA_ARGS__; \
  277. break; \
  278. } \
  279. default: \
  280. AT_ERROR(#NAME, " not implemented for '", toString(TYPEOUT), "'"); \
  281. } \
  282. break; \
  283. } \
  284. case at::ScalarType::Half: \
  285. { \
  286. using scalar_t_in = at::Half; \
  287. using scalar_t_out = at::Half; \
  288. __VA_ARGS__; \
  289. break; \
  290. } \
  291. case at::ScalarType::BFloat16: \
  292. { \
  293. using scalar_t_in = at::BFloat16; \
  294. using scalar_t_out = at::BFloat16; \
  295. __VA_ARGS__; \
  296. break; \
  297. } \
  298. default: \
  299. AT_ERROR(#NAME, " not implemented for '", toString(TYPEIN), "'"); \
  300. }
  301. template<typename T>
  302. __device__ __forceinline__ T reduce_block_into_lanes
  303. (T *x,
  304. T val,
  305. int lanes=1,
  306. bool share_result=false) // lanes is intended to be <= 32.
  307. {
  308. int tid = threadIdx.x + threadIdx.y*blockDim.x;
  309. int blockSize = blockDim.x*blockDim.y; // blockSize is intended to be a multiple of 32.
  310. if(blockSize >= 64)
  311. {
  312. x[tid] = val;
  313. __syncthreads();
  314. }
  315. #pragma unroll
  316. for(int i = (blockSize >> 1); i >= 64; i >>= 1)
  317. {
  318. if(tid < i)
  319. x[tid] = x[tid] + x[tid+i];
  320. __syncthreads();
  321. }
  322. T final;
  323. if(tid < 32)
  324. {
  325. if(blockSize >= 64)
  326. final = x[tid] + x[tid+32];
  327. else
  328. final = val;
  329. // __SYNCWARP();
  330. #pragma unroll
  331. for(int i = 16; i >= lanes; i >>= 1)
  332. final = final + __shfl_down_sync(0xffffffff, final, i);
  333. }
  334. if(share_result)
  335. {
  336. if(tid < lanes)
  337. x[tid] = final; // EpilogueOp
  338. // Make sure the smem result is visible to all warps.
  339. __syncthreads();
  340. }
  341. return final;
  342. }
  343. template<typename T>
  344. __device__ __forceinline__ T reduce_block_into_lanes_max_op
  345. (T *x,
  346. T val,
  347. int lanes=1,
  348. bool share_result=false) // lanes is intended to be <= 32.
  349. {
  350. int tid = threadIdx.x + threadIdx.y*blockDim.x;
  351. int blockSize = blockDim.x*blockDim.y; // blockSize is intended to be a multiple of 32.
  352. if(blockSize >= 64)
  353. {
  354. x[tid] = val;
  355. __syncthreads();
  356. }
  357. #pragma unroll
  358. for(int i = (blockSize >> 1); i >= 64; i >>= 1)
  359. {
  360. if(tid < i)
  361. x[tid] = fmaxf(fabsf(x[tid]), fabsf(x[tid+i]));
  362. __syncthreads();
  363. }
  364. T final;
  365. if(tid < 32)
  366. {
  367. if(blockSize >= 64)
  368. final = fmaxf(fabsf(x[tid]), fabsf(x[tid+32]));
  369. else
  370. final = val;
  371. // __SYNCWARP();
  372. #pragma unroll
  373. for(int i = 16; i >= lanes; i >>= 1)
  374. final = fmaxf(fabsf(final), fabsf(__shfl_down_sync(0xffffffff, final, i)));
  375. }
  376. if(share_result)
  377. {
  378. if(tid < lanes)
  379. x[tid] = final; // EpilogueOp
  380. // Make sure the smem result is visible to all warps.
  381. __syncthreads();
  382. }
  383. return final;
  384. }