#include <ATen/ATen.h>
#include "compat.h"

// Forward/backward compatibility hack around
// https://github.com/pytorch/pytorch/commit/3aeb78079bcd68282fe9117088e138b77318e288
// pending more future-proof guidance from upstream.
// struct TypeShim
// {
//   const at::Type& payload;
//   TypeShim(const at::Type& type) : payload(type) {}
//   // Enable trivial conversion to a const at::Type& for pre-3aeb78
//   operator const at::Type&(){ return payload; };
//   // Enable dispatch switch statements to take *this directly for post-3aeb78
//   //operator at::ScalarType(){ return payload.; };
// };
#define DISPATCH_FLOAT_AND_HALF(TYPE, LEVEL, NAME, ...) \
  switch(TYPE) \
  { \
    case at::ScalarType::Float: \
    { \
      using scalar_t_##LEVEL = float; \
      __VA_ARGS__; \
      break; \
    } \
    case at::ScalarType::Half: \
    { \
      using scalar_t_##LEVEL = at::Half; \
      __VA_ARGS__; \
      break; \
    } \
    default: \
      AT_ERROR(#NAME, " not implemented for '", toString(TYPE), "'"); \
  }
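
// Example usage (illustrative sketch; "scale_kernel", "in", "out", "blocks",
// and "threads" are hypothetical names, not part of this header). LEVEL is
// appended to the scalar_t alias so nested dispatches do not shadow each other:
//
//   DISPATCH_FLOAT_AND_HALF(in.scalar_type(), 0, "scale",
//     scale_kernel<scalar_t_0><<<blocks, threads>>>(
//       in.data_ptr<scalar_t_0>(), out.data_ptr<scalar_t_0>(), in.numel()));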
#define DISPATCH_FLOAT_HALF_AND_BFLOAT(TYPE, LEVEL, NAME, ...) \
  switch(TYPE) \
  { \
    case at::ScalarType::Float: \
    { \
      using scalar_t_##LEVEL = float; \
      __VA_ARGS__; \
      break; \
    } \
    case at::ScalarType::Half: \
    { \
      using scalar_t_##LEVEL = at::Half; \
      __VA_ARGS__; \
      break; \
    } \
    case at::ScalarType::BFloat16: \
    { \
      using scalar_t_##LEVEL = at::BFloat16; \
      __VA_ARGS__; \
      break; \
    } \
    default: \
      AT_ERROR(#NAME, " not implemented for '", toString(TYPE), "'"); \
  }
#define DISPATCH_FLOAT_HALF_AND_BYTE(TYPE, LEVEL, NAME, ...) \
  switch(TYPE) \
  { \
    case at::ScalarType::Float: \
    { \
      using scalar_t_##LEVEL = float; \
      __VA_ARGS__; \
      break; \
    } \
    case at::ScalarType::Half: \
    { \
      using scalar_t_##LEVEL = at::Half; \
      __VA_ARGS__; \
      break; \
    } \
    case at::ScalarType::Byte: \
    { \
      using scalar_t_##LEVEL = uint8_t; \
      __VA_ARGS__; \
      break; \
    } \
    default: \
      AT_ERROR(#NAME, " not implemented for '", toString(TYPE), "'"); \
  }
#define DISPATCH_DOUBLE_FLOAT_AND_HALF(TYPE, LEVEL, NAME, ...) \
  switch(TYPE) \
  { \
    case at::ScalarType::Double: \
    { \
      using scalar_t_##LEVEL = double; \
      __VA_ARGS__; \
      break; \
    } \
    case at::ScalarType::Float: \
    { \
      using scalar_t_##LEVEL = float; \
      __VA_ARGS__; \
      break; \
    } \
    case at::ScalarType::Half: \
    { \
      using scalar_t_##LEVEL = at::Half; \
      __VA_ARGS__; \
      break; \
    } \
    default: \
      AT_ERROR(#NAME, " not implemented for '", toString(TYPE), "'"); \
  }
#define DISPATCH_DOUBLE_FLOAT_HALF_AND_BFLOAT(TYPE, LEVEL, NAME, ...) \
  switch(TYPE) \
  { \
    case at::ScalarType::Double: \
    { \
      using scalar_t_##LEVEL = double; \
      __VA_ARGS__; \
      break; \
    } \
    case at::ScalarType::Float: \
    { \
      using scalar_t_##LEVEL = float; \
      __VA_ARGS__; \
      break; \
    } \
    case at::ScalarType::Half: \
    { \
      using scalar_t_##LEVEL = at::Half; \
      __VA_ARGS__; \
      break; \
    } \
    case at::ScalarType::BFloat16: \
    { \
      using scalar_t_##LEVEL = at::BFloat16; \
      __VA_ARGS__; \
      break; \
    } \
    default: \
      AT_ERROR(#NAME, " not implemented for '", toString(TYPE), "'"); \
  }
#define DISPATCH_DOUBLE_AND_FLOAT(TYPE, LEVEL, NAME, ...) \
  switch(TYPE) \
  { \
    case at::ScalarType::Double: \
    { \
      using scalar_t_##LEVEL = double; \
      __VA_ARGS__; \
      break; \
    } \
    case at::ScalarType::Float: \
    { \
      using scalar_t_##LEVEL = float; \
      __VA_ARGS__; \
      break; \
    } \
    default: \
      AT_ERROR(#NAME, " not implemented for '", toString(TYPE), "'"); \
  }
#define DISPATCH_HALF_AND_BFLOAT(TYPE, NAME, ...) \
  switch(TYPE) \
  { \
    case at::ScalarType::Half: \
    { \
      using scalar_t = at::Half; \
      __VA_ARGS__; \
      break; \
    } \
    case at::ScalarType::BFloat16: \
    { \
      using scalar_t = at::BFloat16; \
      __VA_ARGS__; \
      break; \
    } \
    default: \
      AT_ERROR(#NAME, " not implemented for '", toString(TYPE), "'"); \
  }
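
// Note: unlike the macros above, DISPATCH_HALF_AND_BFLOAT takes no LEVEL
// argument and binds the alias as plain scalar_t, so nesting it inside another
// dispatch would shadow that alias.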
#define DISPATCH_FLOAT_HALF_AND_BFLOAT_INOUT_TYPES(TYPEIN, TYPEOUT, NAME, ...) \
  switch(TYPEIN) \
  { \
    case at::ScalarType::Float: \
    { \
      using scalar_t_in = float; \
      switch(TYPEOUT) \
      { \
        case at::ScalarType::Float: \
        { \
          using scalar_t_out = float; \
          __VA_ARGS__; \
          break; \
        } \
        case at::ScalarType::Half: \
        { \
          using scalar_t_out = at::Half; \
          __VA_ARGS__; \
          break; \
        } \
        case at::ScalarType::BFloat16: \
        { \
          using scalar_t_out = at::BFloat16; \
          __VA_ARGS__; \
          break; \
        } \
        default: \
          AT_ERROR(#NAME, " not implemented for '", toString(TYPEOUT), "'"); \
      } \
      break; \
    } \
    case at::ScalarType::Half: \
    { \
      using scalar_t_in = at::Half; \
      using scalar_t_out = at::Half; \
      __VA_ARGS__; \
      break; \
    } \
    case at::ScalarType::BFloat16: \
    { \
      using scalar_t_in = at::BFloat16; \
      using scalar_t_out = at::BFloat16; \
      __VA_ARGS__; \
      break; \
    } \
    default: \
      AT_ERROR(#NAME, " not implemented for '", toString(TYPEIN), "'"); \
  }
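
// Example usage of the in/out dispatch (illustrative sketch; "cast_add_kernel",
// "in", "out", "blocks", and "threads" are hypothetical). The outer switch
// binds scalar_t_in and the inner switch binds scalar_t_out, so mixed-precision
// kernels can be instantiated per (input, output) dtype pair. Note that for
// Half and BFloat16 inputs the macro forces scalar_t_out to the same type,
// ignoring TYPEOUT:
//
//   DISPATCH_FLOAT_HALF_AND_BFLOAT_INOUT_TYPES(
//     in.scalar_type(), out.scalar_type(), "cast_add",
//     cast_add_kernel<scalar_t_in, scalar_t_out><<<blocks, threads>>>(
//       in.data_ptr<scalar_t_in>(), out.data_ptr<scalar_t_out>(), in.numel()));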
#define DISPATCH_DOUBLE_FLOAT_HALF_AND_BFLOAT_INOUT_TYPES(TYPEIN, TYPEOUT, NAME, ...) \
  switch(TYPEIN) \
  { \
    case at::ScalarType::Double: \
    { \
      using scalar_t_in = double; \
      switch(TYPEOUT) \
      { \
        case at::ScalarType::Double: \
        { \
          using scalar_t_out = double; \
          __VA_ARGS__; \
          break; \
        } \
        case at::ScalarType::Float: \
        { \
          using scalar_t_out = float; \
          __VA_ARGS__; \
          break; \
        } \
        case at::ScalarType::Half: \
        { \
          using scalar_t_out = at::Half; \
          __VA_ARGS__; \
          break; \
        } \
        case at::ScalarType::BFloat16: \
        { \
          using scalar_t_out = at::BFloat16; \
          __VA_ARGS__; \
          break; \
        } \
        default: \
          AT_ERROR(#NAME, " not implemented for '", toString(TYPEOUT), "'"); \
      } \
      break; \
    } \
    case at::ScalarType::Float: \
    { \
      using scalar_t_in = float; \
      switch(TYPEOUT) \
      { \
        case at::ScalarType::Float: \
        { \
          using scalar_t_out = float; \
          __VA_ARGS__; \
          break; \
        } \
        case at::ScalarType::Half: \
        { \
          using scalar_t_out = at::Half; \
          __VA_ARGS__; \
          break; \
        } \
        case at::ScalarType::BFloat16: \
        { \
          using scalar_t_out = at::BFloat16; \
          __VA_ARGS__; \
          break; \
        } \
        default: \
          AT_ERROR(#NAME, " not implemented for '", toString(TYPEOUT), "'"); \
      } \
      break; \
    } \
    case at::ScalarType::Half: \
    { \
      using scalar_t_in = at::Half; \
      using scalar_t_out = at::Half; \
      __VA_ARGS__; \
      break; \
    } \
    case at::ScalarType::BFloat16: \
    { \
      using scalar_t_in = at::BFloat16; \
      using scalar_t_out = at::BFloat16; \
      __VA_ARGS__; \
      break; \
    } \
    default: \
      AT_ERROR(#NAME, " not implemented for '", toString(TYPEIN), "'"); \
  }
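
// reduce_block_into_lanes: cooperative sum reduction across a thread block.
// Each thread contributes `val`; partial sums are staged in the shared-memory
// buffer `x` (which must hold at least blockDim.x*blockDim.y elements), and the
// first warp finishes the reduction with __shfl_down_sync. On return, threads
// with tid < lanes hold the result (for lanes == 1, thread 0 holds the full
// block sum). If share_result is true, the result is also written back to
// x[0..lanes) and made visible to the whole block.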
template<typename T>
__device__ __forceinline__ T reduce_block_into_lanes
  (T *x,
   T val,
   int lanes=1,
   bool share_result=false) // lanes is intended to be <= 32.
{
  int tid = threadIdx.x + threadIdx.y*blockDim.x;
  int blockSize = blockDim.x*blockDim.y; // blockSize is intended to be a multiple of 32.

  if(blockSize >= 64)
  {
    x[tid] = val;
    __syncthreads();
  }

  #pragma unroll
  for(int i = (blockSize >> 1); i >= 64; i >>= 1)
  {
    if(tid < i)
      x[tid] = x[tid] + x[tid+i];
    __syncthreads();
  }

  T final;

  if(tid < 32)
  {
    if(blockSize >= 64)
      final = x[tid] + x[tid+32];
    else
      final = val;
    // __SYNCWARP();

    #pragma unroll
    for(int i = 16; i >= lanes; i >>= 1)
      final = final + __shfl_down_sync(0xffffffff, final, i);
  }

  if(share_result)
  {
    if(tid < lanes)
      x[tid] = final; // EpilogueOp
    // Make sure the smem result is visible to all warps.
    __syncthreads();
  }

  return final;
}
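
// reduce_block_into_lanes_max_op: same block-wide reduction pattern as above,
// but combines values with fmaxf(fabsf(a), fabsf(b)) instead of addition, so it
// yields the maximum absolute value contributed by the block. Because it uses
// the float intrinsics fmaxf/fabsf, it is effectively float-oriented despite
// being templated on T.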
template<typename T>
__device__ __forceinline__ T reduce_block_into_lanes_max_op
  (T *x,
   T val,
   int lanes=1,
   bool share_result=false) // lanes is intended to be <= 32.
{
  int tid = threadIdx.x + threadIdx.y*blockDim.x;
  int blockSize = blockDim.x*blockDim.y; // blockSize is intended to be a multiple of 32.

  if(blockSize >= 64)
  {
    x[tid] = val;
    __syncthreads();
  }

  #pragma unroll
  for(int i = (blockSize >> 1); i >= 64; i >>= 1)
  {
    if(tid < i)
      x[tid] = fmaxf(fabsf(x[tid]), fabsf(x[tid+i]));
    __syncthreads();
  }

  T final;

  if(tid < 32)
  {
    if(blockSize >= 64)
      final = fmaxf(fabsf(x[tid]), fabsf(x[tid+32]));
    else
      final = val;
    // __SYNCWARP();

    #pragma unroll
    for(int i = 16; i >= lanes; i >>= 1)
      final = fmaxf(fabsf(final), fabsf(__shfl_down_sync(0xffffffff, final, i)));
  }

  if(share_result)
  {
    if(tid < lanes)
      x[tid] = final; // EpilogueOp
    // Make sure the smem result is visible to all warps.
    __syncthreads();
  }

  return final;
}
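
// Example kernel using the block reduction helper (illustrative sketch;
// "l2_norm_part_kernel", "in", and "out" are hypothetical names). The shared
// buffer must provide one element per thread in the block:
//
//   template <typename T>
//   __global__ void l2_norm_part_kernel(const T* in, float* out, int n)
//   {
//     __shared__ float s[512];                    // assumes blockDim.x <= 512
//     int i = blockIdx.x*blockDim.x + threadIdx.x;
//     float v = (i < n) ? static_cast<float>(in[i]) : 0.f;
//     float block_sum = reduce_block_into_lanes(s, v*v); // lanes=1: tid 0 holds the sum
//     if(threadIdx.x == 0)
//       atomicAdd(out, block_sum);
//   }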