#include <ATen/ATen.h>
#include <ATen/cuda/CUDAContext.h>
#include <assert.h>
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <torch/torch.h>

/* Includes, cuda */
#include <cublas_v2.h>
#include <cuda_runtime.h>

#if defined(CUBLAS_VERSION) && CUBLAS_VERSION >= 11000
// includes cublaslt
#include <cublasLt.h>
#endif

// FP64 Wrapper around cublas GEMMEx
cublasStatus_t gemm_bias(
    cublasHandle_t handle,
    cublasOperation_t transa, cublasOperation_t transb,
    int m, int n, int k,
    const float* alpha,
    double* A, int lda,
    double* B, int ldb,
    const float* beta,
    double* C, int ldc) {
  // cublasGemmEx with computeType CUDA_R_64F expects double scaling
  // factors, so widen the host-side floats before the call.
  const double alpha64 = static_cast<double>(*alpha);
  const double beta64 = static_cast<double>(*beta);
  return cublasGemmEx(
      handle, transa, transb, m, n, k,
      &alpha64, A, CUDA_R_64F, lda,
      B, CUDA_R_64F, ldb,
      &beta64, C, CUDA_R_64F, ldc,
      CUDA_R_64F, CUBLAS_GEMM_DEFAULT);
}

// FP32 Wrapper around cublas GEMMEx
cublasStatus_t gemm_bias(
    cublasHandle_t handle,
    cublasOperation_t transa, cublasOperation_t transb,
    int m, int n, int k,
    const float* alpha,
    float* A, int lda,
    float* B, int ldb,
    const float* beta,
    float* C, int ldc) {
  return cublasGemmEx(
      handle, transa, transb, m, n, k,
      alpha, A, CUDA_R_32F, lda,
      B, CUDA_R_32F, ldb,
      beta, C, CUDA_R_32F, ldc,
      CUDA_R_32F, CUBLAS_GEMM_DEFAULT);
}

// FP16 Tensor core wrapper around cublas GEMMEx
cublasStatus_t gemm_bias(
    cublasHandle_t handle,
    cublasOperation_t transa, cublasOperation_t transb,
    int m, int n, int k,
    const float* alpha,
    at::Half* A, int lda,
    at::Half* B, int ldb,
    const float* beta,
    at::Half* C, int ldc) {
  return cublasGemmEx(
      handle, transa, transb, m, n, k,
      alpha, A, CUDA_R_16F, lda,
      B, CUDA_R_16F, ldb,
      beta, C, CUDA_R_16F, ldc,
      CUDA_R_32F, CUBLAS_GEMM_DEFAULT_TENSOR_OP);
}

// BF16 Tensor core wrapper around cublas GEMMEx
cublasStatus_t gemm_bias(
    cublasHandle_t handle,
    cublasOperation_t transa, cublasOperation_t transb,
    int m, int n, int k,
    const float* alpha,
    at::BFloat16* A, int lda,
    at::BFloat16* B, int ldb,
    const float* beta,
    at::BFloat16* C, int ldc) {
  return cublasGemmEx(
      handle, transa, transb, m, n, k,
      alpha, A, CUDA_R_16BF, lda,
      B, CUDA_R_16BF, ldb,
      beta, C, CUDA_R_16BF, ldc,
      CUDA_R_32F, CUBLAS_GEMM_DEFAULT_TENSOR_OP);
}

#if defined(CUBLAS_VERSION) && CUBLAS_VERSION >= 11600

int gemm_bias_lt(
    cublasLtHandle_t ltHandle,
    cublasOperation_t transa, cublasOperation_t transb,
    int m, int n, int k,
    const float *alpha, /* host pointer */
    at::Half* A, int lda,
    at::Half* B, int ldb,
    const float *beta, /* host pointer */
    at::Half* C, int ldc,
    void *workspace, size_t workspaceSize,
    cudaStream_t stream,
    bool use_bias,
    const void* bias) {
  cublasStatus_t status = CUBLAS_STATUS_SUCCESS;

  cublasLtMatmulDescOpaque_t operationDesc = {};
  cublasLtMatrixLayoutOpaque_t Adesc = {}, Bdesc = {}, Cdesc = {};
  cublasLtMatmulPreferenceOpaque_t preference = {};

  int returnedResults = 0;
  cublasLtMatmulHeuristicResult_t heuristicResult = {};
  cublasLtEpilogue_t epilogue = CUBLASLT_EPILOGUE_DEFAULT;

  // Create operation descriptor; see cublasLtMatmulDescAttributes_t
  // for details about defaults; here we just set the transforms for
  // A and B.
  status = cublasLtMatmulDescInit(&operationDesc, CUBLAS_COMPUTE_32F, CUDA_R_32F);
  if (status != CUBLAS_STATUS_SUCCESS) goto CLEANUP;
  status = cublasLtMatmulDescSetAttribute(&operationDesc, CUBLASLT_MATMUL_DESC_TRANSA, &transa, sizeof(transa));
  if (status != CUBLAS_STATUS_SUCCESS) goto CLEANUP;
  status = cublasLtMatmulDescSetAttribute(&operationDesc, CUBLASLT_MATMUL_DESC_TRANSB, &transb, sizeof(transb));
  if (status != CUBLAS_STATUS_SUCCESS) goto CLEANUP;

  if (use_bias) {
    status = cublasLtMatmulDescSetAttribute(&operationDesc, CUBLASLT_MATMUL_DESC_BIAS_POINTER, &bias, sizeof(bias));
    if (status != CUBLAS_STATUS_SUCCESS) {
      goto CLEANUP;
    }
    epilogue = CUBLASLT_EPILOGUE_BIAS;
  }

  status = cublasLtMatmulDescSetAttribute(&operationDesc, CUBLASLT_MATMUL_DESC_EPILOGUE, &epilogue, sizeof(epilogue));
  if (status != CUBLAS_STATUS_SUCCESS) {
    goto CLEANUP;
  }

  // Create matrix descriptors. Not setting any extra attributes.
  status = cublasLtMatrixLayoutInit(
      &Adesc, CUDA_R_16F, transa == CUBLAS_OP_N ? m : k, transa == CUBLAS_OP_N ? k : m, lda);
  if (status != CUBLAS_STATUS_SUCCESS) goto CLEANUP;
  status = cublasLtMatrixLayoutInit(
      &Bdesc, CUDA_R_16F, transb == CUBLAS_OP_N ? k : n, transb == CUBLAS_OP_N ? n : k, ldb);
  if (status != CUBLAS_STATUS_SUCCESS) goto CLEANUP;
  status = cublasLtMatrixLayoutInit(&Cdesc, CUDA_R_16F, m, n, ldc);
  if (status != CUBLAS_STATUS_SUCCESS) goto CLEANUP;

  // Create preference handle; in general, extra attributes can be
  // used here to disable tensor ops or to make sure the algo selected
  // will work with badly aligned A, B, C. However, for simplicity
  // here we assume A, B, C are always well aligned (e.g., directly
  // come from cudaMalloc).
  status = cublasLtMatmulPreferenceInit(&preference);
  if (status != CUBLAS_STATUS_SUCCESS) goto CLEANUP;
  status = cublasLtMatmulPreferenceSetAttribute(
      &preference, CUBLASLT_MATMUL_PREF_MAX_WORKSPACE_BYTES, &workspaceSize, sizeof(workspaceSize));
  if (status != CUBLAS_STATUS_SUCCESS) goto CLEANUP;

  // We just need the best available heuristic to try and run matmul.
  // There is no guarantee that this will work. For example, if A is
  // badly aligned, you can request more (e.g. 32) algos and try to
  // run them one by one until something works.
  status = cublasLtMatmulAlgoGetHeuristic(
      ltHandle, &operationDesc, &Adesc, &Bdesc, &Cdesc, &Cdesc, &preference, 1, &heuristicResult, &returnedResults);
  if (status != CUBLAS_STATUS_SUCCESS) goto CLEANUP;

  if (returnedResults == 0) {
    status = CUBLAS_STATUS_NOT_SUPPORTED;
    goto CLEANUP;
  }

  status = cublasLtMatmul(ltHandle, &operationDesc,
                          alpha, A, &Adesc, B, &Bdesc,
                          beta, C, &Cdesc, C, &Cdesc,
                          //&heuristicResult.algo,
                          NULL,
                          workspace, workspaceSize, stream);

CLEANUP:
  // Descriptors are no longer needed as all GPU work was already
  // enqueued.
  return status == CUBLAS_STATUS_SUCCESS ? 0 : 1;
}
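// [Editorial example, not part of the original file] The heuristic comments
// in these wrappers note that the single best algo may fail, e.g. for badly
// aligned buffers, and that one can request more (e.g. 32) algos and try
// them one by one until something works. A minimal sketch of that fallback;
// the helper name lt_matmul_try_heuristics is hypothetical and nothing
// below calls it:
static cublasStatus_t lt_matmul_try_heuristics(
    cublasLtHandle_t ltHandle,
    cublasLtMatmulDesc_t operationDesc,
    const void *alpha, const void *A, cublasLtMatrixLayout_t Adesc,
    const void *B, cublasLtMatrixLayout_t Bdesc,
    const void *beta, void *C, cublasLtMatrixLayout_t Cdesc,
    cublasLtMatmulPreference_t preference,
    void *workspace, size_t workspaceSize, cudaStream_t stream) {
  // Request up to 32 candidate algos instead of only the best one.
  cublasLtMatmulHeuristicResult_t results[32] = {};
  int returnedResults = 0;
  cublasStatus_t status = cublasLtMatmulAlgoGetHeuristic(
      ltHandle, operationDesc, Adesc, Bdesc, Cdesc, Cdesc, preference,
      32, results, &returnedResults);
  if (status != CUBLAS_STATUS_SUCCESS || returnedResults == 0) {
    return CUBLAS_STATUS_NOT_SUPPORTED;
  }
  // Try each candidate in heuristic order until one launches successfully.
  status = CUBLAS_STATUS_NOT_SUPPORTED;
  for (int i = 0; i < returnedResults; i++) {
    if (results[i].state != CUBLAS_STATUS_SUCCESS) continue;
    status = cublasLtMatmul(ltHandle, operationDesc,
                            alpha, A, Adesc, B, Bdesc,
                            beta, C, Cdesc, C, Cdesc,
                            &results[i].algo,
                            workspace, workspaceSize, stream);
    if (status == CUBLAS_STATUS_SUCCESS) {
      return status;
    }
  }
  return status;
}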
int gemm_bias_lt(
    cublasLtHandle_t ltHandle,
    cublasOperation_t transa, cublasOperation_t transb,
    int m, int n, int k,
    const float *alpha, /* host pointer */
    at::BFloat16* A, int lda,
    at::BFloat16* B, int ldb,
    const float *beta, /* host pointer */
    at::BFloat16* C, int ldc,
    void *workspace, size_t workspaceSize,
    cudaStream_t stream,
    bool use_bias,
    const void* bias) {
  cublasStatus_t status = CUBLAS_STATUS_SUCCESS;

  cublasLtMatmulDescOpaque_t operationDesc = {};
  cublasLtMatrixLayoutOpaque_t Adesc = {}, Bdesc = {}, Cdesc = {};
  cublasLtMatmulPreferenceOpaque_t preference = {};

  int returnedResults = 0;
  cublasLtMatmulHeuristicResult_t heuristicResult = {};
  cublasLtEpilogue_t epilogue = CUBLASLT_EPILOGUE_DEFAULT;

  // Create operation descriptor; see cublasLtMatmulDescAttributes_t
  // for details about defaults; here we just set the transforms for
  // A and B.
  status = cublasLtMatmulDescInit(&operationDesc, CUBLAS_COMPUTE_32F, CUDA_R_32F);
  if (status != CUBLAS_STATUS_SUCCESS) goto CLEANUP;
  status = cublasLtMatmulDescSetAttribute(&operationDesc, CUBLASLT_MATMUL_DESC_TRANSA, &transa, sizeof(transa));
  if (status != CUBLAS_STATUS_SUCCESS) goto CLEANUP;
  status = cublasLtMatmulDescSetAttribute(&operationDesc, CUBLASLT_MATMUL_DESC_TRANSB, &transb, sizeof(transb));
  if (status != CUBLAS_STATUS_SUCCESS) goto CLEANUP;

  if (use_bias) {
    status = cublasLtMatmulDescSetAttribute(&operationDesc, CUBLASLT_MATMUL_DESC_BIAS_POINTER, &bias, sizeof(bias));
    if (status != CUBLAS_STATUS_SUCCESS) {
      goto CLEANUP;
    }
    epilogue = CUBLASLT_EPILOGUE_BIAS;
  }

  status = cublasLtMatmulDescSetAttribute(&operationDesc, CUBLASLT_MATMUL_DESC_EPILOGUE, &epilogue, sizeof(epilogue));
  if (status != CUBLAS_STATUS_SUCCESS) {
    goto CLEANUP;
  }

  // Create matrix descriptors. Not setting any extra attributes.
  status = cublasLtMatrixLayoutInit(
      &Adesc, CUDA_R_16BF, transa == CUBLAS_OP_N ? m : k, transa == CUBLAS_OP_N ? k : m, lda);
  if (status != CUBLAS_STATUS_SUCCESS) goto CLEANUP;
  status = cublasLtMatrixLayoutInit(
      &Bdesc, CUDA_R_16BF, transb == CUBLAS_OP_N ? k : n, transb == CUBLAS_OP_N ? n : k, ldb);
  if (status != CUBLAS_STATUS_SUCCESS) goto CLEANUP;
  status = cublasLtMatrixLayoutInit(&Cdesc, CUDA_R_16BF, m, n, ldc);
  if (status != CUBLAS_STATUS_SUCCESS) goto CLEANUP;

  // Create preference handle; in general, extra attributes can be
  // used here to disable tensor ops or to make sure the algo selected
  // will work with badly aligned A, B, C. However, for simplicity
  // here we assume A, B, C are always well aligned (e.g., directly
  // come from cudaMalloc).
  status = cublasLtMatmulPreferenceInit(&preference);
  if (status != CUBLAS_STATUS_SUCCESS) goto CLEANUP;
  status = cublasLtMatmulPreferenceSetAttribute(
      &preference, CUBLASLT_MATMUL_PREF_MAX_WORKSPACE_BYTES, &workspaceSize, sizeof(workspaceSize));
  if (status != CUBLAS_STATUS_SUCCESS) goto CLEANUP;

  // We just need the best available heuristic to try and run matmul.
  // There is no guarantee that this will work. For example, if A is
  // badly aligned, you can request more (e.g. 32) algos and try to
  // run them one by one until something works.
  status = cublasLtMatmulAlgoGetHeuristic(
      ltHandle, &operationDesc, &Adesc, &Bdesc, &Cdesc, &Cdesc, &preference, 1, &heuristicResult, &returnedResults);
  if (status != CUBLAS_STATUS_SUCCESS) goto CLEANUP;

  if (returnedResults == 0) {
    status = CUBLAS_STATUS_NOT_SUPPORTED;
    goto CLEANUP;
  }

  status = cublasLtMatmul(ltHandle, &operationDesc,
                          alpha, A, &Adesc, B, &Bdesc,
                          beta, C, &Cdesc, C, &Cdesc,
                          //&heuristicResult.algo,
                          NULL,
                          workspace, workspaceSize, stream);

CLEANUP:
  // Descriptors are no longer needed as all GPU work was already
  // enqueued.
  return status == CUBLAS_STATUS_SUCCESS ? 0 : 1;
}

// The FP64 path is not implemented with cublasLt; report failure so the
// caller falls back to the plain cublas wrapper.
int gemm_bias_lt(
    cublasLtHandle_t ltHandle,
    cublasOperation_t transa, cublasOperation_t transb,
    int m, int n, int k,
    const float *alpha, /* host pointer */
    double* A, int lda,
    double* B, int ldb,
    const float *beta, /* host pointer */
    double* C, int ldc,
    void *workspace, size_t workspaceSize,
    cudaStream_t stream,
    bool use_bias,
    const void* bias) {
  return 1;
}

int gemm_bias_lt(
    cublasLtHandle_t ltHandle,
    cublasOperation_t transa, cublasOperation_t transb,
    int m, int n, int k,
    const float *alpha, /* host pointer */
    float *A, int lda,
    float *B, int ldb,
    const float *beta, /* host pointer */
    float *C, int ldc,
    void *workspace, size_t workspaceSize,
    cudaStream_t stream,
    bool use_bias,
    const void* bias) {
  cublasStatus_t status = CUBLAS_STATUS_SUCCESS;

  cublasLtMatmulDescOpaque_t operationDesc = {};
  cublasLtMatrixLayoutOpaque_t Adesc = {}, Bdesc = {}, Cdesc = {};
  cublasLtMatmulPreferenceOpaque_t preference = {};

  int returnedResults = 0;
  cublasLtMatmulHeuristicResult_t heuristicResult = {};
  cublasLtEpilogue_t epilogue = CUBLASLT_EPILOGUE_DEFAULT;

  // Create operation descriptor; see cublasLtMatmulDescAttributes_t
  // for details about defaults; here we just set the transforms for
  // A and B.
  status = cublasLtMatmulDescInit(&operationDesc, CUBLAS_COMPUTE_32F, CUDA_R_32F);
  if (status != CUBLAS_STATUS_SUCCESS) goto CLEANUP;
  status = cublasLtMatmulDescSetAttribute(&operationDesc, CUBLASLT_MATMUL_DESC_TRANSA, &transa, sizeof(transa));
  if (status != CUBLAS_STATUS_SUCCESS) goto CLEANUP;
  status = cublasLtMatmulDescSetAttribute(&operationDesc, CUBLASLT_MATMUL_DESC_TRANSB, &transb, sizeof(transb));
  if (status != CUBLAS_STATUS_SUCCESS) goto CLEANUP;

  if (use_bias) {
    status = cublasLtMatmulDescSetAttribute(&operationDesc, CUBLASLT_MATMUL_DESC_BIAS_POINTER, &bias, sizeof(bias));
    if (status != CUBLAS_STATUS_SUCCESS) {
      goto CLEANUP;
    }
    epilogue = CUBLASLT_EPILOGUE_BIAS;
  }

  status = cublasLtMatmulDescSetAttribute(&operationDesc, CUBLASLT_MATMUL_DESC_EPILOGUE, &epilogue, sizeof(epilogue));
  if (status != CUBLAS_STATUS_SUCCESS) {
    goto CLEANUP;
  }

  // Create matrix descriptors. Not setting any extra attributes.
  status = cublasLtMatrixLayoutInit(
      &Adesc, CUDA_R_32F, transa == CUBLAS_OP_N ? m : k, transa == CUBLAS_OP_N ? k : m, lda);
  if (status != CUBLAS_STATUS_SUCCESS) goto CLEANUP;
  status = cublasLtMatrixLayoutInit(
      &Bdesc, CUDA_R_32F, transb == CUBLAS_OP_N ? k : n, transb == CUBLAS_OP_N ? n : k, ldb);
  if (status != CUBLAS_STATUS_SUCCESS) goto CLEANUP;
  status = cublasLtMatrixLayoutInit(&Cdesc, CUDA_R_32F, m, n, ldc);
  if (status != CUBLAS_STATUS_SUCCESS) goto CLEANUP;

  // Create preference handle; in general, extra attributes can be
  // used here to disable tensor ops or to make sure the algo selected
  // will work with badly aligned A, B, C. However, for simplicity
  // here we assume A, B, C are always well aligned (e.g., directly
  // come from cudaMalloc).
  status = cublasLtMatmulPreferenceInit(&preference);
  if (status != CUBLAS_STATUS_SUCCESS) goto CLEANUP;
  status = cublasLtMatmulPreferenceSetAttribute(
      &preference, CUBLASLT_MATMUL_PREF_MAX_WORKSPACE_BYTES, &workspaceSize, sizeof(workspaceSize));
  if (status != CUBLAS_STATUS_SUCCESS) goto CLEANUP;

  // We just need the best available heuristic to try and run matmul.
  // There is no guarantee that this will work. For example, if A is
  // badly aligned, you can request more (e.g. 32) algos and try to
  // run them one by one until something works.
  status = cublasLtMatmulAlgoGetHeuristic(
      ltHandle, &operationDesc, &Adesc, &Bdesc, &Cdesc, &Cdesc, &preference, 1, &heuristicResult, &returnedResults);
  if (status != CUBLAS_STATUS_SUCCESS) goto CLEANUP;

  if (returnedResults == 0) {
    status = CUBLAS_STATUS_NOT_SUPPORTED;
    goto CLEANUP;
  }

  status = cublasLtMatmul(ltHandle, &operationDesc,
                          alpha, A, &Adesc, B, &Bdesc,
                          beta, C, &Cdesc, C, &Cdesc,
                          &heuristicResult.algo,
                          workspace, workspaceSize, stream);

CLEANUP:
  // Descriptors are no longer needed as all GPU work was already
  // enqueued.
  return status == CUBLAS_STATUS_SUCCESS ? 0 : 1;
}
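// [Editorial example, not part of the original file] The preference comments
// above mention attributes that make the selected algo safe for badly
// aligned A, B, C. One way to do that is to report the true pointer
// alignments via CUBLASLT_MATMUL_PREF_MIN_ALIGNMENT_{A,B,C}_BYTES before
// querying the heuristic (C doubles as D here, as in the calls above). The
// helper name lt_set_min_alignments is hypothetical and nothing below
// calls it:
static cublasStatus_t lt_set_min_alignments(
    cublasLtMatmulPreference_t preference,
    const void *A, const void *B, const void *C) {
  // Largest power-of-two alignment (up to 256 bytes) a pointer satisfies.
  auto alignment_of = [](const void *p) -> uint32_t {
    uint32_t align = 256;
    while (align > 1 && reinterpret_cast<uintptr_t>(p) % align != 0) {
      align /= 2;
    }
    return align;
  };
  uint32_t a = alignment_of(A), b = alignment_of(B), c = alignment_of(C);
  cublasStatus_t status = cublasLtMatmulPreferenceSetAttribute(
      preference, CUBLASLT_MATMUL_PREF_MIN_ALIGNMENT_A_BYTES, &a, sizeof(a));
  if (status != CUBLAS_STATUS_SUCCESS) return status;
  status = cublasLtMatmulPreferenceSetAttribute(
      preference, CUBLASLT_MATMUL_PREF_MIN_ALIGNMENT_B_BYTES, &b, sizeof(b));
  if (status != CUBLAS_STATUS_SUCCESS) return status;
  return cublasLtMatmulPreferenceSetAttribute(
      preference, CUBLASLT_MATMUL_PREF_MIN_ALIGNMENT_C_BYTES, &c, sizeof(c));
}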
int gemm_bias_gelu_lt(
    cublasLtHandle_t ltHandle,
    cublasOperation_t transa, cublasOperation_t transb,
    int m, int n, int k,
    const float *alpha, /* host pointer */
    at::Half* A, int lda,
    at::Half* B, int ldb,
    const float *beta, /* host pointer */
    at::Half* C, int64_t ldc,
    void *workspace, size_t workspaceSize,
    cudaStream_t stream,
    bool use_bias,
    const void* gelu_in,
    const void* bias) {
  cublasStatus_t status = CUBLAS_STATUS_SUCCESS;

  cublasLtMatmulDescOpaque_t operationDesc = {};
  cublasLtMatrixLayoutOpaque_t Adesc = {}, Bdesc = {}, Cdesc = {};
  cublasLtMatmulPreferenceOpaque_t preference = {};

  int returnedResults = 0;
  cublasLtMatmulHeuristicResult_t heuristicResult = {};
  cublasLtEpilogue_t epilogue = CUBLASLT_EPILOGUE_GELU_AUX;

  // Create operation descriptor; see cublasLtMatmulDescAttributes_t
  // for details about defaults; here we just set the transforms for
  // A and B.
  status = cublasLtMatmulDescInit(&operationDesc, CUBLAS_COMPUTE_32F, CUDA_R_32F);
  if (status != CUBLAS_STATUS_SUCCESS) goto CLEANUP;
  status = cublasLtMatmulDescSetAttribute(&operationDesc, CUBLASLT_MATMUL_DESC_TRANSA, &transa, sizeof(transa));
  if (status != CUBLAS_STATUS_SUCCESS) goto CLEANUP;
  status = cublasLtMatmulDescSetAttribute(&operationDesc, CUBLASLT_MATMUL_DESC_TRANSB, &transb, sizeof(transb));
  if (status != CUBLAS_STATUS_SUCCESS) goto CLEANUP;
  // Stash the pre-GELU result so the backward pass can apply dGELU.
  status = cublasLtMatmulDescSetAttribute(&operationDesc, CUBLASLT_MATMUL_DESC_EPILOGUE_AUX_POINTER, &gelu_in, sizeof(gelu_in));
  if (status != CUBLAS_STATUS_SUCCESS) goto CLEANUP;
  status = cublasLtMatmulDescSetAttribute(&operationDesc, CUBLASLT_MATMUL_DESC_EPILOGUE_AUX_LD, &ldc, sizeof(ldc));
  if (status != CUBLAS_STATUS_SUCCESS) goto CLEANUP;

  if (use_bias) {
    status = cublasLtMatmulDescSetAttribute(&operationDesc, CUBLASLT_MATMUL_DESC_BIAS_POINTER, &bias, sizeof(bias));
    if (status != CUBLAS_STATUS_SUCCESS) {
      goto CLEANUP;
    }
    epilogue = CUBLASLT_EPILOGUE_GELU_AUX_BIAS;
  }

  status = cublasLtMatmulDescSetAttribute(&operationDesc, CUBLASLT_MATMUL_DESC_EPILOGUE, &epilogue, sizeof(epilogue));
  if (status != CUBLAS_STATUS_SUCCESS) {
    goto CLEANUP;
  }

  // Create matrix descriptors. Not setting any extra attributes.
  status = cublasLtMatrixLayoutInit(
      &Adesc, CUDA_R_16F, transa == CUBLAS_OP_N ? m : k, transa == CUBLAS_OP_N ? k : m, lda);
  if (status != CUBLAS_STATUS_SUCCESS) goto CLEANUP;
  status = cublasLtMatrixLayoutInit(
      &Bdesc, CUDA_R_16F, transb == CUBLAS_OP_N ? k : n, transb == CUBLAS_OP_N ? n : k, ldb);
  if (status != CUBLAS_STATUS_SUCCESS) goto CLEANUP;
  status = cublasLtMatrixLayoutInit(&Cdesc, CUDA_R_16F, m, n, ldc);
  if (status != CUBLAS_STATUS_SUCCESS) goto CLEANUP;

  // Create preference handle; in general, extra attributes can be
  // used here to disable tensor ops or to make sure the algo selected
  // will work with badly aligned A, B, C. However, for simplicity
  // here we assume A, B, C are always well aligned (e.g., directly
  // come from cudaMalloc).
  status = cublasLtMatmulPreferenceInit(&preference);
  if (status != CUBLAS_STATUS_SUCCESS) goto CLEANUP;
  status = cublasLtMatmulPreferenceSetAttribute(
      &preference, CUBLASLT_MATMUL_PREF_MAX_WORKSPACE_BYTES, &workspaceSize, sizeof(workspaceSize));
  if (status != CUBLAS_STATUS_SUCCESS) goto CLEANUP;

  // We just need the best available heuristic to try and run matmul.
  // There is no guarantee that this will work. For example, if A is
  // badly aligned, you can request more (e.g. 32) algos and try to
  // run them one by one until something works.
  status = cublasLtMatmulAlgoGetHeuristic(
      ltHandle, &operationDesc, &Adesc, &Bdesc, &Cdesc, &Cdesc, &preference, 1, &heuristicResult, &returnedResults);
  if (status != CUBLAS_STATUS_SUCCESS) goto CLEANUP;

  if (returnedResults == 0) {
    status = CUBLAS_STATUS_NOT_SUPPORTED;
    goto CLEANUP;
  }

  status = cublasLtMatmul(ltHandle, &operationDesc,
                          alpha, A, &Adesc, B, &Bdesc,
                          beta, C, &Cdesc, C, &Cdesc,
                          //&heuristicResult.algo,
                          NULL,
                          workspace, workspaceSize, stream);

CLEANUP:
  // Descriptors are no longer needed as all GPU work was already
  // enqueued.
  return status == CUBLAS_STATUS_SUCCESS ? 0 : 1;
}

int gemm_bias_gelu_lt(
    cublasLtHandle_t ltHandle,
    cublasOperation_t transa, cublasOperation_t transb,
    int m, int n, int k,
    const float *alpha, /* host pointer */
    at::BFloat16* A, int lda,
    at::BFloat16* B, int ldb,
    const float *beta, /* host pointer */
    at::BFloat16* C, int64_t ldc,
    void *workspace, size_t workspaceSize,
    cudaStream_t stream,
    bool use_bias,
    const void* gelu_in,
    const void* bias) {
  cublasStatus_t status = CUBLAS_STATUS_SUCCESS;

  cublasLtMatmulDescOpaque_t operationDesc = {};
  cublasLtMatrixLayoutOpaque_t Adesc = {}, Bdesc = {}, Cdesc = {};
  cublasLtMatmulPreferenceOpaque_t preference = {};

  int returnedResults = 0;
  cublasLtMatmulHeuristicResult_t heuristicResult = {};
  cublasLtEpilogue_t epilogue = CUBLASLT_EPILOGUE_GELU_AUX;

  // Create operation descriptor; see cublasLtMatmulDescAttributes_t
  // for details about defaults; here we just set the transforms for
  // A and B.
  status = cublasLtMatmulDescInit(&operationDesc, CUBLAS_COMPUTE_32F, CUDA_R_32F);
  if (status != CUBLAS_STATUS_SUCCESS) goto CLEANUP;
  status = cublasLtMatmulDescSetAttribute(&operationDesc, CUBLASLT_MATMUL_DESC_TRANSA, &transa, sizeof(transa));
  if (status != CUBLAS_STATUS_SUCCESS) goto CLEANUP;
  status = cublasLtMatmulDescSetAttribute(&operationDesc, CUBLASLT_MATMUL_DESC_TRANSB, &transb, sizeof(transb));
  if (status != CUBLAS_STATUS_SUCCESS) goto CLEANUP;
  // Stash the pre-GELU result so the backward pass can apply dGELU.
  status = cublasLtMatmulDescSetAttribute(&operationDesc, CUBLASLT_MATMUL_DESC_EPILOGUE_AUX_POINTER, &gelu_in, sizeof(gelu_in));
  if (status != CUBLAS_STATUS_SUCCESS) goto CLEANUP;
  status = cublasLtMatmulDescSetAttribute(&operationDesc, CUBLASLT_MATMUL_DESC_EPILOGUE_AUX_LD, &ldc, sizeof(ldc));
  if (status != CUBLAS_STATUS_SUCCESS) goto CLEANUP;

  if (use_bias) {
    status = cublasLtMatmulDescSetAttribute(&operationDesc, CUBLASLT_MATMUL_DESC_BIAS_POINTER, &bias, sizeof(bias));
    if (status != CUBLAS_STATUS_SUCCESS) {
      goto CLEANUP;
    }
    epilogue = CUBLASLT_EPILOGUE_GELU_AUX_BIAS;
  }

  status = cublasLtMatmulDescSetAttribute(&operationDesc, CUBLASLT_MATMUL_DESC_EPILOGUE, &epilogue, sizeof(epilogue));
  if (status != CUBLAS_STATUS_SUCCESS) {
    goto CLEANUP;
  }

  // Create matrix descriptors. Not setting any extra attributes.
  status = cublasLtMatrixLayoutInit(
      &Adesc, CUDA_R_16BF, transa == CUBLAS_OP_N ? m : k, transa == CUBLAS_OP_N ? k : m, lda);
  if (status != CUBLAS_STATUS_SUCCESS) goto CLEANUP;
  status = cublasLtMatrixLayoutInit(
      &Bdesc, CUDA_R_16BF, transb == CUBLAS_OP_N ? k : n, transb == CUBLAS_OP_N ? n : k, ldb);
  if (status != CUBLAS_STATUS_SUCCESS) goto CLEANUP;
  status = cublasLtMatrixLayoutInit(&Cdesc, CUDA_R_16BF, m, n, ldc);
  if (status != CUBLAS_STATUS_SUCCESS) goto CLEANUP;

  // Create preference handle; in general, extra attributes can be
  // used here to disable tensor ops or to make sure the algo selected
  // will work with badly aligned A, B, C. However, for simplicity
  // here we assume A, B, C are always well aligned (e.g., directly
  // come from cudaMalloc).
  status = cublasLtMatmulPreferenceInit(&preference);
  if (status != CUBLAS_STATUS_SUCCESS) goto CLEANUP;
  status = cublasLtMatmulPreferenceSetAttribute(
      &preference, CUBLASLT_MATMUL_PREF_MAX_WORKSPACE_BYTES, &workspaceSize, sizeof(workspaceSize));
  if (status != CUBLAS_STATUS_SUCCESS) goto CLEANUP;

  // We just need the best available heuristic to try and run matmul.
  // There is no guarantee that this will work. For example, if A is
  // badly aligned, you can request more (e.g. 32) algos and try to
  // run them one by one until something works.
  status = cublasLtMatmulAlgoGetHeuristic(
      ltHandle, &operationDesc, &Adesc, &Bdesc, &Cdesc, &Cdesc, &preference, 1, &heuristicResult, &returnedResults);
  if (status != CUBLAS_STATUS_SUCCESS) goto CLEANUP;

  if (returnedResults == 0) {
    status = CUBLAS_STATUS_NOT_SUPPORTED;
    goto CLEANUP;
  }

  status = cublasLtMatmul(ltHandle, &operationDesc,
                          alpha, A, &Adesc, B, &Bdesc,
                          beta, C, &Cdesc, C, &Cdesc,
                          //&heuristicResult.algo,
                          NULL,
                          workspace, workspaceSize, stream);

CLEANUP:
  // Descriptors are no longer needed as all GPU work was already
  // enqueued.
  return status == CUBLAS_STATUS_SUCCESS ? 0 : 1;
}
// The FP64 path is not implemented with cublasLt; report failure so the
// caller falls back to the plain cublas wrapper.
int gemm_bias_gelu_lt(
    cublasLtHandle_t ltHandle,
    cublasOperation_t transa, cublasOperation_t transb,
    int m, int n, int k,
    const float *alpha, /* host pointer */
    double* A, int lda,
    double* B, int ldb,
    const float *beta, /* host pointer */
    double* C, int ldc,
    void *workspace, size_t workspaceSize,
    cudaStream_t stream,
    bool use_bias,
    const void *gelu_in,
    const void* bias) {
  return 1;
}

int gemm_bias_gelu_lt(
    cublasLtHandle_t ltHandle,
    cublasOperation_t transa, cublasOperation_t transb,
    int m, int n, int k,
    const float *alpha, /* host pointer */
    float *A, int lda,
    float *B, int ldb,
    const float *beta, /* host pointer */
    float *C, int64_t ldc,
    void *workspace, size_t workspaceSize,
    cudaStream_t stream,
    bool use_bias,
    const void* gelu_in,
    const void* bias) {
  cublasStatus_t status = CUBLAS_STATUS_SUCCESS;

  cublasLtMatmulDescOpaque_t operationDesc = {};
  cublasLtMatrixLayoutOpaque_t Adesc = {}, Bdesc = {}, Cdesc = {};
  cublasLtMatmulPreferenceOpaque_t preference = {};

  int returnedResults = 0;
  cublasLtMatmulHeuristicResult_t heuristicResult = {};
  cublasLtEpilogue_t epilogue = CUBLASLT_EPILOGUE_GELU_AUX;

  // Create operation descriptor; see cublasLtMatmulDescAttributes_t
  // for details about defaults; here we just set the transforms for
  // A and B.
  status = cublasLtMatmulDescInit(&operationDesc, CUBLAS_COMPUTE_32F, CUDA_R_32F);
  if (status != CUBLAS_STATUS_SUCCESS) goto CLEANUP;
  status = cublasLtMatmulDescSetAttribute(&operationDesc, CUBLASLT_MATMUL_DESC_TRANSA, &transa, sizeof(transa));
  if (status != CUBLAS_STATUS_SUCCESS) goto CLEANUP;
  status = cublasLtMatmulDescSetAttribute(&operationDesc, CUBLASLT_MATMUL_DESC_TRANSB, &transb, sizeof(transb));
  if (status != CUBLAS_STATUS_SUCCESS) goto CLEANUP;
  // Stash the pre-GELU result so the backward pass can apply dGELU.
  status = cublasLtMatmulDescSetAttribute(&operationDesc, CUBLASLT_MATMUL_DESC_EPILOGUE_AUX_POINTER, &gelu_in, sizeof(gelu_in));
  if (status != CUBLAS_STATUS_SUCCESS) goto CLEANUP;
  status = cublasLtMatmulDescSetAttribute(&operationDesc, CUBLASLT_MATMUL_DESC_EPILOGUE_AUX_LD, &ldc, sizeof(ldc));
  if (status != CUBLAS_STATUS_SUCCESS) goto CLEANUP;

  if (use_bias) {
    status = cublasLtMatmulDescSetAttribute(&operationDesc, CUBLASLT_MATMUL_DESC_BIAS_POINTER, &bias, sizeof(bias));
    if (status != CUBLAS_STATUS_SUCCESS) {
      goto CLEANUP;
    }
    epilogue = CUBLASLT_EPILOGUE_GELU_AUX_BIAS;
  }

  status = cublasLtMatmulDescSetAttribute(&operationDesc, CUBLASLT_MATMUL_DESC_EPILOGUE, &epilogue, sizeof(epilogue));
  if (status != CUBLAS_STATUS_SUCCESS) {
    goto CLEANUP;
  }

  // Create matrix descriptors. Not setting any extra attributes.
  status = cublasLtMatrixLayoutInit(
      &Adesc, CUDA_R_32F, transa == CUBLAS_OP_N ? m : k, transa == CUBLAS_OP_N ? k : m, lda);
  if (status != CUBLAS_STATUS_SUCCESS) goto CLEANUP;
  status = cublasLtMatrixLayoutInit(
      &Bdesc, CUDA_R_32F, transb == CUBLAS_OP_N ? k : n, transb == CUBLAS_OP_N ? n : k, ldb);
  if (status != CUBLAS_STATUS_SUCCESS) goto CLEANUP;
  status = cublasLtMatrixLayoutInit(&Cdesc, CUDA_R_32F, m, n, ldc);
  if (status != CUBLAS_STATUS_SUCCESS) goto CLEANUP;

  // Create preference handle; in general, extra attributes can be
  // used here to disable tensor ops or to make sure the algo selected
  // will work with badly aligned A, B, C. However, for simplicity
  // here we assume A, B, C are always well aligned (e.g., directly
  // come from cudaMalloc).
  status = cublasLtMatmulPreferenceInit(&preference);
  if (status != CUBLAS_STATUS_SUCCESS) goto CLEANUP;
  status = cublasLtMatmulPreferenceSetAttribute(
      &preference, CUBLASLT_MATMUL_PREF_MAX_WORKSPACE_BYTES, &workspaceSize, sizeof(workspaceSize));
  if (status != CUBLAS_STATUS_SUCCESS) goto CLEANUP;
  // We just need the best available heuristic to try and run matmul.
  // There is no guarantee that this will work. For example, if A is
  // badly aligned, you can request more (e.g. 32) algos and try to
  // run them one by one until something works.
  status = cublasLtMatmulAlgoGetHeuristic(
      ltHandle, &operationDesc, &Adesc, &Bdesc, &Cdesc, &Cdesc, &preference, 1, &heuristicResult, &returnedResults);
  if (status != CUBLAS_STATUS_SUCCESS) goto CLEANUP;

  if (returnedResults == 0) {
    status = CUBLAS_STATUS_NOT_SUPPORTED;
    goto CLEANUP;
  }

  status = cublasLtMatmul(ltHandle, &operationDesc,
                          alpha, A, &Adesc, B, &Bdesc,
                          beta, C, &Cdesc, C, &Cdesc,
                          //&heuristicResult.algo,
                          NULL,
                          workspace, workspaceSize, stream);

CLEANUP:
  // Descriptors are no longer needed as all GPU work was already
  // enqueued.
  return status == CUBLAS_STATUS_SUCCESS ? 0 : 1;
}
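// [Editorial note, not part of the original file] Contract of the GELU_AUX
// epilogues above: gelu_in receives the pre-GELU values (the GEMM result
// plus bias) as an m x n matrix of the output dtype, with the leading
// dimension set from ldc via CUBLASLT_MATMUL_DESC_EPILOGUE_AUX_LD; the
// backward pass (gemm_dgelu_bgradb_lt below) reads the same buffer to apply
// dGELU. A caller therefore allocates it like the first GEMM's output,
// e.g. (sketch, names from linear_gelu_linear_forward_cuda below):
//
//   auto gelu_in = at::empty({batch_size, hidden_features}, input.options());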
int gemm_bgradb_lt(
    cublasLtHandle_t ltHandle,
    cublasOperation_t transa, cublasOperation_t transb,
    int m, int n, int k,
    const float *alpha, /* host pointer */
    at::Half* A, int lda,
    at::Half* B, int ldb,
    const float *beta, /* host pointer */
    at::Half* C, int ldc,
    void *workspace, size_t workspaceSize,
    cudaStream_t stream,
    bool use_bias,
    const void* bgrad) {
  cublasStatus_t status = CUBLAS_STATUS_SUCCESS;

  cublasLtMatmulDescOpaque_t operationDesc = {};
  cublasLtMatrixLayoutOpaque_t Adesc = {}, Bdesc = {}, Cdesc = {};
  cublasLtMatmulPreferenceOpaque_t preference = {};

  int returnedResults = 0;
  cublasLtMatmulHeuristicResult_t heuristicResult = {};
  cublasLtEpilogue_t epilogue = CUBLASLT_EPILOGUE_DEFAULT;

  // Create operation descriptor; see cublasLtMatmulDescAttributes_t
  // for details about defaults; here we just set the transforms for
  // A and B.
  status = cublasLtMatmulDescInit(&operationDesc, CUBLAS_COMPUTE_32F, CUDA_R_32F);
  if (status != CUBLAS_STATUS_SUCCESS) goto CLEANUP;
  status = cublasLtMatmulDescSetAttribute(&operationDesc, CUBLASLT_MATMUL_DESC_TRANSA, &transa, sizeof(transa));
  if (status != CUBLAS_STATUS_SUCCESS) goto CLEANUP;
  status = cublasLtMatmulDescSetAttribute(&operationDesc, CUBLASLT_MATMUL_DESC_TRANSB, &transb, sizeof(transb));
  if (status != CUBLAS_STATUS_SUCCESS) goto CLEANUP;

  if (use_bias) {
    status = cublasLtMatmulDescSetAttribute(&operationDesc, CUBLASLT_MATMUL_DESC_BIAS_POINTER, &bgrad, sizeof(bgrad));
    if (status != CUBLAS_STATUS_SUCCESS) {
      goto CLEANUP;
    }
    epilogue = CUBLASLT_EPILOGUE_BGRADB;
  }

  status = cublasLtMatmulDescSetAttribute(&operationDesc, CUBLASLT_MATMUL_DESC_EPILOGUE, &epilogue, sizeof(epilogue));
  if (status != CUBLAS_STATUS_SUCCESS) {
    goto CLEANUP;
  }

  // Create matrix descriptors. Not setting any extra attributes.
  status = cublasLtMatrixLayoutInit(
      &Adesc, CUDA_R_16F, transa == CUBLAS_OP_N ? m : k, transa == CUBLAS_OP_N ? k : m, lda);
  if (status != CUBLAS_STATUS_SUCCESS) goto CLEANUP;
  status = cublasLtMatrixLayoutInit(
      &Bdesc, CUDA_R_16F, transb == CUBLAS_OP_N ? k : n, transb == CUBLAS_OP_N ? n : k, ldb);
  if (status != CUBLAS_STATUS_SUCCESS) goto CLEANUP;
  status = cublasLtMatrixLayoutInit(&Cdesc, CUDA_R_16F, m, n, ldc);
  if (status != CUBLAS_STATUS_SUCCESS) goto CLEANUP;

  // Create preference handle; in general, extra attributes can be
  // used here to disable tensor ops or to make sure the algo selected
  // will work with badly aligned A, B, C. However, for simplicity
  // here we assume A, B, C are always well aligned (e.g., directly
  // come from cudaMalloc).
  status = cublasLtMatmulPreferenceInit(&preference);
  if (status != CUBLAS_STATUS_SUCCESS) goto CLEANUP;
  status = cublasLtMatmulPreferenceSetAttribute(
      &preference, CUBLASLT_MATMUL_PREF_MAX_WORKSPACE_BYTES, &workspaceSize, sizeof(workspaceSize));
  if (status != CUBLAS_STATUS_SUCCESS) goto CLEANUP;

  // We just need the best available heuristic to try and run matmul.
  // There is no guarantee that this will work. For example, if A is
  // badly aligned, you can request more (e.g. 32) algos and try to
  // run them one by one until something works.
  status = cublasLtMatmulAlgoGetHeuristic(
      ltHandle, &operationDesc, &Adesc, &Bdesc, &Cdesc, &Cdesc, &preference, 1, &heuristicResult, &returnedResults);
  if (status != CUBLAS_STATUS_SUCCESS) goto CLEANUP;

  if (returnedResults == 0) {
    status = CUBLAS_STATUS_NOT_SUPPORTED;
    goto CLEANUP;
  }

  status = cublasLtMatmul(ltHandle, &operationDesc,
                          alpha, A, &Adesc, B, &Bdesc,
                          beta, C, &Cdesc, C, &Cdesc,
                          //&heuristicResult.algo,
                          NULL,
                          workspace, workspaceSize, stream);

CLEANUP:
  // Descriptors are no longer needed as all GPU work was already
  // enqueued.
  return status == CUBLAS_STATUS_SUCCESS ? 0 : 1;
}

int gemm_bgradb_lt(
    cublasLtHandle_t ltHandle,
    cublasOperation_t transa, cublasOperation_t transb,
    int m, int n, int k,
    const float *alpha, /* host pointer */
    at::BFloat16* A, int lda,
    at::BFloat16* B, int ldb,
    const float *beta, /* host pointer */
    at::BFloat16* C, int ldc,
    void *workspace, size_t workspaceSize,
    cudaStream_t stream,
    bool use_bias,
    const void* bgrad) {
  cublasStatus_t status = CUBLAS_STATUS_SUCCESS;

  cublasLtMatmulDescOpaque_t operationDesc = {};
  cublasLtMatrixLayoutOpaque_t Adesc = {}, Bdesc = {}, Cdesc = {};
  cublasLtMatmulPreferenceOpaque_t preference = {};

  int returnedResults = 0;
  cublasLtMatmulHeuristicResult_t heuristicResult = {};
  cublasLtEpilogue_t epilogue = CUBLASLT_EPILOGUE_DEFAULT;

  // Create operation descriptor; see cublasLtMatmulDescAttributes_t
  // for details about defaults; here we just set the transforms for
  // A and B.
  status = cublasLtMatmulDescInit(&operationDesc, CUBLAS_COMPUTE_32F, CUDA_R_32F);
  if (status != CUBLAS_STATUS_SUCCESS) goto CLEANUP;
  status = cublasLtMatmulDescSetAttribute(&operationDesc, CUBLASLT_MATMUL_DESC_TRANSA, &transa, sizeof(transa));
  if (status != CUBLAS_STATUS_SUCCESS) goto CLEANUP;
  status = cublasLtMatmulDescSetAttribute(&operationDesc, CUBLASLT_MATMUL_DESC_TRANSB, &transb, sizeof(transb));
  if (status != CUBLAS_STATUS_SUCCESS) goto CLEANUP;

  if (use_bias) {
    status = cublasLtMatmulDescSetAttribute(&operationDesc, CUBLASLT_MATMUL_DESC_BIAS_POINTER, &bgrad, sizeof(bgrad));
    if (status != CUBLAS_STATUS_SUCCESS) {
      goto CLEANUP;
    }
    epilogue = CUBLASLT_EPILOGUE_BGRADB;
  }

  status = cublasLtMatmulDescSetAttribute(&operationDesc, CUBLASLT_MATMUL_DESC_EPILOGUE, &epilogue, sizeof(epilogue));
  if (status != CUBLAS_STATUS_SUCCESS) {
    goto CLEANUP;
  }

  // Create matrix descriptors. Not setting any extra attributes.
  status = cublasLtMatrixLayoutInit(
      &Adesc, CUDA_R_16BF, transa == CUBLAS_OP_N ? m : k, transa == CUBLAS_OP_N ? k : m, lda);
  if (status != CUBLAS_STATUS_SUCCESS) goto CLEANUP;
  status = cublasLtMatrixLayoutInit(
      &Bdesc, CUDA_R_16BF, transb == CUBLAS_OP_N ? k : n, transb == CUBLAS_OP_N ? n : k, ldb);
  if (status != CUBLAS_STATUS_SUCCESS) goto CLEANUP;
  status = cublasLtMatrixLayoutInit(&Cdesc, CUDA_R_16BF, m, n, ldc);
  if (status != CUBLAS_STATUS_SUCCESS) goto CLEANUP;

  // Create preference handle; in general, extra attributes can be
  // used here to disable tensor ops or to make sure the algo selected
  // will work with badly aligned A, B, C. However, for simplicity
  // here we assume A, B, C are always well aligned (e.g., directly
  // come from cudaMalloc).
  status = cublasLtMatmulPreferenceInit(&preference);
  if (status != CUBLAS_STATUS_SUCCESS) goto CLEANUP;
  status = cublasLtMatmulPreferenceSetAttribute(
      &preference, CUBLASLT_MATMUL_PREF_MAX_WORKSPACE_BYTES, &workspaceSize, sizeof(workspaceSize));
  if (status != CUBLAS_STATUS_SUCCESS) goto CLEANUP;

  // We just need the best available heuristic to try and run matmul.
  // There is no guarantee that this will work. For example, if A is
  // badly aligned, you can request more (e.g. 32) algos and try to
  // run them one by one until something works.
  status = cublasLtMatmulAlgoGetHeuristic(
      ltHandle, &operationDesc, &Adesc, &Bdesc, &Cdesc, &Cdesc, &preference, 1, &heuristicResult, &returnedResults);
  if (status != CUBLAS_STATUS_SUCCESS) goto CLEANUP;

  if (returnedResults == 0) {
    status = CUBLAS_STATUS_NOT_SUPPORTED;
    goto CLEANUP;
  }

  status = cublasLtMatmul(ltHandle, &operationDesc,
                          alpha, A, &Adesc, B, &Bdesc,
                          beta, C, &Cdesc, C, &Cdesc,
                          //&heuristicResult.algo,
                          NULL,
                          workspace, workspaceSize, stream);

CLEANUP:
  // Descriptors are no longer needed as all GPU work was already
  // enqueued.
  return status == CUBLAS_STATUS_SUCCESS ? 0 : 1;
}

// The FP64 path is not implemented with cublasLt; report failure so the
// caller falls back to the plain cublas wrapper.
int gemm_bgradb_lt(
    cublasLtHandle_t ltHandle,
    cublasOperation_t transa, cublasOperation_t transb,
    int m, int n, int k,
    const float *alpha, /* host pointer */
    double* A, int lda,
    double* B, int ldb,
    const float *beta, /* host pointer */
    double* C, int ldc,
    void *workspace, size_t workspaceSize,
    cudaStream_t stream,
    bool use_bias,
    const void* bgrad) {
  return 1;
}

int gemm_bgradb_lt(
    cublasLtHandle_t ltHandle,
    cublasOperation_t transa, cublasOperation_t transb,
    int m, int n, int k,
    const float *alpha, /* host pointer */
    float *A, int lda,
    float *B, int ldb,
    const float *beta, /* host pointer */
    float *C, int ldc,
    void *workspace, size_t workspaceSize,
    cudaStream_t stream,
    bool use_bias,
    const void* bgrad) {
  cublasStatus_t status = CUBLAS_STATUS_SUCCESS;

  cublasLtMatmulDescOpaque_t operationDesc = {};
  cublasLtMatrixLayoutOpaque_t Adesc = {}, Bdesc = {}, Cdesc = {};
  cublasLtMatmulPreferenceOpaque_t preference = {};

  int returnedResults = 0;
  cublasLtMatmulHeuristicResult_t heuristicResult = {};
  cublasLtEpilogue_t epilogue = CUBLASLT_EPILOGUE_DEFAULT;

  // Create operation descriptor; see cublasLtMatmulDescAttributes_t
  // for details about defaults; here we just set the transforms for
  // A and B.
  status = cublasLtMatmulDescInit(&operationDesc, CUBLAS_COMPUTE_32F, CUDA_R_32F);
  if (status != CUBLAS_STATUS_SUCCESS) goto CLEANUP;
  status = cublasLtMatmulDescSetAttribute(&operationDesc, CUBLASLT_MATMUL_DESC_TRANSA, &transa, sizeof(transa));
  if (status != CUBLAS_STATUS_SUCCESS) goto CLEANUP;
  status = cublasLtMatmulDescSetAttribute(&operationDesc, CUBLASLT_MATMUL_DESC_TRANSB, &transb, sizeof(transb));
  if (status != CUBLAS_STATUS_SUCCESS) goto CLEANUP;

  if (use_bias) {
    status = cublasLtMatmulDescSetAttribute(&operationDesc, CUBLASLT_MATMUL_DESC_BIAS_POINTER, &bgrad, sizeof(bgrad));
    if (status != CUBLAS_STATUS_SUCCESS) {
      goto CLEANUP;
    }
    epilogue = CUBLASLT_EPILOGUE_BGRADB;
  }

  status = cublasLtMatmulDescSetAttribute(&operationDesc, CUBLASLT_MATMUL_DESC_EPILOGUE, &epilogue, sizeof(epilogue));
  if (status != CUBLAS_STATUS_SUCCESS) {
    goto CLEANUP;
  }

  // Create matrix descriptors. Not setting any extra attributes.
  status = cublasLtMatrixLayoutInit(
      &Adesc, CUDA_R_32F, transa == CUBLAS_OP_N ? m : k, transa == CUBLAS_OP_N ? k : m, lda);
  if (status != CUBLAS_STATUS_SUCCESS) goto CLEANUP;
  status = cublasLtMatrixLayoutInit(
      &Bdesc, CUDA_R_32F, transb == CUBLAS_OP_N ? k : n, transb == CUBLAS_OP_N ? n : k, ldb);
  if (status != CUBLAS_STATUS_SUCCESS) goto CLEANUP;
  status = cublasLtMatrixLayoutInit(&Cdesc, CUDA_R_32F, m, n, ldc);
  if (status != CUBLAS_STATUS_SUCCESS) goto CLEANUP;

  // Create preference handle; in general, extra attributes can be
  // used here to disable tensor ops or to make sure the algo selected
  // will work with badly aligned A, B, C. However, for simplicity
  // here we assume A, B, C are always well aligned (e.g., directly
  // come from cudaMalloc).
  status = cublasLtMatmulPreferenceInit(&preference);
  if (status != CUBLAS_STATUS_SUCCESS) goto CLEANUP;
  status = cublasLtMatmulPreferenceSetAttribute(
      &preference, CUBLASLT_MATMUL_PREF_MAX_WORKSPACE_BYTES, &workspaceSize, sizeof(workspaceSize));
  if (status != CUBLAS_STATUS_SUCCESS) goto CLEANUP;

  // We just need the best available heuristic to try and run matmul.
  // There is no guarantee that this will work. For example, if A is
  // badly aligned, you can request more (e.g. 32) algos and try to
  // run them one by one until something works.
  status = cublasLtMatmulAlgoGetHeuristic(
      ltHandle, &operationDesc, &Adesc, &Bdesc, &Cdesc, &Cdesc, &preference, 1, &heuristicResult, &returnedResults);
  if (status != CUBLAS_STATUS_SUCCESS) goto CLEANUP;

  if (returnedResults == 0) {
    status = CUBLAS_STATUS_NOT_SUPPORTED;
    goto CLEANUP;
  }

  status = cublasLtMatmul(ltHandle, &operationDesc,
                          alpha, A, &Adesc, B, &Bdesc,
                          beta, C, &Cdesc, C, &Cdesc,
                          &heuristicResult.algo,
                          workspace, workspaceSize, stream);

CLEANUP:
  // Descriptors are no longer needed as all GPU work was already
  // enqueued.
  return status == CUBLAS_STATUS_SUCCESS ? 0 : 1;
}
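// [Editorial note, not part of the original file] CUBLASLT_EPILOGUE_BGRADB,
// used by the gemm_bgradb_lt overloads above, asks cublasLt to reduce matrix
// B into the vector set via CUBLASLT_MATMUL_DESC_BIAS_POINTER alongside the
// GEMM itself. In the backward passes below, B is d_output, so the fused
// reduction is exactly the bias gradient: d_bias = sum over the batch of
// d_output.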
int gemm_dgelu_bgradb_lt(
    cublasLtHandle_t ltHandle,
    cublasOperation_t transa, cublasOperation_t transb,
    int m, int n, int k,
    const float *alpha, /* host pointer */
    at::Half* A, int lda,
    at::Half* B, int ldb,
    const float *beta, /* host pointer */
    at::Half* C, int64_t ldc,
    void *workspace, size_t workspaceSize,
    cudaStream_t stream,
    const void *gelu_in,
    const void *bgrad) {
  cublasStatus_t status = CUBLAS_STATUS_SUCCESS;

  cublasLtMatmulDescOpaque_t operationDesc = {};
  cublasLtMatrixLayoutOpaque_t Adesc = {}, Bdesc = {}, Cdesc = {};
  cublasLtMatmulPreferenceOpaque_t preference = {};

  int returnedResults = 0;
  cublasLtMatmulHeuristicResult_t heuristicResult = {};
  cublasLtEpilogue_t epilogue = CUBLASLT_EPILOGUE_DGELU_BGRAD;

  // Create operation descriptor; see cublasLtMatmulDescAttributes_t
  // for details about defaults; here we just set the transforms for
  // A and B.
  status = cublasLtMatmulDescInit(&operationDesc, CUBLAS_COMPUTE_32F, CUDA_R_32F);
  if (status != CUBLAS_STATUS_SUCCESS) goto CLEANUP;
  status = cublasLtMatmulDescSetAttribute(&operationDesc, CUBLASLT_MATMUL_DESC_TRANSA, &transa, sizeof(transa));
  if (status != CUBLAS_STATUS_SUCCESS) goto CLEANUP;
  status = cublasLtMatmulDescSetAttribute(&operationDesc, CUBLASLT_MATMUL_DESC_TRANSB, &transb, sizeof(transb));
  if (status != CUBLAS_STATUS_SUCCESS) goto CLEANUP;
  status = cublasLtMatmulDescSetAttribute(&operationDesc, CUBLASLT_MATMUL_DESC_BIAS_POINTER, &bgrad, sizeof(bgrad));
  if (status != CUBLAS_STATUS_SUCCESS) {
    goto CLEANUP;
  }
  status = cublasLtMatmulDescSetAttribute(&operationDesc, CUBLASLT_MATMUL_DESC_EPILOGUE_AUX_POINTER, &gelu_in, sizeof(gelu_in));
  if (status != CUBLAS_STATUS_SUCCESS) {
    goto CLEANUP;
  }
  status = cublasLtMatmulDescSetAttribute(&operationDesc, CUBLASLT_MATMUL_DESC_EPILOGUE_AUX_LD, &ldc, sizeof(ldc));
  if (status != CUBLAS_STATUS_SUCCESS) goto CLEANUP;
  status = cublasLtMatmulDescSetAttribute(&operationDesc, CUBLASLT_MATMUL_DESC_EPILOGUE, &epilogue, sizeof(epilogue));
  if (status != CUBLAS_STATUS_SUCCESS) {
    goto CLEANUP;
  }

  // Create matrix descriptors. Not setting any extra attributes.
  status = cublasLtMatrixLayoutInit(
      &Adesc, CUDA_R_16F, transa == CUBLAS_OP_N ? m : k, transa == CUBLAS_OP_N ? k : m, lda);
  if (status != CUBLAS_STATUS_SUCCESS) goto CLEANUP;
  status = cublasLtMatrixLayoutInit(
      &Bdesc, CUDA_R_16F, transb == CUBLAS_OP_N ? k : n, transb == CUBLAS_OP_N ? n : k, ldb);
  if (status != CUBLAS_STATUS_SUCCESS) goto CLEANUP;
  status = cublasLtMatrixLayoutInit(&Cdesc, CUDA_R_16F, m, n, ldc);
  if (status != CUBLAS_STATUS_SUCCESS) goto CLEANUP;

  // Create preference handle; in general, extra attributes can be
  // used here to disable tensor ops or to make sure the algo selected
  // will work with badly aligned A, B, C. However, for simplicity
  // here we assume A, B, C are always well aligned (e.g., directly
  // come from cudaMalloc).
  status = cublasLtMatmulPreferenceInit(&preference);
  if (status != CUBLAS_STATUS_SUCCESS) goto CLEANUP;
  status = cublasLtMatmulPreferenceSetAttribute(
      &preference, CUBLASLT_MATMUL_PREF_MAX_WORKSPACE_BYTES, &workspaceSize, sizeof(workspaceSize));
  if (status != CUBLAS_STATUS_SUCCESS) goto CLEANUP;

  // We just need the best available heuristic to try and run matmul.
  // There is no guarantee that this will work. For example, if A is
  // badly aligned, you can request more (e.g. 32) algos and try to
  // run them one by one until something works.
  status = cublasLtMatmulAlgoGetHeuristic(
      ltHandle, &operationDesc, &Adesc, &Bdesc, &Cdesc, &Cdesc, &preference, 1, &heuristicResult, &returnedResults);
  if (status != CUBLAS_STATUS_SUCCESS) goto CLEANUP;

  if (returnedResults == 0) {
    status = CUBLAS_STATUS_NOT_SUPPORTED;
    goto CLEANUP;
  }

  status = cublasLtMatmul(ltHandle, &operationDesc,
                          alpha, A, &Adesc, B, &Bdesc,
                          beta, C, &Cdesc, C, &Cdesc,
                          //&heuristicResult.algo,
                          NULL,
                          workspace, workspaceSize, stream);

CLEANUP:
  // Descriptors are no longer needed as all GPU work was already
  // enqueued.
  return status == CUBLAS_STATUS_SUCCESS ? 0 : 1;
}

int gemm_dgelu_bgradb_lt(
    cublasLtHandle_t ltHandle,
    cublasOperation_t transa, cublasOperation_t transb,
    int m, int n, int k,
    const float *alpha, /* host pointer */
    at::BFloat16* A, int lda,
    at::BFloat16* B, int ldb,
    const float *beta, /* host pointer */
    at::BFloat16* C, int64_t ldc,
    void *workspace, size_t workspaceSize,
    cudaStream_t stream,
    const void *gelu_in,
    const void *bgrad) {
  cublasStatus_t status = CUBLAS_STATUS_SUCCESS;

  cublasLtMatmulDescOpaque_t operationDesc = {};
  cublasLtMatrixLayoutOpaque_t Adesc = {}, Bdesc = {}, Cdesc = {};
  cublasLtMatmulPreferenceOpaque_t preference = {};

  int returnedResults = 0;
  cublasLtMatmulHeuristicResult_t heuristicResult = {};
  cublasLtEpilogue_t epilogue = CUBLASLT_EPILOGUE_DGELU_BGRAD;

  // Create operation descriptor; see cublasLtMatmulDescAttributes_t
  // for details about defaults; here we just set the transforms for
  // A and B.
  status = cublasLtMatmulDescInit(&operationDesc, CUBLAS_COMPUTE_32F, CUDA_R_32F);
  if (status != CUBLAS_STATUS_SUCCESS) goto CLEANUP;
  status = cublasLtMatmulDescSetAttribute(&operationDesc, CUBLASLT_MATMUL_DESC_TRANSA, &transa, sizeof(transa));
  if (status != CUBLAS_STATUS_SUCCESS) goto CLEANUP;
  status = cublasLtMatmulDescSetAttribute(&operationDesc, CUBLASLT_MATMUL_DESC_TRANSB, &transb, sizeof(transb));
  if (status != CUBLAS_STATUS_SUCCESS) goto CLEANUP;
  status = cublasLtMatmulDescSetAttribute(&operationDesc, CUBLASLT_MATMUL_DESC_BIAS_POINTER, &bgrad, sizeof(bgrad));
  if (status != CUBLAS_STATUS_SUCCESS) {
    goto CLEANUP;
  }
  status = cublasLtMatmulDescSetAttribute(&operationDesc, CUBLASLT_MATMUL_DESC_EPILOGUE_AUX_POINTER, &gelu_in, sizeof(gelu_in));
  if (status != CUBLAS_STATUS_SUCCESS) {
    goto CLEANUP;
  }
  status = cublasLtMatmulDescSetAttribute(&operationDesc, CUBLASLT_MATMUL_DESC_EPILOGUE_AUX_LD, &ldc, sizeof(ldc));
  if (status != CUBLAS_STATUS_SUCCESS) goto CLEANUP;
  status = cublasLtMatmulDescSetAttribute(&operationDesc, CUBLASLT_MATMUL_DESC_EPILOGUE, &epilogue, sizeof(epilogue));
  if (status != CUBLAS_STATUS_SUCCESS) {
    goto CLEANUP;
  }

  // Create matrix descriptors. Not setting any extra attributes.
  status = cublasLtMatrixLayoutInit(
      &Adesc, CUDA_R_16BF, transa == CUBLAS_OP_N ? m : k, transa == CUBLAS_OP_N ? k : m, lda);
  if (status != CUBLAS_STATUS_SUCCESS) goto CLEANUP;
  status = cublasLtMatrixLayoutInit(
      &Bdesc, CUDA_R_16BF, transb == CUBLAS_OP_N ? k : n, transb == CUBLAS_OP_N ? n : k, ldb);
  if (status != CUBLAS_STATUS_SUCCESS) goto CLEANUP;
  status = cublasLtMatrixLayoutInit(&Cdesc, CUDA_R_16BF, m, n, ldc);
  if (status != CUBLAS_STATUS_SUCCESS) goto CLEANUP;

  // Create preference handle; in general, extra attributes can be
  // used here to disable tensor ops or to make sure the algo selected
  // will work with badly aligned A, B, C. However, for simplicity
  // here we assume A, B, C are always well aligned (e.g., directly
  // come from cudaMalloc).
  status = cublasLtMatmulPreferenceInit(&preference);
  if (status != CUBLAS_STATUS_SUCCESS) goto CLEANUP;
  status = cublasLtMatmulPreferenceSetAttribute(
      &preference, CUBLASLT_MATMUL_PREF_MAX_WORKSPACE_BYTES, &workspaceSize, sizeof(workspaceSize));
  if (status != CUBLAS_STATUS_SUCCESS) goto CLEANUP;

  // We just need the best available heuristic to try and run matmul.
  // There is no guarantee that this will work. For example, if A is
  // badly aligned, you can request more (e.g. 32) algos and try to
  // run them one by one until something works.
  status = cublasLtMatmulAlgoGetHeuristic(
      ltHandle, &operationDesc, &Adesc, &Bdesc, &Cdesc, &Cdesc, &preference, 1, &heuristicResult, &returnedResults);
  if (status != CUBLAS_STATUS_SUCCESS) goto CLEANUP;

  if (returnedResults == 0) {
    status = CUBLAS_STATUS_NOT_SUPPORTED;
    goto CLEANUP;
  }

  status = cublasLtMatmul(ltHandle, &operationDesc,
                          alpha, A, &Adesc, B, &Bdesc,
                          beta, C, &Cdesc, C, &Cdesc,
                          //&heuristicResult.algo,
                          NULL,
                          workspace, workspaceSize, stream);

CLEANUP:
  // Descriptors are no longer needed as all GPU work was already
  // enqueued.
  return status == CUBLAS_STATUS_SUCCESS ? 0 : 1;
}

// The FP64 path is not implemented with cublasLt; report failure so the
// caller falls back to the plain cublas wrappers.
int gemm_dgelu_bgradb_lt(
    cublasLtHandle_t ltHandle,
    cublasOperation_t transa, cublasOperation_t transb,
    int m, int n, int k,
    const float *alpha, /* host pointer */
    double *A, int lda,
    double *B, int ldb,
    const float *beta, /* host pointer */
    double *C, int ldc,
    void *workspace, size_t workspaceSize,
    cudaStream_t stream,
    const void *gelu_in,
    const void *bgrad) {
  return 1;
}

int gemm_dgelu_bgradb_lt(
    cublasLtHandle_t ltHandle,
    cublasOperation_t transa, cublasOperation_t transb,
    int m, int n, int k,
    const float *alpha, /* host pointer */
    float *A, int lda,
    float *B, int ldb,
    const float *beta, /* host pointer */
    float *C, int64_t ldc,
    void *workspace, size_t workspaceSize,
    cudaStream_t stream,
    const void *gelu_in,
    const void *bgrad) {
  cublasStatus_t status = CUBLAS_STATUS_SUCCESS;

  cublasLtMatmulDescOpaque_t operationDesc = {};
  cublasLtMatrixLayoutOpaque_t Adesc = {}, Bdesc = {}, Cdesc = {};
  cublasLtMatmulPreferenceOpaque_t preference = {};

  int returnedResults = 0;
  cublasLtMatmulHeuristicResult_t heuristicResult = {};
  cublasLtEpilogue_t epilogue = CUBLASLT_EPILOGUE_DGELU_BGRAD;

  // Create operation descriptor; see cublasLtMatmulDescAttributes_t
  // for details about defaults; here we just set the transforms for
  // A and B.
  status = cublasLtMatmulDescInit(&operationDesc, CUBLAS_COMPUTE_32F, CUDA_R_32F);
  if (status != CUBLAS_STATUS_SUCCESS) goto CLEANUP;
  status = cublasLtMatmulDescSetAttribute(&operationDesc, CUBLASLT_MATMUL_DESC_TRANSA, &transa, sizeof(transa));
  if (status != CUBLAS_STATUS_SUCCESS) goto CLEANUP;
  status = cublasLtMatmulDescSetAttribute(&operationDesc, CUBLASLT_MATMUL_DESC_TRANSB, &transb, sizeof(transb));
  if (status != CUBLAS_STATUS_SUCCESS) goto CLEANUP;
  status = cublasLtMatmulDescSetAttribute(&operationDesc, CUBLASLT_MATMUL_DESC_BIAS_POINTER, &bgrad, sizeof(bgrad));
  if (status != CUBLAS_STATUS_SUCCESS) {
    goto CLEANUP;
  }
  status = cublasLtMatmulDescSetAttribute(&operationDesc, CUBLASLT_MATMUL_DESC_EPILOGUE_AUX_POINTER, &gelu_in, sizeof(gelu_in));
  if (status != CUBLAS_STATUS_SUCCESS) {
    goto CLEANUP;
  }
  status = cublasLtMatmulDescSetAttribute(&operationDesc, CUBLASLT_MATMUL_DESC_EPILOGUE_AUX_LD, &ldc, sizeof(ldc));
  if (status != CUBLAS_STATUS_SUCCESS) goto CLEANUP;
  status = cublasLtMatmulDescSetAttribute(&operationDesc, CUBLASLT_MATMUL_DESC_EPILOGUE, &epilogue, sizeof(epilogue));
  if (status != CUBLAS_STATUS_SUCCESS) {
    goto CLEANUP;
  }

  // Create matrix descriptors. Not setting any extra attributes.
  status = cublasLtMatrixLayoutInit(
      &Adesc, CUDA_R_32F, transa == CUBLAS_OP_N ? m : k, transa == CUBLAS_OP_N ? k : m, lda);
  if (status != CUBLAS_STATUS_SUCCESS) goto CLEANUP;
  status = cublasLtMatrixLayoutInit(
      &Bdesc, CUDA_R_32F, transb == CUBLAS_OP_N ? k : n, transb == CUBLAS_OP_N ? n : k, ldb);
  if (status != CUBLAS_STATUS_SUCCESS) goto CLEANUP;
  status = cublasLtMatrixLayoutInit(&Cdesc, CUDA_R_32F, m, n, ldc);
  if (status != CUBLAS_STATUS_SUCCESS) goto CLEANUP;

  // Create preference handle; in general, extra attributes can be
  // used here to disable tensor ops or to make sure the algo selected
  // will work with badly aligned A, B, C. However, for simplicity
  // here we assume A, B, C are always well aligned (e.g., directly
  // come from cudaMalloc).
  status = cublasLtMatmulPreferenceInit(&preference);
  if (status != CUBLAS_STATUS_SUCCESS) goto CLEANUP;
  status = cublasLtMatmulPreferenceSetAttribute(
      &preference, CUBLASLT_MATMUL_PREF_MAX_WORKSPACE_BYTES, &workspaceSize, sizeof(workspaceSize));
  if (status != CUBLAS_STATUS_SUCCESS) goto CLEANUP;

  // We just need the best available heuristic to try and run matmul.
  // There is no guarantee that this will work. For example, if A is
  // badly aligned, you can request more (e.g. 32) algos and try to
  // run them one by one until something works.
  status = cublasLtMatmulAlgoGetHeuristic(
      ltHandle, &operationDesc, &Adesc, &Bdesc, &Cdesc, &Cdesc, &preference, 1, &heuristicResult, &returnedResults);
  if (status != CUBLAS_STATUS_SUCCESS) goto CLEANUP;

  if (returnedResults == 0) {
    status = CUBLAS_STATUS_NOT_SUPPORTED;
    goto CLEANUP;
  }

  status = cublasLtMatmul(ltHandle, &operationDesc,
                          alpha, A, &Adesc, B, &Bdesc,
                          beta, C, &Cdesc, C, &Cdesc,
                          //&heuristicResult.algo,
                          NULL,
                          workspace, workspaceSize, stream);

CLEANUP:
  // Descriptors are no longer needed as all GPU work was already
  // enqueued.
  return status == CUBLAS_STATUS_SUCCESS ? 0 : 1;
}

#endif

template <typename T>
int linear_bias_forward_cuda(at::Tensor input, T *weight, at::Tensor bias,
                             int in_features, int batch_size, int out_features,
                             at::Tensor output, void *lt_workspace) {
  cublasHandle_t handle = at::cuda::getCurrentCUDABlasHandle();
  // Get the stream from the cublas handle to reuse for the cublasLt calls.
  cudaStream_t stream;
  cublasGetStream(handle, &stream);
  const float alpha = 1.0;
  const float beta_zero = 0.0;
  const float beta_one = 1.0;
  int status = 1;
#if defined(CUBLAS_VERSION) && CUBLAS_VERSION >= 11600
  status = gemm_bias_lt(
      (cublasLtHandle_t)handle,
      CUBLAS_OP_T, CUBLAS_OP_N,
      out_features, batch_size, in_features,
      &alpha, /* host pointer */
      weight, in_features,
      input.data_ptr<T>(), in_features,
      &beta_zero, /* host pointer */
      output.data_ptr<T>(), out_features,
      lt_workspace,
      1 << 22,
      stream,
      true,
      static_cast<const void*>(bias.data_ptr()));
#endif
  if (status != 0) {
    // Fall back to plain cublas: broadcast the bias into the output and
    // accumulate the GEMM on top of it (beta = 1).
    output.copy_(bias);
    status = gemm_bias(
        handle,
        CUBLAS_OP_T, CUBLAS_OP_N,
        out_features, batch_size, in_features,
        &alpha,
        weight, in_features,
        input.data_ptr<T>(), in_features,
        &beta_one,
        output.data_ptr<T>(), out_features);
  }
  return status;
}
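// [Editorial example, not part of the original file] A sketch of how a host
// wrapper might dispatch to linear_bias_forward_cuda over the supported
// dtypes; the real extension keeps such glue in a separate host file, and
// the function name, shape handling, and workspace tensor here are
// illustrative assumptions only (input is assumed contiguous and 2-D):
at::Tensor fused_dense_forward_example(at::Tensor input, at::Tensor weight, at::Tensor bias) {
  const int batch_size = input.size(0);
  const int in_features = input.size(1);
  const int out_features = weight.size(0);
  auto output = at::empty({batch_size, out_features}, input.options());
  // 4 MiB scratch buffer, matching the `1 << 22` workspaceSize passed to
  // the cublasLt paths above.
  auto lt_workspace = at::empty({1 << 22}, input.options().dtype(at::kByte));
  AT_DISPATCH_FLOATING_TYPES_AND2(
      at::ScalarType::Half, at::ScalarType::BFloat16,
      input.scalar_type(), "fused_dense_forward_example", [&] {
        const int status = linear_bias_forward_cuda<scalar_t>(
            input, weight.data_ptr<scalar_t>(), bias,
            in_features, batch_size, out_features,
            output, lt_workspace.data_ptr());
        TORCH_CHECK(status == 0, "linear_bias_forward_cuda failed");
      });
  return output;
}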
template <typename T>
int linear_bias_backward_cuda(T *input, T *weight, T *d_output,
                              int in_features, int batch_size, int out_features,
                              T *d_weight, T *d_bias, T *d_input, void *lt_workspace) {
  cublasHandle_t handle = at::cuda::getCurrentCUDABlasHandle();
  // Get the stream from the cublas handle to reuse for the cublasLt calls.
  cudaStream_t stream;
  cublasGetStream(handle, &stream);
  const float alpha = 1.0;
  const float beta_zero = 0.0;
  int status = 1;
#if defined(CUBLAS_VERSION) && CUBLAS_VERSION >= 11600
  // wgrad (+ fused bgrad) in a single Lt call.
  status = gemm_bgradb_lt(
      (cublasLtHandle_t)handle,
      CUBLAS_OP_N, CUBLAS_OP_T,
      in_features, out_features, batch_size,
      &alpha, /* host pointer */
      input, in_features,
      d_output, out_features,
      &beta_zero, /* host pointer */
      d_weight, in_features,
      lt_workspace,
      1 << 22,
      stream,
      true,
      static_cast<const void*>(d_bias));
#endif
  if (status != 0) {
    // NOTE: this fallback computes d_weight only; without the BGRADB
    // epilogue, d_bias is not produced here.
    status = gemm_bias(
        handle,
        CUBLAS_OP_N, CUBLAS_OP_T,
        in_features, out_features, batch_size,
        &alpha,
        input, in_features,
        d_output, out_features,
        &beta_zero,
        d_weight, in_features);
  }

  // dgrad
  status = gemm_bias(
      handle,
      CUBLAS_OP_N, CUBLAS_OP_N,
      in_features, batch_size, out_features,
      &alpha,
      weight, in_features,
      d_output, out_features,
      &beta_zero,
      d_input, in_features);
  return status;
}

template <typename T>
int linear_gelu_linear_forward_cuda(T *input, T *weight1, T *bias1, T *weight2, T *bias2,
                                    int in_features, int hidden_features, int batch_size, int out_features,
                                    T *output1, T *output2, T *gelu_in, void *lt_workspace) {
  cublasHandle_t handle = at::cuda::getCurrentCUDABlasHandle();
  // Get the stream from the cublas handle to reuse for the cublasLt calls.
  cudaStream_t stream;
  cublasGetStream(handle, &stream);
  const float alpha = 1.0;
  const float beta_zero = 0.0;
  int status = 1;
#if defined(CUBLAS_VERSION) && CUBLAS_VERSION >= 11600
  // GEMM1: output1 = GELU(input * weight1^T + bias1), stashing the pre-GELU
  // values in gelu_in for the backward pass.
  status = gemm_bias_gelu_lt(
      (cublasLtHandle_t)handle,
      CUBLAS_OP_T, CUBLAS_OP_N,
      hidden_features, batch_size, in_features,
      &alpha, /* host pointer */
      weight1, in_features,
      input, in_features,
      &beta_zero, /* host pointer */
      output1, hidden_features,
      lt_workspace,
      1 << 22,
      stream,
      true,
      static_cast<const void*>(gelu_in),
      static_cast<const void*>(bias1));
  // GEMM2: output2 = output1 * weight2^T + bias2.
  status = gemm_bias_lt(
      (cublasLtHandle_t)handle,
      CUBLAS_OP_T, CUBLAS_OP_N,
      out_features, batch_size, hidden_features,
      &alpha, /* host pointer */
      weight2, hidden_features,
      output1, hidden_features,
      &beta_zero, /* host pointer */
      output2, out_features,
      lt_workspace,
      1 << 22,
      stream,
      true,
      static_cast<const void*>(bias2));
  return status;
#else
  return 1;
#endif
}

template <typename T>
int linear_gelu_linear_backward_cuda(T *input, T *gelu_in, T *output1, T *weight1, T *weight2,
                                     T *d_output1, T *d_output2,
                                     int in_features, int batch_size, int hidden_features, int out_features,
                                     T *d_weight1, T *d_weight2, T *d_bias1, T *d_bias2, T *d_input,
                                     void *lt_workspace) {
  cublasHandle_t handle = at::cuda::getCurrentCUDABlasHandle();
  // Get the stream from the cublas handle to reuse for the cublasLt calls.
  cudaStream_t stream;
  cublasGetStream(handle, &stream);
  const float alpha = 1.0;
  const float beta_zero = 0.0;
  int status = 1;
#if defined(CUBLAS_VERSION) && CUBLAS_VERSION >= 11600
  // wgrad (+ fused bgrad) for the second GEMM
  status = gemm_bgradb_lt(
      (cublasLtHandle_t)handle,
      CUBLAS_OP_N, CUBLAS_OP_T,
      hidden_features, out_features, batch_size,
      &alpha, /* host pointer */
      output1, hidden_features,
      d_output2, out_features,
      &beta_zero, /* host pointer */
      d_weight2, hidden_features,
      lt_workspace,
      1 << 22,
      stream,
      true,
      static_cast<const void*>(d_bias2));
  // dgrad for the second GEMM, with dGELU and bgrad1 fused in
  status = gemm_dgelu_bgradb_lt(
      (cublasLtHandle_t)handle,
      CUBLAS_OP_N, CUBLAS_OP_N,
      hidden_features, batch_size, out_features,
      &alpha, /* host pointer */
      weight2, hidden_features,
      d_output2, out_features,
      &beta_zero, /* host pointer */
      d_output1, hidden_features,
      lt_workspace,
      1 << 22,
      stream,
      static_cast<const void*>(gelu_in),
      static_cast<const void*>(d_bias1));
  // wgrad for the first GEMM
  status = gemm_bias(
      handle,
      CUBLAS_OP_N, CUBLAS_OP_T,
      in_features, hidden_features, batch_size,
      &alpha,
      input, in_features,
      d_output1, hidden_features,
      &beta_zero,
      d_weight1, in_features);
  // dgrad for the first GEMM
  status = gemm_bias(
      handle,
      CUBLAS_OP_N, CUBLAS_OP_N,
      in_features, batch_size, hidden_features,
      &alpha,
      weight1, in_features,
      d_output1, hidden_features,
      &beta_zero,
      d_input, in_features);
#endif
  return status;
}
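// Explicit template instantiations for the element types the extension
// dispatches on.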
template int linear_bias_forward_cuda<at::Half>(at::Tensor input, at::Half *weight, at::Tensor bias, int in_features, int batch_size, int out_features, at::Tensor output, void *lt_workspace);
template int linear_bias_forward_cuda<at::BFloat16>(at::Tensor input, at::BFloat16 *weight, at::Tensor bias, int in_features, int batch_size, int out_features, at::Tensor output, void *lt_workspace);
template int linear_bias_forward_cuda<float>(at::Tensor input, float *weight, at::Tensor bias, int in_features, int batch_size, int out_features, at::Tensor output, void *lt_workspace);
template int linear_bias_forward_cuda<double>(at::Tensor input, double *weight, at::Tensor bias, int in_features, int batch_size, int out_features, at::Tensor output, void *lt_workspace);

template int linear_bias_backward_cuda<at::Half>(at::Half *input, at::Half *weight, at::Half *d_output, int in_features, int batch_size, int out_features, at::Half *d_weight, at::Half *d_bias, at::Half *d_input, void *lt_workspace);
template int linear_bias_backward_cuda<at::BFloat16>(at::BFloat16 *input, at::BFloat16 *weight, at::BFloat16 *d_output, int in_features, int batch_size, int out_features, at::BFloat16 *d_weight, at::BFloat16 *d_bias, at::BFloat16 *d_input, void *lt_workspace);
template int linear_bias_backward_cuda<float>(float *input, float *weight, float *d_output, int in_features, int batch_size, int out_features, float *d_weight, float *d_bias, float *d_input, void *lt_workspace);
template int linear_bias_backward_cuda<double>(double *input, double *weight, double *d_output, int in_features, int batch_size, int out_features, double *d_weight, double *d_bias, double *d_input, void *lt_workspace);

template int linear_gelu_linear_forward_cuda<at::Half>(at::Half *input, at::Half *weight1, at::Half *bias1, at::Half *weight2, at::Half *bias2, int in_features, int hidden_features, int batch_size, int out_features, at::Half *output1, at::Half *output2, at::Half *gelu_in, void *lt_workspace);
template int linear_gelu_linear_forward_cuda<at::BFloat16>(at::BFloat16 *input, at::BFloat16 *weight1, at::BFloat16 *bias1, at::BFloat16 *weight2, at::BFloat16 *bias2, int in_features, int hidden_features, int batch_size, int out_features, at::BFloat16 *output1, at::BFloat16 *output2, at::BFloat16 *gelu_in, void *lt_workspace);
template int linear_gelu_linear_forward_cuda<float>(float *input, float *weight1, float *bias1, float *weight2, float *bias2, int in_features, int hidden_features, int batch_size, int out_features, float *output1, float *output2, float *gelu_in, void *lt_workspace);
template int linear_gelu_linear_forward_cuda<double>(double *input, double *weight1, double *bias1, double *weight2, double *bias2, int in_features, int hidden_features, int batch_size, int out_features, double *output1, double *output2, double *gelu_in, void *lt_workspace);

template int linear_gelu_linear_backward_cuda<at::Half>(at::Half *input, at::Half *gelu_in, at::Half *output1, at::Half *weight1, at::Half *weight2, at::Half *d_output1, at::Half *d_output2, int in_features, int batch_size, int hidden_features, int out_features, at::Half *d_weight1, at::Half *d_weight2, at::Half *d_bias1, at::Half *d_bias2, at::Half *d_input, void *lt_workspace);
template int linear_gelu_linear_backward_cuda<at::BFloat16>(at::BFloat16 *input, at::BFloat16 *gelu_in, at::BFloat16 *output1, at::BFloat16 *weight1, at::BFloat16 *weight2, at::BFloat16 *d_output1, at::BFloat16 *d_output2, int in_features, int batch_size, int hidden_features, int out_features, at::BFloat16 *d_weight1, at::BFloat16 *d_weight2, at::BFloat16 *d_bias1, at::BFloat16 *d_bias2, at::BFloat16 *d_input, void *lt_workspace);
template int linear_gelu_linear_backward_cuda<float>(float *input, float *gelu_in, float *output1, float *weight1, float *weight2, float *d_output1, float *d_output2, int in_features, int batch_size, int hidden_features, int out_features, float *d_weight1, float *d_weight2, float *d_bias1, float *d_bias2, float *d_input, void *lt_workspace);
template int linear_gelu_linear_backward_cuda<double>(double *input, double *gelu_in, double *output1, double *weight1, double *weight2, double *d_output1, double *d_output2, int in_features, int batch_size, int hidden_features, int out_features, double *d_weight1, double *d_weight2, double *d_bias1, double *d_bias2, double *d_input, void *lt_workspace);