Files
2025-04-02 21:44:17 -07:00

3479 lines
216 KiB
C

/*
* Copyright 1993-2021 NVIDIA Corporation. All rights reserved.
*
* NOTICE TO LICENSEE:
*
* This source code and/or documentation ("Licensed Deliverables") are
* subject to NVIDIA intellectual property rights under U.S. and
* international Copyright laws.
*
* These Licensed Deliverables contained herein is PROPRIETARY and
* CONFIDENTIAL to NVIDIA and is being provided under the terms and
* conditions of a form of NVIDIA software license agreement by and
* between NVIDIA and Licensee ("License Agreement") or electronically
* accepted by Licensee. Notwithstanding any terms or conditions to
* the contrary in the License Agreement, reproduction or disclosure
* of the Licensed Deliverables to any third party without the express
* written consent of NVIDIA is prohibited.
*
* NOTWITHSTANDING ANY TERMS OR CONDITIONS TO THE CONTRARY IN THE
* LICENSE AGREEMENT, NVIDIA MAKES NO REPRESENTATION ABOUT THE
* SUITABILITY OF THESE LICENSED DELIVERABLES FOR ANY PURPOSE. IT IS
* PROVIDED "AS IS" WITHOUT EXPRESS OR IMPLIED WARRANTY OF ANY KIND.
* NVIDIA DISCLAIMS ALL WARRANTIES WITH REGARD TO THESE LICENSED
* DELIVERABLES, INCLUDING ALL IMPLIED WARRANTIES OF MERCHANTABILITY,
* NONINFRINGEMENT, AND FITNESS FOR A PARTICULAR PURPOSE.
* NOTWITHSTANDING ANY TERMS OR CONDITIONS TO THE CONTRARY IN THE
* LICENSE AGREEMENT, IN NO EVENT SHALL NVIDIA BE LIABLE FOR ANY
* SPECIAL, INDIRECT, INCIDENTAL, OR CONSEQUENTIAL DAMAGES, OR ANY
* DAMAGES WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS,
* WHETHER IN AN ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS
* ACTION, ARISING OUT OF OR IN CONNECTION WITH THE USE OR PERFORMANCE
* OF THESE LICENSED DELIVERABLES.
*
* U.S. Government End Users. These Licensed Deliverables are a
* "commercial item" as that term is defined at 48 C.F.R. 2.101 (OCT
* 1995), consisting of "commercial computer software" and "commercial
* computer software documentation" as such terms are used in 48
* C.F.R. 12.212 (SEPT 1995) and is provided to the U.S. Government
* only as a commercial end item. Consistent with 48 C.F.R.12.212 and
* 48 C.F.R. 227.7202-1 through 227.7202-4 (JUNE 1995), all
* U.S. Government End Users acquire the Licensed Deliverables with
* only those rights set forth herein.
*
* Any use of the Licensed Deliverables in individual and commercial
* software must include, in the user documentation and internal
* comments to the code, the above Disclaimer and U.S. Government End
* Users Notice.
*/
/*
* This is the public header file for the CUBLAS library, defining the API
*
* CUBLAS is an implementation of BLAS (Basic Linear Algebra Subroutines)
* on top of the CUDA runtime.
*/
#if !defined(CUBLAS_API_H_)
#define CUBLAS_API_H_
/* Calling-convention macro placed on every prototype in this header.
* Expands to __stdcall when _WIN32 is defined and to nothing otherwise.
* A definition made before this header is included takes precedence. */
#ifndef CUBLASWINAPI
#ifdef _WIN32
#define CUBLASWINAPI __stdcall
#else
#define CUBLASWINAPI
#endif
#endif
/* CUBLASAPI is not defined by this header: the including file must define it
* first, otherwise compilation stops with the error below. */
#ifndef CUBLASAPI
#error "This file should not be included without defining CUBLASAPI"
#endif
#include "driver_types.h"
#include "cuComplex.h" /* import complex data type */
#include <cuda_fp16.h>
#include <cuda_bf16.h>
#include "library_types.h"
#if defined(__cplusplus)
extern "C" {
#endif /* __cplusplus */
/* Library version components.
* CUBLAS_VERSION packs them as MAJOR * 10000 + MINOR * 100 + PATCH
* (111003 for the values below); CUBLAS_VER_BUILD is not part of the
* packed value. */
#define CUBLAS_VER_MAJOR 11
#define CUBLAS_VER_MINOR 10
#define CUBLAS_VER_PATCH 3
#define CUBLAS_VER_BUILD 66
#define CUBLAS_VERSION (CUBLAS_VER_MAJOR * 10000 + CUBLAS_VER_MINOR * 100 + CUBLAS_VER_PATCH)
/* CUBLAS status type returns.
* Returned by almost every entry point declared in this header.
* The numeric values are explicit and non-contiguous (0, 1, 3, 7, 8, 11,
* 13, 14, 15, 16); do not assume a dense range when indexing tables. */
typedef enum {
CUBLAS_STATUS_SUCCESS = 0, /* the operation completed successfully */
CUBLAS_STATUS_NOT_INITIALIZED = 1, /* the CUBLAS library has not been initialized */
CUBLAS_STATUS_ALLOC_FAILED = 3,
CUBLAS_STATUS_INVALID_VALUE = 7, /* an argument has an invalid value (e.g. elemSize <= 0) */
CUBLAS_STATUS_ARCH_MISMATCH = 8,
CUBLAS_STATUS_MAPPING_ERROR = 11, /* an error occurred accessing GPU memory */
CUBLAS_STATUS_EXECUTION_FAILED = 13,
CUBLAS_STATUS_INTERNAL_ERROR = 14,
CUBLAS_STATUS_NOT_SUPPORTED = 15,
CUBLAS_STATUS_LICENSE_ERROR = 16
} cublasStatus_t;
/* Fill-mode selector; passed as the `uplo` argument of routines such as
* cublasStrmv_v2. */
typedef enum {
CUBLAS_FILL_MODE_LOWER = 0,
CUBLAS_FILL_MODE_UPPER = 1,
CUBLAS_FILL_MODE_FULL = 2
} cublasFillMode_t;
/* Diagonal-type selector; passed as the `diag` argument of routines such as
* cublasStrmv_v2. */
typedef enum {
CUBLAS_DIAG_NON_UNIT = 0,
CUBLAS_DIAG_UNIT = 1
} cublasDiagType_t;
/* Side selector (left / right). */
typedef enum {
CUBLAS_SIDE_LEFT = 0,
CUBLAS_SIDE_RIGHT = 1
} cublasSideMode_t;
/* Operation selector; passed as the `trans` argument of routines such as
* cublasSgemv_v2. CUBLAS_OP_C and CUBLAS_OP_HERMITAN share the value 2.
* The spelling "HERMITAN" is part of the public identifier and must be kept. */
typedef enum {
CUBLAS_OP_N = 0,
CUBLAS_OP_T = 1,
CUBLAS_OP_C = 2,
CUBLAS_OP_HERMITAN = 2, /* synonym of CUBLAS_OP_C */
CUBLAS_OP_CONJG = 3 /* conjugate, placeholder - not supported in the current release */
} cublasOperation_t;
/* Pointer-mode selector (host / device); read and written per handle through
* cublasGetPointerMode_v2 and cublasSetPointerMode_v2. */
typedef enum {
CUBLAS_POINTER_MODE_HOST = 0,
CUBLAS_POINTER_MODE_DEVICE = 1
} cublasPointerMode_t;
/* Atomics-mode selector; read and written per handle through
* cublasGetAtomicsMode and cublasSetAtomicsMode. */
typedef enum {
CUBLAS_ATOMICS_NOT_ALLOWED = 0,
CUBLAS_ATOMICS_ALLOWED = 1
} cublasAtomicsMode_t;
/* GEMM algorithm selector.
* Two spellings exist for each default value: CUBLAS_GEMM_DFALT and
* CUBLAS_GEMM_DEFAULT are both -1, and CUBLAS_GEMM_DEFAULT_TENSOR_OP and
* CUBLAS_GEMM_DFALT_TENSOR_OP are both 99.
* Numbered algorithms occupy 0..23; the _TENSOR_OP variants occupy 100..115. */
typedef enum {
CUBLAS_GEMM_DFALT = -1,
CUBLAS_GEMM_DEFAULT = -1,
CUBLAS_GEMM_ALGO0 = 0,
CUBLAS_GEMM_ALGO1 = 1,
CUBLAS_GEMM_ALGO2 = 2,
CUBLAS_GEMM_ALGO3 = 3,
CUBLAS_GEMM_ALGO4 = 4,
CUBLAS_GEMM_ALGO5 = 5,
CUBLAS_GEMM_ALGO6 = 6,
CUBLAS_GEMM_ALGO7 = 7,
CUBLAS_GEMM_ALGO8 = 8,
CUBLAS_GEMM_ALGO9 = 9,
CUBLAS_GEMM_ALGO10 = 10,
CUBLAS_GEMM_ALGO11 = 11,
CUBLAS_GEMM_ALGO12 = 12,
CUBLAS_GEMM_ALGO13 = 13,
CUBLAS_GEMM_ALGO14 = 14,
CUBLAS_GEMM_ALGO15 = 15,
CUBLAS_GEMM_ALGO16 = 16,
CUBLAS_GEMM_ALGO17 = 17,
CUBLAS_GEMM_ALGO18 = 18, /* sliced 32x32 */
CUBLAS_GEMM_ALGO19 = 19, /* sliced 64x32 */
CUBLAS_GEMM_ALGO20 = 20, /* sliced 128x32 */
CUBLAS_GEMM_ALGO21 = 21, /* sliced 32x32 -splitK */
CUBLAS_GEMM_ALGO22 = 22, /* sliced 64x32 -splitK */
CUBLAS_GEMM_ALGO23 = 23, /* sliced 128x32 -splitK */
CUBLAS_GEMM_DEFAULT_TENSOR_OP = 99,
CUBLAS_GEMM_DFALT_TENSOR_OP = 99,
CUBLAS_GEMM_ALGO0_TENSOR_OP = 100,
CUBLAS_GEMM_ALGO1_TENSOR_OP = 101,
CUBLAS_GEMM_ALGO2_TENSOR_OP = 102,
CUBLAS_GEMM_ALGO3_TENSOR_OP = 103,
CUBLAS_GEMM_ALGO4_TENSOR_OP = 104,
CUBLAS_GEMM_ALGO5_TENSOR_OP = 105,
CUBLAS_GEMM_ALGO6_TENSOR_OP = 106,
CUBLAS_GEMM_ALGO7_TENSOR_OP = 107,
CUBLAS_GEMM_ALGO8_TENSOR_OP = 108,
CUBLAS_GEMM_ALGO9_TENSOR_OP = 109,
CUBLAS_GEMM_ALGO10_TENSOR_OP = 110,
CUBLAS_GEMM_ALGO11_TENSOR_OP = 111,
CUBLAS_GEMM_ALGO12_TENSOR_OP = 112,
CUBLAS_GEMM_ALGO13_TENSOR_OP = 113,
CUBLAS_GEMM_ALGO14_TENSOR_OP = 114,
CUBLAS_GEMM_ALGO15_TENSOR_OP = 115
} cublasGemmAlgo_t;
/* Enum for default math mode / tensor operation.
* Read and written per handle through cublasGetMathMode / cublasSetMathMode.
* Values 0..3 are modes; 16 is described below as a flag. */
typedef enum {
CUBLAS_DEFAULT_MATH = 0,
/* deprecated, same effect as using CUBLAS_COMPUTE_32F_FAST_16F, will be removed in a future release */
CUBLAS_TENSOR_OP_MATH = 1,
/* same as using matching _PEDANTIC compute type when using cublas<T>routine calls or cublasEx() calls with
cudaDataType as compute type */
CUBLAS_PEDANTIC_MATH = 2,
/* allow accelerating single precision routines using TF32 tensor cores */
CUBLAS_TF32_TENSOR_OP_MATH = 3,
/* flag to force any reductions to use the accumulator type and not output type in case of mixed precision routines
with lower size output type */
CUBLAS_MATH_DISALLOW_REDUCED_PRECISION_REDUCTION = 16
} cublasMath_t;
/* For backward compatibility purposes: cublasDataType_t is an alias of
* cudaDataType, so the two names are interchangeable in prototypes. */
typedef cudaDataType cublasDataType_t;
/* Enum for compute type
*
* - default types provide best available performance using all available hardware features
* and guarantee internal storage precision with at least the same precision and range;
* - _PEDANTIC types ensure standard arithmetic and exact specified internal storage format;
* - _FAST types allow for some loss of precision to enable higher throughput arithmetic.
*
* Enumerators are grouped by base type rather than sorted by value; the
* numeric values are explicit and must not be renumbered.
*/
typedef enum {
CUBLAS_COMPUTE_16F = 64, /* half - default */
CUBLAS_COMPUTE_16F_PEDANTIC = 65, /* half - pedantic */
CUBLAS_COMPUTE_32F = 68, /* float - default */
CUBLAS_COMPUTE_32F_PEDANTIC = 69, /* float - pedantic */
CUBLAS_COMPUTE_32F_FAST_16F = 74, /* float - fast, allows down-converting inputs to half or TF32 */
CUBLAS_COMPUTE_32F_FAST_16BF = 75, /* float - fast, allows down-converting inputs to bfloat16 or TF32 */
CUBLAS_COMPUTE_32F_FAST_TF32 = 77, /* float - fast, allows down-converting inputs to TF32 */
CUBLAS_COMPUTE_64F = 70, /* double - default */
CUBLAS_COMPUTE_64F_PEDANTIC = 71, /* double - pedantic */
CUBLAS_COMPUTE_32I = 72, /* signed 32-bit int - default */
CUBLAS_COMPUTE_32I_PEDANTIC = 73 /* signed 32-bit int - pedantic */
} cublasComputeType_t;
/* Opaque structure holding CUBLAS library context.
* struct cublasContext is only forward-declared here, so callers can hold and
* pass a cublasHandle_t but cannot inspect its contents. */
struct cublasContext;
typedef struct cublasContext* cublasHandle_t;
/* Handle lifetime. cublasCreate_v2 receives a pointer to a handle (output);
* cublasDestroy_v2 receives the handle by value. */
CUBLASAPI cublasStatus_t CUBLASWINAPI cublasCreate_v2(cublasHandle_t* handle);
CUBLASAPI cublasStatus_t CUBLASWINAPI cublasDestroy_v2(cublasHandle_t handle);
/* Version / property queries; results are written through the int* argument. */
CUBLASAPI cublasStatus_t CUBLASWINAPI cublasGetVersion_v2(cublasHandle_t handle, int* version);
CUBLASAPI cublasStatus_t CUBLASWINAPI cublasGetProperty(libraryPropertyType type, int* value);
/* Unlike its neighbours this returns the value directly (size_t), not a status. */
CUBLASAPI size_t CUBLASWINAPI cublasGetCudartVersion(void);
/* Workspace: the caller supplies a buffer and its size in bytes. */
CUBLASAPI cublasStatus_t CUBLASWINAPI cublasSetWorkspace_v2(cublasHandle_t handle,
void* workspace,
size_t workspaceSizeInBytes);
/* Per-handle settings. Each Set takes the value; each Get writes it through
* the pointer argument. */
CUBLASAPI cublasStatus_t CUBLASWINAPI cublasSetStream_v2(cublasHandle_t handle, cudaStream_t streamId);
CUBLASAPI cublasStatus_t CUBLASWINAPI cublasGetStream_v2(cublasHandle_t handle, cudaStream_t* streamId);
CUBLASAPI cublasStatus_t CUBLASWINAPI cublasGetPointerMode_v2(cublasHandle_t handle, cublasPointerMode_t* mode);
CUBLASAPI cublasStatus_t CUBLASWINAPI cublasSetPointerMode_v2(cublasHandle_t handle, cublasPointerMode_t mode);
CUBLASAPI cublasStatus_t CUBLASWINAPI cublasGetAtomicsMode(cublasHandle_t handle, cublasAtomicsMode_t* mode);
CUBLASAPI cublasStatus_t CUBLASWINAPI cublasSetAtomicsMode(cublasHandle_t handle, cublasAtomicsMode_t mode);
CUBLASAPI cublasStatus_t CUBLASWINAPI cublasGetMathMode(cublasHandle_t handle, cublasMath_t* mode);
CUBLASAPI cublasStatus_t CUBLASWINAPI cublasSetMathMode(cublasHandle_t handle, cublasMath_t mode);
CUBLASAPI cublasStatus_t CUBLASWINAPI cublasGetSmCountTarget(cublasHandle_t handle, int* smCountTarget);
CUBLASAPI cublasStatus_t CUBLASWINAPI cublasSetSmCountTarget(cublasHandle_t handle, int smCountTarget);
/* Status-to-text helpers: both map a cublasStatus_t to a const char*. */
CUBLASAPI const char* CUBLASWINAPI cublasGetStatusName(cublasStatus_t status);
CUBLASAPI const char* CUBLASWINAPI cublasGetStatusString(cublasStatus_t status);
/* Cublas logging.
* cublasLogCallback is a user function that receives one message string.
* cublasLoggerConfigure takes three int switches and a file name;
* cublasSetLoggerCallback installs a callback and cublasGetLoggerCallback
* writes the current one through its pointer argument. */
typedef void (*cublasLogCallback)(const char* msg);
CUBLASAPI cublasStatus_t CUBLASWINAPI cublasLoggerConfigure(int logIsOn,
int logToStdOut,
int logToStdErr,
const char* logFileName);
CUBLASAPI cublasStatus_t CUBLASWINAPI cublasSetLoggerCallback(cublasLogCallback userCallback);
CUBLASAPI cublasStatus_t CUBLASWINAPI cublasGetLoggerCallback(cublasLogCallback* userCallback);
/*
* cublasStatus_t
* cublasSetVector (int n, int elemSize, const void *x, int incx,
* void *devicePtr, int incy)
*
* Copies n elements from a vector x in CPU memory space to a vector
* devicePtr (called y below) in GPU memory space. Elements in both vectors
* are assumed to have a size of elemSize bytes. Storage spacing between
* consecutive elements is incx for the source vector x and incy for the
* destination vector y. In general, y points to an object, or part of an
* object, allocated via cublasAlloc(). Column major format for
* two-dimensional matrices is assumed throughout CUBLAS. Therefore, if the
* increment for a vector is equal to 1, this accesses a column vector, while
* using an increment equal to the leading dimension of the respective matrix
* accesses a row vector.
*
* Return Values
* -------------
* CUBLAS_STATUS_NOT_INITIALIZED if CUBLAS library has not been initialized
* CUBLAS_STATUS_INVALID_VALUE if incx, incy, or elemSize <= 0
* CUBLAS_STATUS_MAPPING_ERROR if an error occurred accessing GPU memory
* CUBLAS_STATUS_SUCCESS if the operation completed successfully
*
* NOTE(review): declared without the CUBLASAPI prefix used by most other
* entry points in this header - confirm this is intentional.
*/
cublasStatus_t CUBLASWINAPI cublasSetVector(int n, int elemSize, const void* x, int incx, void* devicePtr, int incy);
/*
* cublasStatus_t
* cublasGetVector (int n, int elemSize, const void *x, int incx,
* void *y, int incy)
*
* Copies n elements from a vector x in GPU memory space to a vector y
* in CPU memory space. Elements in both vectors are assumed to have a
* size of elemSize bytes. Storage spacing between consecutive elements
* is incx for the source vector x and incy for the destination vector
* y. In general, x points to an object, or part of an object, allocated
* via cublasAlloc(). Column major format for two-dimensional matrices
* is assumed throughout CUBLAS. Therefore, if the increment for a vector
* is equal to 1, this accesses a column vector, while using an increment
* equal to the leading dimension of the respective matrix accesses a
* row vector.
*
* Return Values
* -------------
* CUBLAS_STATUS_NOT_INITIALIZED if CUBLAS library has not been initialized
* CUBLAS_STATUS_INVALID_VALUE if incx, incy, or elemSize <= 0
* CUBLAS_STATUS_MAPPING_ERROR if an error occurred accessing GPU memory
* CUBLAS_STATUS_SUCCESS if the operation completed successfully
*
* NOTE(review): declared without the CUBLASAPI prefix used by most other
* entry points in this header - confirm this is intentional.
*/
cublasStatus_t CUBLASWINAPI cublasGetVector(int n, int elemSize, const void* x, int incx, void* y, int incy);
/*
* cublasStatus_t
* cublasSetMatrix (int rows, int cols, int elemSize, const void *A,
* int lda, void *B, int ldb)
*
* Copies a tile of rows x cols elements from a matrix A in CPU memory
* space to a matrix B in GPU memory space. Each element requires storage
* of elemSize bytes. Both matrices are assumed to be stored in column
* major format, with the leading dimension (i.e. number of rows) of
* source matrix A provided in lda, and the leading dimension of matrix B
* provided in ldb. In general, B points to an object, or part of an
* object, that was allocated via cublasAlloc().
*
* Return Values
* -------------
* CUBLAS_STATUS_NOT_INITIALIZED if CUBLAS library has not been initialized
* CUBLAS_STATUS_INVALID_VALUE if rows or cols < 0, or elemSize, lda, or
* ldb <= 0
* CUBLAS_STATUS_MAPPING_ERROR if an error occurred accessing GPU memory
* CUBLAS_STATUS_SUCCESS if the operation completed successfully
*
* NOTE(review): declared without the CUBLASAPI prefix used by most other
* entry points in this header - confirm this is intentional.
*/
cublasStatus_t CUBLASWINAPI cublasSetMatrix(int rows, int cols, int elemSize, const void* A, int lda, void* B, int ldb);
/*
* cublasStatus_t
* cublasGetMatrix (int rows, int cols, int elemSize, const void *A,
* int lda, void *B, int ldb)
*
* Copies a tile of rows x cols elements from a matrix A in GPU memory
* space to a matrix B in CPU memory space. Each element requires storage
* of elemSize bytes. Both matrices are assumed to be stored in column
* major format, with the leading dimension (i.e. number of rows) of
* source matrix A provided in lda, and the leading dimension of matrix B
* provided in ldb. In general, A points to an object, or part of an
* object, that was allocated via cublasAlloc().
*
* Return Values
* -------------
* CUBLAS_STATUS_NOT_INITIALIZED if CUBLAS library has not been initialized
* CUBLAS_STATUS_INVALID_VALUE if rows, cols, elemSize, lda, or ldb <= 0
* CUBLAS_STATUS_MAPPING_ERROR if an error occurred accessing GPU memory
* CUBLAS_STATUS_SUCCESS if the operation completed successfully
*
* NOTE(review): declared without the CUBLASAPI prefix used by most other
* entry points in this header - confirm this is intentional.
*/
cublasStatus_t CUBLASWINAPI cublasGetMatrix(int rows, int cols, int elemSize, const void* A, int lda, void* B, int ldb);
/*
* cublasStatus_t
* cublasSetVectorAsync (int n, int elemSize, const void *hostPtr, int incx,
* void *devicePtr, int incy, cudaStream_t stream);
*
* cublasSetVectorAsync has the same functionality as cublasSetVector,
* but the transfer is done asynchronously within the CUDA stream passed
* as a parameter.
*
* Return Values
* -------------
* CUBLAS_STATUS_NOT_INITIALIZED if CUBLAS library has not been initialized
* CUBLAS_STATUS_INVALID_VALUE if incx, incy, or elemSize <= 0
* CUBLAS_STATUS_MAPPING_ERROR if an error occurred accessing GPU memory
* CUBLAS_STATUS_SUCCESS if the operation completed successfully
*
* NOTE(review): declared without the CUBLASAPI prefix used by most other
* entry points in this header - confirm this is intentional.
*/
cublasStatus_t CUBLASWINAPI cublasSetVectorAsync(
int n, int elemSize, const void* hostPtr, int incx, void* devicePtr, int incy, cudaStream_t stream);
/*
* cublasStatus_t
* cublasGetVectorAsync (int n, int elemSize, const void *devicePtr, int incx,
* void *hostPtr, int incy, cudaStream_t stream)
*
* cublasGetVectorAsync has the same functionality as cublasGetVector,
* but the transfer is done asynchronously within the CUDA stream passed
* as a parameter.
*
* Return Values
* -------------
* CUBLAS_STATUS_NOT_INITIALIZED if CUBLAS library has not been initialized
* CUBLAS_STATUS_INVALID_VALUE if incx, incy, or elemSize <= 0
* CUBLAS_STATUS_MAPPING_ERROR if an error occurred accessing GPU memory
* CUBLAS_STATUS_SUCCESS if the operation completed successfully
*
* NOTE(review): declared without the CUBLASAPI prefix used by most other
* entry points in this header - confirm this is intentional.
*/
cublasStatus_t CUBLASWINAPI cublasGetVectorAsync(
int n, int elemSize, const void* devicePtr, int incx, void* hostPtr, int incy, cudaStream_t stream);
/*
* cublasStatus_t
* cublasSetMatrixAsync (int rows, int cols, int elemSize, const void *A,
* int lda, void *B, int ldb, cudaStream_t stream)
*
* cublasSetMatrixAsync has the same functionality as cublasSetMatrix,
* but the transfer is done asynchronously within the CUDA stream passed
* as a parameter.
*
* Return Values
* -------------
* CUBLAS_STATUS_NOT_INITIALIZED if CUBLAS library has not been initialized
* CUBLAS_STATUS_INVALID_VALUE if rows or cols < 0, or elemSize, lda, or
* ldb <= 0
* CUBLAS_STATUS_MAPPING_ERROR if an error occurred accessing GPU memory
* CUBLAS_STATUS_SUCCESS if the operation completed successfully
*
* NOTE(review): declared without the CUBLASAPI prefix used by most other
* entry points in this header - confirm this is intentional.
*/
cublasStatus_t CUBLASWINAPI
cublasSetMatrixAsync(int rows, int cols, int elemSize, const void* A, int lda, void* B, int ldb, cudaStream_t stream);
/*
* cublasStatus_t
* cublasGetMatrixAsync (int rows, int cols, int elemSize, const void *A,
* int lda, void *B, int ldb, cudaStream_t stream)
*
* cublasGetMatrixAsync has the same functionality as cublasGetMatrix,
* but the transfer is done asynchronously within the CUDA stream passed
* as a parameter.
*
* Return Values
* -------------
* CUBLAS_STATUS_NOT_INITIALIZED if CUBLAS library has not been initialized
* CUBLAS_STATUS_INVALID_VALUE if rows, cols, elemSize, lda, or ldb <= 0
* CUBLAS_STATUS_MAPPING_ERROR if an error occurred accessing GPU memory
* CUBLAS_STATUS_SUCCESS if the operation completed successfully
*
* NOTE(review): declared without the CUBLASAPI prefix used by most other
* entry points in this header - confirm this is intentional.
*/
cublasStatus_t CUBLASWINAPI
cublasGetMatrixAsync(int rows, int cols, int elemSize, const void* A, int lda, void* B, int ldb, cudaStream_t stream);
/* Takes a routine name string and an integer `info`; returns nothing.
* NOTE(review): presumably the BLAS XERBLA-style error reporter - confirm
* against the cuBLAS documentation. */
CUBLASAPI void CUBLASWINAPI cublasXerbla(const char* srName, int info);
/* ---------------- CUBLAS BLAS1 functions ---------------- */
/* nrm2 family (BLAS level-1 "nrm2").
* Typed variants take float / double / cuComplex / cuDoubleComplex input and
* write a real-valued result (float or double). The Ex variant takes untyped
* pointers plus cudaDataType tags for x, the result and the execution type.
* `result` may be a host or device pointer. */
CUBLASAPI cublasStatus_t CUBLASWINAPI cublasNrm2Ex(cublasHandle_t handle,
int n,
const void* x,
cudaDataType xType,
int incx,
void* result, /* host or device pointer */
cudaDataType resultType,
cudaDataType executionType);
CUBLASAPI cublasStatus_t CUBLASWINAPI
cublasSnrm2_v2(cublasHandle_t handle, int n, const float* x, int incx, float* result); /* host or device pointer */
CUBLASAPI cublasStatus_t CUBLASWINAPI
cublasDnrm2_v2(cublasHandle_t handle, int n, const double* x, int incx, double* result); /* host or device pointer */
CUBLASAPI cublasStatus_t CUBLASWINAPI
cublasScnrm2_v2(cublasHandle_t handle, int n, const cuComplex* x, int incx, float* result); /* host or device pointer */
CUBLASAPI cublasStatus_t CUBLASWINAPI cublasDznrm2_v2(
cublasHandle_t handle, int n, const cuDoubleComplex* x, int incx, double* result); /* host or device pointer */
/* dot family (BLAS level-1 "dot", "dotu", "dotc").
* Inputs x and y are read-only vectors with strides incx / incy; the scalar
* result is written through `result`, whose element type matches the inputs
* in the typed variants. The Ex variants take untyped pointers plus
* cudaDataType tags. For the Ex variants the "host or device pointer" note on
* `result` follows the typed variants - confirm against the cuBLAS docs. */
CUBLASAPI cublasStatus_t CUBLASWINAPI cublasDotEx(cublasHandle_t handle,
int n,
const void* x,
cudaDataType xType,
int incx,
const void* y,
cudaDataType yType,
int incy,
void* result, /* host or device pointer */
cudaDataType resultType,
cudaDataType executionType);
CUBLASAPI cublasStatus_t CUBLASWINAPI cublasDotcEx(cublasHandle_t handle,
int n,
const void* x,
cudaDataType xType,
int incx,
const void* y,
cudaDataType yType,
int incy,
void* result, /* host or device pointer */
cudaDataType resultType,
cudaDataType executionType);
CUBLASAPI cublasStatus_t CUBLASWINAPI cublasSdot_v2(cublasHandle_t handle,
int n,
const float* x,
int incx,
const float* y,
int incy,
float* result); /* host or device pointer */
CUBLASAPI cublasStatus_t CUBLASWINAPI cublasDdot_v2(cublasHandle_t handle,
int n,
const double* x,
int incx,
const double* y,
int incy,
double* result); /* host or device pointer */
CUBLASAPI cublasStatus_t CUBLASWINAPI cublasCdotu_v2(cublasHandle_t handle,
int n,
const cuComplex* x,
int incx,
const cuComplex* y,
int incy,
cuComplex* result); /* host or device pointer */
CUBLASAPI cublasStatus_t CUBLASWINAPI cublasCdotc_v2(cublasHandle_t handle,
int n,
const cuComplex* x,
int incx,
const cuComplex* y,
int incy,
cuComplex* result); /* host or device pointer */
CUBLASAPI cublasStatus_t CUBLASWINAPI cublasZdotu_v2(cublasHandle_t handle,
int n,
const cuDoubleComplex* x,
int incx,
const cuDoubleComplex* y,
int incy,
cuDoubleComplex* result); /* host or device pointer */
CUBLASAPI cublasStatus_t CUBLASWINAPI cublasZdotc_v2(cublasHandle_t handle,
int n,
const cuDoubleComplex* x,
int incx,
const cuDoubleComplex* y,
int incy,
cuDoubleComplex* result); /* host or device pointer */
/* scal family (BLAS level-1 "scal").
* x is modified in place (non-const pointer); alpha is a read-only scalar that
* may live on the host or the device. Csscal / Zdscal take a real alpha with
* a complex vector. */
CUBLASAPI cublasStatus_t CUBLASWINAPI cublasScalEx(cublasHandle_t handle,
int n,
const void* alpha, /* host or device pointer */
cudaDataType alphaType,
void* x,
cudaDataType xType,
int incx,
cudaDataType executionType);
CUBLASAPI cublasStatus_t CUBLASWINAPI cublasSscal_v2(cublasHandle_t handle,
int n,
const float* alpha, /* host or device pointer */
float* x,
int incx);
CUBLASAPI cublasStatus_t CUBLASWINAPI cublasDscal_v2(cublasHandle_t handle,
int n,
const double* alpha, /* host or device pointer */
double* x,
int incx);
CUBLASAPI cublasStatus_t CUBLASWINAPI cublasCscal_v2(cublasHandle_t handle,
int n,
const cuComplex* alpha, /* host or device pointer */
cuComplex* x,
int incx);
CUBLASAPI cublasStatus_t CUBLASWINAPI cublasCsscal_v2(cublasHandle_t handle,
int n,
const float* alpha, /* host or device pointer */
cuComplex* x,
int incx);
CUBLASAPI cublasStatus_t CUBLASWINAPI cublasZscal_v2(cublasHandle_t handle,
int n,
const cuDoubleComplex* alpha, /* host or device pointer */
cuDoubleComplex* x,
int incx);
CUBLASAPI cublasStatus_t CUBLASWINAPI cublasZdscal_v2(cublasHandle_t handle,
int n,
const double* alpha, /* host or device pointer */
cuDoubleComplex* x,
int incx);
/* axpy family (BLAS level-1 "axpy").
* x is read-only, y is the in/out vector (non-const pointer), and alpha is a
* read-only scalar that may live on the host or the device.
* Note: the Ex variant spells its last parameter `executiontype` (lower-case
* t), unlike cublasScalEx's `executionType`. */
CUBLASAPI cublasStatus_t CUBLASWINAPI cublasAxpyEx(cublasHandle_t handle,
int n,
const void* alpha, /* host or device pointer */
cudaDataType alphaType,
const void* x,
cudaDataType xType,
int incx,
void* y,
cudaDataType yType,
int incy,
cudaDataType executiontype);
CUBLASAPI cublasStatus_t CUBLASWINAPI cublasSaxpy_v2(cublasHandle_t handle,
int n,
const float* alpha, /* host or device pointer */
const float* x,
int incx,
float* y,
int incy);
CUBLASAPI cublasStatus_t CUBLASWINAPI cublasDaxpy_v2(cublasHandle_t handle,
int n,
const double* alpha, /* host or device pointer */
const double* x,
int incx,
double* y,
int incy);
CUBLASAPI cublasStatus_t CUBLASWINAPI cublasCaxpy_v2(cublasHandle_t handle,
int n,
const cuComplex* alpha, /* host or device pointer */
const cuComplex* x,
int incx,
cuComplex* y,
int incy);
CUBLASAPI cublasStatus_t CUBLASWINAPI cublasZaxpy_v2(cublasHandle_t handle,
int n,
const cuDoubleComplex* alpha, /* host or device pointer */
const cuDoubleComplex* x,
int incx,
cuDoubleComplex* y,
int incy);
/* copy family (BLAS level-1 "copy").
* x is the read-only source and y the destination; each has its own stride.
* The Ex variant takes untyped pointers plus a cudaDataType tag per vector. */
CUBLASAPI cublasStatus_t CUBLASWINAPI cublasCopyEx(
cublasHandle_t handle, int n, const void* x, cudaDataType xType, int incx, void* y, cudaDataType yType, int incy);
CUBLASAPI cublasStatus_t CUBLASWINAPI
cublasScopy_v2(cublasHandle_t handle, int n, const float* x, int incx, float* y, int incy);
CUBLASAPI cublasStatus_t CUBLASWINAPI
cublasDcopy_v2(cublasHandle_t handle, int n, const double* x, int incx, double* y, int incy);
CUBLASAPI cublasStatus_t CUBLASWINAPI
cublasCcopy_v2(cublasHandle_t handle, int n, const cuComplex* x, int incx, cuComplex* y, int incy);
CUBLASAPI cublasStatus_t CUBLASWINAPI
cublasZcopy_v2(cublasHandle_t handle, int n, const cuDoubleComplex* x, int incx, cuDoubleComplex* y, int incy);
/* swap family (BLAS level-1 "swap").
* Both x and y are non-const: each vector is read and written. */
CUBLASAPI cublasStatus_t CUBLASWINAPI
cublasSswap_v2(cublasHandle_t handle, int n, float* x, int incx, float* y, int incy);
CUBLASAPI cublasStatus_t CUBLASWINAPI
cublasDswap_v2(cublasHandle_t handle, int n, double* x, int incx, double* y, int incy);
CUBLASAPI cublasStatus_t CUBLASWINAPI
cublasCswap_v2(cublasHandle_t handle, int n, cuComplex* x, int incx, cuComplex* y, int incy);
CUBLASAPI cublasStatus_t CUBLASWINAPI
cublasZswap_v2(cublasHandle_t handle, int n, cuDoubleComplex* x, int incx, cuDoubleComplex* y, int incy);
CUBLASAPI cublasStatus_t CUBLASWINAPI cublasSwapEx(
cublasHandle_t handle, int n, void* x, cudaDataType xType, int incx, void* y, cudaDataType yType, int incy);
/* amax family (BLAS level-1 "i?amax").
* The result is an int written through `result` (host or device pointer)
* regardless of the element type of x.
* NOTE(review): index base (0 or 1) is not stated in this header - check the
* cuBLAS documentation before using the value as a C array index. */
CUBLASAPI cublasStatus_t CUBLASWINAPI
cublasIsamax_v2(cublasHandle_t handle, int n, const float* x, int incx, int* result); /* host or device pointer */
CUBLASAPI cublasStatus_t CUBLASWINAPI
cublasIdamax_v2(cublasHandle_t handle, int n, const double* x, int incx, int* result); /* host or device pointer */
CUBLASAPI cublasStatus_t CUBLASWINAPI
cublasIcamax_v2(cublasHandle_t handle, int n, const cuComplex* x, int incx, int* result); /* host or device pointer */
CUBLASAPI cublasStatus_t CUBLASWINAPI cublasIzamax_v2(
cublasHandle_t handle, int n, const cuDoubleComplex* x, int incx, int* result); /* host or device pointer */
CUBLASAPI cublasStatus_t CUBLASWINAPI cublasIamaxEx(
cublasHandle_t handle, int n, const void* x, cudaDataType xType, int incx, int* result /* host or device pointer */
);
/* amin family (BLAS-style "i?amin"); signatures mirror the amax family.
* The result is an int written through `result` (host or device pointer).
* NOTE(review): index base (0 or 1) is not stated in this header - check the
* cuBLAS documentation before using the value as a C array index. */
CUBLASAPI cublasStatus_t CUBLASWINAPI
cublasIsamin_v2(cublasHandle_t handle, int n, const float* x, int incx, int* result); /* host or device pointer */
CUBLASAPI cublasStatus_t CUBLASWINAPI
cublasIdamin_v2(cublasHandle_t handle, int n, const double* x, int incx, int* result); /* host or device pointer */
CUBLASAPI cublasStatus_t CUBLASWINAPI
cublasIcamin_v2(cublasHandle_t handle, int n, const cuComplex* x, int incx, int* result); /* host or device pointer */
CUBLASAPI cublasStatus_t CUBLASWINAPI cublasIzamin_v2(
cublasHandle_t handle, int n, const cuDoubleComplex* x, int incx, int* result); /* host or device pointer */
CUBLASAPI cublasStatus_t CUBLASWINAPI cublasIaminEx(
cublasHandle_t handle, int n, const void* x, cudaDataType xType, int incx, int* result /* host or device pointer */
);
/* asum family (BLAS level-1 "asum").
* Typed variants take float / double / cuComplex / cuDoubleComplex input and
* write a real-valued result (float or double) through `result`, which may be
* a host or device pointer. */
CUBLASAPI cublasStatus_t CUBLASWINAPI cublasAsumEx(cublasHandle_t handle,
int n,
const void* x,
cudaDataType xType,
int incx,
void* result, /* host or device pointer */
cudaDataType resultType,
cudaDataType executiontype);
CUBLASAPI cublasStatus_t CUBLASWINAPI
cublasSasum_v2(cublasHandle_t handle, int n, const float* x, int incx, float* result); /* host or device pointer */
CUBLASAPI cublasStatus_t CUBLASWINAPI
cublasDasum_v2(cublasHandle_t handle, int n, const double* x, int incx, double* result); /* host or device pointer */
CUBLASAPI cublasStatus_t CUBLASWINAPI
cublasScasum_v2(cublasHandle_t handle, int n, const cuComplex* x, int incx, float* result); /* host or device pointer */
CUBLASAPI cublasStatus_t CUBLASWINAPI cublasDzasum_v2(
cublasHandle_t handle, int n, const cuDoubleComplex* x, int incx, double* result); /* host or device pointer */
/* rot family (BLAS level-1 "rot").
* x and y are both non-const (each is read and written); c and s are
* read-only scalars that may live on the host or the device.
* Element types of c and s differ per variant:
* Srot / Drot : real c, real s
* Crot / Zrot : real c, complex s
* Csrot / Zdrot : real c, real s applied to complex vectors
* In cublasRotEx both c and s share the single csType tag; the "host or
* device pointer" note on `s` follows the typed variants. */
CUBLASAPI cublasStatus_t CUBLASWINAPI cublasSrot_v2(cublasHandle_t handle,
int n,
float* x,
int incx,
float* y,
int incy,
const float* c, /* host or device pointer */
const float* s); /* host or device pointer */
CUBLASAPI cublasStatus_t CUBLASWINAPI cublasDrot_v2(cublasHandle_t handle,
int n,
double* x,
int incx,
double* y,
int incy,
const double* c, /* host or device pointer */
const double* s); /* host or device pointer */
CUBLASAPI cublasStatus_t CUBLASWINAPI cublasCrot_v2(cublasHandle_t handle,
int n,
cuComplex* x,
int incx,
cuComplex* y,
int incy,
const float* c, /* host or device pointer */
const cuComplex* s); /* host or device pointer */
CUBLASAPI cublasStatus_t CUBLASWINAPI cublasCsrot_v2(cublasHandle_t handle,
int n,
cuComplex* x,
int incx,
cuComplex* y,
int incy,
const float* c, /* host or device pointer */
const float* s); /* host or device pointer */
CUBLASAPI cublasStatus_t CUBLASWINAPI cublasZrot_v2(cublasHandle_t handle,
int n,
cuDoubleComplex* x,
int incx,
cuDoubleComplex* y,
int incy,
const double* c, /* host or device pointer */
const cuDoubleComplex* s); /* host or device pointer */
CUBLASAPI cublasStatus_t CUBLASWINAPI cublasZdrot_v2(cublasHandle_t handle,
int n,
cuDoubleComplex* x,
int incx,
cuDoubleComplex* y,
int incy,
const double* c, /* host or device pointer */
const double* s); /* host or device pointer */
CUBLASAPI cublasStatus_t CUBLASWINAPI cublasRotEx(cublasHandle_t handle,
int n,
void* x,
cudaDataType xType,
int incx,
void* y,
cudaDataType yType,
int incy,
const void* c, /* host or device pointer */
const void* s, /* host or device pointer */
cudaDataType csType,
cudaDataType executiontype);
/* rotg family (BLAS level-1 "rotg").
* Operates on scalars only (no n / stride arguments). All four pointers are
* non-const and may live on the host or the device. In the complex variants
* c is real (float / double) while a, b and s are complex. */
CUBLASAPI cublasStatus_t CUBLASWINAPI cublasSrotg_v2(cublasHandle_t handle,
float* a, /* host or device pointer */
float* b, /* host or device pointer */
float* c, /* host or device pointer */
float* s); /* host or device pointer */
CUBLASAPI cublasStatus_t CUBLASWINAPI cublasDrotg_v2(cublasHandle_t handle,
double* a, /* host or device pointer */
double* b, /* host or device pointer */
double* c, /* host or device pointer */
double* s); /* host or device pointer */
CUBLASAPI cublasStatus_t CUBLASWINAPI cublasCrotg_v2(cublasHandle_t handle,
cuComplex* a, /* host or device pointer */
cuComplex* b, /* host or device pointer */
float* c, /* host or device pointer */
cuComplex* s); /* host or device pointer */
CUBLASAPI cublasStatus_t CUBLASWINAPI cublasZrotg_v2(cublasHandle_t handle,
cuDoubleComplex* a, /* host or device pointer */
cuDoubleComplex* b, /* host or device pointer */
double* c, /* host or device pointer */
cuDoubleComplex* s); /* host or device pointer */
/* Ex variant: a and b share abType; c and s share csType. */
CUBLASAPI cublasStatus_t CUBLASWINAPI cublasRotgEx(cublasHandle_t handle,
void* a, /* host or device pointer */
void* b, /* host or device pointer */
cudaDataType abType,
void* c, /* host or device pointer */
void* s, /* host or device pointer */
cudaDataType csType,
cudaDataType executiontype);
/* rotm family (BLAS level-1 "rotm").
* x and y are both non-const (each is read and written); `param` is a
* read-only array that may live on the host or the device.
* NOTE(review): the length and layout of `param` are not stated in this
* header - see the cuBLAS documentation. */
CUBLASAPI cublasStatus_t CUBLASWINAPI cublasSrotm_v2(cublasHandle_t handle,
int n,
float* x,
int incx,
float* y,
int incy,
const float* param); /* host or device pointer */
CUBLASAPI cublasStatus_t CUBLASWINAPI cublasDrotm_v2(cublasHandle_t handle,
int n,
double* x,
int incx,
double* y,
int incy,
const double* param); /* host or device pointer */
CUBLASAPI cublasStatus_t CUBLASWINAPI cublasRotmEx(cublasHandle_t handle,
int n,
void* x,
cudaDataType xType,
int incx,
void* y,
cudaDataType yType,
int incy,
const void* param, /* host or device pointer */
cudaDataType paramType,
cudaDataType executiontype);
/* rotmg family (BLAS level-1 "rotmg").
* Operates on scalars only. d1, d2, x1 and param are non-const; y1 is the
* only read-only (const) argument. All pointers may live on the host or the
* device. The Ex variant carries a separate cudaDataType tag per argument. */
CUBLASAPI cublasStatus_t CUBLASWINAPI cublasSrotmg_v2(cublasHandle_t handle,
float* d1, /* host or device pointer */
float* d2, /* host or device pointer */
float* x1, /* host or device pointer */
const float* y1, /* host or device pointer */
float* param); /* host or device pointer */
CUBLASAPI cublasStatus_t CUBLASWINAPI cublasDrotmg_v2(cublasHandle_t handle,
double* d1, /* host or device pointer */
double* d2, /* host or device pointer */
double* x1, /* host or device pointer */
const double* y1, /* host or device pointer */
double* param); /* host or device pointer */
CUBLASAPI cublasStatus_t CUBLASWINAPI cublasRotmgEx(cublasHandle_t handle,
void* d1, /* host or device pointer */
cudaDataType d1Type,
void* d2, /* host or device pointer */
cudaDataType d2Type,
void* x1, /* host or device pointer */
cudaDataType x1Type,
const void* y1, /* host or device pointer */
cudaDataType y1Type,
void* param, /* host or device pointer */
cudaDataType paramType,
cudaDataType executiontype);
/* --------------- CUBLAS BLAS2 functions ---------------- */
/* GEMV */
/* General matrix-vector routines, one per element type (S/D/C/Z).
* A (leading dimension lda) and x are read-only; y is the in/out vector.
* alpha and beta are read-only scalars that may live on the host or the
* device. `trans` is a cublasOperation_t. */
CUBLASAPI cublasStatus_t CUBLASWINAPI cublasSgemv_v2(cublasHandle_t handle,
cublasOperation_t trans,
int m,
int n,
const float* alpha, /* host or device pointer */
const float* A,
int lda,
const float* x,
int incx,
const float* beta, /* host or device pointer */
float* y,
int incy);
CUBLASAPI cublasStatus_t CUBLASWINAPI cublasDgemv_v2(cublasHandle_t handle,
cublasOperation_t trans,
int m,
int n,
const double* alpha, /* host or device pointer */
const double* A,
int lda,
const double* x,
int incx,
const double* beta, /* host or device pointer */
double* y,
int incy);
CUBLASAPI cublasStatus_t CUBLASWINAPI cublasCgemv_v2(cublasHandle_t handle,
cublasOperation_t trans,
int m,
int n,
const cuComplex* alpha, /* host or device pointer */
const cuComplex* A,
int lda,
const cuComplex* x,
int incx,
const cuComplex* beta, /* host or device pointer */
cuComplex* y,
int incy);
CUBLASAPI cublasStatus_t CUBLASWINAPI cublasZgemv_v2(cublasHandle_t handle,
cublasOperation_t trans,
int m,
int n,
const cuDoubleComplex* alpha, /* host or device pointer */
const cuDoubleComplex* A,
int lda,
const cuDoubleComplex* x,
int incx,
const cuDoubleComplex* beta, /* host or device pointer */
cuDoubleComplex* y,
int incy);
/*
 * GBMV
 * Four entry points, one per element type (float, double, cuComplex,
 * cuDoubleComplex). The parameter list matches the GEMV declarations with two
 * extra int arguments, kl and ku, placed after m and n.
 * alpha and beta are annotated as host-or-device pointers; y is the only
 * non-const data pointer.
 * NOTE(review): by BLAS naming this is the banded matrix-vector product with
 * kl/ku presumably the sub-/super-diagonal counts - confirm in cuBLAS docs.
 */
/* float variant */
CUBLASAPI cublasStatus_t CUBLASWINAPI cublasSgbmv_v2(cublasHandle_t handle,
cublasOperation_t trans,
int m,
int n,
int kl,
int ku,
const float* alpha, /* host or device pointer */
const float* A,
int lda,
const float* x,
int incx,
const float* beta, /* host or device pointer */
float* y,
int incy);
/* double variant */
CUBLASAPI cublasStatus_t CUBLASWINAPI cublasDgbmv_v2(cublasHandle_t handle,
cublasOperation_t trans,
int m,
int n,
int kl,
int ku,
const double* alpha, /* host or device pointer */
const double* A,
int lda,
const double* x,
int incx,
const double* beta, /* host or device pointer */
double* y,
int incy);
/* cuComplex variant */
CUBLASAPI cublasStatus_t CUBLASWINAPI cublasCgbmv_v2(cublasHandle_t handle,
cublasOperation_t trans,
int m,
int n,
int kl,
int ku,
const cuComplex* alpha, /* host or device pointer */
const cuComplex* A,
int lda,
const cuComplex* x,
int incx,
const cuComplex* beta, /* host or device pointer */
cuComplex* y,
int incy);
/* cuDoubleComplex variant */
CUBLASAPI cublasStatus_t CUBLASWINAPI cublasZgbmv_v2(cublasHandle_t handle,
cublasOperation_t trans,
int m,
int n,
int kl,
int ku,
const cuDoubleComplex* alpha, /* host or device pointer */
const cuDoubleComplex* A,
int lda,
const cuDoubleComplex* x,
int incx,
const cuDoubleComplex* beta, /* host or device pointer */
cuDoubleComplex* y,
int incy);
/*
 * TRMV
 * Four entry points, one per element type (float, double, cuComplex,
 * cuDoubleComplex). Each takes fill-mode (uplo), operation (trans) and
 * diagonal-type (diag) selectors, a size n, a const matrix A with lda, and a
 * non-const vector x with incx. There are no scalar arguments.
 * NOTE(review): by BLAS naming this is the triangular matrix-vector product,
 * presumably overwriting x in place - confirm in cuBLAS docs.
 */
/* float variant */
CUBLASAPI cublasStatus_t CUBLASWINAPI cublasStrmv_v2(cublasHandle_t handle,
cublasFillMode_t uplo,
cublasOperation_t trans,
cublasDiagType_t diag,
int n,
const float* A,
int lda,
float* x,
int incx);
/* double variant */
CUBLASAPI cublasStatus_t CUBLASWINAPI cublasDtrmv_v2(cublasHandle_t handle,
cublasFillMode_t uplo,
cublasOperation_t trans,
cublasDiagType_t diag,
int n,
const double* A,
int lda,
double* x,
int incx);
/* cuComplex variant */
CUBLASAPI cublasStatus_t CUBLASWINAPI cublasCtrmv_v2(cublasHandle_t handle,
cublasFillMode_t uplo,
cublasOperation_t trans,
cublasDiagType_t diag,
int n,
const cuComplex* A,
int lda,
cuComplex* x,
int incx);
/* cuDoubleComplex variant */
CUBLASAPI cublasStatus_t CUBLASWINAPI cublasZtrmv_v2(cublasHandle_t handle,
cublasFillMode_t uplo,
cublasOperation_t trans,
cublasDiagType_t diag,
int n,
const cuDoubleComplex* A,
int lda,
cuDoubleComplex* x,
int incx);
/*
 * TBMV
 * Four entry points, one per element type (float, double, cuComplex,
 * cuDoubleComplex). Same parameter list as the TRMV declarations plus an
 * extra int k after n. A is const; x is the only non-const data pointer.
 * NOTE(review): by BLAS naming this is the triangular banded matrix-vector
 * product with k presumably the band width - confirm in cuBLAS docs.
 */
/* float variant */
CUBLASAPI cublasStatus_t CUBLASWINAPI cublasStbmv_v2(cublasHandle_t handle,
cublasFillMode_t uplo,
cublasOperation_t trans,
cublasDiagType_t diag,
int n,
int k,
const float* A,
int lda,
float* x,
int incx);
/* double variant */
CUBLASAPI cublasStatus_t CUBLASWINAPI cublasDtbmv_v2(cublasHandle_t handle,
cublasFillMode_t uplo,
cublasOperation_t trans,
cublasDiagType_t diag,
int n,
int k,
const double* A,
int lda,
double* x,
int incx);
/* cuComplex variant */
CUBLASAPI cublasStatus_t CUBLASWINAPI cublasCtbmv_v2(cublasHandle_t handle,
cublasFillMode_t uplo,
cublasOperation_t trans,
cublasDiagType_t diag,
int n,
int k,
const cuComplex* A,
int lda,
cuComplex* x,
int incx);
/* cuDoubleComplex variant */
CUBLASAPI cublasStatus_t CUBLASWINAPI cublasZtbmv_v2(cublasHandle_t handle,
cublasFillMode_t uplo,
cublasOperation_t trans,
cublasDiagType_t diag,
int n,
int k,
const cuDoubleComplex* A,
int lda,
cuDoubleComplex* x,
int incx);
/*
 * TPMV
 * Four entry points, one per element type (float, double, cuComplex,
 * cuDoubleComplex). The matrix is passed as a single const pointer AP with no
 * leading-dimension argument; x is the only non-const data pointer.
 * NOTE(review): by BLAS naming this is the triangular packed matrix-vector
 * product (AP presumably in packed storage) - confirm in cuBLAS docs.
 */
/* float variant */
CUBLASAPI cublasStatus_t CUBLASWINAPI cublasStpmv_v2(cublasHandle_t handle,
cublasFillMode_t uplo,
cublasOperation_t trans,
cublasDiagType_t diag,
int n,
const float* AP,
float* x,
int incx);
/* double variant */
CUBLASAPI cublasStatus_t CUBLASWINAPI cublasDtpmv_v2(cublasHandle_t handle,
cublasFillMode_t uplo,
cublasOperation_t trans,
cublasDiagType_t diag,
int n,
const double* AP,
double* x,
int incx);
/* cuComplex variant */
CUBLASAPI cublasStatus_t CUBLASWINAPI cublasCtpmv_v2(cublasHandle_t handle,
cublasFillMode_t uplo,
cublasOperation_t trans,
cublasDiagType_t diag,
int n,
const cuComplex* AP,
cuComplex* x,
int incx);
/* cuDoubleComplex variant */
CUBLASAPI cublasStatus_t CUBLASWINAPI cublasZtpmv_v2(cublasHandle_t handle,
cublasFillMode_t uplo,
cublasOperation_t trans,
cublasDiagType_t diag,
int n,
const cuDoubleComplex* AP,
cuDoubleComplex* x,
int incx);
/*
 * TRSV
 * Four entry points, one per element type (float, double, cuComplex,
 * cuDoubleComplex). The parameter list is identical to the TRMV declarations:
 * uplo/trans/diag selectors, n, const A with lda, non-const x with incx.
 * NOTE(review): by BLAS naming this is the triangular solve with a single
 * right-hand side, presumably overwriting x with the solution - confirm in
 * cuBLAS docs.
 */
/* float variant */
CUBLASAPI cublasStatus_t CUBLASWINAPI cublasStrsv_v2(cublasHandle_t handle,
cublasFillMode_t uplo,
cublasOperation_t trans,
cublasDiagType_t diag,
int n,
const float* A,
int lda,
float* x,
int incx);
/* double variant */
CUBLASAPI cublasStatus_t CUBLASWINAPI cublasDtrsv_v2(cublasHandle_t handle,
cublasFillMode_t uplo,
cublasOperation_t trans,
cublasDiagType_t diag,
int n,
const double* A,
int lda,
double* x,
int incx);
/* cuComplex variant */
CUBLASAPI cublasStatus_t CUBLASWINAPI cublasCtrsv_v2(cublasHandle_t handle,
cublasFillMode_t uplo,
cublasOperation_t trans,
cublasDiagType_t diag,
int n,
const cuComplex* A,
int lda,
cuComplex* x,
int incx);
/* cuDoubleComplex variant */
CUBLASAPI cublasStatus_t CUBLASWINAPI cublasZtrsv_v2(cublasHandle_t handle,
cublasFillMode_t uplo,
cublasOperation_t trans,
cublasDiagType_t diag,
int n,
const cuDoubleComplex* A,
int lda,
cuDoubleComplex* x,
int incx);
/*
 * TPSV
 * Four entry points, one per element type (float, double, cuComplex,
 * cuDoubleComplex). The parameter list is identical to the TPMV declarations:
 * uplo/trans/diag selectors, n, const AP (no leading dimension), non-const x
 * with incx.
 * NOTE(review): by BLAS naming this is the triangular packed solve,
 * presumably overwriting x with the solution - confirm in cuBLAS docs.
 */
/* float variant */
CUBLASAPI cublasStatus_t CUBLASWINAPI cublasStpsv_v2(cublasHandle_t handle,
cublasFillMode_t uplo,
cublasOperation_t trans,
cublasDiagType_t diag,
int n,
const float* AP,
float* x,
int incx);
/* double variant */
CUBLASAPI cublasStatus_t CUBLASWINAPI cublasDtpsv_v2(cublasHandle_t handle,
cublasFillMode_t uplo,
cublasOperation_t trans,
cublasDiagType_t diag,
int n,
const double* AP,
double* x,
int incx);
/* cuComplex variant */
CUBLASAPI cublasStatus_t CUBLASWINAPI cublasCtpsv_v2(cublasHandle_t handle,
cublasFillMode_t uplo,
cublasOperation_t trans,
cublasDiagType_t diag,
int n,
const cuComplex* AP,
cuComplex* x,
int incx);
/* cuDoubleComplex variant */
CUBLASAPI cublasStatus_t CUBLASWINAPI cublasZtpsv_v2(cublasHandle_t handle,
cublasFillMode_t uplo,
cublasOperation_t trans,
cublasDiagType_t diag,
int n,
const cuDoubleComplex* AP,
cuDoubleComplex* x,
int incx);
/*
 * TBSV
 * Four entry points, one per element type (float, double, cuComplex,
 * cuDoubleComplex). The parameter list is identical to the TBMV declarations:
 * uplo/trans/diag selectors, n, k, const A with lda, non-const x with incx.
 * NOTE(review): by BLAS naming this is the triangular banded solve,
 * presumably overwriting x with the solution - confirm in cuBLAS docs.
 */
/* float variant */
CUBLASAPI cublasStatus_t CUBLASWINAPI cublasStbsv_v2(cublasHandle_t handle,
cublasFillMode_t uplo,
cublasOperation_t trans,
cublasDiagType_t diag,
int n,
int k,
const float* A,
int lda,
float* x,
int incx);
/* double variant */
CUBLASAPI cublasStatus_t CUBLASWINAPI cublasDtbsv_v2(cublasHandle_t handle,
cublasFillMode_t uplo,
cublasOperation_t trans,
cublasDiagType_t diag,
int n,
int k,
const double* A,
int lda,
double* x,
int incx);
/* cuComplex variant */
CUBLASAPI cublasStatus_t CUBLASWINAPI cublasCtbsv_v2(cublasHandle_t handle,
cublasFillMode_t uplo,
cublasOperation_t trans,
cublasDiagType_t diag,
int n,
int k,
const cuComplex* A,
int lda,
cuComplex* x,
int incx);
/* cuDoubleComplex variant */
CUBLASAPI cublasStatus_t CUBLASWINAPI cublasZtbsv_v2(cublasHandle_t handle,
cublasFillMode_t uplo,
cublasOperation_t trans,
cublasDiagType_t diag,
int n,
int k,
const cuDoubleComplex* A,
int lda,
cuDoubleComplex* x,
int incx);
/*
 * SYMV/HEMV
 * Six entry points sharing one parameter list: symv for float, double,
 * cuComplex and cuDoubleComplex, and hemv for cuComplex and cuDoubleComplex.
 * Each takes a fill-mode selector (uplo), a size n, const A with lda, const x
 * with incx, and non-const y with incy. alpha and beta are annotated as
 * host-or-device pointers and have the same type as the matrix elements.
 * NOTE(review): by BLAS naming these are the symmetric / Hermitian
 * matrix-vector products - confirm semantics in cuBLAS docs.
 */
/* symv, float variant */
CUBLASAPI cublasStatus_t CUBLASWINAPI cublasSsymv_v2(cublasHandle_t handle,
cublasFillMode_t uplo,
int n,
const float* alpha, /* host or device pointer */
const float* A,
int lda,
const float* x,
int incx,
const float* beta, /* host or device pointer */
float* y,
int incy);
/* symv, double variant */
CUBLASAPI cublasStatus_t CUBLASWINAPI cublasDsymv_v2(cublasHandle_t handle,
cublasFillMode_t uplo,
int n,
const double* alpha, /* host or device pointer */
const double* A,
int lda,
const double* x,
int incx,
const double* beta, /* host or device pointer */
double* y,
int incy);
/* symv, cuComplex variant */
CUBLASAPI cublasStatus_t CUBLASWINAPI cublasCsymv_v2(cublasHandle_t handle,
cublasFillMode_t uplo,
int n,
const cuComplex* alpha, /* host or device pointer */
const cuComplex* A,
int lda,
const cuComplex* x,
int incx,
const cuComplex* beta, /* host or device pointer */
cuComplex* y,
int incy);
/* symv, cuDoubleComplex variant */
CUBLASAPI cublasStatus_t CUBLASWINAPI cublasZsymv_v2(cublasHandle_t handle,
cublasFillMode_t uplo,
int n,
const cuDoubleComplex* alpha, /* host or device pointer */
const cuDoubleComplex* A,
int lda,
const cuDoubleComplex* x,
int incx,
const cuDoubleComplex* beta, /* host or device pointer */
cuDoubleComplex* y,
int incy);
/* hemv, cuComplex variant */
CUBLASAPI cublasStatus_t CUBLASWINAPI cublasChemv_v2(cublasHandle_t handle,
cublasFillMode_t uplo,
int n,
const cuComplex* alpha, /* host or device pointer */
const cuComplex* A,
int lda,
const cuComplex* x,
int incx,
const cuComplex* beta, /* host or device pointer */
cuComplex* y,
int incy);
/* hemv, cuDoubleComplex variant */
CUBLASAPI cublasStatus_t CUBLASWINAPI cublasZhemv_v2(cublasHandle_t handle,
cublasFillMode_t uplo,
int n,
const cuDoubleComplex* alpha, /* host or device pointer */
const cuDoubleComplex* A,
int lda,
const cuDoubleComplex* x,
int incx,
const cuDoubleComplex* beta, /* host or device pointer */
cuDoubleComplex* y,
int incy);
/*
 * SBMV/HBMV
 * Four entry points sharing one parameter list: sbmv for float and double,
 * hbmv for cuComplex and cuDoubleComplex. Compared with the SYMV/HEMV
 * declarations there is one extra int argument, k, after n.
 * alpha and beta are annotated as host-or-device pointers; y is the only
 * non-const data pointer.
 * NOTE(review): by BLAS naming these are the symmetric / Hermitian banded
 * matrix-vector products - confirm semantics in cuBLAS docs.
 */
/* sbmv, float variant */
CUBLASAPI cublasStatus_t CUBLASWINAPI cublasSsbmv_v2(cublasHandle_t handle,
cublasFillMode_t uplo,
int n,
int k,
const float* alpha, /* host or device pointer */
const float* A,
int lda,
const float* x,
int incx,
const float* beta, /* host or device pointer */
float* y,
int incy);
/* sbmv, double variant */
CUBLASAPI cublasStatus_t CUBLASWINAPI cublasDsbmv_v2(cublasHandle_t handle,
cublasFillMode_t uplo,
int n,
int k,
const double* alpha, /* host or device pointer */
const double* A,
int lda,
const double* x,
int incx,
const double* beta, /* host or device pointer */
double* y,
int incy);
/* hbmv, cuComplex variant */
CUBLASAPI cublasStatus_t CUBLASWINAPI cublasChbmv_v2(cublasHandle_t handle,
cublasFillMode_t uplo,
int n,
int k,
const cuComplex* alpha, /* host or device pointer */
const cuComplex* A,
int lda,
const cuComplex* x,
int incx,
const cuComplex* beta, /* host or device pointer */
cuComplex* y,
int incy);
/* hbmv, cuDoubleComplex variant */
CUBLASAPI cublasStatus_t CUBLASWINAPI cublasZhbmv_v2(cublasHandle_t handle,
cublasFillMode_t uplo,
int n,
int k,
const cuDoubleComplex* alpha, /* host or device pointer */
const cuDoubleComplex* A,
int lda,
const cuDoubleComplex* x,
int incx,
const cuDoubleComplex* beta, /* host or device pointer */
cuDoubleComplex* y,
int incy);
/*
 * SPMV/HPMV
 * Four entry points sharing one parameter list: spmv for float and double,
 * hpmv for cuComplex and cuDoubleComplex. The matrix is passed as a single
 * const pointer AP with no leading-dimension argument.
 * alpha and beta are annotated as host-or-device pointers; y is the only
 * non-const data pointer.
 * NOTE(review): by BLAS naming these are the symmetric / Hermitian packed
 * matrix-vector products - confirm semantics in cuBLAS docs.
 */
/* spmv, float variant */
CUBLASAPI cublasStatus_t CUBLASWINAPI cublasSspmv_v2(cublasHandle_t handle,
cublasFillMode_t uplo,
int n,
const float* alpha, /* host or device pointer */
const float* AP,
const float* x,
int incx,
const float* beta, /* host or device pointer */
float* y,
int incy);
/* spmv, double variant */
CUBLASAPI cublasStatus_t CUBLASWINAPI cublasDspmv_v2(cublasHandle_t handle,
cublasFillMode_t uplo,
int n,
const double* alpha, /* host or device pointer */
const double* AP,
const double* x,
int incx,
const double* beta, /* host or device pointer */
double* y,
int incy);
/* hpmv, cuComplex variant */
CUBLASAPI cublasStatus_t CUBLASWINAPI cublasChpmv_v2(cublasHandle_t handle,
cublasFillMode_t uplo,
int n,
const cuComplex* alpha, /* host or device pointer */
const cuComplex* AP,
const cuComplex* x,
int incx,
const cuComplex* beta, /* host or device pointer */
cuComplex* y,
int incy);
/* hpmv, cuDoubleComplex variant */
CUBLASAPI cublasStatus_t CUBLASWINAPI cublasZhpmv_v2(cublasHandle_t handle,
cublasFillMode_t uplo,
int n,
const cuDoubleComplex* alpha, /* host or device pointer */
const cuDoubleComplex* AP,
const cuDoubleComplex* x,
int incx,
const cuDoubleComplex* beta, /* host or device pointer */
cuDoubleComplex* y,
int incy);
/*
 * GER
 * Six entry points sharing one parameter list: ger for float and double, and
 * geru / gerc for cuComplex and cuDoubleComplex. Each takes dimensions m and
 * n, a scalar alpha (annotated as host-or-device pointer), const vectors x
 * and y with their increments, and a non-const matrix A with lda.
 * NOTE(review): by BLAS naming this is the rank-1 update of A, with the
 * u / c suffixes presumably selecting unconjugated vs conjugated y - confirm
 * in cuBLAS docs.
 */
/* ger, float variant */
CUBLASAPI cublasStatus_t CUBLASWINAPI cublasSger_v2(cublasHandle_t handle,
int m,
int n,
const float* alpha, /* host or device pointer */
const float* x,
int incx,
const float* y,
int incy,
float* A,
int lda);
/* ger, double variant */
CUBLASAPI cublasStatus_t CUBLASWINAPI cublasDger_v2(cublasHandle_t handle,
int m,
int n,
const double* alpha, /* host or device pointer */
const double* x,
int incx,
const double* y,
int incy,
double* A,
int lda);
/* geru, cuComplex variant */
CUBLASAPI cublasStatus_t CUBLASWINAPI cublasCgeru_v2(cublasHandle_t handle,
int m,
int n,
const cuComplex* alpha, /* host or device pointer */
const cuComplex* x,
int incx,
const cuComplex* y,
int incy,
cuComplex* A,
int lda);
/* gerc, cuComplex variant */
CUBLASAPI cublasStatus_t CUBLASWINAPI cublasCgerc_v2(cublasHandle_t handle,
int m,
int n,
const cuComplex* alpha, /* host or device pointer */
const cuComplex* x,
int incx,
const cuComplex* y,
int incy,
cuComplex* A,
int lda);
/* geru, cuDoubleComplex variant */
CUBLASAPI cublasStatus_t CUBLASWINAPI cublasZgeru_v2(cublasHandle_t handle,
int m,
int n,
const cuDoubleComplex* alpha, /* host or device pointer */
const cuDoubleComplex* x,
int incx,
const cuDoubleComplex* y,
int incy,
cuDoubleComplex* A,
int lda);
/* gerc, cuDoubleComplex variant */
CUBLASAPI cublasStatus_t CUBLASWINAPI cublasZgerc_v2(cublasHandle_t handle,
int m,
int n,
const cuDoubleComplex* alpha, /* host or device pointer */
const cuDoubleComplex* x,
int incx,
const cuDoubleComplex* y,
int incy,
cuDoubleComplex* A,
int lda);
/*
 * SYR/HER
 * Six entry points: syr for float, double, cuComplex and cuDoubleComplex,
 * and her for cuComplex and cuDoubleComplex. Each takes a fill-mode selector
 * (uplo), a size n, a scalar alpha (annotated as host-or-device pointer), a
 * const vector x with incx, and a non-const matrix A with lda.
 * For the syr variants alpha has the element type; for the her variants alpha
 * is a real scalar (float for Cher, double for Zher) while x and A are
 * complex.
 * NOTE(review): by BLAS naming these are the symmetric / Hermitian rank-1
 * updates - confirm semantics in cuBLAS docs.
 */
/* syr, float variant */
CUBLASAPI cublasStatus_t CUBLASWINAPI cublasSsyr_v2(cublasHandle_t handle,
cublasFillMode_t uplo,
int n,
const float* alpha, /* host or device pointer */
const float* x,
int incx,
float* A,
int lda);
/* syr, double variant */
CUBLASAPI cublasStatus_t CUBLASWINAPI cublasDsyr_v2(cublasHandle_t handle,
cublasFillMode_t uplo,
int n,
const double* alpha, /* host or device pointer */
const double* x,
int incx,
double* A,
int lda);
/* syr, cuComplex variant */
CUBLASAPI cublasStatus_t CUBLASWINAPI cublasCsyr_v2(cublasHandle_t handle,
cublasFillMode_t uplo,
int n,
const cuComplex* alpha, /* host or device pointer */
const cuComplex* x,
int incx,
cuComplex* A,
int lda);
/* syr, cuDoubleComplex variant */
CUBLASAPI cublasStatus_t CUBLASWINAPI cublasZsyr_v2(cublasHandle_t handle,
cublasFillMode_t uplo,
int n,
const cuDoubleComplex* alpha, /* host or device pointer */
const cuDoubleComplex* x,
int incx,
cuDoubleComplex* A,
int lda);
/* her, cuComplex variant; note alpha is float, not cuComplex */
CUBLASAPI cublasStatus_t CUBLASWINAPI cublasCher_v2(cublasHandle_t handle,
cublasFillMode_t uplo,
int n,
const float* alpha, /* host or device pointer */
const cuComplex* x,
int incx,
cuComplex* A,
int lda);
/* her, cuDoubleComplex variant; note alpha is double, not cuDoubleComplex */
CUBLASAPI cublasStatus_t CUBLASWINAPI cublasZher_v2(cublasHandle_t handle,
cublasFillMode_t uplo,
int n,
const double* alpha, /* host or device pointer */
const cuDoubleComplex* x,
int incx,
cuDoubleComplex* A,
int lda);
/*
 * SPR/HPR
 * Four entry points: spr for float and double, hpr for cuComplex and
 * cuDoubleComplex. Each takes a fill-mode selector (uplo), a size n, a scalar
 * alpha (annotated as host-or-device pointer), a const vector x with incx,
 * and a non-const matrix pointer AP with no leading-dimension argument.
 * For the hpr variants alpha is a real scalar (float for Chpr, double for
 * Zhpr) while x and AP are complex.
 * NOTE(review): by BLAS naming these are the packed symmetric / Hermitian
 * rank-1 updates - confirm semantics in cuBLAS docs.
 */
/* spr, float variant */
CUBLASAPI cublasStatus_t CUBLASWINAPI cublasSspr_v2(cublasHandle_t handle,
cublasFillMode_t uplo,
int n,
const float* alpha, /* host or device pointer */
const float* x,
int incx,
float* AP);
/* spr, double variant */
CUBLASAPI cublasStatus_t CUBLASWINAPI cublasDspr_v2(cublasHandle_t handle,
cublasFillMode_t uplo,
int n,
const double* alpha, /* host or device pointer */
const double* x,
int incx,
double* AP);
/* hpr, cuComplex variant; note alpha is float, not cuComplex */
CUBLASAPI cublasStatus_t CUBLASWINAPI cublasChpr_v2(cublasHandle_t handle,
cublasFillMode_t uplo,
int n,
const float* alpha, /* host or device pointer */
const cuComplex* x,
int incx,
cuComplex* AP);
/* hpr, cuDoubleComplex variant; note alpha is double, not cuDoubleComplex */
CUBLASAPI cublasStatus_t CUBLASWINAPI cublasZhpr_v2(cublasHandle_t handle,
cublasFillMode_t uplo,
int n,
const double* alpha, /* host or device pointer */
const cuDoubleComplex* x,
int incx,
cuDoubleComplex* AP);
/*
 * SYR2/HER2
 * Six entry points sharing one parameter list: syr2 for float, double,
 * cuComplex and cuDoubleComplex, and her2 for cuComplex and cuDoubleComplex.
 * Each takes a fill-mode selector (uplo), a size n, a scalar alpha (annotated
 * as host-or-device pointer), const vectors x and y with their increments,
 * and a non-const matrix A with lda. Unlike the her declarations, alpha here
 * always has the element type, including for her2.
 * NOTE(review): by BLAS naming these are the symmetric / Hermitian rank-2
 * updates - confirm semantics in cuBLAS docs.
 */
/* syr2, float variant */
CUBLASAPI cublasStatus_t CUBLASWINAPI cublasSsyr2_v2(cublasHandle_t handle,
cublasFillMode_t uplo,
int n,
const float* alpha, /* host or device pointer */
const float* x,
int incx,
const float* y,
int incy,
float* A,
int lda);
/* syr2, double variant */
CUBLASAPI cublasStatus_t CUBLASWINAPI cublasDsyr2_v2(cublasHandle_t handle,
cublasFillMode_t uplo,
int n,
const double* alpha, /* host or device pointer */
const double* x,
int incx,
const double* y,
int incy,
double* A,
int lda);
/* syr2, cuComplex variant */
CUBLASAPI cublasStatus_t CUBLASWINAPI cublasCsyr2_v2(cublasHandle_t handle,
cublasFillMode_t uplo,
int n,
const cuComplex* alpha, /* host or device pointer */
const cuComplex* x,
int incx,
const cuComplex* y,
int incy,
cuComplex* A,
int lda);
/* syr2, cuDoubleComplex variant */
CUBLASAPI cublasStatus_t CUBLASWINAPI cublasZsyr2_v2(cublasHandle_t handle,
cublasFillMode_t uplo,
int n,
const cuDoubleComplex* alpha, /* host or device pointer */
const cuDoubleComplex* x,
int incx,
const cuDoubleComplex* y,
int incy,
cuDoubleComplex* A,
int lda);
/* her2, cuComplex variant */
CUBLASAPI cublasStatus_t CUBLASWINAPI cublasCher2_v2(cublasHandle_t handle,
cublasFillMode_t uplo,
int n,
const cuComplex* alpha, /* host or device pointer */
const cuComplex* x,
int incx,
const cuComplex* y,
int incy,
cuComplex* A,
int lda);
/* her2, cuDoubleComplex variant */
CUBLASAPI cublasStatus_t CUBLASWINAPI cublasZher2_v2(cublasHandle_t handle,
cublasFillMode_t uplo,
int n,
const cuDoubleComplex* alpha, /* host or device pointer */
const cuDoubleComplex* x,
int incx,
const cuDoubleComplex* y,
int incy,
cuDoubleComplex* A,
int lda);
/*
 * SPR2/HPR2
 * Four entry points sharing one parameter list: spr2 for float and double,
 * hpr2 for cuComplex and cuDoubleComplex. Each takes a fill-mode selector
 * (uplo), a size n, a scalar alpha (annotated as host-or-device pointer),
 * const vectors x and y with their increments, and a non-const matrix
 * pointer AP with no leading-dimension argument. alpha always has the
 * element type.
 * NOTE(review): by BLAS naming these are the packed symmetric / Hermitian
 * rank-2 updates - confirm semantics in cuBLAS docs.
 */
/* spr2, float variant */
CUBLASAPI cublasStatus_t CUBLASWINAPI cublasSspr2_v2(cublasHandle_t handle,
cublasFillMode_t uplo,
int n,
const float* alpha, /* host or device pointer */
const float* x,
int incx,
const float* y,
int incy,
float* AP);
/* spr2, double variant */
CUBLASAPI cublasStatus_t CUBLASWINAPI cublasDspr2_v2(cublasHandle_t handle,
cublasFillMode_t uplo,
int n,
const double* alpha, /* host or device pointer */
const double* x,
int incx,
const double* y,
int incy,
double* AP);
/* hpr2, cuComplex variant */
CUBLASAPI cublasStatus_t CUBLASWINAPI cublasChpr2_v2(cublasHandle_t handle,
cublasFillMode_t uplo,
int n,
const cuComplex* alpha, /* host or device pointer */
const cuComplex* x,
int incx,
const cuComplex* y,
int incy,
cuComplex* AP);
/* hpr2, cuDoubleComplex variant */
CUBLASAPI cublasStatus_t CUBLASWINAPI cublasZhpr2_v2(cublasHandle_t handle,
cublasFillMode_t uplo,
int n,
const cuDoubleComplex* alpha, /* host or device pointer */
const cuDoubleComplex* x,
int incx,
const cuDoubleComplex* y,
int incy,
cuDoubleComplex* AP);
/*
 * BATCH GEMV
 * Four entry points, one per element type (float, double, cuComplex,
 * cuDoubleComplex). The parameter list mirrors the GEMV declarations, except
 * that A, x and y are replaced by arrays of pointers (Aarray, xarray, yarray;
 * the arrays themselves are const) and a trailing int batchCount is added.
 * A single lda / incx / incy and a single alpha / beta are passed for the
 * whole call; alpha and beta are annotated as host-or-device pointers.
 * NOTE(review): whether the pointer arrays must reside in device memory is
 * not stated in this header - confirm in cuBLAS docs.
 */
/* float variant */
CUBLASAPI cublasStatus_t CUBLASWINAPI cublasSgemvBatched(cublasHandle_t handle,
cublasOperation_t trans,
int m,
int n,
const float* alpha, /* host or device pointer */
const float* const Aarray[],
int lda,
const float* const xarray[],
int incx,
const float* beta, /* host or device pointer */
float* const yarray[],
int incy,
int batchCount);
/* double variant */
CUBLASAPI cublasStatus_t CUBLASWINAPI cublasDgemvBatched(cublasHandle_t handle,
cublasOperation_t trans,
int m,
int n,
const double* alpha, /* host or device pointer */
const double* const Aarray[],
int lda,
const double* const xarray[],
int incx,
const double* beta, /* host or device pointer */
double* const yarray[],
int incy,
int batchCount);
/* cuComplex variant */
CUBLASAPI cublasStatus_t CUBLASWINAPI cublasCgemvBatched(cublasHandle_t handle,
cublasOperation_t trans,
int m,
int n,
const cuComplex* alpha, /* host or device pointer */
const cuComplex* const Aarray[],
int lda,
const cuComplex* const xarray[],
int incx,
const cuComplex* beta, /* host or device pointer */
cuComplex* const yarray[],
int incy,
int batchCount);
/* cuDoubleComplex variant */
CUBLASAPI cublasStatus_t CUBLASWINAPI cublasZgemvBatched(cublasHandle_t handle,
cublasOperation_t trans,
int m,
int n,
const cuDoubleComplex* alpha, /* host or device pointer */
const cuDoubleComplex* const Aarray[],
int lda,
const cuDoubleComplex* const xarray[],
int incx,
const cuDoubleComplex* beta, /* host or device pointer */
cuDoubleComplex* const yarray[],
int incy,
int batchCount);
/*
 * Batched GEMV declarations that use __half / __nv_bfloat16 operands.
 * They are only declared when __cplusplus is defined.
 * In all four, alpha and beta are float (annotated as host-or-device
 * pointers); the variants differ in the input and output element types:
 *   HSH: __half inputs (Aarray, xarray),        __half output (yarray)
 *   HSS: __half inputs,                         float output
 *   TST: __nv_bfloat16 inputs,                  __nv_bfloat16 output
 *   TSS: __nv_bfloat16 inputs,                  float output
 * NOTE(review): the C++-only guard is presumably because these types are
 * not usable from plain C - confirm against cuda_fp16.h / cuda_bf16.h.
 */
#if defined(__cplusplus)
/* HSH: __half in, __half out, float scalars */
CUBLASAPI cublasStatus_t CUBLASWINAPI cublasHSHgemvBatched(cublasHandle_t handle,
cublasOperation_t trans,
int m,
int n,
const float* alpha, /* host or device pointer */
const __half* const Aarray[],
int lda,
const __half* const xarray[],
int incx,
const float* beta, /* host or device pointer */
__half* const yarray[],
int incy,
int batchCount);
/* HSS: __half in, float out, float scalars */
CUBLASAPI cublasStatus_t CUBLASWINAPI cublasHSSgemvBatched(cublasHandle_t handle,
cublasOperation_t trans,
int m,
int n,
const float* alpha, /* host or device pointer */
const __half* const Aarray[],
int lda,
const __half* const xarray[],
int incx,
const float* beta, /* host or device pointer */
float* const yarray[],
int incy,
int batchCount);
/* TST: __nv_bfloat16 in, __nv_bfloat16 out, float scalars */
CUBLASAPI cublasStatus_t CUBLASWINAPI cublasTSTgemvBatched(cublasHandle_t handle,
cublasOperation_t trans,
int m,
int n,
const float* alpha, /* host or device pointer */
const __nv_bfloat16* const Aarray[],
int lda,
const __nv_bfloat16* const xarray[],
int incx,
const float* beta, /* host or device pointer */
__nv_bfloat16* const yarray[],
int incy,
int batchCount);
/* TSS: __nv_bfloat16 in, float out, float scalars */
CUBLASAPI cublasStatus_t CUBLASWINAPI cublasTSSgemvBatched(cublasHandle_t handle,
cublasOperation_t trans,
int m,
int n,
const float* alpha, /* host or device pointer */
const __nv_bfloat16* const Aarray[],
int lda,
const __nv_bfloat16* const xarray[],
int incx,
const float* beta, /* host or device pointer */
float* const yarray[],
int incy,
int batchCount);
#endif
/*
 * Strided-batched GEMV declarations, one per element type (float, double,
 * cuComplex, cuDoubleComplex).
 * The parameter list mirrors the GEMV declarations, with a long long int
 * stride following each of A/lda, x/incx and y/incy, and a trailing int
 * batchCount. strideA is explicitly annotated as purposely signed.
 * alpha and beta are annotated as host-or-device pointers; y is the only
 * non-const data pointer.
 * NOTE(review): the strides are presumably element offsets between
 * consecutive batch members - units are not stated here; confirm in cuBLAS
 * docs.
 */
/* float variant */
CUBLASAPI cublasStatus_t CUBLASWINAPI cublasSgemvStridedBatched(cublasHandle_t handle,
cublasOperation_t trans,
int m,
int n,
const float* alpha, /* host or device pointer */
const float* A,
int lda,
long long int strideA, /* purposely signed */
const float* x,
int incx,
long long int stridex,
const float* beta, /* host or device pointer */
float* y,
int incy,
long long int stridey,
int batchCount);
/* double variant */
CUBLASAPI cublasStatus_t CUBLASWINAPI cublasDgemvStridedBatched(cublasHandle_t handle,
cublasOperation_t trans,
int m,
int n,
const double* alpha, /* host or device pointer */
const double* A,
int lda,
long long int strideA, /* purposely signed */
const double* x,
int incx,
long long int stridex,
const double* beta, /* host or device pointer */
double* y,
int incy,
long long int stridey,
int batchCount);
/* cuComplex variant */
CUBLASAPI cublasStatus_t CUBLASWINAPI cublasCgemvStridedBatched(cublasHandle_t handle,
cublasOperation_t trans,
int m,
int n,
const cuComplex* alpha, /* host or device pointer */
const cuComplex* A,
int lda,
long long int strideA, /* purposely signed */
const cuComplex* x,
int incx,
long long int stridex,
const cuComplex* beta, /* host or device pointer */
cuComplex* y,
int incy,
long long int stridey,
int batchCount);
/* cuDoubleComplex variant */
CUBLASAPI cublasStatus_t CUBLASWINAPI
cublasZgemvStridedBatched(cublasHandle_t handle,
cublasOperation_t trans,
int m,
int n,
const cuDoubleComplex* alpha, /* host or device pointer */
const cuDoubleComplex* A,
int lda,
long long int strideA, /* purposely signed */
const cuDoubleComplex* x,
int incx,
long long int stridex,
const cuDoubleComplex* beta, /* host or device pointer */
cuDoubleComplex* y,
int incy,
long long int stridey,
int batchCount);
/*
 * Strided-batched GEMV declarations that use __half / __nv_bfloat16
 * operands. They are only declared when __cplusplus is defined.
 * In all four, alpha and beta are float (annotated as host-or-device
 * pointers) and the strides are long long int; the variants differ in the
 * input and output element types:
 *   HSH: __half inputs (A, x),        __half output (y)
 *   HSS: __half inputs,               float output
 *   TST: __nv_bfloat16 inputs,        __nv_bfloat16 output
 *   TSS: __nv_bfloat16 inputs,        float output
 * NOTE(review): the C++-only guard is presumably because these types are
 * not usable from plain C - confirm against cuda_fp16.h / cuda_bf16.h.
 */
#if defined(__cplusplus)
/* HSH: __half in, __half out, float scalars */
CUBLASAPI cublasStatus_t CUBLASWINAPI cublasHSHgemvStridedBatched(cublasHandle_t handle,
cublasOperation_t trans,
int m,
int n,
const float* alpha, /* host or device pointer */
const __half* A,
int lda,
long long int strideA, /* purposely signed */
const __half* x,
int incx,
long long int stridex,
const float* beta, /* host or device pointer */
__half* y,
int incy,
long long int stridey,
int batchCount);
/* HSS: __half in, float out, float scalars */
CUBLASAPI cublasStatus_t CUBLASWINAPI cublasHSSgemvStridedBatched(cublasHandle_t handle,
cublasOperation_t trans,
int m,
int n,
const float* alpha, /* host or device pointer */
const __half* A,
int lda,
long long int strideA, /* purposely signed */
const __half* x,
int incx,
long long int stridex,
const float* beta, /* host or device pointer */
float* y,
int incy,
long long int stridey,
int batchCount);
/* TST: __nv_bfloat16 in, __nv_bfloat16 out, float scalars */
CUBLASAPI cublasStatus_t CUBLASWINAPI cublasTSTgemvStridedBatched(cublasHandle_t handle,
cublasOperation_t trans,
int m,
int n,
const float* alpha, /* host or device pointer */
const __nv_bfloat16* A,
int lda,
long long int strideA, /* purposely signed */
const __nv_bfloat16* x,
int incx,
long long int stridex,
const float* beta, /* host or device pointer */
__nv_bfloat16* y,
int incy,
long long int stridey,
int batchCount);
/* TSS: __nv_bfloat16 in, float out, float scalars */
CUBLASAPI cublasStatus_t CUBLASWINAPI cublasTSSgemvStridedBatched(cublasHandle_t handle,
cublasOperation_t trans,
int m,
int n,
const float* alpha, /* host or device pointer */
const __nv_bfloat16* A,
int lda,
long long int strideA, /* purposely signed */
const __nv_bfloat16* x,
int incx,
long long int stridex,
const float* beta, /* host or device pointer */
float* y,
int incy,
long long int stridey,
int batchCount);
#endif
/* ---------------- CUBLAS BLAS3 functions ---------------- */
/*
 * GEMM
 * Entry points for float, double and cuComplex with identical parameter
 * lists: two operation selectors (transa, transb), dimensions m, n and k,
 * const matrices A and B with lda / ldb, and a non-const matrix C with ldc.
 * alpha and beta are annotated as host-or-device pointers.
 * NOTE(review): by BLAS naming this is the general matrix-matrix product
 * (presumably C = alpha * op(A) * op(B) + beta * C); the header itself does
 * not state the operation - confirm in cuBLAS docs.
 */
/* float variant */
CUBLASAPI cublasStatus_t CUBLASWINAPI cublasSgemm_v2(cublasHandle_t handle,
cublasOperation_t transa,
cublasOperation_t transb,
int m,
int n,
int k,
const float* alpha, /* host or device pointer */
const float* A,
int lda,
const float* B,
int ldb,
const float* beta, /* host or device pointer */
float* C,
int ldc);
/* double variant */
CUBLASAPI cublasStatus_t CUBLASWINAPI cublasDgemm_v2(cublasHandle_t handle,
cublasOperation_t transa,
cublasOperation_t transb,
int m,
int n,
int k,
const double* alpha, /* host or device pointer */
const double* A,
int lda,
const double* B,
int ldb,
const double* beta, /* host or device pointer */
double* C,
int ldc);
/* cuComplex variant */
CUBLASAPI cublasStatus_t CUBLASWINAPI cublasCgemm_v2(cublasHandle_t handle,
cublasOperation_t transa,
cublasOperation_t transb,
int m,
int n,
int k,
const cuComplex* alpha, /* host or device pointer */
const cuComplex* A,
int lda,
const cuComplex* B,
int ldb,
const cuComplex* beta, /* host or device pointer */
cuComplex* C,
int ldc);
/*
 * cuComplex gemm3m entry point. The parameter list is identical to
 * cublasCgemm_v2: transa/transb selectors, m, n, k, const A and B with
 * lda/ldb, non-const C with ldc; alpha and beta are annotated as
 * host-or-device pointers.
 * NOTE(review): the "3m" suffix presumably denotes the three-multiplication
 * (Gaussian) complex algorithm - not stated here; confirm in cuBLAS docs.
 */
CUBLASAPI cublasStatus_t CUBLASWINAPI cublasCgemm3m(cublasHandle_t handle,
cublasOperation_t transa,
cublasOperation_t transb,
int m,
int n,
int k,
const cuComplex* alpha, /* host or device pointer */
const cuComplex* A,
int lda,
const cuComplex* B,
int ldb,
const cuComplex* beta, /* host or device pointer */
cuComplex* C,
int ldc);
/*
 * Extended cuComplex gemm3m entry point: A, B and C are passed as void
 * pointers, each followed by a cudaDataType tag (Atype, Btype, Ctype), while
 * alpha and beta remain cuComplex.
 * NOTE(review): alpha and beta carry no "host or device pointer" annotation
 * here, unlike the sibling gemm declarations - presumably the same rule
 * applies; confirm in cuBLAS docs before relying on it.
 */
CUBLASAPI cublasStatus_t CUBLASWINAPI cublasCgemm3mEx(cublasHandle_t handle,
cublasOperation_t transa,
cublasOperation_t transb,
int m,
int n,
int k,
const cuComplex* alpha,
const void* A,
cudaDataType Atype,
int lda,
const void* B,
cudaDataType Btype,
int ldb,
const cuComplex* beta,
void* C,
cudaDataType Ctype,
int ldc);
/*
 * cuDoubleComplex gemm entry point. Same parameter layout as the float,
 * double and cuComplex gemm declarations: transa/transb selectors, m, n, k,
 * const A and B with lda/ldb, non-const C with ldc; alpha and beta are
 * annotated as host-or-device pointers.
 */
CUBLASAPI cublasStatus_t CUBLASWINAPI cublasZgemm_v2(cublasHandle_t handle,
cublasOperation_t transa,
cublasOperation_t transb,
int m,
int n,
int k,
const cuDoubleComplex* alpha, /* host or device pointer */
const cuDoubleComplex* A,
int lda,
const cuDoubleComplex* B,
int ldb,
const cuDoubleComplex* beta, /* host or device pointer */
cuDoubleComplex* C,
int ldc);
/*
 * cuDoubleComplex gemm3m entry point. The parameter list is identical to
 * cublasZgemm_v2; alpha and beta are annotated as host-or-device pointers.
 * NOTE(review): the "3m" suffix presumably denotes the three-multiplication
 * (Gaussian) complex algorithm - not stated here; confirm in cuBLAS docs.
 */
CUBLASAPI cublasStatus_t CUBLASWINAPI cublasZgemm3m(cublasHandle_t handle,
cublasOperation_t transa,
cublasOperation_t transb,
int m,
int n,
int k,
const cuDoubleComplex* alpha, /* host or device pointer */
const cuDoubleComplex* A,
int lda,
const cuDoubleComplex* B,
int ldb,
const cuDoubleComplex* beta, /* host or device pointer */
cuDoubleComplex* C,
int ldc);
/*
 * __half gemm entry point, declared only when __cplusplus is defined.
 * Same parameter layout as the other gemm declarations, with every scalar
 * and matrix pointer typed as __half; alpha and beta are annotated as
 * host-or-device pointers.
 */
#if defined(__cplusplus)
CUBLASAPI cublasStatus_t CUBLASWINAPI cublasHgemm(cublasHandle_t handle,
cublasOperation_t transa,
cublasOperation_t transb,
int m,
int n,
int k,
const __half* alpha, /* host or device pointer */
const __half* A,
int lda,
const __half* B,
int ldb,
const __half* beta, /* host or device pointer */
__half* C,
int ldc);
#endif
/*
 * IO in FP16/FP32, computation in float.
 * Extended gemm: A, B and C are passed as void pointers, each followed by a
 * cudaDataType tag (Atype, Btype, Ctype); alpha and beta are float and are
 * annotated as host-or-device pointers.
 */
CUBLASAPI cublasStatus_t CUBLASWINAPI cublasSgemmEx(cublasHandle_t handle,
cublasOperation_t transa,
cublasOperation_t transb,
int m,
int n,
int k,
const float* alpha, /* host or device pointer */
const void* A,
cudaDataType Atype,
int lda,
const void* B,
cudaDataType Btype,
int ldb,
const float* beta, /* host or device pointer */
void* C,
cudaDataType Ctype,
int ldc);
/*
 * Fully type-erased gemm entry point.
 * alpha, beta, A, B and C are all void pointers; A, B and C are each
 * followed by a cudaDataType tag. Two further selectors are passed at the
 * end: computeType (cublasComputeType_t) and algo (cublasGemmAlgo_t).
 * alpha and beta are annotated as host-or-device pointers.
 * NOTE(review): the scalar type expected behind alpha/beta presumably
 * depends on computeType - not stated here; confirm in cuBLAS docs.
 */
CUBLASAPI cublasStatus_t CUBLASWINAPI cublasGemmEx(cublasHandle_t handle,
cublasOperation_t transa,
cublasOperation_t transb,
int m,
int n,
int k,
const void* alpha, /* host or device pointer */
const void* A,
cudaDataType Atype,
int lda,
const void* B,
cudaDataType Btype,
int ldb,
const void* beta, /* host or device pointer */
void* C,
cudaDataType Ctype,
int ldc,
cublasComputeType_t computeType,
cublasGemmAlgo_t algo);
/*
 * IO in Int8 complex/cuComplex, computation in cuComplex.
 * Extended gemm: A, B and C are passed as void pointers, each followed by a
 * cudaDataType tag (Atype, Btype, Ctype); alpha and beta are cuComplex.
 * NOTE(review): alpha and beta carry no "host or device pointer" annotation
 * here, unlike cublasSgemmEx / cublasGemmEx - presumably the same rule
 * applies; confirm in cuBLAS docs before relying on it.
 */
CUBLASAPI cublasStatus_t CUBLASWINAPI cublasCgemmEx(cublasHandle_t handle,
cublasOperation_t transa,
cublasOperation_t transb,
int m,
int n,
int k,
const cuComplex* alpha,
const void* A,
cudaDataType Atype,
int lda,
const void* B,
cudaDataType Btype,
int ldb,
const cuComplex* beta,
void* C,
cudaDataType Ctype,
int ldc);
/*
 * gemm-style entry point on unsigned char matrices.
 * Takes three operation selectors (transa, transb, transc), dimensions m, n
 * and k, const A and B and non-const C. Each matrix pointer is followed by
 * an int bias (A_bias, B_bias, C_bias) and its leading dimension; two more
 * ints, C_mult and C_shift, close the list. There are no alpha/beta scalars.
 * NOTE(review): how the biases, C_mult and C_shift enter the computation is
 * not described in this header - confirm in cuBLAS docs.
 */
CUBLASAPI cublasStatus_t CUBLASWINAPI cublasUint8gemmBias(cublasHandle_t handle,
cublasOperation_t transa,
cublasOperation_t transb,
cublasOperation_t transc,
int m,
int n,
int k,
const unsigned char* A,
int A_bias,
int lda,
const unsigned char* B,
int B_bias,
int ldb,
unsigned char* C,
int C_bias,
int ldc,
int C_mult,
int C_shift);
/*
 * SYRK
 * Four entry points, one per element type (float, double, cuComplex,
 * cuDoubleComplex). Each takes a fill-mode selector (uplo), an operation
 * selector (trans), sizes n and k, a const matrix A with lda and a non-const
 * matrix C with ldc. alpha and beta have the element type and are annotated
 * as host-or-device pointers.
 * NOTE(review): by BLAS naming this is the symmetric rank-k update of C -
 * confirm semantics in cuBLAS docs.
 */
/* float variant */
CUBLASAPI cublasStatus_t CUBLASWINAPI cublasSsyrk_v2(cublasHandle_t handle,
cublasFillMode_t uplo,
cublasOperation_t trans,
int n,
int k,
const float* alpha, /* host or device pointer */
const float* A,
int lda,
const float* beta, /* host or device pointer */
float* C,
int ldc);
/* double variant */
CUBLASAPI cublasStatus_t CUBLASWINAPI cublasDsyrk_v2(cublasHandle_t handle,
cublasFillMode_t uplo,
cublasOperation_t trans,
int n,
int k,
const double* alpha, /* host or device pointer */
const double* A,
int lda,
const double* beta, /* host or device pointer */
double* C,
int ldc);
/* cuComplex variant */
CUBLASAPI cublasStatus_t CUBLASWINAPI cublasCsyrk_v2(cublasHandle_t handle,
cublasFillMode_t uplo,
cublasOperation_t trans,
int n,
int k,
const cuComplex* alpha, /* host or device pointer */
const cuComplex* A,
int lda,
const cuComplex* beta, /* host or device pointer */
cuComplex* C,
int ldc);
/* cuDoubleComplex variant */
CUBLASAPI cublasStatus_t CUBLASWINAPI cublasZsyrk_v2(cublasHandle_t handle,
cublasFillMode_t uplo,
cublasOperation_t trans,
int n,
int k,
const cuDoubleComplex* alpha, /* host or device pointer */
const cuDoubleComplex* A,
int lda,
const cuDoubleComplex* beta, /* host or device pointer */
cuDoubleComplex* C,
int ldc);
/*
 * IO in Int8 complex/cuComplex, computation in cuComplex.
 * Extended syrk: A and C are passed as void pointers, each followed by a
 * cudaDataType tag (Atype, Ctype); alpha and beta are cuComplex and are
 * annotated as host-or-device pointers.
 */
CUBLASAPI cublasStatus_t CUBLASWINAPI cublasCsyrkEx(cublasHandle_t handle,
cublasFillMode_t uplo,
cublasOperation_t trans,
int n,
int k,
const cuComplex* alpha, /* host or device pointer */
const void* A,
cudaDataType Atype,
int lda,
const cuComplex* beta, /* host or device pointer */
void* C,
cudaDataType Ctype,
int ldc);
/*
 * IO in Int8 complex/cuComplex, computation in cuComplex, Gaussian math.
 * Same parameter list as cublasCsyrkEx: void-pointer A and C with
 * cudaDataType tags, cuComplex alpha and beta.
 * NOTE(review): alpha and beta carry no "host or device pointer" annotation
 * here, unlike cublasCsyrkEx - presumably the same rule applies; confirm in
 * cuBLAS docs before relying on it.
 */
CUBLASAPI cublasStatus_t CUBLASWINAPI cublasCsyrk3mEx(cublasHandle_t handle,
cublasFillMode_t uplo,
cublasOperation_t trans,
int n,
int k,
const cuComplex* alpha,
const void* A,
cudaDataType Atype,
int lda,
const cuComplex* beta,
void* C,
cudaDataType Ctype,
int ldc);
/*
 * HERK
 * Two entry points, for cuComplex and cuDoubleComplex matrices. The
 * parameter layout matches the SYRK declarations (uplo, trans, n, k, const A
 * with lda, non-const C with ldc), but alpha and beta are real scalars:
 * float for Cherk and double for Zherk. Both scalars are annotated as
 * host-or-device pointers.
 * NOTE(review): by BLAS naming this is the Hermitian rank-k update of C -
 * confirm semantics in cuBLAS docs.
 */
/* cuComplex matrices, float scalars */
CUBLASAPI cublasStatus_t CUBLASWINAPI cublasCherk_v2(cublasHandle_t handle,
cublasFillMode_t uplo,
cublasOperation_t trans,
int n,
int k,
const float* alpha, /* host or device pointer */
const cuComplex* A,
int lda,
const float* beta, /* host or device pointer */
cuComplex* C,
int ldc);
/* cuDoubleComplex matrices, double scalars */
CUBLASAPI cublasStatus_t CUBLASWINAPI cublasZherk_v2(cublasHandle_t handle,
cublasFillMode_t uplo,
cublasOperation_t trans,
int n,
int k,
const double* alpha, /* host or device pointer */
const cuDoubleComplex* A,
int lda,
const double* beta, /* host or device pointer */
cuDoubleComplex* C,
int ldc);
/*
 * IO in Int8 complex/cuComplex, computation in cuComplex.
 * Extended herk: A and C are passed as void pointers, each followed by a
 * cudaDataType tag (Atype, Ctype); alpha and beta are float and are
 * annotated as host-or-device pointers.
 */
CUBLASAPI cublasStatus_t CUBLASWINAPI cublasCherkEx(cublasHandle_t handle,
cublasFillMode_t uplo,
cublasOperation_t trans,
int n,
int k,
const float* alpha, /* host or device pointer */
const void* A,
cudaDataType Atype,
int lda,
const float* beta, /* host or device pointer */
void* C,
cudaDataType Ctype,
int ldc);
/*
 * IO in Int8 complex/cuComplex, computation in cuComplex, Gaussian math.
 * Same parameter list as cublasCherkEx: void-pointer A and C with
 * cudaDataType tags, float alpha and beta.
 * NOTE(review): alpha and beta carry no "host or device pointer" annotation
 * here, unlike cublasCherkEx - presumably the same rule applies; confirm in
 * cuBLAS docs before relying on it.
 */
CUBLASAPI cublasStatus_t CUBLASWINAPI cublasCherk3mEx(cublasHandle_t handle,
cublasFillMode_t uplo,
cublasOperation_t trans,
int n,
int k,
const float* alpha,
const void* A,
cudaDataType Atype,
int lda,
const float* beta,
void* C,
cudaDataType Ctype,
int ldc);
/*
 * SYR2K
 * Four entry points, one per element type (float, double, cuComplex,
 * cuDoubleComplex). Each takes a fill-mode selector (uplo), an operation
 * selector (trans), sizes n and k, const matrices A and B with lda / ldb,
 * and a non-const matrix C with ldc. alpha and beta have the element type
 * and are annotated as host-or-device pointers.
 * NOTE(review): by BLAS naming this is the symmetric rank-2k update of C -
 * confirm semantics in cuBLAS docs.
 */
/* float variant */
CUBLASAPI cublasStatus_t CUBLASWINAPI cublasSsyr2k_v2(cublasHandle_t handle,
cublasFillMode_t uplo,
cublasOperation_t trans,
int n,
int k,
const float* alpha, /* host or device pointer */
const float* A,
int lda,
const float* B,
int ldb,
const float* beta, /* host or device pointer */
float* C,
int ldc);
/* double variant */
CUBLASAPI cublasStatus_t CUBLASWINAPI cublasDsyr2k_v2(cublasHandle_t handle,
cublasFillMode_t uplo,
cublasOperation_t trans,
int n,
int k,
const double* alpha, /* host or device pointer */
const double* A,
int lda,
const double* B,
int ldb,
const double* beta, /* host or device pointer */
double* C,
int ldc);
/* cuComplex variant */
CUBLASAPI cublasStatus_t CUBLASWINAPI cublasCsyr2k_v2(cublasHandle_t handle,
cublasFillMode_t uplo,
cublasOperation_t trans,
int n,
int k,
const cuComplex* alpha, /* host or device pointer */
const cuComplex* A,
int lda,
const cuComplex* B,
int ldb,
const cuComplex* beta, /* host or device pointer */
cuComplex* C,
int ldc);
/* cuDoubleComplex variant */
CUBLASAPI cublasStatus_t CUBLASWINAPI cublasZsyr2k_v2(cublasHandle_t handle,
cublasFillMode_t uplo,
cublasOperation_t trans,
int n,
int k,
const cuDoubleComplex* alpha, /* host or device pointer */
const cuDoubleComplex* A,
int lda,
const cuDoubleComplex* B,
int ldb,
const cuDoubleComplex* beta, /* host or device pointer */
cuDoubleComplex* C,
int ldc);
/*
 * HER2K
 * Two entry points, for cuComplex and cuDoubleComplex matrices. The
 * parameter layout matches the SYR2K declarations, but the two scalars have
 * different types: alpha is complex (same type as the matrices) while beta
 * is real (float for Cher2k, double for Zher2k). Both are annotated as
 * host-or-device pointers.
 * NOTE(review): by BLAS naming this is the Hermitian rank-2k update of C -
 * confirm semantics in cuBLAS docs.
 */
/* cuComplex matrices and alpha, float beta */
CUBLASAPI cublasStatus_t CUBLASWINAPI cublasCher2k_v2(cublasHandle_t handle,
cublasFillMode_t uplo,
cublasOperation_t trans,
int n,
int k,
const cuComplex* alpha, /* host or device pointer */
const cuComplex* A,
int lda,
const cuComplex* B,
int ldb,
const float* beta, /* host or device pointer */
cuComplex* C,
int ldc);
/* cuDoubleComplex matrices and alpha, double beta */
CUBLASAPI cublasStatus_t CUBLASWINAPI cublasZher2k_v2(cublasHandle_t handle,
cublasFillMode_t uplo,
cublasOperation_t trans,
int n,
int k,
const cuDoubleComplex* alpha, /* host or device pointer */
const cuDoubleComplex* A,
int lda,
const cuDoubleComplex* B,
int ldb,
const double* beta, /* host or device pointer */
cuDoubleComplex* C,
int ldc);
/* SYRKX : eXtended SYRK*/
CUBLASAPI cublasStatus_t CUBLASWINAPI cublasSsyrkx(cublasHandle_t handle,
cublasFillMode_t uplo,
cublasOperation_t trans,
int n,
int k,
const float* alpha, /* host or device pointer */
const float* A,
int lda,
const float* B,
int ldb,
const float* beta, /* host or device pointer */
float* C,
int ldc);
CUBLASAPI cublasStatus_t CUBLASWINAPI cublasDsyrkx(cublasHandle_t handle,
cublasFillMode_t uplo,
cublasOperation_t trans,
int n,
int k,
const double* alpha, /* host or device pointer */
const double* A,
int lda,
const double* B,
int ldb,
const double* beta, /* host or device pointer */
double* C,
int ldc);
CUBLASAPI cublasStatus_t CUBLASWINAPI cublasCsyrkx(cublasHandle_t handle,
cublasFillMode_t uplo,
cublasOperation_t trans,
int n,
int k,
const cuComplex* alpha, /* host or device pointer */
const cuComplex* A,
int lda,
const cuComplex* B,
int ldb,
const cuComplex* beta, /* host or device pointer */
cuComplex* C,
int ldc);
CUBLASAPI cublasStatus_t CUBLASWINAPI cublasZsyrkx(cublasHandle_t handle,
cublasFillMode_t uplo,
cublasOperation_t trans,
int n,
int k,
const cuDoubleComplex* alpha, /* host or device pointer */
const cuDoubleComplex* A,
int lda,
const cuDoubleComplex* B,
int ldb,
const cuDoubleComplex* beta, /* host or device pointer */
cuDoubleComplex* C,
int ldc);
/* HERKX : eXtended HERK */
CUBLASAPI cublasStatus_t CUBLASWINAPI cublasCherkx(cublasHandle_t handle,
cublasFillMode_t uplo,
cublasOperation_t trans,
int n,
int k,
const cuComplex* alpha, /* host or device pointer */
const cuComplex* A,
int lda,
const cuComplex* B,
int ldb,
const float* beta, /* host or device pointer */
cuComplex* C,
int ldc);
CUBLASAPI cublasStatus_t CUBLASWINAPI cublasZherkx(cublasHandle_t handle,
cublasFillMode_t uplo,
cublasOperation_t trans,
int n,
int k,
const cuDoubleComplex* alpha, /* host or device pointer */
const cuDoubleComplex* A,
int lda,
const cuDoubleComplex* B,
int ldb,
const double* beta, /* host or device pointer */
cuDoubleComplex* C,
int ldc);
/* SYMM */
CUBLASAPI cublasStatus_t CUBLASWINAPI cublasSsymm_v2(cublasHandle_t handle,
cublasSideMode_t side,
cublasFillMode_t uplo,
int m,
int n,
const float* alpha, /* host or device pointer */
const float* A,
int lda,
const float* B,
int ldb,
const float* beta, /* host or device pointer */
float* C,
int ldc);
CUBLASAPI cublasStatus_t CUBLASWINAPI cublasDsymm_v2(cublasHandle_t handle,
cublasSideMode_t side,
cublasFillMode_t uplo,
int m,
int n,
const double* alpha, /* host or device pointer */
const double* A,
int lda,
const double* B,
int ldb,
const double* beta, /* host or device pointer */
double* C,
int ldc);
CUBLASAPI cublasStatus_t CUBLASWINAPI cublasCsymm_v2(cublasHandle_t handle,
cublasSideMode_t side,
cublasFillMode_t uplo,
int m,
int n,
const cuComplex* alpha, /* host or device pointer */
const cuComplex* A,
int lda,
const cuComplex* B,
int ldb,
const cuComplex* beta, /* host or device pointer */
cuComplex* C,
int ldc);
CUBLASAPI cublasStatus_t CUBLASWINAPI cublasZsymm_v2(cublasHandle_t handle,
cublasSideMode_t side,
cublasFillMode_t uplo,
int m,
int n,
const cuDoubleComplex* alpha, /* host or device pointer */
const cuDoubleComplex* A,
int lda,
const cuDoubleComplex* B,
int ldb,
const cuDoubleComplex* beta, /* host or device pointer */
cuDoubleComplex* C,
int ldc);
/* HEMM */
CUBLASAPI cublasStatus_t CUBLASWINAPI cublasChemm_v2(cublasHandle_t handle,
cublasSideMode_t side,
cublasFillMode_t uplo,
int m,
int n,
const cuComplex* alpha, /* host or device pointer */
const cuComplex* A,
int lda,
const cuComplex* B,
int ldb,
const cuComplex* beta, /* host or device pointer */
cuComplex* C,
int ldc);
CUBLASAPI cublasStatus_t CUBLASWINAPI cublasZhemm_v2(cublasHandle_t handle,
cublasSideMode_t side,
cublasFillMode_t uplo,
int m,
int n,
const cuDoubleComplex* alpha, /* host or device pointer */
const cuDoubleComplex* A,
int lda,
const cuDoubleComplex* B,
int ldb,
const cuDoubleComplex* beta, /* host or device pointer */
cuDoubleComplex* C,
int ldc);
/* TRSM */
CUBLASAPI cublasStatus_t CUBLASWINAPI cublasStrsm_v2(cublasHandle_t handle,
cublasSideMode_t side,
cublasFillMode_t uplo,
cublasOperation_t trans,
cublasDiagType_t diag,
int m,
int n,
const float* alpha, /* host or device pointer */
const float* A,
int lda,
float* B,
int ldb);
CUBLASAPI cublasStatus_t CUBLASWINAPI cublasDtrsm_v2(cublasHandle_t handle,
cublasSideMode_t side,
cublasFillMode_t uplo,
cublasOperation_t trans,
cublasDiagType_t diag,
int m,
int n,
const double* alpha, /* host or device pointer */
const double* A,
int lda,
double* B,
int ldb);
CUBLASAPI cublasStatus_t CUBLASWINAPI cublasCtrsm_v2(cublasHandle_t handle,
cublasSideMode_t side,
cublasFillMode_t uplo,
cublasOperation_t trans,
cublasDiagType_t diag,
int m,
int n,
const cuComplex* alpha, /* host or device pointer */
const cuComplex* A,
int lda,
cuComplex* B,
int ldb);
CUBLASAPI cublasStatus_t CUBLASWINAPI cublasZtrsm_v2(cublasHandle_t handle,
cublasSideMode_t side,
cublasFillMode_t uplo,
cublasOperation_t trans,
cublasDiagType_t diag,
int m,
int n,
const cuDoubleComplex* alpha, /* host or device pointer */
const cuDoubleComplex* A,
int lda,
cuDoubleComplex* B,
int ldb);
/* TRMM */
CUBLASAPI cublasStatus_t CUBLASWINAPI cublasStrmm_v2(cublasHandle_t handle,
cublasSideMode_t side,
cublasFillMode_t uplo,
cublasOperation_t trans,
cublasDiagType_t diag,
int m,
int n,
const float* alpha, /* host or device pointer */
const float* A,
int lda,
const float* B,
int ldb,
float* C,
int ldc);
CUBLASAPI cublasStatus_t CUBLASWINAPI cublasDtrmm_v2(cublasHandle_t handle,
cublasSideMode_t side,
cublasFillMode_t uplo,
cublasOperation_t trans,
cublasDiagType_t diag,
int m,
int n,
const double* alpha, /* host or device pointer */
const double* A,
int lda,
const double* B,
int ldb,
double* C,
int ldc);
CUBLASAPI cublasStatus_t CUBLASWINAPI cublasCtrmm_v2(cublasHandle_t handle,
cublasSideMode_t side,
cublasFillMode_t uplo,
cublasOperation_t trans,
cublasDiagType_t diag,
int m,
int n,
const cuComplex* alpha, /* host or device pointer */
const cuComplex* A,
int lda,
const cuComplex* B,
int ldb,
cuComplex* C,
int ldc);
CUBLASAPI cublasStatus_t CUBLASWINAPI cublasZtrmm_v2(cublasHandle_t handle,
cublasSideMode_t side,
cublasFillMode_t uplo,
cublasOperation_t trans,
cublasDiagType_t diag,
int m,
int n,
const cuDoubleComplex* alpha, /* host or device pointer */
const cuDoubleComplex* A,
int lda,
const cuDoubleComplex* B,
int ldb,
cuDoubleComplex* C,
int ldc);
/* BATCH GEMM */
#if defined(__cplusplus)
CUBLASAPI cublasStatus_t CUBLASWINAPI cublasHgemmBatched(cublasHandle_t handle,
cublasOperation_t transa,
cublasOperation_t transb,
int m,
int n,
int k,
const __half* alpha, /* host or device pointer */
const __half* const Aarray[],
int lda,
const __half* const Barray[],
int ldb,
const __half* beta, /* host or device pointer */
__half* const Carray[],
int ldc,
int batchCount);
#endif
CUBLASAPI cublasStatus_t CUBLASWINAPI cublasSgemmBatched(cublasHandle_t handle,
cublasOperation_t transa,
cublasOperation_t transb,
int m,
int n,
int k,
const float* alpha, /* host or device pointer */
const float* const Aarray[],
int lda,
const float* const Barray[],
int ldb,
const float* beta, /* host or device pointer */
float* const Carray[],
int ldc,
int batchCount);
CUBLASAPI cublasStatus_t CUBLASWINAPI cublasDgemmBatched(cublasHandle_t handle,
cublasOperation_t transa,
cublasOperation_t transb,
int m,
int n,
int k,
const double* alpha, /* host or device pointer */
const double* const Aarray[],
int lda,
const double* const Barray[],
int ldb,
const double* beta, /* host or device pointer */
double* const Carray[],
int ldc,
int batchCount);
CUBLASAPI cublasStatus_t CUBLASWINAPI cublasCgemmBatched(cublasHandle_t handle,
cublasOperation_t transa,
cublasOperation_t transb,
int m,
int n,
int k,
const cuComplex* alpha, /* host or device pointer */
const cuComplex* const Aarray[],
int lda,
const cuComplex* const Barray[],
int ldb,
const cuComplex* beta, /* host or device pointer */
cuComplex* const Carray[],
int ldc,
int batchCount);
CUBLASAPI cublasStatus_t CUBLASWINAPI cublasCgemm3mBatched(cublasHandle_t handle,
cublasOperation_t transa,
cublasOperation_t transb,
int m,
int n,
int k,
const cuComplex* alpha, /* host or device pointer */
const cuComplex* const Aarray[],
int lda,
const cuComplex* const Barray[],
int ldb,
const cuComplex* beta, /* host or device pointer */
cuComplex* const Carray[],
int ldc,
int batchCount);
CUBLASAPI cublasStatus_t CUBLASWINAPI cublasZgemmBatched(cublasHandle_t handle,
cublasOperation_t transa,
cublasOperation_t transb,
int m,
int n,
int k,
const cuDoubleComplex* alpha, /* host or device pointer */
const cuDoubleComplex* const Aarray[],
int lda,
const cuDoubleComplex* const Barray[],
int ldb,
const cuDoubleComplex* beta, /* host or device pointer */
cuDoubleComplex* const Carray[],
int ldc,
int batchCount);
CUBLASAPI cublasStatus_t CUBLASWINAPI cublasGemmBatchedEx(cublasHandle_t handle,
cublasOperation_t transa,
cublasOperation_t transb,
int m,
int n,
int k,
const void* alpha, /* host or device pointer */
const void* const Aarray[],
cudaDataType Atype,
int lda,
const void* const Barray[],
cudaDataType Btype,
int ldb,
const void* beta, /* host or device pointer */
void* const Carray[],
cudaDataType Ctype,
int ldc,
int batchCount,
cublasComputeType_t computeType,
cublasGemmAlgo_t algo);
CUBLASAPI cublasStatus_t CUBLASWINAPI cublasGemmStridedBatchedEx(cublasHandle_t handle,
cublasOperation_t transa,
cublasOperation_t transb,
int m,
int n,
int k,
const void* alpha, /* host or device pointer */
const void* A,
cudaDataType Atype,
int lda,
long long int strideA, /* purposely signed */
const void* B,
cudaDataType Btype,
int ldb,
long long int strideB,
const void* beta, /* host or device pointer */
void* C,
cudaDataType Ctype,
int ldc,
long long int strideC,
int batchCount,
cublasComputeType_t computeType,
cublasGemmAlgo_t algo);
CUBLASAPI cublasStatus_t CUBLASWINAPI cublasSgemmStridedBatched(cublasHandle_t handle,
cublasOperation_t transa,
cublasOperation_t transb,
int m,
int n,
int k,
const float* alpha, /* host or device pointer */
const float* A,
int lda,
long long int strideA, /* purposely signed */
const float* B,
int ldb,
long long int strideB,
const float* beta, /* host or device pointer */
float* C,
int ldc,
long long int strideC,
int batchCount);
CUBLASAPI cublasStatus_t CUBLASWINAPI cublasDgemmStridedBatched(cublasHandle_t handle,
cublasOperation_t transa,
cublasOperation_t transb,
int m,
int n,
int k,
const double* alpha, /* host or device pointer */
const double* A,
int lda,
long long int strideA, /* purposely signed */
const double* B,
int ldb,
long long int strideB,
const double* beta, /* host or device pointer */
double* C,
int ldc,
long long int strideC,
int batchCount);
CUBLASAPI cublasStatus_t CUBLASWINAPI cublasCgemmStridedBatched(cublasHandle_t handle,
cublasOperation_t transa,
cublasOperation_t transb,
int m,
int n,
int k,
const cuComplex* alpha, /* host or device pointer */
const cuComplex* A,
int lda,
long long int strideA, /* purposely signed */
const cuComplex* B,
int ldb,
long long int strideB,
const cuComplex* beta, /* host or device pointer */
cuComplex* C,
int ldc,
long long int strideC,
int batchCount);
CUBLASAPI cublasStatus_t CUBLASWINAPI cublasCgemm3mStridedBatched(cublasHandle_t handle,
cublasOperation_t transa,
cublasOperation_t transb,
int m,
int n,
int k,
const cuComplex* alpha, /* host or device pointer */
const cuComplex* A,
int lda,
long long int strideA, /* purposely signed */
const cuComplex* B,
int ldb,
long long int strideB,
const cuComplex* beta, /* host or device pointer */
cuComplex* C,
int ldc,
long long int strideC,
int batchCount);
CUBLASAPI cublasStatus_t CUBLASWINAPI
cublasZgemmStridedBatched(cublasHandle_t handle,
cublasOperation_t transa,
cublasOperation_t transb,
int m,
int n,
int k,
const cuDoubleComplex* alpha, /* host or device pointer */
const cuDoubleComplex* A,
int lda,
long long int strideA, /* purposely signed */
const cuDoubleComplex* B,
int ldb,
long long int strideB,
const cuDoubleComplex* beta, /* host or device pointer */
cuDoubleComplex* C,
int ldc,
long long int strideC,
int batchCount);
#if defined(__cplusplus)
CUBLASAPI cublasStatus_t CUBLASWINAPI cublasHgemmStridedBatched(cublasHandle_t handle,
cublasOperation_t transa,
cublasOperation_t transb,
int m,
int n,
int k,
const __half* alpha, /* host or device pointer */
const __half* A,
int lda,
long long int strideA, /* purposely signed */
const __half* B,
int ldb,
long long int strideB,
const __half* beta, /* host or device pointer */
__half* C,
int ldc,
long long int strideC,
int batchCount);
#endif
/* ---------------- CUBLAS BLAS-like extension ---------------- */
/* GEAM */
CUBLASAPI cublasStatus_t CUBLASWINAPI cublasSgeam(cublasHandle_t handle,
cublasOperation_t transa,
cublasOperation_t transb,
int m,
int n,
const float* alpha, /* host or device pointer */
const float* A,
int lda,
const float* beta, /* host or device pointer */
const float* B,
int ldb,
float* C,
int ldc);
CUBLASAPI cublasStatus_t CUBLASWINAPI cublasDgeam(cublasHandle_t handle,
cublasOperation_t transa,
cublasOperation_t transb,
int m,
int n,
const double* alpha, /* host or device pointer */
const double* A,
int lda,
const double* beta, /* host or device pointer */
const double* B,
int ldb,
double* C,
int ldc);
CUBLASAPI cublasStatus_t CUBLASWINAPI cublasCgeam(cublasHandle_t handle,
cublasOperation_t transa,
cublasOperation_t transb,
int m,
int n,
const cuComplex* alpha, /* host or device pointer */
const cuComplex* A,
int lda,
const cuComplex* beta, /* host or device pointer */
const cuComplex* B,
int ldb,
cuComplex* C,
int ldc);
CUBLASAPI cublasStatus_t CUBLASWINAPI cublasZgeam(cublasHandle_t handle,
cublasOperation_t transa,
cublasOperation_t transb,
int m,
int n,
const cuDoubleComplex* alpha, /* host or device pointer */
const cuDoubleComplex* A,
int lda,
const cuDoubleComplex* beta, /* host or device pointer */
const cuDoubleComplex* B,
int ldb,
cuDoubleComplex* C,
int ldc);
/* Batched LU - GETRF*/
CUBLASAPI cublasStatus_t CUBLASWINAPI cublasSgetrfBatched(cublasHandle_t handle,
int n,
float* const A[], /*Device pointer*/
int lda,
int* P, /*Device Pointer*/
int* info, /*Device Pointer*/
int batchSize);
CUBLASAPI cublasStatus_t CUBLASWINAPI cublasDgetrfBatched(cublasHandle_t handle,
int n,
double* const A[], /*Device pointer*/
int lda,
int* P, /*Device Pointer*/
int* info, /*Device Pointer*/
int batchSize);
CUBLASAPI cublasStatus_t CUBLASWINAPI cublasCgetrfBatched(cublasHandle_t handle,
int n,
cuComplex* const A[], /*Device pointer*/
int lda,
int* P, /*Device Pointer*/
int* info, /*Device Pointer*/
int batchSize);
CUBLASAPI cublasStatus_t CUBLASWINAPI cublasZgetrfBatched(cublasHandle_t handle,
int n,
cuDoubleComplex* const A[], /*Device pointer*/
int lda,
int* P, /*Device Pointer*/
int* info, /*Device Pointer*/
int batchSize);
/* Batched inversion based on LU factorization from getrf */
CUBLASAPI cublasStatus_t CUBLASWINAPI cublasSgetriBatched(cublasHandle_t handle,
int n,
const float* const A[], /*Device pointer*/
int lda,
const int* P, /*Device pointer*/
float* const C[], /*Device pointer*/
int ldc,
int* info,
int batchSize);
CUBLASAPI cublasStatus_t CUBLASWINAPI cublasDgetriBatched(cublasHandle_t handle,
int n,
const double* const A[], /*Device pointer*/
int lda,
const int* P, /*Device pointer*/
double* const C[], /*Device pointer*/
int ldc,
int* info,
int batchSize);
CUBLASAPI cublasStatus_t CUBLASWINAPI cublasCgetriBatched(cublasHandle_t handle,
int n,
const cuComplex* const A[], /*Device pointer*/
int lda,
const int* P, /*Device pointer*/
cuComplex* const C[], /*Device pointer*/
int ldc,
int* info,
int batchSize);
CUBLASAPI cublasStatus_t CUBLASWINAPI cublasZgetriBatched(cublasHandle_t handle,
int n,
const cuDoubleComplex* const A[], /*Device pointer*/
int lda,
const int* P, /*Device pointer*/
cuDoubleComplex* const C[], /*Device pointer*/
int ldc,
int* info,
int batchSize);
/* Batched solver based on LU factorization from getrf */
CUBLASAPI cublasStatus_t CUBLASWINAPI cublasSgetrsBatched(cublasHandle_t handle,
cublasOperation_t trans,
int n,
int nrhs,
const float* const Aarray[],
int lda,
const int* devIpiv,
float* const Barray[],
int ldb,
int* info,
int batchSize);
CUBLASAPI cublasStatus_t CUBLASWINAPI cublasDgetrsBatched(cublasHandle_t handle,
cublasOperation_t trans,
int n,
int nrhs,
const double* const Aarray[],
int lda,
const int* devIpiv,
double* const Barray[],
int ldb,
int* info,
int batchSize);
CUBLASAPI cublasStatus_t CUBLASWINAPI cublasCgetrsBatched(cublasHandle_t handle,
cublasOperation_t trans,
int n,
int nrhs,
const cuComplex* const Aarray[],
int lda,
const int* devIpiv,
cuComplex* const Barray[],
int ldb,
int* info,
int batchSize);
CUBLASAPI cublasStatus_t CUBLASWINAPI cublasZgetrsBatched(cublasHandle_t handle,
cublasOperation_t trans,
int n,
int nrhs,
const cuDoubleComplex* const Aarray[],
int lda,
const int* devIpiv,
cuDoubleComplex* const Barray[],
int ldb,
int* info,
int batchSize);
/* TRSM - Batched Triangular Solver */
CUBLASAPI cublasStatus_t CUBLASWINAPI cublasStrsmBatched(cublasHandle_t handle,
cublasSideMode_t side,
cublasFillMode_t uplo,
cublasOperation_t trans,
cublasDiagType_t diag,
int m,
int n,
const float* alpha, /*Host or Device Pointer*/
const float* const A[],
int lda,
float* const B[],
int ldb,
int batchCount);
CUBLASAPI cublasStatus_t CUBLASWINAPI cublasDtrsmBatched(cublasHandle_t handle,
cublasSideMode_t side,
cublasFillMode_t uplo,
cublasOperation_t trans,
cublasDiagType_t diag,
int m,
int n,
const double* alpha, /*Host or Device Pointer*/
const double* const A[],
int lda,
double* const B[],
int ldb,
int batchCount);
CUBLASAPI cublasStatus_t CUBLASWINAPI cublasCtrsmBatched(cublasHandle_t handle,
cublasSideMode_t side,
cublasFillMode_t uplo,
cublasOperation_t trans,
cublasDiagType_t diag,
int m,
int n,
const cuComplex* alpha, /*Host or Device Pointer*/
const cuComplex* const A[],
int lda,
cuComplex* const B[],
int ldb,
int batchCount);
CUBLASAPI cublasStatus_t CUBLASWINAPI cublasZtrsmBatched(cublasHandle_t handle,
cublasSideMode_t side,
cublasFillMode_t uplo,
cublasOperation_t trans,
cublasDiagType_t diag,
int m,
int n,
const cuDoubleComplex* alpha, /*Host or Device Pointer*/
const cuDoubleComplex* const A[],
int lda,
cuDoubleComplex* const B[],
int ldb,
int batchCount);
/* Batched - MATINV*/
CUBLASAPI cublasStatus_t CUBLASWINAPI cublasSmatinvBatched(cublasHandle_t handle,
int n,
const float* const A[], /*Device pointer*/
int lda,
float* const Ainv[], /*Device pointer*/
int lda_inv,
int* info, /*Device Pointer*/
int batchSize);
CUBLASAPI cublasStatus_t CUBLASWINAPI cublasDmatinvBatched(cublasHandle_t handle,
int n,
const double* const A[], /*Device pointer*/
int lda,
double* const Ainv[], /*Device pointer*/
int lda_inv,
int* info, /*Device Pointer*/
int batchSize);
CUBLASAPI cublasStatus_t CUBLASWINAPI cublasCmatinvBatched(cublasHandle_t handle,
int n,
const cuComplex* const A[], /*Device pointer*/
int lda,
cuComplex* const Ainv[], /*Device pointer*/
int lda_inv,
int* info, /*Device Pointer*/
int batchSize);
CUBLASAPI cublasStatus_t CUBLASWINAPI cublasZmatinvBatched(cublasHandle_t handle,
int n,
const cuDoubleComplex* const A[], /*Device pointer*/
int lda,
cuDoubleComplex* const Ainv[], /*Device pointer*/
int lda_inv,
int* info, /*Device Pointer*/
int batchSize);
/* Batch QR Factorization */
CUBLASAPI cublasStatus_t CUBLASWINAPI cublasSgeqrfBatched(cublasHandle_t handle,
int m,
int n,
float* const Aarray[], /*Device pointer*/
int lda,
float* const TauArray[], /*Device pointer*/
int* info,
int batchSize);
CUBLASAPI cublasStatus_t CUBLASWINAPI cublasDgeqrfBatched(cublasHandle_t handle,
int m,
int n,
double* const Aarray[], /*Device pointer*/
int lda,
double* const TauArray[], /*Device pointer*/
int* info,
int batchSize);
CUBLASAPI cublasStatus_t CUBLASWINAPI cublasCgeqrfBatched(cublasHandle_t handle,
int m,
int n,
cuComplex* const Aarray[], /*Device pointer*/
int lda,
cuComplex* const TauArray[], /*Device pointer*/
int* info,
int batchSize);
CUBLASAPI cublasStatus_t CUBLASWINAPI cublasZgeqrfBatched(cublasHandle_t handle,
int m,
int n,
cuDoubleComplex* const Aarray[], /*Device pointer*/
int lda,
cuDoubleComplex* const TauArray[], /*Device pointer*/
int* info,
int batchSize);
/* Batched least squares (GELS): only m >= n and the non-transposed case are supported */
CUBLASAPI cublasStatus_t CUBLASWINAPI cublasSgelsBatched(cublasHandle_t handle,
cublasOperation_t trans,
int m,
int n,
int nrhs,
float* const Aarray[], /*Device pointer*/
int lda,
float* const Carray[], /*Device pointer*/
int ldc,
int* info,
int* devInfoArray, /*Device pointer*/
int batchSize);
CUBLASAPI cublasStatus_t CUBLASWINAPI cublasDgelsBatched(cublasHandle_t handle,
cublasOperation_t trans,
int m,
int n,
int nrhs,
double* const Aarray[], /*Device pointer*/
int lda,
double* const Carray[], /*Device pointer*/
int ldc,
int* info,
int* devInfoArray, /*Device pointer*/
int batchSize);
CUBLASAPI cublasStatus_t CUBLASWINAPI cublasCgelsBatched(cublasHandle_t handle,
cublasOperation_t trans,
int m,
int n,
int nrhs,
cuComplex* const Aarray[], /*Device pointer*/
int lda,
cuComplex* const Carray[], /*Device pointer*/
int ldc,
int* info,
int* devInfoArray, /*Device pointer*/
int batchSize);
CUBLASAPI cublasStatus_t CUBLASWINAPI cublasZgelsBatched(cublasHandle_t handle,
cublasOperation_t trans,
int m,
int n,
int nrhs,
cuDoubleComplex* const Aarray[], /*Device pointer*/
int lda,
cuDoubleComplex* const Carray[], /*Device pointer*/
int ldc,
int* info,
int* devInfoArray, /*Device pointer*/
int batchSize);
/* DGMM */
CUBLASAPI cublasStatus_t CUBLASWINAPI cublasSdgmm(cublasHandle_t handle,
cublasSideMode_t mode,
int m,
int n,
const float* A,
int lda,
const float* x,
int incx,
float* C,
int ldc);
CUBLASAPI cublasStatus_t CUBLASWINAPI cublasDdgmm(cublasHandle_t handle,
cublasSideMode_t mode,
int m,
int n,
const double* A,
int lda,
const double* x,
int incx,
double* C,
int ldc);
CUBLASAPI cublasStatus_t CUBLASWINAPI cublasCdgmm(cublasHandle_t handle,
cublasSideMode_t mode,
int m,
int n,
const cuComplex* A,
int lda,
const cuComplex* x,
int incx,
cuComplex* C,
int ldc);
CUBLASAPI cublasStatus_t CUBLASWINAPI cublasZdgmm(cublasHandle_t handle,
cublasSideMode_t mode,
int m,
int n,
const cuDoubleComplex* A,
int lda,
const cuDoubleComplex* x,
int incx,
cuDoubleComplex* C,
int ldc);
/* TPTTR : Triangular Pack format to Triangular format */
CUBLASAPI cublasStatus_t CUBLASWINAPI
cublasStpttr(cublasHandle_t handle, cublasFillMode_t uplo, int n, const float* AP, float* A, int lda);
CUBLASAPI cublasStatus_t CUBLASWINAPI
cublasDtpttr(cublasHandle_t handle, cublasFillMode_t uplo, int n, const double* AP, double* A, int lda);
CUBLASAPI cublasStatus_t CUBLASWINAPI
cublasCtpttr(cublasHandle_t handle, cublasFillMode_t uplo, int n, const cuComplex* AP, cuComplex* A, int lda);
CUBLASAPI cublasStatus_t CUBLASWINAPI cublasZtpttr(
cublasHandle_t handle, cublasFillMode_t uplo, int n, const cuDoubleComplex* AP, cuDoubleComplex* A, int lda);
/* TRTTP : Triangular format to Triangular Pack format */
CUBLASAPI cublasStatus_t CUBLASWINAPI
cublasStrttp(cublasHandle_t handle, cublasFillMode_t uplo, int n, const float* A, int lda, float* AP);
CUBLASAPI cublasStatus_t CUBLASWINAPI
cublasDtrttp(cublasHandle_t handle, cublasFillMode_t uplo, int n, const double* A, int lda, double* AP);
CUBLASAPI cublasStatus_t CUBLASWINAPI
cublasCtrttp(cublasHandle_t handle, cublasFillMode_t uplo, int n, const cuComplex* A, int lda, cuComplex* AP);
CUBLASAPI cublasStatus_t CUBLASWINAPI cublasZtrttp(
cublasHandle_t handle, cublasFillMode_t uplo, int n, const cuDoubleComplex* A, int lda, cuDoubleComplex* AP);
#if defined(__cplusplus)
}
/*
 * Translate a legacy cudaDataType compute type into the matching cublasComputeType_t.
 *
 * The math mode of `handle` is queried first; when its low four bits equal
 * CUBLAS_PEDANTIC_MATH the *_PEDANTIC compute type is selected, otherwise the plain one.
 *
 * handle      - cuBLAS handle whose math mode is inspected
 * dataType    - legacy compute type (32F/64F real or complex, 16F real, 32I real)
 * computeType - out: receives the translated value; left untouched on any failure
 *
 * Returns the status of cublasGetMathMode() if that call fails,
 * CUBLAS_STATUS_NOT_SUPPORTED for any other dataType, CUBLAS_STATUS_SUCCESS otherwise.
 */
static inline cublasStatus_t cublasMigrateComputeType(cublasHandle_t handle,
                                                      cudaDataType_t dataType,
                                                      cublasComputeType_t* computeType) {
  cublasMath_t currentMode = CUBLAS_DEFAULT_MATH;
  const cublasStatus_t queryStatus = cublasGetMathMode(handle, &currentMode);
  if (queryStatus != CUBLAS_STATUS_SUCCESS) {
    return queryStatus;
  }

  /* Only the low nibble is compared against the pedantic mode value. */
  const bool pedantic = ((currentMode & 0xf) == CUBLAS_PEDANTIC_MATH);

  if (dataType == CUDA_R_32F || dataType == CUDA_C_32F) {
    *computeType = pedantic ? CUBLAS_COMPUTE_32F_PEDANTIC : CUBLAS_COMPUTE_32F;
  } else if (dataType == CUDA_R_64F || dataType == CUDA_C_64F) {
    *computeType = pedantic ? CUBLAS_COMPUTE_64F_PEDANTIC : CUBLAS_COMPUTE_64F;
  } else if (dataType == CUDA_R_16F) {
    *computeType = pedantic ? CUBLAS_COMPUTE_16F_PEDANTIC : CUBLAS_COMPUTE_16F;
  } else if (dataType == CUDA_R_32I) {
    *computeType = pedantic ? CUBLAS_COMPUTE_32I_PEDANTIC : CUBLAS_COMPUTE_32I;
  } else {
    /* No compute type corresponds to this data type. */
    return CUBLAS_STATUS_NOT_SUPPORTED;
  }
  return CUBLAS_STATUS_SUCCESS;
}
/* C++-only overloads that let existing code keep passing a cudaDataType as computeType */
/*
 * C++ overload of cublasGemmEx for callers that pass the compute type as a cudaDataType.
 * The value is translated through cublasMigrateComputeType() and every other argument is
 * forwarded unchanged to cublasGemmEx with a cublasComputeType_t compute type.
 * If the translation fails, its status is returned and nothing is forwarded.
 */
static inline cublasStatus_t cublasGemmEx(cublasHandle_t handle,
                                          cublasOperation_t transa,
                                          cublasOperation_t transb,
                                          int m,
                                          int n,
                                          int k,
                                          const void* alpha, /* host or device pointer */
                                          const void* A,
                                          cudaDataType Atype,
                                          int lda,
                                          const void* B,
                                          cudaDataType Btype,
                                          int ldb,
                                          const void* beta, /* host or device pointer */
                                          void* C,
                                          cudaDataType Ctype,
                                          int ldc,
                                          cudaDataType computeType,
                                          cublasGemmAlgo_t algo) {
  cublasComputeType_t nativeComputeType = CUBLAS_COMPUTE_32F;
  const cublasStatus_t migrateStatus = cublasMigrateComputeType(handle, computeType, &nativeComputeType);
  if (migrateStatus == CUBLAS_STATUS_SUCCESS) {
    /* The compute-type argument is now a cublasComputeType_t, so this call does not select this overload again. */
    return cublasGemmEx(handle, transa, transb, m, n, k,
                        alpha,
                        A, Atype, lda,
                        B, Btype, ldb,
                        beta,
                        C, Ctype, ldc,
                        nativeComputeType, algo);
  }
  return migrateStatus;
}
/*
 * C++ overload of cublasGemmBatchedEx for callers that pass the compute type as a cudaDataType.
 * The value is translated through cublasMigrateComputeType() and every other argument is
 * forwarded unchanged to cublasGemmBatchedEx with a cublasComputeType_t compute type.
 * If the translation fails, its status is returned and nothing is forwarded.
 */
static inline cublasStatus_t cublasGemmBatchedEx(cublasHandle_t handle,
                                                 cublasOperation_t transa,
                                                 cublasOperation_t transb,
                                                 int m,
                                                 int n,
                                                 int k,
                                                 const void* alpha, /* host or device pointer */
                                                 const void* const Aarray[],
                                                 cudaDataType Atype,
                                                 int lda,
                                                 const void* const Barray[],
                                                 cudaDataType Btype,
                                                 int ldb,
                                                 const void* beta, /* host or device pointer */
                                                 void* const Carray[],
                                                 cudaDataType Ctype,
                                                 int ldc,
                                                 int batchCount,
                                                 cudaDataType computeType,
                                                 cublasGemmAlgo_t algo) {
  /* Only read after cublasMigrateComputeType() has reported success. */
  cublasComputeType_t nativeComputeType;
  const cublasStatus_t migrateStatus = cublasMigrateComputeType(handle, computeType, &nativeComputeType);
  if (migrateStatus == CUBLAS_STATUS_SUCCESS) {
    /* The compute-type argument is now a cublasComputeType_t, so this call does not select this overload again. */
    return cublasGemmBatchedEx(handle, transa, transb, m, n, k,
                               alpha,
                               Aarray, Atype, lda,
                               Barray, Btype, ldb,
                               beta,
                               Carray, Ctype, ldc,
                               batchCount,
                               nativeComputeType, algo);
  }
  return migrateStatus;
}
/*
 * C++ overload of cublasGemmStridedBatchedEx for callers that pass the compute type as a
 * cudaDataType. The value is translated through cublasMigrateComputeType() and every other
 * argument is forwarded unchanged to cublasGemmStridedBatchedEx with a cublasComputeType_t
 * compute type. If the translation fails, its status is returned and nothing is forwarded.
 */
static inline cublasStatus_t cublasGemmStridedBatchedEx(cublasHandle_t handle,
                                                        cublasOperation_t transa,
                                                        cublasOperation_t transb,
                                                        int m,
                                                        int n,
                                                        int k,
                                                        const void* alpha, /* host or device pointer */
                                                        const void* A,
                                                        cudaDataType Atype,
                                                        int lda,
                                                        long long int strideA, /* purposely signed */
                                                        const void* B,
                                                        cudaDataType Btype,
                                                        int ldb,
                                                        long long int strideB,
                                                        const void* beta, /* host or device pointer */
                                                        void* C,
                                                        cudaDataType Ctype,
                                                        int ldc,
                                                        long long int strideC,
                                                        int batchCount,
                                                        cudaDataType computeType,
                                                        cublasGemmAlgo_t algo) {
  /* Only read after cublasMigrateComputeType() has reported success. */
  cublasComputeType_t nativeComputeType;
  const cublasStatus_t migrateStatus = cublasMigrateComputeType(handle, computeType, &nativeComputeType);
  if (migrateStatus == CUBLAS_STATUS_SUCCESS) {
    /* The compute-type argument is now a cublasComputeType_t, so this call does not select this overload again. */
    return cublasGemmStridedBatchedEx(handle, transa, transb, m, n, k,
                                      alpha,
                                      A, Atype, lda, strideA,
                                      B, Btype, ldb, strideB,
                                      beta,
                                      C, Ctype, ldc, strideC,
                                      batchCount,
                                      nativeComputeType, algo);
  }
  return migrateStatus;
}
#endif /* __cplusplus */
#endif /* !defined(CUBLAS_API_H_) */