ytts/venv/lib/python3.11/site-packages/nvidia/curand/include/curand_kernel.h


 /* Copyright 2010-2014 NVIDIA Corporation.  All rights reserved.
  *
  * NOTICE TO LICENSEE:
  *
  * The source code and/or documentation ("Licensed Deliverables") are
  * subject to NVIDIA intellectual property rights under U.S. and
  * international Copyright laws.
  *
  * The Licensed Deliverables contained herein are PROPRIETARY and
  * CONFIDENTIAL to NVIDIA and are being provided under the terms and
  * conditions of a form of NVIDIA software license agreement by and
  * between NVIDIA and Licensee ("License Agreement") or electronically
  * accepted by Licensee.  Notwithstanding any terms or conditions to
  * the contrary in the License Agreement, reproduction or disclosure
  * of the Licensed Deliverables to any third party without the express
  * written consent of NVIDIA is prohibited.
  *
  * NOTWITHSTANDING ANY TERMS OR CONDITIONS TO THE CONTRARY IN THE
  * LICENSE AGREEMENT, NVIDIA MAKES NO REPRESENTATION ABOUT THE
  * SUITABILITY OF THESE LICENSED DELIVERABLES FOR ANY PURPOSE.  THEY ARE
  * PROVIDED "AS IS" WITHOUT EXPRESS OR IMPLIED WARRANTY OF ANY KIND.
  * NVIDIA DISCLAIMS ALL WARRANTIES WITH REGARD TO THESE LICENSED
  * DELIVERABLES, INCLUDING ALL IMPLIED WARRANTIES OF MERCHANTABILITY,
  * NONINFRINGEMENT, AND FITNESS FOR A PARTICULAR PURPOSE.
  * NOTWITHSTANDING ANY TERMS OR CONDITIONS TO THE CONTRARY IN THE
  * LICENSE AGREEMENT, IN NO EVENT SHALL NVIDIA BE LIABLE FOR ANY
  * SPECIAL, INDIRECT, INCIDENTAL, OR CONSEQUENTIAL DAMAGES, OR ANY
  * DAMAGES WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS,
  * WHETHER IN AN ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS
  * ACTION, ARISING OUT OF OR IN CONNECTION WITH THE USE OR PERFORMANCE
  * OF THESE LICENSED DELIVERABLES.
  *
  * U.S. Government End Users.  These Licensed Deliverables are a
  * "commercial item" as that term is defined at 48 C.F.R. 2.101 (OCT
  * 1995), consisting of "commercial computer software" and "commercial
  * computer software documentation" as such terms are used in 48
  * C.F.R. 12.212 (SEPT 1995) and are provided to the U.S. Government
  * only as a commercial end item.  Consistent with 48 C.F.R.12.212 and
  * 48 C.F.R. 227.7202-1 through 227.7202-4 (JUNE 1995), all
  * U.S. Government End Users acquire the Licensed Deliverables with
  * only those rights set forth herein.
  *
  * Any use of the Licensed Deliverables in individual and commercial
  * software must include, in the user documentation and internal
  * comments to the code, the above Disclaimer and U.S. Government End
  * Users Notice.
  */


#if !defined(CURAND_KERNEL_H_)
#define CURAND_KERNEL_H_

/**
 * \defgroup DEVICE Device API
 *
 * @{
 */

#if !defined(QUALIFIERS)
#define QUALIFIERS static __forceinline__ __device__
#endif


#ifdef __CUDACC_RTC__
#define CURAND_DETAIL_USE_CUDA_STL
#endif

#if __cplusplus >= 201103L
#   ifdef CURAND_DETAIL_USE_CUDA_STL
#       define CURAND_STD cuda::std
#       include <cuda/std/type_traits>
#   else
#       define CURAND_STD std
#       include <type_traits>
#   endif // CURAND_DETAIL_USE_CUDA_STL
#else
// To support C++03 compilation
#   define CURAND_STD curand_detail
namespace curand_detail {
    template<bool B, class T = void>
    struct enable_if {};

    template<class T>
    struct enable_if<true, T> { typedef T type; };

    template<class T, class U>
    struct is_same { static const bool value = false; };

    template<class T>
    struct is_same<T, T> { static const bool value = true; };
} // namespace curand_detail
#endif // __cplusplus >= 201103L

#ifndef __CUDACC_RTC__
#include <math.h>
#endif // __CUDACC_RTC__

#include "curand.h"
#include "curand_discrete.h"
#include "curand_precalc.h"
#include "curand_mrg32k3a.h"
#include "curand_mtgp32_kernel.h"
#include "curand_philox4x32_x.h"
#include "curand_globals.h"

/* Test RNG */
/* This generator uses the formula:
   x_n = x_(n-1) + 1 mod 2^32
   x_0 = (unsigned int)seed * 3
   Subsequences are spaced 31337 steps apart.
*/
struct curandStateTest {
    unsigned int v;
};

/** \cond UNHIDE_TYPEDEFS */
typedef struct curandStateTest curandStateTest_t;
/** \endcond */

/* XORSHIFT FAMILY RNGs */
/* These generators are a family proposed by Marsaglia.  They keep state
   in 32 bit chunks, then use repeated shift and xor operations to scramble
   the bits.  The following generators are a combination of a simple Weyl
   generator with an N variable XORSHIFT generator.
*/

/* XORSHIFT RNG */
/* This generator uses the xorwow formula of
www.jstatsoft.org/v08/i14/paper page 5
Has period 2^192 - 2^32.
*/
/**
 * CURAND XORWOW state
 */
struct curandStateXORWOW;

/*
 * Implementation details not in reference documentation */
struct curandStateXORWOW {
    unsigned int d, v[5];
    int boxmuller_flag;
    int boxmuller_flag_double;
    float boxmuller_extra;
    double boxmuller_extra_double;
};

/*
 * CURAND XORWOW state
 */
/** \cond UNHIDE_TYPEDEFS */
typedef struct curandStateXORWOW curandStateXORWOW_t;

#define EXTRA_FLAG_NORMAL         0x00000001
#define EXTRA_FLAG_LOG_NORMAL     0x00000002
/** \endcond */

/* Combined Multiple Recursive Generators */
/* These generators are a family proposed by L'Ecuyer.  They keep state
   in sets of doubles, then use repeated modular arithmetic multiply operations
   to scramble the bits in each set, and combine the result.
*/

/* MRG32k3a RNG */
/* This generator uses the MRG32k3A formula of
http://www.iro.umontreal.ca/~lecuyer/myftp/streams00/c++/streams4.pdf
Has period 2^191.
*/

/* moduli for the recursions */
/** \cond UNHIDE_DEFINES */
#define MRG32K3A_MOD1 4294967087.
#define MRG32K3A_MOD2 4294944443.

/* Constants used in generation */

#define MRG32K3A_A12  1403580.
#define MRG32K3A_A13N 810728.
#define MRG32K3A_A21  527612.
#define MRG32K3A_A23N 1370589.
#define MRG32K3A_NORM (2.3283065498378288e-10)
//
// #define MRG32K3A_BITS_NORM ((double)((POW32_DOUBLE-1.0)/MOD1))
//  above constant, used verbatim, rounds differently on some host systems.
#define MRG32K3A_BITS_NORM 1.000000048662

/** \endcond */


/**
 * CURAND MRG32K3A state
 */
struct curandStateMRG32k3a;

/* Implementation details not in reference documentation */
struct curandStateMRG32k3a {
    unsigned int s1[3];
    unsigned int s2[3];
    int boxmuller_flag;
    int boxmuller_flag_double;
    float boxmuller_extra;
    double boxmuller_extra_double;
};

/*
 * CURAND MRG32K3A state
 */
/** \cond UNHIDE_TYPEDEFS */
typedef struct curandStateMRG32k3a curandStateMRG32k3a_t;
/** \endcond */

/* SOBOL QRNG */
/**
 * CURAND Sobol32 state
 */
struct curandStateSobol32;

/* Implementation details not in reference documentation */
struct curandStateSobol32 {
    unsigned int i, x, c;
    unsigned int direction_vectors[32];
};

/*
 * CURAND Sobol32 state
 */
/** \cond UNHIDE_TYPEDEFS */
typedef struct curandStateSobol32 curandStateSobol32_t;
/** \endcond */

/**
 * CURAND Scrambled Sobol32 state
 */
struct curandStateScrambledSobol32;

/* Implementation details not in reference documentation */
struct curandStateScrambledSobol32 {
    unsigned int i, x, c;
    unsigned int direction_vectors[32];
};

/*
 * CURAND Scrambled Sobol32 state
 */
/** \cond UNHIDE_TYPEDEFS */
typedef struct curandStateScrambledSobol32 curandStateScrambledSobol32_t;
/** \endcond */

/**
 * CURAND Sobol64 state
 */
struct curandStateSobol64;

/* Implementation details not in reference documentation */
struct curandStateSobol64 {
    unsigned long long i, x, c;
    unsigned long long direction_vectors[64];
};

/*
 * CURAND Sobol64 state
 */
/** \cond UNHIDE_TYPEDEFS */
typedef struct curandStateSobol64 curandStateSobol64_t;
/** \endcond */

/**
 * CURAND Scrambled Sobol64 state
 */
struct curandStateScrambledSobol64;

/* Implementation details not in reference documentation */
struct curandStateScrambledSobol64 {
    unsigned long long i, x, c;
    unsigned long long direction_vectors[64];
};

/*
 * CURAND Scrambled Sobol64 state
 */
/** \cond UNHIDE_TYPEDEFS */
typedef struct curandStateScrambledSobol64 curandStateScrambledSobol64_t;
/** \endcond */

/*
 * Default RNG
 */
/** \cond UNHIDE_TYPEDEFS */
typedef struct curandStateXORWOW curandState_t;
typedef struct curandStateXORWOW curandState;
/** \endcond */

/****************************************************************************/
/* Utility functions needed by RNGs */
/****************************************************************************/
/** \cond UNHIDE_UTILITIES */
/*
   multiply vector by matrix, store in result
   matrix is n x n, measured in 32 bit units
   matrix is stored in row major order
   vector and result cannot be same pointer
*/
template<int N>
QUALIFIERS void __curand_matvec_inplace(unsigned int *vector, unsigned int *matrix)
{
    unsigned int result[N] = { 0 };
    for(int i = 0; i < N; i++) {
        #ifdef __CUDA_ARCH__
        #pragma unroll 16
        #endif
        for(int j = 0; j < 32; j++) {
            if(vector[i] & (1 << j)) {
                for(int k = 0; k < N; k++) {
                    result[k] ^= matrix[N * (i * 32 + j) + k];
                }
            }
        }
    }
    for(int i = 0; i < N; i++) {
        vector[i] = result[i];
    }
}

QUALIFIERS void __curand_matvec(unsigned int *vector, unsigned int *matrix,
                                unsigned int *result, int n)
{
    for(int i = 0; i < n; i++) {
        result[i] = 0;
    }
    for(int i = 0; i < n; i++) {
        for(int j = 0; j < 32; j++) {
            if(vector[i] & (1 << j)) {
                for(int k = 0; k < n; k++) {
                    result[k] ^= matrix[n * (i * 32 + j) + k];
                }
            }
        }
    }
}

/* generate identity matrix */
QUALIFIERS void __curand_matidentity(unsigned int *matrix, int n)
{
    int r;
    for(int i = 0; i < n * 32; i++) {
        for(int j = 0; j < n; j++) {
            r = i & 31;
            if(i / 32 == j) {
                matrix[i * n + j] = (1 << r);
            } else {
                matrix[i * n + j] = 0;
            }
        }
    }
}

/* multiply matrixA by matrixB, store back in matrixA
   matrixA and matrixB must not be same matrix */
QUALIFIERS void __curand_matmat(unsigned int *matrixA, unsigned int *matrixB, int n)
{
    unsigned int result[MAX_XOR_N];
    for(int i = 0; i < n * 32; i++) {
        __curand_matvec(matrixA + i * n, matrixB, result, n);
        for(int j = 0; j < n; j++) {
            matrixA[i * n + j] = result[j];
        }
    }
}

/* copy vectorA to vector */
QUALIFIERS void __curand_veccopy(unsigned int *vector, unsigned int *vectorA, int n)
{
    for(int i = 0; i < n; i++) {
        vector[i] = vectorA[i];
    }
}

/* copy matrixA to matrix */
QUALIFIERS void __curand_matcopy(unsigned int *matrix, unsigned int *matrixA, int n)
{
    for(int i = 0; i < n * n * 32; i++) {
        matrix[i] = matrixA[i];
    }
}

/* compute matrixA to power p, store result in matrix */
QUALIFIERS void __curand_matpow(unsigned int *matrix, unsigned int *matrixA,
                                unsigned long long p, int n)
{
    unsigned int matrixR[MAX_XOR_N * MAX_XOR_N * 32];
    unsigned int matrixS[MAX_XOR_N * MAX_XOR_N * 32];
    __curand_matidentity(matrix, n);
    __curand_matcopy(matrixR, matrixA, n);
    while(p) {
        if(p & 1) {
            __curand_matmat(matrix, matrixR, n);
        }
        __curand_matcopy(matrixS, matrixR, n);
        __curand_matmat(matrixR, matrixS, n);
        p >>= 1;
    }
}

/****************************************************************************/
/* Utility functions needed by MRG32k3a RNG                                 */
/* Matrix operations modulo some integer less than 2**32, done in           */
/* double precision floating point, with care not to overflow 53 bits       */
/****************************************************************************/

/* return i mod m.                                                          */
/* assumes i and m are integers represented accurately in doubles           */

QUALIFIERS double curand_MRGmod(double i, double m)
{
    double quo;
    double rem;
    quo = floor(i/m);
    rem = i - (quo*m);
    if (rem < 0.0) rem += m;
    return rem;
}

/* Multiplication modulo m. Inputs i and j less than 2**32                  */
/* Ensure intermediate results do not exceed 2**53                          */

QUALIFIERS double curand_MRGmodMul(double i, double j, double m)
{
    double tempHi;
    double tempLo;

    tempHi = floor(i/131072.0);
    tempLo = i - (tempHi*131072.0);
    tempLo = curand_MRGmod( curand_MRGmod( (tempHi * j), m) * 131072.0 + curand_MRGmod(tempLo * j, m),m);

    if (tempLo < 0.0) tempLo += m;
    return tempLo;
}

/* multiply 3 by 3 matrices of doubles, modulo m                            */

QUALIFIERS void curand_MRGmatMul3x3(unsigned int i1[][3],unsigned int i2[][3],unsigned int o[][3],double m)
{
    int i,j;
    double temp[3][3];
    for (i=0; i<3; i++){
        for (j=0; j<3; j++){
            temp[i][j] = ( curand_MRGmodMul(i1[i][0], i2[0][j], m) +
                           curand_MRGmodMul(i1[i][1], i2[1][j], m) +
                           curand_MRGmodMul(i1[i][2], i2[2][j], m));
            temp[i][j] = curand_MRGmod( temp[i][j], m );
        }
    }
    for (i=0; i<3; i++){
        for (j=0; j<3; j++){
            o[i][j] = (unsigned int)temp[i][j];
        }
    }
}

/* multiply 3 by 3 matrix times 3 by 1 vector of doubles, modulo m          */

QUALIFIERS void curand_MRGmatVecMul3x3( unsigned int i[][3], unsigned int v[], double m)
{
    int k;
    double t[3];
    for (k = 0; k < 3; k++) {
        t[k] = ( curand_MRGmodMul(i[k][0], v[0], m) +
                 curand_MRGmodMul(i[k][1], v[1], m) +
                 curand_MRGmodMul(i[k][2], v[2], m) );
        t[k] = curand_MRGmod( t[k], m );
    }
    for (k = 0; k < 3; k++) {
        v[k] = (unsigned int)t[k];
    }

}

/* raise a 3 by 3 matrix of doubles to a 64 bit integer power pow, modulo m */
/* input is index zero of an array of 3 by 3 matrices m,                    */
/* each m = m[0]**(2**index)                                                */

QUALIFIERS void curand_MRGmatPow3x3( unsigned int in[][3][3], unsigned int o[][3], double m, unsigned long long pow )
{
    int i,j;
    for ( i = 0; i < 3; i++ ) {
        for ( j = 0; j < 3; j++ ) {
            o[i][j] = 0;
            if ( i == j ) o[i][j] = 1;
        }
    }
    i = 0;
    curand_MRGmatVecMul3x3(o,o[0],m);
    while (pow) {
        if ( pow & 1ll ) {
             curand_MRGmatMul3x3(in[i], o, o, m);
        }
        i++;
        pow >>= 1;
    }
}

/* raise a 3 by 3 matrix of doubles to the power                            */
/* 2 to the power (pow modulo 191), modulo m                                */

QUALIFIERS void curnand_MRGmatPow2Pow3x3( double in[][3], double o[][3], double m, unsigned long pow )
{
    unsigned int temp[3][3];
    int i,j;
    pow = pow % 191;
    for ( i = 0; i < 3; i++ ) {
        for ( j = 0; j < 3; j++ ) {
            temp[i][j] = (unsigned int)in[i][j];
        }
    }
    while (pow) {
        curand_MRGmatMul3x3(temp, temp, temp, m);
        pow--;
    }
    for ( i = 0; i < 3; i++ ) {
        for ( j = 0; j < 3; j++ ) {
            o[i][j] = temp[i][j];
        }
    }
}

/** \endcond */

/****************************************************************************/
/* Kernel implementations of RNGs                                           */
/****************************************************************************/

/* Test RNG */

QUALIFIERS void curand_init(unsigned long long seed,
                                            unsigned long long subsequence,
                                            unsigned long long offset,
                                            curandStateTest_t *state)
{
    state->v = (unsigned int)(seed * 3) + (unsigned int)(subsequence * 31337) + \
                     (unsigned int)offset;
}


QUALIFIERS unsigned int curand(curandStateTest_t *state)
{
    unsigned int r = state->v++;
    return r;
}

QUALIFIERS void skipahead(unsigned long long n, curandStateTest_t *state)
{
    state->v += (unsigned int)n;
}

/* XORWOW RNG */

template <typename T, int n>
QUALIFIERS void __curand_generate_skipahead_matrix_xor(unsigned int matrix[])
{
    T state;
    // Generate matrix that advances one step
    // matrix has n * n * 32 32-bit elements
    // solve for matrix by stepping single bit states
    for(int i = 0; i < 32 * n; i++) {
        state.d = 0;
        for(int j = 0; j < n; j++) {
            state.v[j] = 0;
        }
        state.v[i / 32] = (1 << (i & 31));
        curand(&state);
        for(int j = 0; j < n; j++) {
            matrix[i * n + j] = state.v[j];
        }
    }
}

template <typename T, int n>
QUALIFIERS void _skipahead_scratch(unsigned long long x, T *state, unsigned int *scratch)
{
    // unsigned int matrix[n * n * 32];
    unsigned int *matrix = scratch;
    // unsigned int matrixA[n * n * 32];
    unsigned int *matrixA = scratch + (n * n * 32);
    // unsigned int vector[n];
    unsigned int *vector = scratch + (n * n * 32) + (n * n * 32);
    // unsigned int result[n];
    unsigned int *result = scratch + (n * n * 32) + (n * n * 32) + n;
    unsigned long long p = x;
    for(int i = 0; i < n; i++) {
        vector[i] = state->v[i];
    }
    int matrix_num = 0;
    while(p && (matrix_num < PRECALC_NUM_MATRICES - 1)) {
        for(unsigned int t = 0; t < (p & PRECALC_BLOCK_MASK); t++) {
#ifdef __CUDA_ARCH__
            __curand_matvec(vector, precalc_xorwow_offset_matrix[matrix_num], result, n);
#else
            __curand_matvec(vector, precalc_xorwow_offset_matrix_host[matrix_num], result, n);
#endif
            __curand_veccopy(vector, result, n);
        }
        p >>= PRECALC_BLOCK_SIZE;
        matrix_num++;
    }
    if(p) {
#ifdef __CUDA_ARCH__
        __curand_matcopy(matrix, precalc_xorwow_offset_matrix[PRECALC_NUM_MATRICES - 1], n);
        __curand_matcopy(matrixA, precalc_xorwow_offset_matrix[PRECALC_NUM_MATRICES - 1], n);
#else
        __curand_matcopy(matrix, precalc_xorwow_offset_matrix_host[PRECALC_NUM_MATRICES - 1], n);
        __curand_matcopy(matrixA, precalc_xorwow_offset_matrix_host[PRECALC_NUM_MATRICES - 1], n);
#endif
    }
    while(p) {
        for(unsigned int t = 0; t < (p & SKIPAHEAD_MASK); t++) {
            __curand_matvec(vector, matrixA, result, n);
            __curand_veccopy(vector, result, n);
        }
        p >>= SKIPAHEAD_BLOCKSIZE;
        if(p) {
            for(int i = 0; i < SKIPAHEAD_BLOCKSIZE; i++) {
                __curand_matmat(matrix, matrixA, n);
                __curand_matcopy(matrixA, matrix, n);
            }
        }
    }
    for(int i = 0; i < n; i++) {
        state->v[i] = vector[i];
    }
    state->d += 362437 * (unsigned int)x;
}

template <typename T, int n>
QUALIFIERS void _skipahead_sequence_scratch(unsigned long long x, T *state, unsigned int *scratch)
{
    // unsigned int matrix[n * n * 32];
    unsigned int *matrix = scratch;
    // unsigned int matrixA[n * n * 32];
    unsigned int *matrixA = scratch + (n * n * 32);
    // unsigned int vector[n];
    unsigned int *vector = scratch + (n * n * 32) + (n * n * 32);
    // unsigned int result[n];
    unsigned int *result = scratch + (n * n * 32) + (n * n * 32) + n;
    unsigned long long p = x;
    for(int i = 0; i < n; i++) {
        vector[i] = state->v[i];
    }
    int matrix_num = 0;
    while(p && matrix_num < PRECALC_NUM_MATRICES - 1) {
        for(unsigned int t = 0; t < (p & PRECALC_BLOCK_MASK); t++) {
#ifdef __CUDA_ARCH__
            __curand_matvec(vector, precalc_xorwow_matrix[matrix_num], result, n);
#else
            __curand_matvec(vector, precalc_xorwow_matrix_host[matrix_num], result, n);
#endif
            __curand_veccopy(vector, result, n);
        }
        p >>= PRECALC_BLOCK_SIZE;
        matrix_num++;
    }
    if(p) {
#ifdef __CUDA_ARCH__
        __curand_matcopy(matrix, precalc_xorwow_matrix[PRECALC_NUM_MATRICES - 1], n);
        __curand_matcopy(matrixA, precalc_xorwow_matrix[PRECALC_NUM_MATRICES - 1], n);
#else
        __curand_matcopy(matrix, precalc_xorwow_matrix_host[PRECALC_NUM_MATRICES - 1], n);
        __curand_matcopy(matrixA, precalc_xorwow_matrix_host[PRECALC_NUM_MATRICES - 1], n);
#endif
    }
    while(p) {
        for(unsigned int t = 0; t < (p & SKIPAHEAD_MASK); t++) {
            __curand_matvec(vector, matrixA, result, n);
            __curand_veccopy(vector, result, n);
        }
        p >>= SKIPAHEAD_BLOCKSIZE;
        if(p) {
            for(int i = 0; i < SKIPAHEAD_BLOCKSIZE; i++) {
                __curand_matmat(matrix, matrixA, n);
                __curand_matcopy(matrixA, matrix, n);
            }
        }
    }
    for(int i = 0; i < n; i++) {
        state->v[i] = vector[i];
    }
    /* No update of state->d needed, guaranteed to be a multiple of 2^32 */
}

template <typename T, int N>
QUALIFIERS void _skipahead_inplace(const unsigned long long x, T *state)
{
    unsigned long long p = x;
    int matrix_num = 0;
    while(p) {
        for(unsigned int t = 0; t < (p & PRECALC_BLOCK_MASK); t++) {
#ifdef __CUDA_ARCH__
            __curand_matvec_inplace<N>(state->v, precalc_xorwow_offset_matrix[matrix_num]);
#else
            __curand_matvec_inplace<N>(state->v, precalc_xorwow_offset_matrix_host[matrix_num]);
#endif
        }
        p >>= PRECALC_BLOCK_SIZE;
        matrix_num++;
    }
    state->d += 362437 * (unsigned int)x;
}

template <typename T, int N>
QUALIFIERS void _skipahead_sequence_inplace(unsigned long long x, T *state)
{
    int matrix_num = 0;
    while(x) {
        for(unsigned int t = 0; t < (x & PRECALC_BLOCK_MASK); t++) {
#ifdef __CUDA_ARCH__
            __curand_matvec_inplace<N>(state->v, precalc_xorwow_matrix[matrix_num]);
#else
            __curand_matvec_inplace<N>(state->v, precalc_xorwow_matrix_host[matrix_num]);
#endif
        }
        x >>= PRECALC_BLOCK_SIZE;
        matrix_num++;
    }
    /* No update of state->d needed, guaranteed to be a multiple of 2^32 */
}

/**
 * \brief Update XORWOW state to skip \p n elements.
 *
 * Update the XORWOW state in \p state to skip ahead \p n elements.
 *
 * All values of \p n are valid.  Large values require more computation and so
 * will take more time to complete.
 *
 * \param n - Number of elements to skip
 * \param state - Pointer to state to update
 */
QUALIFIERS void skipahead(unsigned long long n, curandStateXORWOW_t *state)
{
    _skipahead_inplace<curandStateXORWOW_t, 5>(n, state);
}

/**
 * \brief Update XORWOW state to skip ahead \p n subsequences.
 *
 * Update the XORWOW state in \p state to skip ahead \p n subsequences.  Each
 * subsequence is \xmlonly<ph outputclass="xmlonly">2<sup>67</sup></ph>\endxmlonly elements long, so this means the function will skip ahead
 * \xmlonly<ph outputclass="xmlonly">2<sup>67</sup></ph>\endxmlonly  * n elements.
 *
 * All values of \p n are valid.  Large values require more computation and so
 * will take more time to complete.
 *
 * \param n - Number of subsequences to skip
 * \param state - Pointer to state to update
 */
QUALIFIERS void skipahead_sequence(unsigned long long n, curandStateXORWOW_t *state)
{
    _skipahead_sequence_inplace<curandStateXORWOW_t, 5>(n, state);
}

QUALIFIERS void _curand_init_scratch(unsigned long long seed,
                                     unsigned long long subsequence,
                                     unsigned long long offset,
                                     curandStateXORWOW_t *state,
                                     unsigned int *scratch)
{
    // Break up seed, apply salt
    // Constants are arbitrary nonzero values
    unsigned int s0 = ((unsigned int)seed) ^ 0xaad26b49UL;
    unsigned int s1 = (unsigned int)(seed >> 32) ^ 0xf7dcefddUL;
    // Simple multiplication to mix up bits
    // Constants are arbitrary odd values
    unsigned int t0 = 1099087573UL * s0;
    unsigned int t1 = 2591861531UL * s1;
    state->d = 6615241 + t1 + t0;
    state->v[0] = 123456789UL + t0;
    state->v[1] = 362436069UL ^ t0;
    state->v[2] = 521288629UL + t1;
    state->v[3] = 88675123UL ^ t1;
    state->v[4] = 5783321UL + t0;
    _skipahead_sequence_scratch<curandStateXORWOW_t, 5>(subsequence, state, scratch);
    _skipahead_scratch<curandStateXORWOW_t, 5>(offset, state, scratch);
    state->boxmuller_flag = 0;
    state->boxmuller_flag_double = 0;
    state->boxmuller_extra = 0.f;
    state->boxmuller_extra_double = 0.;
}

QUALIFIERS void _curand_init_inplace(unsigned long long seed,
                                     unsigned long long subsequence,
                                     unsigned long long offset,
                                     curandStateXORWOW_t *state)
{
    // Break up seed, apply salt
    // Constants are arbitrary nonzero values
    unsigned int s0 = ((unsigned int)seed) ^ 0xaad26b49UL;
    unsigned int s1 = (unsigned int)(seed >> 32) ^ 0xf7dcefddUL;
    // Simple multiplication to mix up bits
    // Constants are arbitrary odd values
    unsigned int t0 = 1099087573UL * s0;
    unsigned int t1 = 2591861531UL * s1;
    state->d = 6615241 + t1 + t0;
    state->v[0] = 123456789UL + t0;
    state->v[1] = 362436069UL ^ t0;
    state->v[2] = 521288629UL + t1;
    state->v[3] = 88675123UL ^ t1;
    state->v[4] = 5783321UL + t0;
    _skipahead_sequence_inplace<curandStateXORWOW_t, 5>(subsequence, state);
    _skipahead_inplace<curandStateXORWOW_t, 5>(offset, state);
    state->boxmuller_flag = 0;
    state->boxmuller_flag_double = 0;
    state->boxmuller_extra = 0.f;
    state->boxmuller_extra_double = 0.;
}

/**
 * \brief Initialize XORWOW state.
 *
 * Initialize XORWOW state in \p state with the given \p seed, \p subsequence,
 * and \p offset.
 *
 * All input values of \p seed, \p subsequence, and \p offset are legal.  Large
 * values for \p subsequence and \p offset require more computation and so will
 * take more time to complete.
 *
 * A value of 0 for \p seed sets the state to the values of the original
 * published version of the \p xorwow algorithm.
 *
 * \param seed - Arbitrary bits to use as a seed
 * \param subsequence - Subsequence to start at
 * \param offset - Absolute offset into sequence
 * \param state - Pointer to state to initialize
 */
QUALIFIERS void curand_init(unsigned long long seed,
                            unsigned long long subsequence,
                            unsigned long long offset,
                            curandStateXORWOW_t *state)
{
    _curand_init_inplace(seed, subsequence, offset, state);
}

/**
 * \brief Return 32-bits of pseudorandomness from an XORWOW generator.
 *
 * Return 32-bits of pseudorandomness from the XORWOW generator in \p state,
 * increment position of generator by one.
 *
 * \param state - Pointer to state to update
 *
 * \return 32-bits of pseudorandomness as an unsigned int, all bits valid to use.
 */
QUALIFIERS unsigned int curand(curandStateXORWOW_t *state)
{
    unsigned int t;
    t = (state->v[0] ^ (state->v[0] >> 2));
    state->v[0] = state->v[1];
    state->v[1] = state->v[2];
    state->v[2] = state->v[3];
    state->v[3] = state->v[4];
    state->v[4] = (state->v[4] ^ (state->v[4] <<4)) ^ (t ^ (t << 1));
    state->d += 362437;
    return state->v[4] + state->d;
}


/**
 * \brief Return 32-bits of pseudorandomness from an Philox4_32_10 generator.
 *
 * Return 32-bits of pseudorandomness from the Philox4_32_10 generator in \p state,
 * increment position of generator by one.
 *
 * \param state - Pointer to state to update
 *
 * \return 32-bits of pseudorandomness as an unsigned int, all bits valid to use.
 */

QUALIFIERS unsigned int curand(curandStatePhilox4_32_10_t *state)
{
    // Maintain the invariant: output[STATE] is always "good" and
    //  is the next value to be returned by curand.
    unsigned int ret;
    switch(state->STATE++){
    default:
        ret = state->output.x;
        break;
    case 1:
        ret = state->output.y;
        break;
    case 2:
        ret = state->output.z;
        break;
    case 3:
        ret = state->output.w;
        break;
    }
    if(state->STATE == 4){
        Philox_State_Incr(state);
        state->output = curand_Philox4x32_10(state->ctr,state->key);
        state->STATE = 0;
    }
    return ret;
}

/**
 * \brief Return tuple of 4 32-bit pseudorandoms from a Philox4_32_10 generator.
 *
 * Return 128 bits of pseudorandomness from the Philox4_32_10 generator in \p state,
 * increment position of generator by four.
 *
 * \param state - Pointer to state to update
 *
 * \return 128-bits of pseudorandomness as a uint4, all bits valid to use.
 */

QUALIFIERS uint4 curand4(curandStatePhilox4_32_10_t *state)
{
    uint4 r;

    uint4 tmp = state->output;
    Philox_State_Incr(state);
    state->output= curand_Philox4x32_10(state->ctr,state->key);
    switch(state->STATE){
    case 0:
        return tmp;
    case 1:
        r.x = tmp.y;
        r.y = tmp.z;
        r.z = tmp.w;
        r.w = state->output.x;
        break;
    case 2:
        r.x = tmp.z;
        r.y = tmp.w;
        r.z = state->output.x;
        r.w = state->output.y;
        break;
    case 3:
        r.x = tmp.w;
        r.y = state->output.x;
        r.z = state->output.y;
        r.w = state->output.z;
        break;
    default:
        // NOT possible but needed to avoid compiler warnings
        return tmp;
    }
    return r;
}

/**
 * \brief Update Philox4_32_10 state to skip \p n elements.
 *
 * Update the Philox4_32_10 state in \p state to skip ahead \p n elements.
 *
 * All values of \p n are valid.
 *
 * \param n - Number of elements to skip
 * \param state - Pointer to state to update
 */
QUALIFIERS void skipahead(unsigned long long n, curandStatePhilox4_32_10_t *state)
{
    state->STATE += (n & 3);
    n /= 4;
    if( state->STATE > 3 ){
        n += 1;
        state->STATE -= 4;
    }
    Philox_State_Incr(state, n);
    state->output = curand_Philox4x32_10(state->ctr,state->key);
}

/**
 * \brief Update Philox4_32_10 state to skip ahead \p n subsequences.
 *
 * Update the Philox4_32_10 state in \p state to skip ahead \p n subsequences.  Each
 * subsequence is \xmlonly<ph outputclass="xmlonly">2<sup>66</sup></ph>\endxmlonly elements long, so this means the function will skip ahead
 * \xmlonly<ph outputclass="xmlonly">2<sup>66</sup></ph>\endxmlonly * n elements.
 *
 * All values of \p n are valid.
 *
 * \param n - Number of subsequences to skip
 * \param state - Pointer to state to update
 */
QUALIFIERS void skipahead_sequence(unsigned long long n, curandStatePhilox4_32_10_t *state)
{
    Philox_State_Incr_hi(state, n);
    state->output = curand_Philox4x32_10(state->ctr,state->key);
}

/**
 * \brief Initialize Philox4_32_10 state.
 *
 * Initialize Philox4_32_10 state in \p state with the given \p seed, p\ subsequence,
 * and \p offset.
 *
 * All input values for \p seed, \p subseqence and \p offset are legal.  Each of the
 * \xmlonly<ph outputclass="xmlonly">2<sup>64</sup></ph>\endxmlonly possible
 * values of seed selects an independent sequence of length
 * \xmlonly<ph outputclass="xmlonly">2<sup>130</sup></ph>\endxmlonly.
 * The first
 * \xmlonly<ph outputclass="xmlonly">2<sup>66</sup> * subsequence + offset</ph>\endxmlonly.
 * values of the sequence are skipped.
 * I.e., subsequences are of length
 * \xmlonly<ph outputclass="xmlonly">2<sup>66</sup></ph>\endxmlonly.
 *
 * \param seed - Arbitrary bits to use as a seed
 * \param subsequence - Subsequence to start at
 * \param offset - Absolute offset into subsequence
 * \param state - Pointer to state to initialize
 */
QUALIFIERS void curand_init(unsigned long long seed,
                                 unsigned long long subsequence,
                                 unsigned long long offset,
                                 curandStatePhilox4_32_10_t *state)
{
    state->ctr = make_uint4(0, 0, 0, 0);
    state->key.x = (unsigned int)seed;
    state->key.y = (unsigned int)(seed>>32);
    state->STATE = 0;
    state->boxmuller_flag = 0;
    state->boxmuller_flag_double = 0;
    state->boxmuller_extra = 0.f;
    state->boxmuller_extra_double = 0.;
    skipahead_sequence(subsequence, state);
    skipahead(offset, state);
}


/* MRG32k3a RNG */

/* Base generator for MRG32k3a                                              */
#if __CUDA_ARCH__ > 600
QUALIFIERS unsigned long long __curand_umad(unsigned int a, unsigned int b, unsigned long long c)
{
    unsigned long long r;
    asm("mad.wide.u32 %0, %1, %2, %3;"
        : "=l"(r) : "r"(a), "r"(b), "l"(c));
    return r;
}
QUALIFIERS unsigned long long __curand_umul(unsigned int a, unsigned int b)
{
    unsigned long long r;
    asm("mul.wide.u32 %0, %1, %2;"
        : "=l"(r) : "r"(a), "r"(b));
    return r;
}

QUALIFIERS double curand_MRG32k3a (curandStateMRG32k3a_t *state)
{
    const unsigned int m1  = 4294967087u;
    const unsigned int m2  = 4294944443u;
    const unsigned int m1c = 209u;
    const unsigned int m2c = 22853u;
    const unsigned int a12  = 1403580u;
    const unsigned int a13n = 810728u;
    const unsigned int a21  = 527612u;
    const unsigned int a23n = 1370589u;

    unsigned long long p1, p2;
    const unsigned long long p3 = __curand_umul(a13n, m1 - state->s1[0]);
    p1 = __curand_umad(a12, state->s1[1], p3);

    // Putting addition inside and changing umul to umad
    // slowed this function down on GV100
    p1 =  __curand_umul(p1 >> 32, m1c) + (p1 & 0xffffffff);
    if (p1 >= m1) p1 -= m1;

    state->s1[0] = state->s1[1]; state->s1[1] = state->s1[2]; state->s1[2] = p1;
    const unsigned long long p4 = __curand_umul(a23n, m2 - state->s2[0]);
    p2 = __curand_umad(a21, state->s2[2], p4);

    // Putting addition inside and changing umul to umad
    // slowed this function down on GV100
    p2 =  __curand_umul(p2 >> 32, m2c) + (p2 & 0xffffffff);
    p2 =  __curand_umul(p2 >> 32, m2c) + (p2 & 0xffffffff);
    if (p2 >= m2) p2 -= m2;

    state->s2[0] = state->s2[1]; state->s2[1] = state->s2[2]; state->s2[2] = p2;

    const unsigned int p5 = (unsigned int)p1 - (unsigned int)p2;
    if(p1 <= p2) return p5 + m1;
    return p5;
}
#elif __CUDA_ARCH__ > 0
/*  nj's implementation */
QUALIFIERS double curand_MRG32k3a (curandStateMRG32k3a_t *state)
{
    const double m1 = 4294967087.;
    const double m2 = 4294944443.;
    const double a12  = 1403580.;
    const double a13n = 810728.;
    const double a21  = 527612.;
    const double a23n = 1370589.;

    const double rh1 =  2.3283065498378290e-010;  /* (1.0 / m1)__hi */
    const double rl1 = -1.7354913086174288e-026;  /* (1.0 / m1)__lo */
    const double rh2 =  2.3283188252407387e-010;  /* (1.0 / m2)__hi */
    const double rl2 =  2.4081018096503646e-026;  /* (1.0 / m2)__lo */

    double q, p1, p2;
    p1 = a12 * state->s1[1] - a13n * state->s1[0];
    q = trunc (fma (p1, rh1, p1 * rl1));
    p1 -= q * m1;
    if (p1 < 0.0) p1 += m1;
    state->s1[0] = state->s1[1];   state->s1[1] = state->s1[2];   state->s1[2] = (unsigned int)p1;
    p2 = a21 * state->s2[2] - a23n * state->s2[0];
    q = trunc (fma (p2, rh2, p2 * rl2));
    p2 -= q * m2;
    if (p2 < 0.0) p2 += m2;
    state->s2[0] = state->s2[1];   state->s2[1] = state->s2[2];   state->s2[2] = (unsigned int)p2;
    if (p1 <= p2) return (p1 - p2 + m1);
    else return (p1 - p2);
}
/* end nj's implementation */
#else
QUALIFIERS double curand_MRG32k3a(curandStateMRG32k3a_t *state)
{
    double p1,p2,r;
    p1 = (MRG32K3A_A12 * state->s1[1]) - (MRG32K3A_A13N * state->s1[0]);
    p1 = curand_MRGmod(p1, MRG32K3A_MOD1);
    if (p1 < 0.0) p1 += MRG32K3A_MOD1;
    state->s1[0] = state->s1[1];
    state->s1[1] = state->s1[2];
    state->s1[2] = (unsigned int)p1;
    p2 = (MRG32K3A_A21 * state->s2[2]) - (MRG32K3A_A23N * state->s2[0]);
    p2 = curand_MRGmod(p2, MRG32K3A_MOD2);
    if (p2 < 0) p2 += MRG32K3A_MOD2;
    state->s2[0] = state->s2[1];
    state->s2[1] = state->s2[2];
    state->s2[2] = (unsigned int)p2;
    r = p1 - p2;
    if (r <= 0) r += MRG32K3A_MOD1;
    return r;
}
#endif


/**
 * \brief Return 32-bits of pseudorandomness from an MRG32k3a generator.
 *
 * Return 32-bits of pseudorandomness from the MRG32k3a generator in \p state,
 * increment position of generator by one.
 *
 * \param state - Pointer to state to update
 *
 * \return 32-bits of pseudorandomness as an unsigned int, all bits valid to use.
 */
QUALIFIERS unsigned int curand(curandStateMRG32k3a_t *state)
{
    double dRet;
    dRet = (double)curand_MRG32k3a(state)*(double)MRG32K3A_BITS_NORM;
    return (unsigned int)dRet;
}


/**
 * \brief Update MRG32k3a state to skip \p n elements.
 *
 * Update the MRG32k3a state in \p state to skip ahead \p n elements.
 *
 * All values of \p n are valid.  Large values require more computation and so
 * will take more time to complete.
 *
 * \param n - Number of elements to skip
 * \param state - Pointer to state to update
 */
QUALIFIERS void skipahead(unsigned long long n, curandStateMRG32k3a_t *state)
{
    unsigned int t[3][3];
#ifdef __CUDA_ARCH__
    curand_MRGmatPow3x3( mrg32k3aM1, t, MRG32K3A_MOD1, n);
    curand_MRGmatVecMul3x3( t, state->s1, MRG32K3A_MOD1);
    curand_MRGmatPow3x3(mrg32k3aM2, t, MRG32K3A_MOD2, n);
    curand_MRGmatVecMul3x3( t, state->s2, MRG32K3A_MOD2);
#else
    curand_MRGmatPow3x3( mrg32k3aM1Host, t, MRG32K3A_MOD1, n);
    curand_MRGmatVecMul3x3( t, state->s1, MRG32K3A_MOD1);
    curand_MRGmatPow3x3(mrg32k3aM2Host, t, MRG32K3A_MOD2, n);
    curand_MRGmatVecMul3x3( t, state->s2, MRG32K3A_MOD2);
#endif
}

/**
 * \brief Update MRG32k3a state to skip ahead \p n subsequences.
 *
 * Update the MRG32k3a state in \p state to skip ahead \p n subsequences.  Each
 * subsequence is \xmlonly<ph outputclass="xmlonly">2<sup>127</sup></ph>\endxmlonly
 *
 * \xmlonly<ph outputclass="xmlonly">2<sup>76</sup></ph>\endxmlonly elements long, so this means the function will skip ahead
 * \xmlonly<ph outputclass="xmlonly">2<sup>67</sup></ph>\endxmlonly * n elements.
 *
 * Valid values of \p n are 0 to \xmlonly<ph outputclass="xmlonly">2<sup>51</sup></ph>\endxmlonly.  Note \p n will be masked to 51 bits
 *
 * \param n - Number of subsequences to skip
 * \param state - Pointer to state to update
 */
QUALIFIERS void skipahead_subsequence(unsigned long long n, curandStateMRG32k3a_t *state)
{
    unsigned int t[3][3];
#ifdef __CUDA_ARCH__
    curand_MRGmatPow3x3( mrg32k3aM1SubSeq, t, MRG32K3A_MOD1, n);
    curand_MRGmatVecMul3x3( t, state->s1, MRG32K3A_MOD1);
    curand_MRGmatPow3x3( mrg32k3aM2SubSeq, t, MRG32K3A_MOD2, n);
    curand_MRGmatVecMul3x3( t, state->s2, MRG32K3A_MOD2);
#else
    curand_MRGmatPow3x3( mrg32k3aM1SubSeqHost, t, MRG32K3A_MOD1, n);
    curand_MRGmatVecMul3x3( t, state->s1, MRG32K3A_MOD1);
    curand_MRGmatPow3x3( mrg32k3aM2SubSeqHost, t, MRG32K3A_MOD2, n);
    curand_MRGmatVecMul3x3( t, state->s2, MRG32K3A_MOD2);
#endif
}

/**
 * \brief Update MRG32k3a state to skip ahead \p n sequences.
 *
 * Update the MRG32k3a state in \p state to skip ahead \p n sequences.  Each
 * sequence is \xmlonly<ph outputclass="xmlonly">2<sup>127</sup></ph>\endxmlonly elements long, so this means the function will skip ahead
 * \xmlonly<ph outputclass="xmlonly">2<sup>127</sup></ph>\endxmlonly * n elements.
 *
 * All values of \p n are valid.  Large values require more computation and so
 * will take more time to complete.
 *
 * \param n - Number of sequences to skip
 * \param state - Pointer to state to update
 */
QUALIFIERS void skipahead_sequence(unsigned long long n, curandStateMRG32k3a_t *state)
{
    unsigned int t[3][3];
#ifdef __CUDA_ARCH__
    curand_MRGmatPow3x3( mrg32k3aM1Seq, t, MRG32K3A_MOD1, n);
    curand_MRGmatVecMul3x3( t, state->s1, MRG32K3A_MOD1);
    curand_MRGmatPow3x3(  mrg32k3aM2Seq, t, MRG32K3A_MOD2, n);
    curand_MRGmatVecMul3x3( t, state->s2, MRG32K3A_MOD2);
#else
    curand_MRGmatPow3x3( mrg32k3aM1SeqHost, t, MRG32K3A_MOD1, n);
    curand_MRGmatVecMul3x3( t, state->s1, MRG32K3A_MOD1);
    curand_MRGmatPow3x3(  mrg32k3aM2SeqHost, t, MRG32K3A_MOD2, n);
    curand_MRGmatVecMul3x3( t, state->s2, MRG32K3A_MOD2);
#endif
}


/**
 * \brief Initialize MRG32k3a state.
 *
 * Initialize MRG32k3a state in \p state with the given \p seed, \p subsequence,
 * and \p offset.
 *
 * All input values of \p seed, \p subsequence, and \p offset are legal.
 * \p subsequence will be truncated to 51 bits to avoid running into the next sequence
 *
 * A value of 0 for \p seed sets the state to the values of the original
 * published version of the \p MRG32k3a algorithm.
 *
 * \param seed - Arbitrary bits to use as a seed
 * \param subsequence - Subsequence to start at
 * \param offset - Absolute offset into sequence
 * \param state - Pointer to state to initialize
 */
QUALIFIERS void curand_init(unsigned long long seed,
                            unsigned long long subsequence,
                            unsigned long long offset,
                            curandStateMRG32k3a_t *state)
{
    int i;
    for ( i=0; i<3; i++ ) {
        state->s1[i] = 12345u;
        state->s2[i] = 12345u;
    }
    if (seed != 0ull) {
        unsigned int x1 = ((unsigned int)seed) ^ 0x55555555UL;
        unsigned int x2 = (unsigned int)((seed >> 32) ^ 0xAAAAAAAAUL);
        state->s1[0] = (unsigned int)curand_MRGmodMul(x1, state->s1[0], MRG32K3A_MOD1);
        state->s1[1] = (unsigned int)curand_MRGmodMul(x2, state->s1[1], MRG32K3A_MOD1);
        state->s1[2] = (unsigned int)curand_MRGmodMul(x1, state->s1[2], MRG32K3A_MOD1);
        state->s2[0] = (unsigned int)curand_MRGmodMul(x2, state->s2[0], MRG32K3A_MOD2);
        state->s2[1] = (unsigned int)curand_MRGmodMul(x1, state->s2[1], MRG32K3A_MOD2);
        state->s2[2] = (unsigned int)curand_MRGmodMul(x2, state->s2[2], MRG32K3A_MOD2);
    }
    skipahead_subsequence( subsequence, state );
    skipahead( offset, state );
    state->boxmuller_flag = 0;
    state->boxmuller_flag_double = 0;
    state->boxmuller_extra = 0.f;
    state->boxmuller_extra_double = 0.;
}

/**
 * \brief Update Sobol32 state to skip \p n elements.
 *
 * Update the Sobol32 state in \p state to skip ahead \p n elements.
 *
 * All values of \p n are valid.
 *
 * \param n - Number of elements to skip
 * \param state - Pointer to state to update
 */
template <typename T>
QUALIFIERS
typename CURAND_STD::enable_if<CURAND_STD::is_same<curandStateSobol32_t*, T>::value || CURAND_STD::is_same<curandStateScrambledSobol32_t*, T>::value>::type
skipahead(unsigned int n, T state)
{
    unsigned int i_gray;
    state->x = state->c;
    state->i += n;
    /* Convert state->i to gray code */
    i_gray = state->i ^ (state->i >> 1);
    for(unsigned int k = 0; k < 32; k++) {
        if(i_gray & (1 << k)) {
            state->x ^= state->direction_vectors[k];
        }
    }
    return;
}

/**
 * \brief Update Sobol64 state to skip \p n elements.
 *
 * Update the Sobol64 state in \p state to skip ahead \p n elements.
 *
 * All values of \p n are valid.
 *
 * \param n - Number of elements to skip
 * \param state - Pointer to state to update
 */
template <typename T>
QUALIFIERS
typename CURAND_STD::enable_if<CURAND_STD::is_same<curandStateSobol64_t*, T>::value || CURAND_STD::is_same<curandStateScrambledSobol64_t*, T>::value>::type
skipahead(unsigned long long n, T state)
{
    unsigned long long i_gray;
    state->x = state->c;
    state->i += n;
    /* Convert state->i to gray code */
    i_gray = state->i ^ (state->i >> 1);
    for(unsigned k = 0; k < 64; k++) {
        if(i_gray & (1ULL << k)) {
            state->x ^= state->direction_vectors[k];
        }
    }
    return;
}

/**
 * \brief Initialize Sobol32 state.
 *
 * Initialize Sobol32 state in \p state with the given \p direction \p vectors and
 * \p offset.
 *
 * The direction vector is a device pointer to an array of 32 unsigned ints.
 * All input values of \p offset are legal.
 *
 * \param direction_vectors - Pointer to array of 32 unsigned ints representing the
 * direction vectors for the desired dimension
 * \param offset - Absolute offset into sequence
 * \param state - Pointer to state to initialize
 */
QUALIFIERS void curand_init(curandDirectionVectors32_t direction_vectors,
                                            unsigned int offset,
                                            curandStateSobol32_t *state)
{
    state->i = 0;
    state->c = 0;
    for(int i = 0; i < 32; i++) {
        state->direction_vectors[i] = direction_vectors[i];
    }
    state->x = 0;
    skipahead<curandStateSobol32_t *>(offset, state);
}
/**
 * \brief Initialize Scrambled Sobol32 state.
 *
 * Initialize Sobol32 state in \p state with the given \p direction \p vectors and
 * \p offset.
 *
 * The direction vector is a device pointer to an array of 32 unsigned ints.
 * All input values of \p offset are legal.
 *
 * \param direction_vectors - Pointer to array of 32 unsigned ints representing the
 direction vectors for the desired dimension
 * \param scramble_c Scramble constant
 * \param offset - Absolute offset into sequence
 * \param state - Pointer to state to initialize
 */
QUALIFIERS void curand_init(curandDirectionVectors32_t direction_vectors,
                                            unsigned int scramble_c,
                                            unsigned int offset,
                                            curandStateScrambledSobol32_t *state)
{
    state->i = 0;
    state->c = scramble_c;
    for(int i = 0; i < 32; i++) {
        state->direction_vectors[i] = direction_vectors[i];
    }
    state->x = state->c;
    skipahead<curandStateScrambledSobol32_t *>(offset, state);
}

QUALIFIERS int __curand_find_trailing_zero(unsigned int x)
{
#if __CUDA_ARCH__ > 0
    int y = __ffs(~x);
    if(y)
        return y - 1;
    return 31;
#else
    int i = 1;
    while(x & 1) {
        i++;
        x >>= 1;
    }
    i = i - 1;
    return i == 32 ? 31 : i;
#endif
}

QUALIFIERS int __curand_find_trailing_zero(unsigned long long x)
{
#if __CUDA_ARCH__ > 0
    int y = __ffsll(~x);
    if(y)
        return y - 1;
    return 63;
#else
    int i = 1;
    while(x & 1) {
        i++;
        x >>= 1;
    }
    i = i - 1;
    return i == 64 ? 63 : i;
#endif
}

/**
 * \brief Initialize Sobol64 state.
 *
 * Initialize Sobol64 state in \p state with the given \p direction \p vectors and
 * \p offset.
 *
 * The direction vector is a device pointer to an array of 64 unsigned long longs.
 * All input values of \p offset are legal.
 *
 * \param direction_vectors - Pointer to array of 64 unsigned long longs representing the
 direction vectors for the desired dimension
 * \param offset - Absolute offset into sequence
 * \param state - Pointer to state to initialize
 */
QUALIFIERS void curand_init(curandDirectionVectors64_t direction_vectors,
                                            unsigned long long offset,
                                            curandStateSobol64_t *state)
{
    state->i = 0;
    state->c = 0;
    for(int i = 0; i < 64; i++) {
        state->direction_vectors[i] = direction_vectors[i];
    }
    state->x = 0;
    skipahead<curandStateSobol64_t *>(offset, state);
}

/**
 * \brief Initialize Scrambled Sobol64 state.
 *
 * Initialize Sobol64 state in \p state with the given \p direction \p vectors and
 * \p offset.
 *
 * The direction vector is a device pointer to an array of 64 unsigned long longs.
 * All input values of \p offset are legal.
 *
 * \param direction_vectors - Pointer to array of 64 unsigned long longs representing the
 direction vectors for the desired dimension
 * \param scramble_c Scramble constant
 * \param offset - Absolute offset into sequence
 * \param state - Pointer to state to initialize
 */
QUALIFIERS void curand_init(curandDirectionVectors64_t direction_vectors,
                                            unsigned long long scramble_c,
                                            unsigned long long offset,
                                            curandStateScrambledSobol64_t *state)
{
    state->i = 0;
    state->c = scramble_c;
    for(int i = 0; i < 64; i++) {
        state->direction_vectors[i] = direction_vectors[i];
    }
    state->x = state->c;
    skipahead<curandStateScrambledSobol64_t *>(offset, state);
}

/**
 * \brief Return 32-bits of quasirandomness from a Sobol32 generator.
 *
 * Return 32-bits of quasirandomness from the Sobol32 generator in \p state,
 * increment position of generator by one.
 *
 * \param state - Pointer to state to update
 *
 * \return 32-bits of quasirandomness as an unsigned int, all bits valid to use.
 */

QUALIFIERS unsigned int curand(curandStateSobol32_t * state)
{
    /* Moving from i to i+1 element in gray code is flipping one bit,
       the trailing zero bit of i
    */
    unsigned int res = state->x;
    state->x ^= state->direction_vectors[__curand_find_trailing_zero(state->i)];
    state->i ++;
    return res;
}

/**
 * \brief Return 32-bits of quasirandomness from a scrambled Sobol32 generator.
 *
 * Return 32-bits of quasirandomness from the scrambled Sobol32 generator in \p state,
 * increment position of generator by one.
 *
 * \param state - Pointer to state to update
 *
 * \return 32-bits of quasirandomness as an unsigned int, all bits valid to use.
 */

QUALIFIERS unsigned int curand(curandStateScrambledSobol32_t * state)
{
    /* Moving from i to i+1 element in gray code is flipping one bit,
       the trailing zero bit of i
    */
    unsigned int res = state->x;
    state->x ^= state->direction_vectors[__curand_find_trailing_zero(state->i)];
    state->i ++;
    return res;
}

/**
 * \brief Return 64-bits of quasirandomness from a Sobol64 generator.
 *
 * Return 64-bits of quasirandomness from the Sobol64 generator in \p state,
 * increment position of generator by one.
 *
 * \param state - Pointer to state to update
 *
 * \return 64-bits of quasirandomness as an unsigned long long, all bits valid to use.
 */

QUALIFIERS unsigned long long curand(curandStateSobol64_t * state)
{
    /* Moving from i to i+1 element in gray code is flipping one bit,
       the trailing zero bit of i
    */
    unsigned long long res = state->x;
    state->x ^= state->direction_vectors[__curand_find_trailing_zero(state->i)];
    state->i ++;
    return res;
}

/**
 * \brief Return 64-bits of quasirandomness from a scrambled Sobol64 generator.
 *
 * Return 64-bits of quasirandomness from the scrambled Sobol32 generator in \p state,
 * increment position of generator by one.
 *
 * \param state - Pointer to state to update
 *
 * \return 64-bits of quasirandomness as an unsigned long long, all bits valid to use.
 */

QUALIFIERS unsigned long long curand(curandStateScrambledSobol64_t * state)
{
    /* Moving from i to i+1 element in gray code is flipping one bit,
       the trailing zero bit of i
    */
    unsigned long long res = state->x;
    state->x ^= state->direction_vectors[__curand_find_trailing_zero(state->i)];
    state->i ++;
    return res;
}

#include "curand_uniform.h"
#include "curand_normal.h"
#include "curand_lognormal.h"
#include "curand_poisson.h"
#include "curand_discrete2.h"

__device__ static inline unsigned int *__get_precalculated_matrix(int n)
{
    if(n == 0) {
        return precalc_xorwow_matrix[n];
    }
    if(n == 2) {
        return precalc_xorwow_offset_matrix[n];
    }
    return precalc_xorwow_matrix[n];
}

#ifndef __CUDACC_RTC__
__host__ static inline unsigned int *__get_precalculated_matrix_host(int n)
{
    if(n == 1) {
        return precalc_xorwow_matrix_host[n];
    }
    if(n == 3) {
        return precalc_xorwow_offset_matrix_host[n];
    }
    return precalc_xorwow_matrix_host[n];
}
#endif // #ifndef __CUDACC_RTC__

__device__ static inline unsigned int *__get_mrg32k3a_matrix(int n)
{
    if(n == 0) {
        return mrg32k3aM1[n][0];
    }
    if(n == 2) {
        return mrg32k3aM2[n][0];
    }
    if(n == 4) {
        return mrg32k3aM1SubSeq[n][0];
    }
    if(n == 6) {
        return mrg32k3aM2SubSeq[n][0];
    }
    if(n == 8) {
        return mrg32k3aM1Seq[n][0];
    }
    if(n == 10) {
        return mrg32k3aM2Seq[n][0];
    }
    return mrg32k3aM1[n][0];
}

#ifndef __CUDACC_RTC__
__host__ static inline unsigned int *__get_mrg32k3a_matrix_host(int n)
{
    if(n == 1) {
        return mrg32k3aM1Host[n][0];
    }
    if(n == 3) {
        return mrg32k3aM2Host[n][0];
    }
    if(n == 5) {
        return mrg32k3aM1SubSeqHost[n][0];
    }
    if(n == 7) {
        return mrg32k3aM2SubSeqHost[n][0];
    }
    if(n == 9) {
        return mrg32k3aM1SeqHost[n][0];
    }
    if(n == 11) {
        return mrg32k3aM2SeqHost[n][0];
    }
    return mrg32k3aM1Host[n][0];
}

__host__ static inline double *__get__cr_lgamma_table_host(void) {
    return __cr_lgamma_table;
}
#endif // #ifndef __CUDACC_RTC__

/** @} */

#endif // !defined(CURAND_KERNEL_H_)