/* * Copyright 2010-2014 NVIDIA Corporation. All rights reserved. * * NOTICE TO LICENSEE: * * This source code and/or documentation ("Licensed Deliverables") are * subject to NVIDIA intellectual property rights under U.S. and * international Copyright laws. * * These Licensed Deliverables contained herein is PROPRIETARY and * CONFIDENTIAL to NVIDIA and is being provided under the terms and * conditions of a form of NVIDIA software license agreement by and * between NVIDIA and Licensee ("License Agreement") or electronically * accepted by Licensee. Notwithstanding any terms or conditions to * the contrary in the License Agreement, reproduction or disclosure * of the Licensed Deliverables to any third party without the express * written consent of NVIDIA is prohibited. * * NOTWITHSTANDING ANY TERMS OR CONDITIONS TO THE CONTRARY IN THE * LICENSE AGREEMENT, NVIDIA MAKES NO REPRESENTATION ABOUT THE * SUITABILITY OF THESE LICENSED DELIVERABLES FOR ANY PURPOSE. IT IS * PROVIDED "AS IS" WITHOUT EXPRESS OR IMPLIED WARRANTY OF ANY KIND. * NVIDIA DISCLAIMS ALL WARRANTIES WITH REGARD TO THESE LICENSED * DELIVERABLES, INCLUDING ALL IMPLIED WARRANTIES OF MERCHANTABILITY, * NONINFRINGEMENT, AND FITNESS FOR A PARTICULAR PURPOSE. * NOTWITHSTANDING ANY TERMS OR CONDITIONS TO THE CONTRARY IN THE * LICENSE AGREEMENT, IN NO EVENT SHALL NVIDIA BE LIABLE FOR ANY * SPECIAL, INDIRECT, INCIDENTAL, OR CONSEQUENTIAL DAMAGES, OR ANY * DAMAGES WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, * WHETHER IN AN ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS * ACTION, ARISING OUT OF OR IN CONNECTION WITH THE USE OR PERFORMANCE * OF THESE LICENSED DELIVERABLES. * * U.S. Government End Users. These Licensed Deliverables are a * "commercial item" as that term is defined at 48 C.F.R. 2.101 (OCT * 1995), consisting of "commercial computer software" and "commercial * computer software documentation" as such terms are used in 48 * C.F.R. 12.212 (SEPT 1995) and is provided to the U.S. Government * only as a commercial end item. Consistent with 48 C.F.R.12.212 and * 48 C.F.R. 227.7202-1 through 227.7202-4 (JUNE 1995), all * U.S. Government End Users acquire the Licensed Deliverables with * only those rights set forth herein. * * Any use of the Licensed Deliverables in individual and commercial * software must include, in the user documentation and internal * comments to the code, the above Disclaimer and U.S. Government End * Users Notice. */ /* * curand_mtgp32_kernel.h * * * MTGP32-11213 * * Mersenne Twister RNG for the GPU * * The period of generated integers is 2¹¹²¹³-1. * * This code generates 32-bit unsigned integers, and * single precision floating point numbers uniformly distributed * in the range [1, 2). (float r; 1.0 <= r < 2.0) */ /* * Copyright (c) 2009, 2010 Mutsuo Saito, Makoto Matsumoto and Hiroshima * University. All rights reserved. * Copyright (c) 2011 Mutsuo Saito, Makoto Matsumoto, Hiroshima * University and University of Tokyo. All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions are * met: * * * Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * * Redistributions in binary form must reproduce the above * copyright notice, this list of conditions and the following * disclaimer in the documentation and/or other materials provided * with the distribution. * * Neither the name of the Hiroshima University nor the names of * its contributors may be used to endorse or promote products * derived from this software without specific prior written * permission. * * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. */ #if !defined CURAND_MTGP32_KERNEL_H #define CURAND_MTGP32_KERNEL_H #if !defined(QUALIFIERS) #define QUALIFIERS static __forceinline__ __device__ #endif #ifndef __CUDACC_RTC__ #include #include #include #include #endif // ifndef __CUDACC_RTC__ #include #include "curand.h" #include "curand_mtgp32.h" /** * \addtogroup DEVICE Device API * * @{ */ #ifndef __CUDA_ARCH__ // define blockDim and threadIdx for host compatibility call extern const dim3 blockDim; extern const uint3 threadIdx; #endif /* * The function of the recursion formula calculation. * * @param[in] X1 the farthest part of state array. * @param[in] X2 the second farthest part of state array. * @param[in] Y a part of state array. * @param[in] bid block id. * @return output */ QUALIFIERS unsigned int para_rec(mtgp32_kernel_params_t * k,unsigned int X1, unsigned int X2, unsigned int Y, int bid) { unsigned int X = (X1 & k->mask[0]) ^ X2; unsigned int MAT; X ^= X << k->sh1_tbl[bid]; Y = X ^ (Y >> k->sh2_tbl[bid]); MAT = k->param_tbl[bid][Y & 0x0f]; return Y ^ MAT; } /* * The tempering function. * * @param[in] V the output value should be tempered. * @param[in] T the tempering helper value. * @param[in] bid block id. * @return the tempered value. */ QUALIFIERS unsigned int temper(mtgp32_kernel_params_t * k,unsigned int V, unsigned int T, int bid) { unsigned int MAT; T ^= T >> 16; T ^= T >> 8; MAT = k->temper_tbl[bid][T & 0x0f]; return V ^ MAT; } /* * The tempering and converting function. * By using the preset table, converting to IEEE format * and tempering are done simultaneously. * * @param[in] V the output value should be tempered. * @param[in] T the tempering helper value. * @param[in] bid block id. * @return the tempered and converted value. */ QUALIFIERS unsigned int temper_single(mtgp32_kernel_params_t * k,unsigned int V, unsigned int T, int bid) { unsigned int MAT; unsigned int r; T ^= T >> 16; T ^= T >> 8; MAT = k->single_temper_tbl[bid][T & 0x0f]; r = (V >> 9) ^ MAT; return r; } /** * \brief Return 32-bits of pseudorandomness from a mtgp32 generator. * * Return 32-bits of pseudorandomness from the mtgp32 generator in \p state, * increment position of generator by the number of threads in the block. * Note the number of threads in the block can not exceed 256. * * \param state - Pointer to state to update * * \return 32-bits of pseudorandomness as an unsigned int, all bits valid to use. */ QUALIFIERS unsigned int curand(curandStateMtgp32_t *state) { unsigned int t; unsigned int d; int pos = state->k->pos_tbl[state->pIdx]; unsigned int r; unsigned int o; d = blockDim.z * blockDim.y * blockDim.x; //assert( d <= 256 ); t = (blockDim.z * blockDim.y * threadIdx.z) + (blockDim.x * threadIdx.y) + threadIdx.x; r = para_rec(state->k, state->s[(t + state->offset) & MTGP32_STATE_MASK], state->s[(t + state->offset + 1) & MTGP32_STATE_MASK], state->s[(t + state->offset + pos) & MTGP32_STATE_MASK], state->pIdx); state->s[(t + state->offset + MTGPDC_N) & MTGP32_STATE_MASK] = r; o = temper(state->k, r, state->s[(t + state->offset + pos -1) & MTGP32_STATE_MASK], state->pIdx); NV_IF_TARGET(NV_IS_DEVICE, __syncthreads(); ) if (t == 0) { state->offset = (state->offset + d) & MTGP32_STATE_MASK; } NV_IF_TARGET(NV_IS_DEVICE, __syncthreads(); ) return o; } /** * \brief Return 32-bits of pseudorandomness from a specific position in a mtgp32 generator. * * Return 32-bits of pseudorandomness from position \p index of the mtgp32 generator in \p state, * increment position of generator by \p n positions, which must be the total number of positions * upddated in the state by the thread block, for this invocation. * * Note : * Thread indices must range from 0...\ n - 1. * The number of positions updated may not exceed 256. * A thread block may update more than one state, but a given state may not be updated by more than one thread block. * * \param state - Pointer to state to update * \param index - Index (0..255) of the position within the state to draw from and update * \param n - The total number of postions in this state that are being updated by this invocation * * \return 32-bits of pseudorandomness as an unsigned int, all bits valid to use. */ QUALIFIERS unsigned int curand_mtgp32_specific(curandStateMtgp32_t *state, unsigned char index, unsigned char n) { unsigned int t; int pos = state->k->pos_tbl[state->pIdx]; unsigned int r; unsigned int o; t = index; r = para_rec(state->k, state->s[(t + state->offset) & MTGP32_STATE_MASK], state->s[(t + state->offset + 1) & MTGP32_STATE_MASK], state->s[(t + state->offset + pos) & MTGP32_STATE_MASK], state->pIdx); state->s[(t + state->offset + MTGPDC_N) & MTGP32_STATE_MASK] = r; o = temper(state->k, r, state->s[(t + state->offset + pos -1) & MTGP32_STATE_MASK], state->pIdx); NV_IF_TARGET(NV_IS_DEVICE, __syncthreads(); ) if (index == 0) { state->offset = (state->offset + n) & MTGP32_STATE_MASK; } NV_IF_TARGET(NV_IS_DEVICE, __syncthreads(); ) return o; } /** * \brief Return a uniformly distributed float from a mtgp32 generator. * * Return a uniformly distributed float between \p 0.0f and \p 1.0f * from the mtgp32 generator in \p state, increment position of generator. * Output range excludes \p 0.0f but includes \p 1.0f. Denormalized floating * point outputs are never returned. * * Note: This alternate derivation of a uniform float is provided for completeness * with the original source * * \param state - Pointer to state to update * * \return uniformly distributed float between \p 0.0f and \p 1.0f */ QUALIFIERS float curand_mtgp32_single(curandStateMtgp32_t *state) { unsigned int t; unsigned int d; int pos = state->k->pos_tbl[state->pIdx]; unsigned int r; unsigned int o_u; float o_f; t = blockDim.z * blockDim.y; d = t * blockDim.x; //assert( d <= 256 ); t += threadIdx.x; r = para_rec(state->k, state->s[(t + state->offset) & MTGP32_STATE_MASK], state->s[(t + state->offset + 1) & MTGP32_STATE_MASK], state->s[(t + state->offset + pos) & MTGP32_STATE_MASK], state->pIdx); state->s[t] = r; o_u = temper_single(state->k, r, state->s[(t + state->offset + pos -1) & MTGP32_STATE_MASK], state->pIdx); NV_IF_TARGET(NV_IS_DEVICE, __syncthreads(); ) if (threadIdx.x == 0) { state->offset = (state->offset + d) & MTGP32_STATE_MASK; } NV_IF_TARGET(NV_IS_DEVICE, __syncthreads(); ) memcpy(&o_f, &o_u, sizeof(o_u)); return o_f; } /** * \brief Return a uniformly distributed float from a specific position in a mtgp32 generator. * * Return a uniformly distributed float between \p 0.0f and \p 1.0f * from position \p index of the mtgp32 generator in \p state, and * increment position of generator by \p n positions, which must be the total number of positions * upddated in the state by the thread block, for this invocation. * Output range excludes \p 0.0f but includes \p 1.0f. Denormalized floating * point outputs are never returned. * * Note 1: * Thread indices must range from 0...\p n - 1. * The number of positions updated may not exceed 256. * A thread block may update more than one state, but a given state may not be updated by more than one thread block. * * Note 2: This alternate derivation of a uniform float is provided for completeness * with the original source * * \param state - Pointer to state to update * \param index - Index (0..255) of the position within the state to draw from and update * \param n - The total number of postions in this state that are being updated by this invocation * * \return uniformly distributed float between \p 0.0f and \p 1.0f */ QUALIFIERS float curand_mtgp32_single_specific(curandStateMtgp32_t *state, unsigned char index, unsigned char n) { unsigned int t; int pos = state->k->pos_tbl[state->pIdx]; unsigned int r; unsigned int o_u; float o_f; t = index; r = para_rec(state->k, state->s[(t + state->offset) & MTGP32_STATE_MASK], state->s[(t + state->offset + 1) & MTGP32_STATE_MASK], state->s[(t + state->offset + pos) & MTGP32_STATE_MASK], state->pIdx); state->s[t] = r; o_u = temper_single(state->k, r, state->s[(t + state->offset + pos -1) & MTGP32_STATE_MASK], state->pIdx); NV_IF_TARGET(NV_IS_DEVICE, __syncthreads(); ) if (threadIdx.x == 0) { state->offset = (state->offset + n) & MTGP32_STATE_MASK; } NV_IF_TARGET(NV_IS_DEVICE, __syncthreads(); ) memcpy(&o_f, &o_u, sizeof(o_u)); return o_f; } /** @} */ #endif