/build/reproducible-path/rocrand-6.4.1/library/include/rocrand/rocrand_philox4x32_10.h Source File

/build/reproducible-path/rocrand-6.4.1/library/include/rocrand/rocrand_philox4x32_10.h Source File#

API library: /build/reproducible-path/rocrand-6.4.1/library/include/rocrand/rocrand_philox4x32_10.h Source File
rocrand_philox4x32_10.h
1// Copyright (c) 2017-2024 Advanced Micro Devices, Inc. All rights reserved.
2//
3// Permission is hereby granted, free of charge, to any person obtaining a copy
4// of this software and associated documentation files (the "Software"), to deal
5// in the Software without restriction, including without limitation the rights
6// to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
7// copies of the Software, and to permit persons to whom the Software is
8// furnished to do so, subject to the following conditions:
9//
10// The above copyright notice and this permission notice shall be included in
11// all copies or substantial portions of the Software.
12//
13// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
14// IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
15// FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
16// AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
17// LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
18// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
19// THE SOFTWARE.
20
21/*
22Copyright 2010-2011, D. E. Shaw Research.
23All rights reserved.
24
25Redistribution and use in source and binary forms, with or without
26modification, are permitted provided that the following conditions are
27met:
28
29* Redistributions of source code must retain the above copyright
30 notice, this list of conditions, and the following disclaimer.
31
32* Redistributions in binary form must reproduce the above copyright
33 notice, this list of conditions, and the following disclaimer in the
34 documentation and/or other materials provided with the distribution.
35
36* Neither the name of D. E. Shaw Research nor the names of its
37 contributors may be used to endorse or promote products derived from
38 this software without specific prior written permission.
39
40THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
41"AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
42LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
43A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
44OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
45SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
46LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
47DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
48THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
49(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
50OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
51*/
52
53#ifndef ROCRAND_PHILOX4X32_10_H_
54#define ROCRAND_PHILOX4X32_10_H_
55
56#include "rocrand/rocrand_common.h"
57
// Philox4x32 constants, taken from Random123.
// See https://www.deshawresearch.com/resources_random123.html

// Multipliers used by each Philox round (applied to counter words x and z).
#define ROCRAND_PHILOX_M4x32_0 0xD2511F53U
#define ROCRAND_PHILOX_M4x32_1 0xCD9E8D57U

// Increments added to the two key words between consecutive rounds.
#define ROCRAND_PHILOX_W32_0 0x9E3779B9U
#define ROCRAND_PHILOX_W32_1 0xBB67AE85U

// Default seed for PHILOX4x32 PRNG (used by the default engine constructor).
#define ROCRAND_PHILOX4x32_DEFAULT_SEED 0xdeadbeefdeadbeefULL
// end of group rocranddevice
75
76namespace rocrand_device {
77namespace detail {
78
79__forceinline__ __device__ __host__ unsigned int
80 mulhilo32(unsigned int x, unsigned int y, unsigned int& z)
81{
82 unsigned long long xy = mad_u64_u32(x, y, 0);
83 z = static_cast<unsigned int>(xy >> 32);
84 return static_cast<unsigned int>(xy);
85}
86
87} // end detail namespace
88
89class philox4x32_10_engine
90{
91public:
92 struct philox4x32_10_state
93 {
94 uint4 counter;
95 uint4 result;
96 uint2 key;
97 unsigned int substate;
98
99 #ifndef ROCRAND_DETAIL_BM_NOT_IN_STATE
100 // The Box–Muller transform requires two inputs to convert uniformly
101 // distributed real values [0; 1] to normally distributed real values
102 // (with mean = 0, and stddev = 1). Often user wants only one
103 // normally distributed number, to save performance and random
104 // numbers the 2nd value is saved for future requests.
105 unsigned int boxmuller_float_state; // is there a float in boxmuller_float
106 unsigned int boxmuller_double_state; // is there a double in boxmuller_double
107 float boxmuller_float; // normally distributed float
108 double boxmuller_double; // normally distributed double
109 #endif
110 };
111
112 __forceinline__ __device__ __host__ philox4x32_10_engine()
113 {
114 this->seed(ROCRAND_PHILOX4x32_DEFAULT_SEED, 0, 0);
115 }
116
122 __forceinline__ __device__ __host__ philox4x32_10_engine(const unsigned long long seed,
123 const unsigned long long subsequence,
124 const unsigned long long offset)
125 {
126 this->seed(seed, subsequence, offset);
127 }
128
134 __forceinline__ __device__ __host__ void seed(unsigned long long seed_value,
135 const unsigned long long subsequence,
136 const unsigned long long offset)
137 {
138 m_state.key.x = static_cast<unsigned int>(seed_value);
139 m_state.key.y = static_cast<unsigned int>(seed_value >> 32);
140 this->restart(subsequence, offset);
141 }
142
144 __forceinline__ __device__ __host__ void discard(unsigned long long offset)
145 {
146 this->discard_impl(offset);
147 this->m_state.result = this->ten_rounds(m_state.counter, m_state.key);
148 }
149
154 __forceinline__ __device__ __host__ void discard_subsequence(unsigned long long subsequence)
155 {
156 this->discard_subsequence_impl(subsequence);
157 m_state.result = this->ten_rounds(m_state.counter, m_state.key);
158 }
159
160 __forceinline__ __device__ __host__ void restart(const unsigned long long subsequence,
161 const unsigned long long offset)
162 {
163 m_state.counter = {0, 0, 0, 0};
164 m_state.result = {0, 0, 0, 0};
165 m_state.substate = 0;
166 #ifndef ROCRAND_DETAIL_BM_NOT_IN_STATE
167 m_state.boxmuller_float_state = 0;
168 m_state.boxmuller_double_state = 0;
169 #endif
170 this->discard_subsequence_impl(subsequence);
171 this->discard_impl(offset);
172 m_state.result = this->ten_rounds(m_state.counter, m_state.key);
173 }
174
175 __forceinline__ __device__ __host__ unsigned int operator()()
176 {
177 return this->next();
178 }
179
180 __forceinline__ __device__ __host__ unsigned int next()
181 {
182 #if defined(__HIP_PLATFORM_AMD__)
183 unsigned int ret = m_state.result.data[m_state.substate];
184 #else
185 unsigned int ret = (&m_state.result.x)[m_state.substate];
186 #endif
187 m_state.substate++;
188 if(m_state.substate == 4)
189 {
190 m_state.substate = 0;
191 this->discard_state();
192 m_state.result = this->ten_rounds(m_state.counter, m_state.key);
193 }
194 return ret;
195 }
196
197 __forceinline__ __device__ __host__ uint4 next4()
198 {
199 uint4 ret = m_state.result;
200 this->discard_state();
201 m_state.result = this->ten_rounds(m_state.counter, m_state.key);
202 return this->interleave(ret, m_state.result);
203 }
204
205protected:
206 // Advances the internal state to skip \p offset numbers.
207 // DOES NOT CALCULATE NEW 4 UINTs (m_state.result)
208 __forceinline__ __device__ __host__ void discard_impl(unsigned long long offset)
209 {
210 // Adjust offset for subset
211 m_state.substate += offset & 3;
212 unsigned long long counter_offset = offset / 4;
213 counter_offset += m_state.substate < 4 ? 0 : 1;
214 m_state.substate += m_state.substate < 4 ? 0 : -4;
215 // Discard states
216 this->discard_state(counter_offset);
217 }
218
219 // DOES NOT CALCULATE NEW 4 UINTs (m_state.result)
220 __forceinline__ __device__ __host__ void
221 discard_subsequence_impl(unsigned long long subsequence)
222 {
223 unsigned int lo = static_cast<unsigned int>(subsequence);
224 unsigned int hi = static_cast<unsigned int>(subsequence >> 32);
225
226 unsigned int temp = m_state.counter.z;
227 m_state.counter.z += lo;
228 m_state.counter.w += hi + (m_state.counter.z < temp ? 1 : 0);
229 }
230
231 // Advances the internal state by offset times.
232 // DOES NOT CALCULATE NEW 4 UINTs (m_state.result)
233 __forceinline__ __device__ __host__ void discard_state(unsigned long long offset)
234 {
235 unsigned int lo = static_cast<unsigned int>(offset);
236 unsigned int hi = static_cast<unsigned int>(offset >> 32);
237
238 uint4 temp = m_state.counter;
239 m_state.counter.x += lo;
240 m_state.counter.y += hi + (m_state.counter.x < temp.x ? 1 : 0);
241 m_state.counter.z += (m_state.counter.y < temp.y ? 1 : 0);
242 m_state.counter.w += (m_state.counter.z < temp.z ? 1 : 0);
243 }
244
245 // Advances the internal state to the next state
246 // DOES NOT CALCULATE NEW 4 UINTs (m_state.result)
247 __forceinline__ __device__ __host__ void discard_state()
248 {
249 m_state.counter = this->bump_counter(m_state.counter);
250 }
251
252 __forceinline__ __device__ __host__ static uint4 bump_counter(uint4 counter)
253 {
254 counter.x++;
255 unsigned int add = counter.x == 0 ? 1 : 0;
256 counter.y += add; add = counter.y == 0 ? add : 0;
257 counter.z += add; add = counter.z == 0 ? add : 0;
258 counter.w += add;
259 return counter;
260 }
261
262 __forceinline__ __device__ __host__ uint4 interleave(const uint4 prev, const uint4 next) const
263 {
264 switch(m_state.substate)
265 {
266 case 0:
267 return prev;
268 case 1:
269 return uint4{ prev.y, prev.z, prev.w, next.x };
270 case 2:
271 return uint4{ prev.z, prev.w, next.x, next.y };
272 case 3:
273 return uint4{ prev.w, next.x, next.y, next.z };
274 }
275 __builtin_unreachable();
276 }
277
278 // 10 Philox4x32 rounds
279 __forceinline__ __device__ __host__ uint4 ten_rounds(uint4 counter, uint2 key)
280 {
281 counter = this->single_round(counter, key); key = this->bumpkey(key); // 1
282 counter = this->single_round(counter, key); key = this->bumpkey(key); // 2
283 counter = this->single_round(counter, key); key = this->bumpkey(key); // 3
284 counter = this->single_round(counter, key); key = this->bumpkey(key); // 4
285 counter = this->single_round(counter, key); key = this->bumpkey(key); // 5
286 counter = this->single_round(counter, key); key = this->bumpkey(key); // 6
287 counter = this->single_round(counter, key); key = this->bumpkey(key); // 7
288 counter = this->single_round(counter, key); key = this->bumpkey(key); // 8
289 counter = this->single_round(counter, key); key = this->bumpkey(key); // 9
290 return this->single_round(counter, key); // 10
291 }
292
293private:
294 // Single Philox4x32 round
295 __forceinline__ __device__ __host__ static uint4 single_round(uint4 counter, uint2 key)
296 {
297 // Source: Random123
298 unsigned int hi0;
299 unsigned int hi1;
300 unsigned int lo0 = detail::mulhilo32(ROCRAND_PHILOX_M4x32_0, counter.x, hi0);
301 unsigned int lo1 = detail::mulhilo32(ROCRAND_PHILOX_M4x32_1, counter.z, hi1);
302 return uint4 {
303 hi1 ^ counter.y ^ key.x,
304 lo1,
305 hi0 ^ counter.w ^ key.y,
306 lo0
307 };
308 }
309
310 __forceinline__ __device__ __host__ static uint2 bumpkey(uint2 key)
311 {
312 key.x += ROCRAND_PHILOX_W32_0;
313 key.y += ROCRAND_PHILOX_W32_1;
314 return key;
315 }
316
317protected:
318 // State
319 philox4x32_10_state m_state;
320
321 #ifndef ROCRAND_DETAIL_BM_NOT_IN_STATE
322 friend struct detail::engine_boxmuller_helper<philox4x32_10_engine>;
323 #endif
324
325}; // philox4x32_10_engine class
326
327} // end namespace rocrand_device
328
335typedef rocrand_device::philox4x32_10_engine rocrand_state_philox4x32_10;
337
349__forceinline__ __device__ __host__ void rocrand_init(const unsigned long long seed,
350 const unsigned long long subsequence,
351 const unsigned long long offset,
352 rocrand_state_philox4x32_10* state)
353{
354 *state = rocrand_state_philox4x32_10(seed, subsequence, offset);
355}
356
369__forceinline__ __device__ __host__ unsigned int rocrand(rocrand_state_philox4x32_10* state)
370{
371 return state->next();
372}
373
386__forceinline__ __device__ __host__ uint4 rocrand4(rocrand_state_philox4x32_10* state)
387{
388 return state->next4();
389}
390
399__forceinline__ __device__ __host__ void skipahead(unsigned long long offset,
400 rocrand_state_philox4x32_10* state)
401{
402 return state->discard(offset);
403}
404
414__forceinline__ __device__ __host__ void skipahead_subsequence(unsigned long long subsequence,
415 rocrand_state_philox4x32_10* state)
416{
417 return state->discard_subsequence(subsequence);
418}
419
429__forceinline__ __device__ __host__ void skipahead_sequence(unsigned long long sequence,
430 rocrand_state_philox4x32_10* state)
431{
432 return state->discard_subsequence(sequence);
433}
434
435#endif // ROCRAND_PHILOX4X32_10_H_
436
// end of group rocranddevice
__forceinline__ __device__ __host__ void skipahead(unsigned long long offset, rocrand_state_philox4x32_10 *state)
Updates Philox state to skip ahead by offset elements.
Definition rocrand_philox4x32_10.h:399
__forceinline__ __device__ __host__ void rocrand_init(const unsigned long long seed, const unsigned long long subsequence, const unsigned long long offset, rocrand_state_philox4x32_10 *state)
Initializes Philox state.
Definition rocrand_philox4x32_10.h:349
#define ROCRAND_PHILOX4x32_DEFAULT_SEED
Default seed for PHILOX4x32 PRNG.
Definition rocrand_philox4x32_10.h:73
__forceinline__ __device__ __host__ uint4 rocrand4(rocrand_state_philox4x32_10 *state)
Returns four uniformly distributed random unsigned int values from [0; 2^32 - 1] range.
Definition rocrand_philox4x32_10.h:386
__forceinline__ __device__ __host__ void skipahead_sequence(unsigned long long sequence, rocrand_state_philox4x32_10 *state)
Updates Philox state to skip ahead by sequence sequences.
Definition rocrand_philox4x32_10.h:429
__forceinline__ __device__ __host__ unsigned int rocrand(rocrand_state_philox4x32_10 *state)
Returns uniformly distributed random unsigned int value from [0; 2^32 - 1] range.
Definition rocrand_philox4x32_10.h:369
__forceinline__ __device__ __host__ void skipahead_subsequence(unsigned long long subsequence, rocrand_state_philox4x32_10 *state)
Updates Philox state to skip ahead by subsequence subsequences.
Definition rocrand_philox4x32_10.h:414