/build/reproducible-path/rocrand-6.4.1/library/include/rocrand/rocrand_mrg32k3a.h Source File

/build/reproducible-path/rocrand-6.4.1/library/include/rocrand/rocrand_mrg32k3a.h Source File#

API library: /build/reproducible-path/rocrand-6.4.1/library/include/rocrand/rocrand_mrg32k3a.h Source File
rocrand_mrg32k3a.h
1// Copyright (c) 2017-2024 Advanced Micro Devices, Inc. All rights reserved.
2//
3// Permission is hereby granted, free of charge, to any person obtaining a copy
4// of this software and associated documentation files (the "Software"), to deal
5// in the Software without restriction, including without limitation the rights
6// to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
7// copies of the Software, and to permit persons to whom the Software is
8// furnished to do so, subject to the following conditions:
9//
10// The above copyright notice and this permission notice shall be included in
11// all copies or substantial portions of the Software.
12//
13// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
14// IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
15// FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
16// AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
17// LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
18// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
19// THE SOFTWARE.
20
21#ifndef ROCRAND_MRG32K3A_H_
22#define ROCRAND_MRG32K3A_H_
23
24#include "rocrand/rocrand_common.h"
25#include "rocrand/rocrand_mrg32k3a_precomputed.h"
26
27#define ROCRAND_MRG32K3A_POW32 4294967296
28#define ROCRAND_MRG32K3A_M1 4294967087
29#define ROCRAND_MRG32K3A_M1C 209
30#define ROCRAND_MRG32K3A_M2 4294944443
31#define ROCRAND_MRG32K3A_M2C 22853
32#define ROCRAND_MRG32K3A_A12 1403580
33#define ROCRAND_MRG32K3A_A13 (4294967087 - 810728)
34#define ROCRAND_MRG32K3A_A13N 810728
35#define ROCRAND_MRG32K3A_A21 527612
36#define ROCRAND_MRG32K3A_A23 (4294944443 - 1370589)
37#define ROCRAND_MRG32K3A_A23N 1370589
38#define ROCRAND_MRG32K3A_NORM_DOUBLE (2.3283065498378288e-10) // 1/ROCRAND_MRG32K3A_M1
39#define ROCRAND_MRG32K3A_UINT_NORM (1.000000048661607) // (ROCRAND_MRG32K3A_POW32 - 1)/(ROCRAND_MRG32K3A_M1 - 1)
40
49 #define ROCRAND_MRG32K3A_DEFAULT_SEED 12345ULL
// end of group rocranddevice
51
52namespace rocrand_device {
53
54class mrg32k3a_engine
55{
56public:
57 struct mrg32k3a_state
58 {
59 unsigned int g1[3];
60 unsigned int g2[3];
61
62 #ifndef ROCRAND_DETAIL_BM_NOT_IN_STATE
63 // The Box–Muller transform requires two inputs to convert uniformly
64 // distributed real values [0; 1] to normally distributed real values
65 // (with mean = 0, and stddev = 1). Often user wants only one
66 // normally distributed number, to save performance and random
67 // numbers the 2nd value is saved for future requests.
68 unsigned int boxmuller_float_state; // is there a float in boxmuller_float
69 unsigned int boxmuller_double_state; // is there a double in boxmuller_double
70 float boxmuller_float; // normally distributed float
71 double boxmuller_double; // normally distributed double
72 #endif
73 };
74
75 __forceinline__ __device__ __host__ mrg32k3a_engine()
76 {
77 this->seed(ROCRAND_MRG32K3A_DEFAULT_SEED, 0, 0);
78 }
79
88 __forceinline__ __device__ __host__ mrg32k3a_engine(const unsigned long long seed,
89 const unsigned long long subsequence,
90 const unsigned long long offset)
91 {
92 this->seed(seed, subsequence, offset);
93 }
94
103 __forceinline__ __device__ __host__ void seed(unsigned long long seed_value,
104 const unsigned long long subsequence,
105 const unsigned long long offset)
106 {
107 if(seed_value == 0)
108 {
110 }
111 unsigned int x = (unsigned int) seed_value ^ 0x55555555U;
112 unsigned int y = (unsigned int) ((seed_value >> 32) ^ 0xAAAAAAAAU);
113 m_state.g1[0] = mod_mul_m1(x, seed_value);
114 m_state.g1[1] = mod_mul_m1(y, seed_value);
115 m_state.g1[2] = mod_mul_m1(x, seed_value);
116 m_state.g2[0] = mod_mul_m2(y, seed_value);
117 m_state.g2[1] = mod_mul_m2(x, seed_value);
118 m_state.g2[2] = mod_mul_m2(y, seed_value);
119 this->restart(subsequence, offset);
120 }
121
123 __forceinline__ __device__ __host__ void discard(unsigned long long offset)
124 {
125 this->discard_impl(offset);
126 }
127
130 __forceinline__ __device__ __host__ void discard_subsequence(unsigned long long subsequence)
131 {
132 this->discard_subsequence_impl(subsequence);
133 }
134
137 __forceinline__ __device__ __host__ void discard_sequence(unsigned long long sequence)
138 {
139 this->discard_sequence_impl(sequence);
140 }
141
142 __forceinline__ __device__ __host__ void restart(const unsigned long long subsequence,
143 const unsigned long long offset)
144 {
145 #ifndef ROCRAND_DETAIL_BM_NOT_IN_STATE
146 m_state.boxmuller_float_state = 0;
147 m_state.boxmuller_double_state = 0;
148 #endif
149 this->discard_subsequence_impl(subsequence);
150 this->discard_impl(offset);
151 }
152
153 __forceinline__ __device__ __host__ unsigned int operator()()
154 {
155 return this->next();
156 }
157
158 // Returned value is in range [1, ROCRAND_MRG32K3A_M1],
159 // where ROCRAND_MRG32K3A_M1 < UINT_MAX
160 __forceinline__ __device__ __host__ unsigned int next()
161 {
162 const unsigned int p1 = mod_m1(
163 detail::mad_u64_u32(
164 ROCRAND_MRG32K3A_A12,
165 m_state.g1[1],
166 detail::mad_u64_u32(
167 ROCRAND_MRG32K3A_A13N,
168 (ROCRAND_MRG32K3A_M1 - m_state.g1[0]),
169 0
170 )
171 )
172 );
173
174 m_state.g1[0] = m_state.g1[1]; m_state.g1[1] = m_state.g1[2];
175 m_state.g1[2] = p1;
176
177 const unsigned int p2 = mod_m2(
178 detail::mad_u64_u32(
179 ROCRAND_MRG32K3A_A21,
180 m_state.g2[2],
181 detail::mad_u64_u32(
182 ROCRAND_MRG32K3A_A23N,
183 (ROCRAND_MRG32K3A_M2 - m_state.g2[0]),
184 0
185 )
186 )
187 );
188
189 m_state.g2[0] = m_state.g2[1]; m_state.g2[1] = m_state.g2[2];
190 m_state.g2[2] = p2;
191
192 return (p1 - p2) + (p1 <= p2 ? ROCRAND_MRG32K3A_M1 : 0);
193 }
194
195protected:
196 // Advances the internal state to skip \p offset numbers.
197 // DOES NOT CALCULATE NEW ULONGLONG
198 __forceinline__ __device__ __host__ void discard_impl(unsigned long long offset)
199 {
200 discard_state(offset);
201 }
202
203 // DOES NOT CALCULATE NEW ULONGLONG
204 __forceinline__ __device__ __host__ void
205 discard_subsequence_impl(unsigned long long subsequence)
206 {
207 int i = 0;
208
209 while(subsequence > 0) {
210 if (subsequence & 1) {
211 #if defined(__HIP_DEVICE_COMPILE__)
212 mod_mat_vec_m1(d_A1P76 + i, m_state.g1);
213 mod_mat_vec_m2(d_A2P76 + i, m_state.g2);
214 #else
215 mod_mat_vec_m1(h_A1P76 + i, m_state.g1);
216 mod_mat_vec_m2(h_A2P76 + i, m_state.g2);
217 #endif
218 }
219 subsequence >>= 1;
220 i += 9;
221 }
222 }
223
224 // DOES NOT CALCULATE NEW ULONGLONG
225 __forceinline__ __device__ __host__ void discard_sequence_impl(unsigned long long sequence)
226 {
227 int i = 0;
228
229 while(sequence > 0) {
230 if (sequence & 1) {
231 #if defined(__HIP_DEVICE_COMPILE__)
232 mod_mat_vec_m1(d_A1P127 + i, m_state.g1);
233 mod_mat_vec_m2(d_A2P127 + i, m_state.g2);
234 #else
235 mod_mat_vec_m1(h_A1P127 + i, m_state.g1);
236 mod_mat_vec_m2(h_A2P127 + i, m_state.g2);
237 #endif
238 }
239 sequence >>= 1;
240 i += 9;
241 }
242 }
243
244 // Advances the internal state by offset times.
245 // DOES NOT CALCULATE NEW ULONGLONG
246 __forceinline__ __device__ __host__ void discard_state(unsigned long long offset)
247 {
248 int i = 0;
249
250 while(offset > 0) {
251 if (offset & 1) {
252 #if defined(__HIP_DEVICE_COMPILE__)
253 mod_mat_vec_m1(d_A1 + i, m_state.g1);
254 mod_mat_vec_m2(d_A2 + i, m_state.g2);
255 #else
256 mod_mat_vec_m1(h_A1 + i, m_state.g1);
257 mod_mat_vec_m2(h_A2 + i, m_state.g2);
258 #endif
259 }
260 offset >>= 1;
261 i += 9;
262 }
263 }
264
265 // Advances the internal state to the next state
266 // DOES NOT CALCULATE NEW ULONGLONG
267 __forceinline__ __device__ __host__ void discard_state()
268 {
269 discard_state(1);
270 }
271
272private:
273 __forceinline__ __device__ __host__ static void mod_mat_vec_m1(const unsigned long long* A,
274 unsigned int* s)
275 {
276 unsigned long long x[3];
277
278 x[0] = mod_m1(mod_m1(A[0] * s[0])
279 + mod_m1(A[1] * s[1])
280 + mod_m1(A[2] * s[2]));
281
282 x[1] = mod_m1(mod_m1(A[3] * s[0])
283 + mod_m1(A[4] * s[1])
284 + mod_m1(A[5] * s[2]));
285
286 x[2] = mod_m1(mod_m1(A[6] * s[0])
287 + mod_m1(A[7] * s[1])
288 + mod_m1(A[8] * s[2]));
289
290 s[0] = x[0];
291 s[1] = x[1];
292 s[2] = x[2];
293 }
294
295 __forceinline__ __device__ __host__ static void mod_mat_vec_m2(const unsigned long long* A,
296 unsigned int* s)
297 {
298 unsigned long long x[3];
299
300 x[0] = mod_m2(mod_m2(A[0] * s[0])
301 + mod_m2(A[1] * s[1])
302 + mod_m2(A[2] * s[2]));
303
304 x[1] = mod_m2(mod_m2(A[3] * s[0])
305 + mod_m2(A[4] * s[1])
306 + mod_m2(A[5] * s[2]));
307
308 x[2] = mod_m2(mod_m2(A[6] * s[0])
309 + mod_m2(A[7] * s[1])
310 + mod_m2(A[8] * s[2]));
311
312 s[0] = x[0];
313 s[1] = x[1];
314 s[2] = x[2];
315 }
316
317 __forceinline__ __device__ __host__ static unsigned long long mod_mul_m1(unsigned int i,
318 unsigned long long j)
319 {
320 long long hi, lo, temp1, temp2;
321
322 hi = i / 131072;
323 lo = i - (hi * 131072);
324 temp1 = mod_m1(hi * j) * 131072;
325 temp2 = mod_m1(lo * j);
326 lo = mod_m1(temp1 + temp2);
327
328 if (lo < 0)
329 lo += ROCRAND_MRG32K3A_M1;
330 return lo;
331 }
332
333 __forceinline__ __device__ __host__ static unsigned long long mod_m1(unsigned long long p)
334 {
335 p = detail::mad_u64_u32(ROCRAND_MRG32K3A_M1C, (p >> 32), p & (ROCRAND_MRG32K3A_POW32 - 1));
336 if (p >= ROCRAND_MRG32K3A_M1)
337 p -= ROCRAND_MRG32K3A_M1;
338
339 return p;
340 }
341
342 __forceinline__ __device__ __host__ static unsigned long long mod_mul_m2(unsigned int i,
343 unsigned long long j)
344 {
345 long long hi, lo, temp1, temp2;
346
347 hi = i / 131072;
348 lo = i - (hi * 131072);
349 temp1 = mod_m2(hi * j) * 131072;
350 temp2 = mod_m2(lo * j);
351 lo = mod_m2(temp1 + temp2);
352
353 if (lo < 0)
354 lo += ROCRAND_MRG32K3A_M2;
355 return lo;
356 }
357
358 __forceinline__ __device__ __host__ static unsigned long long mod_m2(unsigned long long p)
359 {
360 p = detail::mad_u64_u32(ROCRAND_MRG32K3A_M2C, (p >> 32), p & (ROCRAND_MRG32K3A_POW32 - 1));
361 p = detail::mad_u64_u32(ROCRAND_MRG32K3A_M2C, (p >> 32), p & (ROCRAND_MRG32K3A_POW32 - 1));
362 if (p >= ROCRAND_MRG32K3A_M2)
363 p -= ROCRAND_MRG32K3A_M2;
364
365 return p;
366 }
367
368protected:
369 // State
370 mrg32k3a_state m_state;
371
372 #ifndef ROCRAND_DETAIL_BM_NOT_IN_STATE
373 friend struct detail::engine_boxmuller_helper<mrg32k3a_engine>;
374 #endif
375
376}; // mrg32k3a_engine class
377
378} // end namespace rocrand_device
379
386typedef rocrand_device::mrg32k3a_engine rocrand_state_mrg32k3a;
388
400__forceinline__ __device__ __host__ void rocrand_init(const unsigned long long seed,
401 const unsigned long long subsequence,
402 const unsigned long long offset,
403 rocrand_state_mrg32k3a* state)
404{
405 *state = rocrand_state_mrg32k3a(seed, subsequence, offset);
406}
407
420__forceinline__ __device__ __host__ unsigned int rocrand(rocrand_state_mrg32k3a* state)
421{
422 // next() in [1, ROCRAND_MRG32K3A_M1]
423 return static_cast<unsigned int>((state->next() - 1) * ROCRAND_MRG32K3A_UINT_NORM);
424}
425
434__forceinline__ __device__ __host__ void skipahead(unsigned long long offset,
435 rocrand_state_mrg32k3a* state)
436{
437 return state->discard(offset);
438}
439
449__forceinline__ __device__ __host__ void skipahead_subsequence(unsigned long long subsequence,
450 rocrand_state_mrg32k3a* state)
451{
452 return state->discard_subsequence(subsequence);
453}
454
464__forceinline__ __device__ __host__ void skipahead_sequence(unsigned long long sequence,
465 rocrand_state_mrg32k3a* state)
466{
467 return state->discard_sequence(sequence);
468}
469
470#endif // ROCRAND_MRG32K3A_H_
471
// end of group rocranddevice
#define ROCRAND_MRG32K3A_DEFAULT_SEED
Default seed for MRG32K3A PRNG.
Definition rocrand_mrg32k3a.h:49
__forceinline__ __device__ __host__ void rocrand_init(const unsigned long long seed, const unsigned long long subsequence, const unsigned long long offset, rocrand_state_mrg32k3a *state)
Initializes MRG32K3A state.
Definition rocrand_mrg32k3a.h:400
__forceinline__ __device__ __host__ void skipahead_subsequence(unsigned long long subsequence, rocrand_state_mrg32k3a *state)
Updates MRG32K3A state to skip ahead by subsequence subsequences.
Definition rocrand_mrg32k3a.h:449
__forceinline__ __device__ __host__ void skipahead(unsigned long long offset, rocrand_state_mrg32k3a *state)
Updates MRG32K3A state to skip ahead by offset elements.
Definition rocrand_mrg32k3a.h:434
__forceinline__ __device__ __host__ unsigned int rocrand(rocrand_state_mrg32k3a *state)
Returns uniformly distributed random unsigned int value from [0; 2^32 - 1] range.
Definition rocrand_mrg32k3a.h:420
__forceinline__ __device__ __host__ void skipahead_sequence(unsigned long long sequence, rocrand_state_mrg32k3a *state)
Updates MRG32K3A state to skip ahead by sequence sequences.
Definition rocrand_mrg32k3a.h:464