// HIP: Heterogeneous-computing Interface for Portability — amd_device_functions.h
1/*
2Copyright (c) 2015 - 2023 Advanced Micro Devices, Inc. All rights reserved.
3
4Permission is hereby granted, free of charge, to any person obtaining a copy
5of this software and associated documentation files (the "Software"), to deal
6in the Software without restriction, including without limitation the rights
7to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
8copies of the Software, and to permit persons to whom the Software is
9furnished to do so, subject to the following conditions:
10
11The above copyright notice and this permission notice shall be included in
12all copies or substantial portions of the Software.
13
14THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
15IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
16FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
17AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
18LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
19OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
20THE SOFTWARE.
21*/
22
23#ifndef HIP_INCLUDE_HIP_AMD_DETAIL_DEVICE_FUNCTIONS_H
24#define HIP_INCLUDE_HIP_AMD_DETAIL_DEVICE_FUNCTIONS_H
25
26#if !defined(__HIPCC_RTC__)
27#include <hip/amd_detail/amd_hip_common.h>
29#include <hip/amd_detail/hip_assert.h>
30#include "host_defines.h"
31#include "math_fwd.h"
32#include <hip/hip_runtime_api.h>
33#include <stddef.h>
34#include <hip/hip_vector_types.h>
35#endif // !defined(__HIPCC_RTC__)
36
// Device-side printf: clang's HIP toolchain provides a real printf lowering,
// so only a declaration is needed there; otherwise fall back to a variadic
// no-op stub so code that calls printf still compiles.
#if defined(__clang__) && defined(__HIP__)
extern "C" __device__ int printf(const char *fmt, ...);
#else
template <typename... All>
static inline __device__ void printf(const char* format, All... all) {}
#endif
43
44extern "C" __device__ unsigned long long __ockl_steadyctr_u64();
45
46/*
47Integer Intrinsics
48*/
49
50// integer intrinsic function __poc __clz __ffs __brev
51__device__ static inline unsigned int __popc(unsigned int input) {
52 return __builtin_popcount(input);
53}
54__device__ static inline unsigned int __popcll(unsigned long long int input) {
55 return __builtin_popcountll(input);
56}
57
// Count of leading zero bits.  Takes a signed int (CUDA-compatible
// signature) but counts on the unsigned bit pattern via the ockl helper.
__device__ static inline int __clz(int input) {
    return __ockl_clz_u32((uint)input);
}

// 64-bit count of leading zero bits.
__device__ static inline int __clzll(long long int input) {
    return __ockl_clz_u64((uint64_t)input);
}
65
66__device__ static inline unsigned int __ffs(unsigned int input) {
67 return ( input == 0 ? -1 : __builtin_ctz(input) ) + 1;
68}
69
// Find-first-set for 64-bit values: 1-based position of the least
// significant set bit, or 0 when the input is 0 (the "-1 + 1" fold).
__device__ static inline unsigned int __ffsll(unsigned long long int input) {
    return ( input == 0 ? -1 : __builtin_ctzll(input) ) + 1;
}

// Overload for 'unsigned long' (64-bit on the amdgcn LP64 ABI).
__device__ static inline unsigned int __ffsll(unsigned long int input) {
    return ( input == 0 ? -1 : __builtin_ctzll(input) ) + 1;
}

// Signed overload; operates on the two's-complement bit pattern.
__device__ static inline unsigned int __ffs(int input) {
    return ( input == 0 ? -1 : __builtin_ctz(input) ) + 1;
}

// Signed 64-bit overloads.
__device__ static inline unsigned int __ffsll(long long int input) {
    return ( input == 0 ? -1 : __builtin_ctzll(input) ) + 1;
}

__device__ static inline unsigned int __ffsll(long int input) {
    return ( input == 0 ? -1 : __builtin_ctzll(input) ) + 1;
}
89
90// Given a 32/64-bit value exec mask and an integer value base (between 0 and WAVEFRONT_SIZE),
91// find the n-th (given by offset) set bit in the exec mask from the base bit, and return the bit position.
92// If not found, return -1.
93__device__ static int32_t __fns64(uint64_t mask, uint32_t base, int32_t offset) {
94 uint64_t temp_mask = mask;
95 int32_t temp_offset = offset;
96
97 if (offset == 0) {
98 temp_mask &= (1 << base);
99 temp_offset = 1;
100 }
101 else if (offset < 0) {
102 temp_mask = __builtin_bitreverse64(mask);
103 base = 63 - base;
104 temp_offset = -offset;
105 }
106
107 temp_mask = temp_mask & ((~0ULL) << base);
108 if (__builtin_popcountll(temp_mask) < temp_offset)
109 return -1;
110 int32_t total = 0;
111 for (int i = 0x20; i > 0; i >>= 1) {
112 uint64_t temp_mask_lo = temp_mask & ((1ULL << i) - 1);
113 int32_t pcnt = __builtin_popcountll(temp_mask_lo);
114 if (pcnt < temp_offset) {
115 temp_mask = temp_mask >> i;
116 temp_offset -= pcnt;
117 total += i;
118 }
119 else {
120 temp_mask = temp_mask_lo;
121 }
122 }
123 if (offset < 0)
124 return 63 - total;
125 else
126 return total;
127}
128
129__device__ static int32_t __fns32(uint64_t mask, uint32_t base, int32_t offset) {
130 uint64_t temp_mask = mask;
131 int32_t temp_offset = offset;
132 if (offset == 0) {
133 temp_mask &= (1 << base);
134 temp_offset = 1;
135 }
136 else if (offset < 0) {
137 temp_mask = __builtin_bitreverse64(mask);
138 base = 63 - base;
139 temp_offset = -offset;
140 }
141 temp_mask = temp_mask & ((~0ULL) << base);
142 if (__builtin_popcountll(temp_mask) < temp_offset)
143 return -1;
144 int32_t total = 0;
145 for (int i = 0x20; i > 0; i >>= 1) {
146 uint64_t temp_mask_lo = temp_mask & ((1ULL << i) - 1);
147 int32_t pcnt = __builtin_popcountll(temp_mask_lo);
148 if (pcnt < temp_offset) {
149 temp_mask = temp_mask >> i;
150 temp_offset -= pcnt;
151 total += i;
152 }
153 else {
154 temp_mask = temp_mask_lo;
155 }
156 }
157 if (offset < 0)
158 return 63 - total;
159 else
160 return total;
161}
162__device__ static inline unsigned int __brev(unsigned int input) {
163 return __builtin_bitreverse32(input);
164}
165
166__device__ static inline unsigned long long int __brevll(unsigned long long int input) {
167 return __builtin_bitreverse64(input);
168}
169
170__device__ static inline unsigned int __lastbit_u32_u64(uint64_t input) {
171 return input == 0 ? -1 : __builtin_ctzl(input);
172}
173
// Extract a 'width'-bit field starting at bit 'offset' from src0
// (offset = src1 & 31, width = src2 & 31).  width == 0 yields 0.
// NOTE(review): when offset + width > 32 the left shift count exceeds 31
// (undefined); callers are expected to keep offset + width <= 32.
__device__ static inline unsigned int __bitextract_u32(unsigned int src0, unsigned int src1, unsigned int src2) {
    uint32_t offset = src1 & 31;
    uint32_t width = src2 & 31;
    return width == 0 ? 0 : (src0 << (32 - offset - width)) >> (32 - width);
}

// 64-bit field extract (offset = src1 & 63, width = src2 & 63); same
// offset + width <= 64 precondition as the 32-bit variant.
__device__ static inline uint64_t __bitextract_u64(uint64_t src0, unsigned int src1, unsigned int src2) {
    uint64_t offset = src1 & 63;
    uint64_t width = src2 & 63;
    return width == 0 ? 0 : (src0 << (64 - offset - width)) >> (64 - width);
}
185
186__device__ static inline unsigned int __bitinsert_u32(unsigned int src0, unsigned int src1, unsigned int src2, unsigned int src3) {
187 uint32_t offset = src2 & 31;
188 uint32_t width = src3 & 31;
189 uint32_t mask = (1 << width) - 1;
190 return ((src0 & ~(mask << offset)) | ((src1 & mask) << offset));
191}
192
// 64-bit field insert: place the low 'width' bits of src1 into src0 at
// bit 'offset' (offset = src2 & 63, width = src3 & 63).
__device__ static inline uint64_t __bitinsert_u64(uint64_t src0, uint64_t src1, unsigned int src2, unsigned int src3) {
    uint64_t offset = src2 & 63;
    uint64_t width = src3 & 63;
    uint64_t mask = (1ULL << width) - 1;
    return ((src0 & ~(mask << offset)) | ((src1 & mask) << offset));
}
199
// Funnel shift left: shift the 64-bit concatenation {hi:lo} left by
// (shift & 31) and return the upper 32 bits.  alignbit extracts 32 bits
// starting at bit (32 - shift) of {hi:lo}, hence the shift == 0 special case.
__device__ inline unsigned int __funnelshift_l(unsigned int lo, unsigned int hi, unsigned int shift)
{
    uint32_t mask_shift = shift & 31;
    return mask_shift == 0 ? hi : __builtin_amdgcn_alignbit(hi, lo, 32 - mask_shift);
}

// Funnel shift left with the shift count clamped to 32 instead of masked.
__device__ inline unsigned int __funnelshift_lc(unsigned int lo, unsigned int hi, unsigned int shift)
{
    uint32_t min_shift = shift >= 32 ? 32 : shift;
    return min_shift == 0 ? hi : __builtin_amdgcn_alignbit(hi, lo, 32 - min_shift);
}

// Funnel shift right: lower 32 bits of {hi:lo} >> shift (alignbit masks
// the count to 5 bits, giving the wrap-around semantics).
__device__ inline unsigned int __funnelshift_r(unsigned int lo, unsigned int hi, unsigned int shift)
{
    return __builtin_amdgcn_alignbit(hi, lo, shift);
}

// Funnel shift right with the count clamped: counts >= 32 return hi.
__device__ inline unsigned int __funnelshift_rc(unsigned int lo, unsigned int hi, unsigned int shift)
{
    return shift >= 32 ? hi : __builtin_amdgcn_alignbit(hi, lo, shift);
}
221
222__device__ static unsigned int __byte_perm(unsigned int x, unsigned int y, unsigned int s);
223__device__ static unsigned int __hadd(int x, int y);
224__device__ static int __mul24(int x, int y);
225__device__ static long long int __mul64hi(long long int x, long long int y);
226__device__ static int __mulhi(int x, int y);
227__device__ static int __rhadd(int x, int y);
228__device__ static unsigned int __sad(int x, int y,unsigned int z);
229__device__ static unsigned int __uhadd(unsigned int x, unsigned int y);
230__device__ static int __umul24(unsigned int x, unsigned int y);
231__device__ static unsigned long long int __umul64hi(unsigned long long int x, unsigned long long int y);
232__device__ static unsigned int __umulhi(unsigned int x, unsigned int y);
233__device__ static unsigned int __urhadd(unsigned int x, unsigned int y);
234__device__ static unsigned int __usad(unsigned int x, unsigned int y, unsigned int z);
235
// Helper view of a 32-bit word as 4 bytes (selector bytes in __byte_perm).
// The struct header lines were lost in extraction; names restored from
// their uses in __byte_perm below.
struct ucharHolder {
    union {
        unsigned char c[4];
        unsigned int ui;
    };
} __attribute__((aligned(4)));

// Helper view of two 32-bit words {x, y} as 8 source bytes (__byte_perm).
struct uchar2Holder {
    union {
        unsigned int ui[2];
        unsigned char c[8];
    };
} __attribute__((aligned(8)));
249
// Byte permute (CUDA __byte_perm): builds a 32-bit result whose four bytes
// are selected from the 8 bytes of {x, y} by the four low nibbles of s.
// Each nibble is masked with 0x7: values 0-3 pick bytes of x, 4-7 bytes of y.
__device__
static inline unsigned int __byte_perm(unsigned int x, unsigned int y, unsigned int s) {
    struct uchar2Holder cHoldVal;  // 8 source bytes: ui[0] = x, ui[1] = y
    struct ucharHolder cHoldKey;   // selector nibbles, two per byte
    cHoldKey.ui = s;
    cHoldVal.ui[0] = x;
    cHoldVal.ui[1] = y;
    unsigned int result;
    result = cHoldVal.c[cHoldKey.c[0] & 0x07];                  // nibble 0 -> result byte 0
    result += (cHoldVal.c[(cHoldKey.c[0] & 0x70) >> 4] << 8);   // nibble 1 -> result byte 1
    result += (cHoldVal.c[cHoldKey.c[1] & 0x07] << 16);         // nibble 2 -> result byte 2
    result += (cHoldVal.c[(cHoldKey.c[1] & 0x70) >> 4] << 24);  // nibble 3 -> result byte 3
    return result;
}
264
265__device__ static inline unsigned int __hadd(int x, int y) {
266 int z = x + y;
267 int sign = z & 0x8000000;
268 int value = z & 0x7FFFFFFF;
269 return ((value) >> 1 || sign);
270}
271
// 24-bit signed multiply: low 32 bits of the product of the operands'
// low 24 bits.
__device__ static inline int __mul24(int x, int y) {
    return __ockl_mul24_i32(x, y);
}

// High 64 bits of the full 128-bit signed product of x and y, built from
// four 32x32-bit partial products.
__device__ static inline long long __mul64hi(long long int x, long long int y) {
    ulong x0 = (ulong)x & 0xffffffffUL;  // unsigned low halves
    long x1 = x >> 32;                   // signed high halves (arithmetic shift)
    ulong y0 = (ulong)y & 0xffffffffUL;
    long y1 = y >> 32;
    ulong z0 = x0*y0;
    long t = x1*y0 + (z0 >> 32);         // carry the high word of z0 upward
    long z1 = t & 0xffffffffL;
    long z2 = t >> 32;
    z1 = x0*y1 + z1;
    return x1*y1 + z2 + (z1 >> 32);
}

// High 32 bits of the 64-bit signed product.
__device__ static inline int __mulhi(int x, int y) {
    return __ockl_mul_hi_i32(x, y);
}
292
293__device__ static inline int __rhadd(int x, int y) {
294 int z = x + y + 1;
295 int sign = z & 0x8000000;
296 int value = z & 0x7FFFFFFF;
297 return ((value) >> 1 || sign);
298}
// Sum of absolute difference: |x - y| + z (signed operands, unsigned result).
__device__ static inline unsigned int __sad(int x, int y, unsigned int z) {
    return x > y ? x - y + z : y - x + z;
}
302__device__ static inline unsigned int __uhadd(unsigned int x, unsigned int y) {
303 return (x + y) >> 1;
304}
// 24-bit unsigned multiply: low 32 bits of the product of the operands'
// low 24 bits.  Note the int return type mirrors the existing signature.
__device__ static inline int __umul24(unsigned int x, unsigned int y) {
    return __ockl_mul24_u32(x, y);
}

// High 64 bits of the full 128-bit unsigned product of x and y, built
// from four 32x32-bit partial products.
__device__
static inline unsigned long long __umul64hi(unsigned long long int x, unsigned long long int y) {
    ulong x0 = x & 0xffffffffUL;  // low halves
    ulong x1 = x >> 32;           // high halves
    ulong y0 = y & 0xffffffffUL;
    ulong y1 = y >> 32;
    ulong z0 = x0*y0;
    ulong t = x1*y0 + (z0 >> 32); // propagate the high word of z0
    ulong z1 = t & 0xffffffffUL;
    ulong z2 = t >> 32;
    z1 = x0*y1 + z1;
    return x1*y1 + z2 + (z1 >> 32);
}

// High 32 bits of the 64-bit unsigned product.
__device__ static inline unsigned int __umulhi(unsigned int x, unsigned int y) {
    return __ockl_mul_hi_u32(x, y);
}
326__device__ static inline unsigned int __urhadd(unsigned int x, unsigned int y) {
327 return (x + y + 1) >> 1;
328}
// Unsigned sum of absolute difference: |x - y| + z via the ockl helper.
__device__ static inline unsigned int __usad(unsigned int x, unsigned int y, unsigned int z) {
    return __ockl_sadd_u32(x, y, z);
}
332
// Lane index of the calling thread within its wavefront, computed by
// counting all lower-numbered lanes with the mbcnt builtins.
__device__ static inline unsigned int __lane_id() {
    return __builtin_amdgcn_mbcnt_hi(
        -1, __builtin_amdgcn_mbcnt_lo(-1, 0));
}

// Thin wrappers over the mbcnt builtins: count set bits of x at positions
// below this lane, added to y (lo covers lanes 0-31, hi lanes 32-63).
__device__
static inline unsigned int __mbcnt_lo(unsigned int x, unsigned int y) {return __builtin_amdgcn_mbcnt_lo(x,y);};

__device__
static inline unsigned int __mbcnt_hi(unsigned int x, unsigned int y) {return __builtin_amdgcn_mbcnt_hi(x,y);};
343
344/*
345HIP specific device functions
346*/
347
348#if !defined(__HIPCC_RTC__)
349#include "amd_warp_functions.h"
350#endif
351
352#define MASK1 0x00ff00ff
353#define MASK2 0xff00ff00
354
// Packed byte-wise add of two char4 values in two masked passes: MASK1
// covers bytes 0 and 2, MASK2 bytes 1 and 3, so adjacent byte lanes in
// the same pass cannot carry into one another.
// NOTE(review): only the .w member of each char4 is read and written —
// this appears to assume .w aliases the whole packed word; confirm
// against the char4 layout in hip_vector_types before reusing.
__device__ static inline char4 __hip_hc_add8pk(char4 in1, char4 in2) {
    char4 out;
    unsigned one1 = in1.w & MASK1;
    unsigned one2 = in2.w & MASK1;
    out.w = (one1 + one2) & MASK1;
    one1 = in1.w & MASK2;
    one2 = in2.w & MASK2;
    out.w = out.w | ((one1 + one2) & MASK2);
    return out;
}

// Packed byte-wise subtract; same two-pass masking scheme (and the same
// .w-only caveat) as __hip_hc_add8pk.
__device__ static inline char4 __hip_hc_sub8pk(char4 in1, char4 in2) {
    char4 out;
    unsigned one1 = in1.w & MASK1;
    unsigned one2 = in2.w & MASK1;
    out.w = (one1 - one2) & MASK1;
    one1 = in1.w & MASK2;
    one2 = in2.w & MASK2;
    out.w = out.w | ((one1 - one2) & MASK2);
    return out;
}

// Packed byte-wise multiply; same two-pass masking scheme as above.
__device__ static inline char4 __hip_hc_mul8pk(char4 in1, char4 in2) {
    char4 out;
    unsigned one1 = in1.w & MASK1;
    unsigned one2 = in2.w & MASK1;
    out.w = (one1 * one2) & MASK1;
    one1 = in1.w & MASK2;
    one2 = in2.w & MASK2;
    out.w = out.w | ((one1 * one2) & MASK2);
    return out;
}
387
// double -> float conversions with explicit IEEE-754 rounding modes:
// _rd rounds toward -inf, _rn to nearest-even, _ru toward +inf, _rz toward 0.
__device__ static inline float __double2float_rd(double x) {
    return __ocml_cvtrtn_f32_f64(x);
}
// Plain C++ conversion; the default rounding mode is round-to-nearest-even.
__device__ static inline float __double2float_rn(double x) { return x; }
__device__ static inline float __double2float_ru(double x) {
    return __ocml_cvtrtp_f32_f64(x);
}
__device__ static inline float __double2float_rz(double x) {
    return __ocml_cvtrtz_f32_f64(x);
}

// High 32 bits of a double's bit pattern, reinterpreted as int
// (tmp[1] is the most-significant word on little-endian amdgcn).
__device__ static inline int __double2hiint(double x) {
    static_assert(sizeof(double) == 2 * sizeof(int), "");

    int tmp[2];
    __builtin_memcpy(tmp, &x, sizeof(tmp));

    return tmp[1];
}
// Low 32 bits of a double's bit pattern, reinterpreted as int.
__device__ static inline int __double2loint(double x) {
    static_assert(sizeof(double) == 2 * sizeof(int), "");

    int tmp[2];
    __builtin_memcpy(tmp, &x, sizeof(tmp));

    return tmp[0];
}
415
416__device__ static inline int __double2int_rd(double x) { return (int)__ocml_floor_f64(x); }
417__device__ static inline int __double2int_rn(double x) { return (int)__ocml_rint_f64(x); }
418__device__ static inline int __double2int_ru(double x) { return (int)__ocml_ceil_f64(x); }
419__device__ static inline int __double2int_rz(double x) { return (int)x; }
420
421__device__ static inline long long int __double2ll_rd(double x) {
422 return (long long)__ocml_floor_f64(x);
423}
424__device__ static inline long long int __double2ll_rn(double x) {
425 return (long long)__ocml_rint_f64(x);
426}
427__device__ static inline long long int __double2ll_ru(double x) {
428 return (long long)__ocml_ceil_f64(x);
429}
430__device__ static inline long long int __double2ll_rz(double x) { return (long long)x; }
431
432__device__ static inline unsigned int __double2uint_rd(double x) {
433 return (unsigned int)__ocml_floor_f64(x);
434}
435__device__ static inline unsigned int __double2uint_rn(double x) {
436 return (unsigned int)__ocml_rint_f64(x);
437}
438__device__ static inline unsigned int __double2uint_ru(double x) {
439 return (unsigned int)__ocml_ceil_f64(x);
440}
441__device__ static inline unsigned int __double2uint_rz(double x) { return (unsigned int)x; }
442
443__device__ static inline unsigned long long int __double2ull_rd(double x) {
444 return (unsigned long long int)__ocml_floor_f64(x);
445}
446__device__ static inline unsigned long long int __double2ull_rn(double x) {
447 return (unsigned long long int)__ocml_rint_f64(x);
448}
449__device__ static inline unsigned long long int __double2ull_ru(double x) {
450 return (unsigned long long int)__ocml_ceil_f64(x);
451}
452__device__ static inline unsigned long long int __double2ull_rz(double x) {
453 return (unsigned long long int)x;
454}
455__device__ static inline long long int __double_as_longlong(double x) {
456 static_assert(sizeof(long long) == sizeof(double), "");
457
458 long long tmp;
459 __builtin_memcpy(&tmp, &x, sizeof(tmp));
460
461 return tmp;
462}
463
464/*
465__device__ unsigned short __float2half_rn(float x);
466__device__ float __half2float(unsigned short);
467
468The above device function are not a valid .
469Use
470__device__ __half __float2half_rn(float x);
471__device__ float __half2float(__half);
472from hip_fp16.h
473
474CUDA implements half as unsigned short whereas, HIP doesn't.
475
476*/
477
478__device__ static inline int __float2int_rd(float x) { return (int)__ocml_floor_f32(x); }
479__device__ static inline int __float2int_rn(float x) { return (int)__ocml_rint_f32(x); }
480__device__ static inline int __float2int_ru(float x) { return (int)__ocml_ceil_f32(x); }
481__device__ static inline int __float2int_rz(float x) { return (int)__ocml_trunc_f32(x); }
482
483__device__ static inline long long int __float2ll_rd(float x) {
484 return (long long int)__ocml_floor_f32(x);
485}
486__device__ static inline long long int __float2ll_rn(float x) {
487 return (long long int)__ocml_rint_f32(x);
488}
489__device__ static inline long long int __float2ll_ru(float x) {
490 return (long long int)__ocml_ceil_f32(x);
491}
492__device__ static inline long long int __float2ll_rz(float x) { return (long long int)x; }
493
494__device__ static inline unsigned int __float2uint_rd(float x) {
495 return (unsigned int)__ocml_floor_f32(x);
496}
497__device__ static inline unsigned int __float2uint_rn(float x) {
498 return (unsigned int)__ocml_rint_f32(x);
499}
500__device__ static inline unsigned int __float2uint_ru(float x) {
501 return (unsigned int)__ocml_ceil_f32(x);
502}
503__device__ static inline unsigned int __float2uint_rz(float x) { return (unsigned int)x; }
504
505__device__ static inline unsigned long long int __float2ull_rd(float x) {
506 return (unsigned long long int)__ocml_floor_f32(x);
507}
508__device__ static inline unsigned long long int __float2ull_rn(float x) {
509 return (unsigned long long int)__ocml_rint_f32(x);
510}
511__device__ static inline unsigned long long int __float2ull_ru(float x) {
512 return (unsigned long long int)__ocml_ceil_f32(x);
513}
514__device__ static inline unsigned long long int __float2ull_rz(float x) {
515 return (unsigned long long int)x;
516}
517
518__device__ static inline int __float_as_int(float x) {
519 static_assert(sizeof(int) == sizeof(float), "");
520
521 int tmp;
522 __builtin_memcpy(&tmp, &x, sizeof(tmp));
523
524 return tmp;
525}
526
527__device__ static inline unsigned int __float_as_uint(float x) {
528 static_assert(sizeof(unsigned int) == sizeof(float), "");
529
530 unsigned int tmp;
531 __builtin_memcpy(&tmp, &x, sizeof(tmp));
532
533 return tmp;
534}
535
536__device__ static inline double __hiloint2double(int hi, int lo) {
537 static_assert(sizeof(double) == sizeof(uint64_t), "");
538
539 uint64_t tmp0 = (static_cast<uint64_t>(hi) << 32ull) | static_cast<uint32_t>(lo);
540 double tmp1;
541 __builtin_memcpy(&tmp1, &tmp0, sizeof(tmp0));
542
543 return tmp1;
544}
545
546__device__ static inline double __int2double_rn(int x) { return (double)x; }
547
548__device__ static inline float __int2float_rd(int x) {
549 return __ocml_cvtrtn_f32_s32(x);
550}
551__device__ static inline float __int2float_rn(int x) { return (float)x; }
552__device__ static inline float __int2float_ru(int x) {
553 return __ocml_cvtrtp_f32_s32(x);
554}
555__device__ static inline float __int2float_rz(int x) {
556 return __ocml_cvtrtz_f32_s32(x);
557}
558
559__device__ static inline float __int_as_float(int x) {
560 static_assert(sizeof(float) == sizeof(int), "");
561
562 float tmp;
563 __builtin_memcpy(&tmp, &x, sizeof(tmp));
564
565 return tmp;
566}
567
568__device__ static inline double __ll2double_rd(long long int x) {
569 return __ocml_cvtrtn_f64_s64(x);
570}
571__device__ static inline double __ll2double_rn(long long int x) { return (double)x; }
572__device__ static inline double __ll2double_ru(long long int x) {
573 return __ocml_cvtrtp_f64_s64(x);
574}
575__device__ static inline double __ll2double_rz(long long int x) {
576 return __ocml_cvtrtz_f64_s64(x);
577}
578
579__device__ static inline float __ll2float_rd(long long int x) {
580 return __ocml_cvtrtn_f32_s64(x);
581}
582__device__ static inline float __ll2float_rn(long long int x) { return (float)x; }
583__device__ static inline float __ll2float_ru(long long int x) {
584 return __ocml_cvtrtp_f32_s64(x);
585}
586__device__ static inline float __ll2float_rz(long long int x) {
587 return __ocml_cvtrtz_f32_s64(x);
588}
589
590__device__ static inline double __longlong_as_double(long long int x) {
591 static_assert(sizeof(double) == sizeof(long long), "");
592
593 double tmp;
594 __builtin_memcpy(&tmp, &x, sizeof(tmp));
595
596 return tmp;
597}
598
599__device__ static inline double __uint2double_rn(unsigned int x) { return (double)x; }
600
601__device__ static inline float __uint2float_rd(unsigned int x) {
602 return __ocml_cvtrtn_f32_u32(x);
603}
604__device__ static inline float __uint2float_rn(unsigned int x) { return (float)x; }
605__device__ static inline float __uint2float_ru(unsigned int x) {
606 return __ocml_cvtrtp_f32_u32(x);
607}
608__device__ static inline float __uint2float_rz(unsigned int x) {
609 return __ocml_cvtrtz_f32_u32(x);
610}
611
612__device__ static inline float __uint_as_float(unsigned int x) {
613 static_assert(sizeof(float) == sizeof(unsigned int), "");
614
615 float tmp;
616 __builtin_memcpy(&tmp, &x, sizeof(tmp));
617
618 return tmp;
619}
620
621__device__ static inline double __ull2double_rd(unsigned long long int x) {
622 return __ocml_cvtrtn_f64_u64(x);
623}
624__device__ static inline double __ull2double_rn(unsigned long long int x) { return (double)x; }
625__device__ static inline double __ull2double_ru(unsigned long long int x) {
626 return __ocml_cvtrtp_f64_u64(x);
627}
628__device__ static inline double __ull2double_rz(unsigned long long int x) {
629 return __ocml_cvtrtz_f64_u64(x);
630}
631
632__device__ static inline float __ull2float_rd(unsigned long long int x) {
633 return __ocml_cvtrtn_f32_u64(x);
634}
635__device__ static inline float __ull2float_rn(unsigned long long int x) { return (float)x; }
636__device__ static inline float __ull2float_ru(unsigned long long int x) {
637 return __ocml_cvtrtp_f32_u64(x);
638}
639__device__ static inline float __ull2float_rz(unsigned long long int x) {
640 return __ocml_cvtrtz_f32_u64(x);
641}
642
643#if defined(__clang__) && defined(__HIP__)
644
645// Clock functions
646__device__ long long int __clock64();
647__device__ long long int __clock();
648__device__ long long int clock64();
649__device__ long long int clock();
650__device__ long long int wall_clock64();
651// hip.amdgcn.bc - named sync
652__device__ void __named_sync();
653
654#ifdef __HIP_DEVICE_COMPILE__
655
656// Clock function to return GPU core cycle count.
657// GPU can change its core clock frequency at runtime. The maximum frequency can be queried
658// through hipDeviceAttributeClockRate attribute.
659__device__
660inline __attribute((always_inline))
661long long int __clock64() {
662#if __has_builtin(__builtin_amdgcn_s_memtime)
663 // Exists on gfx8, gfx9, gfx10.1, gfx10.2, gfx10.3
664 return (long long int) __builtin_amdgcn_s_memtime();
665#else
666 // Subject to change when better solution available
667 return (long long int) __builtin_readcyclecounter();
668#endif
669}
670
671__device__
672inline __attribute((always_inline))
673long long int __clock() { return __clock64(); }
674
675// Clock function to return wall clock count at a constant frequency that can be queried
676// through hipDeviceAttributeWallClockRate attribute.
677__device__
678inline __attribute__((always_inline))
679long long int wall_clock64() {
680 return (long long int) __ockl_steadyctr_u64();
681}
682
683__device__
684inline __attribute__((always_inline))
685long long int clock64() { return __clock64(); }
686
687__device__
688inline __attribute__((always_inline))
689long long int clock() { return __clock(); }
690
691// hip.amdgcn.bc - named sync
692__device__
693inline
694void __named_sync() { __builtin_amdgcn_s_barrier(); }
695
696#endif // __HIP_DEVICE_COMPILE__
697
// warp vote function __all __any __ballot
// Wavefront-wide vote: nonzero iff 'predicate' is nonzero on every active lane.
__device__
inline
int __all(int predicate) {
    return __ockl_wfall_i32(predicate);
}

// Nonzero iff 'predicate' is nonzero on at least one active lane.
__device__
inline
int __any(int predicate) {
    return __ockl_wfany_i32(predicate);
}

// XXX from llvm/include/llvm/IR/InstrTypes.h
#define ICMP_NE 33

// Per-lane predicate mask: bit i is set iff 'predicate' was nonzero on
// lane i, implemented as a wavefront-wide "compare != 0".
__device__
inline
unsigned long long int __ballot(int predicate) {
    return __builtin_amdgcn_uicmp(predicate, 0, ICMP_NE);
}

// 64-bit ballot; identical to __ballot on this target.
__device__
inline
unsigned long long int __ballot64(int predicate) {
    return __builtin_amdgcn_uicmp(predicate, 0, ICMP_NE);
}

// hip.amdgcn.bc - lanemask
// Mask of active lanes with an index strictly greater than this lane's.
// NOTE(review): the early-out hardcodes lane 63, i.e. a 64-wide wavefront.
__device__
inline
uint64_t __lanemask_gt()
{
    uint32_t lane = __ockl_lane_u32();
    if (lane == 63)
      return 0;
    uint64_t ballot = __ballot64(1);
    uint64_t mask = (~((uint64_t)0)) << (lane + 1);
    return mask & ballot;
}

// Mask of active lanes with an index strictly less than this lane's.
__device__
inline
uint64_t __lanemask_lt()
{
    uint32_t lane = __ockl_lane_u32();
    int64_t ballot = __ballot64(1);
    uint64_t mask = ((uint64_t)1 << lane) - (uint64_t)1;
    return mask & ballot;
}

// Single-bit mask identifying this lane.
__device__
inline
uint64_t __lanemask_eq()
{
    uint32_t lane = __ockl_lane_u32();
    int64_t mask = ((uint64_t)1 << lane);
    return mask;
}
757
758
759__device__ inline void* __local_to_generic(void* p) { return p; }
760
761#ifdef __HIP_DEVICE_COMPILE__
762__device__
763inline
764void* __get_dynamicgroupbaseptr()
765{
766 // Get group segment base pointer.
767 return (char*)__local_to_generic((void*)__to_local(__builtin_amdgcn_groupstaticsize()));
768}
769#else
770__device__
771void* __get_dynamicgroupbaseptr();
772#endif // __HIP_DEVICE_COMPILE__
773
774__device__
775inline
776void *__amdgcn_get_dynamicgroupbaseptr() {
777 return __get_dynamicgroupbaseptr();
778}
779
// Memory Fence Functions
// Device-scope ("agent") sequentially-consistent memory fence.
__device__
inline
static void __threadfence()
{
    __builtin_amdgcn_fence(__ATOMIC_SEQ_CST, "agent");
}

// Block-scope ("workgroup") sequentially-consistent fence.
__device__
inline
static void __threadfence_block()
{
    __builtin_amdgcn_fence(__ATOMIC_SEQ_CST, "workgroup");
}

// System-scope fence (empty scope string = widest scope).
__device__
inline
static void __threadfence_system()
{
    __builtin_amdgcn_fence(__ATOMIC_SEQ_CST, "");
}
// Workgroup barrier; when 'flags' request a fence, bracket the hardware
// barrier with release/acquire workgroup fences so writes made before the
// barrier are visible to the whole block after it.
__device__ inline static void __work_group_barrier(__cl_mem_fence_flags flags) {
    if (flags) {
        __builtin_amdgcn_fence(__ATOMIC_RELEASE, "workgroup");
        __builtin_amdgcn_s_barrier();
        __builtin_amdgcn_fence(__ATOMIC_ACQUIRE, "workgroup");
    } else {
        __builtin_amdgcn_s_barrier();
    }
}

// Barrier with OpenCL-style fence flags.
__device__
inline
static void __barrier(int n)
{
    __work_group_barrier((__cl_mem_fence_flags)n);
}

// CUDA-style block barrier with a local-memory fence.  Convergent: must
// be reached by all threads of the block.
__device__
inline
__attribute__((convergent))
void __syncthreads()
{
    __barrier(__CLK_LOCAL_MEM_FENCE);
}
825
826__device__
827inline
828__attribute__((convergent))
829int __syncthreads_count(int predicate)
830{
831 return __ockl_wgred_add_i32(!!predicate);
832}
833
834__device__
835inline
836__attribute__((convergent))
837int __syncthreads_and(int predicate)
838{
839 return __ockl_wgred_and_i32(!!predicate);
840}
841
842__device__
843inline
844__attribute__((convergent))
845int __syncthreads_or(int predicate)
846{
847 return __ockl_wgred_or_i32(!!predicate);
848}
849
850// hip.amdgcn.bc - device routine
851/*
852 HW_ID Register bit structure for RDNA2 & RDNA3
853 WAVE_ID 4:0 Wave id within the SIMD.
854 SIMD_ID 9:8 SIMD_ID within the WGP: [0] = row, [1] = column.
855 WGP_ID 13:10 Physical WGP ID.
856 SA_ID 16 Shader Array ID
857 SE_ID 20:18 Shader Engine the wave is assigned to for gfx11
858 SE_ID 19:18 Shader Engine the wave is assigned to for gfx10
859 DP_RATE 31:29 Number of double-precision float units per SIMD
860
861 HW_ID Register bit structure for GCN and CDNA
862 WAVE_ID 3:0 Wave buffer slot number. 0-9.
863 SIMD_ID 5:4 SIMD which the wave is assigned to within the CU.
864 PIPE_ID 7:6 Pipeline from which the wave was dispatched.
865 CU_ID 11:8 Compute Unit the wave is assigned to.
866 SH_ID 12 Shader Array (within an SE) the wave is assigned to.
867 SE_ID 15:13 Shader Engine the wave is assigned to for gfx908, gfx90a, gfx940-942
868 14:13 Shader Engine the wave is assigned to for Vega.
869 TG_ID 19:16 Thread-group ID
870 VM_ID 23:20 Virtual Memory ID
871 QUEUE_ID 26:24 Queue from which this wave was dispatched.
872 STATE_ID 29:27 State ID (graphics only, not compute).
873 ME_ID 31:30 Micro-engine ID.
874
875 XCC_ID Register bit structure for gfx940
876 XCC_ID 3:0 XCC the wave is assigned to.
877 */
878
879#if (defined (__GFX10__) || defined (__GFX11__))
880 #define HW_ID 23
881#else
882 #define HW_ID 4
883#endif
884
885#if (defined(__GFX10__) || defined(__GFX11__))
886 #define HW_ID_WGP_ID_SIZE 4
887 #define HW_ID_WGP_ID_OFFSET 10
888#else
889 #define HW_ID_CU_ID_SIZE 4
890 #define HW_ID_CU_ID_OFFSET 8
891#endif
892
893#if (defined(__gfx908__) || defined(__gfx90a__) || \
894 defined(__GFX11__))
895 #define HW_ID_SE_ID_SIZE 3
896#else //4 SEs/XCC for gfx940-942
897 #define HW_ID_SE_ID_SIZE 2
898#endif
899#if (defined(__GFX10__) || defined(__GFX11__))
900 #define HW_ID_SE_ID_OFFSET 18
901 #define HW_ID_SA_ID_OFFSET 16
902 #define HW_ID_SA_ID_SIZE 1
903#else
904 #define HW_ID_SE_ID_OFFSET 13
905#endif
906
907#if (defined(__gfx940__) || defined(__gfx941__) || defined(__gfx942__))
908 #define XCC_ID 20
909 #define XCC_ID_XCC_ID_SIZE 4
910 #define XCC_ID_XCC_ID_OFFSET 0
911#endif
912
913#if (!defined(__HIP_NO_IMAGE_SUPPORT) && \
914 (defined(__gfx940__) || defined(__gfx941__) || defined(__gfx942__)))
915 #define __HIP_NO_IMAGE_SUPPORT 1
916#endif
917
918/*
919 Encoding of parameter bitmask
920 HW_ID 5:0 HW_ID
921 OFFSET 10:6 Range: 0..31
922 SIZE 15:11 Range: 1..32
923 */
924
925#define GETREG_IMMED(SZ,OFF,REG) (((SZ) << 11) | ((OFF) << 6) | (REG))
926
927/*
928 __smid returns the wave's assigned Compute Unit and Shader Engine.
929 The Compute Unit, CU_ID returned in bits 3:0, and Shader Engine, SE_ID in bits 5:4.
930 Note: the results vary over time.
931 SZ minus 1 since SIZE is 1-based.
932*/
// Packed physical location id for the calling wave, read from the HW_ID
// register: [SE|SA|WGP] on gfx10/11, [XCC|SE|CU] on gfx940-942, [SE|CU]
// otherwise.  The value can change over time as waves are rescheduled.
__device__
inline
unsigned __smid(void)
{
    unsigned se_id = __builtin_amdgcn_s_getreg(
        GETREG_IMMED(HW_ID_SE_ID_SIZE-1, HW_ID_SE_ID_OFFSET, HW_ID));
    #if (defined(__GFX10__) || defined(__GFX11__))
    unsigned wgp_id = __builtin_amdgcn_s_getreg(
        GETREG_IMMED(HW_ID_WGP_ID_SIZE - 1, HW_ID_WGP_ID_OFFSET, HW_ID));
    unsigned sa_id = __builtin_amdgcn_s_getreg(
        GETREG_IMMED(HW_ID_SA_ID_SIZE - 1, HW_ID_SA_ID_OFFSET, HW_ID));
    #else
    #if (defined(__gfx940__) || defined(__gfx941__) || defined(__gfx942__))
    unsigned xcc_id = __builtin_amdgcn_s_getreg(
        GETREG_IMMED(XCC_ID_XCC_ID_SIZE - 1, XCC_ID_XCC_ID_OFFSET, XCC_ID));
    #endif
    unsigned cu_id = __builtin_amdgcn_s_getreg(
        GETREG_IMMED(HW_ID_CU_ID_SIZE - 1, HW_ID_CU_ID_OFFSET, HW_ID));
    #endif
    #if (defined(__GFX10__) || defined(__GFX11__))
    // Pack as [SE | SA | WGP].
    unsigned temp = se_id;
    temp = (temp << HW_ID_SA_ID_SIZE) | sa_id;
    temp = (temp << HW_ID_WGP_ID_SIZE) | wgp_id;
    return temp;
    //TODO : CU Mode impl
    #elif (defined(__gfx940__) || defined(__gfx941__) || defined(__gfx942__))
    // Pack as [XCC | SE | CU].
    unsigned temp = xcc_id;
    temp = (temp << HW_ID_SE_ID_SIZE) | se_id;
    temp = (temp << HW_ID_CU_ID_SIZE) | cu_id;
    return temp;
    #else
    // Pack as [SE | CU].
    return (se_id << HW_ID_CU_ID_SIZE) + cu_id;
    #endif
}
967
972#define HIP_DYNAMIC_SHARED(type, var) extern __shared__ type var[];
973#define HIP_DYNAMIC_SHARED_ATTRIBUTE
974
975#endif //defined(__clang__) && defined(__HIP__)
976
977
// loop unrolling
// Byte-wise memcpy, manually unrolled by 4; the trailing switch copies
// the remaining 0-3 bytes (cases intentionally fall through).
static inline __device__ void* __hip_hc_memcpy(void* dst, const void* src, size_t size) {
    auto dstPtr = static_cast<unsigned char*>(dst);
    auto srcPtr = static_cast<const unsigned char*>(src);

    while (size >= 4u) {
        dstPtr[0] = srcPtr[0];
        dstPtr[1] = srcPtr[1];
        dstPtr[2] = srcPtr[2];
        dstPtr[3] = srcPtr[3];

        size -= 4u;
        srcPtr += 4u;
        dstPtr += 4u;
    }
    switch (size) {
        case 3:
            dstPtr[2] = srcPtr[2];
            // fall through
        case 2:
            dstPtr[1] = srcPtr[1];
            // fall through
        case 1:
            dstPtr[0] = srcPtr[0];
    }

    return dst;
}

// Byte-wise memset, unrolled by 4 with the same fall-through tail switch.
static inline __device__ void* __hip_hc_memset(void* dst, unsigned char val, size_t size) {
    auto dstPtr = static_cast<unsigned char*>(dst);

    while (size >= 4u) {
        dstPtr[0] = val;
        dstPtr[1] = val;
        dstPtr[2] = val;
        dstPtr[3] = val;

        size -= 4u;
        dstPtr += 4u;
    }
    switch (size) {
        case 3:
            dstPtr[2] = val;
            // fall through
        case 2:
            dstPtr[1] = val;
            // fall through
        case 1:
            dstPtr[0] = val;
    }

    return dst;
}
#ifndef __OPENMP_AMDGCN__
// Device-side memcpy/memset entry points (omitted for OpenMP amdgcn
// offload builds, which supply their own).
static inline __device__ void* memcpy(void* dst, const void* src, size_t size) {
    return __hip_hc_memcpy(dst, src, size);
}

static inline __device__ void* memset(void* ptr, int val, size_t size) {
    unsigned char val8 = static_cast<unsigned char>(val);
    return __hip_hc_memset(ptr, val8, size);
}
#endif // !__OPENMP_AMDGCN__
1038
1039#endif
// End of amd_device_functions.h