shared-inl.h 27 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344345346347348349350351352353354355356357358359360361362363364365366367368369370371372373374375376377378379380381382383384385386387388389390391392393394395396397398399400401402403404405406407408409410411412413414415416417418419420421422423424425426427428429430431432433434435436437438439440441442443444445446447448449450451452453454455456457458459460461462463464465466467468469470471472473474475476477478479480481482483484485486487488489490491492493494495496497498499500501502503504505506507508509510511512513514515516517518519520521522523524525526527528529530531532533534535536537538539540541542543544545546547548549550551552553554555556557558559560561562563564565566567568569570571572573574575576577578579580581582583584585586587588589590591592593594595596597598599600601602603604605606607608609610611612613614615616617618619620621622623624625626627628629630631632633634635636637638639640641642643644645646647648649650651652653654655656657658659660661662663664665666667668669670671672673674675676677678679680681
  1. // Copyright 2020 Google LLC
  2. // SPDX-License-Identifier: Apache-2.0
  3. //
  4. // Licensed under the Apache License, Version 2.0 (the "License");
  5. // you may not use this file except in compliance with the License.
  6. // You may obtain a copy of the License at
  7. //
  8. // http://www.apache.org/licenses/LICENSE-2.0
  9. //
  10. // Unless required by applicable law or agreed to in writing, software
  11. // distributed under the License is distributed on an "AS IS" BASIS,
  12. // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
  13. // See the License for the specific language governing permissions and
  14. // limitations under the License.
  15. // Per-target definitions shared by ops/*.h and user code.
  16. // IWYU pragma: begin_exports
  17. // Export does not seem to be recursive, so re-export these (also in base.h)
  18. #include <stddef.h>
  19. #include "hwy/base.h"
  20. // "IWYU pragma: keep" does not work for this include, so hide it from the IDE.
  21. #if !HWY_IDE
  22. #include <stdint.h>
  23. #endif
  24. #include "hwy/detect_compiler_arch.h"
  25. #include "hwy/detect_targets.h"
  26. // Separate header because foreach_target.h re-enables its include guard.
  27. #include "hwy/ops/set_macros-inl.h"
  28. // IWYU pragma: end_exports
  29. #if HWY_IS_MSAN
  30. #include <sanitizer/msan_interface.h>
  31. #endif
  32. // We are covered by the highway.h include guard, but generic_ops-inl.h
  33. // includes this again #if HWY_IDE.
  34. // clang-format off
  35. #if defined(HIGHWAY_HWY_OPS_SHARED_TOGGLE) == defined(HWY_TARGET_TOGGLE) // NOLINT
  36. // clang-format on
  37. #ifdef HIGHWAY_HWY_OPS_SHARED_TOGGLE
  38. #undef HIGHWAY_HWY_OPS_SHARED_TOGGLE
  39. #else
  40. #define HIGHWAY_HWY_OPS_SHARED_TOGGLE
  41. #endif
  42. HWY_BEFORE_NAMESPACE();
  43. namespace hwy {
  44. namespace HWY_NAMESPACE {
  45. // NOTE: GCC generates incorrect code for vector arguments to non-inlined
  46. // functions in two situations:
  47. // - on Windows and GCC 10.3, passing by value crashes due to unaligned loads:
  48. // https://gcc.gnu.org/bugzilla/show_bug.cgi?id=54412.
  49. // - on aarch64 and GCC 9.3.0 or 11.2.1, passing by value causes many (but not
  50. // all) tests to fail.
  51. //
  52. // We therefore pass by const& only on GCC and (Windows or aarch64). This alias
  53. // must be used for all vector/mask parameters of functions marked HWY_NOINLINE,
  54. // and possibly also other functions that are not inlined.
  55. //
  56. // Even better is to avoid passing vector arguments to non-inlined functions,
  57. // because the SVE and RISC-V ABIs are still works in progress and may lead to
  58. // incorrect codegen.
  // VecArg<V>: parameter type for vector/mask arguments of functions that may
  // not be inlined (see the NOTE above for the affected compilers).
  #if !HWY_COMPILER_GCC_ACTUAL || !(HWY_OS_WIN || HWY_ARCH_ARM_A64)
  // Default: vectors are passed by value.
  template <class V>
  using VecArg = V;
  #else
  // GCC targeting Windows or aarch64: pass by const reference instead.
  template <class V>
  using VecArg = const V&;
  #endif
  66. namespace detail {
  67. template <typename T>
  68. struct NativeLaneTypeT {
  69. using type = T;
  70. };
  71. template <>
  72. struct NativeLaneTypeT<hwy::float16_t> {
  73. #if HWY_HAVE_SCALAR_F16_TYPE
  74. using type = hwy::float16_t::Native;
  75. #else
  76. using type = uint16_t;
  77. #endif
  78. };
  79. template <>
  80. struct NativeLaneTypeT<hwy::bfloat16_t> {
  81. #if HWY_HAVE_SCALAR_BF16_TYPE
  82. using type = hwy::bfloat16_t::Native;
  83. #else
  84. using type = uint16_t;
  85. #endif
  86. };
  87. // The type expected by intrinsics for the given Highway lane type T. This
  88. // usually matches T, but differs for our wrapper types [b]float16_t. Use this
  89. // only when defining intrinsic wrappers, and NOT for casting, which is UB.
  90. template <typename T>
  91. using NativeLaneType = typename NativeLaneTypeT<T>::type;
  92. // Returns the same pointer after changing type to NativeLaneType. Use this only
  93. // for wrapper functions that call intrinsics (e.g. load/store) where some of
  94. // the overloads expect _Float16* or __bf16* arguments. For non-special floats,
  95. // this returns the same pointer and type.
  96. //
  97. // This makes use of the fact that a wrapper struct is pointer-interconvertible
  98. // with its first member (a union), thus also with the union members. Do NOT
  99. // call both this and U16LanePointer on the same object - they access different
  100. // union members, and this is not guaranteed to be safe.
  101. template <typename T, HWY_IF_NOT_SPECIAL_FLOAT(T)>
  102. HWY_INLINE T* NativeLanePointer(T* p) {
  103. return p;
  104. }
  105. template <typename T, typename NT = NativeLaneType<RemoveConst<T>>,
  106. HWY_IF_F16(T)>
  107. HWY_INLINE constexpr If<IsConst<T>(), const NT*, NT*> NativeLanePointer(T* p) {
  108. #if HWY_HAVE_SCALAR_F16_TYPE
  109. return &p->native;
  110. #else
  111. return &p->bits;
  112. #endif
  113. }
  114. template <typename T, typename NT = NativeLaneType<RemoveConst<T>>,
  115. HWY_IF_BF16(T)>
  116. HWY_INLINE constexpr If<IsConst<T>(), const NT*, NT*> NativeLanePointer(T* p) {
  117. #if HWY_HAVE_SCALAR_BF16_TYPE
  118. return &p->native;
  119. #else
  120. return &p->bits;
  121. #endif
  122. }
  // Returns a pointer to the u16 member of our [b]float16_t wrapper structs.
  // Use this in Highway targets that lack __bf16 intrinsics; for storing to
  // memory, we BitCast vectors to u16 and write to the pointer returned here.
  // Do NOT call both this and NativeLanePointer on the same object - they access
  // different union members, and this is not guaranteed to be safe.
  128. template <typename T, HWY_IF_SPECIAL_FLOAT(T)>
  129. HWY_INLINE If<IsConst<T>(), const uint16_t*, uint16_t*> U16LanePointer(T* p) {
  130. return &p->bits;
  131. }
  // Returns N * 2^pow2. N is the number of lanes in a full vector and pow2 the
  // desired fraction or multiple of it, see Simd<>. `pow2` is most often in
  // [-3, 3] but can also be lower for user-specified fractions.
  //
  // Negative `pow2` divides, rounding toward zero, so the result may be 0.
  // Shift counts at or beyond the width of size_t would be undefined behavior
  // (and an "invalid shift count" error in constant expressions), so those
  // cases return 0: the mathematically expected quotient for right shifts and
  // the wrapped-around product for left shifts. Comparing `pow2` before
  // negating it also avoids overflow for INT_MIN.
  //
  // sizeof(size_t) * 8 assumes at least 8 bits per byte, hence never exceeds
  // the actual width. Kept as a single return statement so that this remains a
  // valid C++11 constexpr function.
  constexpr size_t ScaleByPower(size_t N, int pow2) {
    return (pow2 >= 0)
               ? ((pow2 >= static_cast<int>(sizeof(size_t) * 8))
                      ? size_t{0}
                      : (N << pow2))
               : ((pow2 <= -static_cast<int>(sizeof(size_t) * 8))
                      ? size_t{0}
                      : (N >> (-pow2)));
  }
  138. template <typename T>
  139. HWY_INLINE void MaybeUnpoison(T* HWY_RESTRICT unaligned, size_t count) {
  140. // Workaround for MSAN not marking compressstore as initialized (b/233326619)
  141. #if HWY_IS_MSAN
  142. __msan_unpoison(unaligned, count * sizeof(T));
  143. #else
  144. (void)unaligned;
  145. (void)count;
  146. #endif
  147. }
  148. } // namespace detail
  149. // Highway operations are implemented as overloaded functions selected using a
  150. // zero-sized tag type D := Simd<T, N, kPow2>. T denotes the lane type.
  151. //
  152. // N defines how many lanes are in a 'full' vector, typically equal to
  153. // HWY_LANES(T) (which is the actual count on targets with vectors of known
  154. // size, and an upper bound in case of scalable vectors), otherwise a
  155. // user-specified limit at most that large.
  156. //
  157. // 2^kPow2 is a _subsequently_ applied scaling factor that indicates the
  158. // desired fraction of a 'full' vector: 0 means full, -1 means half; 1,2,3
  159. // means two/four/eight full vectors ganged together. The largest supported
  160. // kPow2 is `HWY_MAX_POW2` and the aliases below take care of clamping
  161. // user-specified values to that. Note that `Simd<T, 1, 0>` and `Simd<T, 2, -1>`
  162. // have the same `MaxLanes` and `Lanes`.
  163. //
  164. // We can theoretically keep halving Lanes(), but recursive instantiations of
  165. // kPow2 - 1 will eventually fail e.g. because -64 is not a valid shift count.
  166. // Users must terminate such compile-time recursions at or above HWY_MIN_POW2.
  167. //
  168. // WARNING: do not use N directly because it may be a special representation of
  169. // a fractional MaxLanes. This arises when we Rebind Simd<uint8_t, 1, 0> to
  170. // Simd<uint32_t, ??, 2>. RVV requires that the last argument (kPow2) be two,
  171. // but we want MaxLanes to be the same in both cases. Hence ?? is a
  172. // fixed-point encoding of 1/4.
  173. //
  174. // Instead of referring to Simd<> directly, users create D via aliases:
  175. // - ScalableTag<T> for a full vector;
  // - ScalableTag<T, kPow2> for a fraction/group, where `kPow2` is
  // interpreted as `HWY_MIN(kPow2, HWY_MAX_POW2)`;
  178. // - CappedTag<T, kLimit> for a vector with up to kLimit lanes; or
  179. // - FixedTag<T, kNumLanes> for a vector with exactly kNumLanes lanes.
  180. //
  181. // Instead of N, use Lanes(D()) for the actual number of lanes at runtime and
  182. // D().MaxLanes() for a constexpr upper bound. Both are powers of two.
  // Tag type carrying, at compile time, the lane type and an upper bound on the
  // number of lanes. It has no data members; all information is in the template
  // arguments:
  //   Lane:  lane type, exported as the member alias `T`.
  //   N:     lane count of a 'full' vector, or a fixed-point encoding of a
  //          fraction: the low 20 bits are kWhole, the bits above are kFrac.
  //   kPow2: power-of-two scaling applied on top of N.
  template <typename Lane, size_t N, int kPow2>
  struct Simd {
    constexpr Simd() = default;
    // Lane type.
    using T = Lane;

   private:
    // Compile-time validation of the lane type.
    static_assert(sizeof(Lane) <= 8, "Lanes are up to 64-bit");
    static_assert(IsSame<Lane, RemoveCvRef<Lane>>(),
                  "Lane must not be a reference type, const-qualified type, or "
                  "volatile-qualified type");
    static_assert(IsIntegerLaneType<Lane>() || IsFloat<Lane>() ||
                      IsSpecialFloat<Lane>(),
                  "IsIntegerLaneType<T>(), IsFloat<T>(), or IsSpecialFloat<T>() "
                  "must be true");

    // 20 bits are sufficient for any HWY_MAX_BYTES. This is the 'normal' value of
    // N when kFrac == 0, otherwise it is one (see FracN).
    static constexpr size_t kWhole = N & 0xFFFFF;
    // Fractional part is in the bits above kWhole. It is subtracted from kPow2
    // when computing kPrivateLanes.
    static constexpr int kFrac = static_cast<int>(N >> 20);

    // Can be 8x larger because kPow2 may be as low as -3 (Rebind of a larger
    // type to u8 results in fractions).
    static_assert(kWhole <= 8 * HWY_MAX_N && kFrac <= 3, "Out of range");
    static_assert(kFrac == 0 || kWhole == 1, "If frac, whole must be 1");
    // kWhole must be a nonzero power of two.
    static_assert((kWhole & (kWhole - 1)) == 0 && kWhole != 0, "Not 2^x");

    // Important to check this here because kPow2 <= -64 causes confusing
    // compile errors (invalid shift count).
    static_assert(kPow2 >= HWY_MIN_POW2, "Forgot kPow2 recursion terminator?");
    // However, do NOT verify kPow2 <= HWY_MAX_POW2 - users should be able to
    // Rebind<uint64_t, ScalableTag<uint8_t, 3>> in order to discover that its
    // kPow2 is out of bounds.

   public:
    // Upper bound on the number of lanes (tight if !HWY_HAVE_SCALABLE). In the
    // common case, N == kWhole, but if kFrac is nonzero, we deduct it from kPow2.
    // E.g. Rebind<uint32_t, Simd<uint8_t, 1, 0>> is Simd<uint32_t, 0x200001, 2>:
    // kWhole = 1, kFrac = 2, so kPrivateLanes = 1 * 2^(2 - 2) = 1.
    // The resulting number of lanes is still 1 because this N represents 1/4
    // (the ratio of the sizes). Note that RVV requires kPow2 to be the ratio of
    // the sizes so that the correct LMUL overloads are chosen, even if N is
    // small enough that it would fit in an LMUL=1 vector.
    //
    // Cannot be an enum because GCC warns when using enums and non-enums in the
    // same expression. Cannot be a static constexpr function (MSVC limitation).
    // Rounded up to one so this is a valid array length.
    //
    // Do not use this directly - only 'public' so it is visible from the accessor
    // macro required by MSVC.
    static constexpr size_t kPrivateLanes =
        HWY_MAX(size_t{1}, detail::ScaleByPower(kWhole, kPow2 - kFrac));

    // Do not use this directly - only 'public' so it is visible from the accessor
    // macro required by MSVC.
    static constexpr int kPrivatePow2 = kPow2;

    // Upper bound on the number of lanes.
    constexpr size_t MaxLanes() const { return kPrivateLanes; }
    // Upper bound on the vector size in bytes.
    constexpr size_t MaxBytes() const { return kPrivateLanes * sizeof(Lane); }
    // Number of 16-byte blocks required for MaxBytes(), rounded up.
    constexpr size_t MaxBlocks() const { return (MaxBytes() + 15) / 16; }

    // For SFINAE (HWY_IF_POW2_GT_D).
    constexpr int Pow2() const { return kPow2; }

    // ------------------------------ Changing lane type or count

    // Do not use any of these directly. Anything used from member typedefs cannot
    // be made private, but functions only used within other functions can.

    // Returns number of NewT lanes that fit within MaxBytes().
    template <typename NewT>
    static constexpr size_t RepartitionLanes() {
      // Round up to correctly handle larger NewT.
      return (kPrivateLanes * sizeof(T) + sizeof(NewT) - 1) / sizeof(NewT);
    }

    // Returns the new kPow2 required for lanes of type NewT: kPow2 plus
    // log2(sizeof(NewT) / sizeof(T)) when NewT is at least as large as T,
    // otherwise kPow2 minus log2(sizeof(T) / sizeof(NewT)).
    template <typename NewT>
    static constexpr int RebindPow2() {
      return kPow2 +
             ((sizeof(NewT) >= sizeof(T))
                  ? static_cast<int>(CeilLog2(sizeof(NewT) / sizeof(T)))
                  : -static_cast<int>(CeilLog2(sizeof(T) / sizeof(NewT))));
    }

   private:
    // Returns 0 or whole NewN such that kNewMaxLanes = NewN * 2^kNewPow2.
    template <int kNewPow2, size_t kNewMaxLanes>
    static constexpr size_t WholeN() {
      return detail::ScaleByPower(kNewMaxLanes, -kNewPow2);
    }

    // Returns fractional NewN such that kNewMaxLanes = NewN * 2^kNewPow2.
    template <int kNewPow2, size_t kNewMaxLanes>
    static constexpr size_t FracN() {
      // Only reached if kNewPow2 > CeilLog2(kNewMaxLanes) >= 0 (else WholeN
      // would not have been zero), but clamp to zero to avoid warnings. kFrac is
      // the difference, stored in the upper bits of N, and we also set kWhole =
      // 1 so that the new kPrivateLanes = kNewMaxLanes.
      static_assert(HWY_MAX_N <= (size_t{1} << 20), "Change bit shift");
      return static_cast<size_t>(
          1 + (HWY_MAX(0, kNewPow2 - static_cast<int>(CeilLog2(kNewMaxLanes)))
               << 20));
    }

   public:
    // Returns (whole or fractional) NewN, see above.
    template <int kNewPow2, size_t kNewMaxLanes>
    static constexpr size_t NewN() {
      // We require a fraction if inverting kNewPow2 results in 0.
      return WholeN<kNewPow2, kNewMaxLanes>() == 0
                 ? FracN<kNewPow2, kNewMaxLanes>()
                 : WholeN<kNewPow2, kNewMaxLanes>();
    }

    // PromoteTo/DemoteTo() with another lane type, but same number of lanes.
    template <typename NewT>
    using Rebind =
        Simd<NewT, NewN<RebindPow2<NewT>(), kPrivateLanes>(), RebindPow2<NewT>()>;

    // Change lane type while keeping the same vector size, e.g. for MulEven.
    template <typename NewT>
    using Repartition =
        Simd<NewT, NewN<kPow2, RepartitionLanes<NewT>()>(), kPow2>;

    // Half the lanes while keeping the same lane type, e.g. for LowerHalf.
    // Only kPow2 changes; N is passed through unchanged.
    using Half = Simd<T, N, kPow2 - 1>;

    // Twice the lanes while keeping the same lane type, e.g. for Combine.
    using Twice = Simd<T, N, kPow2 + 1>;
  };
  294. namespace detail {
  295. template <typename T, size_t N, int kPow2>
  296. constexpr bool IsFull(Simd<T, N, kPow2> /* d */) {
  297. return N == HWY_LANES(T) && kPow2 == 0;
  298. }
  299. // Struct wrappers enable validation of arguments via static_assert.
  300. template <typename T, size_t N, int kPow2>
  301. struct ClampNAndPow2 {
  302. using type = Simd<T, HWY_MIN(N, HWY_MAX_N), HWY_MIN(kPow2, HWY_MAX_POW2)>;
  303. };
  304. template <typename T, int kPow2>
  305. struct ScalableTagChecker {
  306. using type = typename ClampNAndPow2<T, HWY_LANES(T), kPow2>::type;
  307. };
  308. template <typename T, size_t kLimit, int kPow2>
  309. struct CappedTagChecker {
  310. static_assert(kLimit != 0, "Does not make sense to have zero lanes");
  311. // Safely handle non-power-of-two inputs by rounding down, which is allowed by
  312. // CappedTag. Otherwise, Simd<T, 3, 0> would static_assert.
  313. static constexpr size_t kLimitPow2 = size_t{1} << hwy::FloorLog2(kLimit);
  314. static constexpr size_t N = HWY_MIN(kLimitPow2, HWY_LANES(T));
  315. using type = typename ClampNAndPow2<T, N, kPow2>::type;
  316. };
  317. template <typename T, size_t kNumLanes>
  318. struct FixedTagChecker {
  319. static_assert(kNumLanes != 0, "Does not make sense to have zero lanes");
  320. static_assert(kNumLanes <= HWY_LANES(T), "Too many lanes");
  321. using type = Simd<T, kNumLanes, 0>;
  322. };
  323. } // namespace detail
  324. // ------------------------------ Aliases for Simd<>
  // Tag describing a full vector (kPow2 == 0: the most common usage, e.g. 1D
  // loops where the application does not care about the vector size) or a
  // fraction/multiple of one. Fractions (kPow2 < 0) are useful for arguments or
  // return values of type promotion and demotion. User-specified kPow2 is
  // interpreted as `HWY_MIN(kPow2, HWY_MAX_POW2)`.
  //
  // Resolves via detail::ScalableTagChecker to a Simd<T, N, kPow2> whose N is
  // HWY_LANES(T), clamped to HWY_MAX_N.
  template <typename T, int kPow2 = 0>
  using ScalableTag = typename detail::ScalableTagChecker<T, kPow2>::type;
  // Tag describing a vector with *up to* kLimit active lanes, even on targets
  // with scalable vectors and HWY_SCALAR. The runtime lane count `Lanes(tag)` may
  // be less than kLimit, and is 1 on HWY_SCALAR. This alias is typically used for
  // 1D loops with a relatively low application-defined upper bound, e.g. for 8x8
  // DCTs. However, it is better if data structures are designed to be
  // vector-length-agnostic (e.g. a hybrid SoA where there are chunks of `M >=
  // MaxLanes(d)` DC components followed by M AC1, .., and M AC63; this would
  // enable vector-length-agnostic loops using ScalableTag). User-specified kPow2
  // is interpreted as `HWY_MIN(kPow2, HWY_MAX_POW2)`.
  //
  // kLimit must be nonzero (static_assert in detail::CappedTagChecker). A
  // kLimit that is not a power of two is rounded down to one.
  template <typename T, size_t kLimit, int kPow2 = 0>
  using CappedTag = typename detail::CappedTagChecker<T, kLimit, kPow2>::type;
  343. #if !HWY_HAVE_SCALABLE
  344. // If the vector size is known, and the app knows it does not want more than
  345. // kLimit lanes, then capping can be beneficial. For example, AVX-512 has lower
  346. // IPC and potentially higher costs for unaligned load/store vs. 256-bit AVX2.
  347. template <typename T, size_t kLimit, int kPow2 = 0>
  348. using CappedTagIfFixed = CappedTag<T, kLimit, kPow2>;
  349. #else // HWY_HAVE_SCALABLE
  350. // .. whereas on RVV/SVE, the cost of clamping Lanes() may exceed the benefit.
  351. template <typename T, size_t kLimit, int kPow2 = 0>
  352. using CappedTagIfFixed = ScalableTag<T, kPow2>;
  353. #endif
  // Alias for a tag describing a vector with *exactly* kNumLanes active lanes,
  // even on targets with scalable vectors. Requires `kNumLanes` to be a power of
  // two not exceeding `HWY_LANES(T)`. Zero or too-large values are rejected by
  // static_asserts in detail::FixedTagChecker; values that are not a power of
  // two are rejected by Simd itself.
  //
  // NOTE: if the application does not need to support HWY_SCALAR (+), use this
  // instead of CappedTag to emphasize that there will be exactly kNumLanes lanes.
  // This is useful for data structures that rely on exactly 128-bit SIMD, but
  // these are discouraged because they cannot benefit from wider vectors.
  // Instead, applications would ideally define a larger problem size and loop
  // over it with the (unknown size) vectors from ScalableTag.
  //
  // + e.g. if the baseline is known to support SIMD, or the application requires
  // ops such as TableLookupBytes not supported by HWY_SCALAR.
  template <typename T, size_t kNumLanes>
  using FixedTag = typename detail::FixedTagChecker<T, kNumLanes>::type;
  // Convenience form for fixed sizes: FullNN<T> is the unscaled tag whose lane
  // count is (NN / 8) bytes divided by sizeof(T).

  // 2-byte vector.
  template <typename T>
  using Full16 = Simd<T, 2 / sizeof(T), 0>;

  // 4-byte vector.
  template <typename T>
  using Full32 = Simd<T, 4 / sizeof(T), 0>;

  // 8-byte vector.
  template <typename T>
  using Full64 = Simd<T, 8 / sizeof(T), 0>;

  // 16-byte vector.
  template <typename T>
  using Full128 = Simd<T, 16 / sizeof(T), 0>;
  378. // ------------------------------ Accessors for Simd<>
  // Lane type of the tag D (its member alias `T`).
  template <class D>
  using TFromD = typename D::T;

  // Upper bound on the number of lanes, typically used for SFINAE conditions and
  // to allocate storage for targets with known vector sizes. Note: this may be a
  // loose bound, instead use Lanes() as the actual size for AllocateAligned.
  // MSVC workaround: use static constant directly instead of a function.
  #define HWY_MAX_LANES_D(D) D::kPrivateLanes

  // Same as D().Pow2(), but this is too complex for SFINAE with MSVC, so we use a
  // static constant directly.
  #define HWY_POW2_D(D) D::kPrivatePow2
  // Non-macro form of HWY_MAX_LANES_D in case that is preferable. WARNING: the
  // macro form may be required for MSVC, which has limitations on deducing
  // arguments. The tag argument is used only for its type.
  template <class D>
  HWY_INLINE HWY_MAYBE_UNUSED constexpr size_t MaxLanes(D) {
    return HWY_MAX_LANES_D(D);
  }

  #if !HWY_HAVE_SCALABLE
  // If non-scalable, this is constexpr; otherwise the target's header defines a
  // non-constexpr version of this function. This is the actual vector length,
  // used when advancing loop counters. In this (non-scalable) case it returns
  // the same value as MaxLanes.
  template <class D>
  HWY_INLINE HWY_MAYBE_UNUSED constexpr size_t Lanes(D) {
    return HWY_MAX_LANES_D(D);
  }
  #endif // !HWY_HAVE_SCALABLE
  // Tag for the same number of lanes as D, but with the LaneType T.
  // Forwards to the member template D::Rebind.
  template <class T, class D>
  using Rebind = typename D::template Rebind<T>;

  // Rebind to the signed counterpart (MakeSigned) of D's lane type.
  template <class D>
  using RebindToSigned = Rebind<MakeSigned<TFromD<D>>, D>;

  // Rebind to the unsigned counterpart (MakeUnsigned) of D's lane type.
  template <class D>
  using RebindToUnsigned = Rebind<MakeUnsigned<TFromD<D>>, D>;

  // Rebind to the floating-point counterpart (MakeFloat) of D's lane type.
  template <class D>
  using RebindToFloat = Rebind<MakeFloat<TFromD<D>>, D>;
  // Tag for the same total size as D, but with the LaneType T.
  // Forwards to the member template D::Repartition.
  template <class T, class D>
  using Repartition = typename D::template Repartition<T>;

  // Repartition to MakeWide of D's lane type.
  template <class D>
  using RepartitionToWide = Repartition<MakeWide<TFromD<D>>, D>;

  // Repartition to MakeNarrow of D's lane type.
  template <class D>
  using RepartitionToNarrow = Repartition<MakeNarrow<TFromD<D>>, D>;

  // Shorthand for applying RepartitionToWide twice (for 8/16-bit types).
  template <class D>
  using RepartitionToWideX2 = RepartitionToWide<RepartitionToWide<D>>;

  // Shorthand for applying RepartitionToWide three times (for 8-bit types).
  template <class D>
  using RepartitionToWideX3 = RepartitionToWide<RepartitionToWideX2<D>>;
  // Tag for the same lane type as D, but half the lanes (member alias D::Half).
  template <class D>
  using Half = typename D::Half;

  // Tag for the same lane type as D, but twice the lanes (member alias
  // D::Twice).
  template <class D>
  using Twice = typename D::Twice;
  434. // Tag for a 16-byte block with the same lane type as D
  435. #if HWY_HAVE_SCALABLE
  436. namespace detail {
  437. template <class D>
  438. class BlockDFromD_t {};
  439. template <typename T, size_t N, int kPow2>
  440. class BlockDFromD_t<Simd<T, N, kPow2>> {
  441. using D = Simd<T, N, kPow2>;
  442. static constexpr int kNewPow2 = HWY_MIN(kPow2, 0);
  443. static constexpr size_t kMaxLpb = HWY_MIN(16 / sizeof(T), HWY_MAX_LANES_D(D));
  444. static constexpr size_t kNewN = D::template NewN<kNewPow2, kMaxLpb>();
  445. public:
  446. using type = Simd<T, kNewN, kNewPow2>;
  447. };
  448. } // namespace detail
  449. template <class D>
  450. using BlockDFromD = typename detail::BlockDFromD_t<RemoveConst<D>>::type;
  451. #else
  452. template <class D>
  453. using BlockDFromD =
  454. Simd<TFromD<D>, HWY_MIN(16 / sizeof(TFromD<D>), HWY_MAX_LANES_D(D)), 0>;
  455. #endif
  456. // Returns whether `ptr` is a multiple of `Lanes(d)` elements.
  457. template <class D, typename T>
  458. HWY_API bool IsAligned(D d, T* ptr) {
  459. const size_t N = Lanes(d);
  460. return reinterpret_cast<uintptr_t>(ptr) % (N * sizeof(T)) == 0;
  461. }
  462. // ------------------------------ Choosing overloads (SFINAE)
  // Same as base.h macros but with a Simd<T, N, kPow2> argument instead of T.
  // Each macro forwards to the identically named macro without the _D suffix.
  // Names are fully qualified (hwy::HWY_NAMESPACE::) in every expansion.

  // Conditions on the category of the lane type TFromD<D>.
  #define HWY_IF_UNSIGNED_D(D) HWY_IF_UNSIGNED(hwy::HWY_NAMESPACE::TFromD<D>)
  #define HWY_IF_NOT_UNSIGNED_D(D) \
    HWY_IF_NOT_UNSIGNED(hwy::HWY_NAMESPACE::TFromD<D>)
  #define HWY_IF_SIGNED_D(D) HWY_IF_SIGNED(hwy::HWY_NAMESPACE::TFromD<D>)
  #define HWY_IF_FLOAT_D(D) HWY_IF_FLOAT(hwy::HWY_NAMESPACE::TFromD<D>)
  #define HWY_IF_NOT_FLOAT_D(D) HWY_IF_NOT_FLOAT(hwy::HWY_NAMESPACE::TFromD<D>)
  #define HWY_IF_FLOAT3264_D(D) HWY_IF_FLOAT3264(hwy::HWY_NAMESPACE::TFromD<D>)
  #define HWY_IF_NOT_FLOAT3264_D(D) \
    HWY_IF_NOT_FLOAT3264(hwy::HWY_NAMESPACE::TFromD<D>)
  #define HWY_IF_SPECIAL_FLOAT_D(D) \
    HWY_IF_SPECIAL_FLOAT(hwy::HWY_NAMESPACE::TFromD<D>)
  #define HWY_IF_NOT_SPECIAL_FLOAT_D(D) \
    HWY_IF_NOT_SPECIAL_FLOAT(hwy::HWY_NAMESPACE::TFromD<D>)
  #define HWY_IF_FLOAT_OR_SPECIAL_D(D) \
    HWY_IF_FLOAT_OR_SPECIAL(hwy::HWY_NAMESPACE::TFromD<D>)
  #define HWY_IF_NOT_FLOAT_NOR_SPECIAL_D(D) \
    HWY_IF_NOT_FLOAT_NOR_SPECIAL(hwy::HWY_NAMESPACE::TFromD<D>)

  // Conditions on the lane type TFromD<D> together with a size argument.
  #define HWY_IF_T_SIZE_D(D, bytes) \
    HWY_IF_T_SIZE(hwy::HWY_NAMESPACE::TFromD<D>, bytes)
  #define HWY_IF_NOT_T_SIZE_D(D, bytes) \
    HWY_IF_NOT_T_SIZE(hwy::HWY_NAMESPACE::TFromD<D>, bytes)
  #define HWY_IF_T_SIZE_ONE_OF_D(D, bit_array) \
    HWY_IF_T_SIZE_ONE_OF(hwy::HWY_NAMESPACE::TFromD<D>, bit_array)
  #define HWY_IF_T_SIZE_LE_D(D, bytes) \
    HWY_IF_T_SIZE_LE(hwy::HWY_NAMESPACE::TFromD<D>, bytes)
  #define HWY_IF_T_SIZE_GT_D(D, bytes) \
    HWY_IF_T_SIZE_GT(hwy::HWY_NAMESPACE::TFromD<D>, bytes)

  // Conditions on the lane count upper bound HWY_MAX_LANES_D(D).
  #define HWY_IF_LANES_D(D, lanes) HWY_IF_LANES(HWY_MAX_LANES_D(D), lanes)
  #define HWY_IF_LANES_LE_D(D, lanes) HWY_IF_LANES_LE(HWY_MAX_LANES_D(D), lanes)
  #define HWY_IF_LANES_GT_D(D, lanes) HWY_IF_LANES_GT(HWY_MAX_LANES_D(D), lanes)
  #define HWY_IF_LANES_PER_BLOCK_D(D, lanes)                                  \
    HWY_IF_LANES_PER_BLOCK(hwy::HWY_NAMESPACE::TFromD<D>, HWY_MAX_LANES_D(D), \
                           lanes)
  // Conditions on the kPow2 of tag D, expressed as a defaulted
  // hwy::EnableIf<...>* template parameter. MSVC reads the static constant via
  // HWY_POW2_D; other compilers call D().Pow2(). The GT forms parenthesize the
  // comparison so that `>` is not parsed as the end of the template argument
  // list.
  #if HWY_COMPILER_MSVC
  #define HWY_IF_POW2_LE_D(D, pow2) \
    hwy::EnableIf<HWY_POW2_D(D) <= pow2>* = nullptr
  #define HWY_IF_POW2_GT_D(D, pow2) \
    hwy::EnableIf<(HWY_POW2_D(D) > pow2)>* = nullptr
  #else
  #define HWY_IF_POW2_LE_D(D, pow2) hwy::EnableIf<D().Pow2() <= pow2>* = nullptr
  #define HWY_IF_POW2_GT_D(D, pow2) hwy::EnableIf<(D().Pow2() > pow2)>* = nullptr
  #endif // HWY_COMPILER_MSVC
  // Conditions naming one lane type; each forwards TFromD<D> to the macro
  // without the _D suffix.
  #define HWY_IF_U8_D(D) HWY_IF_U8(hwy::HWY_NAMESPACE::TFromD<D>)
  #define HWY_IF_U16_D(D) HWY_IF_U16(hwy::HWY_NAMESPACE::TFromD<D>)
  #define HWY_IF_U32_D(D) HWY_IF_U32(hwy::HWY_NAMESPACE::TFromD<D>)
  #define HWY_IF_U64_D(D) HWY_IF_U64(hwy::HWY_NAMESPACE::TFromD<D>)
  #define HWY_IF_I8_D(D) HWY_IF_I8(hwy::HWY_NAMESPACE::TFromD<D>)
  #define HWY_IF_I16_D(D) HWY_IF_I16(hwy::HWY_NAMESPACE::TFromD<D>)
  #define HWY_IF_I32_D(D) HWY_IF_I32(hwy::HWY_NAMESPACE::TFromD<D>)
  #define HWY_IF_I64_D(D) HWY_IF_I64(hwy::HWY_NAMESPACE::TFromD<D>)

  // Use instead of HWY_IF_T_SIZE_D to avoid ambiguity with float16_t/float/double
  // overloads.
  #define HWY_IF_UI8_D(D) HWY_IF_UI8(hwy::HWY_NAMESPACE::TFromD<D>)
  #define HWY_IF_UI16_D(D) HWY_IF_UI16(hwy::HWY_NAMESPACE::TFromD<D>)
  #define HWY_IF_UI32_D(D) HWY_IF_UI32(hwy::HWY_NAMESPACE::TFromD<D>)
  #define HWY_IF_UI64_D(D) HWY_IF_UI64(hwy::HWY_NAMESPACE::TFromD<D>)

  // Floating-point lane types, including the 16-bit special floats.
  #define HWY_IF_BF16_D(D) HWY_IF_BF16(hwy::HWY_NAMESPACE::TFromD<D>)
  #define HWY_IF_NOT_BF16_D(D) HWY_IF_NOT_BF16(hwy::HWY_NAMESPACE::TFromD<D>)
  #define HWY_IF_F16_D(D) HWY_IF_F16(hwy::HWY_NAMESPACE::TFromD<D>)
  #define HWY_IF_NOT_F16_D(D) HWY_IF_NOT_F16(hwy::HWY_NAMESPACE::TFromD<D>)
  #define HWY_IF_F32_D(D) HWY_IF_F32(hwy::HWY_NAMESPACE::TFromD<D>)
  #define HWY_IF_F64_D(D) HWY_IF_F64(hwy::HWY_NAMESPACE::TFromD<D>)

  // Upper bound on the vector size in bytes: lane count bound times lane size.
  #define HWY_V_SIZE_D(D) \
    (HWY_MAX_LANES_D(D) * sizeof(hwy::HWY_NAMESPACE::TFromD<D>))

  // Conditions on the vector size; these pass both the lane type and the lane
  // count bound of D to the macro without the _D suffix.
  #define HWY_IF_V_SIZE_D(D, bytes) \
    HWY_IF_V_SIZE(hwy::HWY_NAMESPACE::TFromD<D>, HWY_MAX_LANES_D(D), bytes)
  #define HWY_IF_V_SIZE_LE_D(D, bytes) \
    HWY_IF_V_SIZE_LE(hwy::HWY_NAMESPACE::TFromD<D>, HWY_MAX_LANES_D(D), bytes)
  #define HWY_IF_V_SIZE_GT_D(D, bytes) \
    HWY_IF_V_SIZE_GT(hwy::HWY_NAMESPACE::TFromD<D>, HWY_MAX_LANES_D(D), bytes)
  // Same, but with a vector argument. ops/*-inl.h define their own TFromV.
  // DFromV, used by HWY_MAX_LANES_V below, is likewise not defined in this
  // header.

  // Conditions on the category of the lane type TFromV<V>.
  #define HWY_IF_UNSIGNED_V(V) HWY_IF_UNSIGNED(hwy::HWY_NAMESPACE::TFromV<V>)
  #define HWY_IF_NOT_UNSIGNED_V(V) \
    HWY_IF_NOT_UNSIGNED(hwy::HWY_NAMESPACE::TFromV<V>)
  #define HWY_IF_SIGNED_V(V) HWY_IF_SIGNED(hwy::HWY_NAMESPACE::TFromV<V>)
  #define HWY_IF_FLOAT_V(V) HWY_IF_FLOAT(hwy::HWY_NAMESPACE::TFromV<V>)
  #define HWY_IF_NOT_FLOAT_V(V) HWY_IF_NOT_FLOAT(hwy::HWY_NAMESPACE::TFromV<V>)
  #define HWY_IF_SPECIAL_FLOAT_V(V) \
    HWY_IF_SPECIAL_FLOAT(hwy::HWY_NAMESPACE::TFromV<V>)
  #define HWY_IF_NOT_FLOAT_NOR_SPECIAL_V(V) \
    HWY_IF_NOT_FLOAT_NOR_SPECIAL(hwy::HWY_NAMESPACE::TFromV<V>)

  // Conditions on the lane type TFromV<V> together with a size argument.
  #define HWY_IF_T_SIZE_V(V, bytes) \
    HWY_IF_T_SIZE(hwy::HWY_NAMESPACE::TFromV<V>, bytes)
  #define HWY_IF_NOT_T_SIZE_V(V, bytes) \
    HWY_IF_NOT_T_SIZE(hwy::HWY_NAMESPACE::TFromV<V>, bytes)
  #define HWY_IF_T_SIZE_ONE_OF_V(V, bit_array) \
    HWY_IF_T_SIZE_ONE_OF(hwy::HWY_NAMESPACE::TFromV<V>, bit_array)

  // Lane count upper bound of the tag belonging to vector type V. Note that
  // DFromV is not namespace-qualified here.
  #define HWY_MAX_LANES_V(V) HWY_MAX_LANES_D(DFromV<V>)

  // Conditions on the vector size of V.
  #define HWY_IF_V_SIZE_V(V, bytes) \
    HWY_IF_V_SIZE(hwy::HWY_NAMESPACE::TFromV<V>, HWY_MAX_LANES_V(V), bytes)
  #define HWY_IF_V_SIZE_LE_V(V, bytes) \
    HWY_IF_V_SIZE_LE(hwy::HWY_NAMESPACE::TFromV<V>, HWY_MAX_LANES_V(V), bytes)
  #define HWY_IF_V_SIZE_GT_V(V, bytes) \
    HWY_IF_V_SIZE_GT(hwy::HWY_NAMESPACE::TFromV<V>, HWY_MAX_LANES_V(V), bytes)
  // The macros below are each #undef'd before being defined.
  // NOTE(review): presumably so that an earlier definition (e.g. from a
  // previously compiled target) can be replaced without a redefinition
  // warning - confirm.

  // Use in implementations of ReduceSum etc. to avoid conflicts with the N=1 and
  // N=4 8-bit specializations in generic_ops-inl.
  // Enabled unless the lane count bound is 1, or it is 4 with 1-byte lanes.
  #undef HWY_IF_REDUCE_D
  #define HWY_IF_REDUCE_D(D)                  \
    hwy::EnableIf<HWY_MAX_LANES_D(D) != 1 &&  \
                  (HWY_MAX_LANES_D(D) != 4 || \
                   sizeof(hwy::HWY_NAMESPACE::TFromD<D>) != 1)>* = nullptr

  // Enabled if the tag has more than one lane.
  #undef HWY_IF_SUM_OF_LANES_D
  #define HWY_IF_SUM_OF_LANES_D(D) HWY_IF_LANES_GT_D(D, 1)
  #undef HWY_IF_MINMAX_OF_LANES_D
  #define HWY_IF_MINMAX_OF_LANES_D(D) HWY_IF_LANES_GT_D(D, 1)

  // Enabled if the tag of vector type V has more than one lane.
  #undef HWY_IF_ADDSUB_V
  #define HWY_IF_ADDSUB_V(V) HWY_IF_LANES_GT_D(DFromV<V>, 1)
  #undef HWY_IF_MULADDSUB_V
  #define HWY_IF_MULADDSUB_V(V) HWY_IF_LANES_GT_D(DFromV<V>, 1)

  // HWY_IF_U2I_DEMOTE_FROM_LANE_SIZE_V is used to disable the default
  // implementation of unsigned to signed DemoteTo/ReorderDemote2To in
  // generic_ops-inl.h for at least some of the unsigned to signed demotions on
  // SCALAR/EMU128/SSE2/SSSE3/SSE4/AVX2/SVE/SVE2
  // The definition here ignores V and imposes no condition.
  #undef HWY_IF_U2I_DEMOTE_FROM_LANE_SIZE_V
  #define HWY_IF_U2I_DEMOTE_FROM_LANE_SIZE_V(V) void* = nullptr
  // Old names (deprecated). These forward to the HWY_IF_[NOT_]T_SIZE_D macros
  // with the same arguments; new code should use those names.
  #define HWY_IF_LANE_SIZE_D(D, bytes) HWY_IF_T_SIZE_D(D, bytes)
  #define HWY_IF_NOT_LANE_SIZE_D(D, bytes) HWY_IF_NOT_T_SIZE_D(D, bytes)
  582. // NOLINTNEXTLINE(google-readability-namespace-comments)
  583. } // namespace HWY_NAMESPACE
  584. } // namespace hwy
  585. HWY_AFTER_NAMESPACE();
  586. #endif // HIGHWAY_HWY_OPS_SHARED_TOGGLE