// Copyright 2023 Google LLC
// SPDX-License-Identifier: Apache-2.0
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.

// Must be included inside an existing include guard, with the following ops
// already defined: BitCast, And, Set, ShiftLeft, ShiftRight, PromoteLowerTo,
// ConcatEven, ConcatOdd, plus the optional detail::PromoteEvenTo and
// detail::PromoteOddTo (if implemented in the target-specific header).

// HWY_NAMESPACE is normally set by set_macros-inl.h before this header is
// included; if not, we are viewing this header standalone. Reduce IDE errors
// by:
#if !defined(HWY_NAMESPACE)
// 1) Defining HWY_IDE so we get syntax highlighting rather than all-gray text.
#include "hwy/ops/shared-inl.h"
// 2) Entering the HWY_NAMESPACE to make definitions from shared-inl.h visible.
HWY_BEFORE_NAMESPACE();
namespace hwy {
namespace HWY_NAMESPACE {
#define HWY_INSIDE_END_NAMESPACE
// 3) Providing dummy VFromD/TFromV/DFromV (usually done by the
//    target-specific header).
template <class D>
using VFromD = int;
template <class V>
using TFromV = int;
template <class V>
struct DFromV {};
#endif  // !defined(HWY_NAMESPACE)

// ------------------------------ Vec/Create/Get/Set2..4

// On SVE and RVV, Vec2..4 are aliases to built-in types. Also exclude the
// fixed-size SVE targets.
#if HWY_IDE || (!HWY_HAVE_SCALABLE && !HWY_TARGET_IS_SVE)

// NOTE: these are used inside arm_neon-inl.h, hence they cannot be defined in
// generic_ops-inl.h, which is included after that.
template <class D>
struct Vec2 {
  VFromD<D> v0;
  VFromD<D> v1;
};

template <class D>
struct Vec3 {
  VFromD<D> v0;
  VFromD<D> v1;
  VFromD<D> v2;
};

template <class D>
struct Vec4 {
  VFromD<D> v0;
  VFromD<D> v1;
  VFromD<D> v2;
  VFromD<D> v3;
};

// D arg is unused but allows deducing D.
template <class D>
HWY_API Vec2<D> Create2(D /* tag */, VFromD<D> v0, VFromD<D> v1) {
  return Vec2<D>{v0, v1};
}

template <class D>
HWY_API Vec3<D> Create3(D /* tag */, VFromD<D> v0, VFromD<D> v1, VFromD<D> v2) {
  return Vec3<D>{v0, v1, v2};
}

template <class D>
HWY_API Vec4<D> Create4(D /* tag */, VFromD<D> v0, VFromD<D> v1, VFromD<D> v2,
                        VFromD<D> v3) {
  return Vec4<D>{v0, v1, v2, v3};
}

template <size_t kIndex, class D>
HWY_API VFromD<D> Get2(Vec2<D> tuple) {
  static_assert(kIndex < 2, "Tuple index out of bounds");
  return kIndex == 0 ? tuple.v0 : tuple.v1;
}

template <size_t kIndex, class D>
HWY_API VFromD<D> Get3(Vec3<D> tuple) {
  static_assert(kIndex < 3, "Tuple index out of bounds");
  return kIndex == 0 ? tuple.v0 : kIndex == 1 ? tuple.v1 : tuple.v2;
}

template <size_t kIndex, class D>
HWY_API VFromD<D> Get4(Vec4<D> tuple) {
  static_assert(kIndex < 4, "Tuple index out of bounds");
  return kIndex == 0   ? tuple.v0
         : kIndex == 1 ? tuple.v1
         : kIndex == 2 ? tuple.v2
                       : tuple.v3;
}

template <size_t kIndex, class D>
HWY_API Vec2<D> Set2(Vec2<D> tuple, VFromD<D> val) {
  static_assert(kIndex < 2, "Tuple index out of bounds");
  if (kIndex == 0) {
    tuple.v0 = val;
  } else {
    tuple.v1 = val;
  }
  return tuple;
}

template <size_t kIndex, class D>
HWY_API Vec3<D> Set3(Vec3<D> tuple, VFromD<D> val) {
  static_assert(kIndex < 3, "Tuple index out of bounds");
  if (kIndex == 0) {
    tuple.v0 = val;
  } else if (kIndex == 1) {
    tuple.v1 = val;
  } else {
    tuple.v2 = val;
  }
  return tuple;
}

template <size_t kIndex, class D>
HWY_API Vec4<D> Set4(Vec4<D> tuple, VFromD<D> val) {
  static_assert(kIndex < 4, "Tuple index out of bounds");
  if (kIndex == 0) {
    tuple.v0 = val;
  } else if (kIndex == 1) {
    tuple.v1 = val;
  } else if (kIndex == 2) {
    tuple.v2 = val;
  } else {
    tuple.v3 = val;
  }
  return tuple;
}
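
// Example usage (a sketch; assumes a tag `d` such as ScalableTag<float> and
// vectors `a`, `b` of type VFromD<decltype(d)>):
//   Vec2<decltype(d)> t = Create2(d, a, b);
//   VFromD<decltype(d)> first = Get2<0>(t);  // == a
//   t = Set2<1>(t, first);                   // t is now {a, a}
// The index is a compile-time template argument, so an out-of-bounds access
// is rejected by the static_assert rather than failing at runtime.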
#endif  // HWY_IDE || (!HWY_HAVE_SCALABLE && !HWY_TARGET_IS_SVE)

// ------------------------------ Rol/Ror (And, Or, Neg, Shl, Shr)

#if (defined(HWY_NATIVE_ROL_ROR_8) == defined(HWY_TARGET_TOGGLE))
#ifdef HWY_NATIVE_ROL_ROR_8
#undef HWY_NATIVE_ROL_ROR_8
#else
#define HWY_NATIVE_ROL_ROR_8
#endif

template <class V, HWY_IF_UI8(TFromV<V>)>
HWY_API V Rol(V a, V b) {
  const DFromV<decltype(a)> d;
  const RebindToSigned<decltype(d)> di;
  const RebindToUnsigned<decltype(d)> du;
  const auto shift_amt_mask = Set(du, uint8_t{7});
  const auto shl_amt = And(BitCast(du, b), shift_amt_mask);
  const auto shr_amt = And(BitCast(du, Neg(BitCast(di, b))), shift_amt_mask);
  const auto vu = BitCast(du, a);
  return BitCast(d, Or(Shl(vu, shl_amt), Shr(vu, shr_amt)));
}

template <class V, HWY_IF_UI8(TFromV<V>)>
HWY_API V Ror(V a, V b) {
  const DFromV<decltype(a)> d;
  const RebindToSigned<decltype(d)> di;
  const RebindToUnsigned<decltype(d)> du;
  const auto shift_amt_mask = Set(du, uint8_t{7});
  const auto shr_amt = And(BitCast(du, b), shift_amt_mask);
  const auto shl_amt = And(BitCast(du, Neg(BitCast(di, b))), shift_amt_mask);
  const auto vu = BitCast(du, a);
  return BitCast(d, Or(Shl(vu, shl_amt), Shr(vu, shr_amt)));
}
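
// Worked example for the shift-based rotate above: rotating the u8 lane 0x96
// (1001'0110) left by 3 computes (0x96 << 3) | (0x96 >> ((8 - 3) & 7))
// = 0xB0 | 0x04 = 0xB4 (1011'0100). Note that Neg(b) & 7 equals (8 - b) mod 8,
// so a rotate count of 0 yields shifts of 0 in both directions and returns
// `a` unchanged.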
#endif  // HWY_NATIVE_ROL_ROR_8

#if (defined(HWY_NATIVE_ROL_ROR_16) == defined(HWY_TARGET_TOGGLE))
#ifdef HWY_NATIVE_ROL_ROR_16
#undef HWY_NATIVE_ROL_ROR_16
#else
#define HWY_NATIVE_ROL_ROR_16
#endif

template <class V, HWY_IF_UI16(TFromV<V>)>
HWY_API V Rol(V a, V b) {
  const DFromV<decltype(a)> d;
  const RebindToSigned<decltype(d)> di;
  const RebindToUnsigned<decltype(d)> du;
  const auto shift_amt_mask = Set(du, uint16_t{15});
  const auto shl_amt = And(BitCast(du, b), shift_amt_mask);
  const auto shr_amt = And(BitCast(du, Neg(BitCast(di, b))), shift_amt_mask);
  const auto vu = BitCast(du, a);
  return BitCast(d, Or(Shl(vu, shl_amt), Shr(vu, shr_amt)));
}

template <class V, HWY_IF_UI16(TFromV<V>)>
HWY_API V Ror(V a, V b) {
  const DFromV<decltype(a)> d;
  const RebindToSigned<decltype(d)> di;
  const RebindToUnsigned<decltype(d)> du;
  const auto shift_amt_mask = Set(du, uint16_t{15});
  const auto shr_amt = And(BitCast(du, b), shift_amt_mask);
  const auto shl_amt = And(BitCast(du, Neg(BitCast(di, b))), shift_amt_mask);
  const auto vu = BitCast(du, a);
  return BitCast(d, Or(Shl(vu, shl_amt), Shr(vu, shr_amt)));
}

#endif  // HWY_NATIVE_ROL_ROR_16

#if (defined(HWY_NATIVE_ROL_ROR_32_64) == defined(HWY_TARGET_TOGGLE))
#ifdef HWY_NATIVE_ROL_ROR_32_64
#undef HWY_NATIVE_ROL_ROR_32_64
#else
#define HWY_NATIVE_ROL_ROR_32_64
#endif

template <class V, HWY_IF_UI32(TFromV<V>)>
HWY_API V Rol(V a, V b) {
  const DFromV<decltype(a)> d;
  const RebindToSigned<decltype(d)> di;
  const RebindToUnsigned<decltype(d)> du;
  const auto shift_amt_mask = Set(du, uint32_t{31});
  const auto shl_amt = And(BitCast(du, b), shift_amt_mask);
  const auto shr_amt = And(BitCast(du, Neg(BitCast(di, b))), shift_amt_mask);
  const auto vu = BitCast(du, a);
  return BitCast(d, Or(Shl(vu, shl_amt), Shr(vu, shr_amt)));
}

template <class V, HWY_IF_UI32(TFromV<V>)>
HWY_API V Ror(V a, V b) {
  const DFromV<decltype(a)> d;
  const RebindToSigned<decltype(d)> di;
  const RebindToUnsigned<decltype(d)> du;
  const auto shift_amt_mask = Set(du, uint32_t{31});
  const auto shr_amt = And(BitCast(du, b), shift_amt_mask);
  const auto shl_amt = And(BitCast(du, Neg(BitCast(di, b))), shift_amt_mask);
  const auto vu = BitCast(du, a);
  return BitCast(d, Or(Shl(vu, shl_amt), Shr(vu, shr_amt)));
}

#if HWY_HAVE_INTEGER64
template <class V, HWY_IF_UI64(TFromV<V>)>
HWY_API V Rol(V a, V b) {
  const DFromV<decltype(a)> d;
  const RebindToSigned<decltype(d)> di;
  const RebindToUnsigned<decltype(d)> du;
  const auto shift_amt_mask = Set(du, uint64_t{63});
  const auto shl_amt = And(BitCast(du, b), shift_amt_mask);
  const auto shr_amt = And(BitCast(du, Neg(BitCast(di, b))), shift_amt_mask);
  const auto vu = BitCast(du, a);
  return BitCast(d, Or(Shl(vu, shl_amt), Shr(vu, shr_amt)));
}

template <class V, HWY_IF_UI64(TFromV<V>)>
HWY_API V Ror(V a, V b) {
  const DFromV<decltype(a)> d;
  const RebindToSigned<decltype(d)> di;
  const RebindToUnsigned<decltype(d)> du;
  const auto shift_amt_mask = Set(du, uint64_t{63});
  const auto shr_amt = And(BitCast(du, b), shift_amt_mask);
  const auto shl_amt = And(BitCast(du, Neg(BitCast(di, b))), shift_amt_mask);
  const auto vu = BitCast(du, a);
  return BitCast(d, Or(Shl(vu, shl_amt), Shr(vu, shr_amt)));
}
#endif  // HWY_HAVE_INTEGER64

#endif  // HWY_NATIVE_ROL_ROR_32_64

// ------------------------------ RotateLeftSame/RotateRightSame

#if (defined(HWY_NATIVE_ROL_ROR_SAME_8) == defined(HWY_TARGET_TOGGLE))
#ifdef HWY_NATIVE_ROL_ROR_SAME_8
#undef HWY_NATIVE_ROL_ROR_SAME_8
#else
#define HWY_NATIVE_ROL_ROR_SAME_8
#endif

template <class V, HWY_IF_UI8(TFromV<V>)>
HWY_API V RotateLeftSame(V v, int bits) {
  const DFromV<decltype(v)> d;
  const RebindToUnsigned<decltype(d)> du;
  const int shl_amt = bits & 7;
  const int shr_amt = static_cast<int>((0u - static_cast<unsigned>(bits)) & 7u);
  const auto vu = BitCast(du, v);
  return BitCast(d,
                 Or(ShiftLeftSame(vu, shl_amt), ShiftRightSame(vu, shr_amt)));
}

template <class V, HWY_IF_UI8(TFromV<V>)>
HWY_API V RotateRightSame(V v, int bits) {
  const DFromV<decltype(v)> d;
  const RebindToUnsigned<decltype(d)> du;
  const int shr_amt = bits & 7;
  const int shl_amt = static_cast<int>((0u - static_cast<unsigned>(bits)) & 7u);
  const auto vu = BitCast(du, v);
  return BitCast(d,
                 Or(ShiftLeftSame(vu, shl_amt), ShiftRightSame(vu, shr_amt)));
}
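
// These differ from Rol/Ror in that all lanes rotate by the same runtime
// count, hence the cheaper ShiftLeftSame/ShiftRightSame. The count is first
// reduced mod the lane width; (0u - bits) & 7u is the complementary count
// (8 - bits) mod 8, computed in unsigned arithmetic to avoid signed overflow,
// so bits == 0 (or any multiple of 8) returns v unchanged.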
#endif  // HWY_NATIVE_ROL_ROR_SAME_8

#if (defined(HWY_NATIVE_ROL_ROR_SAME_16) == defined(HWY_TARGET_TOGGLE))
#ifdef HWY_NATIVE_ROL_ROR_SAME_16
#undef HWY_NATIVE_ROL_ROR_SAME_16
#else
#define HWY_NATIVE_ROL_ROR_SAME_16
#endif

template <class V, HWY_IF_UI16(TFromV<V>)>
HWY_API V RotateLeftSame(V v, int bits) {
  const DFromV<decltype(v)> d;
  const RebindToUnsigned<decltype(d)> du;
  const int shl_amt = bits & 15;
  const int shr_amt =
      static_cast<int>((0u - static_cast<unsigned>(bits)) & 15u);
  const auto vu = BitCast(du, v);
  return BitCast(d,
                 Or(ShiftLeftSame(vu, shl_amt), ShiftRightSame(vu, shr_amt)));
}

template <class V, HWY_IF_UI16(TFromV<V>)>
HWY_API V RotateRightSame(V v, int bits) {
  const DFromV<decltype(v)> d;
  const RebindToUnsigned<decltype(d)> du;
  const int shr_amt = bits & 15;
  const int shl_amt =
      static_cast<int>((0u - static_cast<unsigned>(bits)) & 15u);
  const auto vu = BitCast(du, v);
  return BitCast(d,
                 Or(ShiftLeftSame(vu, shl_amt), ShiftRightSame(vu, shr_amt)));
}

#endif  // HWY_NATIVE_ROL_ROR_SAME_16

#if (defined(HWY_NATIVE_ROL_ROR_SAME_32_64) == defined(HWY_TARGET_TOGGLE))
#ifdef HWY_NATIVE_ROL_ROR_SAME_32_64
#undef HWY_NATIVE_ROL_ROR_SAME_32_64
#else
#define HWY_NATIVE_ROL_ROR_SAME_32_64
#endif

template <class V, HWY_IF_UI32(TFromV<V>)>
HWY_API V RotateLeftSame(V v, int bits) {
  const DFromV<decltype(v)> d;
  const RebindToUnsigned<decltype(d)> du;
  const int shl_amt = bits & 31;
  const int shr_amt =
      static_cast<int>((0u - static_cast<unsigned>(bits)) & 31u);
  const auto vu = BitCast(du, v);
  return BitCast(d,
                 Or(ShiftLeftSame(vu, shl_amt), ShiftRightSame(vu, shr_amt)));
}

template <class V, HWY_IF_UI32(TFromV<V>)>
HWY_API V RotateRightSame(V v, int bits) {
  const DFromV<decltype(v)> d;
  const RebindToUnsigned<decltype(d)> du;
  const int shr_amt = bits & 31;
  const int shl_amt =
      static_cast<int>((0u - static_cast<unsigned>(bits)) & 31u);
  const auto vu = BitCast(du, v);
  return BitCast(d,
                 Or(ShiftLeftSame(vu, shl_amt), ShiftRightSame(vu, shr_amt)));
}

#if HWY_HAVE_INTEGER64
template <class V, HWY_IF_UI64(TFromV<V>)>
HWY_API V RotateLeftSame(V v, int bits) {
  const DFromV<decltype(v)> d;
  const RebindToUnsigned<decltype(d)> du;
  const int shl_amt = bits & 63;
  const int shr_amt =
      static_cast<int>((0u - static_cast<unsigned>(bits)) & 63u);
  const auto vu = BitCast(du, v);
  return BitCast(d,
                 Or(ShiftLeftSame(vu, shl_amt), ShiftRightSame(vu, shr_amt)));
}

template <class V, HWY_IF_UI64(TFromV<V>)>
HWY_API V RotateRightSame(V v, int bits) {
  const DFromV<decltype(v)> d;
  const RebindToUnsigned<decltype(d)> du;
  const int shr_amt = bits & 63;
  const int shl_amt =
      static_cast<int>((0u - static_cast<unsigned>(bits)) & 63u);
  const auto vu = BitCast(du, v);
  return BitCast(d,
                 Or(ShiftLeftSame(vu, shl_amt), ShiftRightSame(vu, shr_amt)));
}
#endif  // HWY_HAVE_INTEGER64

#endif  // HWY_NATIVE_ROL_ROR_SAME_32_64

// ------------------------------ PromoteEvenTo/PromoteOddTo

// These are used by target-specific headers for ReorderWidenMulAccumulate etc.
#if HWY_TARGET != HWY_SCALAR || HWY_IDE
namespace detail {

// detail::PromoteEvenTo and detail::PromoteOddTo use tag dispatch because
// SVE/PPC/SSE2/SSSE3/SSE4/AVX2 provide target-specific specializations for
// some of the cases. All targets except HWY_SCALAR use the implementations of
// detail::PromoteEvenTo and detail::PromoteOddTo below for at least some of
// the PromoteEvenTo and PromoteOddTo cases.

// Signed to signed PromoteEvenTo/PromoteOddTo
template <size_t kToLaneSize, class D, class V>
HWY_INLINE VFromD<D> PromoteEvenTo(
    hwy::SignedTag /*to_type_tag*/,
    hwy::SizeTag<kToLaneSize> /*to_lane_size_tag*/,
    hwy::SignedTag /*from_type_tag*/, D d_to, V v) {
#if HWY_TARGET_IS_SVE
  // The intrinsic expects the wide lane type.
  return NativePromoteEvenTo(BitCast(d_to, v));
#else
#if HWY_IS_LITTLE_ENDIAN
  // On little-endian targets, need to shift each lane of the bitcasted
  // vector left by kToLaneSize * 4 bits to get the bits of the even
  // source lanes into the upper kToLaneSize * 4 bits of even_in_hi.
  const auto even_in_hi = ShiftLeft<kToLaneSize * 4>(BitCast(d_to, v));
#else
  // On big-endian targets, the bits of the even source lanes are already
  // in the upper kToLaneSize * 4 bits of the lanes of the bitcasted
  // vector.
  const auto even_in_hi = BitCast(d_to, v);
#endif
  // Right-shift even_in_hi by kToLaneSize * 4 bits
  return ShiftRight<kToLaneSize * 4>(even_in_hi);
#endif  // HWY_TARGET_IS_SVE
}
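
// Worked example for the signed case above, assuming little-endian i16->i32:
// each i32 lane of BitCast(d_to, v) holds two i16 source lanes, the even one
// in bits [15:0]. ShiftLeft<16> moves it to bits [31:16]; the arithmetic
// ShiftRight<16> then sign-extends it, yielding the even i16 lanes promoted
// to i32. Here kToLaneSize == 4, so kToLaneSize * 4 == 16, half the 32-bit
// destination lane.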

// Unsigned to unsigned PromoteEvenTo/PromoteOddTo
template <size_t kToLaneSize, class D, class V>
HWY_INLINE VFromD<D> PromoteEvenTo(
    hwy::UnsignedTag /*to_type_tag*/,
    hwy::SizeTag<kToLaneSize> /*to_lane_size_tag*/,
    hwy::UnsignedTag /*from_type_tag*/, D d_to, V v) {
#if HWY_TARGET_IS_SVE
  // The intrinsic expects the wide lane type.
  return NativePromoteEvenTo(BitCast(d_to, v));
#else
#if HWY_IS_LITTLE_ENDIAN
  // On little-endian targets, the bits of the even source lanes are already
  // in the lower kToLaneSize * 4 bits of the lanes of the bitcasted vector.
  // Simply need to zero out the upper bits of each lane of the bitcasted
  // vector.
  return And(BitCast(d_to, v),
             Set(d_to, static_cast<TFromD<D>>(LimitsMax<TFromV<V>>())));
#else
  // On big-endian targets, need to shift each lane of the bitcasted vector
  // right by kToLaneSize * 4 bits to get the bits of the even source lanes
  // into the lower kToLaneSize * 4 bits of the result.
  // The right shift below will zero out the upper kToLaneSize * 4 bits of the
  // result.
  return ShiftRight<kToLaneSize * 4>(BitCast(d_to, v));
#endif
#endif  // HWY_TARGET_IS_SVE
}
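
// In the unsigned case, zero-extension suffices: e.g. for little-endian
// u16->u32, masking each u32 lane with LimitsMax<uint16_t>() (0xFFFF) keeps
// the even source lane in bits [15:0] and clears the odd lane in bits
// [31:16], which is exactly the u16 value zero-extended to u32.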

template <size_t kToLaneSize, class D, class V>
HWY_INLINE VFromD<D> PromoteOddTo(
    hwy::SignedTag /*to_type_tag*/,
    hwy::SizeTag<kToLaneSize> /*to_lane_size_tag*/,
    hwy::SignedTag /*from_type_tag*/, D d_to, V v) {
#if HWY_IS_LITTLE_ENDIAN
  // On little-endian targets, the bits of the odd source lanes are already in
  // the upper kToLaneSize * 4 bits of the lanes of the bitcasted vector.
  const auto odd_in_hi = BitCast(d_to, v);
#else
  // On big-endian targets, need to shift each lane of the bitcasted vector
  // left by kToLaneSize * 4 bits to get the bits of the odd source lanes into
  // the upper kToLaneSize * 4 bits of odd_in_hi.
  const auto odd_in_hi = ShiftLeft<kToLaneSize * 4>(BitCast(d_to, v));
#endif
  // Right-shift odd_in_hi by kToLaneSize * 4 bits
  return ShiftRight<kToLaneSize * 4>(odd_in_hi);
}

template <size_t kToLaneSize, class D, class V>
HWY_INLINE VFromD<D> PromoteOddTo(
    hwy::UnsignedTag /*to_type_tag*/,
    hwy::SizeTag<kToLaneSize> /*to_lane_size_tag*/,
    hwy::UnsignedTag /*from_type_tag*/, D d_to, V v) {
#if HWY_IS_LITTLE_ENDIAN
  // On little-endian targets, need to shift each lane of the bitcasted vector
  // right by kToLaneSize * 4 bits to get the bits of the odd source lanes into
  // the lower kToLaneSize * 4 bits of the result.
  // The right shift below will zero out the upper kToLaneSize * 4 bits of the
  // result.
  return ShiftRight<kToLaneSize * 4>(BitCast(d_to, v));
#else
  // On big-endian targets, the bits of the odd source lanes are already
  // in the lower kToLaneSize * 4 bits of the lanes of the bitcasted vector.
  // Simply need to zero out the upper bits of each lane of the bitcasted
  // vector.
  return And(BitCast(d_to, v),
             Set(d_to, static_cast<TFromD<D>>(LimitsMax<TFromV<V>>())));
#endif
}

// Unsigned to signed: same as unsigned->unsigned PromoteEvenTo/PromoteOddTo,
// followed by BitCast to signed.
template <size_t kToLaneSize, class D, class V>
HWY_INLINE VFromD<D> PromoteEvenTo(
    hwy::SignedTag /*to_type_tag*/,
    hwy::SizeTag<kToLaneSize> /*to_lane_size_tag*/,
    hwy::UnsignedTag /*from_type_tag*/, D d_to, V v) {
  const RebindToUnsigned<decltype(d_to)> du_to;
  return BitCast(d_to,
                 PromoteEvenTo(hwy::UnsignedTag(), hwy::SizeTag<kToLaneSize>(),
                               hwy::UnsignedTag(), du_to, v));
}

template <size_t kToLaneSize, class D, class V>
HWY_INLINE VFromD<D> PromoteOddTo(
    hwy::SignedTag /*to_type_tag*/,
    hwy::SizeTag<kToLaneSize> /*to_lane_size_tag*/,
    hwy::UnsignedTag /*from_type_tag*/, D d_to, V v) {
  const RebindToUnsigned<decltype(d_to)> du_to;
  return BitCast(d_to,
                 PromoteOddTo(hwy::UnsignedTag(), hwy::SizeTag<kToLaneSize>(),
                              hwy::UnsignedTag(), du_to, v));
}

// BF16->F32 PromoteEvenTo
// NOTE: It is possible for FromTypeTag to be hwy::SignedTag or hwy::UnsignedTag
// instead of hwy::FloatTag on targets that use scalable vectors.
// VBF16 is considered to be a bfloat16_t vector if TFromV<VBF16> is the same
// type as TFromV<VFromD<Repartition<bfloat16_t, DF32>>>.
// The BF16->F32 PromoteEvenTo overload is only enabled if VBF16 is considered
// to be a bfloat16_t vector.
template <class FromTypeTag, class DF32, class VBF16,
          class VBF16_2 = VFromD<Repartition<bfloat16_t, DF32>>,
          hwy::EnableIf<IsSame<TFromV<VBF16>, TFromV<VBF16_2>>()>* = nullptr>
HWY_INLINE VFromD<DF32> PromoteEvenTo(hwy::FloatTag /*to_type_tag*/,
                                      hwy::SizeTag<4> /*to_lane_size_tag*/,
                                      FromTypeTag /*from_type_tag*/, DF32 d_to,
                                      VBF16 v) {
  const RebindToUnsigned<decltype(d_to)> du_to;
#if HWY_IS_LITTLE_ENDIAN
  // On little-endian platforms, need to shift left each lane of the bitcasted
  // vector by 16 bits.
  return BitCast(d_to, ShiftLeft<16>(BitCast(du_to, v)));
#else
  // On big-endian platforms, the even lanes of the source vector are already
  // in the upper 16 bits of the lanes of the bitcasted vector.
  // Need to simply zero out the lower 16 bits of each lane of the bitcasted
  // vector.
  return BitCast(d_to,
                 And(BitCast(du_to, v), Set(du_to, uint32_t{0xFFFF0000u})));
#endif
}
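
// This works because bfloat16 is the upper 16 bits of a binary32: placing a
// bf16 bit pattern in the high half of a u32 lane (and zeroing the low half)
// yields exactly the widened f32 encoding, so no arithmetic conversion is
// needed.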

// BF16->F32 PromoteOddTo
// NOTE: It is possible for FromTypeTag to be hwy::SignedTag or hwy::UnsignedTag
// instead of hwy::FloatTag on targets that use scalable vectors.
// VBF16 is considered to be a bfloat16_t vector if TFromV<VBF16> is the same
// type as TFromV<VFromD<Repartition<bfloat16_t, DF32>>>.
// The BF16->F32 PromoteOddTo overload is only enabled if VBF16 is considered
// to be a bfloat16_t vector.
template <class FromTypeTag, class DF32, class VBF16,
          class VBF16_2 = VFromD<Repartition<bfloat16_t, DF32>>,
          hwy::EnableIf<IsSame<TFromV<VBF16>, TFromV<VBF16_2>>()>* = nullptr>
HWY_INLINE VFromD<DF32> PromoteOddTo(hwy::FloatTag /*to_type_tag*/,
                                     hwy::SizeTag<4> /*to_lane_size_tag*/,
                                     FromTypeTag /*from_type_tag*/, DF32 d_to,
                                     VBF16 v) {
  const RebindToUnsigned<decltype(d_to)> du_to;
#if HWY_IS_LITTLE_ENDIAN
  // On little-endian platforms, the odd lanes of the source vector are already
  // in the upper 16 bits of the lanes of the bitcasted vector.
  // Need to simply zero out the lower 16 bits of each lane of the bitcasted
  // vector.
  return BitCast(d_to,
                 And(BitCast(du_to, v), Set(du_to, uint32_t{0xFFFF0000u})));
#else
  // On big-endian platforms, need to shift left each lane of the bitcasted
  // vector by 16 bits.
  return BitCast(d_to, ShiftLeft<16>(BitCast(du_to, v)));
#endif
}

// Default PromoteEvenTo/PromoteOddTo implementations
template <class ToTypeTag, size_t kToLaneSize, class FromTypeTag, class D,
          class V, HWY_IF_LANES_D(D, 1)>
HWY_INLINE VFromD<D> PromoteEvenTo(
    ToTypeTag /*to_type_tag*/, hwy::SizeTag<kToLaneSize> /*to_lane_size_tag*/,
    FromTypeTag /*from_type_tag*/, D d_to, V v) {
  return PromoteLowerTo(d_to, v);
}

template <class ToTypeTag, size_t kToLaneSize, class FromTypeTag, class D,
          class V, HWY_IF_LANES_GT_D(D, 1)>
HWY_INLINE VFromD<D> PromoteEvenTo(
    ToTypeTag /*to_type_tag*/, hwy::SizeTag<kToLaneSize> /*to_lane_size_tag*/,
    FromTypeTag /*from_type_tag*/, D d_to, V v) {
  const DFromV<decltype(v)> d;
  return PromoteLowerTo(d_to, ConcatEven(d, v, v));
}

template <class ToTypeTag, size_t kToLaneSize, class FromTypeTag, class D,
          class V>
HWY_INLINE VFromD<D> PromoteOddTo(
    ToTypeTag /*to_type_tag*/, hwy::SizeTag<kToLaneSize> /*to_lane_size_tag*/,
    FromTypeTag /*from_type_tag*/, D d_to, V v) {
  const DFromV<decltype(v)> d;
  return PromoteLowerTo(d_to, ConcatOdd(d, v, v));
}
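
// The fallbacks first gather the even (or odd) source lanes into the lower
// half of a vector via ConcatEven/ConcatOdd, then widen that half with
// PromoteLowerTo. The single-lane PromoteEvenTo overload skips ConcatEven
// because lane 0 is the only (even) lane.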

}  // namespace detail

template <class D, class V, HWY_IF_T_SIZE_D(D, 2 * sizeof(TFromV<V>)),
          class V2 = VFromD<Repartition<TFromV<V>, D>>,
          HWY_IF_LANES_D(DFromV<V>, HWY_MAX_LANES_V(V2))>
HWY_API VFromD<D> PromoteEvenTo(D d, V v) {
  return detail::PromoteEvenTo(hwy::TypeTag<TFromD<D>>(),
                               hwy::SizeTag<sizeof(TFromD<D>)>(),
                               hwy::TypeTag<TFromV<V>>(), d, v);
}

template <class D, class V, HWY_IF_T_SIZE_D(D, 2 * sizeof(TFromV<V>)),
          class V2 = VFromD<Repartition<TFromV<V>, D>>,
          HWY_IF_LANES_D(DFromV<V>, HWY_MAX_LANES_V(V2))>
HWY_API VFromD<D> PromoteOddTo(D d, V v) {
  return detail::PromoteOddTo(hwy::TypeTag<TFromD<D>>(),
                              hwy::SizeTag<sizeof(TFromD<D>)>(),
                              hwy::TypeTag<TFromV<V>>(), d, v);
}
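
// Example usage (a sketch; assumes a tag df32 such as ScalableTag<float> and
// a vector v of type VFromD<Repartition<bfloat16_t, decltype(df32)>>):
//   const auto even = PromoteEvenTo(df32, v);  // f32 from even bf16 lanes
//   const auto odd = PromoteOddTo(df32, v);    // f32 from odd bf16 lanes
// The TypeTag/SizeTag arguments dispatch to the matching detail overload
// above, which targets may override with native instructions.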

#endif  // HWY_TARGET != HWY_SCALAR

#ifdef HWY_INSIDE_END_NAMESPACE
#undef HWY_INSIDE_END_NAMESPACE
// NOLINTNEXTLINE(google-readability-namespace-comments)
}  // namespace HWY_NAMESPACE
}  // namespace hwy
HWY_AFTER_NAMESPACE();
#endif  // HWY_INSIDE_END_NAMESPACE