// hwy/detect_targets.h
  // Copyright 2021 Google LLC
  // SPDX-License-Identifier: Apache-2.0
  //
  // Licensed under the Apache License, Version 2.0 (the "License");
  // you may not use this file except in compliance with the License.
  // You may obtain a copy of the License at
  //
  //      http://www.apache.org/licenses/LICENSE-2.0
  //
  // Unless required by applicable law or agreed to in writing, software
  // distributed under the License is distributed on an "AS IS" BASIS,
  // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
  // See the License for the specific language governing permissions and
  // limitations under the License.
  #ifndef HIGHWAY_HWY_DETECT_TARGETS_H_
  #define HIGHWAY_HWY_DETECT_TARGETS_H_

  // Defines targets and chooses which to enable.

  #include "hwy/detect_compiler_arch.h"
  //------------------------------------------------------------------------------
  // Optional configuration

  // See g3doc/quick_reference.md for documentation of these macros.

  // Uncomment to override the default baseline determined from predefined macros:
  // #define HWY_BASELINE_TARGETS (HWY_SSE4 | HWY_SCALAR)

  // Uncomment to override the default blocklist:
  // #define HWY_BROKEN_TARGETS HWY_AVX3

  // Uncomment to definitely avoid generating those target(s):
  // #define HWY_DISABLED_TARGETS HWY_SSE4

  // Uncomment to avoid emitting BMI/BMI2/FMA instructions (allows generating
  // AVX2 target for VMs which support AVX2 but not the other instruction sets)
  // #define HWY_DISABLE_BMI2_FMA

  // Uncomment to enable these on MSVC even if the predefined macros are not set.
  // #define HWY_WANT_SSE2 1
  // #define HWY_WANT_SSSE3 1
  // #define HWY_WANT_SSE4 1
  //------------------------------------------------------------------------------
  // Targets

  // Unique bit value for each target. A lower value is "better" (e.g. more lanes)
  // than a higher value within the same group/platform - see HWY_STATIC_TARGET.
  //
  // All values are unconditionally defined so we can test HWY_TARGETS without
  // first checking the HWY_ARCH_*.
  //
  // The C99 preprocessor evaluates #if expressions using intmax_t types. This
  // holds at least 64 bits in practice (verified 2022-07-18 via Godbolt on
  // 32-bit clang/GCC/MSVC compilers for x86/Arm7/AArch32/RISC-V/WASM). We now
  // avoid overflow when computing HWY_TARGETS (subtracting one instead of
  // left-shifting 2^62), but still do not use bit 63 because it is the sign bit.

  // --------------------------- x86: 15 targets (+ one fallback)
  // Bits 0..3 reserved (4 targets)
  #define HWY_AVX3_SPR (1LL << 4)
  // Bit 5 reserved (likely AVX10.2 with 256-bit vectors)
  // Currently HWY_AVX3_DL plus AVX512BF16 and a special case for CompressStore
  // (10x as fast).
  // We may later also use VPCONFLICT.
  #define HWY_AVX3_ZEN4 (1LL << 6)  // see HWY_WANT_AVX3_ZEN4 below

  // Currently satisfiable by Ice Lake (VNNI, VPCLMULQDQ, VPOPCNTDQ, VBMI, VBMI2,
  // VAES, BITALG, GFNI). Later to be added: BF16 (Cooper Lake). VP2INTERSECT is
  // only in Tiger Lake?
  #define HWY_AVX3_DL (1LL << 7)  // see HWY_WANT_AVX3_DL below
  #define HWY_AVX3 (1LL << 8)     // HWY_AVX2 plus AVX-512F/BW/CD/DQ/VL
  #define HWY_AVX2 (1LL << 9)     // HWY_SSE4 plus BMI2 + F16 + FMA
  // Bit 10: reserved
  #define HWY_SSE4 (1LL << 11)   // SSE4.2 plus AES + CLMUL
  #define HWY_SSSE3 (1LL << 12)  // S-SSE3
  // Bit 13: reserved for SSE3
  #define HWY_SSE2 (1LL << 14)
  // The highest bit in the HWY_TARGETS mask that a x86 target can have. Used for
  // dynamic dispatch. All x86 target bits must be lower or equal to
  // (1 << HWY_HIGHEST_TARGET_BIT_X86) and they can only use
  // HWY_MAX_DYNAMIC_TARGETS in total.
  #define HWY_HIGHEST_TARGET_BIT_X86 14

  // --------------------------- Arm: 15 targets (+ one fallback)
  // Bits 15..17 reserved (3 targets)
  #define HWY_SVE2_128 (1LL << 18)  // specialized (e.g. Neoverse V2/N2/N3)
  #define HWY_SVE_256 (1LL << 19)   // specialized (Neoverse V1)
  // Bits 20-22 reserved for later SVE (3 targets)
  #define HWY_SVE2 (1LL << 23)
  #define HWY_SVE (1LL << 24)
  // Bit 25 reserved for NEON
  #define HWY_NEON_BF16 (1LL << 26)  // fp16/dot/bf16 (e.g. Neoverse V2/N2/N3)
  // Bit 27 reserved for NEON
  #define HWY_NEON (1LL << 28)  // Implies support for AES
  #define HWY_NEON_WITHOUT_AES (1LL << 29)
  #define HWY_HIGHEST_TARGET_BIT_ARM 29

  // Convenience masks covering every NEON / SVE flavor defined above.
  #define HWY_ALL_NEON (HWY_NEON_WITHOUT_AES | HWY_NEON | HWY_NEON_BF16)
  #define HWY_ALL_SVE (HWY_SVE | HWY_SVE2 | HWY_SVE_256 | HWY_SVE2_128)

  // --------------------------- RISC-V: 9 targets (+ one fallback)
  // Bits 30..36 reserved (7 targets)
  #define HWY_RVV (1LL << 37)
  // Bit 38 reserved
  #define HWY_HIGHEST_TARGET_BIT_RVV 38

  // --------------------------- Future expansion: 4 targets
  // Bits 39..42 reserved

  // --------------------------- IBM Power/ZSeries: 9 targets (+ one fallback)
  // Bits 43..46 reserved (4 targets)
  #define HWY_PPC10 (1LL << 47)  // v3.1
  #define HWY_PPC9 (1LL << 48)   // v3.0
  #define HWY_PPC8 (1LL << 49)   // v2.07
  #define HWY_Z15 (1LL << 50)    // Z15
  #define HWY_Z14 (1LL << 51)    // Z14
  #define HWY_HIGHEST_TARGET_BIT_PPC 51

  // Convenience mask covering every Power target (the Z14/Z15 bits are not
  // included).
  #define HWY_ALL_PPC (HWY_PPC8 | HWY_PPC9 | HWY_PPC10)

  // --------------------------- WebAssembly: 9 targets (+ one fallback)
  // Bits 52..57 reserved (6 targets)
  #define HWY_WASM_EMU256 (1LL << 58)  // Experimental
  #define HWY_WASM (1LL << 59)
  // Bit 60 reserved
  #define HWY_HIGHEST_TARGET_BIT_WASM 60

  // --------------------------- Emulation: 2 targets
  #define HWY_EMU128 (1LL << 61)
  // We do not add/left-shift, so this will not overflow to a negative number.
  #define HWY_SCALAR (1LL << 62)
  #define HWY_HIGHEST_TARGET_BIT_SCALAR 62

  // Do not use bit 63 - would be confusing to have negative numbers.
  //------------------------------------------------------------------------------
  // Set default blocklists

  // Disabled means excluded from enabled at user's request. A separate config
  // macro allows disabling without deactivating the blocklist below.
  #ifndef HWY_DISABLED_TARGETS
  #define HWY_DISABLED_TARGETS 0
  #endif

  // Broken means excluded from enabled due to known compiler issues. We define
  // separate HWY_BROKEN_* and then OR them together (more than one might apply).

  // x86 clang-6: we saw multiple AVX2/3 compile errors and in one case invalid
  // SSE4 codegen (possibly only for msan), so disable all those targets.
  // (X | (X - 1)) selects target X and every lower-valued, i.e. "better", bit.
  #if HWY_ARCH_X86 && (HWY_COMPILER_CLANG != 0 && HWY_COMPILER_CLANG < 700)
  #define HWY_BROKEN_CLANG6 (HWY_SSE4 | (HWY_SSE4 - 1))
  // This entails a major speed reduction, so warn unless the user explicitly
  // opts in to scalar-only.
  #if !defined(HWY_COMPILE_ONLY_SCALAR)
  #pragma message("x86 Clang <= 6: define HWY_COMPILE_ONLY_SCALAR or upgrade.")
  #endif
  #else
  #define HWY_BROKEN_CLANG6 0
  #endif

  // 32-bit may fail to compile AVX2/3.
  #if HWY_ARCH_X86_32
  #define HWY_BROKEN_32BIT (HWY_AVX2 | (HWY_AVX2 - 1))
  #else
  #define HWY_BROKEN_32BIT 0
  #endif

  // MSVC AVX3 support is buggy: https://github.com/Mysticial/Flops/issues/16
  #if HWY_COMPILER_MSVC != 0
  #define HWY_BROKEN_MSVC (HWY_AVX3 | (HWY_AVX3 - 1))
  #else
  #define HWY_BROKEN_MSVC 0
  #endif

  // AVX3_DL and AVX3_ZEN4 require clang >= 7 (ensured above), gcc >= 8.1 or ICC
  // 2021.
  #if (HWY_COMPILER_GCC_ACTUAL && HWY_COMPILER_GCC_ACTUAL < 801) || \
      (HWY_COMPILER_ICC && HWY_COMPILER_ICC < 2021)
  #define HWY_BROKEN_AVX3_DL_ZEN4 (HWY_AVX3_DL | HWY_AVX3_ZEN4)
  #else
  #define HWY_BROKEN_AVX3_DL_ZEN4 0
  #endif

  // AVX3_SPR requires clang >= 14, gcc >= 12, or ICC 2021.
  #if (HWY_COMPILER_CLANG != 0 && HWY_COMPILER_CLANG < 1400) ||         \
      (HWY_COMPILER_GCC_ACTUAL && HWY_COMPILER_GCC_ACTUAL < 1200) ||    \
      (HWY_COMPILER_ICC && HWY_COMPILER_ICC < 2021)
  #define HWY_BROKEN_AVX3_SPR (HWY_AVX3_SPR)
  #else
  #define HWY_BROKEN_AVX3_SPR 0
  #endif

  // armv7be has not been tested and is not yet supported.
  #if HWY_ARCH_ARM_V7 && HWY_IS_BIG_ENDIAN
  #define HWY_BROKEN_ARM7_BIG_ENDIAN HWY_ALL_NEON
  #else
  #define HWY_BROKEN_ARM7_BIG_ENDIAN 0
  #endif

  // armv7-a without a detected vfpv4 is not supported
  // (for example Cortex-A8, Cortex-A9)
  // vfpv4 always have neon half-float _and_ FMA.
  #if HWY_ARCH_ARM_V7 && (__ARM_ARCH_PROFILE == 'A') && \
      !defined(__ARM_VFPV4__) &&                        \
      !((__ARM_NEON_FP & 0x2 /* half-float */) && (__ARM_FEATURE_FMA == 1))
  #define HWY_BROKEN_ARM7_WITHOUT_VFP4 HWY_ALL_NEON
  #else
  #define HWY_BROKEN_ARM7_WITHOUT_VFP4 0
  #endif

  // HWY_NEON_BF16 requires recent compilers.
  #if (HWY_COMPILER_CLANG != 0 && HWY_COMPILER_CLANG < 1700) || \
      (HWY_COMPILER_GCC_ACTUAL != 0 && HWY_COMPILER_GCC_ACTUAL < 1302)
  #define HWY_BROKEN_NEON_BF16 (HWY_NEON_BF16)
  #else
  #define HWY_BROKEN_NEON_BF16 0
  #endif

  // SVE[2] require recent clang or gcc versions.
  #if (HWY_COMPILER_CLANG && HWY_COMPILER_CLANG < 1100) || \
      (HWY_COMPILER_GCC_ACTUAL && HWY_COMPILER_GCC_ACTUAL < 1000)
  #define HWY_BROKEN_SVE (HWY_SVE | HWY_SVE2 | HWY_SVE_256 | HWY_SVE2_128)
  #else
  #define HWY_BROKEN_SVE 0
  #endif

  #if (HWY_COMPILER_GCC_ACTUAL && HWY_COMPILER_GCC_ACTUAL < 1100)
  // GCC 10 supports the -mcpu=power10 option but does not support the PPC10
  // vector intrinsics
  #define HWY_BROKEN_PPC10 (HWY_PPC10)
  #elif HWY_ARCH_PPC && HWY_IS_BIG_ENDIAN &&                                    \
      ((HWY_COMPILER3_CLANG && HWY_COMPILER3_CLANG < 160001) ||                 \
       (HWY_COMPILER_GCC_ACTUAL >= 1200 && HWY_COMPILER_GCC_ACTUAL <= 1203) ||  \
       (HWY_COMPILER_GCC_ACTUAL >= 1300 && HWY_COMPILER_GCC_ACTUAL <= 1301))
  // GCC 12.0 through 12.3 and GCC 13.0 through 13.1 have a compiler bug where the
  // vsldoi instruction is sometimes incorrectly optimized out (and this causes
  // some of the Highway unit tests to fail on big-endian PPC10). Details about
  // this compiler bug can be found at
  // https://gcc.gnu.org/bugzilla/show_bug.cgi?id=109069, and this bug will be
  // fixed in the upcoming GCC 12.4 and 13.2 releases.

  // Clang 16.0.0 and earlier (but not Clang 16.0.1 and later) have a compiler
  // bug in the LLVM DAGCombiner that causes a zero-extend followed by an
  // element insert into a vector, followed by a vector shuffle to be incorrectly
  // optimized on big-endian PPC (and which caused some of the Highway unit tests
  // to fail on big-endian PPC10).

  // Details about this bug, which has already been fixed in Clang 16.0.1 and
  // later, can be found at https://github.com/llvm/llvm-project/issues/61315.
  #define HWY_BROKEN_PPC10 (HWY_PPC10)
  #else
  #define HWY_BROKEN_PPC10 0
  #endif

  // Allow the user to override this without any guarantee of success.
  #ifndef HWY_BROKEN_TARGETS
  #define HWY_BROKEN_TARGETS                                     \
    (HWY_BROKEN_CLANG6 | HWY_BROKEN_32BIT | HWY_BROKEN_MSVC |    \
     HWY_BROKEN_AVX3_DL_ZEN4 | HWY_BROKEN_AVX3_SPR |             \
     HWY_BROKEN_ARM7_BIG_ENDIAN | HWY_BROKEN_ARM7_WITHOUT_VFP4 | \
     HWY_BROKEN_NEON_BF16 | HWY_BROKEN_SVE | HWY_BROKEN_PPC10)
  #endif  // HWY_BROKEN_TARGETS

  // Enabled means not disabled nor blocklisted: clears every bit of `targets`
  // that is set in HWY_DISABLED_TARGETS or HWY_BROKEN_TARGETS.
  #define HWY_ENABLED(targets) \
    ((targets) & ~((HWY_DISABLED_TARGETS) | (HWY_BROKEN_TARGETS)))

  // Opt-out for EMU128 (affected by a GCC bug on multiple arches, fixed in 12.3:
  // see https://gcc.gnu.org/bugzilla/show_bug.cgi?id=106322). An issue still
  // remains with 13.2, see #1683. This is separate from HWY_BROKEN_TARGETS
  // because it affects the fallback target, which must always be enabled. If 1,
  // we instead choose HWY_SCALAR even without HWY_COMPILE_ONLY_SCALAR being set.
  #if !defined(HWY_BROKEN_EMU128)  // allow overriding
  #if (HWY_COMPILER_GCC_ACTUAL && HWY_COMPILER_GCC_ACTUAL < 1400) || \
      defined(HWY_NO_LIBCXX)
  #define HWY_BROKEN_EMU128 1
  #else
  #define HWY_BROKEN_EMU128 0
  #endif
  #endif  // HWY_BROKEN_EMU128
  //------------------------------------------------------------------------------
  // Detect baseline targets using predefined macros

  // Baseline means the targets for which the compiler is allowed to generate
  // instructions, implying the target CPU would have to support them. This does
  // not take the blocklist into account.

  // Fallback target: HWY_SCALAR if explicitly requested or if EMU128 is flagged
  // as broken, otherwise HWY_EMU128.
  #if defined(HWY_COMPILE_ONLY_SCALAR) || HWY_BROKEN_EMU128
  #define HWY_BASELINE_SCALAR HWY_SCALAR
  #else
  #define HWY_BASELINE_SCALAR HWY_EMU128
  #endif

  // Also check HWY_ARCH to ensure that simulating unknown platforms ends up with
  // HWY_TARGET == HWY_BASELINE_SCALAR.

  #if HWY_ARCH_WASM && defined(__wasm_simd128__)
  #if defined(HWY_WANT_WASM2)
  #define HWY_BASELINE_WASM HWY_WASM_EMU256
  #else
  #define HWY_BASELINE_WASM HWY_WASM
  #endif  // HWY_WANT_WASM2
  #else
  #define HWY_BASELINE_WASM 0
  #endif

  // GCC or Clang.
  #if HWY_ARCH_PPC && HWY_COMPILER_GCC && defined(__ALTIVEC__) && \
      defined(__VSX__) && defined(__POWER8_VECTOR__) &&           \
      (defined(__CRYPTO__) || defined(HWY_DISABLE_PPC8_CRYPTO))
  #define HWY_BASELINE_PPC8 HWY_PPC8
  #else
  #define HWY_BASELINE_PPC8 0
  #endif

  // Each newer Power baseline requires the previous one to be set as well.
  #if HWY_BASELINE_PPC8 != 0 && defined(__POWER9_VECTOR__)
  #define HWY_BASELINE_PPC9 HWY_PPC9
  #else
  #define HWY_BASELINE_PPC9 0
  #endif

  #if HWY_BASELINE_PPC9 != 0 && \
      (defined(_ARCH_PWR10) || defined(__POWER10_VECTOR__))
  #define HWY_BASELINE_PPC10 HWY_PPC10
  #else
  #define HWY_BASELINE_PPC10 0
  #endif

  #if HWY_ARCH_S390X && defined(__VEC__) && defined(__ARCH__) && __ARCH__ >= 12
  #define HWY_BASELINE_Z14 HWY_Z14
  #else
  #define HWY_BASELINE_Z14 0
  #endif

  #if HWY_BASELINE_Z14 && __ARCH__ >= 13
  #define HWY_BASELINE_Z15 HWY_Z15
  #else
  #define HWY_BASELINE_Z15 0
  #endif

  // Arm baselines default to 0 and are re-defined below when detected.
  #define HWY_BASELINE_SVE2 0
  #define HWY_BASELINE_SVE 0
  #define HWY_BASELINE_NEON 0

  #if HWY_ARCH_ARM

  // Also check compiler version as done for HWY_ATTAINABLE_SVE2 because the
  // static target (influenced here) must be one of the attainable targets.
  #if defined(__ARM_FEATURE_SVE2) && \
      (HWY_COMPILER_CLANG >= 1400 || HWY_COMPILER_GCC_ACTUAL >= 1200)
  #undef HWY_BASELINE_SVE2  // was 0, will be re-defined
  // If user specified -msve-vector-bits=128, they assert the vector length is
  // 128 bits and we should use the HWY_SVE2_128 (more efficient for some ops).
  #if defined(__ARM_FEATURE_SVE_BITS) && __ARM_FEATURE_SVE_BITS == 128
  #define HWY_BASELINE_SVE2 HWY_SVE2_128
  // Otherwise we're not sure what the vector length will be. The baseline must be
  // unconditionally valid, so we can only assume HWY_SVE2. However, when running
  // on a CPU with 128-bit vectors, user code that supports dynamic dispatch will
  // still benefit from HWY_SVE2_128 because we add it to HWY_ATTAINABLE_TARGETS.
  #else
  #define HWY_BASELINE_SVE2 HWY_SVE2
  #endif  // __ARM_FEATURE_SVE_BITS
  #endif  // __ARM_FEATURE_SVE2

  #if defined(__ARM_FEATURE_SVE) && \
      (HWY_COMPILER_CLANG >= 900 || HWY_COMPILER_GCC_ACTUAL >= 800)
  #undef HWY_BASELINE_SVE  // was 0, will be re-defined
  // See above. If user-specified vector length matches our optimization, use it.
  #if defined(__ARM_FEATURE_SVE_BITS) && __ARM_FEATURE_SVE_BITS == 256
  #define HWY_BASELINE_SVE HWY_SVE_256
  #else
  #define HWY_BASELINE_SVE HWY_SVE
  #endif  // __ARM_FEATURE_SVE_BITS
  #endif  // __ARM_FEATURE_SVE

  // GCC 4.5.4 only defines __ARM_NEON__; 5.4 defines both.
  #if defined(__ARM_NEON__) || defined(__ARM_NEON)
  #undef HWY_BASELINE_NEON
  #if defined(__ARM_FEATURE_AES) &&                    \
      defined(__ARM_FEATURE_FP16_VECTOR_ARITHMETIC) && \
      defined(__ARM_FEATURE_DOTPROD) &&                \
      defined(__ARM_FEATURE_BF16_VECTOR_ARITHMETIC)
  #define HWY_BASELINE_NEON HWY_ALL_NEON
  #elif defined(__ARM_FEATURE_AES)
  #define HWY_BASELINE_NEON (HWY_NEON_WITHOUT_AES | HWY_NEON)
  #else
  #define HWY_BASELINE_NEON (HWY_NEON_WITHOUT_AES)
  #endif  // __ARM_FEATURE*
  #endif  // __ARM_NEON

  #endif  // HWY_ARCH_ARM

  // Special handling for MSVC because it has fewer predefined macros:
  #if HWY_COMPILER_MSVC

  #if HWY_ARCH_X86_32
  #if _M_IX86_FP >= 2
  #define HWY_CHECK_SSE2 1
  #else
  #define HWY_CHECK_SSE2 0
  #endif
  #elif HWY_ARCH_X86_64
  #define HWY_CHECK_SSE2 1
  #else
  #define HWY_CHECK_SSE2 0
  #endif

  // 1) We can only be sure SSSE3/SSE4 are enabled if AVX is:
  // https://stackoverflow.com/questions/18563978/.
  #if defined(__AVX__)
  #define HWY_CHECK_SSSE3 1
  #define HWY_CHECK_SSE4 1
  #else
  #define HWY_CHECK_SSSE3 0
  #define HWY_CHECK_SSE4 0
  #endif

  // 2) Cannot check for PCLMUL/AES and BMI2/FMA/F16C individually; we assume
  // PCLMUL/AES are available if SSE4 is, and BMI2/FMA/F16C if AVX2 is.
  #define HWY_CHECK_PCLMUL_AES 1
  #define HWY_CHECK_BMI2_FMA 1
  #define HWY_CHECK_F16C 1

  #else  // non-MSVC

  #if defined(__SSE2__)
  #define HWY_CHECK_SSE2 1
  #else
  #define HWY_CHECK_SSE2 0
  #endif

  #if defined(__SSSE3__)
  #define HWY_CHECK_SSSE3 1
  #else
  #define HWY_CHECK_SSSE3 0
  #endif

  #if defined(__SSE4_1__) && defined(__SSE4_2__)
  #define HWY_CHECK_SSE4 1
  #else
  #define HWY_CHECK_SSE4 0
  #endif

  // If these are disabled, they should not gate the availability of SSE4/AVX2.
  #if defined(HWY_DISABLE_PCLMUL_AES) || (defined(__PCLMUL__) && defined(__AES__))
  #define HWY_CHECK_PCLMUL_AES 1
  #else
  #define HWY_CHECK_PCLMUL_AES 0
  #endif

  #if defined(HWY_DISABLE_BMI2_FMA) || (defined(__BMI2__) && defined(__FMA__))
  #define HWY_CHECK_BMI2_FMA 1
  #else
  #define HWY_CHECK_BMI2_FMA 0
  #endif

  #if defined(HWY_DISABLE_F16C) || defined(__F16C__)
  #define HWY_CHECK_F16C 1
  #else
  #define HWY_CHECK_F16C 0
  #endif

  #endif  // non-MSVC

  // The HWY_WANT_* macros let the user force a baseline; when undefined they
  // evaluate to 0 inside #if.
  #if HWY_ARCH_X86 && (HWY_WANT_SSE2 || HWY_CHECK_SSE2)
  #define HWY_BASELINE_SSE2 HWY_SSE2
  #else
  #define HWY_BASELINE_SSE2 0
  #endif

  #if HWY_ARCH_X86 && (HWY_WANT_SSSE3 || HWY_CHECK_SSSE3)
  #define HWY_BASELINE_SSSE3 HWY_SSSE3
  #else
  #define HWY_BASELINE_SSSE3 0
  #endif

  #if HWY_ARCH_X86 && (HWY_WANT_SSE4 || (HWY_CHECK_SSE4 && HWY_CHECK_PCLMUL_AES))
  #define HWY_BASELINE_SSE4 HWY_SSE4
  #else
  #define HWY_BASELINE_SSE4 0
  #endif

  #if HWY_BASELINE_SSE4 != 0 && HWY_CHECK_BMI2_FMA && HWY_CHECK_F16C && \
      defined(__AVX2__)
  #define HWY_BASELINE_AVX2 HWY_AVX2
  #else
  #define HWY_BASELINE_AVX2 0
  #endif

  // Require everything in AVX2 plus AVX-512 flags (also set by MSVC)
  #if HWY_BASELINE_AVX2 != 0 && defined(__AVX512F__) && defined(__AVX512BW__) && \
      defined(__AVX512DQ__) && defined(__AVX512VL__)
  #define HWY_BASELINE_AVX3 HWY_AVX3
  #else
  #define HWY_BASELINE_AVX3 0
  #endif

  // TODO(janwas): not yet known whether these will be set by MSVC
  #if HWY_BASELINE_AVX3 != 0 && defined(__AVX512VNNI__) && defined(__VAES__) && \
      defined(__VPCLMULQDQ__) && defined(__AVX512VBMI__) &&                     \
      defined(__AVX512VBMI2__) && defined(__AVX512VPOPCNTDQ__) &&               \
      defined(__AVX512BITALG__)
  #define HWY_BASELINE_AVX3_DL HWY_AVX3_DL
  #else
  #define HWY_BASELINE_AVX3_DL 0
  #endif

  // The ZEN4-optimized AVX3 target is numerically lower than AVX3_DL and is thus
  // considered better. Do not enable it unless the user explicitly requests it -
  // we do not want to choose the ZEN4 path on Intel because it could be slower.
  #if defined(HWY_WANT_AVX3_ZEN4) && HWY_BASELINE_AVX3_DL != 0
  #define HWY_BASELINE_AVX3_ZEN4 HWY_AVX3_ZEN4
  #else
  #define HWY_BASELINE_AVX3_ZEN4 0
  #endif

  #if HWY_BASELINE_AVX3_DL != 0 && defined(__AVX512BF16__) && \
      defined(__AVX512FP16__)
  #define HWY_BASELINE_AVX3_SPR HWY_AVX3_SPR
  #else
  #define HWY_BASELINE_AVX3_SPR 0
  #endif

  // RVV requires intrinsics 0.11 or later, see #1156.
  #if HWY_ARCH_RISCV && defined(__riscv_v_intrinsic) && \
      __riscv_v_intrinsic >= 11000
  #define HWY_BASELINE_RVV HWY_RVV
  #else
  #define HWY_BASELINE_RVV 0
  #endif

  // Allow the user to override this without any guarantee of success.
  #ifndef HWY_BASELINE_TARGETS
  #define HWY_BASELINE_TARGETS                                               \
    (HWY_BASELINE_SCALAR | HWY_BASELINE_WASM | HWY_BASELINE_PPC8 |           \
     HWY_BASELINE_PPC9 | HWY_BASELINE_PPC10 | HWY_BASELINE_Z14 |             \
     HWY_BASELINE_Z15 | HWY_BASELINE_SVE2 | HWY_BASELINE_SVE |               \
     HWY_BASELINE_NEON | HWY_BASELINE_SSE2 | HWY_BASELINE_SSSE3 |            \
     HWY_BASELINE_SSE4 | HWY_BASELINE_AVX2 | HWY_BASELINE_AVX3 |             \
     HWY_BASELINE_AVX3_DL | HWY_BASELINE_AVX3_ZEN4 | HWY_BASELINE_AVX3_SPR | \
     HWY_BASELINE_RVV)
  #endif  // HWY_BASELINE_TARGETS
  468. //------------------------------------------------------------------------------
  469. // Choose target for static dispatch
  470. #define HWY_ENABLED_BASELINE HWY_ENABLED(HWY_BASELINE_TARGETS)
  471. #if HWY_ENABLED_BASELINE == 0
  472. #error "At least one baseline target must be defined and enabled"
  473. #endif
  474. // Best baseline, used for static dispatch. This is the least-significant 1-bit
  475. // within HWY_ENABLED_BASELINE and lower bit values imply "better".
  476. #define HWY_STATIC_TARGET (HWY_ENABLED_BASELINE & -HWY_ENABLED_BASELINE)
  477. // Start by assuming static dispatch. If we later use dynamic dispatch, this
  478. // will be defined to other targets during the multiple-inclusion, and finally
  479. // return to the initial value. Defining this outside begin/end_target ensures
  480. // inl headers successfully compile by themselves (required by Bazel).
  481. #define HWY_TARGET HWY_STATIC_TARGET
  //------------------------------------------------------------------------------
  // Choose targets for dynamic dispatch according to one of four policies

  // The HWY_COMPILE_ONLY_* policies are mutually exclusive.
  #if 1 < (defined(HWY_COMPILE_ONLY_SCALAR) + defined(HWY_COMPILE_ONLY_EMU128) + \
           defined(HWY_COMPILE_ONLY_STATIC))
  #error "Can only define one of HWY_COMPILE_ONLY_{SCALAR|EMU128|STATIC} - bug?"
  #endif
  // Defining one of HWY_COMPILE_ONLY_* will trump HWY_COMPILE_ALL_ATTAINABLE.

  // HWY_HAVE_AUXV: 1 if <sys/auxv.h> was found, otherwise 0.
  #ifndef HWY_HAVE_AUXV  // allow override
  #ifdef TOOLCHAIN_MISS_SYS_AUXV_H
  #define HWY_HAVE_AUXV 0  // CMake failed to find the header
  // glibc 2.16 added auxv, but checking for that requires features.h, and we do
  // not want to include system headers here. Instead check for the header
  // directly, which has been supported at least since GCC 5.4 and Clang 3.
  #elif defined(__has_include)  // note: wrapper macro fails on Clang ~17
  // clang-format off
  #if __has_include(<sys/auxv.h>)
  // clang-format on
  #define HWY_HAVE_AUXV 1  // header present
  #else
  #define HWY_HAVE_AUXV 0  // header not present
  #endif  // __has_include
  #else  // compiler lacks __has_include
  #define HWY_HAVE_AUXV 0
  #endif
  #endif  // HWY_HAVE_AUXV

  // Allow opting out, and without a guarantee of success, opting-in.
  #ifndef HWY_HAVE_RUNTIME_DISPATCH
  // Clang, GCC and MSVC allow runtime dispatch on x86.
  #if HWY_ARCH_X86
  #define HWY_HAVE_RUNTIME_DISPATCH 1
  // On Arm, PPC, S390X, and RISC-V: GCC and Clang 17+ do, and we require Linux
  // to detect CPU capabilities.
  #elif (HWY_ARCH_ARM || HWY_ARCH_PPC || HWY_ARCH_S390X || HWY_ARCH_RISCV) &&   \
      (HWY_COMPILER_GCC_ACTUAL || HWY_COMPILER_CLANG >= 1700) && HWY_OS_LINUX && \
      HWY_HAVE_AUXV
  #define HWY_HAVE_RUNTIME_DISPATCH 1
  // Apple on AArch64 is accepted without the Linux and auxv requirements.
  #elif HWY_ARCH_ARM_A64 && HWY_OS_APPLE && \
      (HWY_COMPILER_GCC_ACTUAL || HWY_COMPILER_CLANG >= 1700)
  #define HWY_HAVE_RUNTIME_DISPATCH 1
  #else
  #define HWY_HAVE_RUNTIME_DISPATCH 0
  #endif  // HWY_ARCH_*
  #endif  // HWY_HAVE_RUNTIME_DISPATCH
  // AVX3_DL is not widely available yet. To reduce code size and compile time,
  // only include it in the set of attainable targets (for dynamic dispatch) if
  // the user opts in, OR it is in the baseline (we check whether enabled below).
  #if defined(HWY_WANT_AVX3_DL) || (HWY_BASELINE_TARGETS & HWY_AVX3_DL)
  #define HWY_ATTAINABLE_AVX3_DL (HWY_AVX3_DL)
  #else
  #define HWY_ATTAINABLE_AVX3_DL 0
  #endif

  #if HWY_ARCH_ARM_A64 && HWY_HAVE_RUNTIME_DISPATCH
  #define HWY_ATTAINABLE_NEON HWY_ALL_NEON
  #elif HWY_ARCH_ARM  // static dispatch, or HWY_ARCH_ARM_V7
  #define HWY_ATTAINABLE_NEON (HWY_BASELINE_NEON)
  #else
  #define HWY_ATTAINABLE_NEON 0
  #endif

  // SVE is attainable with a sufficiently recent compiler if we can dispatch at
  // runtime, or if it is already part of the enabled baseline.
  #if HWY_ARCH_ARM_A64 &&                                              \
      (HWY_COMPILER_CLANG >= 900 || HWY_COMPILER_GCC_ACTUAL >= 800) && \
      (HWY_HAVE_RUNTIME_DISPATCH ||                                    \
       (HWY_ENABLED_BASELINE & (HWY_SVE | HWY_SVE_256)))
  #define HWY_ATTAINABLE_SVE (HWY_SVE | HWY_SVE_256)
  #else
  #define HWY_ATTAINABLE_SVE 0
  #endif

  #if HWY_ARCH_ARM_A64 &&                                                \
      (HWY_COMPILER_CLANG >= 1400 || HWY_COMPILER_GCC_ACTUAL >= 1200) && \
      (HWY_HAVE_RUNTIME_DISPATCH ||                                      \
       (HWY_ENABLED_BASELINE & (HWY_SVE2 | HWY_SVE2_128)))
  #define HWY_ATTAINABLE_SVE2 (HWY_SVE2 | HWY_SVE2_128)
  #else
  #define HWY_ATTAINABLE_SVE2 0
  #endif

  #if HWY_ARCH_PPC && defined(__ALTIVEC__) && \
      (!HWY_COMPILER_CLANG || HWY_BASELINE_PPC8 != 0)

  #if (HWY_BASELINE_PPC9 | HWY_BASELINE_PPC10) && \
      !defined(HWY_SKIP_NON_BEST_BASELINE)
  // On POWER with -m flags, we get compile errors (#1707) for targets older than
  // the baseline specified via -m, so only generate the static target and better.
  // Note that some Linux distros actually do set POWER9 as the baseline.
  // This works by skipping case 3 below, so case 4 is reached.
  #define HWY_SKIP_NON_BEST_BASELINE
  #endif

  #define HWY_ATTAINABLE_PPC (HWY_PPC8 | HWY_PPC9 | HWY_PPC10)

  #else
  #define HWY_ATTAINABLE_PPC 0
  #endif

  #if HWY_ARCH_S390X && HWY_BASELINE_Z14 != 0
  #define HWY_ATTAINABLE_S390X (HWY_Z14 | HWY_Z15)
  #else
  #define HWY_ATTAINABLE_S390X 0
  #endif

  // With runtime dispatch, RVV is always attainable. Without it, RVV is only
  // attainable if it is already in the baseline: the static target must be part
  // of the attainable set, otherwise the final HWY_TARGETS sanity check fails
  // for builds whose baseline is RVV.
  #if HWY_ARCH_RISCV && HWY_HAVE_RUNTIME_DISPATCH
  #define HWY_ATTAINABLE_RISCV (HWY_RVV)
  #else
  #define HWY_ATTAINABLE_RISCV (HWY_BASELINE_RVV)
  #endif

  // Attainable means enabled and the compiler allows intrinsics (even when not
  // allowed to autovectorize). Used in policies 3 and 4 when choosing
  // HWY_TARGETS.
  #if HWY_ARCH_X86
  #if HWY_COMPILER_MSVC
  // Fewer targets for faster builds.
  #define HWY_ATTAINABLE_TARGETS \
    HWY_ENABLED(HWY_BASELINE_SCALAR | HWY_STATIC_TARGET | HWY_AVX2)
  #else  // !HWY_COMPILER_MSVC
  #define HWY_ATTAINABLE_TARGETS                                        \
    HWY_ENABLED(HWY_BASELINE_SCALAR | HWY_SSE2 | HWY_SSSE3 | HWY_SSE4 | \
                HWY_AVX2 | HWY_AVX3 | HWY_ATTAINABLE_AVX3_DL |          \
                HWY_AVX3_ZEN4 | HWY_AVX3_SPR)
  #endif  // !HWY_COMPILER_MSVC

  #elif HWY_ARCH_ARM
  #define HWY_ATTAINABLE_TARGETS                                  \
    HWY_ENABLED(HWY_BASELINE_SCALAR | HWY_ATTAINABLE_NEON |       \
                HWY_ATTAINABLE_SVE | HWY_ATTAINABLE_SVE2)
  #elif HWY_ARCH_PPC
  #define HWY_ATTAINABLE_TARGETS \
    HWY_ENABLED(HWY_BASELINE_SCALAR | HWY_ATTAINABLE_PPC)
  #elif HWY_ARCH_S390X
  #define HWY_ATTAINABLE_TARGETS \
    HWY_ENABLED(HWY_BASELINE_SCALAR | HWY_ATTAINABLE_S390X)
  // Same architecture macro as the other RISC-V checks in this file.
  #elif HWY_ARCH_RISCV
  #define HWY_ATTAINABLE_TARGETS \
    HWY_ENABLED(HWY_BASELINE_SCALAR | HWY_ATTAINABLE_RISCV)
  #else
  #define HWY_ATTAINABLE_TARGETS (HWY_ENABLED_BASELINE)
  #endif  // HWY_ARCH_*
  609. // 1) For older compilers: avoid SIMD intrinsics, but still support all ops.
  610. #if defined(HWY_COMPILE_ONLY_EMU128) && !HWY_BROKEN_EMU128
  611. #undef HWY_STATIC_TARGET
  612. #define HWY_STATIC_TARGET HWY_EMU128 // override baseline
  613. #define HWY_TARGETS HWY_EMU128
  614. // 1b) HWY_SCALAR is less capable than HWY_EMU128 (which supports all ops), but
  615. // we currently still support it for backwards compatibility.
  616. #elif defined(HWY_COMPILE_ONLY_SCALAR) || \
  617. (defined(HWY_COMPILE_ONLY_EMU128) && HWY_BROKEN_EMU128)
  618. #undef HWY_STATIC_TARGET
  619. #define HWY_STATIC_TARGET HWY_SCALAR // override baseline
  620. #define HWY_TARGETS HWY_SCALAR
  621. // 2) For forcing static dispatch without code changes (removing HWY_EXPORT)
  622. #elif defined(HWY_COMPILE_ONLY_STATIC)
  623. #define HWY_TARGETS HWY_STATIC_TARGET
  624. // 3) For tests: include all attainable targets (in particular: scalar)
  625. #elif (defined(HWY_COMPILE_ALL_ATTAINABLE) || defined(HWY_IS_TEST)) && \
  626. !defined(HWY_SKIP_NON_BEST_BASELINE)
  627. #define HWY_TARGETS HWY_ATTAINABLE_TARGETS
  628. // 4) Default: attainable WITHOUT non-best baseline. This reduces code size by
  629. // excluding superseded targets, in particular scalar. Note: HWY_STATIC_TARGET
  630. // may be 2^62 (HWY_SCALAR), so we must not left-shift/add it. Subtracting one
  631. // sets all lower bits (better targets), then we also include the static target.
  632. #else
  633. #define HWY_TARGETS \
  634. (HWY_ATTAINABLE_TARGETS & ((HWY_STATIC_TARGET - 1LL) | HWY_STATIC_TARGET))
  635. #endif // target policy
  636. // HWY_ONCE and the multiple-inclusion mechanism rely on HWY_STATIC_TARGET being
  637. // one of the dynamic targets. This also implies HWY_TARGETS != 0 and
  638. // (HWY_TARGETS & HWY_ENABLED_BASELINE) != 0.
  639. #if (HWY_TARGETS & HWY_STATIC_TARGET) == 0
  640. #error "Logic error: best baseline should be included in dynamic targets"
  641. #endif
  #endif  // HIGHWAY_HWY_DETECT_TARGETS_H_