targets.h 14 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344345346
  1. // Copyright 2020 Google LLC
  2. // SPDX-License-Identifier: Apache-2.0
  3. //
  4. // Licensed under the Apache License, Version 2.0 (the "License");
  5. // you may not use this file except in compliance with the License.
  6. // You may obtain a copy of the License at
  7. //
  8. // http://www.apache.org/licenses/LICENSE-2.0
  9. //
  10. // Unless required by applicable law or agreed to in writing, software
  11. // distributed under the License is distributed on an "AS IS" BASIS,
  12. // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
  13. // See the License for the specific language governing permissions and
  14. // limitations under the License.
  15. #ifndef HIGHWAY_HWY_TARGETS_H_
  16. #define HIGHWAY_HWY_TARGETS_H_
  17. // Allows opting out of C++ standard library usage, which is not available in
  18. // some Compiler Explorer environments.
  19. #ifndef HWY_NO_LIBCXX
  20. #include <vector>
  21. #endif
  22. // For SIMD module implementations and their callers. Defines which targets to
  23. // generate and call.
  24. #include "hwy/base.h"
  25. #include "hwy/detect_targets.h"
  26. #include "hwy/highway_export.h"
  27. #if !defined(HWY_NO_LIBCXX)
  28. #include <atomic>
  29. #endif
  30. namespace hwy {
  31. // Returns a bitfield of enabled targets that are supported on this CPU; there
  32. // is always at least one such target, hence the return value is never 0. The
  33. // targets returned may change after calling DisableTargets. This function is
  34. // always defined, but the HWY_SUPPORTED_TARGETS wrapper may allow eliding
  35. // calls to it if there is only a single target enabled.
  36. HWY_DLLEXPORT int64_t SupportedTargets();
  37. // Evaluates to a function call, or a literal if there is a single target.
  // `x & (x - 1)` clears the lowest set bit of x, so the condition below holds
  // exactly when HWY_TARGETS has at most one bit set. In that case the macro
  // expands to the compile-time constant HWY_TARGETS rather than a call.
  38. #if (HWY_TARGETS & (HWY_TARGETS - 1)) == 0
  39. #define HWY_SUPPORTED_TARGETS HWY_TARGETS
  40. #else
  41. #define HWY_SUPPORTED_TARGETS hwy::SupportedTargets()
  42. #endif
  43. // Subsequent SupportedTargets will not return targets whose bit(s) are set in
  44. // `disabled_targets`. Exception: if SupportedTargets would return 0, it will
  45. // instead return HWY_STATIC_TARGET (there must always be one target to call).
  46. //
  47. // This function is useful for disabling targets known to be buggy, or if the
  48. // best available target is undesirable (perhaps due to throttling or memory
  49. // bandwidth limitations). Use SetSupportedTargetsForTest instead of this
  50. // function for iteratively enabling specific targets for testing.
  //
  // `disabled_targets`: bitfield of targets to exclude from the result of
  // subsequent SupportedTargets calls.
  51. HWY_DLLEXPORT void DisableTargets(int64_t disabled_targets);
  52. // Subsequent SupportedTargets will return the given set of targets, except
  53. // those disabled via DisableTargets. Call with a mask of 0 to disable the mock
  54. // and return to the normal SupportedTargets behavior. Used to run tests for
  55. // all targets.
  //
  // `targets`: bitfield of targets to report as supported, or 0 to remove the
  // mock.
  56. HWY_DLLEXPORT void SetSupportedTargetsForTest(int64_t targets);
  57. #ifndef HWY_NO_LIBCXX
  58. // Return the list of targets in HWY_TARGETS supported by the CPU as a list of
  59. // individual HWY_* target macros such as HWY_SCALAR or HWY_NEON. This list
  60. // is affected by the current SetSupportedTargetsForTest() mock if any.
  61. HWY_INLINE std::vector<int64_t> SupportedAndGeneratedTargets() {
  62. std::vector<int64_t> ret;
  63. for (int64_t targets = SupportedTargets() & HWY_TARGETS; targets != 0;
  64. targets = targets & (targets - 1)) {
  65. int64_t current_target = targets & ~(targets - 1);
  66. ret.push_back(current_target);
  67. }
  68. return ret;
  69. }
  70. #endif // HWY_NO_LIBCXX
  71. static inline HWY_MAYBE_UNUSED const char* TargetName(int64_t target) {
  72. switch (target) {
  73. #if HWY_ARCH_X86
  74. case HWY_SSE2:
  75. return "SSE2";
  76. case HWY_SSSE3:
  77. return "SSSE3";
  78. case HWY_SSE4:
  79. return "SSE4";
  80. case HWY_AVX2:
  81. return "AVX2";
  82. case HWY_AVX3:
  83. return "AVX3";
  84. case HWY_AVX3_DL:
  85. return "AVX3_DL";
  86. case HWY_AVX3_ZEN4:
  87. return "AVX3_ZEN4";
  88. case HWY_AVX3_SPR:
  89. return "AVX3_SPR";
  90. #endif
  91. #if HWY_ARCH_ARM
  92. case HWY_SVE2_128:
  93. return "SVE2_128";
  94. case HWY_SVE_256:
  95. return "SVE_256";
  96. case HWY_SVE2:
  97. return "SVE2";
  98. case HWY_SVE:
  99. return "SVE";
  100. case HWY_NEON_BF16:
  101. return "NEON_BF16";
  102. case HWY_NEON:
  103. return "NEON";
  104. case HWY_NEON_WITHOUT_AES:
  105. return "NEON_WITHOUT_AES";
  106. #endif
  107. #if HWY_ARCH_PPC
  108. case HWY_PPC8:
  109. return "PPC8";
  110. case HWY_PPC9:
  111. return "PPC9";
  112. case HWY_PPC10:
  113. return "PPC10";
  114. #endif
  115. #if HWY_ARCH_S390X
  116. case HWY_Z14:
  117. return "Z14";
  118. case HWY_Z15:
  119. return "Z15";
  120. #endif
  121. #if HWY_ARCH_WASM
  122. case HWY_WASM:
  123. return "WASM";
  124. case HWY_WASM_EMU256:
  125. return "WASM_EMU256";
  126. #endif
  127. #if HWY_ARCH_RISCV
  128. case HWY_RVV:
  129. return "RVV";
  130. #endif
  131. case HWY_EMU128:
  132. return "EMU128";
  133. case HWY_SCALAR:
  134. return "SCALAR";
  135. default:
  136. return "Unknown"; // must satisfy gtest IsValidParamName()
  137. }
  138. }
  139. // The maximum number of dynamic targets on any architecture is defined by
  140. // HWY_MAX_DYNAMIC_TARGETS and depends on the arch.
  141. // For the ChosenTarget mask and index we use a different bit arrangement than
  142. // in the HWY_TARGETS mask. Only the targets involved in the current
  143. // architecture are used in this mask, and therefore only the least significant
  144. // (HWY_MAX_DYNAMIC_TARGETS + 2) bits of the int64_t mask are used. The least
  145. // significant bit is set when the mask is not initialized, the next
  146. // HWY_MAX_DYNAMIC_TARGETS more significant bits are a range of bits from the
  147. // HWY_TARGETS or SupportedTargets() mask for the given architecture shifted to
  148. // that position and the next more significant bit is used for HWY_SCALAR (if
  149. // HWY_COMPILE_ONLY_SCALAR is defined) or HWY_EMU128. Because of this we need to
  150. // define equivalent values for HWY_TARGETS in this representation.
  151. // This mask representation allows using ctz() on this mask to obtain a small
  152. // number that's used as an index into the table for dynamic dispatch. In this
  153. // way the first entry is used when the mask is uninitialized, the following
  154. // HWY_MAX_DYNAMIC_TARGETS are for dynamic dispatch and the last one is for
  155. // scalar.
  //
  // Layout summary (bit 0 is the least significant):
  //   bit 0                              : set while the mask is uninitialized
  //   bits 1 .. HWY_MAX_DYNAMIC_TARGETS  : dynamic targets of this architecture
  //   bit HWY_MAX_DYNAMIC_TARGETS + 1    : HWY_SCALAR or HWY_EMU128
  //
  // The macros below refer to HWY_MAX_DYNAMIC_TARGETS and
  // HWY_HIGHEST_TARGET_BIT, which are defined further down in this header; this
  // works because macro bodies are only expanded at their point of use.
  //
  156. // The HWY_SCALAR/HWY_EMU128 bit in the ChosenTarget mask format.
  157. #define HWY_CHOSEN_TARGET_MASK_SCALAR (1LL << (HWY_MAX_DYNAMIC_TARGETS + 1))
  158. // Converts from a HWY_TARGETS mask to a ChosenTarget mask format for the
  159. // current architecture.
  // The right shift moves bit (HWY_HIGHEST_TARGET_BIT + 1 -
  // HWY_MAX_DYNAMIC_TARGETS) of X down to bit 0, the AND keeps only
  // HWY_MAX_DYNAMIC_TARGETS bits, and the final left shift by one leaves bit 0
  // free for the "uninitialized" flag.
  160. #define HWY_CHOSEN_TARGET_SHIFT(X) \
  161. ((((X) >> (HWY_HIGHEST_TARGET_BIT + 1 - HWY_MAX_DYNAMIC_TARGETS)) & \
  162. ((1LL << HWY_MAX_DYNAMIC_TARGETS) - 1)) \
  163. << 1)
  164. // The HWY_TARGETS mask in the ChosenTarget mask format.
  // Also includes the scalar bit and bit 0, so ANDing a ChosenTarget mask with
  // this value never clears the fallback entry or the "uninitialized" flag.
  165. #define HWY_CHOSEN_TARGET_MASK_TARGETS \
  166. (HWY_CHOSEN_TARGET_SHIFT(HWY_TARGETS) | HWY_CHOSEN_TARGET_MASK_SCALAR | 1LL)
  // Per-architecture dispatch-table configuration. Each known architecture
  // defines HWY_MAX_DYNAMIC_TARGETS, HWY_HIGHEST_TARGET_BIT and
  // HWY_CHOOSE_TARGET_LIST; the fallback branch defines only the first two.
  167. #if HWY_ARCH_X86
  168. // Maximum number of dynamic targets. Changing this value is an
  169. // ABI-incompatible change.
  170. #define HWY_MAX_DYNAMIC_TARGETS 15
  171. #define HWY_HIGHEST_TARGET_BIT HWY_HIGHEST_TARGET_BIT_X86
  172. // These must match the order in which the HWY_TARGETS are defined,
  173. // starting with the least significant (HWY_HIGHEST_TARGET_BIT + 1 -
  174. // HWY_MAX_DYNAMIC_TARGETS) bit. This list must contain exactly
  175. // HWY_MAX_DYNAMIC_TARGETS elements and does not include SCALAR. The first entry
  176. // corresponds to the best target. Don't include a "," at the end of the list.
  // Entries given as nullptr are the slots annotated as reserved (or, for AVX,
  // a slot with no HWY_CHOOSE_* macro). The list below has 15 entries.
  177. #define HWY_CHOOSE_TARGET_LIST(func_name) \
  178. nullptr, /* reserved */ \
  179. nullptr, /* reserved */ \
  180. nullptr, /* reserved */ \
  181. nullptr, /* reserved */ \
  182. HWY_CHOOSE_AVX3_SPR(func_name), /* AVX3_SPR */ \
  183. nullptr, /* reserved */ \
  184. HWY_CHOOSE_AVX3_ZEN4(func_name), /* AVX3_ZEN4 */ \
  185. HWY_CHOOSE_AVX3_DL(func_name), /* AVX3_DL */ \
  186. HWY_CHOOSE_AVX3(func_name), /* AVX3 */ \
  187. HWY_CHOOSE_AVX2(func_name), /* AVX2 */ \
  188. nullptr, /* AVX */ \
  189. HWY_CHOOSE_SSE4(func_name), /* SSE4 */ \
  190. HWY_CHOOSE_SSSE3(func_name), /* SSSE3 */ \
  191. nullptr, /* reserved - SSE3? */ \
  192. HWY_CHOOSE_SSE2(func_name) /* SSE2 */
  193. #elif HWY_ARCH_ARM
  194. // See HWY_ARCH_X86 above for details. The list below has 15 entries.
  195. #define HWY_MAX_DYNAMIC_TARGETS 15
  196. #define HWY_HIGHEST_TARGET_BIT HWY_HIGHEST_TARGET_BIT_ARM
  197. #define HWY_CHOOSE_TARGET_LIST(func_name) \
  198. nullptr, /* reserved */ \
  199. nullptr, /* reserved */ \
  200. nullptr, /* reserved */ \
  201. HWY_CHOOSE_SVE2_128(func_name), /* SVE2 128-bit */ \
  202. HWY_CHOOSE_SVE_256(func_name), /* SVE 256-bit */ \
  203. nullptr, /* reserved */ \
  204. nullptr, /* reserved */ \
  205. nullptr, /* reserved */ \
  206. HWY_CHOOSE_SVE2(func_name), /* SVE2 */ \
  207. HWY_CHOOSE_SVE(func_name), /* SVE */ \
  208. nullptr, /* reserved */ \
  209. HWY_CHOOSE_NEON_BF16(func_name), /* NEON + f16/dot/bf16 */ \
  210. nullptr, /* reserved */ \
  211. HWY_CHOOSE_NEON(func_name), /* NEON */ \
  212. HWY_CHOOSE_NEON_WITHOUT_AES(func_name) /* NEON without AES */
  213. #elif HWY_ARCH_RISCV
  214. // See HWY_ARCH_X86 above for details. The list below has 9 entries.
  215. #define HWY_MAX_DYNAMIC_TARGETS 9
  216. #define HWY_HIGHEST_TARGET_BIT HWY_HIGHEST_TARGET_BIT_RVV
  217. #define HWY_CHOOSE_TARGET_LIST(func_name) \
  218. nullptr, /* reserved */ \
  219. nullptr, /* reserved */ \
  220. nullptr, /* reserved */ \
  221. nullptr, /* reserved */ \
  222. nullptr, /* reserved */ \
  223. nullptr, /* reserved */ \
  224. nullptr, /* reserved */ \
  225. HWY_CHOOSE_RVV(func_name), /* RVV */ \
  226. nullptr /* reserved */
  227. #elif HWY_ARCH_PPC || HWY_ARCH_S390X
  228. // See HWY_ARCH_X86 above for details. PPC and S390X (Z14/Z15) share one list
  // and both use HWY_HIGHEST_TARGET_BIT_PPC. The list below has 9 entries.
  229. #define HWY_MAX_DYNAMIC_TARGETS 9
  230. #define HWY_HIGHEST_TARGET_BIT HWY_HIGHEST_TARGET_BIT_PPC
  231. #define HWY_CHOOSE_TARGET_LIST(func_name) \
  232. nullptr, /* reserved */ \
  233. nullptr, /* reserved */ \
  234. nullptr, /* reserved */ \
  235. nullptr, /* reserved */ \
  236. HWY_CHOOSE_PPC10(func_name), /* PPC10 */ \
  237. HWY_CHOOSE_PPC9(func_name), /* PPC9 */ \
  238. HWY_CHOOSE_PPC8(func_name), /* PPC8 */ \
  239. HWY_CHOOSE_Z15(func_name), /* Z15 */ \
  240. HWY_CHOOSE_Z14(func_name) /* Z14 */
  241. #elif HWY_ARCH_WASM
  242. // See HWY_ARCH_X86 above for details. The list below has 9 entries.
  243. #define HWY_MAX_DYNAMIC_TARGETS 9
  244. #define HWY_HIGHEST_TARGET_BIT HWY_HIGHEST_TARGET_BIT_WASM
  245. #define HWY_CHOOSE_TARGET_LIST(func_name) \
  246. nullptr, /* reserved */ \
  247. nullptr, /* reserved */ \
  248. nullptr, /* reserved */ \
  249. nullptr, /* reserved */ \
  250. nullptr, /* reserved */ \
  251. nullptr, /* reserved */ \
  252. HWY_CHOOSE_WASM_EMU256(func_name), /* WASM_EMU256 */ \
  253. HWY_CHOOSE_WASM(func_name), /* WASM */ \
  254. nullptr /* reserved */
  255. #else
  256. // Unknown architecture, will use HWY_SCALAR without dynamic dispatch, though
  257. // still creating single-entry tables in HWY_EXPORT to ensure portability.
  // Note: HWY_CHOOSE_TARGET_LIST is intentionally not defined in this branch.
  258. #define HWY_MAX_DYNAMIC_TARGETS 1
  259. #define HWY_HIGHEST_TARGET_BIT HWY_HIGHEST_TARGET_BIT_SCALAR
  260. #endif
  261. // Bitfield of supported and enabled targets. The format differs from that of
  262. // HWY_TARGETS; the lowest bit governs the first function pointer (which is
  263. // special in that it calls FunctionCache, then Update, then dispatches to the
  264. // actual implementation) in the tables created by HWY_EXPORT. Monostate (see
  265. // GetChosenTarget), thread-safe except on RVV.
  266. struct ChosenTarget {
  267. public:
  268. // Reset bits according to `targets` (typically the return value of
  269. // SupportedTargets()). Postcondition: IsInitialized() == true.
  270. void Update(int64_t targets) {
  271. // These are `targets` shifted downwards, see above. Also include SCALAR
  272. // (corresponds to the last entry in the function table) as fallback.
  273. StoreMask(HWY_CHOSEN_TARGET_SHIFT(targets) | HWY_CHOSEN_TARGET_MASK_SCALAR);
  274. }
  275. // Reset to the uninitialized state, so that FunctionCache will call Update
  276. // during the next HWY_DYNAMIC_DISPATCH, and IsInitialized returns false.
  277. void DeInit() { StoreMask(1); }
  278. // Whether Update was called. This indicates whether any HWY_DYNAMIC_DISPATCH
  279. // function was called, which we check in tests.
  280. bool IsInitialized() const { return LoadMask() != 1; }
  281. // Return the index in the dynamic dispatch table to be used by the current
  282. // CPU. Note that this method must be in the header file so it uses the value
  283. // of HWY_CHOSEN_TARGET_MASK_TARGETS defined in the translation unit that
  284. // calls it, which may be different from others. This means we only enable
  285. // those targets that were actually compiled in this module.
  286. size_t HWY_INLINE GetIndex() const {
  287. return hwy::Num0BitsBelowLS1Bit_Nonzero64(
  288. static_cast<uint64_t>(LoadMask() & HWY_CHOSEN_TARGET_MASK_TARGETS));
  289. }
  290. private:
  291. #if defined(HWY_NO_LIBCXX)
  292. int64_t LoadMask() const { return mask_; }
  293. void StoreMask(int64_t mask) { mask_ = mask; }
  294. int64_t mask_{1}; // Initialized to 1 so GetIndex() returns 0.
  295. #else
  296. int64_t LoadMask() const { return mask_.load(); }
  297. void StoreMask(int64_t mask) { mask_.store(mask); }
  298. std::atomic<int64_t> mask_{1}; // Initialized to 1 so GetIndex() returns 0.
  299. #endif // HWY_ARCH_RISCV
  300. };
  301. // For internal use (e.g. by FunctionCache and DisableTargets).
  // Returns a mutable reference to the ChosenTarget state.
  302. HWY_DLLEXPORT ChosenTarget& GetChosenTarget();
  303. } // namespace hwy
  304. #endif // HIGHWAY_HWY_TARGETS_H_