highway.h 24 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344345346347348349350351352353354355356357358359360361362363364365366367368369370371372373374375376377378379380381382383384385386387388389390391392393394395396397398399400401402403404405406407408409410411412413414415416417418419420421422423424425426427428429430431432433434435436437438439440441442443444445446447448449450451452453454455456457458459460461462463464465466467468469470471472473474475476477478479480481482483484485486487488489490491492493494495496497498499500501502503504505506507508509510511512513514515516517518519520521522523524525526527528529530531532533534535536537538539540541542543544545546547548549550551552553554555556557558559560561562563564565566567568569570571572573574575576577578579580581582583584585586587588589590591592593594595596597598599600601602603604605606607
  1. // Copyright 2020 Google LLC
  2. // SPDX-License-Identifier: Apache-2.0
  3. //
  4. // Licensed under the Apache License, Version 2.0 (the "License");
  5. // you may not use this file except in compliance with the License.
  6. // You may obtain a copy of the License at
  7. //
  8. // http://www.apache.org/licenses/LICENSE-2.0
  9. //
  10. // Unless required by applicable law or agreed to in writing, software
  11. // distributed under the License is distributed on an "AS IS" BASIS,
  12. // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
  13. // See the License for the specific language governing permissions and
  14. // limitations under the License.
  15. // Main header required before using vector types.
  16. // IWYU pragma: begin_exports
  17. #include "hwy/base.h"
  18. #include "hwy/detect_compiler_arch.h"
  19. #include "hwy/detect_targets.h"
  20. #include "hwy/highway_export.h"
  21. #include "hwy/targets.h"
  22. // IWYU pragma: end_exports
  23. #if HWY_CXX_LANG < 201703L
  24. #define HWY_DISPATCH_MAP 1
  25. #else
  26. #define HWY_DISPATCH_MAP 0
  27. #endif
  28. // This include guard is checked by foreach_target, so avoid the usual _H_
  29. // suffix to prevent copybara from renaming it. NOTE: ops/*-inl.h are included
  30. // after/outside this include guard.
  31. #ifndef HWY_HIGHWAY_INCLUDED
  32. #define HWY_HIGHWAY_INCLUDED
  33. namespace hwy {
  34. //------------------------------------------------------------------------------
  35. // Shorthand for tags (defined in shared-inl.h) used to select overloads.
  36. // Note that ScalableTag<T> is preferred over HWY_FULL, and CappedTag<T, N> over
  37. // HWY_CAPPED(T, N).
  38. // HWY_FULL(T[,LMUL=1]) is a native vector/group. LMUL is the number of
  39. // registers in the group, and is ignored on targets that do not support groups.
  40. #define HWY_FULL1(T) hwy::HWY_NAMESPACE::ScalableTag<T>
  41. #define HWY_FULL2(T, LMUL) \
  42. hwy::HWY_NAMESPACE::ScalableTag<T, hwy::CeilLog2(HWY_MAX(0, LMUL))>
  43. #define HWY_3TH_ARG(arg1, arg2, arg3, ...) arg3
  44. // Workaround for MSVC grouping __VA_ARGS__ into a single argument
  45. #define HWY_FULL_RECOMPOSER(args_with_paren) HWY_3TH_ARG args_with_paren
  46. // Trailing comma avoids -pedantic false alarm
  47. #define HWY_CHOOSE_FULL(...) \
  48. HWY_FULL_RECOMPOSER((__VA_ARGS__, HWY_FULL2, HWY_FULL1, ))
  49. #define HWY_FULL(...) HWY_CHOOSE_FULL(__VA_ARGS__())(__VA_ARGS__)
  50. // Vector of up to MAX_N lanes. It's better to use full vectors where possible.
  51. #define HWY_CAPPED(T, MAX_N) hwy::HWY_NAMESPACE::CappedTag<T, MAX_N>
  52. //------------------------------------------------------------------------------
  53. // Export user functions for static/dynamic dispatch
  54. // Evaluates to 0 inside a translation unit if it is generating anything but the
  55. // static target (the last one if multiple targets are enabled). Used to prevent
  56. // redefinitions of HWY_EXPORT. Unless foreach_target.h is included, we only
  57. // compile once anyway, so this is 1 unless it is or has been included.
  58. #ifndef HWY_ONCE
  59. #define HWY_ONCE 1
  60. #endif
  61. // HWY_STATIC_DISPATCH(FUNC_NAME) is the namespace-qualified FUNC_NAME for
  62. // HWY_STATIC_TARGET (the only defined namespace unless HWY_TARGET_INCLUDE is
  63. // defined), and can be used to deduce the return type of Choose*.
  64. #if HWY_STATIC_TARGET == HWY_SCALAR
  65. #define HWY_STATIC_DISPATCH(FUNC_NAME) N_SCALAR::FUNC_NAME
  66. #elif HWY_STATIC_TARGET == HWY_EMU128
  67. #define HWY_STATIC_DISPATCH(FUNC_NAME) N_EMU128::FUNC_NAME
  68. #elif HWY_STATIC_TARGET == HWY_RVV
  69. #define HWY_STATIC_DISPATCH(FUNC_NAME) N_RVV::FUNC_NAME
  70. #elif HWY_STATIC_TARGET == HWY_WASM_EMU256
  71. #define HWY_STATIC_DISPATCH(FUNC_NAME) N_WASM_EMU256::FUNC_NAME
  72. #elif HWY_STATIC_TARGET == HWY_WASM
  73. #define HWY_STATIC_DISPATCH(FUNC_NAME) N_WASM::FUNC_NAME
  74. #elif HWY_STATIC_TARGET == HWY_NEON_WITHOUT_AES
  75. #define HWY_STATIC_DISPATCH(FUNC_NAME) N_NEON_WITHOUT_AES::FUNC_NAME
  76. #elif HWY_STATIC_TARGET == HWY_NEON
  77. #define HWY_STATIC_DISPATCH(FUNC_NAME) N_NEON::FUNC_NAME
  78. #elif HWY_STATIC_TARGET == HWY_NEON_BF16
  79. #define HWY_STATIC_DISPATCH(FUNC_NAME) N_NEON_BF16::FUNC_NAME
  80. #elif HWY_STATIC_TARGET == HWY_SVE
  81. #define HWY_STATIC_DISPATCH(FUNC_NAME) N_SVE::FUNC_NAME
  82. #elif HWY_STATIC_TARGET == HWY_SVE2
  83. #define HWY_STATIC_DISPATCH(FUNC_NAME) N_SVE2::FUNC_NAME
  84. #elif HWY_STATIC_TARGET == HWY_SVE_256
  85. #define HWY_STATIC_DISPATCH(FUNC_NAME) N_SVE_256::FUNC_NAME
  86. #elif HWY_STATIC_TARGET == HWY_SVE2_128
  87. #define HWY_STATIC_DISPATCH(FUNC_NAME) N_SVE2_128::FUNC_NAME
  88. #elif HWY_STATIC_TARGET == HWY_PPC8
  89. #define HWY_STATIC_DISPATCH(FUNC_NAME) N_PPC8::FUNC_NAME
  90. #elif HWY_STATIC_TARGET == HWY_PPC9
  91. #define HWY_STATIC_DISPATCH(FUNC_NAME) N_PPC9::FUNC_NAME
  92. #elif HWY_STATIC_TARGET == HWY_PPC10
  93. #define HWY_STATIC_DISPATCH(FUNC_NAME) N_PPC10::FUNC_NAME
  94. #elif HWY_STATIC_TARGET == HWY_Z14
  95. #define HWY_STATIC_DISPATCH(FUNC_NAME) N_Z14::FUNC_NAME
  96. #elif HWY_STATIC_TARGET == HWY_Z15
  97. #define HWY_STATIC_DISPATCH(FUNC_NAME) N_Z15::FUNC_NAME
  98. #elif HWY_STATIC_TARGET == HWY_SSE2
  99. #define HWY_STATIC_DISPATCH(FUNC_NAME) N_SSE2::FUNC_NAME
  100. #elif HWY_STATIC_TARGET == HWY_SSSE3
  101. #define HWY_STATIC_DISPATCH(FUNC_NAME) N_SSSE3::FUNC_NAME
  102. #elif HWY_STATIC_TARGET == HWY_SSE4
  103. #define HWY_STATIC_DISPATCH(FUNC_NAME) N_SSE4::FUNC_NAME
  104. #elif HWY_STATIC_TARGET == HWY_AVX2
  105. #define HWY_STATIC_DISPATCH(FUNC_NAME) N_AVX2::FUNC_NAME
  106. #elif HWY_STATIC_TARGET == HWY_AVX3
  107. #define HWY_STATIC_DISPATCH(FUNC_NAME) N_AVX3::FUNC_NAME
  108. #elif HWY_STATIC_TARGET == HWY_AVX3_DL
  109. #define HWY_STATIC_DISPATCH(FUNC_NAME) N_AVX3_DL::FUNC_NAME
  110. #elif HWY_STATIC_TARGET == HWY_AVX3_ZEN4
  111. #define HWY_STATIC_DISPATCH(FUNC_NAME) N_AVX3_ZEN4::FUNC_NAME
  112. #elif HWY_STATIC_TARGET == HWY_AVX3_SPR
  113. #define HWY_STATIC_DISPATCH(FUNC_NAME) N_AVX3_SPR::FUNC_NAME
  114. #endif
  115. // HWY_CHOOSE_*(FUNC_NAME) expands to the function pointer for that target or
  116. // nullptr if that target was not compiled.
  117. #if HWY_TARGETS & HWY_EMU128
  118. #define HWY_CHOOSE_FALLBACK(FUNC_NAME) &N_EMU128::FUNC_NAME
  119. #elif HWY_TARGETS & HWY_SCALAR
  120. #define HWY_CHOOSE_FALLBACK(FUNC_NAME) &N_SCALAR::FUNC_NAME
  121. #else
  122. // When HWY_SCALAR/HWY_EMU128 are not present and other targets were disabled at
  123. // runtime, fall back to the baseline with HWY_STATIC_DISPATCH().
  124. #define HWY_CHOOSE_FALLBACK(FUNC_NAME) &HWY_STATIC_DISPATCH(FUNC_NAME)
  125. #endif
  126. #if HWY_TARGETS & HWY_WASM_EMU256
  127. #define HWY_CHOOSE_WASM_EMU256(FUNC_NAME) &N_WASM_EMU256::FUNC_NAME
  128. #else
  129. #define HWY_CHOOSE_WASM_EMU256(FUNC_NAME) nullptr
  130. #endif
  131. #if HWY_TARGETS & HWY_WASM
  132. #define HWY_CHOOSE_WASM(FUNC_NAME) &N_WASM::FUNC_NAME
  133. #else
  134. #define HWY_CHOOSE_WASM(FUNC_NAME) nullptr
  135. #endif
  136. #if HWY_TARGETS & HWY_RVV
  137. #define HWY_CHOOSE_RVV(FUNC_NAME) &N_RVV::FUNC_NAME
  138. #else
  139. #define HWY_CHOOSE_RVV(FUNC_NAME) nullptr
  140. #endif
  141. #if HWY_TARGETS & HWY_NEON_WITHOUT_AES
  142. #define HWY_CHOOSE_NEON_WITHOUT_AES(FUNC_NAME) &N_NEON_WITHOUT_AES::FUNC_NAME
  143. #else
  144. #define HWY_CHOOSE_NEON_WITHOUT_AES(FUNC_NAME) nullptr
  145. #endif
  146. #if HWY_TARGETS & HWY_NEON
  147. #define HWY_CHOOSE_NEON(FUNC_NAME) &N_NEON::FUNC_NAME
  148. #else
  149. #define HWY_CHOOSE_NEON(FUNC_NAME) nullptr
  150. #endif
  151. #if HWY_TARGETS & HWY_NEON_BF16
  152. #define HWY_CHOOSE_NEON_BF16(FUNC_NAME) &N_NEON_BF16::FUNC_NAME
  153. #else
  154. #define HWY_CHOOSE_NEON_BF16(FUNC_NAME) nullptr
  155. #endif
  156. #if HWY_TARGETS & HWY_SVE
  157. #define HWY_CHOOSE_SVE(FUNC_NAME) &N_SVE::FUNC_NAME
  158. #else
  159. #define HWY_CHOOSE_SVE(FUNC_NAME) nullptr
  160. #endif
  161. #if HWY_TARGETS & HWY_SVE2
  162. #define HWY_CHOOSE_SVE2(FUNC_NAME) &N_SVE2::FUNC_NAME
  163. #else
  164. #define HWY_CHOOSE_SVE2(FUNC_NAME) nullptr
  165. #endif
  166. #if HWY_TARGETS & HWY_SVE_256
  167. #define HWY_CHOOSE_SVE_256(FUNC_NAME) &N_SVE_256::FUNC_NAME
  168. #else
  169. #define HWY_CHOOSE_SVE_256(FUNC_NAME) nullptr
  170. #endif
  171. #if HWY_TARGETS & HWY_SVE2_128
  172. #define HWY_CHOOSE_SVE2_128(FUNC_NAME) &N_SVE2_128::FUNC_NAME
  173. #else
  174. #define HWY_CHOOSE_SVE2_128(FUNC_NAME) nullptr
  175. #endif
  176. #if HWY_TARGETS & HWY_PPC8
  177. #define HWY_CHOOSE_PPC8(FUNC_NAME) &N_PPC8::FUNC_NAME
  178. #else
  179. #define HWY_CHOOSE_PPC8(FUNC_NAME) nullptr
  180. #endif
  181. #if HWY_TARGETS & HWY_PPC9
  182. #define HWY_CHOOSE_PPC9(FUNC_NAME) &N_PPC9::FUNC_NAME
  183. #else
  184. #define HWY_CHOOSE_PPC9(FUNC_NAME) nullptr
  185. #endif
  186. #if HWY_TARGETS & HWY_PPC10
  187. #define HWY_CHOOSE_PPC10(FUNC_NAME) &N_PPC10::FUNC_NAME
  188. #else
  189. #define HWY_CHOOSE_PPC10(FUNC_NAME) nullptr
  190. #endif
  191. #if HWY_TARGETS & HWY_Z14
  192. #define HWY_CHOOSE_Z14(FUNC_NAME) &N_Z14::FUNC_NAME
  193. #else
  194. #define HWY_CHOOSE_Z14(FUNC_NAME) nullptr
  195. #endif
  196. #if HWY_TARGETS & HWY_Z15
  197. #define HWY_CHOOSE_Z15(FUNC_NAME) &N_Z15::FUNC_NAME
  198. #else
  199. #define HWY_CHOOSE_Z15(FUNC_NAME) nullptr
  200. #endif
  201. #if HWY_TARGETS & HWY_SSE2
  202. #define HWY_CHOOSE_SSE2(FUNC_NAME) &N_SSE2::FUNC_NAME
  203. #else
  204. #define HWY_CHOOSE_SSE2(FUNC_NAME) nullptr
  205. #endif
  206. #if HWY_TARGETS & HWY_SSSE3
  207. #define HWY_CHOOSE_SSSE3(FUNC_NAME) &N_SSSE3::FUNC_NAME
  208. #else
  209. #define HWY_CHOOSE_SSSE3(FUNC_NAME) nullptr
  210. #endif
  211. #if HWY_TARGETS & HWY_SSE4
  212. #define HWY_CHOOSE_SSE4(FUNC_NAME) &N_SSE4::FUNC_NAME
  213. #else
  214. #define HWY_CHOOSE_SSE4(FUNC_NAME) nullptr
  215. #endif
  216. #if HWY_TARGETS & HWY_AVX2
  217. #define HWY_CHOOSE_AVX2(FUNC_NAME) &N_AVX2::FUNC_NAME
  218. #else
  219. #define HWY_CHOOSE_AVX2(FUNC_NAME) nullptr
  220. #endif
  221. #if HWY_TARGETS & HWY_AVX3
  222. #define HWY_CHOOSE_AVX3(FUNC_NAME) &N_AVX3::FUNC_NAME
  223. #else
  224. #define HWY_CHOOSE_AVX3(FUNC_NAME) nullptr
  225. #endif
  226. #if HWY_TARGETS & HWY_AVX3_DL
  227. #define HWY_CHOOSE_AVX3_DL(FUNC_NAME) &N_AVX3_DL::FUNC_NAME
  228. #else
  229. #define HWY_CHOOSE_AVX3_DL(FUNC_NAME) nullptr
  230. #endif
  231. #if HWY_TARGETS & HWY_AVX3_ZEN4
  232. #define HWY_CHOOSE_AVX3_ZEN4(FUNC_NAME) &N_AVX3_ZEN4::FUNC_NAME
  233. #else
  234. #define HWY_CHOOSE_AVX3_ZEN4(FUNC_NAME) nullptr
  235. #endif
  236. #if HWY_TARGETS & HWY_AVX3_SPR
  237. #define HWY_CHOOSE_AVX3_SPR(FUNC_NAME) &N_AVX3_SPR::FUNC_NAME
  238. #else
  239. #define HWY_CHOOSE_AVX3_SPR(FUNC_NAME) nullptr
  240. #endif
  241. // MSVC 2017 workaround: the non-type template parameter to ChooseAndCall
  242. // apparently cannot be an array. Use a function pointer instead, which has the
  243. // disadvantage that we call the static (not best) target on the first call to
  244. // any HWY_DYNAMIC_DISPATCH.
  245. #if (HWY_COMPILER_MSVC && HWY_COMPILER_MSVC < 1915) || \
  246. (HWY_COMPILER_GCC_ACTUAL && HWY_COMPILER_GCC_ACTUAL < 700)
  247. #define HWY_DISPATCH_WORKAROUND 1
  248. #else
  249. #define HWY_DISPATCH_WORKAROUND 0
  250. #endif
  251. #if HWY_DISPATCH_MAP
  252. struct AllExports {
  253. template <class FuncPtr, class ExportsKey, uint64_t kHash>
  254. static const FuncPtr*& GetRefToExportsPtr() {
  255. static const FuncPtr* s_exports = nullptr;
  256. return s_exports;
  257. }
  258. };
  259. #endif
  260. // Provides a static member function which is what is called during the first
  261. // HWY_DYNAMIC_DISPATCH, where GetIndex is still zero, and instantiations of
  262. // this function are the first entry in the tables created by HWY_EXPORT[_T].
  263. template <typename RetType, typename... Args>
  264. struct FunctionCache {
  265. public:
  266. typedef RetType(FuncType)(Args...);
  267. using FuncPtr = FuncType*;
  268. // A template function that when instantiated has the same signature as the
  269. // function being called. This function initializes the bit array of targets
  270. // supported by the current CPU and then calls the appropriate entry within
  271. // the HWY_EXPORT table. Subsequent calls via HWY_DYNAMIC_DISPATCH to any
  272. // exported functions, even those defined by different translation units,
  273. // will dispatch directly to the best available target.
  274. #if HWY_DISPATCH_MAP
  275. template <class ExportsKey, uint64_t kHash>
  276. static RetType ChooseAndCall(Args... args) {
  277. ChosenTarget& chosen_target = GetChosenTarget();
  278. chosen_target.Update(SupportedTargets());
  279. const FuncPtr* table = AllExports::template GetRefToExportsPtr<
  280. FuncPtr, RemoveCvRef<ExportsKey>, kHash>();
  281. HWY_ASSERT(table);
  282. return (table[chosen_target.GetIndex()])(args...);
  283. }
  284. #if !HWY_DISPATCH_WORKAROUND
  285. template <const FuncPtr* table>
  286. static RetType TableChooseAndCall(Args... args) {
  287. ChosenTarget& chosen_target = GetChosenTarget();
  288. chosen_target.Update(SupportedTargets());
  289. return (table[chosen_target.GetIndex()])(args...);
  290. }
  291. #endif // !HWY_DISPATCH_WORKAROUND
  292. #else // !HWY_DISPATCH_MAP: zero-overhead, but requires C++17
  293. template <const FuncPtr* table>
  294. static RetType ChooseAndCall(Args... args) {
  295. ChosenTarget& chosen_target = GetChosenTarget();
  296. chosen_target.Update(SupportedTargets());
  297. return (table[chosen_target.GetIndex()])(args...);
  298. }
  299. #endif // HWY_DISPATCH_MAP
  300. };
  301. // Used to deduce the template parameters RetType and Args from a function.
  302. template <typename RetType, typename... Args>
  303. FunctionCache<RetType, Args...> DeduceFunctionCache(RetType (*)(Args...)) {
  304. return FunctionCache<RetType, Args...>();
  305. }
  306. #define HWY_DISPATCH_TABLE(FUNC_NAME) \
  307. HWY_CONCAT(FUNC_NAME, HighwayDispatchTable)
  308. // HWY_EXPORT(FUNC_NAME); expands to a static array that is used by
  309. // HWY_DYNAMIC_DISPATCH() to call the appropriate function at runtime.
  310. // After being exported, it can be called from other parts of the same source
  311. // file using HWY_DYNAMIC_DISPATCH(), in particular from a function wrapper
  312. // like in the following example:
  313. //
  314. // #include "hwy/highway.h"
  315. // HWY_BEFORE_NAMESPACE();
  316. // namespace skeleton {
  317. // namespace HWY_NAMESPACE {
  318. //
  319. // void MyFunction(int a, char b, const char* c) { ... }
  320. //
  321. // // NOLINTNEXTLINE(google-readability-namespace-comments)
  322. // } // namespace HWY_NAMESPACE
  323. // } // namespace skeleton
  324. // HWY_AFTER_NAMESPACE();
  325. //
  326. // namespace skeleton {
  327. // HWY_EXPORT(MyFunction); // Defines the dispatch table in this scope.
  328. //
  329. // void MyFunction(int a, char b, const char* c) {
  330. // return HWY_DYNAMIC_DISPATCH(MyFunction)(a, b, c);
  331. // }
  332. // } // namespace skeleton
  333. //
  334. // For templated code with a single type parameter, instead use HWY_EXPORT_T and
  335. // its HWY_DYNAMIC_DISPATCH_T counterpart:
  336. //
  337. // template <typename T>
  338. // void MyFunctionCaller(T ...) {
  339. // // First argument to both HWY_EXPORT_T and HWY_DYNAMIC_DISPATCH_T is an
  340. // // arbitrary table name; you must provide the same name for each call.
  341. // // It is fine to have multiple HWY_EXPORT_T in a function, but a 64-bit
  342. // // FNV hash collision among *any* table names will trigger HWY_ABORT.
  343. // HWY_EXPORT_T(Table1, MyFunction<T>);
  344. // HWY_DYNAMIC_DISPATCH_T(Table1)(a, b, c);
  345. // }
  346. //
  347. // Note that HWY_EXPORT_T must be invoked inside a template (in the above
  348. // example: `MyFunctionCaller`), so that a separate table will be created for
  349. // each template instantiation. For convenience, we also provide a macro that
  350. // combines both steps and avoids the need to pick a table name:
  351. //
  352. // template <typename T>
  353. // void MyFunctionCaller(T ...) {
  354. // // Table name is automatically chosen. Note that this variant must be
  355. // // called in statement context; it is not a valid expression.
  356. // HWY_EXPORT_AND_DYNAMIC_DISPATCH_T(MyFunction<T>)(a, b, c);
  357. // }
  358. // Simplified version for IDE or the dynamic dispatch case with only one target.
  359. #if HWY_IDE || ((HWY_TARGETS & (HWY_TARGETS - 1)) == 0)
  360. // We use a table to provide the same compile error conditions as with the
  361. // non-simplified case, but the table only has a single entry.
  362. #define HWY_EXPORT_T(TABLE_NAME, FUNC_NAME) \
  363. HWY_MAYBE_UNUSED static decltype(&HWY_STATIC_DISPATCH(FUNC_NAME)) const \
  364. HWY_DISPATCH_TABLE(TABLE_NAME)[1] = {&HWY_STATIC_DISPATCH(FUNC_NAME)}
  365. // Use the table, not just STATIC_DISPATCH as in DYNAMIC_DISPATCH, because
  366. // TABLE_NAME might not match the function name.
  367. #define HWY_DYNAMIC_POINTER_T(TABLE_NAME) (HWY_DISPATCH_TABLE(TABLE_NAME)[0])
  368. #define HWY_DYNAMIC_DISPATCH_T(TABLE_NAME) \
  369. (*(HWY_DYNAMIC_POINTER_T(TABLE_NAME)))
  370. #define HWY_EXPORT(FUNC_NAME) HWY_EXPORT_T(FUNC_NAME, FUNC_NAME)
  371. #define HWY_DYNAMIC_POINTER(FUNC_NAME) &HWY_STATIC_DISPATCH(FUNC_NAME)
  372. #define HWY_DYNAMIC_DISPATCH(FUNC_NAME) HWY_STATIC_DISPATCH(FUNC_NAME)
  373. #else // not simplified: full table
  374. // Pre-C++17 workaround: non-type template arguments must have linkage, which
  375. // means we cannot pass &table as a template argument to ChooseAndCall.
  376. // ChooseAndCall must find a way to access the table in order to dispatch to the
  377. // chosen target:
  378. // 0) Skipping this by dispatching to the static target would be surprising to
  379. // users and may have serious performance implications.
  380. // 1) An extra function parameter would be unacceptable because it changes the
  381. // user-visible function signature.
  382. // 2) Declaring a table, then defining a pointer to it would work, but requires
  383. // an additional DECLARE step outside the function so that the pointer has
  384. // linkage, which breaks existing code.
  385. // 3) We instead associate the function with the table using an instance of an
  386. // unnamed struct and the hash of the table name as the key. Because
  387. // ChooseAndCall has the type information, it can then cast to the function
  388. // pointer type. However, we cannot simply pass the name as a template
  389. // argument to ChooseAndCall because this requires char*, which hits the same
  390. // linkage problem. We instead hash the table name, which assumes the
  391. // function names do not have collisions.
  392. #if HWY_DISPATCH_MAP
  393. static constexpr uint64_t FNV(const char* name) {
  394. return *name ? static_cast<uint64_t>(static_cast<uint8_t>(*name)) ^
  395. (0x100000001b3ULL * FNV(name + 1))
  396. : 0xcbf29ce484222325ULL;
  397. }
  398. template <uint64_t kHash>
  399. struct AddExport {
  400. template <class ExportsKey, class FuncPtr>
  401. AddExport(ExportsKey /*exports_key*/, const char* table_name,
  402. const FuncPtr* table) {
  403. using FuncCache = decltype(DeduceFunctionCache(hwy::DeclVal<FuncPtr>()));
  404. static_assert(
  405. hwy::IsSame<RemoveCvRef<FuncPtr>, typename FuncCache::FuncPtr>(),
  406. "FuncPtr should be same type as FuncCache::FuncPtr");
  407. const FuncPtr*& exports_ptr = AllExports::template GetRefToExportsPtr<
  408. RemoveCvRef<FuncPtr>, RemoveCvRef<ExportsKey>, kHash>();
  409. if (exports_ptr && exports_ptr != table) {
  410. HWY_ABORT("Hash collision for %s, rename the function\n", table_name);
  411. } else {
  412. exports_ptr = table;
  413. }
  414. }
  415. };
  416. // Dynamic dispatch: defines table of function pointers. This must be invoked
  417. // from inside the function template that calls the template we are exporting.
  418. // TABLE_NAME must match the one passed to HWY_DYNAMIC_DISPATCH_T. This
  419. // argument allows multiple exports within one function.
  420. #define HWY_EXPORT_T(TABLE_NAME, FUNC_NAME) \
  421. static const struct { \
  422. } HWY_CONCAT(TABLE_NAME, HighwayDispatchExportsKey) = {}; \
  423. static decltype(&HWY_STATIC_DISPATCH(FUNC_NAME)) const HWY_DISPATCH_TABLE( \
  424. TABLE_NAME)[static_cast<size_t>(HWY_MAX_DYNAMIC_TARGETS + 2)] = { \
  425. /* The first entry in the table initializes the global cache and \
  426. * calls the appropriate function. */ \
  427. &decltype(hwy::DeduceFunctionCache(&HWY_STATIC_DISPATCH(FUNC_NAME))):: \
  428. template ChooseAndCall<decltype(HWY_CONCAT( \
  429. TABLE_NAME, HighwayDispatchExportsKey)), \
  430. hwy::FNV(#TABLE_NAME)>, \
  431. HWY_CHOOSE_TARGET_LIST(FUNC_NAME), \
  432. HWY_CHOOSE_FALLBACK(FUNC_NAME), \
  433. }; \
  434. HWY_MAYBE_UNUSED static hwy::AddExport<hwy::FNV(#TABLE_NAME)> HWY_CONCAT( \
  435. HighwayAddTable, __LINE__)( \
  436. HWY_CONCAT(TABLE_NAME, HighwayDispatchExportsKey), #TABLE_NAME, \
  437. HWY_DISPATCH_TABLE(TABLE_NAME))
  438. // For non-template functions. Not necessarily invoked within a function, hence
  439. // we derive the string and variable names from FUNC_NAME, not HWY_FUNCTION.
  440. #if HWY_DISPATCH_WORKAROUND
  441. #define HWY_EXPORT(FUNC_NAME) HWY_EXPORT_T(FUNC_NAME, FUNC_NAME)
  442. #else
  443. #define HWY_EXPORT(FUNC_NAME) \
  444. static decltype(&HWY_STATIC_DISPATCH(FUNC_NAME)) const HWY_DISPATCH_TABLE( \
  445. FUNC_NAME)[static_cast<size_t>(HWY_MAX_DYNAMIC_TARGETS + 2)] = { \
  446. /* The first entry in the table initializes the global cache and \
  447. * calls the appropriate function. */ \
  448. &decltype(hwy::DeduceFunctionCache(&HWY_STATIC_DISPATCH(FUNC_NAME))):: \
  449. template TableChooseAndCall<HWY_DISPATCH_TABLE(FUNC_NAME)>, \
  450. HWY_CHOOSE_TARGET_LIST(FUNC_NAME), \
  451. HWY_CHOOSE_FALLBACK(FUNC_NAME), \
  452. }
  453. #endif // HWY_DISPATCH_WORKAROUND
  454. #else // !HWY_DISPATCH_MAP
  455. // Zero-overhead, but requires C++17 for non-type template arguments without
  456. // linkage, because HWY_EXPORT_T tables are local static variables.
  457. #define HWY_EXPORT_T(TABLE_NAME, FUNC_NAME) \
  458. static decltype(&HWY_STATIC_DISPATCH(FUNC_NAME)) const HWY_DISPATCH_TABLE( \
  459. TABLE_NAME)[static_cast<size_t>(HWY_MAX_DYNAMIC_TARGETS + 2)] = { \
  460. /* The first entry in the table initializes the global cache and \
  461. * calls the appropriate function. */ \
  462. &decltype(hwy::DeduceFunctionCache(&HWY_STATIC_DISPATCH(FUNC_NAME))):: \
  463. template ChooseAndCall<HWY_DISPATCH_TABLE(TABLE_NAME)>, \
  464. HWY_CHOOSE_TARGET_LIST(FUNC_NAME), \
  465. HWY_CHOOSE_FALLBACK(FUNC_NAME), \
  466. }
  467. #define HWY_EXPORT(FUNC_NAME) HWY_EXPORT_T(FUNC_NAME, FUNC_NAME)
  468. #endif // HWY_DISPATCH_MAP
  469. // HWY_DISPATCH_MAP only affects how tables are created, not their usage.
  470. // Evaluates to the function pointer for the chosen target.
  471. #define HWY_DYNAMIC_POINTER(FUNC_NAME) \
  472. (HWY_DISPATCH_TABLE(FUNC_NAME)[hwy::GetChosenTarget().GetIndex()])
  473. // Calls the function pointer for the chosen target.
  474. #define HWY_DYNAMIC_DISPATCH(FUNC_NAME) (*(HWY_DYNAMIC_POINTER(FUNC_NAME)))
  475. // Same as DISPATCH, but provide a different arg name to clarify usage.
  476. #define HWY_DYNAMIC_DISPATCH_T(TABLE_NAME) HWY_DYNAMIC_DISPATCH(TABLE_NAME)
  477. #define HWY_DYNAMIC_POINTER_T(TABLE_NAME) HWY_DYNAMIC_POINTER(TABLE_NAME)
  478. #endif // HWY_IDE || ((HWY_TARGETS & (HWY_TARGETS - 1)) == 0)
  479. // Returns the name of an anonymous dispatch table that is only shared with
  480. // macro invocations coming from the same source line.
  481. #define HWY_DISPATCH_TABLE_T() HWY_CONCAT(HighwayDispatchTableT, __LINE__)
  482. // For templated code, combines export and dispatch using an anonymous table.
  483. #define HWY_EXPORT_AND_DYNAMIC_DISPATCH_T(FUNC_NAME) \
  484. HWY_EXPORT_T(HWY_DISPATCH_TABLE_T(), FUNC_NAME); \
  485. HWY_DYNAMIC_DISPATCH_T(HWY_DISPATCH_TABLE_T())
  486. // DEPRECATED names; please use HWY_HAVE_* instead.
  487. #define HWY_CAP_INTEGER64 HWY_HAVE_INTEGER64
  488. #define HWY_CAP_FLOAT16 HWY_HAVE_FLOAT16
  489. #define HWY_CAP_FLOAT64 HWY_HAVE_FLOAT64
  490. } // namespace hwy
  491. #endif // HWY_HIGHWAY_INCLUDED
  492. //------------------------------------------------------------------------------
  493. // NOTE: the following definitions and ops/*.h depend on HWY_TARGET, so we want
  494. // to include them once per target, which is ensured by the toggle check.
  495. // Because ops/*.h are included under it, they do not need their own guard.
  496. #if defined(HWY_HIGHWAY_PER_TARGET) == defined(HWY_TARGET_TOGGLE)
  497. #ifdef HWY_HIGHWAY_PER_TARGET
  498. #undef HWY_HIGHWAY_PER_TARGET
  499. #else
  500. #define HWY_HIGHWAY_PER_TARGET
  501. #endif
  502. // These define ops inside namespace hwy::HWY_NAMESPACE.
  503. #if HWY_TARGET == HWY_SSE2 || HWY_TARGET == HWY_SSSE3 || HWY_TARGET == HWY_SSE4
  504. #include "hwy/ops/x86_128-inl.h"
  505. #elif HWY_TARGET == HWY_AVX2
  506. #include "hwy/ops/x86_256-inl.h"
  507. #elif HWY_TARGET == HWY_AVX3 || HWY_TARGET == HWY_AVX3_DL || \
  508. HWY_TARGET == HWY_AVX3_ZEN4 || HWY_TARGET == HWY_AVX3_SPR
  509. #include "hwy/ops/x86_512-inl.h"
  510. #elif HWY_TARGET == HWY_Z14 || HWY_TARGET == HWY_Z15 || \
  511. (HWY_TARGET & HWY_ALL_PPC)
  512. #include "hwy/ops/ppc_vsx-inl.h"
  513. #elif HWY_TARGET & HWY_ALL_NEON
  514. #include "hwy/ops/arm_neon-inl.h"
  515. #elif HWY_TARGET & HWY_ALL_SVE
  516. #include "hwy/ops/arm_sve-inl.h"
  517. #elif HWY_TARGET == HWY_WASM_EMU256
  518. #include "hwy/ops/wasm_256-inl.h"
  519. #elif HWY_TARGET == HWY_WASM
  520. #include "hwy/ops/wasm_128-inl.h"
  521. #elif HWY_TARGET == HWY_RVV
  522. #include "hwy/ops/rvv-inl.h"
  523. #elif HWY_TARGET == HWY_EMU128
  524. #include "hwy/ops/emu128-inl.h"
  525. #elif HWY_TARGET == HWY_SCALAR
  526. #include "hwy/ops/scalar-inl.h"
  527. #else
  528. #pragma message("HWY_TARGET does not match any known target")
  529. #endif // HWY_TARGET
  530. #include "hwy/ops/generic_ops-inl.h"
  531. #endif // HWY_HIGHWAY_PER_TARGET