cache_control.h 4.4 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126
  1. // Copyright 2020 Google LLC
  2. // SPDX-License-Identifier: Apache-2.0
  3. //
  4. // Licensed under the Apache License, Version 2.0 (the "License");
  5. // you may not use this file except in compliance with the License.
  6. // You may obtain a copy of the License at
  7. //
  8. // http://www.apache.org/licenses/LICENSE-2.0
  9. //
  10. // Unless required by applicable law or agreed to in writing, software
  11. // distributed under the License is distributed on an "AS IS" BASIS,
  12. // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
  13. // See the License for the specific language governing permissions and
  14. // limitations under the License.
  15. #ifndef HIGHWAY_HWY_CACHE_CONTROL_H_
  16. #define HIGHWAY_HWY_CACHE_CONTROL_H_
  17. #include "hwy/base.h"
  // The x86 cache-control intrinsics require SSE2 and fail to compile on 32-bit
  // Clang 7 (see https://github.com/gperftools/gperftools/issues/946).
  //
  // This check is restricted to x86 on purpose: `__SSE2__` is never defined on
  // other architectures, so testing it unconditionally would define
  // HWY_DISABLE_CACHE_CONTROL on every non-x86 target and turn the non-x86
  // code paths guarded by that macro into dead code. Users can still define
  // HWY_DISABLE_CACHE_CONTROL themselves to opt out on any platform.
  #if HWY_ARCH_X86 && \
      (!defined(__SSE2__) || (HWY_COMPILER_CLANG && HWY_ARCH_X86_32))
  #undef HWY_DISABLE_CACHE_CONTROL
  #define HWY_DISABLE_CACHE_CONTROL
  #endif
  #ifndef HWY_DISABLE_CACHE_CONTROL
  // intrin.h is sufficient on MSVC and already included by base.h.
  #if HWY_ARCH_X86 && !HWY_COMPILER_MSVC
  #include <emmintrin.h>  // SSE2
  #include <xmmintrin.h>  // _mm_prefetch
  #elif HWY_ARCH_ARM_A64 && HWY_COMPILER_CLANG
  // Only needed for the ACLE __yield() intrinsic, which this header uses
  // exclusively on A64 Clang; other compilers may not ship this header.
  #include <arm_acle.h>
  #endif
  #endif  // HWY_DISABLE_CACHE_CONTROL
  33. namespace hwy {
  // Stream may write a multiple of this size, even when N*sizeof(T) is smaller.
  #define HWY_STREAM_MULTIPLE 16
  // Attribute applied to the functions below. It expands to an SSE2 target
  // attribute only on x86 with cache control enabled and a non-MSVC compiler;
  // in every other configuration it expands to nothing.
  #if !HWY_ARCH_X86 || defined(HWY_DISABLE_CACHE_CONTROL) || HWY_COMPILER_MSVC
  #define HWY_ATTR_CACHE
  #else
  #define HWY_ATTR_CACHE __attribute__((target("sse2")))
  #endif
  42. // Windows.h #defines this, which causes infinite recursion. Temporarily
  43. // undefine to avoid conflict with our function.
  44. // TODO(janwas): remove when this function is removed.
  45. #pragma push_macro("LoadFence")
  46. #undef LoadFence
  47. // Delays subsequent loads until prior loads are visible. Beware of potentially
  48. // differing behavior across architectures and vendors: on Intel but not
  49. // AMD CPUs, also serves as a full fence (waits for all prior instructions to
  50. // complete).
  51. HWY_INLINE HWY_ATTR_CACHE void LoadFence() {
  52. #if HWY_ARCH_X86 && !defined(HWY_DISABLE_CACHE_CONTROL)
  53. _mm_lfence();
  54. #endif
  55. }
  56. // TODO(janwas): remove when this function is removed. (See above.)
  57. #pragma pop_macro("LoadFence")
  58. // Ensures values written by previous `Stream` calls are visible on the current
  59. // core. This is NOT sufficient for synchronizing across cores; when `Stream`
  60. // outputs are to be consumed by other core(s), the producer must publish
  61. // availability (e.g. via mutex or atomic_flag) after `FlushStream`.
  62. HWY_INLINE HWY_ATTR_CACHE void FlushStream() {
  63. #if HWY_ARCH_X86 && !defined(HWY_DISABLE_CACHE_CONTROL)
  64. _mm_sfence();
  65. #endif
  66. }
  67. // Optionally begins loading the cache line containing "p" to reduce latency of
  68. // subsequent actual loads.
  69. template <typename T>
  70. HWY_INLINE HWY_ATTR_CACHE void Prefetch(const T* p) {
  71. (void)p;
  72. #ifndef HWY_DISABLE_CACHE_CONTROL
  73. #if HWY_ARCH_X86
  74. _mm_prefetch(reinterpret_cast<const char*>(p), _MM_HINT_T0);
  75. #elif HWY_COMPILER_GCC // includes clang
  76. // Hint=0 (NTA) behavior differs, but skipping outer caches is probably not
  77. // desirable, so use the default 3 (keep in caches).
  78. __builtin_prefetch(p, /*write=*/0, /*hint=*/3);
  79. #endif
  80. #endif // HWY_DISABLE_CACHE_CONTROL
  81. }
  82. // Invalidates and flushes the cache line containing "p", if possible.
  83. HWY_INLINE HWY_ATTR_CACHE void FlushCacheline(const void* p) {
  84. #if HWY_ARCH_X86 && !defined(HWY_DISABLE_CACHE_CONTROL)
  85. _mm_clflush(p);
  86. #else
  87. (void)p;
  88. #endif
  89. }
  90. // Hints that we are inside a spin loop and potentially reduces power
  91. // consumption and coherency traffic. For example, x86 avoids multiple
  92. // outstanding load requests, which reduces the memory order violation penalty
  93. // when exiting the loop.
  94. HWY_INLINE HWY_ATTR_CACHE void Pause() {
  95. #ifndef HWY_DISABLE_CACHE_CONTROL
  96. #if HWY_ARCH_X86
  97. _mm_pause();
  98. #elif HWY_ARCH_ARM_A64 && HWY_COMPILER_CLANG
  99. // This is documented in ACLE and the YIELD instruction is also available in
  100. // Armv7, but the intrinsic is broken for Armv7 clang, hence A64 only.
  101. __yield();
  102. #elif HWY_ARCH_ARM && HWY_COMPILER_GCC // includes clang
  103. __asm__ volatile("yield" ::: "memory");
  104. #elif HWY_ARCH_PPC && HWY_COMPILER_GCC // includes clang
  105. __asm__ volatile("or 27,27,27" ::: "memory");
  106. #endif
  107. #endif // HWY_DISABLE_CACHE_CONTROL
  108. }
  109. } // namespace hwy
  110. #endif // HIGHWAY_HWY_CACHE_CONTROL_H_