timer-inl.h 7.1 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200
  1. // Copyright 2023 Google LLC
  2. // SPDX-License-Identifier: Apache-2.0
  3. //
  4. // Licensed under the Apache License, Version 2.0 (the "License");
  5. // you may not use this file except in compliance with the License.
  6. // You may obtain a copy of the License at
  7. //
  8. // http://www.apache.org/licenses/LICENSE-2.0
  9. //
  10. // Unless required by applicable law or agreed to in writing, software
  11. // distributed under the License is distributed on an "AS IS" BASIS,
  12. // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
  13. // See the License for the specific language governing permissions and
  14. // limitations under the License.
  15. // High-resolution and high-precision timer
  16. // Per-target include guard
  17. #if defined(HIGHWAY_HWY_TIMER_INL_H_) == defined(HWY_TARGET_TOGGLE)
  18. #ifdef HIGHWAY_HWY_TIMER_INL_H_
  19. #undef HIGHWAY_HWY_TIMER_INL_H_
  20. #else
  21. #define HIGHWAY_HWY_TIMER_INL_H_
  22. #endif
  23. #include "hwy/highway.h"
  24. #if defined(_WIN32) || defined(_WIN64)
  25. #ifndef NOMINMAX
  26. #define NOMINMAX
  27. #endif // NOMINMAX
  28. #include <windows.h>
  29. #endif
  30. #if defined(__APPLE__)
  31. #include <mach/mach.h>
  32. #include <mach/mach_time.h>
  33. #endif
  34. #if defined(__HAIKU__)
  35. #include <OS.h>
  36. #endif
  37. #if HWY_ARCH_PPC && defined(__GLIBC__) && defined(__powerpc64__)
  38. #include <sys/platform/ppc.h> // NOLINT __ppc_get_timebase_freq
  39. #endif
  40. #if HWY_ARCH_X86 && HWY_COMPILER_MSVC
  41. #include <intrin.h>
  42. #endif
  43. #include <stdint.h>
  44. #include <time.h> // clock_gettime
  45. HWY_BEFORE_NAMESPACE();
  46. namespace hwy {
  47. namespace HWY_NAMESPACE {
  48. namespace timer {
  49. // Ticks := platform-specific timer values (CPU cycles on x86). Must be
  50. // unsigned to guarantee wraparound on overflow.
  51. using Ticks = uint64_t;
  52. // Start/Stop return absolute timestamps and must be placed immediately before
  53. // and after the region to measure. We provide separate Start/Stop functions
  54. // because they use different fences.
  55. //
  56. // Background: RDTSC is not 'serializing'; earlier instructions may complete
  57. // after it, and/or later instructions may complete before it. 'Fences' ensure
  58. // regions' elapsed times are independent of such reordering. The only
  59. // documented unprivileged serializing instruction is CPUID, which acts as a
  60. // full fence (no reordering across it in either direction). Unfortunately
  61. // the latency of CPUID varies wildly (perhaps made worse by not initializing
  62. // its EAX input). Because it cannot reliably be deducted from the region's
  63. // elapsed time, it must not be included in the region to measure (i.e.
  64. // between the two RDTSC).
  65. //
  66. // The newer RDTSCP is sometimes described as serializing, but it actually
  67. // only serves as a half-fence with release semantics. Although all
  68. // instructions in the region will complete before the final timestamp is
  69. // captured, subsequent instructions may leak into the region and increase the
  70. // elapsed time. Inserting another fence after the final RDTSCP would prevent
  71. // such reordering without affecting the measured region.
  72. //
  73. // Fortunately, such a fence exists. The LFENCE instruction is only documented
  74. // to delay later loads until earlier loads are visible. However, Intel's
  75. // reference manual says it acts as a full fence (waiting until all earlier
  76. // instructions have completed, and delaying later instructions until it
  77. // completes). AMD assigns the same behavior to MFENCE.
  78. //
  79. // We need a fence before the initial RDTSC to prevent earlier instructions
  80. // from leaking into the region, and arguably another after RDTSC to avoid
  81. // region instructions from completing before the timestamp is recorded.
  82. // When surrounded by fences, the additional RDTSCP half-fence provides no
  83. // benefit, so the initial timestamp can be recorded via RDTSC, which has
  84. // lower overhead than RDTSCP because it does not read TSC_AUX. In summary,
  85. // we define Start = LFENCE/RDTSC/LFENCE; Stop = RDTSCP/LFENCE.
  86. //
  87. // Using Start+Start leads to higher variance and overhead than Stop+Stop.
  88. // However, Stop+Stop includes an LFENCE in the region measurements, which
  89. // adds a delay dependent on earlier loads. The combination of Start+Stop
  90. // is faster than Start+Start and more consistent than Stop+Stop because
  91. // the first LFENCE already delayed subsequent loads before the measured
  92. // region. This combination seems not to have been considered in prior work:
  93. // http://akaros.cs.berkeley.edu/lxr/akaros/kern/arch/x86/rdtsc_test.c
  94. //
  95. // Note: performance counters can measure 'exact' instructions-retired or
  96. // (unhalted) cycle counts. The RDPMC instruction is not serializing and also
  97. // requires fences. Unfortunately, it is not accessible on all OSes and we
  98. // prefer to avoid kernel-mode drivers. Performance counters are also affected
  99. // by several under/over-count errata, so we use the TSC instead.
  100. // Returns a 64-bit timestamp in unit of 'ticks'; to convert to seconds,
  101. // divide by InvariantTicksPerSecond.
  102. inline Ticks Start() {
  103. Ticks t;
  104. #if HWY_ARCH_PPC && defined(__GLIBC__) && defined(__powerpc64__)
  105. asm volatile("mfspr %0, %1" : "=r"(t) : "i"(268));
  106. #elif HWY_ARCH_ARM_A64 && !HWY_COMPILER_MSVC
  107. // pmccntr_el0 is privileged but cntvct_el0 is accessible in Linux and QEMU.
  108. asm volatile("mrs %0, cntvct_el0" : "=r"(t));
  109. #elif HWY_ARCH_X86 && HWY_COMPILER_MSVC
  110. _ReadWriteBarrier();
  111. _mm_lfence();
  112. _ReadWriteBarrier();
  113. t = __rdtsc();
  114. _ReadWriteBarrier();
  115. _mm_lfence();
  116. _ReadWriteBarrier();
  117. #elif HWY_ARCH_X86_64
  118. asm volatile(
  119. "lfence\n\t"
  120. "rdtsc\n\t"
  121. "shl $32, %%rdx\n\t"
  122. "or %%rdx, %0\n\t"
  123. "lfence"
  124. : "=a"(t)
  125. :
  126. // "memory" avoids reordering. rdx = TSC >> 32.
  127. // "cc" = flags modified by SHL.
  128. : "rdx", "memory", "cc");
  129. #elif HWY_ARCH_RISCV
  130. asm volatile("fence; rdtime %0" : "=r"(t));
  131. #elif defined(_WIN32) || defined(_WIN64)
  132. LARGE_INTEGER counter;
  133. (void)QueryPerformanceCounter(&counter);
  134. t = counter.QuadPart;
  135. #elif defined(__APPLE__)
  136. t = mach_absolute_time();
  137. #elif defined(__HAIKU__)
  138. t = system_time_nsecs(); // since boot
  139. #else // POSIX
  140. timespec ts;
  141. clock_gettime(CLOCK_MONOTONIC, &ts);
  142. t = static_cast<Ticks>(ts.tv_sec * 1000000000LL + ts.tv_nsec);
  143. #endif
  144. return t;
  145. }
  146. // WARNING: on x86, caller must check HasRDTSCP before using this!
  147. inline Ticks Stop() {
  148. uint64_t t;
  149. #if HWY_ARCH_PPC && defined(__GLIBC__) && defined(__powerpc64__)
  150. asm volatile("mfspr %0, %1" : "=r"(t) : "i"(268));
  151. #elif HWY_ARCH_ARM_A64 && !HWY_COMPILER_MSVC
  152. // pmccntr_el0 is privileged but cntvct_el0 is accessible in Linux and QEMU.
  153. asm volatile("mrs %0, cntvct_el0" : "=r"(t));
  154. #elif HWY_ARCH_X86 && HWY_COMPILER_MSVC
  155. _ReadWriteBarrier();
  156. unsigned aux;
  157. t = __rdtscp(&aux);
  158. _ReadWriteBarrier();
  159. _mm_lfence();
  160. _ReadWriteBarrier();
  161. #elif HWY_ARCH_X86_64
  162. // Use inline asm because __rdtscp generates code to store TSC_AUX (ecx).
  163. asm volatile(
  164. "rdtscp\n\t"
  165. "shl $32, %%rdx\n\t"
  166. "or %%rdx, %0\n\t"
  167. "lfence"
  168. : "=a"(t)
  169. :
  170. // "memory" avoids reordering. rcx = TSC_AUX. rdx = TSC >> 32.
  171. // "cc" = flags modified by SHL.
  172. : "rcx", "rdx", "memory", "cc");
  173. #else
  174. t = Start();
  175. #endif
  176. return t;
  177. }
  178. } // namespace timer
  179. // NOLINTNEXTLINE(google-readability-namespace-comments)
  180. } // namespace HWY_NAMESPACE
  181. } // namespace hwy
  182. HWY_AFTER_NAMESPACE();
  183. #endif // per-target include guard