nanobenchmark.h 7.2 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171
  1. // Copyright 2019 Google LLC
  2. // SPDX-License-Identifier: Apache-2.0
  3. //
  4. // Licensed under the Apache License, Version 2.0 (the "License");
  5. // you may not use this file except in compliance with the License.
  6. // You may obtain a copy of the License at
  7. //
  8. // http://www.apache.org/licenses/LICENSE-2.0
  9. //
  10. // Unless required by applicable law or agreed to in writing, software
  11. // distributed under the License is distributed on an "AS IS" BASIS,
  12. // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
  13. // See the License for the specific language governing permissions and
  14. // limitations under the License.
  15. #ifndef HIGHWAY_HWY_NANOBENCHMARK_H_
  16. #define HIGHWAY_HWY_NANOBENCHMARK_H_
  17. // Benchmarks functions of a single integer argument with realistic branch
  18. // prediction hit rates. Uses a robust estimator to summarize the measurements.
  19. // The precision is about 0.2%.
  20. //
  21. // Examples: see nanobenchmark_test.cc.
  22. //
  23. // Background: Microbenchmarks such as http://github.com/google/benchmark
  24. // can measure elapsed times on the order of a microsecond. Shorter functions
  25. // are typically measured by repeating them thousands of times and dividing
  26. // the total elapsed time by this count. Unfortunately, repetition (especially
  27. // with the same input parameter!) influences the runtime. In time-critical
  28. // code, it is reasonable to expect warm instruction/data caches and TLBs,
  29. // but a perfect record of which branches will be taken is unrealistic.
  30. // Unless the application also repeatedly invokes the measured function with
  31. // the same parameter, the benchmark is measuring something very different -
  32. // a best-case result, almost as if the parameter were made a compile-time
  33. // constant. This may lead to erroneous conclusions about branch-heavy
  34. // algorithms outperforming branch-free alternatives.
  35. //
  36. // Our approach differs in three ways. Adding fences to the timer functions
  37. // reduces variability due to instruction reordering, improving the timer
  38. // resolution to about 40 CPU cycles. However, shorter functions must still
  39. // be invoked repeatedly. For more realistic branch prediction performance,
  40. // we vary the input parameter according to a user-specified distribution.
  41. // Thus, instead of VaryInputs(Measure(Repeat(func))), we change the
  42. // loop nesting to Measure(Repeat(VaryInputs(func))). We also estimate the
  43. // central tendency of the measurement samples with the "half sample mode",
  44. // which is more robust to outliers and skewed data than the mean or median.
  45. #include <stddef.h>
  46. #include <stdint.h>
  47. #include "hwy/highway_export.h"
  48. #include "hwy/timer.h"
  49. // Enables sanity checks that verify correct operation at the cost of
  50. // longer benchmark runs.
  51. #ifndef NANOBENCHMARK_ENABLE_CHECKS
  52. #define NANOBENCHMARK_ENABLE_CHECKS 0
  53. #endif
  54. #define NANOBENCHMARK_CHECK_ALWAYS(condition) \
  55. while (!(condition)) { \
  56. fprintf(stderr, "Nanobenchmark check failed at line %d\n", __LINE__); \
  57. abort(); \
  58. }
  59. #if NANOBENCHMARK_ENABLE_CHECKS
  60. #define NANOBENCHMARK_CHECK(condition) NANOBENCHMARK_CHECK_ALWAYS(condition)
  61. #else
  62. #define NANOBENCHMARK_CHECK(condition)
  63. #endif
  64. namespace hwy {
  65. // Returns 1, but without the compiler knowing what the value is. This prevents
  66. // optimizing out code.
  67. HWY_DLLEXPORT int Unpredictable1();
  68. // Input influencing the function being measured (e.g. number of bytes to copy).
  69. using FuncInput = size_t;
  70. // "Proof of work" returned by Func to ensure the compiler does not elide it.
  71. using FuncOutput = uint64_t;
  72. // Function to measure: either 1) a captureless lambda or function with two
  73. // arguments or 2) a lambda with capture, in which case the first argument
  74. // is reserved for use by MeasureClosure.
  75. using Func = FuncOutput (*)(const void*, FuncInput);
  76. // Internal parameters that determine precision/resolution/measuring time.
  77. struct Params {
  78. // Best-case precision, expressed as a divisor of the timer resolution.
  79. // Larger => more calls to Func and higher precision.
  80. size_t precision_divisor = 1024;
  81. // Ratio between full and subset input distribution sizes. Cannot be less
  82. // than 2; larger values increase measurement time but more faithfully
  83. // model the given input distribution.
  84. size_t subset_ratio = 2;
  85. // Together with the estimated Func duration, determines how many times to
  86. // call Func before checking the sample variability. Larger values increase
  87. // measurement time, memory/cache use and precision.
  88. double seconds_per_eval = 4E-3;
  89. // The minimum number of samples before estimating the central tendency.
  90. size_t min_samples_per_eval = 7;
  91. // The mode is better than median for estimating the central tendency of
  92. // skewed/fat-tailed distributions, but it requires sufficient samples
  93. // relative to the width of half-ranges.
  94. size_t min_mode_samples = 64;
  95. // Maximum permissible variability (= median absolute deviation / center).
  96. double target_rel_mad = 0.002;
  97. // Abort after this many evals without reaching target_rel_mad. This
  98. // prevents infinite loops.
  99. size_t max_evals = 9;
  100. // Whether to print additional statistics to stdout.
  101. bool verbose = true;
  102. };
  103. // Measurement result for each unique input.
  104. struct Result {
  105. FuncInput input;
  106. // Robust estimate (mode or median) of duration.
  107. float ticks;
  108. // Measure of variability (median absolute deviation relative to "ticks").
  109. float variability;
  110. };
  111. // Precisely measures the number of ticks elapsed when calling "func" with the
  112. // given inputs, shuffled to ensure realistic branch prediction hit rates.
  113. //
  114. // "func" returns a 'proof of work' to ensure its computations are not elided.
  115. // "arg" is passed to Func, or reserved for internal use by MeasureClosure.
  116. // "inputs" is an array of "num_inputs" (not necessarily unique) arguments to
  117. // "func". The values should be chosen to maximize coverage of "func". This
  118. // represents a distribution, so a value's frequency should reflect its
  119. // probability in the real application. Order does not matter; for example, a
  120. // uniform distribution over [0, 4) could be represented as {3,0,2,1}.
  121. // Returns how many Result were written to "results": one per unique input, or
  122. // zero if the measurement failed (an error message goes to stderr).
  123. HWY_DLLEXPORT size_t Measure(Func func, const uint8_t* arg,
  124. const FuncInput* inputs, size_t num_inputs,
  125. Result* results, const Params& p = Params());
  126. // Calls operator() of the given closure (lambda function).
  127. template <class Closure>
  128. static FuncOutput CallClosure(const Closure* f, const FuncInput input) {
  129. return (*f)(input);
  130. }
  131. // Same as Measure, except "closure" is typically a lambda function of
  132. // FuncInput -> FuncOutput with a capture list.
  133. template <class Closure>
  134. static inline size_t MeasureClosure(const Closure& closure,
  135. const FuncInput* inputs,
  136. const size_t num_inputs, Result* results,
  137. const Params& p = Params()) {
  138. return Measure(reinterpret_cast<Func>(&CallClosure<Closure>),
  139. reinterpret_cast<const uint8_t*>(&closure), inputs, num_inputs,
  140. results, p);
  141. }
  142. } // namespace hwy
  143. #endif // HIGHWAY_HWY_NANOBENCHMARK_H_