// Copyright 2017 Google Inc. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.

#ifndef HIGHWAY_HWY_PROFILER_H_
#define HIGHWAY_HWY_PROFILER_H_

// High precision, low overhead time measurements. Returns exact call counts
// and total elapsed time for user-defined 'zones' (code regions, i.e. C++
// scopes).
//
// Uses RAII to capture begin/end timestamps, with user-specified zone names:
//   { PROFILER_ZONE("name"); /*code*/ }
// or the name of the current function:
//   void FuncToMeasure() { PROFILER_FUNC; /*code*/ }
//
// After all threads have exited any zones, invoke PROFILER_PRINT_RESULTS() to
// print call counts and average durations [CPU cycles] to stdout, sorted in
// descending order of total duration.
//
// The binary MUST be built with --dynamic_mode=off because we rely on the data
// segments being nearby; if not, an assertion will likely fail.
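//
// Illustrative usage sketch (not part of this header); FuncToMeasure and main
// below are hypothetical, and the build is assumed to set PROFILER_ENABLED=1:
//
//   #include "hwy/profiler.h"
//
//   void FuncToMeasure() {
//     PROFILER_FUNC;               // zone named after this function
//     // ... code to measure ...
//   }
//
//   int main() {
//     {
//       PROFILER_ZONE("Startup");  // name must be a string literal
//       FuncToMeasure();
//     }                            // both zones have now been exited
//     PROFILER_PRINT_RESULTS();    // after all threads exited all zones
//     return 0;
//   }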
#include "hwy/base.h"

// Configuration settings:

// If zero, this file has no effect and no measurements will be recorded.
#ifndef PROFILER_ENABLED
#define PROFILER_ENABLED 0
#endif

// How many mebibytes to allocate (if PROFILER_ENABLED) per thread that
// enters at least one zone. Once this buffer is full, the thread will analyze
// and discard packets, thus temporarily adding some observer overhead.
// Each zone occupies 16 bytes.
#ifndef PROFILER_THREAD_STORAGE
#define PROFILER_THREAD_STORAGE 200ULL
#endif
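
// Thanks to the #ifndef guards above, both settings can also be overridden on
// the compiler command line. For example (GCC/Clang-style flag syntax shown;
// adjust for your build system):
//
//   c++ -O2 -DPROFILER_ENABLED=1 -DPROFILER_THREAD_STORAGE=100 app.cc
//
// enables profiling and allocates 100 MiB of packet storage per thread that
// enters at least one zone.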

#if PROFILER_ENABLED || HWY_IDE

#include <stddef.h>
#include <stdint.h>
#include <stdio.h>
#include <string.h>  // strcmp

#include <algorithm>  // std::sort
#include <atomic>

#include "hwy/aligned_allocator.h"
#include "hwy/cache_control.h"  // FlushStream
// #include "hwy/contrib/sort/vqsort.h"
#include "hwy/highway.h"  // Stream
#include "hwy/robust_statistics.h"
#include "hwy/timer-inl.h"
#include "hwy/timer.h"

#define PROFILER_PRINT_OVERHEAD 0

namespace hwy {

// Upper bounds for fixed-size data structures (guarded via HWY_DASSERT):

// How many threads can actually enter a zone (those that don't do not count).
// Memory use is about kMaxThreads * PROFILER_THREAD_STORAGE MiB.
// WARNING: a fiber library can spawn hundreds of threads.
static constexpr size_t kMaxThreads = 256;

static constexpr size_t kMaxDepth = 64;   // Maximum nesting of zones.
static constexpr size_t kMaxZones = 256;  // Total number of zones.
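
// With the default PROFILER_THREAD_STORAGE of 200 MiB, the worst case is thus
// about 256 * 200 MiB = 50 GiB, but storage is only allocated for threads that
// actually enter a zone, so typical use is 200 MiB per such thread.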

// Overwrites "to" without loading it into the cache (read-for-ownership).
// Both pointers must be aligned.
HWY_ATTR static void StreamCacheLine(const uint64_t* HWY_RESTRICT from,
                                     uint64_t* HWY_RESTRICT to) {
  namespace hn = HWY_NAMESPACE;
  const hn::ScalableTag<uint64_t> d;
  for (size_t i = 0; i < HWY_ALIGNMENT / sizeof(uint64_t); i += Lanes(d)) {
    hn::Stream(hn::Load(d, from + i), d, to + i);
  }
}

#pragma pack(push, 1)

// Represents zone entry/exit events. Stores a full-resolution timestamp plus
// an offset (representing zone name or identifying exit packets). POD.
class Packet {
 public:
  // If offsets do not fit, UpdateOrAdd will overrun our heap allocation
  // (governed by kMaxZones). We have seen multi-megabyte offsets.
  static constexpr size_t kOffsetBits = 25;
  static constexpr uint64_t kOffsetBias = 1ULL << (kOffsetBits - 1);

  // We need full-resolution timestamps; at an effective rate of 4 GHz,
  // this permits 1 minute zone durations (for longer durations, split into
  // multiple zones). Wraparound is handled by masking.
  static constexpr size_t kTimestampBits = 64 - kOffsetBits;
  static constexpr uint64_t kTimestampMask = (1ULL << kTimestampBits) - 1;

  static Packet Make(const size_t biased_offset, const uint64_t timestamp) {
    HWY_DASSERT(biased_offset < (1ULL << kOffsetBits));

    Packet packet;
    packet.bits_ =
        (biased_offset << kTimestampBits) + (timestamp & kTimestampMask);
    return packet;
  }

  uint64_t Timestamp() const { return bits_ & kTimestampMask; }

  size_t BiasedOffset() const { return (bits_ >> kTimestampBits); }

 private:
  uint64_t bits_;
};
static_assert(sizeof(Packet) == 8, "Wrong Packet size");
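
// Bit layout of Packet::bits_ (kOffsetBits = 25, kTimestampBits = 39):
//
//   bits 63..39: biased_offset
//   bits 38..0 : timestamp & kTimestampMask
//
// For example, Packet::Make(5, t).BiasedOffset() == 5 and its Timestamp() is
// t & kTimestampMask. A biased_offset equal to kOffsetBias marks a zone-exit
// packet (see WriteExit below).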

// Returns the address of a string literal. Assuming zone names are also
// literals and stored nearby, we can represent them as offsets, which are
// faster to compute than hashes or even a static index.
//
// This function must not be static - each call (even from other translation
// units) must return the same value.
inline const char* StringOrigin() {
  // Chosen such that no zone name is a prefix nor suffix of this string
  // to ensure they aren't merged (offset 0 identifies zone-exit packets).
  static const char* string_origin = "__#__";
  return string_origin - Packet::kOffsetBias;
}
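
// Note: subtracting kOffsetBias above means a zone name stored shortly
// *before* "__#__" still yields a positive biased offset:
//   biased_offset = name - StringOrigin()
//                 = (name - string_origin) + Packet::kOffsetBias.
// The string_origin literal itself maps to exactly kOffsetBias, which is
// reserved for exit packets and never collides with a zone name.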

// Representation of an active zone, stored in a stack. Used to deduct
// child duration from the parent's self time. POD.
struct Node {
  Packet packet;
  uint64_t child_total;
};
static_assert(sizeof(Node) == 16, "Wrong Node size");

// Holds statistics for all zones with the same name. POD.
struct Accumulator {
  static constexpr size_t kNumCallBits = 64 - Packet::kOffsetBits;

  uint64_t BiasedOffset() const { return u128.lo >> kNumCallBits; }
  uint64_t NumCalls() const { return u128.lo & ((1ULL << kNumCallBits) - 1); }
  uint64_t Duration() const { return u128.hi; }

  void Set(uint64_t biased_offset, uint64_t num_calls, uint64_t duration) {
    u128.hi = duration;
    u128.lo = (biased_offset << kNumCallBits) + num_calls;
  }

  void Add(uint64_t num_calls, uint64_t duration) {
    u128.lo += num_calls;
    u128.hi += duration;
  }

  // For fast sorting by duration, which must therefore be the hi element.
  // lo holds BiasedOffset and NumCalls.
  uint128_t u128;
};
static_assert(sizeof(Accumulator) == 16, "Wrong Accumulator size");
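
// Layout of Accumulator::u128 (kNumCallBits = 39):
//
//   u128.hi: total duration in timer ticks (the sort key)
//   u128.lo: bits 63..39 = biased_offset, bits 38..0 = num_calls
//
// Add() increments num_calls within lo; this cannot carry into the
// biased_offset field as long as num_calls stays below 2^kNumCallBits, which
// MergeDuplicates() verifies via HWY_DASSERT.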

template <typename T>
inline T ClampedSubtract(const T minuend, const T subtrahend) {
  if (subtrahend > minuend) {
    return 0;
  }
  return minuend - subtrahend;
}

// Per-thread call graph (stack) and Accumulator for each zone.
class Results {
 public:
  Results() { ZeroBytes(zones_, sizeof(zones_)); }

  // Used for computing overhead when this thread encounters its first Zone.
  // This has no observable effect apart from increasing "analyze_elapsed_".
  uint64_t ZoneDuration(const Packet* packets) {
    HWY_DASSERT(depth_ == 0);
    HWY_DASSERT(num_zones_ == 0);
    AnalyzePackets(packets, 2);
    const uint64_t duration = zones_[0].Duration();
    zones_[0].Set(0, 0, 0);
    HWY_DASSERT(depth_ == 0);
    num_zones_ = 0;
    return duration;
  }

  void SetSelfOverhead(const uint64_t self_overhead) {
    self_overhead_ = self_overhead;
  }

  void SetChildOverhead(const uint64_t child_overhead) {
    child_overhead_ = child_overhead;
  }

  // Draw all required information from the packets, which can be discarded
  // afterwards. Called whenever this thread's storage is full.
  void AnalyzePackets(const Packet* packets, const size_t num_packets) {
    namespace hn = HWY_NAMESPACE;
    const uint64_t t0 = hn::timer::Start();

    for (size_t i = 0; i < num_packets; ++i) {
      const Packet p = packets[i];
      // Entering a zone
      if (p.BiasedOffset() != Packet::kOffsetBias) {
        HWY_DASSERT(depth_ < kMaxDepth);
        nodes_[depth_].packet = p;
        nodes_[depth_].child_total = 0;
        ++depth_;
        continue;
      }

      HWY_DASSERT(depth_ != 0);
      const Node& node = nodes_[depth_ - 1];
      // Masking correctly handles unsigned wraparound.
      const uint64_t duration =
          (p.Timestamp() - node.packet.Timestamp()) & Packet::kTimestampMask;
      const uint64_t self_duration = ClampedSubtract(
          duration, self_overhead_ + child_overhead_ + node.child_total);

      UpdateOrAdd(node.packet.BiasedOffset(), 1, self_duration);
      --depth_;

      // Deduct this nested node's time from its parent's self_duration.
      if (depth_ != 0) {
        nodes_[depth_ - 1].child_total += duration + child_overhead_;
      }
    }

    const uint64_t t1 = hn::timer::Stop();
    analyze_elapsed_ += t1 - t0;
  }

  // Incorporates results from another thread. Call after all threads have
  // exited any zones.
  void Assimilate(const Results& other) {
    namespace hn = HWY_NAMESPACE;
    const uint64_t t0 = hn::timer::Start();
    HWY_DASSERT(depth_ == 0);
    HWY_DASSERT(other.depth_ == 0);

    for (size_t i = 0; i < other.num_zones_; ++i) {
      const Accumulator& zone = other.zones_[i];
      UpdateOrAdd(zone.BiasedOffset(), zone.NumCalls(), zone.Duration());
    }
    const uint64_t t1 = hn::timer::Stop();
    analyze_elapsed_ += t1 - t0 + other.analyze_elapsed_;
  }

  // Single-threaded.
  void Print() {
    namespace hn = HWY_NAMESPACE;
    const uint64_t t0 = hn::timer::Start();
    MergeDuplicates();

    // Sort by decreasing total (self) cost.
    // VQSort(&zones_[0].u128, num_zones_, SortDescending());
    std::sort(zones_, zones_ + num_zones_,
              [](const Accumulator& r1, const Accumulator& r2) {
                return r1.Duration() > r2.Duration();
              });

    const double inv_freq = 1.0 / platform::InvariantTicksPerSecond();

    const char* string_origin = StringOrigin();
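    // One line per zone: name, call count, average duration in timer ticks
    // (total / num_calls), and total duration converted to seconds.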
    for (size_t i = 0; i < num_zones_; ++i) {
      const Accumulator& r = zones_[i];
      const uint64_t num_calls = r.NumCalls();
      printf("%-40s: %10zu x %15zu = %9.6f\n",
             string_origin + r.BiasedOffset(), num_calls,
             r.Duration() / num_calls,
             static_cast<double>(r.Duration()) * inv_freq);
    }

    const uint64_t t1 = hn::timer::Stop();
    analyze_elapsed_ += t1 - t0;
    printf("Total analysis [s]: %f\n",
           static_cast<double>(analyze_elapsed_) * inv_freq);
  }

 private:
  // Updates an existing Accumulator (uniquely identified by biased_offset) or
  // adds one if this is the first time this thread analyzed that zone.
  // Uses a self-organizing list data structure, which avoids dynamic memory
  // allocations and is far faster than unordered_map. Loads, updates and
  // stores the entire Accumulator with vector instructions.
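  // For example, if zones_ currently holds [A, B, C] and C is updated, the
  // order afterwards is [A, C, B]: frequently updated zones migrate toward
  // the front and are found sooner on subsequent calls.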
  void UpdateOrAdd(const size_t biased_offset, const uint64_t num_calls,
                   const uint64_t duration) {
    HWY_DASSERT(biased_offset < (1ULL << Packet::kOffsetBits));

    // Special case for first zone: (maybe) update, without swapping.
    if (zones_[0].BiasedOffset() == biased_offset) {
      zones_[0].Add(num_calls, duration);
      HWY_DASSERT(zones_[0].BiasedOffset() == biased_offset);
      return;
    }

    // Look for a zone with the same offset.
    for (size_t i = 1; i < num_zones_; ++i) {
      if (zones_[i].BiasedOffset() == biased_offset) {
        zones_[i].Add(num_calls, duration);
        HWY_DASSERT(zones_[i].BiasedOffset() == biased_offset);
        // Swap with predecessor (more conservative than move to front,
        // but at least as successful).
        const Accumulator prev = zones_[i - 1];
        zones_[i - 1] = zones_[i];
        zones_[i] = prev;
        return;
      }
    }

    // Not found; create a new Accumulator.
    HWY_DASSERT(num_zones_ < kMaxZones);
    Accumulator* HWY_RESTRICT zone = zones_ + num_zones_;
    zone->Set(biased_offset, num_calls, duration);
    HWY_DASSERT(zone->BiasedOffset() == biased_offset);
    ++num_zones_;
  }

  // Each instantiation of a function template seems to get its own copy of
  // __func__ and GCC doesn't merge them. An N^2 search for duplicates is
  // acceptable because we only expect a few dozen zones.
  void MergeDuplicates() {
    const char* string_origin = StringOrigin();
    for (size_t i = 0; i < num_zones_; ++i) {
      const size_t biased_offset = zones_[i].BiasedOffset();
      const char* name = string_origin + biased_offset;
      // Separate num_calls from biased_offset so we can add them together.
      uint64_t num_calls = zones_[i].NumCalls();

      // Add any subsequent duplicates to num_calls and total_duration.
      for (size_t j = i + 1; j < num_zones_;) {
        if (!strcmp(name, string_origin + zones_[j].BiasedOffset())) {
          num_calls += zones_[j].NumCalls();
          zones_[i].Add(0, zones_[j].Duration());
          // Fill hole with last item.
          zones_[j] = zones_[--num_zones_];
        } else {  // Name differed, try next Accumulator.
          ++j;
        }
      }

      HWY_DASSERT(num_calls < (1ULL << Accumulator::kNumCallBits));

      // Re-pack regardless of whether any duplicates were found.
      zones_[i].Set(biased_offset, num_calls, zones_[i].Duration());
    }
  }

  uint64_t analyze_elapsed_ = 0;
  uint64_t self_overhead_ = 0;
  uint64_t child_overhead_ = 0;

  size_t depth_ = 0;      // Number of active zones.
  size_t num_zones_ = 0;  // Number of retired zones.

  alignas(HWY_ALIGNMENT) Node nodes_[kMaxDepth];         // Stack
  alignas(HWY_ALIGNMENT) Accumulator zones_[kMaxZones];  // Self-organizing list
};

// Per-thread packet storage, dynamically allocated.
class ThreadSpecific {
  static constexpr size_t kBufferCapacity = HWY_ALIGNMENT / sizeof(Packet);
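  // (Assuming the typical 64-byte HWY_ALIGNMENT and 8-byte Packet, this is
  // 8 packets, i.e. one cache line.)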

 public:
  // "name" is used to sanity-check offsets fit in kOffsetBits.
  explicit ThreadSpecific(const char* name)
      : max_packets_((PROFILER_THREAD_STORAGE << 20) / sizeof(Packet)),
        packets_(AllocateAligned<Packet>(max_packets_)),
        num_packets_(0),
        string_origin_(StringOrigin()) {
    // Even in optimized builds, verify that this zone's name offset fits
    // within the allotted space. If not, UpdateOrAdd is likely to overrun
    // zones_[]. Checking here on the cold path (only reached once per thread)
    // is cheap, but it only covers one zone.
    const size_t biased_offset = name - string_origin_;
    HWY_ASSERT(biased_offset <= (1ULL << Packet::kOffsetBits));
  }

  // Depends on Zone => defined below.
  void ComputeOverhead();

  void WriteEntry(const char* name, const uint64_t timestamp) {
    const size_t biased_offset = name - string_origin_;
    Write(Packet::Make(biased_offset, timestamp));
  }

  void WriteExit(const uint64_t timestamp) {
    const size_t biased_offset = Packet::kOffsetBias;
    Write(Packet::Make(biased_offset, timestamp));
  }

  void AnalyzeRemainingPackets() {
    // Ensures prior weakly-ordered streaming stores are globally visible.
    FlushStream();

    // Storage full => empty it.
    if (num_packets_ + buffer_size_ > max_packets_) {
      results_.AnalyzePackets(packets_.get(), num_packets_);
      num_packets_ = 0;
    }

    CopyBytes(buffer_, packets_.get() + num_packets_,
              buffer_size_ * sizeof(Packet));
    num_packets_ += buffer_size_;

    results_.AnalyzePackets(packets_.get(), num_packets_);
    num_packets_ = 0;
  }

  Results& GetResults() { return results_; }

 private:
  // Write packet to buffer/storage, emptying them as needed.
  void Write(const Packet packet) {
    // Buffer full => copy to storage.
    if (buffer_size_ == kBufferCapacity) {
      // Storage full => empty it.
      if (num_packets_ + kBufferCapacity > max_packets_) {
        results_.AnalyzePackets(packets_.get(), num_packets_);
        num_packets_ = 0;
      }
      // This buffering halves observer overhead and decreases the overall
      // runtime by about 3%. Casting is safe because the first member is u64.
      StreamCacheLine(
          reinterpret_cast<const uint64_t*>(buffer_),
          reinterpret_cast<uint64_t*>(packets_.get() + num_packets_));
      num_packets_ += kBufferCapacity;
      buffer_size_ = 0;
    }
    buffer_[buffer_size_] = packet;
    ++buffer_size_;
  }

  // Write-combining buffer to avoid cache pollution. Must be the first
  // non-static member to ensure cache-line alignment.
  Packet buffer_[kBufferCapacity];
  size_t buffer_size_ = 0;

  const size_t max_packets_;
  // Contiguous storage for zone enter/exit packets.
  AlignedFreeUniquePtr<Packet[]> packets_;
  size_t num_packets_;

  // Cached here because we already read this cache line on zone entry/exit.
  const char* HWY_RESTRICT string_origin_;

  Results results_;
};

class ThreadList {
 public:
  // Called from any thread.
  ThreadSpecific* Add(const char* name) {
    const size_t index = num_threads_.fetch_add(1, std::memory_order_relaxed);
    HWY_DASSERT(index < kMaxThreads);

    ThreadSpecific* ts = MakeUniqueAligned<ThreadSpecific>(name).release();
    threads_[index].store(ts, std::memory_order_release);
    return ts;
  }

  // Single-threaded.
  void PrintResults() {
    const auto acq = std::memory_order_acquire;
    const size_t num_threads = num_threads_.load(acq);

    ThreadSpecific* main = threads_[0].load(acq);
    main->AnalyzeRemainingPackets();

    for (size_t i = 1; i < num_threads; ++i) {
      ThreadSpecific* ts = threads_[i].load(acq);
      ts->AnalyzeRemainingPackets();
      main->GetResults().Assimilate(ts->GetResults());
    }

    if (num_threads != 0) {
      main->GetResults().Print();
    }
  }

 private:
  // Owning pointers.
  alignas(64) std::atomic<ThreadSpecific*> threads_[kMaxThreads];
  std::atomic<size_t> num_threads_{0};
};

// RAII zone enter/exit recorder constructed by the ZONE macro; also
// responsible for initializing ThreadSpecific.
class Zone {
 public:
  // "name" must be a string literal (see StringOrigin).
  HWY_NOINLINE explicit Zone(const char* name) {
    HWY_FENCE;
    ThreadSpecific* HWY_RESTRICT thread_specific = StaticThreadSpecific();
    if (HWY_UNLIKELY(thread_specific == nullptr)) {
      // Ensure the CPU supports our timer.
      char cpu[100];
      if (!platform::HaveTimerStop(cpu)) {
        HWY_ABORT("CPU %s is too old for PROFILER_ENABLED=1, exiting", cpu);
      }

      thread_specific = StaticThreadSpecific() = Threads().Add(name);
      // Must happen after setting StaticThreadSpecific, because
      // ComputeOverhead also calls Zone().
      thread_specific->ComputeOverhead();
    }

    // (Capture timestamp ASAP, not inside WriteEntry.)
    HWY_FENCE;
    const uint64_t timestamp = HWY_NAMESPACE::timer::Start();
    thread_specific->WriteEntry(name, timestamp);
  }

  HWY_NOINLINE ~Zone() {
    HWY_FENCE;
    const uint64_t timestamp = HWY_NAMESPACE::timer::Stop();
    StaticThreadSpecific()->WriteExit(timestamp);
    HWY_FENCE;
  }

  // Call exactly once after all threads have exited all zones.
  static void PrintResults() { Threads().PrintResults(); }

 private:
  // Returns reference to the thread's ThreadSpecific pointer (initially null).
  // Function-local static avoids needing a separate definition.
  static ThreadSpecific*& StaticThreadSpecific() {
    static thread_local ThreadSpecific* thread_specific;
    return thread_specific;
  }

  // Returns the singleton ThreadList. Non time-critical.
  static ThreadList& Threads() {
    static ThreadList threads_;
    return threads_;
  }
};

// Creates a zone starting from here until the end of the current scope.
// Timestamps will be recorded when entering and exiting the zone.
// "name" must be a string literal, which is ensured by merging with "".
#define PROFILER_ZONE(name)       \
  HWY_FENCE;                      \
  const hwy::Zone zone("" name);  \
  HWY_FENCE

// Creates a zone for an entire function (when placed at its beginning).
// Shorter/more convenient than ZONE.
#define PROFILER_FUNC             \
  HWY_FENCE;                      \
  const hwy::Zone zone(__func__); \
  HWY_FENCE

#define PROFILER_PRINT_RESULTS hwy::Zone::PrintResults

inline void ThreadSpecific::ComputeOverhead() {
  namespace hn = HWY_NAMESPACE;
  // Delay after capturing timestamps before/after the actual zone runs. Even
  // with frequency throttling disabled, this has a multimodal distribution,
  // including 32, 34, 48, 52, 59, 62.
  uint64_t self_overhead;
  {
    const size_t kNumSamples = 32;
    uint32_t samples[kNumSamples];
    for (size_t idx_sample = 0; idx_sample < kNumSamples; ++idx_sample) {
      const size_t kNumDurations = 1024;
      uint32_t durations[kNumDurations];

      for (size_t idx_duration = 0; idx_duration < kNumDurations;
           ++idx_duration) {
        {
          PROFILER_ZONE("Dummy Zone (never shown)");
        }
        const uint64_t duration = results_.ZoneDuration(buffer_);
        buffer_size_ = 0;
        durations[idx_duration] = static_cast<uint32_t>(duration);
        HWY_DASSERT(num_packets_ == 0);
      }
      robust_statistics::CountingSort(durations, kNumDurations);
      samples[idx_sample] = robust_statistics::Mode(durations, kNumDurations);
    }
    // Median.
    robust_statistics::CountingSort(samples, kNumSamples);
    self_overhead = samples[kNumSamples / 2];
    if (PROFILER_PRINT_OVERHEAD) {
      printf("Overhead: %zu\n", self_overhead);
    }
    results_.SetSelfOverhead(self_overhead);
  }

  // Delay before capturing start timestamp / after end timestamp.
  const size_t kNumSamples = 32;
  uint32_t samples[kNumSamples];
  for (size_t idx_sample = 0; idx_sample < kNumSamples; ++idx_sample) {
    const size_t kNumDurations = 16;
    uint32_t durations[kNumDurations];
    for (size_t idx_duration = 0; idx_duration < kNumDurations;
         ++idx_duration) {
      const size_t kReps = 10000;
      // Analysis time should not be included => must fit within buffer.
      HWY_DASSERT(kReps * 2 < max_packets_);
      std::atomic_thread_fence(std::memory_order_seq_cst);
      const uint64_t t0 = hn::timer::Start();
      for (size_t i = 0; i < kReps; ++i) {
        PROFILER_ZONE("Dummy");
      }
      FlushStream();
      const uint64_t t1 = hn::timer::Stop();
      HWY_DASSERT(num_packets_ + buffer_size_ == kReps * 2);
      buffer_size_ = 0;
      num_packets_ = 0;
      const uint64_t avg_duration = (t1 - t0 + kReps / 2) / kReps;
      durations[idx_duration] =
          static_cast<uint32_t>(ClampedSubtract(avg_duration, self_overhead));
    }
    robust_statistics::CountingSort(durations, kNumDurations);
    samples[idx_sample] = robust_statistics::Mode(durations, kNumDurations);
  }
  robust_statistics::CountingSort(samples, kNumSamples);
  const uint64_t child_overhead = samples[9 * kNumSamples / 10];
  if (PROFILER_PRINT_OVERHEAD) {
    printf("Child overhead: %zu\n", child_overhead);
  }
  results_.SetChildOverhead(child_overhead);
}

#pragma pack(pop)

}  // namespace hwy

#endif  // PROFILER_ENABLED || HWY_IDE

#if !PROFILER_ENABLED && !HWY_IDE
#define PROFILER_ZONE(name)
#define PROFILER_FUNC
#define PROFILER_PRINT_RESULTS()
#endif

#endif  // HIGHWAY_HWY_PROFILER_H_