// aligned_allocator.h
  1. // Copyright 2020 Google LLC
  2. // SPDX-License-Identifier: Apache-2.0
  3. //
  4. // Licensed under the Apache License, Version 2.0 (the "License");
  5. // you may not use this file except in compliance with the License.
  6. // You may obtain a copy of the License at
  7. //
  8. // http://www.apache.org/licenses/LICENSE-2.0
  9. //
  10. // Unless required by applicable law or agreed to in writing, software
  11. // distributed under the License is distributed on an "AS IS" BASIS,
  12. // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
  13. // See the License for the specific language governing permissions and
  14. // limitations under the License.
  15. #ifndef HIGHWAY_HWY_ALIGNED_ALLOCATOR_H_
  16. #define HIGHWAY_HWY_ALIGNED_ALLOCATOR_H_
  17. // Memory allocator with support for alignment and offsets.
  18. #include <algorithm>
  19. #include <array>
  20. #include <cassert>
  21. #include <cstdint>
  22. #include <cstring>
  23. #include <initializer_list>
  24. #include <memory>
  25. #include <type_traits>
  26. #include <utility>
  27. #include <vector>
  28. #include "hwy/base.h"
  29. #include "hwy/per_target.h"
  30. namespace hwy {
  31. // Minimum alignment of allocated memory for use in HWY_ASSUME_ALIGNED, which
  32. // requires a literal. To prevent false sharing, this should be at least the
  33. // L1 cache line size, usually 64 bytes. However, Intel's L2 prefetchers may
  34. // access pairs of lines, and M1 L2 and POWER8 lines are also 128 bytes.
  35. #define HWY_ALIGNMENT 128
  36. template <typename T>
  37. HWY_API constexpr bool IsAligned(T* ptr, size_t align = HWY_ALIGNMENT) {
  38. return reinterpret_cast<uintptr_t>(ptr) % align == 0;
  39. }
// Pointers to functions equivalent to malloc/free, with an opaque void*
// passed through to them (enables arena/pool allocators without globals).
using AllocPtr = void* (*)(void* opaque, size_t bytes);
using FreePtr = void (*)(void* opaque, void* memory);

// Returns null or a pointer to at least `payload_size` (which can be zero)
// bytes of newly allocated memory, aligned to the larger of HWY_ALIGNMENT and
// the vector size. Calls `alloc_ptr` with the passed `opaque_ptr` to obtain
// memory, or malloc() if `alloc_ptr` is null.
HWY_DLLEXPORT void* AllocateAlignedBytes(size_t payload_size,
                                         AllocPtr alloc_ptr = nullptr,
                                         void* opaque_ptr = nullptr);

// Frees all memory. No effect if `aligned_pointer` == nullptr, otherwise it
// must have been returned from a previous call to `AllocateAlignedBytes`.
// Calls `free_ptr` with the passed `opaque_ptr` to free the memory; if
// `free_ptr` is null, uses the default free().
HWY_DLLEXPORT void FreeAlignedBytes(const void* aligned_pointer,
                                    FreePtr free_ptr, void* opaque_ptr);
  57. // Class that deletes the aligned pointer passed to operator() calling the
  58. // destructor before freeing the pointer. This is equivalent to the
  59. // std::default_delete but for aligned objects. For a similar deleter equivalent
  60. // to free() for aligned memory see AlignedFreer().
  61. class AlignedDeleter {
  62. public:
  63. AlignedDeleter() : free_(nullptr), opaque_ptr_(nullptr) {}
  64. AlignedDeleter(FreePtr free_ptr, void* opaque_ptr)
  65. : free_(free_ptr), opaque_ptr_(opaque_ptr) {}
  66. template <typename T>
  67. void operator()(T* aligned_pointer) const {
  68. return DeleteAlignedArray(aligned_pointer, free_, opaque_ptr_,
  69. TypedArrayDeleter<T>);
  70. }
  71. private:
  72. template <typename T>
  73. static void TypedArrayDeleter(void* ptr, size_t size_in_bytes) {
  74. size_t elems = size_in_bytes / sizeof(T);
  75. for (size_t i = 0; i < elems; i++) {
  76. // Explicitly call the destructor on each element.
  77. (static_cast<T*>(ptr) + i)->~T();
  78. }
  79. }
  80. // Function prototype that calls the destructor for each element in a typed
  81. // array. TypeArrayDeleter<T> would match this prototype.
  82. using ArrayDeleter = void (*)(void* t_ptr, size_t t_size);
  83. HWY_DLLEXPORT static void DeleteAlignedArray(void* aligned_pointer,
  84. FreePtr free_ptr,
  85. void* opaque_ptr,
  86. ArrayDeleter deleter);
  87. FreePtr free_;
  88. void* opaque_ptr_;
  89. };
// Unique pointer to T with custom aligned deleter. This can be a single
// element U or an array of elements if T is a U[]. The custom aligned deleter
// calls the destructor on U, or on each element of a U[] in the array case,
// before freeing the aligned storage.
template <typename T>
using AlignedUniquePtr = std::unique_ptr<T, AlignedDeleter>;
  95. // Aligned memory equivalent of make_unique<T> using the custom allocators
  96. // alloc/free with the passed `opaque` pointer. This function calls the
  97. // constructor with the passed Args... and calls the destructor of the object
  98. // when the AlignedUniquePtr is destroyed.
  99. template <typename T, typename... Args>
  100. AlignedUniquePtr<T> MakeUniqueAlignedWithAlloc(AllocPtr alloc, FreePtr free,
  101. void* opaque, Args&&... args) {
  102. T* ptr = static_cast<T*>(AllocateAlignedBytes(sizeof(T), alloc, opaque));
  103. return AlignedUniquePtr<T>(new (ptr) T(std::forward<Args>(args)...),
  104. AlignedDeleter(free, opaque));
  105. }
  106. // Similar to MakeUniqueAlignedWithAlloc but using the default alloc/free
  107. // functions.
  108. template <typename T, typename... Args>
  109. AlignedUniquePtr<T> MakeUniqueAligned(Args&&... args) {
  110. T* ptr = static_cast<T*>(AllocateAlignedBytes(sizeof(T)));
  111. return AlignedUniquePtr<T>(new (ptr) T(std::forward<Args>(args)...),
  112. AlignedDeleter());
  113. }
  114. template <class T>
  115. struct AlignedAllocator {
  116. using value_type = T;
  117. AlignedAllocator() = default;
  118. template <class V>
  119. explicit AlignedAllocator(const AlignedAllocator<V>&) noexcept {}
  120. template <class V>
  121. value_type* allocate(V n) {
  122. static_assert(std::is_integral<V>::value,
  123. "AlignedAllocator only supports integer types");
  124. static_assert(sizeof(V) <= sizeof(std::size_t),
  125. "V n must be smaller or equal size_t to avoid overflow");
  126. return static_cast<value_type*>(
  127. AllocateAlignedBytes(static_cast<std::size_t>(n) * sizeof(value_type)));
  128. }
  129. template <class V>
  130. void deallocate(value_type* p, HWY_MAYBE_UNUSED V n) {
  131. return FreeAlignedBytes(p, nullptr, nullptr);
  132. }
  133. };
  134. template <class T, class V>
  135. constexpr bool operator==(const AlignedAllocator<T>&,
  136. const AlignedAllocator<V>&) noexcept {
  137. return true;
  138. }
  139. template <class T, class V>
  140. constexpr bool operator!=(const AlignedAllocator<T>&,
  141. const AlignedAllocator<V>&) noexcept {
  142. return false;
  143. }
// std::vector whose element storage comes from AlignedAllocator, i.e. is
// aligned to the larger of HWY_ALIGNMENT and the vector size.
template <class T>
using AlignedVector = std::vector<T, AlignedAllocator<T>>;
  146. // Helpers for array allocators (avoids overflow)
  147. namespace detail {
  148. // Returns x such that 1u << x == n (if n is a power of two).
  149. static inline constexpr size_t ShiftCount(size_t n) {
  150. return (n <= 1) ? 0 : 1 + ShiftCount(n / 2);
  151. }
  152. template <typename T>
  153. T* AllocateAlignedItems(size_t items, AllocPtr alloc_ptr, void* opaque_ptr) {
  154. constexpr size_t size = sizeof(T);
  155. constexpr bool is_pow2 = (size & (size - 1)) == 0;
  156. constexpr size_t bits = ShiftCount(size);
  157. static_assert(!is_pow2 || (1ull << bits) == size, "ShiftCount is incorrect");
  158. const size_t bytes = is_pow2 ? items << bits : items * size;
  159. const size_t check = is_pow2 ? bytes >> bits : bytes / size;
  160. if (check != items) {
  161. return nullptr; // overflowed
  162. }
  163. return static_cast<T*>(AllocateAlignedBytes(bytes, alloc_ptr, opaque_ptr));
  164. }
  165. } // namespace detail
  166. // Aligned memory equivalent of make_unique<T[]> for array types using the
  167. // custom allocators alloc/free. This function calls the constructor with the
  168. // passed Args... on every created item. The destructor of each element will be
  169. // called when the AlignedUniquePtr is destroyed.
  170. template <typename T, typename... Args>
  171. AlignedUniquePtr<T[]> MakeUniqueAlignedArrayWithAlloc(
  172. size_t items, AllocPtr alloc, FreePtr free, void* opaque, Args&&... args) {
  173. T* ptr = detail::AllocateAlignedItems<T>(items, alloc, opaque);
  174. if (ptr != nullptr) {
  175. for (size_t i = 0; i < items; i++) {
  176. new (ptr + i) T(std::forward<Args>(args)...);
  177. }
  178. }
  179. return AlignedUniquePtr<T[]>(ptr, AlignedDeleter(free, opaque));
  180. }
  181. template <typename T, typename... Args>
  182. AlignedUniquePtr<T[]> MakeUniqueAlignedArray(size_t items, Args&&... args) {
  183. return MakeUniqueAlignedArrayWithAlloc<T, Args...>(
  184. items, nullptr, nullptr, nullptr, std::forward<Args>(args)...);
  185. }
  186. // Custom deleter for std::unique_ptr equivalent to using free() as a deleter
  187. // but for aligned memory.
  188. class AlignedFreer {
  189. public:
  190. // Pass address of this to ctor to skip deleting externally-owned memory.
  191. static void DoNothing(void* /*opaque*/, void* /*aligned_pointer*/) {}
  192. AlignedFreer() : free_(nullptr), opaque_ptr_(nullptr) {}
  193. AlignedFreer(FreePtr free_ptr, void* opaque_ptr)
  194. : free_(free_ptr), opaque_ptr_(opaque_ptr) {}
  195. template <typename T>
  196. void operator()(T* aligned_pointer) const {
  197. // TODO(deymo): assert that we are using a POD type T.
  198. FreeAlignedBytes(aligned_pointer, free_, opaque_ptr_);
  199. }
  200. private:
  201. FreePtr free_;
  202. void* opaque_ptr_;
  203. };
// Unique pointer to a single POD, or (if T is U[]) an array of POD. The
// AlignedFreer deleter frees the memory without running destructors; for
// non-POD data use AlignedUniquePtr instead.
template <typename T>
using AlignedFreeUniquePtr = std::unique_ptr<T, AlignedFreer>;
  208. // Allocate an aligned and uninitialized array of POD values as a unique_ptr.
  209. // Upon destruction of the unique_ptr the aligned array will be freed.
  210. template <typename T>
  211. AlignedFreeUniquePtr<T[]> AllocateAligned(const size_t items, AllocPtr alloc,
  212. FreePtr free, void* opaque) {
  213. return AlignedFreeUniquePtr<T[]>(
  214. detail::AllocateAlignedItems<T>(items, alloc, opaque),
  215. AlignedFreer(free, opaque));
  216. }
  217. // Same as previous AllocateAligned(), using default allocate/free functions.
  218. template <typename T>
  219. AlignedFreeUniquePtr<T[]> AllocateAligned(const size_t items) {
  220. return AllocateAligned<T>(items, nullptr, nullptr, nullptr);
  221. }
  222. // A simple span containing data and size of data.
  223. template <typename T>
  224. class Span {
  225. public:
  226. Span() = default;
  227. Span(T* data, size_t size) : size_(size), data_(data) {}
  228. template <typename U>
  229. Span(U u) : Span(u.data(), u.size()) {}
  230. Span(std::initializer_list<const T> v) : Span(v.begin(), v.size()) {}
  231. // Copies the contents of the initializer list to the span.
  232. Span<T>& operator=(std::initializer_list<const T> v) {
  233. HWY_DASSERT(size_ == v.size());
  234. CopyBytes(v.begin(), data_, sizeof(T) * std::min(size_, v.size()));
  235. return *this;
  236. }
  237. // Returns the size of the contained data.
  238. size_t size() const { return size_; }
  239. // Returns a pointer to the contained data.
  240. T* data() { return data_; }
  241. T* data() const { return data_; }
  242. // Returns the element at index.
  243. T& operator[](size_t index) const { return data_[index]; }
  244. // Returns an iterator pointing to the first element of this span.
  245. T* begin() { return data_; }
  246. // Returns a const iterator pointing to the first element of this span.
  247. constexpr const T* cbegin() const { return data_; }
  248. // Returns an iterator pointing just beyond the last element at the
  249. // end of this span.
  250. T* end() { return data_ + size_; }
  251. // Returns a const iterator pointing just beyond the last element at the
  252. // end of this span.
  253. constexpr const T* cend() const { return data_ + size_; }
  254. private:
  255. size_t size_ = 0;
  256. T* data_ = nullptr;
  257. };
// A multi dimensional array containing an aligned buffer.
//
// To maintain alignment, the innermost dimension will be padded to ensure all
// innermost arrays are aligned.
template <typename T, size_t axes>
class AlignedNDArray {
  // Trivial T allows zero-filling the buffer with ZeroBytes and skipping
  // destructor calls on free (buffer_ uses AlignedFreer).
  static_assert(std::is_trivial<T>::value,
                "AlignedNDArray can only contain trivial types");

 public:
  // Move-only: declaring the move members leaves the copy members implicitly
  // deleted (buffer_ is a unique_ptr).
  AlignedNDArray(AlignedNDArray&& other) = default;
  AlignedNDArray& operator=(AlignedNDArray&& other) = default;

  // Constructs an array of the provided shape and fills it with zeros.
  explicit AlignedNDArray(std::array<size_t, axes> shape) : shape_(shape) {
    sizes_ = ComputeSizes(shape_);
    memory_shape_ = shape_;
    // Round the innermost dimension up to the number of bytes available for
    // SIMD operations on this architecture to make sure that each innermost
    // array is aligned from the first element.
    // NOTE(review): this rounds the innermost *element count* up to a
    // multiple of VectorBytes(), a *byte* count. Alignment still holds, but
    // this over-pads when sizeof(T) > 1 — confirm whether
    // VectorBytes() / sizeof(T) was intended.
    memory_shape_[axes - 1] = RoundUpTo(memory_shape_[axes - 1], VectorBytes());
    memory_sizes_ = ComputeSizes(memory_shape_);
    buffer_ = hwy::AllocateAligned<T>(memory_size());
    hwy::ZeroBytes(buffer_.get(), memory_size() * sizeof(T));
  }

  // Returns a span containing the innermost array at the provided indices.
  // Its length is the logical (unpadded) innermost extent: indices.size() is
  // axes - 1, so sizes_[indices.size()] == shape_[axes - 1].
  Span<T> operator[](std::array<const size_t, axes - 1> indices) {
    return Span<T>(buffer_.get() + Offset(indices), sizes_[indices.size()]);
  }
  // Returns a const span containing the innermost array at the provided
  // indices.
  Span<const T> operator[](std::array<const size_t, axes - 1> indices) const {
    return Span<const T>(buffer_.get() + Offset(indices),
                         sizes_[indices.size()]);
  }

  // Returns the shape of the array, which might be smaller than the allocated
  // buffer after padding the last axis to alignment.
  const std::array<size_t, axes>& shape() const { return shape_; }
  // Returns the shape of the allocated buffer, which might be larger than the
  // used size of the array after padding to alignment.
  const std::array<size_t, axes>& memory_shape() const { return memory_shape_; }
  // Returns the size of the array (the product of shape()), which might be
  // smaller than the allocated buffer after padding the last axis.
  size_t size() const { return sizes_[0]; }
  // Returns the size of the allocated buffer (the product of memory_shape()),
  // which might be larger than the used size of the array after padding.
  size_t memory_size() const { return memory_sizes_[0]; }
  // Returns a pointer to the allocated buffer.
  T* data() { return buffer_.get(); }
  // Returns a const pointer to the buffer.
  const T* data() const { return buffer_.get(); }

  // Truncates the array by updating its shape.
  //
  // The new shape must be equal to or less than the old shape in all axes.
  //
  // Doesn't modify underlying memory: memory_shape_/memory_sizes_ (and hence
  // Offset computation) stay unchanged, only the logical shape shrinks.
  void truncate(const std::array<size_t, axes>& new_shape) {
#if HWY_IS_DEBUG_BUILD
    for (size_t axis_index = 0; axis_index < axes; ++axis_index) {
      HWY_ASSERT(new_shape[axis_index] <= shape_[axis_index]);
    }
#endif
    shape_ = new_shape;
    sizes_ = ComputeSizes(shape_);
  }

 private:
  std::array<size_t, axes> shape_;         // logical extents
  std::array<size_t, axes> memory_shape_;  // extents incl. innermost padding
  // sizes_[i] is the element count of one sub-array at depth i; sizes_[axes]
  // is 1 and sizes_[0] is the total count. memory_sizes_ is the same for the
  // padded shape and therefore serves as the stride table.
  std::array<size_t, axes + 1> sizes_;
  std::array<size_t, axes + 1> memory_sizes_;
  hwy::AlignedFreeUniquePtr<T[]> buffer_;

  // Computes the element offset in the buffer of the innermost array selected
  // by the provided indices, using the padded (memory) sub-array sizes.
  size_t Offset(std::array<const size_t, axes - 1> indices) const {
    size_t offset = 0;
    size_t shape_index = 0;
    for (const size_t axis_index : indices) {
      offset += memory_sizes_[shape_index + 1] * axis_index;
      shape_index++;
    }
    return offset;
  }

  // Computes the sizes of all sub arrays based on the sizes of each axis.
  //
  // Does this by multiplying the size of each axis with the previous one in
  // reverse order, starting with the conceptual axis of size 1 containing the
  // actual elements in the array.
  static std::array<size_t, axes + 1> ComputeSizes(
      std::array<size_t, axes> shape) {
    std::array<size_t, axes + 1> sizes;
    size_t axis = shape.size();
    sizes[axis] = 1;
    while (axis > 0) {
      --axis;
      sizes[axis] = sizes[axis + 1] * shape[axis];
    }
    return sizes;
  }
};
  354. } // namespace hwy
  355. #endif // HIGHWAY_HWY_ALIGNED_ALLOCATOR_H_