From a84f920a860cf5a5c720e025e5267f696862c1c8 Mon Sep 17 00:00:00 2001 From: Joseph Schuchart Date: Thu, 7 Nov 2024 21:10:55 -0500 Subject: [PATCH 01/39] Replace buffer tracking with serialization inspection We have been tracking the buffers of structures by having the buffer register itself with some metadata. This is clumsy and error-prone (no support for nesting, for example). Now we use a specialized archive to inspect all members of a value and apply some functor to each buffer (or its internal parsec data). This is also the first step towards a structure derived from parsec_data_copy_t to better manage the life-time of the host copy. We now only allow shared_ptr for non-owning buffers so some adjustments must be made by users. Signed-off-by: Joseph Schuchart --- examples/matrixtile.h | 40 ++-- tests/unit/device_coro.cc | 45 +++- ttg/ttg/buffer.h | 87 ++++++++ ttg/ttg/parsec/buffer.h | 394 +++++++++++++++++++++------------ ttg/ttg/parsec/parsec_data.h | 16 ++ ttg/ttg/parsec/ttg.h | 340 ++++++++++++++-------------- ttg/ttg/parsec/ttg_data_copy.h | 323 --------------------------- 7 files changed, 580 insertions(+), 665 deletions(-) create mode 100644 ttg/ttg/parsec/parsec_data.h diff --git a/examples/matrixtile.h b/examples/matrixtile.h index 203c58bf29..e30ef2eec7 100644 --- a/examples/matrixtile.h +++ b/examples/matrixtile.h @@ -62,6 +62,10 @@ class MatrixTile : public ttg::TTValue> { #endif // DEBUG_TILES_VALUES } + struct non_owning_deleter { + void operator()(T* ptr) { } + }; + public: MatrixTile() {} @@ -85,7 +89,7 @@ class MatrixTile : public ttg::TTValue> { */ MatrixTile(std::size_t rows, std::size_t cols, T* data, std::size_t lda) : ttvalue_type() - , _buffer(data, lda*cols) + , _buffer(std::unique_ptr(data, non_owning_deleter{}), lda*cols) , _rows(rows) , _cols(cols) , _lda(lda) @@ -187,6 +191,17 @@ class MatrixTile : public ttg::TTValue> { o << " } "; return o; } + + template + void serialize(Archive& ar, const unsigned int version) { + serialize(ar); + } + + template + void serialize(Archive& ar) { + ar & rows() & cols() & lda(); + ar & buffer(); + } }; namespace ttg { @@ -203,30 +218,7 @@ namespace ttg { } // namespace ttg #ifdef TTG_SERIALIZATION_SUPPORTS_MADNESS -namespace madness { - namespace archive { - template - struct ArchiveStoreImpl> { - static inline void store(const Archive& ar, const MatrixTile& tile) { - ar << tile.rows() << tile.cols() << tile.lda(); - ar << wrap(tile.data(), tile.rows() * tile.cols()); - } - }; - - template - struct ArchiveLoadImpl> { - static inline void load(const Archive& ar, MatrixTile& tile) { - std::size_t rows, cols, lda; - ar >> rows >> cols >> lda; - tile = MatrixTile(rows, cols, lda); - ar >> wrap(tile.data(), tile.rows() * tile.cols()); // MatrixTile(bm.rows(), bm.cols()); - } - }; - } // namespace archive -} // namespace madness - static_assert(madness::is_serializable_v>); - #endif // TTG_SERIALIZATION_SUPPORTS_MADNESS #endif // TTG_EXAMPLES_MATRIX_TILE_H diff --git a/tests/unit/device_coro.cc b/tests/unit/device_coro.cc index 8ed015aadf..83b9fc3c93 100644 --- a/tests/unit/device_coro.cc +++ b/tests/unit/device_coro.cc @@ -12,21 +12,44 @@ struct value_t { template void serialize(Archive& ar, const unsigned int version) { + serialize(ar); + } + + template + void serialize(Archive& ar) { ar& quark; - ar& db; // input: + ar& db; } }; -#ifdef TTG_SERIALIZATION_SUPPORTS_MADNESS -/* devicebuf is non-POD so provide serialization - * information for members not a devicebuf */ -namespace madness::archive { - template - struct ArchiveSerializeImpl { - static inline void serialize(const Archive& ar, value_t& obj) { ar& obj.quark & obj.db; }; - }; -} // namespace madness::archive -#endif // TTG_SERIALIZATION_SUPPORTS_MADNESS +struct nested_value_t { + value_t v; + ttg::Buffer db; + + template + void serialize(Archive& ar, const unsigned int version) { + serialize(ar); + } + + template + void serialize(Archive& ar) { + ar& v; + ar& db; + } +}; + +TEST_CASE("Device", "coro") { + SECTION("buffer-inspection") { + value_t v1; + std::size_t i = 0; + ttg::detail::buffer_apply(v1, [&](const ttg::Buffer& b){ i++; }); + CHECK(i == 1); + + nested_value_t v2; + i = 0; + ttg::detail::buffer_apply(v2, [&](const ttg::Buffer& b){ i++; }); + } +} #if defined(TTG_HAVE_DEVICE) && defined(TTG_IMPL_DEVICE_SUPPORT) diff --git a/ttg/ttg/buffer.h b/ttg/ttg/buffer.h index 59a9264155..e3b9a1d967 100644 --- a/ttg/ttg/buffer.h +++ b/ttg/ttg/buffer.h @@ -6,6 +6,8 @@ #include "ttg/fwd.h" #include "ttg/util/meta.h" +#include + namespace ttg { template>> @@ -32,4 +34,89 @@ namespace meta { } // namespace ttg +#ifdef TTG_SERIALIZATION_SUPPORTS_MADNESS +#include + +namespace madness { + namespace archive { + template + struct BufferInspectorArchive : public madness::archive::BaseOutputArchive { + private: + Fn m_fn; + + public: + template + BufferInspectorArchive(_Fn&& fn) + : m_fn(fn) + { } + + /// Stores (counts) data into the memory buffer. + + /// The function only appears (due to \c enable_if) if \c T is + /// serializable. + /// \tparam T Type of the data to be stored (counted). + /// \param[in] t Pointer to the data to be stored (counted). + /// \param[in] n Size of data to be stored (counted). + template + void store(const ttg::Buffer* t, long n) const { + /* invoke the function on each buffer */ + for (std::size_t i = 0; i < n; ++i) { + m_fn(t[i]); + } + } + + template + void store(const T* t, long n) const { + /* nothing to be done for other types */ + } + + /// Open a buffer with a specific size. + void open(std::size_t /*hint*/) {} + + /// Close the archive. + void close() {} + + /// Flush the archive. + void flush() {} + + /// Return the amount of data stored (counted) in the buffer. + /// \return The amount of data stored (counted) in the buffer (zero). + std::size_t size() const { + return 0; + }; + }; + + /* deduction guide */ + template + BufferInspectorArchive(Fn&&) -> BufferInspectorArchive; + } // namespace archive + + template + struct is_archive> : std::true_type {}; + + template + struct is_output_archive> : std::true_type {}; + + template + struct is_default_serializable_helper, T, + std::enable_if_t::value>> + : std::true_type {}; + + template + struct is_default_serializable_helper, ttg::Buffer> + : std::true_type {}; +} // namespace madness + +namespace ttg::detail { + template + requires(madness::is_serializable_v, T>) + void buffer_apply(T&& t, Fn&& fn) { + madness::archive::BufferInspectorArchive ar(std::forward(fn)); + ar & t; + } +} // namespace ttg::detail + +#endif // TTG_SERIALIZATION_SUPPORTS_MADNESS + + #endif // TTG_buffer_H \ No newline at end of file diff --git a/ttg/ttg/parsec/buffer.h b/ttg/ttg/parsec/buffer.h index 7bcccbfd54..382a7a9df8 100644 --- a/ttg/ttg/parsec/buffer.h +++ b/ttg/ttg/parsec/buffer.h @@ -3,6 +3,7 @@ #include #include +#include #include #include #include @@ -24,6 +25,194 @@ namespace detail { // fwd decl template parsec_data_t* get_parsec_data(const ttg_parsec::Buffer& db); + + template + struct empty_allocator { + + using value_type = std::decay_t; + + value_type* allocate(std::size_t size) { + throw std::runtime_error("Allocate on empty allocator!"); + } + + void deallocate(value_type* ptr, std::size_t size) { + throw std::runtime_error("Deallocate on empty allocator!"); + } + }; + + /** + * Wrapper type to carry the Allocator into types that are using + * the PaRSEC object system. + */ + template + struct ttg_parsec_data_types { + using allocator_traits = std::allocator_traits; + using allocator_type = typename allocator_traits::allocator_type; + using value_type = typename allocator_traits::value_type; + + /* used as a hook into the PaRSEC object management system + * so we can release the memory back to the allocator once + * data copy is destroyed */ + struct data_copy_type : public parsec_data_copy_t + { + private: + [[no_unique_address]] + allocator_type m_allocator; + PtrT m_ptr; // keep a reference if PtrT is a shared_ptr + std::size_t m_size; + + void allocate(std::size_t size) { + if constexpr (std::is_pointer_v) { + m_ptr = allocator_traits::allocate(m_allocator, size); + } + this->device_private = &(*m_ptr); + m_size = size; + } + + void deallocate() { + allocator_traits::deallocate(m_allocator, static_cast(this->device_private), this->m_size); + this->device_private = nullptr; + this->m_size = 0; + } + + public: + + /* allocation through PARSEC_OBJ_NEW, so no inline constructor */ + + void construct(PtrT ptr, std::size_t size) { + m_allocator = allocator_type{}; + constexpr const bool is_empty_allocator = std::is_same_v>; + assert(is_empty_allocator); + m_ptr = std::move(ptr); + this->device_private = &(*m_ptr); + } + + void construct(std::size_t size, const allocator_type& alloc = allocator_type()) { + constexpr const bool is_empty_allocator = std::is_same_v>; + assert(!is_empty_allocator); + m_allocator = alloc; + allocate(size); + this->device_private = &(*m_ptr); + } + + ~data_copy_type() { + this->deallocate(); + } + }; + +#if 0 + /** + * derived from parsec_data_t managing an allocator for host memory + * and creating a host copy through a callback + */ + struct data_type : public parsec_data_t { + private: + [[no_unique_address]] + Allocator host_allocator; + + allocator_type& get_allocator_reference() { return static_cast(*this); } + + element_type* do_allocate(std::size_t n) { + return allocator_traits::allocate(get_allocator_reference(), n); + } + + public: + + explicit data_type(const Allocator& host_alloc = Allocator()) + : host_allocator(host_alloc) + { } + + void* allocate(std::size_t size) { + return do_allocate(size); + } + + void deallocate(void* ptr, std::size_t size) { + allocator_traits::deallocate(get_allocator_reference(), ptr, size); + } + }; + + /* Allocate the host copy for a given data_t */ + static parsec_data_copy_t* data_copy_allocate(parsec_data_t* pdata, int device) { + if (device != 0) return nullptr; // we only allocate for the host + data_type* data = static_cast(pdata); // downcast + data_copy_type* copy = PARSEC_OBJ_NEW(data_copy_type); + copy->device_private = data->allocate(data->nb_elts); + return copy; + } + /** + * Create the PaRSEC object infrastructure for the data copy type + */ + inline void data_construct(data_type* obj) + { + obj->allocate_cb = &data_copy_allocate; + } + + static void data_destruct(data_type* obj) + { + /* cleanup alloctor instance */ + obj->~data_type(); + } + + static constexpr PARSEC_OBJ_CLASS_INSTANCE(data_type, parsec_data_t, + data_copy_construct, + data_copy_destruct); + +#endif // 0 + + /** + * Create the PaRSEC object infrastructure for the data copy type + */ + static void data_copy_construct(data_copy_type* obj) + { + /* nothing to do */ + } + + static void data_copy_destruct(data_copy_type* obj) + { + obj->~data_copy_type(); // call destructor + } + + inline static PARSEC_OBJ_CLASS_INSTANCE(data_copy_type, parsec_data_copy_t, + data_copy_construct, + data_copy_destruct); + + static parsec_data_t * create_data(std::size_t size, + const allocator_type& allocator = allocator_type()) { + parsec_data_t *data = PARSEC_OBJ_NEW(parsec_data_t); + data->owner_device = 0; + data->nb_elts = size; + + /* create the host copy and allocate host memory */ + data_copy_type *copy = PARSEC_OBJ_NEW(data_copy_type); + copy->construct(size, allocator); + parsec_data_copy_attach(data, copy, 0); + + /* adjust data flags */ + data->device_copies[0]->flags |= PARSEC_DATA_FLAG_PARSEC_MANAGED; + data->device_copies[0]->coherency_state = PARSEC_DATA_COHERENCY_SHARED; + data->device_copies[0]->version = 1; + + return data; + } + + static parsec_data_t * create_data(PtrT& ptr, std::size_t size) { + parsec_data_t *data = PARSEC_OBJ_NEW(parsec_data_t); + data->owner_device = 0; + data->nb_elts = size; + + /* create the host copy and allocate host memory */ + data_copy_type *copy = PARSEC_OBJ_NEW(data_copy_type); + copy->construct(ptr, size); + parsec_data_copy_attach(data, copy, 0); + + /* adjust data flags */ + data->device_copies[0]->flags |= PARSEC_DATA_FLAG_PARSEC_MANAGED; + data->device_copies[0]->coherency_state = PARSEC_DATA_COHERENCY_SHARED; + data->device_copies[0]->version = 1; + + return data; + } + }; } // namespace detail /** @@ -38,8 +227,7 @@ namespace detail { * tracking of the containing object. */ template -struct Buffer : public detail::ttg_parsec_data_wrapper_t - , private Allocator { +struct Buffer { /* TODO: add overloads for T[]? */ using value_type = std::remove_all_extents_t; @@ -47,110 +235,95 @@ struct Buffer : public detail::ttg_parsec_data_wrapper_t using const_pointer_type = const std::remove_const_t*; using element_type = std::decay_t; - using allocator_traits = std::allocator_traits; - using allocator_type = typename allocator_traits::allocator_type; - static_assert(std::is_trivially_copyable_v, "Only trivially copyable types are supported for devices."); private: using delete_fn_t = std::function; - using host_data_ptr = std::add_pointer_t; - host_data_ptr m_host_data = nullptr; + using host_data_ptr = std::add_pointer_t; + parsec_data_t *m_data = nullptr; std::size_t m_count = 0; - bool m_owned= false; - static void delete_non_owned(element_type *ptr) { - // nothing to be done, we don't own the memory - } + friend parsec_data_t* detail::get_parsec_data(const ttg_parsec::Buffer&); - friend parsec_data_t* detail::get_parsec_data(const ttg_parsec::Buffer&); - allocator_type& get_allocator_reference() { return static_cast(*this); } + void release_data() { + if (nullptr == m_data) return; - element_type* allocate(std::size_t n) { - return allocator_traits::allocate(get_allocator_reference(), n); - } + /* first: drop our reference to the parsec_data_t */ + PARSEC_OBJ_RELEASE(m_data); + /** + * second: drop a second reference to the parsec_data_t, for the host-side data_t. + * + * There are two cases: + * 1) There was only a host data. By dropping this reference + * the data_t is destroyed, destroying the host-side data copy as well. + * 2) There was a device-side data copy with a reference to the data_t. + * This reference will be released once the device-side data copy is released, + * which will finally destroy the host-side data-copy. + */ + PARSEC_OBJ_RELEASE(m_data); - void deallocate() { - allocator_traits::deallocate(get_allocator_reference(), m_host_data, m_count); + /* set data to null so we don't repeat the above */ + m_data = nullptr; } public: Buffer() - : ttg_parsec_data_wrapper_t() - , allocator_type() - , m_host_data(nullptr) - , m_count(0) - , m_owned(false) { } - /** - * Allocates n elements, unitialized - * By default, data is synchronized to the device, allowing codes - * to fill the buffer before making it available on the device. - * Passing ttg::scope::Allocate will prevent the initial synchronization. - * Subsequent data transfers behave as expected (i.e., data is transferred - * to the host and other devices as needed). - */ - Buffer(std::size_t n, ttg::scope scope = ttg::scope::SyncIn) - : ttg_parsec_data_wrapper_t() - , allocator_type() - , m_host_data(allocate(n)) + Buffer(std::size_t n) + : m_data(detail::ttg_parsec_data_types::create_data(n*sizeof(element_type))) , m_count(n) - , m_owned(true) - { - //std::cout << "buffer " << this << " ctor count " - // << m_count << "(" << m_host_data << ") ttg_copy " - // << m_ttg_copy - // << " parsec_data " << m_data.get() << std::endl; - this->reset_parsec_data(m_host_data, n*sizeof(element_type), (scope == ttg::scope::SyncIn)); - } + { } /** * Constructing a buffer using application-managed memory. - * The memory pointed to by ptr must be accessible during - * the life-time of the buffer. - * - * Passing ttg::scope::Allocate will prevent the initial synchronization. - * Subsequent data transfers behave as expected (i.e., data is transferred - * to the host and other devices as needed). + * The shared_ptr will ensure that the memory is not free'd before + * the runtime has released all of its references. */ - Buffer(pointer_type ptr, std::size_t n = 1, ttg::scope scope = ttg::scope::SyncIn) - : ttg_parsec_data_wrapper_t() - , allocator_type() - , m_host_data(const_cast(ptr)) + Buffer(std::shared_ptr ptr, std::size_t n) + : m_data(detail::ttg_parsec_data_types, + detail::empty_allocator> + ::create_data(ptr, n*sizeof(element_type))) , m_count(n) - , m_owned(false) - { - //std::cout << "buffer " << this << " ctor ptr " << ptr << "count " - // << m_count << "(" << m_host_data << ") ttg_copy " - // << m_ttg_copy - // << " parsec_data " << m_data.get() << std::endl; - this->reset_parsec_data(m_host_data, n*sizeof(element_type), (scope == ttg::scope::SyncIn)); - } + { } + + Buffer(std::shared_ptr ptr) + : m_data(detail::ttg_parsec_data_types, + detail::empty_allocator> + ::create_data(ptr, sizeof(element_type))) + , m_count(1) + { } + + Buffer(std::unique_ptr ptr, std::size_t n) + : m_data(detail::ttg_parsec_data_types, + detail::empty_allocator> + ::create_data(ptr, n*sizeof(element_type))) + , m_count(n) + { } + + Buffer(std::unique_ptr ptr) + : m_data(detail::ttg_parsec_data_types, + detail::empty_allocator> + ::create_data(ptr, sizeof(element_type))) + , m_count(1) + { } virtual ~Buffer() { - if (m_owned) { - deallocate(); - m_owned = false; - } + release_data(); unpin(); // make sure the copies are not pinned } /* allow moving device buffers */ Buffer(Buffer&& db) - : ttg_parsec_data_wrapper_t(std::move(db)) - , allocator_type(std::move(db)) - , m_host_data(db.m_host_data) + : m_data(db.m_data) , m_count(db.m_count) - , m_owned(db.m_owned) { - db.m_host_data = nullptr; + db.m_data = nullptr; db.m_count = 0; - db.m_owned = false; } /* explicitly disable copying of buffers @@ -160,11 +333,8 @@ struct Buffer : public detail::ttg_parsec_data_wrapper_t /* allow moving device buffers */ Buffer& operator=(Buffer&& db) { - ttg_parsec_data_wrapper_t::operator=(std::move(db)); - allocator_type::operator=(std::move(db)); - std::swap(m_host_data, db.m_host_data); + std::swap(m_data, db.m_data); std::swap(m_count, db.m_count); - std::swap(m_owned, db.m_owned); //std::cout << "buffer " << this << " other " << &db << " mv op ttg_copy " << m_ttg_copy << std::endl; //std::cout << "buffer::move-assign from " << &db << " ttg-copy " << db.m_ttg_copy // << " to " << this << " ttg-copy " << m_ttg_copy @@ -252,7 +422,7 @@ struct Buffer : public detail::ttg_parsec_data_wrapper_t assert(is_valid()); if (empty()) return nullptr; int device_id = detail::ttg_device_to_parsec_device(device); - return static_cast(parsec_data_get_ptr(m_data.get(), device_id)); + return static_cast(parsec_data_get_ptr(m_data, device_id)); } /* get the device pointer at the given device @@ -261,23 +431,21 @@ struct Buffer : public detail::ttg_parsec_data_wrapper_t assert(is_valid()); if (empty()) return nullptr; int device_id = detail::ttg_device_to_parsec_device(device); - return static_cast(parsec_data_get_ptr(m_data.get(), device_id)); + return static_cast(parsec_data_get_ptr(m_data, device_id)); } pointer_type host_ptr() { - if (empty()) return nullptr; - return static_cast(parsec_data_get_ptr(m_data.get(), 0)); + return static_cast(parsec_data_get_ptr(m_data, 0)); } const_pointer_type host_ptr() const { - if (empty()) return nullptr; - return static_cast(parsec_data_get_ptr(m_data.get(), 0)); + return static_cast(parsec_data_get_ptr(m_data, 0)); } bool is_valid_on(const ttg::device::Device& device) const { assert(is_valid()); int device_id = detail::ttg_device_to_parsec_device(device); - return (parsec_data_get_ptr(m_data.get(), device_id) != nullptr); + return (parsec_data_get_ptr(m_data, device_id) != nullptr); } void allocate_on(const ttg::device::Device& device_id) { @@ -332,53 +500,16 @@ struct Buffer : public detail::ttg_parsec_data_wrapper_t /* Reallocate the buffer with count elements */ void reset(std::size_t n, ttg::scope scope = ttg::scope::SyncIn) { - /* TODO: can we resize if count is smaller than m_count? */ + release_data(); - if (m_owned) { - deallocate(); - m_owned = false; + if (n > 0) { + m_data = detail::ttg_parsec_data_types + ::create_data(n*sizeof(element_type)); } - if (n == 0) { - m_host_data = nullptr; - m_owned = false; - } else { - m_host_data = allocate(n); - m_owned = true; - } - reset_parsec_data(m_host_data, n*sizeof(element_type), (scope == ttg::scope::SyncIn)); - //std::cout << "buffer::reset(" << count << ") ptr " << m_host_data.get() - // << " ttg_copy " << m_ttg_copy - // << " parsec_data " << m_data.get() << std::endl; m_count = n; } - /* Reset the buffer to use the ptr to count elements */ - void reset(pointer_type ptr, std::size_t n = 1, ttg::scope scope = ttg::scope::SyncIn) { - /* TODO: can we resize if count is smaller than m_count? */ - if (n == m_count) { - return; - } - - if (m_owned) { - deallocate(); - } - - if (nullptr == ptr) { - m_host_data = nullptr; - m_count = 0; - m_owned = false; - } else { - m_host_data = ptr; - m_count = n; - m_owned = false; - } - reset_parsec_data(m_host_data, n*sizeof(element_type), (scope == ttg::scope::SyncIn)); - //std::cout << "buffer::reset(" << ptr << ", " << count << ") ptr " << m_host_data.get() - // << " ttg_copy " << m_ttg_copy - // << " parsec_data " << m_data.get() << std::endl; - } - /** * Resets the scope of the buffer. * If scope is SyncIn then the next time @@ -434,15 +565,11 @@ struct Buffer : public detail::ttg_parsec_data_wrapper_t if constexpr (ttg::detail::is_output_archive_v) { std::size_t s = size(); ar& s; - assert(m_ttg_copy != nullptr); // only tracked objects allowed - m_ttg_copy->iovec_add(ttg::iovec{s*sizeof(T), current_device_ptr()}); } else { std::size_t s; ar & s; /* initialize internal pointers and then reset */ reset(s); - assert(m_ttg_copy != nullptr); // only tracked objects allowed - m_ttg_copy->iovec_add(ttg::iovec{s*sizeof(T), current_device_ptr()}); } } #endif // TTG_SERIALIZATION_SUPPORTS_BOOST @@ -455,21 +582,12 @@ struct Buffer : public detail::ttg_parsec_data_wrapper_t if constexpr (ttg::detail::is_output_archive_v) { std::size_t s = size(); ar& s; - assert(m_ttg_copy != nullptr); // only tracked objects allowed - /* transfer from the current device - * note: if the transport layer (MPI) does not support device transfers - * the data will have been pushed out */ - m_ttg_copy->iovec_add(ttg::iovec{s*sizeof(T), current_device_ptr()}); } else { std::size_t s; ar & s; //std::cout << "serialize(IN) buffer " << this << " size " << s << std::endl; /* initialize internal pointers and then reset */ reset(s); - assert(m_ttg_copy != nullptr); // only tracked objects allowed - /* transfer to the current device - * TODO: how can we make sure the device copy is not evicted? */ - m_ttg_copy->iovec_add(ttg::iovec{s*sizeof(T), current_device_ptr()}); } } #endif // TTG_SERIALIZATION_SUPPORTS_MADNESS @@ -478,10 +596,10 @@ struct Buffer : public detail::ttg_parsec_data_wrapper_t namespace detail { template parsec_data_t* get_parsec_data(const ttg_parsec::Buffer& db) { - return const_cast(db.m_data.get()); + return const_cast(db.m_data); } } // namespace detail } // namespace ttg_parsec -#endif // TTG_PARSEC_BUFFER_H \ No newline at end of file +#endif // TTG_PARSEC_BUFFER_H diff --git a/ttg/ttg/parsec/parsec_data.h b/ttg/ttg/parsec/parsec_data.h new file mode 100644 index 0000000000..843bf82a29 --- /dev/null +++ b/ttg/ttg/parsec/parsec_data.h @@ -0,0 +1,16 @@ +#ifndef TTG_PARSEC_PARSEC_DATA_H +#define TTG_PARSEC_PARSEC_DATA_H + +#include "ttg/parsec/buffer.h" +#include "ttg/buffer.h" + +namespace ttg_parsec::detail { + template + void foreach_parsec_data(Value&& value, Fn&& fn) { + ttg::detail::buffer_apply(value, [&](B&& b){ + fn(detail::get_parsec_data(b)); + }); + } +} // namespace ttg_parsec::detail + +#endif // TTG_PARSEC_PARSEC_DATA_H \ No newline at end of file diff --git a/ttg/ttg/parsec/ttg.h b/ttg/ttg/parsec/ttg.h index 2814cf3247..903c3feea2 100644 --- a/ttg/ttg/parsec/ttg.h +++ b/ttg/ttg/parsec/ttg.h @@ -48,6 +48,7 @@ #include "ttg/parsec/devicefunc.h" #include "ttg/parsec/ttvalue.h" #include "ttg/device/task.h" +#include "ttg/parsec/parsec_data.h" #include #include @@ -743,18 +744,25 @@ namespace ttg_parsec { } #endif // 0 - template - inline void transfer_ownership_impl(ttg_data_copy_t *copy, int device) { - if constexpr(!std::is_const_v>) { - copy->transfer_ownership(PARSEC_FLOW_ACCESS_RW, device); - copy->inc_current_version(); + template + inline void transfer_ownership_impl(T&& arg, int device) { + if constexpr(!std::is_const_v>) { + ttg::detail::buffer_apply(arg, [&](auto&& buffer){ + auto *data = detail::get_parsec_data(buffer); + parsec_data_transfer_ownership_to_copy(data, device, PARSEC_FLOW_ACCESS_RW); + /* make sure we increment the version since we will modify the data */ + data->device_copies[0]->version++; + }); } } template inline void transfer_ownership(parsec_ttg_task_t *me, int device, std::index_sequence) { /* transfer ownership of each data */ - int junk[] = {0, (transfer_ownership_impl(me->copies[Is], device), 0)...}; + int junk[] = {0, + (transfer_ownership_impl( + *reinterpret_cast> *>( + me->copies[Is]->get_ptr()), device), 0)...}; junk[0]++; } @@ -1936,11 +1944,6 @@ namespace ttg_parsec { uint64_t pack(T &obj, void *bytes, uint64_t pos, detail::ttg_data_copy_t *copy = nullptr) { using dd_t = ttg::default_data_descriptor>; uint64_t payload_size = dd_t::payload_size(&obj); - if (copy) { - /* reset any tracked data, we don't care about the packing from the payload size */ - copy->iovec_reset(); - } - if constexpr (!dd_t::serialize_size_is_const) { pos = ttg::default_data_descriptor::pack_payload(&payload_size, sizeof(uint64_t), pos, bytes); } @@ -2129,8 +2132,6 @@ namespace ttg_parsec { #endif // 0 /* unpack the object, potentially discovering iovecs */ pos = unpack(*static_cast(copy->get_ptr()), msg->bytes, pos); - //std::cout << "set_arg_from_msg iovec_begin num_iovecs " << num_iovecs << " distance " << std::distance(copy->iovec_begin(), copy->iovec_end()) << std::endl; - assert(std::distance(copy->iovec_begin(), copy->iovec_end()) == num_iovecs); } if (num_iovecs == 0) { @@ -2147,67 +2148,79 @@ namespace ttg_parsec { bool inline_data = msg->tt_id.inline_data; int nv = 0; + parsec_ce_tag_t cbtag; /* start the RMA transfers */ - auto handle_iovecs_fn = - [&](auto&& iovecs) { - - if (inline_data) { - /* unpack the data from the message */ - for (auto &&iov : iovecs) { - ++nv; - std::memcpy(iov.data, msg->bytes + pos, iov.num_bytes); - pos += iov.num_bytes; - } - } else { - /* extract the callback tag */ - parsec_ce_tag_t cbtag; - std::memcpy(&cbtag, msg->bytes + pos, sizeof(cbtag)); - pos += sizeof(cbtag); - - /* create the value from the metadata */ - auto activation = new detail::rma_delayed_activate( - std::move(keylist), copy, num_iovecs, [this](std::vector &&keylist, detail::ttg_data_copy_t *copy) { - set_arg_from_msg_keylist(keylist, copy); - this->world.impl().decrement_inflight_msg(); - }); - - using ActivationT = std::decay_t; - - for (auto &&iov : iovecs) { - ++nv; - parsec_ce_mem_reg_handle_t rreg; - int32_t rreg_size_i; - std::memcpy(&rreg_size_i, msg->bytes + pos, sizeof(rreg_size_i)); - pos += sizeof(rreg_size_i); - rreg = static_cast(msg->bytes + pos); - pos += rreg_size_i; - // std::intptr_t *fn_ptr = reinterpret_cast(msg->bytes + pos); - // pos += sizeof(*fn_ptr); - std::intptr_t fn_ptr; - std::memcpy(&fn_ptr, msg->bytes + pos, sizeof(fn_ptr)); - pos += sizeof(fn_ptr); - - /* register the local memory */ - parsec_ce_mem_reg_handle_t lreg; - size_t lreg_size; - parsec_ce.mem_register(iov.data, PARSEC_MEM_TYPE_NONCONTIGUOUS, iov.num_bytes, parsec_datatype_int8_t, - iov.num_bytes, &lreg, &lreg_size); - world.impl().increment_inflight_msg(); - /* TODO: PaRSEC should treat the remote callback as a tag, not a function pointer! */ - //std::cout << "set_arg_from_msg: get rreg " << rreg << " remote " << remote << std::endl; - parsec_ce.get(&parsec_ce, lreg, 0, rreg, 0, iov.num_bytes, remote, - &detail::get_complete_cb, activation, - /*world.impl().parsec_ttg_rma_tag()*/ - cbtag, &fn_ptr, sizeof(std::intptr_t)); - } - } + auto create_activation_fn = [&]() { + /* extract the callback tag */ + std::memcpy(&cbtag, msg->bytes + pos, sizeof(cbtag)); + pos += sizeof(cbtag); + + /* create the value from the metadata */ + auto activation = new detail::rma_delayed_activate( + std::move(keylist), copy, num_iovecs, [this](std::vector &&keylist, detail::ttg_data_copy_t *copy) { + set_arg_from_msg_keylist(keylist, copy); + this->world.impl().decrement_inflight_msg(); + }); + return activation; + }; + auto read_inline_data = [&](auto&& iovec){ + /* unpack the data from the message */ + ++nv; + std::memcpy(iovec.data, msg->bytes + pos, iovec.num_bytes); + pos += iovec.num_bytes; + }; + auto handle_iovec_fn = [&](auto&& iovec, auto activation) { + using ActivationT = std::decay_t; + + ++nv; + parsec_ce_mem_reg_handle_t rreg; + int32_t rreg_size_i; + std::memcpy(&rreg_size_i, msg->bytes + pos, sizeof(rreg_size_i)); + pos += sizeof(rreg_size_i); + rreg = static_cast(msg->bytes + pos); + pos += rreg_size_i; + // std::intptr_t *fn_ptr = reinterpret_cast(msg->bytes + pos); + // pos += sizeof(*fn_ptr); + std::intptr_t fn_ptr; + std::memcpy(&fn_ptr, msg->bytes + pos, sizeof(fn_ptr)); + pos += sizeof(fn_ptr); + + /* register the local memory */ + parsec_ce_mem_reg_handle_t lreg; + size_t lreg_size; + parsec_ce.mem_register(iovec.data, PARSEC_MEM_TYPE_NONCONTIGUOUS, iovec.num_bytes, parsec_datatype_int8_t, + iovec.num_bytes, &lreg, &lreg_size); + world.impl().increment_inflight_msg(); + /* TODO: PaRSEC should treat the remote callback as a tag, not a function pointer! */ + //std::cout << "set_arg_from_msg: get rreg " << rreg << " remote " << remote << std::endl; + parsec_ce.get(&parsec_ce, lreg, 0, rreg, 0, iovec.num_bytes, remote, + &detail::get_complete_cb, activation, + /*world.impl().parsec_ttg_rma_tag()*/ + cbtag, &fn_ptr, sizeof(std::intptr_t)); }; if constexpr (ttg::has_split_metadata::value) { ttg::SplitMetadataDescriptor descr; - handle_iovecs_fn(descr.get_data(val)); + if (inline_data) { + for (auto&& iov : descr.get_data(val)) { + read_inline_data(iov); + } + } else { + auto activation = create_activation_fn(); + for (auto&& iov : descr.get_data(val)) { + handle_iovec_fn(iov, activation); + } + } } else if constexpr (!ttg::has_split_metadata::value) { - handle_iovecs_fn(copy->iovec_span()); - copy->iovec_reset(); + if (inline_data) { + detail::foreach_parsec_data(val, [&](parsec_data_t* data){ + read_inline_data(ttg::iovec{data->nb_elts, data->device_copies[data->owner_device]->device_private}); + }); + } else { + auto activation = create_activation_fn(); + detail::foreach_parsec_data(val, [&](parsec_data_t* data){ + handle_iovec_fn(ttg::iovec{data->nb_elts, data->device_copies[data->owner_device]->device_private}, activation); + }); + } } assert(num_iovecs == nv); @@ -2751,8 +2764,7 @@ namespace ttg_parsec { } else { /* TODO: how can we query the iovecs of the buffers here without actually packing the data? */ metadata_size = ttg::default_data_descriptor>::payload_size(value_ptr); - iov_size = std::accumulate(copy->iovec_begin(), copy->iovec_end(), 0, - [](std::size_t s, auto& iov){ return s + iov.num_bytes; }); + detail::foreach_parsec_data(*value_ptr, [&](parsec_data_t* data){ iov_size += data->nb_elts; }); } /* key is packed at the end */ std::size_t key_pack_size = ttg::default_data_descriptor::payload_size(&key); @@ -2820,55 +2832,54 @@ namespace ttg_parsec { bool inline_data = can_inline_data(value_ptr, copy, key, 1); msg->tt_id.inline_data = inline_data; - auto handle_iovec_fn = [&](auto&& iovecs){ - - if (inline_data) { - /* inline data is packed right after the tt_id in the message */ - for (auto &&iov : iovecs) { - std::memcpy(msg->bytes + pos, iov.data, iov.num_bytes); - pos += iov.num_bytes; - } - } else { - + auto write_header_fn = [&]() { + if (!inline_data) { /* TODO: at the moment, the tag argument to parsec_ce.get() is treated as a * raw function pointer instead of a preregistered AM tag, so play that game. * Once this is fixed in PaRSEC we need to use parsec_ttg_rma_tag instead! */ parsec_ce_tag_t cbtag = reinterpret_cast(&detail::get_remote_complete_cb); std::memcpy(msg->bytes + pos, &cbtag, sizeof(cbtag)); pos += sizeof(cbtag); + } + }; + auto handle_iovec_fn = [&](auto&& iovec){ + + if (inline_data) { + /* inline data is packed right after the tt_id in the message */ + std::memcpy(msg->bytes + pos, iovec.data, iovec.num_bytes); + pos += iovec.num_bytes; + } else { /** * register the generic iovecs and pack the registration handles * memory layout: [, ...] */ - for (auto &&iov : iovecs) { - copy = detail::register_data_copy(copy, nullptr, true); - parsec_ce_mem_reg_handle_t lreg; - size_t lreg_size; - /* TODO: only register once when we can broadcast the data! */ - parsec_ce.mem_register(iov.data, PARSEC_MEM_TYPE_NONCONTIGUOUS, iov.num_bytes, parsec_datatype_int8_t, - iov.num_bytes, &lreg, &lreg_size); - auto lreg_ptr = std::shared_ptr{lreg, [](void *ptr) { - parsec_ce_mem_reg_handle_t memreg = (parsec_ce_mem_reg_handle_t)ptr; - parsec_ce.mem_unregister(&memreg); - }}; - int32_t lreg_size_i = lreg_size; - std::memcpy(msg->bytes + pos, &lreg_size_i, sizeof(lreg_size_i)); - pos += sizeof(lreg_size_i); - std::memcpy(msg->bytes + pos, lreg, lreg_size); - pos += lreg_size; - //std::cout << "set_arg_impl lreg " << lreg << std::endl; - /* TODO: can we avoid the extra indirection of going through std::function? */ - std::function *fn = new std::function([=]() mutable { - /* shared_ptr of value and registration captured by value so resetting - * them here will eventually release the memory/registration */ - detail::release_data_copy(copy); - lreg_ptr.reset(); - }); - std::intptr_t fn_ptr{reinterpret_cast(fn)}; - std::memcpy(msg->bytes + pos, &fn_ptr, sizeof(fn_ptr)); - pos += sizeof(fn_ptr); - } + copy = detail::register_data_copy(copy, nullptr, true); + parsec_ce_mem_reg_handle_t lreg; + size_t lreg_size; + /* TODO: only register once when we can broadcast the data! */ + parsec_ce.mem_register(iovec.data, PARSEC_MEM_TYPE_NONCONTIGUOUS, iovec.num_bytes, parsec_datatype_int8_t, + iovec.num_bytes, &lreg, &lreg_size); + auto lreg_ptr = std::shared_ptr{lreg, [](void *ptr) { + parsec_ce_mem_reg_handle_t memreg = (parsec_ce_mem_reg_handle_t)ptr; + parsec_ce.mem_unregister(&memreg); + }}; + int32_t lreg_size_i = lreg_size; + std::memcpy(msg->bytes + pos, &lreg_size_i, sizeof(lreg_size_i)); + pos += sizeof(lreg_size_i); + std::memcpy(msg->bytes + pos, lreg, lreg_size); + pos += lreg_size; + //std::cout << "set_arg_impl lreg " << lreg << std::endl; + /* TODO: can we avoid the extra indirection of going through std::function? */ + std::function *fn = new std::function([=]() mutable { + /* shared_ptr of value and registration captured by value so resetting + * them here will eventually release the memory/registration */ + detail::release_data_copy(copy); + lreg_ptr.reset(); + }); + std::intptr_t fn_ptr{reinterpret_cast(fn)}; + std::memcpy(msg->bytes + pos, &fn_ptr, sizeof(fn_ptr)); + pos += sizeof(fn_ptr); } }; @@ -2880,16 +2891,20 @@ namespace ttg_parsec { auto metadata = descr.get_metadata(*const_cast(value_ptr)); pos = pack(metadata, msg->bytes, pos); //std::cout << "set_arg_impl splitmd num_iovecs " << num_iovecs << std::endl; - handle_iovec_fn(iovs); + write_header_fn(); + for (auto&& iov : iovs) { + handle_iovec_fn(iov); + } } else if constexpr (!ttg::has_split_metadata>::value) { /* serialize the object */ - //std::cout << "PRE pack num_iovecs " << std::distance(copy->iovec_begin(), copy->iovec_end()) << std::endl; pos = pack(*value_ptr, msg->bytes, pos, copy); - num_iovecs = std::distance(copy->iovec_begin(), copy->iovec_end()); + detail::foreach_parsec_data(value, [&](parsec_data_t *data){ ++num_iovecs; }); //std::cout << "POST pack num_iovecs " << num_iovecs << std::endl; /* handle any iovecs contained in it */ - handle_iovec_fn(copy->iovec_span()); - copy->iovec_reset(); + write_header_fn(); + detail::foreach_parsec_data(value, [&](parsec_data_t *data){ + handle_iovec_fn(ttg::iovec{data->nb_elts, data->device_copies[data->owner_device]->device_private}); + }); } msg->tt_id.num_iovecs = num_iovecs; @@ -3012,39 +3027,36 @@ namespace ttg_parsec { msg->tt_id.inline_data = inline_data; std::vector>> memregs; - auto handle_iovs_fn = [&](auto&& iovs){ - - if (inline_data) { - /* inline data is packed right after the tt_id in the message */ - for (auto &&iov : iovs) { - std::memcpy(msg->bytes + pos, iov.data, iov.num_bytes); - pos += iov.num_bytes; - } - } else { - + auto write_iov_header = [&](){ + if (!inline_data) { /* TODO: at the moment, the tag argument to parsec_ce.get() is treated as a * raw function pointer instead of a preregistered AM tag, so play that game. * Once this is fixed in PaRSEC we need to use parsec_ttg_rma_tag instead! */ parsec_ce_tag_t cbtag = reinterpret_cast(&detail::get_remote_complete_cb); std::memcpy(msg->bytes + pos, &cbtag, sizeof(cbtag)); pos += sizeof(cbtag); - - for (auto &&iov : iovs) { - parsec_ce_mem_reg_handle_t lreg; - size_t lreg_size; - parsec_ce.mem_register(iov.data, PARSEC_MEM_TYPE_NONCONTIGUOUS, iov.num_bytes, parsec_datatype_int8_t, - iov.num_bytes, &lreg, &lreg_size); - /* TODO: use a static function for deregistration here? */ - memregs.push_back(std::make_pair(static_cast(lreg_size), - /* TODO: this assumes that parsec_ce_mem_reg_handle_t is void* */ - std::shared_ptr{lreg, [](void *ptr) { - parsec_ce_mem_reg_handle_t memreg = - (parsec_ce_mem_reg_handle_t)ptr; - //std::cout << "broadcast_arg memunreg lreg " << memreg << std::endl; - parsec_ce.mem_unregister(&memreg); - }})); - //std::cout << "broadcast_arg memreg lreg " << lreg << std::endl; - } + } + }; + auto handle_iov_fn = [&](auto&& iovec){ + if (inline_data) { + /* inline data is packed right after the tt_id in the message */ + std::memcpy(msg->bytes + pos, iovec.data, iovec.num_bytes); + pos += iovec.num_bytes; + } else { + parsec_ce_mem_reg_handle_t lreg; + size_t lreg_size; + parsec_ce.mem_register(iovec.data, PARSEC_MEM_TYPE_NONCONTIGUOUS, iovec.num_bytes, parsec_datatype_int8_t, + iovec.num_bytes, &lreg, &lreg_size); + /* TODO: use a static function for deregistration here? */ + memregs.push_back(std::make_pair(static_cast(lreg_size), + /* TODO: this assumes that parsec_ce_mem_reg_handle_t is void* */ + std::shared_ptr{lreg, [](void *ptr) { + parsec_ce_mem_reg_handle_t memreg = + (parsec_ce_mem_reg_handle_t)ptr; + //std::cout << "broadcast_arg memunreg lreg " << memreg << std::endl; + parsec_ce.mem_unregister(&memreg); + }})); + //std::cout << "broadcast_arg memreg lreg " << lreg << std::endl; } }; @@ -3056,14 +3068,21 @@ namespace ttg_parsec { auto iovs = descr.get_data(*const_cast(&value)); num_iovs = std::distance(std::begin(iovs), std::end(iovs)); memregs.reserve(num_iovs); - handle_iovs_fn(iovs); + write_iov_header(); + for (auto &&iov : iovs) { + handle_iov_fn(iov); + } //std::cout << "broadcast_arg splitmd num_iovecs " << num_iovs << std::endl; } else if constexpr (!ttg::has_split_metadata>::value) { /* serialize the object once */ pos = pack(value, msg->bytes, pos, copy); - num_iovs = std::distance(copy->iovec_begin(), copy->iovec_end()); - handle_iovs_fn(copy->iovec_span()); - copy->iovec_reset(); + detail::foreach_parsec_data(value, [&](parsec_data_t *data){ ++num_iovs; }); + memregs.reserve(num_iovs); + write_iov_header(); + detail::foreach_parsec_data(value, [&](parsec_data_t *data){ + handle_iov_fn(ttg::iovec{data->nb_elts, + data->device_copies[data->owner_device]->device_private}); + }); } msg->tt_id.num_iovecs = num_iovs; @@ -3424,7 +3443,8 @@ namespace ttg_parsec { } } - void copy_mark_pushout(detail::ttg_data_copy_t *copy) { + template + void copy_mark_pushout(const Value& value) { assert(detail::parsec_ttg_caller->dev_ptr && detail::parsec_ttg_caller->dev_ptr->gpu_task); parsec_gpu_task_t *gpu_task = detail::parsec_ttg_caller->dev_ptr->gpu_task; @@ -3453,7 +3473,9 @@ namespace ttg_parsec { gpu_task->pushout |= 1<foreach_parsec_data(check_parsec_data); + ttg::detail::buffer_apply(value, [&](const ttg::Buffer& buffer){ + check_parsec_data(detail::get_parsec_data(buffer)); + }); } @@ -3485,7 +3507,7 @@ namespace ttg_parsec { auto &reducer = std::get(input_reducers); if (reducer) { /* reductions are currently done only on the host so push out */ - copy_mark_pushout(copy); + copy_mark_pushout(value); caller->data_flags |= detail::ttg_parsec_data_flags::MARKED_PUSHOUT; return; } @@ -3548,7 +3570,7 @@ namespace ttg_parsec { } if (need_pushout) { - copy_mark_pushout(copy); + copy_mark_pushout(value); caller->data_flags |= detail::ttg_parsec_data_flags::MARKED_PUSHOUT; } } @@ -3794,22 +3816,6 @@ namespace ttg_parsec { parsec_key_fn_t tasks_hash_fcts = {key_equal, key_print, key_hash}; - template - inline static void increment_data_version_impl(task_t *task) { - if constexpr (!std::is_const_v>) { - if (task->copies[I] != nullptr){ - task->copies[I]->inc_current_version(); - } - } - } - - template - inline static void increment_data_versions(task_t *task, std::index_sequence) { - /* increment version of each mutable data */ - int junk[] = {0, (increment_data_version_impl(task), 0)...}; - junk[0]++; - } - static parsec_hook_return_t complete_task_and_release(parsec_execution_stream_t *es, parsec_task_t *parsec_task) { //std::cout << "complete_task_and_release: task " << parsec_task << std::endl; @@ -3822,10 +3828,6 @@ namespace ttg_parsec { assert(task->coroutine_id != ttg::TaskCoroutineID::Invalid); #ifdef TTG_HAVE_DEVICE if (task->coroutine_id == ttg::TaskCoroutineID::DeviceTask) { - /* increment versions of all data we might have modified - * this must happen before we issue the sends */ - //increment_data_versions(task, std::make_index_sequence>{}); - // get the device task from the coroutine handle auto dev_task = ttg::device::detail::device_task_handle_type::from_address(task->suspended_task_address); diff --git a/ttg/ttg/parsec/ttg_data_copy.h b/ttg/ttg/parsec/ttg_data_copy.h index fbd58c64ec..b79f794a2f 100644 --- a/ttg/ttg/parsec/ttg_data_copy.h +++ b/ttg/ttg/parsec/ttg_data_copy.h @@ -26,91 +26,6 @@ namespace ttg_parsec { // fwd-decl struct ttg_data_copy_t; - /* Wrapper managing the relationship between a ttg data copy and the parsec_data_t object */ - struct ttg_parsec_data_wrapper_t { - - protected: - using parsec_data_ptr = std::unique_ptr; - - ttg_data_copy_t *m_ttg_copy = nullptr; - parsec_data_ptr m_data; - - friend ttg_data_copy_t; - - static parsec_data_t* create_parsec_data(void *ptr, size_t size, bool sync_to_device) { - parsec_data_t *data = parsec_data_create_with_type(nullptr, 0, ptr, size, - parsec_datatype_int8_t); - data->device_copies[0]->flags |= PARSEC_DATA_FLAG_PARSEC_MANAGED; - data->device_copies[0]->coherency_state = PARSEC_DATA_COHERENCY_SHARED; - // if we don't want to synchronize data to the device we set the version to 0 - data->device_copies[0]->version = (sync_to_device) ? 1 : 0; - return data; - } - - parsec_data_t* parsec_data() { - return m_data.get(); - } - - const parsec_data_t* parsec_data() const { - return m_data.get(); - } - - static void delete_parsec_data(parsec_data_t *data) { -#if defined(PARSEC_HAVE_DEV_CUDA_SUPPORT) - if (data->device_copies[0]->flags & TTG_PARSEC_DATA_FLAG_REGISTERED) { - // register the memory for faster access - cudaError_t status; - status = cudaHostUnregister(data->device_copies[0]->device_private); - assert(cudaSuccess == status); - data->device_copies[0]->flags ^= TTG_PARSEC_DATA_FLAG_REGISTERED; - } -#endif // PARSEC_HAVE_DEV_CUDA_SUPPORT - assert(data->device_copies[0] != nullptr); - auto copy = data->device_copies[0]; - parsec_data_copy_detach(data, data->device_copies[0], 0); - PARSEC_OBJ_RELEASE(copy); - PARSEC_OBJ_RELEASE(data); - } - - static void delete_null_parsec_data(parsec_data_t *) { - // nothing to be done, only used for nullptr - } - - protected: - - /* remove the data from the owning data copy */ - void remove_from_owner(); - - /* add the data to the owning data copy */ - void reset_parsec_data(void *ptr, size_t size, bool sync_to_device); - - ttg_parsec_data_wrapper_t(); - - ttg_parsec_data_wrapper_t(const ttg_parsec_data_wrapper_t& other) = delete; - - ttg_parsec_data_wrapper_t(ttg_parsec_data_wrapper_t&& other); - - ttg_parsec_data_wrapper_t& operator=(const ttg_parsec_data_wrapper_t& other) = delete; - - ttg_parsec_data_wrapper_t& operator=(ttg_parsec_data_wrapper_t&& other); - - virtual ~ttg_parsec_data_wrapper_t(); - - /* set a new owning data copy object */ - void set_owner(ttg_data_copy_t& new_copy) { - m_ttg_copy = &new_copy; - } - - /* add a new copy to the data on the give device backed by ptr */ - void add_copy(int parsec_dev, void *ptr) { - parsec_data_copy_t* copy = parsec_data_copy_new(m_data.get(), parsec_dev, - parsec_datatype_int8_t, - PARSEC_DATA_FLAG_PARSEC_MANAGED); - copy->device_private = ptr; - } - }; - - /* templated to break cyclic dependency with ttg_data_copy_container */ template struct ttg_data_copy_container_setter { @@ -162,17 +77,8 @@ namespace ttg_parsec { , m_next_task(c.m_next_task) , m_readers(c.m_readers) , m_refs(c.m_refs.load(std::memory_order_relaxed)) - , m_dev_data(std::move(c.m_dev_data)) - , m_single_dev_data(c.m_single_dev_data) - , m_num_dev_data(c.m_num_dev_data) { - c.m_num_dev_data = 0; c.m_readers = 0; - c.m_single_dev_data = nullptr; - - foreach_wrapper([&](ttg_parsec_data_wrapper_t* data){ - data->set_owner(*this); - }); } ttg_data_copy_t& operator=(ttg_data_copy_t&& c) @@ -183,16 +89,6 @@ namespace ttg_parsec { c.m_readers = 0; m_refs.store(c.m_refs.load(std::memory_order_relaxed), std::memory_order_relaxed); c.m_refs.store(0, std::memory_order_relaxed); - m_dev_data = std::move(c.m_dev_data); - m_single_dev_data = c.m_single_dev_data; - c.m_single_dev_data = nullptr; - m_num_dev_data = c.m_num_dev_data; - c.m_num_dev_data = 0; - - /* move all data to the new owner */ - foreach_wrapper([&](ttg_parsec_data_wrapper_t* data){ - data->set_owner(*this); - }); return *this; } @@ -287,147 +183,6 @@ namespace ttg_parsec { return m_refs.load(std::memory_order_relaxed); } - /* increment the version of the current copy */ - void inc_current_version() { - //std::cout << "data-copy " << this << " inc_current_version " << " count " << m_num_dev_data << std::endl; - foreach_parsec_data([](parsec_data_t* data){ - assert(data->device_copies[0] != nullptr); - data->device_copies[0]->version++; - }); - } - - void transfer_ownership(int access, int device = 0) { - foreach_parsec_data([&](parsec_data_t* data){ - parsec_data_transfer_ownership_to_copy(data, device, access); - }); - } - - /* manage device copies owned by this object - * we only touch the vector if we have more than one copies to track - * and otherwise use the single-element member. - */ - using iterator = ttg_parsec_data_wrapper_t**; - - void add_device_data(ttg_parsec_data_wrapper_t* data) { - switch (m_num_dev_data) { - case 0: - m_single_dev_data = data; - break; - case 1: - /* move single copy into vector and add new copy below */ - m_dev_data.push_back(m_single_dev_data); - m_single_dev_data = nullptr; - /* fall-through */ - default: - /* store in multi-copy vector */ - m_dev_data.push_back(data); - break; - } - //std::cout << "data-copy " << this << " add data " << data << " count " << m_num_dev_data << std::endl; - m_num_dev_data++; - } - - void remove_device_data(ttg_parsec_data_wrapper_t* data) { - //std::cout << "data-copy " << this << " remove data " << data << " count " << m_num_dev_data << std::endl; - if (m_num_dev_data == 0) { - /* this may happen if we're integrated into the object and have been moved */ - return; - } - if (m_num_dev_data == 1) { - assert(m_single_dev_data == data); - m_single_dev_data = nullptr; - } else if (m_num_dev_data > 1) { - auto it = std::find(m_dev_data.begin(), m_dev_data.end(), data); - if (it != m_dev_data.end()) { - m_dev_data.erase(it); - } - } - --m_num_dev_data; - /* make single-entry if needed */ - if (m_num_dev_data == 1) { - m_single_dev_data = m_dev_data[0]; - m_dev_data.clear(); - } - } - - int num_dev_data() const { - return m_num_dev_data; - } - - template - void foreach_wrapper(Fn&& fn) { - if (m_num_dev_data == 1) { - fn(m_single_dev_data); - } else if (m_num_dev_data > 1) { - std::for_each(m_dev_data.begin(), m_dev_data.end(), fn); - } - } - - template - void foreach_parsec_data(Fn&& fn) { - if (m_num_dev_data == 1) { - if (m_single_dev_data->parsec_data()) { - fn(m_single_dev_data->parsec_data()); - } - } else if (m_num_dev_data > 1) { - std::for_each(m_dev_data.begin(), m_dev_data.end(), - [&](ttg_parsec_data_wrapper_t* data){ - if (data->parsec_data()) { - fn(data->parsec_data()); - } - } - ); - } - } - - -#if 0 - iterator begin() { - switch(m_num_dev_data) { - // no device copies - case 0: return end(); - case 1: return &m_single_dev_data; - default: return m_dev_data.data(); - } - } - - iterator end() { - switch(m_num_dev_data) { - case 0: - case 1: - return &(m_single_dev_data) + 1; - default: - return m_dev_data.data() + m_dev_data.size(); - } - } -#endif // 0 - - using iovec_iterator = typename std::vector::iterator; - - iovec_iterator iovec_begin() { - return m_iovecs.begin(); - } - - iovec_iterator iovec_end() { - return m_iovecs.end(); - } - - void iovec_reset() { - m_iovecs.clear(); - } - - void iovec_add(const ttg::iovec& iov) { - m_iovecs.push_back(iov); - } - - ttg::span iovec_span() { - return ttg::span(m_iovecs.data(), m_iovecs.size()); - } - - std::size_t iovec_count() const { - return m_iovecs.size(); - } - #if defined(PARSEC_PROF_TRACE) && defined(PARSEC_TTG_PROFILE_BACKEND) int64_t size; int64_t uid; @@ -436,13 +191,6 @@ namespace ttg_parsec { parsec_task_t *m_next_task = nullptr; int32_t m_readers = 1; std::atomic m_refs = 1; //< number of entities referencing this copy (TTGs, external) - - std::vector m_iovecs; - - std::vector m_dev_data; //< used if there are multiple device copies - // that belong to this object - ttg_parsec_data_wrapper_t *m_single_dev_data; //< used if there is a single device copy - int m_num_dev_data = 0; //< number of device copies }; @@ -521,77 +269,6 @@ namespace ttg_parsec { return &m_value; } }; - - /** - * definition of ttg_parsec_data_wrapper_t members that depend on ttg_data_copy_t - */ - - inline - void ttg_parsec_data_wrapper_t::remove_from_owner() { - if (nullptr != m_ttg_copy) { - m_ttg_copy->remove_device_data(this); - m_ttg_copy = nullptr; - } - } - - inline - void ttg_parsec_data_wrapper_t::reset_parsec_data(void *ptr, size_t size, bool sync_to_device) { - if (ptr == m_data.get()) return; - - if (nullptr == ptr) { - m_data = parsec_data_ptr(nullptr, &delete_null_parsec_data); - } else { - m_data = parsec_data_ptr(create_parsec_data(ptr, size, sync_to_device), &delete_parsec_data); - } - } - - inline - ttg_parsec_data_wrapper_t::ttg_parsec_data_wrapper_t() - : m_data(nullptr, delete_null_parsec_data) - , m_ttg_copy(detail::ttg_data_copy_container()) - { - if (m_ttg_copy) { - m_ttg_copy->add_device_data(this); - } - } - - inline - ttg_parsec_data_wrapper_t::ttg_parsec_data_wrapper_t(ttg_parsec_data_wrapper_t&& other) - : m_data(std::move(other.m_data)) - , m_ttg_copy(detail::ttg_data_copy_container()) - { - // try to remove the old buffer from the *old* ttg_copy - other.remove_from_owner(); - - // register with the new ttg_copy - if (nullptr != m_ttg_copy) { - m_ttg_copy->add_device_data(this); - } - } - - inline - ttg_parsec_data_wrapper_t& ttg_parsec_data_wrapper_t::operator=(ttg_parsec_data_wrapper_t&& other) { - m_data = std::move(other.m_data); - /* remove from old ttg copy */ - other.remove_from_owner(); - - if (nullptr != m_ttg_copy) { - /* register with the new ttg_copy */ - m_ttg_copy->add_device_data(this); - } - return *this; - } - - - inline - ttg_parsec_data_wrapper_t::~ttg_parsec_data_wrapper_t() { - if (nullptr != m_ttg_copy) { - m_ttg_copy->remove_device_data(this); - m_ttg_copy = nullptr; - } - } - - } // namespace detail } // namespace ttg_parsec From 7db597c1561b0292d7ae5b6c6390cb1b0144025c Mon Sep 17 00:00:00 2001 From: Joseph Schuchart Date: Mon, 11 Nov 2024 14:56:09 -0500 Subject: [PATCH 02/39] Use parsec_data_discard to safely destroy parsec_data_t Signed-off-by: Joseph Schuchart --- ttg/ttg/parsec/buffer.h | 18 +++--------------- ttg/ttg/parsec/devicescratch.h | 20 +------------------- 2 files changed, 4 insertions(+), 34 deletions(-) diff --git a/ttg/ttg/parsec/buffer.h b/ttg/ttg/parsec/buffer.h index 382a7a9df8..5f6762ab69 100644 --- a/ttg/ttg/parsec/buffer.h +++ b/ttg/ttg/parsec/buffer.h @@ -250,21 +250,9 @@ struct Buffer { void release_data() { if (nullptr == m_data) return; - - /* first: drop our reference to the parsec_data_t */ - PARSEC_OBJ_RELEASE(m_data); - /** - * second: drop a second reference to the parsec_data_t, for the host-side data_t. - * - * There are two cases: - * 1) There was only a host data. By dropping this reference - * the data_t is destroyed, destroying the host-side data copy as well. - * 2) There was a device-side data copy with a reference to the data_t. - * This reference will be released once the device-side data copy is released, - * which will finally destroy the host-side data-copy. - */ - PARSEC_OBJ_RELEASE(m_data); - + /* discard the parsec data so it can be collected by the runtime + * and the buffer be free'd in the parsec_data_copy_t destructor */ + parsec_data_discard(m_data); /* set data to null so we don't repeat the above */ m_data = nullptr; } diff --git a/ttg/ttg/parsec/devicescratch.h b/ttg/ttg/parsec/devicescratch.h index e2c3743aa3..085d92caea 100644 --- a/ttg/ttg/parsec/devicescratch.h +++ b/ttg/ttg/parsec/devicescratch.h @@ -50,19 +50,6 @@ struct devicescratch { return data; } - void remove_from_flow() { - /* remove the scratch from the gpu-task flow */ - assert(nullptr != detail::parsec_ttg_caller); - parsec_task_t *parsec_task = &detail::parsec_ttg_caller->parsec_task; - parsec_flow_t *flows = detail::parsec_ttg_caller->dev_ptr->flows; - for (int i = 0; i < MAX_PARAM_COUNT; ++i) { - if (nullptr != parsec_task->data[i].data_in && parsec_task->data[i].data_in->original == m_data) { - flows[i].flow_flags = PARSEC_FLOW_ACCESS_NONE; // disable this flow - break; - } - } - } - friend parsec_data_t* detail::get_parsec_data(const ttg_parsec::devicescratch&); public: @@ -94,14 +81,9 @@ struct devicescratch { ~devicescratch() { /* remove data from flow */ - //remove_from_flow(); if (nullptr != m_data) { - //parsec_data_destroy(m_data); - //parsec_data_copy_detach(m_data, parsec_data_get_copy(m_data, 0), 0); - //auto *copy = parsec_data_get_copy(m_data, 0); - //PARSEC_OBJ_RELEASE(copy); + parsec_data_discard(m_data); } - //parsec_data_destroy(m_data); m_data = nullptr; } From ae656c426bcc980950582d0c936309bb5974e87a Mon Sep 17 00:00:00 2001 From: Joseph Schuchart Date: Mon, 11 Nov 2024 16:47:23 -0500 Subject: [PATCH 03/39] Restrict calls to buffer_apply() to serializable types Signed-off-by: Joseph Schuchart --- tests/unit/device_coro.cc | 28 +++++++++++++++++ ttg/ttg/buffer.h | 27 +++++++++++++++-- ttg/ttg/parsec/buffer.h | 3 ++ ttg/ttg/parsec/ttg.h | 63 +++++++++++++++++++++------------------ 4 files changed, 90 insertions(+), 31 deletions(-) diff --git a/tests/unit/device_coro.cc b/tests/unit/device_coro.cc index 83b9fc3c93..b0d91fabd6 100644 --- a/tests/unit/device_coro.cc +++ b/tests/unit/device_coro.cc @@ -38,6 +38,34 @@ struct nested_value_t { } }; +struct derived_value_t { + nested_value_t v; +}; + +#ifdef TTG_SERIALIZATION_SUPPORTS_MADNESS +namespace madness { + namespace archive { + + template + struct ArchiveLoadImpl { + static inline void load(const Archive& ar, derived_value_t& v) { + ar& v.v; + } + }; + + template + struct ArchiveStoreImpl { + static inline void store(const Archive& ar, const derived_value_t& v) { + ar& v.v; + } + }; + } // namespace archive +} // namespace madness +#endif // TTG_SERIALIZATION_SUPPORTS_MADNESS + +static_assert(madness::is_serializable_v, derived_value_t>); +static_assert(ttg::detail::has_buffer_apply_v); + TEST_CASE("Device", "coro") { SECTION("buffer-inspection") { value_t v1; diff --git a/ttg/ttg/buffer.h b/ttg/ttg/buffer.h index e3b9a1d967..95e35cdd97 100644 --- a/ttg/ttg/buffer.h +++ b/ttg/ttg/buffer.h @@ -32,8 +32,24 @@ namespace meta { } // namespace meta +namespace detail { + /** + * Type trait to check whether we can use serialization + * to inspect the buffers owned by an object passing + * through a task graph. + */ + template + struct has_buffer_apply : std::false_type + { }; + + template + constexpr const bool has_buffer_apply_v = has_buffer_apply::value; +} // namespace detail + } // namespace ttg + + #ifdef TTG_SERIALIZATION_SUPPORTS_MADNESS #include @@ -109,14 +125,21 @@ namespace madness { namespace ttg::detail { template - requires(madness::is_serializable_v, T>) + requires(madness::is_serializable_v, std::decay>) void buffer_apply(T&& t, Fn&& fn) { madness::archive::BufferInspectorArchive ar(std::forward(fn)); ar & t; } + + using buffer_apply_dummy_fn = decltype([](const ttg::Buffer&){}); + template + struct has_buffer_apply, std::decay_t>>> + : std::true_type + { }; + } // namespace ttg::detail #endif // TTG_SERIALIZATION_SUPPORTS_MADNESS -#endif // TTG_buffer_H \ No newline at end of file +#endif // TTG_buffer_H diff --git a/ttg/ttg/parsec/buffer.h b/ttg/ttg/parsec/buffer.h index 5f6762ab69..c6e7ab3623 100644 --- a/ttg/ttg/parsec/buffer.h +++ b/ttg/ttg/parsec/buffer.h @@ -581,6 +581,9 @@ struct Buffer { #endif // TTG_SERIALIZATION_SUPPORTS_MADNESS }; + +static_assert(madness::is_serializable_v, const Buffer>&>); + namespace detail { template parsec_data_t* get_parsec_data(const ttg_parsec::Buffer& db) { diff --git a/ttg/ttg/parsec/ttg.h b/ttg/ttg/parsec/ttg.h index 903c3feea2..1c44fd1b97 100644 --- a/ttg/ttg/parsec/ttg.h +++ b/ttg/ttg/parsec/ttg.h @@ -746,7 +746,7 @@ namespace ttg_parsec { template inline void transfer_ownership_impl(T&& arg, int device) { - if constexpr(!std::is_const_v>) { + if constexpr(!std::is_const_v> && ttg::detail::has_buffer_apply_v) { ttg::detail::buffer_apply(arg, [&](auto&& buffer){ auto *data = detail::get_parsec_data(buffer); parsec_data_transfer_ownership_to_copy(data, device, PARSEC_FLOW_ACCESS_RW); @@ -3446,36 +3446,41 @@ namespace ttg_parsec { template void copy_mark_pushout(const Value& value) { - assert(detail::parsec_ttg_caller->dev_ptr && detail::parsec_ttg_caller->dev_ptr->gpu_task); - parsec_gpu_task_t *gpu_task = detail::parsec_ttg_caller->dev_ptr->gpu_task; - auto check_parsec_data = [&](parsec_data_t* data) { - if (data->owner_device != 0) { - /* find the flow */ - int flowidx = 0; - while (flowidx < MAX_PARAM_COUNT && - gpu_task->flow[flowidx]->flow_flags != PARSEC_FLOW_ACCESS_NONE) { - if (detail::parsec_ttg_caller->parsec_task.data[flowidx].data_in->original == data) { - /* found the right data, set the corresponding flow as pushout */ - break; + if constexpr (ttg::detail::has_buffer_apply_v) { + assert(detail::parsec_ttg_caller->dev_ptr && detail::parsec_ttg_caller->dev_ptr->gpu_task); + parsec_gpu_task_t *gpu_task = detail::parsec_ttg_caller->dev_ptr->gpu_task; + auto check_parsec_data = [&](parsec_data_t* data) { + if (data->owner_device != 0) { + /* find the flow */ + int flowidx = 0; + while (flowidx < MAX_PARAM_COUNT && + gpu_task->flow[flowidx]->flow_flags != PARSEC_FLOW_ACCESS_NONE) { + if (detail::parsec_ttg_caller->parsec_task.data[flowidx].data_in->original == data) { + /* found the right data, set the corresponding flow as pushout */ + break; + } + ++flowidx; } - ++flowidx; - } - if (flowidx == MAX_PARAM_COUNT) { - throw std::runtime_error("Cannot add more than MAX_PARAM_COUNT flows to a task!"); - } - if (gpu_task->flow[flowidx]->flow_flags == PARSEC_FLOW_ACCESS_NONE) { - /* no flow found, add one and mark it pushout */ - detail::parsec_ttg_caller->parsec_task.data[flowidx].data_in = data->device_copies[0]; - gpu_task->flow_nb_elts[flowidx] = data->nb_elts; + if (flowidx == MAX_PARAM_COUNT) { + throw std::runtime_error("Cannot add more than MAX_PARAM_COUNT flows to a task!"); + } + if (gpu_task->flow[flowidx]->flow_flags == PARSEC_FLOW_ACCESS_NONE) { + /* no flow found, add one and mark it pushout */ + detail::parsec_ttg_caller->parsec_task.data[flowidx].data_in = data->device_copies[0]; + gpu_task->flow_nb_elts[flowidx] = data->nb_elts; + } + /* need to mark the flow WRITE to make PaRSEC happy */ + ((parsec_flow_t *)gpu_task->flow[flowidx])->flow_flags |= PARSEC_FLOW_ACCESS_WRITE; + gpu_task->pushout |= 1<flow[flowidx])->flow_flags |= PARSEC_FLOW_ACCESS_WRITE; - gpu_task->pushout |= 1<(const ttg::Buffer& buffer){ - check_parsec_data(detail::get_parsec_data(buffer)); - }); + }; + ttg::detail::buffer_apply(value, + [&](const ttg::Buffer& buffer){ + check_parsec_data(detail::get_parsec_data(buffer)); + }); + } else { + throw std::runtime_error("Value type must be serializable with ttg::BufferInspectorArchive"); + } } From 13b6aa6dfd183e8db378f1b96dcc859c03a520cd Mon Sep 17 00:00:00 2001 From: Joseph Schuchart Date: Mon, 11 Nov 2024 17:05:46 -0500 Subject: [PATCH 04/39] static_assert that device task output value types are serializable Signed-off-by: Joseph Schuchart --- ttg/ttg/buffer.h | 10 +++++++--- ttg/ttg/edge.h | 8 ++++++++ ttg/ttg/parsec/ttg.h | 4 ++++ 3 files changed, 19 insertions(+), 3 deletions(-) diff --git a/ttg/ttg/buffer.h b/ttg/ttg/buffer.h index 95e35cdd97..cb04c70355 100644 --- a/ttg/ttg/buffer.h +++ b/ttg/ttg/buffer.h @@ -34,12 +34,16 @@ namespace meta { namespace detail { /** - * Type trait to check whether we can use serialization + * Type traits to check whether we can use serialization * to inspect the buffers owned by an object passing * through a task graph. */ template - struct has_buffer_apply : std::false_type + struct has_buffer_apply_helper : std::false_type + { }; + + template + struct has_buffer_apply : has_buffer_apply_helper { }; template @@ -133,7 +137,7 @@ namespace ttg::detail { using buffer_apply_dummy_fn = decltype([](const ttg::Buffer&){}); template - struct has_buffer_apply, std::decay_t>>> + struct has_buffer_apply_helper, std::decay_t>>> : std::true_type { }; diff --git a/ttg/ttg/edge.h b/ttg/ttg/edge.h index 9121a503f2..8eed2e3367 100644 --- a/ttg/ttg/edge.h +++ b/ttg/ttg/edge.h @@ -178,6 +178,14 @@ namespace ttg { typedef std::tuple type; }; + template + struct edges_to_output_value_types; + + template + struct edges_to_output_value_types> { + typedef std::tuple type; + }; + namespace detail { template struct edges_tuple; diff --git a/ttg/ttg/parsec/ttg.h b/ttg/ttg/parsec/ttg.h index 1c44fd1b97..852dcfebe7 100644 --- a/ttg/ttg/parsec/ttg.h +++ b/ttg/ttg/parsec/ttg.h @@ -1256,6 +1256,10 @@ namespace ttg_parsec { return (derived_has_cuda_op() || derived_has_hip_op() || derived_has_level_zero_op()); } + static_assert(!derived_has_device_op() || ttg::meta::probe_all_v>, + "Data sent from a device-capable template task must be serializable."); + using ttT = TT; using key_type = keyT; using input_terminals_type = ttg::detail::input_terminals_tuple_t; From 3cce7a8f4b4f099275eb34f29c6d9abef0a728ee Mon Sep 17 00:00:00 2001 From: Joseph Schuchart Date: Mon, 11 Nov 2024 20:05:03 -0500 Subject: [PATCH 05/39] Remove static assert Signed-off-by: Joseph Schuchart --- ttg/ttg/parsec/buffer.h | 3 --- 1 file changed, 3 deletions(-) diff --git a/ttg/ttg/parsec/buffer.h b/ttg/ttg/parsec/buffer.h index c6e7ab3623..5f6762ab69 100644 --- a/ttg/ttg/parsec/buffer.h +++ b/ttg/ttg/parsec/buffer.h @@ -581,9 +581,6 @@ struct Buffer { #endif // TTG_SERIALIZATION_SUPPORTS_MADNESS }; - -static_assert(madness::is_serializable_v, const Buffer>&>); - namespace detail { template parsec_data_t* get_parsec_data(const ttg_parsec::Buffer& db) { From 356ade06d2eefb68245a5e3ecd89dd95f80b2f92 Mon Sep 17 00:00:00 2001 From: Joseph Schuchart Date: Mon, 11 Nov 2024 20:30:11 -0500 Subject: [PATCH 06/39] Attempt at a boost serialization frontend for buffer_apply Signed-off-by: Joseph Schuchart --- ttg/ttg/buffer.h | 68 ++++++++++++++++++++++++++++++++++++++++++++++-- 1 file changed, 66 insertions(+), 2 deletions(-) diff --git a/ttg/ttg/buffer.h b/ttg/ttg/buffer.h index cb04c70355..54189454e1 100644 --- a/ttg/ttg/buffer.h +++ b/ttg/ttg/buffer.h @@ -5,7 +5,7 @@ #include "ttg/fwd.h" #include "ttg/util/meta.h" - +#include "ttg/serialization.h" #include namespace ttg { @@ -48,6 +48,9 @@ namespace detail { template constexpr const bool has_buffer_apply_v = has_buffer_apply::value; + + /* dummy function type used to check whether buffer_apply is available */ + using buffer_apply_dummy_fn = decltype([](const ttg::Buffer&){}); } // namespace detail } // namespace ttg @@ -135,7 +138,6 @@ namespace ttg::detail { ar & t; } - using buffer_apply_dummy_fn = decltype([](const ttg::Buffer&){}); template struct has_buffer_apply_helper, std::decay_t>>> : std::true_type @@ -143,6 +145,68 @@ namespace ttg::detail { } // namespace ttg::detail +#elif defined(TTG_SERIALZIATION_SUPPORTS_BOOST) + +#include +namespace ttg::detail { + + template + struct BufferInspectorArchive + : public boost::archive::detail::common_oarchive> { + private: + Fn m_fn; + + friend class boost::archive::save_access; + + /// Stores (counts) data into the memory buffer. + + /// The function only appears (due to \c enable_if) if \c T is + /// serializable. + /// \tparam T Type of the data to be stored (counted). + /// \param[in] t Pointer to the data to be stored (counted). + /// \param[in] n Size of data to be stored (counted). + template + void save(const ttg::Buffer* t) const { + /* invoke the function on buffer */ + m_fn(t[i]); + } + + template + void save(const T& t) const { + /* nothing to be done for other types */ + } + + public: + template + BufferInspectorArchive(_Fn&& fn) + : m_fn(fn) + { } + + // archives are expected to support this function + void save_binary(void *address, std::size_t count) + { + /* nothing to do */ + } + }; + + /* deduction guide */ + template + BufferInspectorArchive(Fn&&) -> BufferInspectorArchive; + + template + requires(is_boost_serializable_v, std::decay>) + void buffer_apply(T&& t, Fn&& fn) { + BufferInspectorArchive ar(std::forward(fn)); + ar & t; + } + + template + struct has_buffer_apply_helper, std::decay_t>>> + : std::true_type + { }; + +} // namespace ttg::detail + #endif // TTG_SERIALIZATION_SUPPORTS_MADNESS From 099baef96102767dd1959eaa211fe76f235a9136 Mon Sep 17 00:00:00 2001 From: Joseph Schuchart Date: Tue, 12 Nov 2024 13:53:16 -0500 Subject: [PATCH 07/39] Using ttg::Buffer requires MADNESS serialization We cannot use boost serialization to inspect objects for the buffers they contain because boost archives cannot be templated properly and need explicit instantiations for its archives, which is impossible for the BufferInspectorArchive. We just rely on madness from now on. Signed-off-by: Joseph Schuchart --- tests/unit/device_coro.cc | 6 ++-- ttg/ttg/buffer.h | 62 +++--------------------------------- ttg/ttg/parsec/parsec_data.h | 9 ++++-- 3 files changed, 15 insertions(+), 62 deletions(-) diff --git a/tests/unit/device_coro.cc b/tests/unit/device_coro.cc index b0d91fabd6..f2340f259f 100644 --- a/tests/unit/device_coro.cc +++ b/tests/unit/device_coro.cc @@ -61,10 +61,10 @@ namespace madness { }; } // namespace archive } // namespace madness -#endif // TTG_SERIALIZATION_SUPPORTS_MADNESS static_assert(madness::is_serializable_v, derived_value_t>); -static_assert(ttg::detail::has_buffer_apply_v); +static_assert(ttg::detail::has_buffer_apply_v); + TEST_CASE("Device", "coro") { SECTION("buffer-inspection") { @@ -500,3 +500,5 @@ TEST_CASE("Device", "coro") { } #endif // TTG_IMPL_DEVICE_SUPPORT + +#endif // TTG_SERIALIZATION_SUPPORTS_MADNESS diff --git a/ttg/ttg/buffer.h b/ttg/ttg/buffer.h index 54189454e1..59f97bf70b 100644 --- a/ttg/ttg/buffer.h +++ b/ttg/ttg/buffer.h @@ -49,8 +49,6 @@ namespace detail { template constexpr const bool has_buffer_apply_v = has_buffer_apply::value; - /* dummy function type used to check whether buffer_apply is available */ - using buffer_apply_dummy_fn = decltype([](const ttg::Buffer&){}); } // namespace detail } // namespace ttg @@ -138,6 +136,9 @@ namespace ttg::detail { ar & t; } + /* dummy function type used to check whether buffer_apply is available */ + using buffer_apply_dummy_fn = decltype([](const ttg::Buffer&){}); + template struct has_buffer_apply_helper, std::decay_t>>> : std::true_type @@ -145,69 +146,16 @@ namespace ttg::detail { } // namespace ttg::detail -#elif defined(TTG_SERIALZIATION_SUPPORTS_BOOST) +#else -#include namespace ttg::detail { - - template - struct BufferInspectorArchive - : public boost::archive::detail::common_oarchive> { - private: - Fn m_fn; - - friend class boost::archive::save_access; - - /// Stores (counts) data into the memory buffer. - - /// The function only appears (due to \c enable_if) if \c T is - /// serializable. - /// \tparam T Type of the data to be stored (counted). - /// \param[in] t Pointer to the data to be stored (counted). - /// \param[in] n Size of data to be stored (counted). - template - void save(const ttg::Buffer* t) const { - /* invoke the function on buffer */ - m_fn(t[i]); - } - - template - void save(const T& t) const { - /* nothing to be done for other types */ - } - - public: - template - BufferInspectorArchive(_Fn&& fn) - : m_fn(fn) - { } - - // archives are expected to support this function - void save_binary(void *address, std::size_t count) - { - /* nothing to do */ - } - }; - - /* deduction guide */ - template - BufferInspectorArchive(Fn&&) -> BufferInspectorArchive; - template - requires(is_boost_serializable_v, std::decay>) void buffer_apply(T&& t, Fn&& fn) { - BufferInspectorArchive ar(std::forward(fn)); - ar & t; + static_assert(ttg::meta::is_void_v, "Types using ttg::Buffer must be MADNESS serializable."); } - template - struct has_buffer_apply_helper, std::decay_t>>> - : std::true_type - { }; - } // namespace ttg::detail #endif // TTG_SERIALIZATION_SUPPORTS_MADNESS - #endif // TTG_buffer_H diff --git a/ttg/ttg/parsec/parsec_data.h b/ttg/ttg/parsec/parsec_data.h index 843bf82a29..e7b1c9f564 100644 --- a/ttg/ttg/parsec/parsec_data.h +++ b/ttg/ttg/parsec/parsec_data.h @@ -7,9 +7,12 @@ namespace ttg_parsec::detail { template void foreach_parsec_data(Value&& value, Fn&& fn) { - ttg::detail::buffer_apply(value, [&](B&& b){ - fn(detail::get_parsec_data(b)); - }); + /* protect for non-serializable types, allowed if the TT has no device op */ + if constexpr (ttg::detail::has_buffer_apply_v) { + ttg::detail::buffer_apply(value, [&](B&& b){ + fn(detail::get_parsec_data(b)); + }); + } } } // namespace ttg_parsec::detail From 079b694074ddc57e72f7da42a37f048f2549cebe Mon Sep 17 00:00:00 2001 From: Joseph Schuchart Date: Tue, 12 Nov 2024 18:15:32 -0500 Subject: [PATCH 08/39] Fix usage of smart-pointer for buffers For now only allow T[] on smart pointers passed to buffers. We may want to support both T and T[] on ttg::Buffer so we can distinguish the two later. Signed-off-by: Joseph Schuchart --- ttg/ttg/madness/buffer.h | 24 +++++++++++++++++++----- ttg/ttg/parsec/buffer.h | 38 ++++++++++++++++++-------------------- 2 files changed, 37 insertions(+), 25 deletions(-) diff --git a/ttg/ttg/madness/buffer.h b/ttg/ttg/madness/buffer.h index fba1ee9e15..3aeddd2434 100644 --- a/ttg/ttg/madness/buffer.h +++ b/ttg/ttg/madness/buffer.h @@ -4,6 +4,7 @@ #include "ttg/serialization/traits.h" #include "ttg/device/device.h" +#include namespace ttg_madness { @@ -23,8 +24,8 @@ struct Buffer : private Allocator { private: using delete_fn_t = std::function; - using host_data_ptr = std::add_pointer_t; + std::shared_ptr m_sptr; // to capture smart pointers host_data_ptr m_host_data = nullptr; std::size_t m_count = 0; bool m_owned= false; @@ -58,9 +59,22 @@ struct Buffer : private Allocator { /* Constructing a buffer using application-managed memory. * The memory pointed to by ptr must be accessible during * the life-time of the buffer. */ - Buffer(element_type* ptr, std::size_t n = 1) + template + Buffer(std::unique_ptr ptr, std::size_t n) + : allocator_type() + , m_sptr(std::move(ptr)) + , m_host_data(m_sptr.get()) + , m_count(n) + , m_owned(false) + { } + + /* Constructing a buffer using application-managed memory. + * The memory pointed to by ptr must be accessible during + * the life-time of the buffer. */ + Buffer(std::shared_ptr ptr, std::size_t n) : allocator_type() - , m_host_data(ptr) + , m_sptr(std::move(ptr)) + , m_host_data(m_sptr.get()) , m_count(n) , m_owned(false) { } @@ -293,12 +307,12 @@ struct Buffer : private Allocator { if constexpr (ttg::detail::is_output_archive_v) { std::size_t s = size(); ar& s; - ar << wrap(host_ptr(), s); + ar << madness::archive::wrap(host_ptr(), s); } else { std::size_t s; ar & s; reset(s); - ar >> wrap(host_ptr(), s); // MatrixTile(bm.rows(), bm.cols()); + ar >> madness::archive::wrap(host_ptr(), s); // MatrixTile(bm.rows(), bm.cols()); } } #endif // TTG_SERIALIZATION_SUPPORTS_MADNESS diff --git a/ttg/ttg/parsec/buffer.h b/ttg/ttg/parsec/buffer.h index 5f6762ab69..ccebfb48e6 100644 --- a/ttg/ttg/parsec/buffer.h +++ b/ttg/ttg/parsec/buffer.h @@ -40,6 +40,17 @@ namespace detail { } }; + /* overloads for pointers and smart pointers */ + template + inline T* to_address(T* ptr) { + return ptr; + } + + template + inline auto to_address(T&& ptr) { + return ptr.get(); // smart pointer + } + /** * Wrapper type to carry the Allocator into types that are using * the PaRSEC object system. @@ -65,7 +76,7 @@ namespace detail { if constexpr (std::is_pointer_v) { m_ptr = allocator_traits::allocate(m_allocator, size); } - this->device_private = &(*m_ptr); + this->device_private = m_ptr; m_size = size; } @@ -84,7 +95,7 @@ namespace detail { constexpr const bool is_empty_allocator = std::is_same_v>; assert(is_empty_allocator); m_ptr = std::move(ptr); - this->device_private = &(*m_ptr); + this->device_private = to_address(m_ptr); } void construct(std::size_t size, const allocator_type& alloc = allocator_type()) { @@ -92,7 +103,7 @@ namespace detail { assert(!is_empty_allocator); m_allocator = alloc; allocate(size); - this->device_private = &(*m_ptr); + this->device_private = m_ptr; } ~data_copy_type() { @@ -202,7 +213,7 @@ namespace detail { /* create the host copy and allocate host memory */ data_copy_type *copy = PARSEC_OBJ_NEW(data_copy_type); - copy->construct(ptr, size); + copy->construct(std::move(ptr), size); parsec_data_copy_attach(data, copy, 0); /* adjust data flags */ @@ -279,27 +290,14 @@ struct Buffer { , m_count(n) { } - Buffer(std::shared_ptr ptr) - : m_data(detail::ttg_parsec_data_types, - detail::empty_allocator> - ::create_data(ptr, sizeof(element_type))) - , m_count(1) - { } - - Buffer(std::unique_ptr ptr, std::size_t n) - : m_data(detail::ttg_parsec_data_types, + template + Buffer(std::unique_ptr ptr, std::size_t n) + : m_data(detail::ttg_parsec_data_types, detail::empty_allocator> ::create_data(ptr, n*sizeof(element_type))) , m_count(n) { } - Buffer(std::unique_ptr ptr) - : m_data(detail::ttg_parsec_data_types, - detail::empty_allocator> - ::create_data(ptr, sizeof(element_type))) - , m_count(1) - { } - virtual ~Buffer() { release_data(); unpin(); // make sure the copies are not pinned From 659746ee6b509cc24f46e0069321337f9123cc37 Mon Sep 17 00:00:00 2001 From: Joseph Schuchart Date: Wed, 13 Nov 2024 16:49:15 -0500 Subject: [PATCH 09/39] Bring back Buffer::reset(n) needed for serialization Signed-off-by: Joseph Schuchart --- ttg/ttg/parsec/buffer.h | 13 ++++--------- 1 file changed, 4 insertions(+), 9 deletions(-) diff --git a/ttg/ttg/parsec/buffer.h b/ttg/ttg/parsec/buffer.h index ccebfb48e6..7f2dfaa840 100644 --- a/ttg/ttg/parsec/buffer.h +++ b/ttg/ttg/parsec/buffer.h @@ -487,12 +487,7 @@ struct Buffer { /* Reallocate the buffer with count elements */ void reset(std::size_t n, ttg::scope scope = ttg::scope::SyncIn) { release_data(); - - if (n > 0) { - m_data = detail::ttg_parsec_data_types - ::create_data(n*sizeof(element_type)); - } - + m_data = detail::ttg_parsec_data_types::create_data(n*sizeof(element_type), scope); m_count = n; } @@ -526,8 +521,8 @@ struct Buffer { void prefer_device(ttg::device::Device dev) { /* only set device if the host has the latest copy as otherwise we might end up with a stale copy */ - if (dev.is_device() && this->parsec_data()->owner_device == 0) { - parsec_advise_data_on_device(this->parsec_data(), detail::ttg_device_to_parsec_device(dev), + if (dev.is_device() && m_data->owner_device == 0) { + parsec_advise_data_on_device(m_data, detail::ttg_device_to_parsec_device(dev), PARSEC_DEV_DATA_ADVICE_PREFERRED_DEVICE); } } @@ -539,7 +534,7 @@ struct Buffer { add_copy(detail::ttg_device_to_parsec_device(dev), ptr); if (is_current) { // mark the data as being current on the new device - parsec_data()->owner_device = detail::ttg_device_to_parsec_device(dev); + m_data->owner_device = detail::ttg_device_to_parsec_device(dev); } } From 42d041fe20c06fe5ec2cecafa0577b9993fdfe2a Mon Sep 17 00:00:00 2001 From: Joseph Schuchart Date: Wed, 13 Nov 2024 18:23:29 -0500 Subject: [PATCH 10/39] Remove exception from empty_allocator::deallocate() The data copy will call deallocate no matter what, so be graceful. We will still catch cases where we try to allocate through the empty allocator. Signed-off-by: Joseph Schuchart --- ttg/ttg/parsec/buffer.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/ttg/ttg/parsec/buffer.h b/ttg/ttg/parsec/buffer.h index 7f2dfaa840..9c21758b17 100644 --- a/ttg/ttg/parsec/buffer.h +++ b/ttg/ttg/parsec/buffer.h @@ -36,7 +36,7 @@ namespace detail { } void deallocate(value_type* ptr, std::size_t size) { - throw std::runtime_error("Deallocate on empty allocator!"); + /* nothing to be done; will be called from ~data_copy_type() */ } }; From 1295bacec69b93ba081041edbf709f961aa22bac Mon Sep 17 00:00:00 2001 From: Joseph Schuchart Date: Fri, 15 Nov 2024 11:15:19 -0500 Subject: [PATCH 11/39] Fix serialization and remove split metadata from MatrixTile We can now inspect the buffers of the tile to extract its memory regions. Signed-off-by: Joseph Schuchart --- examples/matrixtile.h | 36 +++----------------------------- examples/potrf/testing_dlauum.cc | 2 -- examples/potrf/testing_dpoinv.cc | 2 -- examples/potrf/testing_dpotrf.cc | 2 -- examples/potrf/testing_dtrtri.cc | 2 -- 5 files changed, 3 insertions(+), 41 deletions(-) diff --git a/examples/matrixtile.h b/examples/matrixtile.h index e30ef2eec7..d62e7047dc 100644 --- a/examples/matrixtile.h +++ b/examples/matrixtile.h @@ -41,10 +41,8 @@ inline void allocator_fini() { } template > class MatrixTile : public ttg::TTValue> { public: - using metadata_t = typename std::tuple; - - using buffer_t = typename ttg::Buffer; - using ttvalue_type = ttg::TTValue>; + using buffer_t = typename ttg::Buffer; + using ttvalue_type = ttg::TTValue>; private: buffer_t _buffer; @@ -77,12 +75,6 @@ class MatrixTile : public ttg::TTValue> { , _lda(lda) { } - MatrixTile(const metadata_t& metadata) - : MatrixTile(std::get<0>(metadata), std::get<1>(metadata), std::get<2>(metadata)) {} - - MatrixTile(const metadata_t& metadata, T* data) - : MatrixTile(std::get<0>(metadata), std::get<1>(metadata), std::forward(data), std::get<2>(metadata)) {} - /** * Constructor with outside memory. The tile will *not* delete this memory * upon destruction. @@ -125,15 +117,6 @@ class MatrixTile : public ttg::TTValue> { return *this; } - void set_metadata(metadata_t meta) { - _rows = std::get<0>(meta); - _cols = std::get<1>(meta); - _lda = std::get<2>(meta); - this->realloc(); - } - - metadata_t get_metadata(void) const { return metadata_t{_rows, _cols, _lda}; } - // Accessing the raw data T* data() { return _buffer.host_ptr(); } @@ -199,24 +182,11 @@ class MatrixTile : public ttg::TTValue> { template void serialize(Archive& ar) { - ar & rows() & cols() & lda(); + ar & _rows & _cols & _lda; ar & buffer(); } }; -namespace ttg { - - template - struct SplitMetadataDescriptor> { - auto get_metadata(const MatrixTile& t) { return t.get_metadata(); } - - auto get_data(MatrixTile& t) { return std::array({t.size() * sizeof(T), t.data()}); } - - auto create_from_metadata(const typename MatrixTile::metadata_t& meta) { return MatrixTile(meta); } - }; - -} // namespace ttg - #ifdef TTG_SERIALIZATION_SUPPORTS_MADNESS static_assert(madness::is_serializable_v>); #endif // TTG_SERIALIZATION_SUPPORTS_MADNESS diff --git a/examples/potrf/testing_dlauum.cc b/examples/potrf/testing_dlauum.cc index bab6d676cf..0ff569dffa 100644 --- a/examples/potrf/testing_dlauum.cc +++ b/examples/potrf/testing_dlauum.cc @@ -71,8 +71,6 @@ int main(int argc, char **argv) int P = std::sqrt(world.size()); int Q = (world.size() + P - 1)/P; - static_assert(ttg::has_split_metadata>::value); - std::cout << "Creating 2D block cyclic matrix with NB " << NB << " N " << N << " M " << M << " P " << P << std::endl; parsec_matrix_sym_block_cyclic_t dcA; diff --git a/examples/potrf/testing_dpoinv.cc b/examples/potrf/testing_dpoinv.cc index 646f2477e6..ba8892e039 100644 --- a/examples/potrf/testing_dpoinv.cc +++ b/examples/potrf/testing_dpoinv.cc @@ -113,8 +113,6 @@ int main(int argc, char **argv) Q = (world.size() + P - 1)/P; } - static_assert(ttg::has_split_metadata>::value); - if(verbose) { std::cout << "Creating 2D block cyclic matrix with NB " << NB << " N " << N << " M " << M << " P " << P << std::endl; } diff --git a/examples/potrf/testing_dpotrf.cc b/examples/potrf/testing_dpotrf.cc index 781cc417ca..2200f30ef1 100644 --- a/examples/potrf/testing_dpotrf.cc +++ b/examples/potrf/testing_dpotrf.cc @@ -92,8 +92,6 @@ int main(int argc, char **argv) check = false; } - static_assert(ttg::has_split_metadata>::value); - if (world.rank() == 0) { std::cout << "Creating 2D block cyclic matrix with NB " << NB << " N " << N << " M " << M << " P " << P << " Q " << Q << std::endl; } diff --git a/examples/potrf/testing_dtrtri.cc b/examples/potrf/testing_dtrtri.cc index 3124ab2be0..41ff385e5b 100644 --- a/examples/potrf/testing_dtrtri.cc +++ b/examples/potrf/testing_dtrtri.cc @@ -92,8 +92,6 @@ int main(int argc, char **argv) check = false; } - static_assert(ttg::has_split_metadata>::value); - std::cout << "Creating 2D block cyclic matrix with NB " << NB << " N " << N << " M " << M << " P " << P << std::endl; parsec_matrix_sym_block_cyclic_t dcA; From bcde59a3e334efcfc232eef0de3e954f4b2ddd69 Mon Sep 17 00:00:00 2001 From: Joseph Schuchart Date: Fri, 15 Nov 2024 11:17:11 -0500 Subject: [PATCH 12/39] Rename BufferInspectorArchive to BufferVisitorArchive Signed-off-by: Joseph Schuchart --- tests/unit/device_coro.cc | 2 +- ttg/ttg/buffer.h | 20 ++++++++++---------- ttg/ttg/parsec/ttg.h | 2 +- 3 files changed, 12 insertions(+), 12 deletions(-) diff --git a/tests/unit/device_coro.cc b/tests/unit/device_coro.cc index f2340f259f..a8eabdaf03 100644 --- a/tests/unit/device_coro.cc +++ b/tests/unit/device_coro.cc @@ -62,7 +62,7 @@ namespace madness { } // namespace archive } // namespace madness -static_assert(madness::is_serializable_v, derived_value_t>); +static_assert(madness::is_serializable_v, derived_value_t>); static_assert(ttg::detail::has_buffer_apply_v); diff --git a/ttg/ttg/buffer.h b/ttg/ttg/buffer.h index 59f97bf70b..8f664215f7 100644 --- a/ttg/ttg/buffer.h +++ b/ttg/ttg/buffer.h @@ -61,13 +61,13 @@ namespace detail { namespace madness { namespace archive { template - struct BufferInspectorArchive : public madness::archive::BaseOutputArchive { + struct BufferVisitorArchive : public madness::archive::BaseOutputArchive { private: Fn m_fn; public: template - BufferInspectorArchive(_Fn&& fn) + BufferVisitorArchive(_Fn&& fn) : m_fn(fn) { } @@ -109,30 +109,30 @@ namespace madness { /* deduction guide */ template - BufferInspectorArchive(Fn&&) -> BufferInspectorArchive; + BufferVisitorArchive(Fn&&) -> BufferVisitorArchive; } // namespace archive template - struct is_archive> : std::true_type {}; + struct is_archive> : std::true_type {}; template - struct is_output_archive> : std::true_type {}; + struct is_output_archive> : std::true_type {}; template - struct is_default_serializable_helper, T, + struct is_default_serializable_helper, T, std::enable_if_t::value>> : std::true_type {}; template - struct is_default_serializable_helper, ttg::Buffer> + struct is_default_serializable_helper, ttg::Buffer> : std::true_type {}; } // namespace madness namespace ttg::detail { template - requires(madness::is_serializable_v, std::decay>) + requires(madness::is_serializable_v, std::decay>) void buffer_apply(T&& t, Fn&& fn) { - madness::archive::BufferInspectorArchive ar(std::forward(fn)); + madness::archive::BufferVisitorArchive ar(std::forward(fn)); ar & t; } @@ -140,7 +140,7 @@ namespace ttg::detail { using buffer_apply_dummy_fn = decltype([](const ttg::Buffer&){}); template - struct has_buffer_apply_helper, std::decay_t>>> + struct has_buffer_apply_helper, std::decay_t>>> : std::true_type { }; diff --git a/ttg/ttg/parsec/ttg.h b/ttg/ttg/parsec/ttg.h index 852dcfebe7..cb9b09c32c 100644 --- a/ttg/ttg/parsec/ttg.h +++ b/ttg/ttg/parsec/ttg.h @@ -3483,7 +3483,7 @@ namespace ttg_parsec { check_parsec_data(detail::get_parsec_data(buffer)); }); } else { - throw std::runtime_error("Value type must be serializable with ttg::BufferInspectorArchive"); + throw std::runtime_error("Value type must be serializable with ttg::BufferVisitorArchive"); } } From 1bf4b117aa7176faee2714a516b8dbb72b594564 Mon Sep 17 00:00:00 2001 From: Joseph Schuchart Date: Fri, 15 Nov 2024 12:16:32 -0500 Subject: [PATCH 13/39] Bump PaRSEC commit sha to point to a branch with parsec_data_discard() Signed-off-by: Joseph Schuchart --- cmake/modules/ExternalDependenciesVersions.cmake | 2 +- cmake/modules/FindOrFetchPARSEC.cmake | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/cmake/modules/ExternalDependenciesVersions.cmake b/cmake/modules/ExternalDependenciesVersions.cmake index dd7347c254..89654a2562 100644 --- a/cmake/modules/ExternalDependenciesVersions.cmake +++ b/cmake/modules/ExternalDependenciesVersions.cmake @@ -4,7 +4,7 @@ set(TTG_TRACKED_VG_CMAKE_KIT_TAG 878654d0cb1904049fbd2c37b37d5385ae897658) # provides FindOrFetchLinalgPP and "real" FindOrFetchBoost set(TTG_TRACKED_CATCH2_VERSION 3.5.0) set(TTG_TRACKED_MADNESS_TAG 93a9a5cec2a8fa87fba3afe8056607e6062a9058) -set(TTG_TRACKED_PARSEC_TAG 58f8f3089ecad2e8ee50e80a9586e05ce8873b1c) +set(TTG_TRACKED_PARSEC_TAG 996dda4c0ff3120bc65385f86e999befd4b3fe7a) set(TTG_TRACKED_BTAS_TAG c25b0a11d2a76190bfb13fa72f9e9dc3e57c3c2f) set(TTG_TRACKED_TILEDARRAY_TAG 5944bdba3266a3fa19f1809c8e2accf3dad4d815) diff --git a/cmake/modules/FindOrFetchPARSEC.cmake b/cmake/modules/FindOrFetchPARSEC.cmake index b3fd5faa3a..7b164019fe 100644 --- a/cmake/modules/FindOrFetchPARSEC.cmake +++ b/cmake/modules/FindOrFetchPARSEC.cmake @@ -17,7 +17,7 @@ if (NOT TARGET PaRSEC::parsec) FetchContent_Declare( PARSEC - GIT_REPOSITORY https://github.com/ICLDisco/parsec.git + GIT_REPOSITORY https://github.com/devreal/parsec-1.git GIT_TAG ${TTG_TRACKED_PARSEC_TAG} ) FetchContent_MakeAvailable(PARSEC) From 671024f2a9d82ab113838be2195467608d6839bd Mon Sep 17 00:00:00 2001 From: Joseph Schuchart Date: Mon, 18 Nov 2024 17:11:33 -0500 Subject: [PATCH 14/39] Fix usage of Allocator in MatrixTile Signed-off-by: Joseph Schuchart --- examples/matrixtile.h | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/examples/matrixtile.h b/examples/matrixtile.h index d62e7047dc..4288d0aa4e 100644 --- a/examples/matrixtile.h +++ b/examples/matrixtile.h @@ -41,8 +41,8 @@ inline void allocator_fini() { } template > class MatrixTile : public ttg::TTValue> { public: - using buffer_t = typename ttg::Buffer; - using ttvalue_type = ttg::TTValue>; + using buffer_t = typename ttg::Buffer; + using ttvalue_type = ttg::TTValue>; private: buffer_t _buffer; From 0429c7e61f8ef826322f33b83b0bb006e5eea57c Mon Sep 17 00:00:00 2001 From: Joseph Schuchart Date: Mon, 18 Nov 2024 17:26:49 -0500 Subject: [PATCH 15/39] Bring back scopes on Buffer construction This allows us to mark buffers as allocate-only. SyncIn is the default. Ignored in the MANDESS backend. Signed-off-by: Joseph Schuchart --- ttg/ttg/madness/buffer.h | 50 +++++------------------ ttg/ttg/parsec/buffer.h | 86 ++++++++-------------------------------- 2 files changed, 25 insertions(+), 111 deletions(-) diff --git a/ttg/ttg/madness/buffer.h b/ttg/ttg/madness/buffer.h index 3aeddd2434..1d92d7551c 100644 --- a/ttg/ttg/madness/buffer.h +++ b/ttg/ttg/madness/buffer.h @@ -49,7 +49,7 @@ struct Buffer : private Allocator { Buffer() : Buffer(nullptr, 0) { } - Buffer(std::size_t n) + Buffer(std::size_t n, ttg::scope scope = ttg::scope::SyncIn) : allocator_type() , m_host_data(allocate(n)) , m_count(n) @@ -60,7 +60,7 @@ struct Buffer : private Allocator { * The memory pointed to by ptr must be accessible during * the life-time of the buffer. */ template - Buffer(std::unique_ptr ptr, std::size_t n) + Buffer(std::unique_ptr ptr, std::size_t n, ttg::scope scope = ttg::scope::SyncIn) : allocator_type() , m_sptr(std::move(ptr)) , m_host_data(m_sptr.get()) @@ -71,7 +71,7 @@ struct Buffer : private Allocator { /* Constructing a buffer using application-managed memory. * The memory pointed to by ptr must be accessible during * the life-time of the buffer. */ - Buffer(std::shared_ptr ptr, std::size_t n) + Buffer(std::shared_ptr ptr, std::size_t n, ttg::scope scope = ttg::scope::SyncIn) : allocator_type() , m_sptr(std::move(ptr)) , m_host_data(m_sptr.get()) @@ -241,7 +241,7 @@ struct Buffer : private Allocator { } /* Reallocate the buffer with count elements */ - void reset(std::size_t n) { + void reset(std::size_t n, ttg::scope scope = ttg::scope::SyncIn) { if (m_owned) { deallocate(); @@ -258,47 +258,15 @@ struct Buffer : private Allocator { m_count = n; } - /* Reset the buffer to use the ptr to count elements */ - void reset(T* ptr, std::size_t n = 1) { - /* TODO: can we resize if count is smaller than m_count? */ - if (n == m_count) { - return; - } - - if (m_owned) { - deallocate(); - } - - if (nullptr == ptr) { - m_host_data = nullptr; - m_count = 0; - m_owned = false; - } else { - m_host_data = ptr; - m_count = n; - m_owned = false; - } + /** + * Resets the scope of the buffer. Ignored in MADNESS. + */ + void reset_scope(ttg::scope scope) { + /* nothing to do here */ } /* serialization support */ -#if defined(TTG_SERIALIZATION_SUPPORTS_BOOST) && 0 - template - void serialize(Archive& ar, const unsigned int version) { - if constexpr (ttg::detail::is_output_archive_v) { - std::size_t s = size(); - ar& s; - /* TODO: how to serialize the array? */ - } else { - std::size_t s; - ar & s; - /* initialize internal pointers and then reset */ - reset(s); - /* TODO: how to deserialize the array? */ - } - } -#endif // TTG_SERIALIZATION_SUPPORTS_BOOST - #if defined(TTG_SERIALIZATION_SUPPORTS_MADNESS) template std::enable_if_t || diff --git a/ttg/ttg/parsec/buffer.h b/ttg/ttg/parsec/buffer.h index 9c21758b17..105a7b9667 100644 --- a/ttg/ttg/parsec/buffer.h +++ b/ttg/ttg/parsec/buffer.h @@ -98,7 +98,8 @@ namespace detail { this->device_private = to_address(m_ptr); } - void construct(std::size_t size, const allocator_type& alloc = allocator_type()) { + void construct(std::size_t size, + const allocator_type& alloc = allocator_type()) { constexpr const bool is_empty_allocator = std::is_same_v>; assert(!is_empty_allocator); m_allocator = alloc; @@ -111,65 +112,6 @@ namespace detail { } }; -#if 0 - /** - * derived from parsec_data_t managing an allocator for host memory - * and creating a host copy through a callback - */ - struct data_type : public parsec_data_t { - private: - [[no_unique_address]] - Allocator host_allocator; - - allocator_type& get_allocator_reference() { return static_cast(*this); } - - element_type* do_allocate(std::size_t n) { - return allocator_traits::allocate(get_allocator_reference(), n); - } - - public: - - explicit data_type(const Allocator& host_alloc = Allocator()) - : host_allocator(host_alloc) - { } - - void* allocate(std::size_t size) { - return do_allocate(size); - } - - void deallocate(void* ptr, std::size_t size) { - allocator_traits::deallocate(get_allocator_reference(), ptr, size); - } - }; - - /* Allocate the host copy for a given data_t */ - static parsec_data_copy_t* data_copy_allocate(parsec_data_t* pdata, int device) { - if (device != 0) return nullptr; // we only allocate for the host - data_type* data = static_cast(pdata); // downcast - data_copy_type* copy = PARSEC_OBJ_NEW(data_copy_type); - copy->device_private = data->allocate(data->nb_elts); - return copy; - } - /** - * Create the PaRSEC object infrastructure for the data copy type - */ - inline void data_construct(data_type* obj) - { - obj->allocate_cb = &data_copy_allocate; - } - - static void data_destruct(data_type* obj) - { - /* cleanup alloctor instance */ - obj->~data_type(); - } - - static constexpr PARSEC_OBJ_CLASS_INSTANCE(data_type, parsec_data_t, - data_copy_construct, - data_copy_destruct); - -#endif // 0 - /** * Create the PaRSEC object infrastructure for the data copy type */ @@ -187,7 +129,7 @@ namespace detail { data_copy_construct, data_copy_destruct); - static parsec_data_t * create_data(std::size_t size, + static parsec_data_t * create_data(std::size_t size, ttg::scope scope, const allocator_type& allocator = allocator_type()) { parsec_data_t *data = PARSEC_OBJ_NEW(parsec_data_t); data->owner_device = 0; @@ -201,12 +143,13 @@ namespace detail { /* adjust data flags */ data->device_copies[0]->flags |= PARSEC_DATA_FLAG_PARSEC_MANAGED; data->device_copies[0]->coherency_state = PARSEC_DATA_COHERENCY_SHARED; - data->device_copies[0]->version = 1; + /* setting version to 0 causes data not to be sent to the device */ + data->device_copies[0]->version = (scope == ttg::scope::SyncIn) ? 1 : 0; return data; } - static parsec_data_t * create_data(PtrT& ptr, std::size_t size) { + static parsec_data_t * create_data(PtrT& ptr, std::size_t size, ttg::scope scope) { parsec_data_t *data = PARSEC_OBJ_NEW(parsec_data_t); data->owner_device = 0; data->nb_elts = size; @@ -219,7 +162,8 @@ namespace detail { /* adjust data flags */ data->device_copies[0]->flags |= PARSEC_DATA_FLAG_PARSEC_MANAGED; data->device_copies[0]->coherency_state = PARSEC_DATA_COHERENCY_SHARED; - data->device_copies[0]->version = 1; + /* setting version to 0 causes data not to be sent to the device */ + data->device_copies[0]->version = (scope == ttg::scope::SyncIn) ? 1 : 0; return data; } @@ -273,8 +217,8 @@ struct Buffer { Buffer() { } - Buffer(std::size_t n) - : m_data(detail::ttg_parsec_data_types::create_data(n*sizeof(element_type))) + Buffer(std::size_t n, ttg::scope scope = ttg::scope::SyncIn) + : m_data(detail::ttg_parsec_data_types::create_data(n*sizeof(element_type), scope)) , m_count(n) { } @@ -283,18 +227,20 @@ struct Buffer { * The shared_ptr will ensure that the memory is not free'd before * the runtime has released all of its references. */ - Buffer(std::shared_ptr ptr, std::size_t n) + Buffer(std::shared_ptr ptr, std::size_t n, + ttg::scope scope = ttg::scope::SyncIn) : m_data(detail::ttg_parsec_data_types, detail::empty_allocator> - ::create_data(ptr, n*sizeof(element_type))) + ::create_data(ptr, n*sizeof(element_type), scope)) , m_count(n) { } template - Buffer(std::unique_ptr ptr, std::size_t n) + Buffer(std::unique_ptr ptr, std::size_t n, + ttg::scope scope = ttg::scope::SyncIn) : m_data(detail::ttg_parsec_data_types, detail::empty_allocator> - ::create_data(ptr, n*sizeof(element_type))) + ::create_data(ptr, n*sizeof(element_type), scope)) , m_count(n) { } From 281fec3b5ebfe33a6e236bcfcfc93e6b9aeae95a Mon Sep 17 00:00:00 2001 From: Joseph Schuchart Date: Mon, 18 Nov 2024 18:36:00 -0500 Subject: [PATCH 16/39] Buffer: handle constness in connection with smart pointers Signed-off-by: Joseph Schuchart --- ttg/ttg/parsec/buffer.h | 27 +++++++++++++-------------- 1 file changed, 13 insertions(+), 14 deletions(-) diff --git a/ttg/ttg/parsec/buffer.h b/ttg/ttg/parsec/buffer.h index 105a7b9667..7f204840e0 100644 --- a/ttg/ttg/parsec/buffer.h +++ b/ttg/ttg/parsec/buffer.h @@ -95,7 +95,7 @@ namespace detail { constexpr const bool is_empty_allocator = std::is_same_v>; assert(is_empty_allocator); m_ptr = std::move(ptr); - this->device_private = to_address(m_ptr); + this->device_private = const_cast(to_address(m_ptr)); } void construct(std::size_t size, @@ -133,7 +133,7 @@ namespace detail { const allocator_type& allocator = allocator_type()) { parsec_data_t *data = PARSEC_OBJ_NEW(parsec_data_t); data->owner_device = 0; - data->nb_elts = size; + data->nb_elts = size*sizeof(value_type); /* create the host copy and allocate host memory */ data_copy_type *copy = PARSEC_OBJ_NEW(data_copy_type); @@ -152,7 +152,7 @@ namespace detail { static parsec_data_t * create_data(PtrT& ptr, std::size_t size, ttg::scope scope) { parsec_data_t *data = PARSEC_OBJ_NEW(parsec_data_t); data->owner_device = 0; - data->nb_elts = size; + data->nb_elts = size*sizeof(value_type); /* create the host copy and allocate host memory */ data_copy_type *copy = PARSEC_OBJ_NEW(data_copy_type); @@ -196,7 +196,6 @@ struct Buffer { private: using delete_fn_t = std::function; - using host_data_ptr = std::add_pointer_t; parsec_data_t *m_data = nullptr; std::size_t m_count = 0; @@ -218,7 +217,7 @@ struct Buffer { { } Buffer(std::size_t n, ttg::scope scope = ttg::scope::SyncIn) - : m_data(detail::ttg_parsec_data_types::create_data(n*sizeof(element_type), scope)) + : m_data(detail::ttg_parsec_data_types::create_data(n, scope)) , m_count(n) { } @@ -227,20 +226,20 @@ struct Buffer { * The shared_ptr will ensure that the memory is not free'd before * the runtime has released all of its references. */ - Buffer(std::shared_ptr ptr, std::size_t n, + Buffer(std::shared_ptr ptr, std::size_t n, ttg::scope scope = ttg::scope::SyncIn) - : m_data(detail::ttg_parsec_data_types, - detail::empty_allocator> - ::create_data(ptr, n*sizeof(element_type), scope)) + : m_data(detail::ttg_parsec_data_types, + detail::empty_allocator> + ::create_data(ptr, n, scope)) , m_count(n) { } template - Buffer(std::unique_ptr ptr, std::size_t n, + Buffer(std::unique_ptr ptr, std::size_t n, ttg::scope scope = ttg::scope::SyncIn) - : m_data(detail::ttg_parsec_data_types, - detail::empty_allocator> - ::create_data(ptr, n*sizeof(element_type), scope)) + : m_data(detail::ttg_parsec_data_types, + detail::empty_allocator> + ::create_data(ptr, n, scope)) , m_count(n) { } @@ -433,7 +432,7 @@ struct Buffer { /* Reallocate the buffer with count elements */ void reset(std::size_t n, ttg::scope scope = ttg::scope::SyncIn) { release_data(); - m_data = detail::ttg_parsec_data_types::create_data(n*sizeof(element_type), scope); + m_data = detail::ttg_parsec_data_types::create_data(n, scope); m_count = n; } From 1effdc1760d2f259d088c1ce2f6c2d2ec5bef655 Mon Sep 17 00:00:00 2001 From: Joseph Schuchart Date: Mon, 18 Nov 2024 22:06:29 -0500 Subject: [PATCH 17/39] Add missing parsec_data.h to install list Signed-off-by: Joseph Schuchart --- ttg/CMakeLists.txt | 1 + 1 file changed, 1 insertion(+) diff --git a/ttg/CMakeLists.txt b/ttg/CMakeLists.txt index 27cb08526d..be834a6341 100644 --- a/ttg/CMakeLists.txt +++ b/ttg/CMakeLists.txt @@ -237,6 +237,7 @@ if (TARGET PaRSEC::parsec) ${CMAKE_CURRENT_SOURCE_DIR}/ttg/parsec/import.h ${CMAKE_CURRENT_SOURCE_DIR}/ttg/parsec/ptr.h ${CMAKE_CURRENT_SOURCE_DIR}/ttg/parsec/parsec-ext.h + ${CMAKE_CURRENT_SOURCE_DIR}/ttg/parsec/parsec_data.h ${CMAKE_CURRENT_SOURCE_DIR}/ttg/parsec/task.h ${CMAKE_CURRENT_SOURCE_DIR}/ttg/parsec/thread_local.h ${CMAKE_CURRENT_SOURCE_DIR}/ttg/parsec/ttg.h From 5e7d765a6fa9e7a9220096cfb2b24a9617894df0 Mon Sep 17 00:00:00 2001 From: Joseph Schuchart Date: Tue, 19 Nov 2024 15:41:20 -0500 Subject: [PATCH 18/39] Buffer: in-place construct data_copy_type in construct callback Signed-off-by: Joseph Schuchart --- ttg/ttg/parsec/buffer.h | 14 ++++++++++---- 1 file changed, 10 insertions(+), 4 deletions(-) diff --git a/ttg/ttg/parsec/buffer.h b/ttg/ttg/parsec/buffer.h index 7f204840e0..b3d3e465c4 100644 --- a/ttg/ttg/parsec/buffer.h +++ b/ttg/ttg/parsec/buffer.h @@ -88,7 +88,12 @@ namespace detail { public: - /* allocation through PARSEC_OBJ_NEW, so no inline constructor */ + /* default construction and move, but not copy */ + data_copy_type() = default; + data_copy_type(const data_copy_type&) = delete; + data_copy_type(data_copy_type&&) = default; + data_copy_type& operator=(const data_copy_type&) = delete; + data_copy_type& operator=(data_copy_type&&) = default; void construct(PtrT ptr, std::size_t size) { m_allocator = allocator_type{}; @@ -117,7 +122,8 @@ namespace detail { */ static void data_copy_construct(data_copy_type* obj) { - /* nothing to do */ + /* placement new */ + new(obj)(data_copy_type); } static void data_copy_destruct(data_copy_type* obj) @@ -229,7 +235,7 @@ struct Buffer { Buffer(std::shared_ptr ptr, std::size_t n, ttg::scope scope = ttg::scope::SyncIn) : m_data(detail::ttg_parsec_data_types, - detail::empty_allocator> + detail::empty_allocator> ::create_data(ptr, n, scope)) , m_count(n) { } @@ -238,7 +244,7 @@ struct Buffer { Buffer(std::unique_ptr ptr, std::size_t n, ttg::scope scope = ttg::scope::SyncIn) : m_data(detail::ttg_parsec_data_types, - detail::empty_allocator> + detail::empty_allocator> ::create_data(ptr, n, scope)) , m_count(n) { } From 04b67fd589c0a5a5ff334cfe54a910eae7189d87 Mon Sep 17 00:00:00 2001 From: Joseph Schuchart Date: Tue, 19 Nov 2024 17:16:26 -0500 Subject: [PATCH 19/39] Protect parsec data buffer inspection against empty buffers Signed-off-by: Joseph Schuchart --- ttg/ttg/parsec/parsec_data.h | 5 ++- ttg/ttg/parsec/ttg.h | 67 +++++++++++++++++------------------- 2 files changed, 35 insertions(+), 37 deletions(-) diff --git a/ttg/ttg/parsec/parsec_data.h b/ttg/ttg/parsec/parsec_data.h index e7b1c9f564..ab9dba219a 100644 --- a/ttg/ttg/parsec/parsec_data.h +++ b/ttg/ttg/parsec/parsec_data.h @@ -10,7 +10,10 @@ namespace ttg_parsec::detail { /* protect for non-serializable types, allowed if the TT has no device op */ if constexpr (ttg::detail::has_buffer_apply_v) { ttg::detail::buffer_apply(value, [&](B&& b){ - fn(detail::get_parsec_data(b)); + parsec_data_t *data = detail::get_parsec_data(b); + if (nullptr != data) { + fn(data); + } }); } } diff --git a/ttg/ttg/parsec/ttg.h b/ttg/ttg/parsec/ttg.h index cb9b09c32c..4783dbe1cc 100644 --- a/ttg/ttg/parsec/ttg.h +++ b/ttg/ttg/parsec/ttg.h @@ -746,9 +746,8 @@ namespace ttg_parsec { template inline void transfer_ownership_impl(T&& arg, int device) { - if constexpr(!std::is_const_v> && ttg::detail::has_buffer_apply_v) { - ttg::detail::buffer_apply(arg, [&](auto&& buffer){ - auto *data = detail::get_parsec_data(buffer); + if constexpr(!std::is_const_v>) { + detail::foreach_parsec_data(arg, [&](parsec_data_t *data){ parsec_data_transfer_ownership_to_copy(data, device, PARSEC_FLOW_ACCESS_RW); /* make sure we increment the version since we will modify the data */ data->device_copies[0]->version++; @@ -3450,41 +3449,37 @@ namespace ttg_parsec { template void copy_mark_pushout(const Value& value) { - if constexpr (ttg::detail::has_buffer_apply_v) { - assert(detail::parsec_ttg_caller->dev_ptr && detail::parsec_ttg_caller->dev_ptr->gpu_task); - parsec_gpu_task_t *gpu_task = detail::parsec_ttg_caller->dev_ptr->gpu_task; - auto check_parsec_data = [&](parsec_data_t* data) { - if (data->owner_device != 0) { - /* find the flow */ - int flowidx = 0; - while (flowidx < MAX_PARAM_COUNT && - gpu_task->flow[flowidx]->flow_flags != PARSEC_FLOW_ACCESS_NONE) { - if (detail::parsec_ttg_caller->parsec_task.data[flowidx].data_in->original == data) { - /* found the right data, set the corresponding flow as pushout */ - break; - } - ++flowidx; - } - if (flowidx == MAX_PARAM_COUNT) { - throw std::runtime_error("Cannot add more than MAX_PARAM_COUNT flows to a task!"); - } - if (gpu_task->flow[flowidx]->flow_flags == PARSEC_FLOW_ACCESS_NONE) { - /* no flow found, add one and mark it pushout */ - detail::parsec_ttg_caller->parsec_task.data[flowidx].data_in = data->device_copies[0]; - gpu_task->flow_nb_elts[flowidx] = data->nb_elts; + assert(detail::parsec_ttg_caller->dev_ptr && detail::parsec_ttg_caller->dev_ptr->gpu_task); + parsec_gpu_task_t *gpu_task = detail::parsec_ttg_caller->dev_ptr->gpu_task; + auto check_parsec_data = [&](parsec_data_t* data) { + if (data->owner_device != 0) { + /* find the flow */ + int flowidx = 0; + while (flowidx < MAX_PARAM_COUNT && + gpu_task->flow[flowidx]->flow_flags != PARSEC_FLOW_ACCESS_NONE) { + if (detail::parsec_ttg_caller->parsec_task.data[flowidx].data_in->original == data) { + /* found the right data, set the corresponding flow as pushout */ + break; } - /* need to mark the flow WRITE to make PaRSEC happy */ - ((parsec_flow_t *)gpu_task->flow[flowidx])->flow_flags |= PARSEC_FLOW_ACCESS_WRITE; - gpu_task->pushout |= 1<(const ttg::Buffer& buffer){ - check_parsec_data(detail::get_parsec_data(buffer)); - }); - } else { - throw std::runtime_error("Value type must be serializable with ttg::BufferVisitorArchive"); - } + if (flowidx == MAX_PARAM_COUNT) { + throw std::runtime_error("Cannot add more than MAX_PARAM_COUNT flows to a task!"); + } + if (gpu_task->flow[flowidx]->flow_flags == PARSEC_FLOW_ACCESS_NONE) { + /* no flow found, add one and mark it pushout */ + detail::parsec_ttg_caller->parsec_task.data[flowidx].data_in = data->device_copies[0]; + gpu_task->flow_nb_elts[flowidx] = data->nb_elts; + } + /* need to mark the flow RW to make PaRSEC happy */ + ((parsec_flow_t *)gpu_task->flow[flowidx])->flow_flags |= PARSEC_FLOW_ACCESS_RW; + gpu_task->pushout |= 1< Date: Mon, 23 Dec 2024 12:13:23 -0500 Subject: [PATCH 20/39] Add missing include in madness backend Signed-off-by: Joseph Schuchart --- ttg/ttg/madness/devicefunc.h | 2 ++ 1 file changed, 2 insertions(+) diff --git a/ttg/ttg/madness/devicefunc.h b/ttg/ttg/madness/devicefunc.h index 563b433cf4..23bd167ad5 100644 --- a/ttg/ttg/madness/devicefunc.h +++ b/ttg/ttg/madness/devicefunc.h @@ -1,6 +1,8 @@ #ifndef TTG_MAD_DEVICEFUNC_H #define TTG_MAD_DEVICEFUNC_H +#include "ttg/devicescope.h" + #include "ttg/madness/buffer.h" namespace ttg_madness { From 819be34272eb291abc132805946f800d57846046 Mon Sep 17 00:00:00 2001 From: Joseph Schuchart Date: Mon, 23 Dec 2024 15:04:12 -0500 Subject: [PATCH 21/39] ParSEC: Templatize derived device ops check So we can delay evaluation until the derived struct has been populated. Signed-off-by: Joseph Schuchart --- ttg/ttg/parsec/ttg.h | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/ttg/ttg/parsec/ttg.h b/ttg/ttg/parsec/ttg.h index 4783dbe1cc..3f0ccfd042 100644 --- a/ttg/ttg/parsec/ttg.h +++ b/ttg/ttg/parsec/ttg.h @@ -1224,6 +1224,7 @@ namespace ttg_parsec { public: /// @return true if derivedT::have_cuda_op exists and is defined to true + template static constexpr bool derived_has_cuda_op() { if constexpr (ttg::meta::is_detected_v) { return derivedT::have_cuda_op; @@ -1233,6 +1234,7 @@ namespace ttg_parsec { } /// @return true if derivedT::have_hip_op exists and is defined to true + template static constexpr bool derived_has_hip_op() { if constexpr (ttg::meta::is_detected_v) { return derivedT::have_hip_op; @@ -1242,6 +1244,7 @@ namespace ttg_parsec { } /// @return true if derivedT::have_hip_op exists and is defined to true + template static constexpr bool derived_has_level_zero_op() { if constexpr (ttg::meta::is_detected_v) { return derivedT::have_level_zero_op; @@ -1251,6 +1254,7 @@ namespace ttg_parsec { } /// @return true if the TT supports device execution + template static constexpr bool derived_has_device_op() { return (derived_has_cuda_op() || derived_has_hip_op() || derived_has_level_zero_op()); } From 03cf233dadf5654e5e7160f409338646de2237a0 Mon Sep 17 00:00:00 2001 From: Joseph Schuchart Date: Mon, 23 Dec 2024 15:30:28 -0500 Subject: [PATCH 22/39] Use smart pointer for buffer in device chain benchmark Signed-off-by: Joseph Schuchart --- examples/task-benchmarks/chain-ttg-dev.cc | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/examples/task-benchmarks/chain-ttg-dev.cc b/examples/task-benchmarks/chain-ttg-dev.cc index 80f14bff44..633e80da64 100644 --- a/examples/task-benchmarks/chain-ttg-dev.cc +++ b/examples/task-benchmarks/chain-ttg-dev.cc @@ -19,12 +19,12 @@ std::atomic task_counter = 0; struct A : public ttg::TTValue { // TODO: allocate pinned memory - int v = 0; + std::unique_ptr v; ttg::Buffer b; - A() : b(&v, 1) { } + A() : v(std::make_unique(0)), b(v) { } A(A&& a) = default; - A(const A& a) : v(a.v), b(&v, 1) { } + A(const A& a) : v(std::make_unique(*a.v)), b(v) { } template void serialize(Archive& ar) { From bf86552c778354ff44ac69b45548e6f96c384ecd Mon Sep 17 00:00:00 2001 From: Joseph Schuchart Date: Mon, 23 Dec 2024 15:32:37 -0500 Subject: [PATCH 23/39] Another attempt at fixing CRTP usage in device map setter Signed-off-by: Joseph Schuchart --- ttg/ttg/parsec/ttg.h | 26 ++++++++++++++------------ 1 file changed, 14 insertions(+), 12 deletions(-) diff --git a/ttg/ttg/parsec/ttg.h b/ttg/ttg/parsec/ttg.h index 3f0ccfd042..205ac2aff7 100644 --- a/ttg/ttg/parsec/ttg.h +++ b/ttg/ttg/parsec/ttg.h @@ -1224,39 +1224,41 @@ namespace ttg_parsec { public: /// @return true if derivedT::have_cuda_op exists and is defined to true - template + template static constexpr bool derived_has_cuda_op() { - if constexpr (ttg::meta::is_detected_v) { - return derivedT::have_cuda_op; + if constexpr (ttg::meta::is_detected_v) { + return DerivedT::have_cuda_op; } else { return false; } } /// @return true if derivedT::have_hip_op exists and is defined to true - template + template static constexpr bool derived_has_hip_op() { - if constexpr (ttg::meta::is_detected_v) { - return derivedT::have_hip_op; + if constexpr (ttg::meta::is_detected_v) { + return DerivedT::have_hip_op; } else { return false; } } /// @return true if derivedT::have_hip_op exists and is defined to true - template + template static constexpr bool derived_has_level_zero_op() { - if constexpr (ttg::meta::is_detected_v) { - return derivedT::have_level_zero_op; + if constexpr (ttg::meta::is_detected_v) { + return DerivedT::have_level_zero_op; } else { return false; } } /// @return true if the TT supports device execution - template + template static constexpr bool derived_has_device_op() { - return (derived_has_cuda_op() || derived_has_hip_op() || derived_has_level_zero_op()); + return (derived_has_cuda_op() || + derived_has_hip_op() || + derived_has_level_zero_op()); } static_assert(!derived_has_device_op() || ttg::meta::probe_all_v void set_devicemap(Devicemap&& dm) { - static_assert(derived_has_device_op(), "Device map only allowed on device-enabled TT!"); + //static_assert(derived_has_device_op(), "Device map only allowed on device-enabled TT!"); if constexpr (std::is_same_v()))>) { // dm returns a Device devicemap = std::forward(dm); From 32c82003b40fe66c8ccb42a341b6401516747a45 Mon Sep 17 00:00:00 2001 From: Joseph Schuchart Date: Mon, 23 Dec 2024 17:30:49 -0500 Subject: [PATCH 24/39] Fix chain device test by removing the non-move path Signed-off-by: Joseph Schuchart --- examples/task-benchmarks/chain-ttg-dev.cc | 93 +++++++---------------- 1 file changed, 29 insertions(+), 64 deletions(-) diff --git a/examples/task-benchmarks/chain-ttg-dev.cc b/examples/task-benchmarks/chain-ttg-dev.cc index 633e80da64..66e15adbff 100644 --- a/examples/task-benchmarks/chain-ttg-dev.cc +++ b/examples/task-benchmarks/chain-ttg-dev.cc @@ -18,13 +18,10 @@ using namespace ttg; std::atomic task_counter = 0; struct A : public ttg::TTValue { - // TODO: allocate pinned memory - std::unique_ptr v; ttg::Buffer b; - A() : v(std::make_unique(0)), b(v) { } + A() : b(1) { } A(A&& a) = default; - A(const A& a) : v(std::make_unique(*a.v)), b(v) { } template void serialize(Archive& ar) { @@ -38,11 +35,11 @@ struct A : public ttg::TTValue { }; template -auto make_ttg(bool do_move); +auto make_ttg(); // flows task ids via values template <> -auto make_ttg<1>(bool do_move) { +auto make_ttg<1>() { Edge I2N, N2N; Edge N2S; @@ -57,11 +54,7 @@ auto make_ttg<1>(bool do_move) { //++task_counter; co_await ttg::device::select(value.b); if (key < NUM_TASKS) { - if (do_move) { - co_await ttg::device::forward(ttg::device::send<0>(key+1, std::move(value))); - } else { - co_await ttg::device::forward(ttg::device::send<0>(key+1, value)); - } + co_await ttg::device::forward(ttg::device::send<0>(key+1, std::move(value))); } else { } } , edges(fuse(I2N, N2N)), edges(N2N)); @@ -70,7 +63,7 @@ auto make_ttg<1>(bool do_move) { } template <> -auto make_ttg<2>(bool do_move) { +auto make_ttg<2>() { Edge I2N1, I2N2; Edge N2N1, N2N2; Edge N2S1, N2S2; @@ -83,13 +76,8 @@ auto make_ttg<2>(bool do_move) { auto next = make_tt([=](const int &key, A&& v1, A&& v2) -> ttg::device::Task { co_await ttg::device::select(v1.b, v2.b); if (key < NUM_TASKS) { - if (do_move) { - co_await ttg::device::forward(ttg::device::send<0>(key+1, std::move(v1)), - ttg::device::send<1>(key+1, std::move(v2))); - } else { - co_await ttg::device::forward(ttg::device::send<0>(key+1, v1), - ttg::device::send<1>(key+1, v2)); - } + co_await ttg::device::forward(ttg::device::send<0>(key+1, std::move(v1)), + ttg::device::send<1>(key+1, std::move(v2))); } } , edges(fuse(I2N1, N2N1), fuse(I2N2, N2N2)), edges(N2N1, N2N2)); @@ -97,7 +85,7 @@ auto make_ttg<2>(bool do_move) { } template <> -auto make_ttg<4>(bool do_move) { +auto make_ttg<4>() { Edge I2N1, I2N2, I2N3, I2N4; Edge N2N1, N2N2, N2N3, N2N4; Edge N2S1, N2S2, N2S3, N2S4; @@ -113,17 +101,10 @@ auto make_ttg<4>(bool do_move) { auto next = make_tt([=](const int &key, A&& v1, A&& v2, A&& v3, A&& v4) -> ttg::device::Task { co_await ttg::device::select(v1.b, v2.b, v3.b, v4.b); if (key < NUM_TASKS) { - if (do_move) { - co_await ttg::device::forward(ttg::device::send<0>(key+1, std::move(v1)), - ttg::device::send<1>(key+1, std::move(v2)), - ttg::device::send<2>(key+1, std::move(v3)), - ttg::device::send<3>(key+1, std::move(v4))); - } else { - co_await ttg::device::forward(ttg::device::send<0>(key+1, v1), - ttg::device::send<1>(key+1, v2), - ttg::device::send<2>(key+1, v3), - ttg::device::send<3>(key+1, v4)); - } + co_await ttg::device::forward(ttg::device::send<0>(key+1, std::move(v1)), + ttg::device::send<1>(key+1, std::move(v2)), + ttg::device::send<2>(key+1, std::move(v3)), + ttg::device::send<3>(key+1, std::move(v4))); } }, edges(fuse(I2N1, N2N1), fuse(I2N2, N2N2), fuse(I2N3, N2N3), fuse(I2N4, N2N4)), @@ -133,7 +114,7 @@ auto make_ttg<4>(bool do_move) { } template <> -auto make_ttg<8>(bool do_move) { +auto make_ttg<8>() { Edge I2N1, I2N2, I2N3, I2N4, I2N5, I2N6, I2N7, I2N8; Edge N2N1, N2N2, N2N3, N2N4, N2N5, N2N6, N2N7, N2N8; Edge N2S1, N2S2, N2S3, N2S4, N2S5, N2S6, N2S7, N2S8; @@ -153,25 +134,14 @@ auto make_ttg<8>(bool do_move) { auto next = make_tt([=](const int &key, auto&& v1, auto&& v2, auto&& v3, auto&& v4, auto&& v5, auto&& v6, auto&& v7, auto&& v8) -> ttg::device::Task { co_await ttg::device::select(v1.b, v2.b, v3.b, v4.b, v5.b, v6.b, v7.b, v8.b); if (key < NUM_TASKS) { - if (do_move) { - co_await ttg::device::forward(ttg::device::send<0>(key+1, std::move(v1)), - ttg::device::send<1>(key+1, std::move(v2)), - ttg::device::send<2>(key+1, std::move(v3)), - ttg::device::send<3>(key+1, std::move(v4)), - ttg::device::send<4>(key+1, std::move(v5)), - ttg::device::send<5>(key+1, std::move(v6)), - ttg::device::send<6>(key+1, std::move(v7)), - ttg::device::send<7>(key+1, std::move(v8))); - } else { - co_await ttg::device::forward(ttg::device::send<0>(key+1, v1), - ttg::device::send<1>(key+1, v2), - ttg::device::send<2>(key+1, v3), - ttg::device::send<3>(key+1, v4), - ttg::device::send<4>(key+1, v5), - ttg::device::send<5>(key+1, v6), - ttg::device::send<6>(key+1, v7), - ttg::device::send<7>(key+1, v8)); - } + co_await ttg::device::forward(ttg::device::send<0>(key+1, std::move(v1)), + ttg::device::send<1>(key+1, std::move(v2)), + ttg::device::send<2>(key+1, std::move(v3)), + ttg::device::send<3>(key+1, std::move(v4)), + ttg::device::send<4>(key+1, std::move(v5)), + ttg::device::send<5>(key+1, std::move(v6)), + ttg::device::send<6>(key+1, std::move(v7)), + ttg::device::send<7>(key+1, std::move(v8))); } }, edges(fuse(I2N1, N2N1), fuse(I2N2, N2N2), fuse(I2N3, N2N3), fuse(I2N4, N2N4), fuse(I2N5, N2N5), fuse(I2N6, N2N6), fuse(I2N7, N2N7), fuse(I2N8, N2N8)), edges(N2N1, N2N2, N2N3, N2N4, N2N5, N2N6, N2N7, N2N8)); @@ -181,7 +151,7 @@ auto make_ttg<8>(bool do_move) { // flows task ids via keys template <> -auto make_ttg<0>(bool do_move) { +auto make_ttg<0>() { Edge I2N, N2N; Edge N2S; @@ -198,9 +168,9 @@ auto make_ttg<0>(bool do_move) { } template -void run_bench(bool do_move) +void run_bench() { - auto [init, next] = make_ttg(do_move); + auto [init, next] = make_ttg(); auto connected = make_graph_executable(init.get()); assert(connected); @@ -225,23 +195,18 @@ void run_bench(bool do_move) int main(int argc, char* argv[]) { int num_flows = 0; - int do_move = 1; ttg_initialize(argc, argv, -1); if (argc > 1) { num_flows = std::atoi(argv[1]); } - if (argc > 2) { - do_move = std::atoi(argv[2]); - } - switch(num_flows) { - case 0: run_bench<0>(do_move); break; - case 1: run_bench<1>(do_move); break; - case 2: run_bench<2>(do_move); break; - case 4: run_bench<4>(do_move); break; - case 8: run_bench<8>(do_move); break; + case 0: run_bench<0>(); break; + case 1: run_bench<1>(); break; + case 2: run_bench<2>(); break; + case 4: run_bench<4>(); break; + case 8: run_bench<8>(); break; default: std::cout << "Unsupported number of flows: " << NUM_TASKS << std::endl; } From 36a0d23ac9a109bca945dd7980dbdefd457d7080 Mon Sep 17 00:00:00 2001 From: Joseph Schuchart Date: Mon, 6 Jan 2025 15:22:53 -0500 Subject: [PATCH 25/39] SPMM: pass shared_ptr to buffer Extract the shared ptr from the mohndl and tie the shared_ptr to the life-time of the mohndl shared ptr. Signed-off-by: Joseph Schuchart --- examples/spmm/devicetensor.h | 49 +++++++++++++++++++++++++----------- ttg/ttg/madness/buffer.h | 13 ++++++++++ ttg/ttg/parsec/buffer.h | 9 +++++++ 3 files changed, 56 insertions(+), 15 deletions(-) diff --git a/examples/spmm/devicetensor.h b/examples/spmm/devicetensor.h index d5a0f6d9fe..f6a661b0b6 100644 --- a/examples/spmm/devicetensor.h +++ b/examples/spmm/devicetensor.h @@ -20,6 +20,18 @@ #if defined(BTAS_IS_USABLE) +namespace detail { + template + struct mohndle_type; + + template + struct mohndle_type> : std::integral_constant + { }; + + template + constexpr btas::Handle mohndle_type_v = mohndle_type::value; +} // namespace detail + /** * Derives from btas::Tensor and wraps a ttg::Buffer * to enable device support in SPMM. The ttg::Buffer @@ -38,6 +50,9 @@ struct DeviceTensor : public ttg::TTValue> using storage_type = typename tensor_type::storage_type; using range_type = typename tensor_type::range_type; + static_assert(detail::mohndle_type_v<_Storage> == btas::Handle::shared_ptr, + "DeviceTensor only supports shared_ptr"); + public: DeviceTensor() = default; @@ -48,7 +63,7 @@ struct DeviceTensor : public ttg::TTValue> explicit DeviceTensor(const size_type& first, const _args&... rest) : ttvalue_type() , tensor_type(first, rest...) - , b(this->size() ? this->data() : nullptr, this->size()) + , b(std::shared_ptr<_T[]>(pointer(), this->size() ? this->data() : nullptr), this->size()) { } /// construct from \c range, allocate data, but not initialized @@ -56,7 +71,7 @@ struct DeviceTensor : public ttg::TTValue> explicit DeviceTensor(const Range& range, typename std::enable_if::value>::type* = 0) : ttvalue_type() , tensor_type(range) - , b(this->size() ? this->data() : nullptr, this->size()) + , b(std::shared_ptr<_T[]>(pointer(), this->size() ? this->data() : nullptr), this->size()) { } /// construct from \c range object, set all elements to \c v @@ -64,7 +79,7 @@ struct DeviceTensor : public ttg::TTValue> DeviceTensor(const Range& range, value_type v, typename std::enable_if::value>::type* = 0) : ttvalue_type() , tensor_type(range) - , b(this->size() ? this->data() : nullptr, this->size()) + , b(std::shared_ptr<_T[]>(pointer(), this->size() ? this->data() : nullptr), this->size()) { } /// construct from \c range object, copy elements from \c vec @@ -72,7 +87,7 @@ struct DeviceTensor : public ttg::TTValue> DeviceTensor(const Range& range, U* vec, typename std::enable_if::value>::type* = 0) : ttvalue_type() , tensor_type(range, vec) - , b(this->size() ? this->data() : nullptr, this->size()) + , b(std::shared_ptr<_T[]>(pointer(), this->size() ? this->data() : nullptr), this->size()) { } /// construct from \c range and \c storage @@ -82,28 +97,28 @@ struct DeviceTensor : public ttg::TTValue> not std::is_same::value>::type* = 0) : ttvalue_type() , tensor_type(range, storage) - , b(this->size() ? this->data() : nullptr, this->size()) + , b(std::shared_ptr<_T[]>(pointer(), this->size() ? this->data() : nullptr), this->size()) { } /// copy-copy-construct from \c range and \c storage DeviceTensor(const range_type& range, const storage_type& storage) : ttvalue_type() , tensor_type(range, storage) - , b(this->size() ? this->data() : nullptr, this->size()) + , b(std::shared_ptr<_T[]>(pointer(), this->size() ? this->data() : nullptr), this->size()) { } /// copy-move-construct from \c range and \c storage DeviceTensor(const range_type& range, storage_type&& storage) : ttvalue_type() , tensor_type(range, std::forward(storage)) - , b(this->size() ? this->data() : nullptr, this->size()) + , b(std::shared_ptr<_T[]>(pointer(), this->size() ? this->data() : nullptr), this->size()) { } /// move-construct from \c range and \c storage DeviceTensor(range_type&& range, storage_type&& storage) : ttvalue_type() , tensor_type(std::forward(range), std::forward(storage)) - , b(this->size() ? this->data() : nullptr, this->size()) + , b(std::shared_ptr<_T[]>(pointer(), this->size() ? this->data() : nullptr), this->size()) { } /// Construct an evaluated tensor @@ -125,7 +140,7 @@ struct DeviceTensor : public ttg::TTValue> typename std::enable_if::value>::type* = 0) : ttvalue_type() , tensor_type(range, it, op) - , b(this->size() ? this->data() : nullptr, this->size()) + , b(std::shared_ptr<_T[]>(pointer(), this->size() ? this->data() : nullptr), this->size()) { } /// copy constructor @@ -134,7 +149,7 @@ struct DeviceTensor : public ttg::TTValue> DeviceTensor(const _Tensor& x) noexcept : ttvalue_type() , tensor_type(x.clone()) - , b(this->size() ? this->data() : nullptr, this->size()) + , b(std::shared_ptr<_T[]>(pointer(), this->size() ? this->data() : nullptr), this->size()) { //std::cout << "DeviceTensor tensor_type copy ctor" << std::endl; } @@ -143,7 +158,7 @@ struct DeviceTensor : public ttg::TTValue> DeviceTensor(const DeviceTensor& x) noexcept : ttvalue_type(x) , tensor_type(x.clone()) - , b(this->size() ? this->data() : nullptr, this->size()) + , b(std::shared_ptr<_T[]>(pointer(), this->size() ? this->data() : nullptr), this->size()) { //std::cout << "DeviceTensor copy ctor" << std::endl; } @@ -152,7 +167,7 @@ struct DeviceTensor : public ttg::TTValue> DeviceTensor(tensor_type&& x) noexcept : ttvalue_type() , tensor_type(std::move(x)) - , b(this->size() ? this->data() : nullptr, this->size()) + , b(std::shared_ptr<_T[]>(pointer(), this->size() ? this->data() : nullptr), this->size()) { //std::cout << "DeviceTensor tensor_type move ctor" << std::endl; } @@ -172,7 +187,7 @@ struct DeviceTensor : public ttg::TTValue> not std::is_same::value>::type> DeviceTensor& operator=(const _Tensor& x) noexcept { tensor_type::operator=(x.clone()); - b.reset(this->size() ? this->data() : nullptr, this->size()); + b.reset(std::shared_ptr<_T[]>(pointer(), this->size() ? this->data() : nullptr), this->size()); //std::cout << "DeviceTensor tensor_type copy operator" << std::endl; return *this; } @@ -183,7 +198,7 @@ struct DeviceTensor : public ttg::TTValue> std::is_same::value>::type> DeviceTensor& operator=(const _Tensor& x) noexcept { tensor_type::operator=(x.clone()); - b.reset(this->size() ? this->data() : nullptr, this->size()); + b.reset(std::shared_ptr<_T[]>(pointer(), this->size() ? this->data() : nullptr), this->size()); //std::cout << "DeviceTensor tensor_type copy operator" << std::endl; return *this; } @@ -192,7 +207,7 @@ struct DeviceTensor : public ttg::TTValue> DeviceTensor& operator=(const DeviceTensor& x) noexcept { ttvalue_type::operator=(x); tensor_type::operator=(x.clone()); - b.reset(this->size() ? this->data() : nullptr, this->size()); + b.reset(std::shared_ptr<_T[]>(pointer(), this->size() ? this->data() : nullptr), this->size()); //std::cout << "DeviceTensor copy operator" << std::endl; return *this; } @@ -206,6 +221,10 @@ struct DeviceTensor : public ttg::TTValue> return *this; } + auto pointer() { + return std::get(detail::mohndle_type_v)>(this->storage().base()); + } + using tensor_type::begin; using tensor_type::cbegin; using tensor_type::end; diff --git a/ttg/ttg/madness/buffer.h b/ttg/ttg/madness/buffer.h index 1d92d7551c..a532d9965d 100644 --- a/ttg/ttg/madness/buffer.h +++ b/ttg/ttg/madness/buffer.h @@ -258,6 +258,19 @@ struct Buffer : private Allocator { m_count = n; } + /* Reallocate the buffer with count elements */ + void reset(std::shared_ptr ptr, std::size_t n, ttg::scope scope = ttg::scope::SyncIn) { + + if (m_owned) { + deallocate(); + m_owned = false; + } + + m_sptr = std::move(ptr); + m_host_data = m_sptr.get(); + m_count = n; + } + /** * Resets the scope of the buffer. Ignored in MADNESS. */ diff --git a/ttg/ttg/parsec/buffer.h b/ttg/ttg/parsec/buffer.h index b3d3e465c4..bffdc80a1a 100644 --- a/ttg/ttg/parsec/buffer.h +++ b/ttg/ttg/parsec/buffer.h @@ -442,6 +442,15 @@ struct Buffer { m_count = n; } + /* Reallocate the buffer with count elements */ + void reset(std::shared_ptr ptr, std::size_t n, ttg::scope scope = ttg::scope::SyncIn) { + release_data(); + m_data = detail::ttg_parsec_data_types, + detail::empty_allocator> + ::create_data(ptr, n, scope); + m_count = n; + } + /** * Resets the scope of the buffer. * If scope is SyncIn then the next time From cf0347c554e9822e3c447acf45a69c03aeeccb9d Mon Sep 17 00:00:00 2001 From: Joseph Schuchart Date: Wed, 15 Jan 2025 22:09:04 -0500 Subject: [PATCH 26/39] Default to MADNESS serialization with PaRSEC backend Boost has little use and we currently need the madness serialization for GPU support. Signed-off-by: Joseph Schuchart --- CMakeLists.txt | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/CMakeLists.txt b/CMakeLists.txt index 230c231242..e790d09036 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -52,7 +52,7 @@ set(CMAKE_INSTALL_CMAKEDIR "lib/cmake/ttg" ######################################## #### user-defined configuration options ######################################## -option(TTG_PARSEC_USE_BOOST_SERIALIZATION "Whether to select Boost serialization methods in PaRSEC backend" ON) +option(TTG_PARSEC_USE_BOOST_SERIALIZATION "Whether to select Boost serialization methods in PaRSEC backend" OFF) option(TTG_ENABLE_CUDA "Whether to TTG will look for CUDA" OFF) option(TTG_ENABLE_HIP "Whether to TTG will look for HIP" OFF) option(TTG_ENABLE_LEVEL_ZERO "Whether to TTG will look for Intel oneAPI Level Zero" OFF) From 19f06b45ae07fdf7c71522b9b775b2402eef30e8 Mon Sep 17 00:00:00 2001 From: Joseph Schuchart Date: Thu, 16 Jan 2025 21:38:07 -0500 Subject: [PATCH 27/39] SPMM: Add madness serialization to DeviceTensor Signed-off-by: Joseph Schuchart --- examples/spmm/devicetensor.h | 26 +++++++++++++++++++++++++- ttg/ttg/parsec/buffer.h | 21 +++++++++++++-------- 2 files changed, 38 insertions(+), 9 deletions(-) diff --git a/examples/spmm/devicetensor.h b/examples/spmm/devicetensor.h index f6a661b0b6..ce896b6457 100644 --- a/examples/spmm/devicetensor.h +++ b/examples/spmm/devicetensor.h @@ -229,9 +229,33 @@ struct DeviceTensor : public ttg::TTValue> using tensor_type::cbegin; using tensor_type::end; using tensor_type::cend; - }; +namespace madness { + namespace archive { + template + struct ArchiveLoadImpl> { + static inline void load(const Archive& ar, + DeviceTensor<_T, _Range, _Store>& t) { + _Range range{}; + ar& range; + t = DeviceTensor<_T, _Range, _Store>(std::move(range)); + /* pick the buffer out of the archive + * this should not change the buffer since it's + * already been allocated above */ + ar & t.b; + } + }; + + template + struct ArchiveStoreImpl> { + static inline void store(const Archive& ar, + const DeviceTensor<_T, _Range, _Store>& t) { + ar& t.range() & t.b; + } + }; + } // namespace archive +} // namespace madness #endif // defined(BTAS_IS_USABLE) #endif // HAVE_DEVICETENSOR_H \ No newline at end of file diff --git a/ttg/ttg/parsec/buffer.h b/ttg/ttg/parsec/buffer.h index bffdc80a1a..0563456c4d 100644 --- a/ttg/ttg/parsec/buffer.h +++ b/ttg/ttg/parsec/buffer.h @@ -200,8 +200,6 @@ struct Buffer { "Only trivially copyable types are supported for devices."); private: - using delete_fn_t = std::function; - parsec_data_t *m_data = nullptr; std::size_t m_count = 0; @@ -219,8 +217,7 @@ struct Buffer { public: - Buffer() - { } + Buffer() = default; Buffer(std::size_t n, ttg::scope scope = ttg::scope::SyncIn) : m_data(detail::ttg_parsec_data_types::create_data(n, scope)) @@ -250,8 +247,8 @@ struct Buffer { { } virtual ~Buffer() { - release_data(); unpin(); // make sure the copies are not pinned + release_data(); } /* allow moving device buffers */ @@ -437,6 +434,7 @@ struct Buffer { /* Reallocate the buffer with count elements */ void reset(std::size_t n, ttg::scope scope = ttg::scope::SyncIn) { + if (n == m_count) return; release_data(); m_data = detail::ttg_parsec_data_types::create_data(n, scope); m_count = n; @@ -526,9 +524,16 @@ struct Buffer { } else { std::size_t s; ar & s; - //std::cout << "serialize(IN) buffer " << this << " size " << s << std::endl; - /* initialize internal pointers and then reset */ - reset(s); + /* if we have been initialized already we just make sure the size matches */ + if (m_data != nullptr) { + if (s != size()) { + throw std::runtime_error("Buffer size mismatch in deserialization!"); + } + } else { + //std::cout << "serialize(IN) buffer " << this << " size " << s << std::endl; + /* initialize internal pointers and then reset */ + reset(s); + } } } #endif // TTG_SERIALIZATION_SUPPORTS_MADNESS From 65a61f0a12c5e3a0ffb523d50dd89d2da1a5f570 Mon Sep 17 00:00:00 2001 From: Joseph Schuchart Date: Fri, 17 Jan 2025 22:18:12 -0500 Subject: [PATCH 28/39] Comment out Boost::serialization dependency It's all broken. All of it. Signed-off-by: Joseph Schuchart --- .github/workflows/cmake.yml | 1 - ttg/CMakeLists.txt | 32 ++++++++++++++++---------------- 2 files changed, 16 insertions(+), 17 deletions(-) diff --git a/.github/workflows/cmake.yml b/.github/workflows/cmake.yml index 20d8960a9c..571e0c5921 100644 --- a/.github/workflows/cmake.yml +++ b/.github/workflows/cmake.yml @@ -42,7 +42,6 @@ jobs: -DMPIEXEC_PREFLAGS='--bind-to;none;--allow-run-as-root' -DCMAKE_INSTALL_PREFIX=${{github.workspace}}/install -DTTG_EXAMPLES=ON - -DCMAKE_CXX_STANDARD=20 steps: - uses: actions/checkout@v4 diff --git a/ttg/CMakeLists.txt b/ttg/CMakeLists.txt index be834a6341..a18939d4ea 100644 --- a/ttg/CMakeLists.txt +++ b/ttg/CMakeLists.txt @@ -171,15 +171,15 @@ if (TARGET MADworld) list(APPEND ttg-serialization-deps MADworld) list(APPEND ttg-serialization-compile-definitions TTG_SERIALIZATION_SUPPORTS_MADNESS=1) endif(TARGET MADworld) -if (TARGET Boost::serialization) - list(APPEND ttg-serialization-deps Boost::serialization) - list(APPEND ttg-serialization-boost-deps Boost::serialization) - if (TARGET Boost::iostreams) # using modularized Boost? - list(APPEND ttg-serialization-deps Boost::iostreams) - list(APPEND ttg-serialization-boost-deps Boost::iostreams) - endif() - list(APPEND ttg-serialization-compile-definitions TTG_SERIALIZATION_SUPPORTS_BOOST=1) -endif (TARGET Boost::serialization) +#if (TARGET Boost::serialization) +# list(APPEND ttg-serialization-deps Boost::serialization) +# list(APPEND ttg-serialization-boost-deps Boost::serialization) +# if (TARGET Boost::iostreams) # using modularized Boost? +# list(APPEND ttg-serialization-deps Boost::iostreams) +# list(APPEND ttg-serialization-boost-deps Boost::iostreams) +# endif() +# list(APPEND ttg-serialization-compile-definitions TTG_SERIALIZATION_SUPPORTS_BOOST=1) +#endif (TARGET Boost::serialization) add_ttg_library(ttg-serialization "${ttg-serialization-sources}" @@ -195,13 +195,13 @@ if (TARGET MADworld) COMPILE_DEFINITIONS "TTG_SERIALIZATION_SUPPORTS_MADNESS=1") endif(TARGET MADworld) # make boost-only serialization target -if (TARGET Boost::serialization) - add_ttg_library(ttg-serialization-boost - "${ttg-serialization-sources}" - PUBLIC_HEADER "${ttg-serialization-headers}" - LINK_LIBRARIES "${ttg-serialization-boost-deps}" - COMPILE_DEFINITIONS "TTG_SERIALIZATION_SUPPORTS_BOOST=1") -endif(TARGET Boost::serialization) +#if (TARGET Boost::serialization) +# add_ttg_library(ttg-serialization-boost +# "${ttg-serialization-sources}" +# PUBLIC_HEADER "${ttg-serialization-headers}" +# LINK_LIBRARIES "${ttg-serialization-boost-deps}" +# COMPILE_DEFINITIONS "TTG_SERIALIZATION_SUPPORTS_BOOST=1") +#endif(TARGET Boost::serialization) ######################### ####### MADNESS-specific From 75b8a49749102e0a1a95fb709553f1406ee9c95b Mon Sep 17 00:00:00 2001 From: Joseph Schuchart Date: Fri, 17 Jan 2025 22:33:10 -0500 Subject: [PATCH 29/39] Condition serialization_boost-parsec on TTG_PARSEC_USE_BOOST_SERIALIZATION Signed-off-by: Joseph Schuchart --- tests/unit/CMakeLists.txt | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/tests/unit/CMakeLists.txt b/tests/unit/CMakeLists.txt index a33bc3c942..c0d056dfa2 100644 --- a/tests/unit/CMakeLists.txt +++ b/tests/unit/CMakeLists.txt @@ -31,8 +31,10 @@ add_ttg_executable(serialization serialization.cc unit_main.cpp COMPILE_DEFINITIONS $<$:TTG_HAS_BTAS=1>) # Boost serialization test: checks low-level codegen -add_ttg_executable(serialization_boost serialization_boost.cc - LINK_LIBRARIES ttg-serialization-boost RUNTIMES "parsec") +if (TTG_PARSEC_USE_BOOST_SERIALIZATION) + add_ttg_executable(serialization_boost serialization_boost.cc + LINK_LIBRARIES ttg-serialization-boost RUNTIMES "parsec") +endif(TTG_PARSEC_USE_BOOST_SERIALIZATION) # TODO: convert into unit test #if (TARGET MADworld) From 317bd7cb93b1bc9554812770fb9949a9d48a98d9 Mon Sep 17 00:00:00 2001 From: Joseph Schuchart Date: Sat, 18 Jan 2025 09:07:16 -0500 Subject: [PATCH 30/39] Don't install boost from apt/brew The version from apt is 1.74 which is too old so we end up building our own anyway. Signed-off-by: Joseph Schuchart --- .github/workflows/cmake.yml | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/.github/workflows/cmake.yml b/.github/workflows/cmake.yml index 571e0c5921..c338a87836 100644 --- a/.github/workflows/cmake.yml +++ b/.github/workflows/cmake.yml @@ -49,7 +49,7 @@ jobs: - name: Install prerequisite MacOS packages if: ${{ matrix.os == 'macos-latest' }} run: | - brew install ninja boost eigen open-mpi bison ccache + brew install ninja eigen open-mpi bison ccache echo "MPIEXEC=/opt/homebrew/bin/mpiexec" >> $GITHUB_ENV - name: Install prerequisites Ubuntu packages @@ -58,7 +58,7 @@ jobs: wget -O - https://apt.kitware.com/keys/kitware-archive-latest.asc 2>/dev/null | gpg --dearmor - | sudo tee /etc/apt/trusted.gpg.d/kitware.gpg >/dev/null sudo apt-add-repository "deb https://apt.kitware.com/ubuntu/ $(lsb_release -cs) main" sudo apt-get update - sudo apt-get -y install ninja-build g++-12 liblapack-dev libboost-dev libboost-serialization-dev libboost-random-dev libeigen3-dev openmpi-bin libopenmpi-dev libtbb-dev ccache flex bison cmake doxygen + sudo apt-get -y install ninja-build g++-12 liblapack-dev libeigen3-dev openmpi-bin libopenmpi-dev libtbb-dev ccache flex bison cmake doxygen echo "MPIEXEC=/usr/bin/mpiexec" >> $GITHUB_ENV - name: Install extra dependencies From 8df71422ee6431fdd9268d1ffe5248b5866b8e51 Mon Sep 17 00:00:00 2001 From: Joseph Schuchart Date: Sat, 18 Jan 2025 09:46:39 -0500 Subject: [PATCH 31/39] Boost::type_index is required by the PaRSEC backend Signed-off-by: Joseph Schuchart --- cmake/modules/FindOrFetchBoost.cmake | 1 + ttg/CMakeLists.txt | 2 +- 2 files changed, 2 insertions(+), 1 deletion(-) diff --git a/cmake/modules/FindOrFetchBoost.cmake b/cmake/modules/FindOrFetchBoost.cmake index 8817413a2d..bab4eaa2a1 100644 --- a/cmake/modules/FindOrFetchBoost.cmake +++ b/cmake/modules/FindOrFetchBoost.cmake @@ -16,6 +16,7 @@ endif() set(required_components headers callable_traits + type_index ) set(optional_components ) diff --git a/ttg/CMakeLists.txt b/ttg/CMakeLists.txt index a18939d4ea..c3c4d8dbbb 100644 --- a/ttg/CMakeLists.txt +++ b/ttg/CMakeLists.txt @@ -245,7 +245,7 @@ if (TARGET PaRSEC::parsec) ${CMAKE_CURRENT_SOURCE_DIR}/ttg/parsec/ttvalue.h ) find_package(MPI) - set(ttg-parsec-deps "ttg;MPI::MPI_CXX;PaRSEC::parsec") + set(ttg-parsec-deps "ttg;MPI::MPI_CXX;PaRSEC::parsec;Boost::type_index") # parsec depends on TTG's serialization layer since it does not provide its own if (TTG_PARSEC_USE_BOOST_SERIALIZATION AND TARGET ttg-serialization-boost) list(APPEND ttg-parsec-deps ttg-serialization-boost) From 4ecf4e487d6087cd9a71a40c3c8d986b51b1dc8e Mon Sep 17 00:00:00 2001 From: Eduard Valeyev Date: Tue, 21 Jan 2025 23:20:32 -0500 Subject: [PATCH 32/39] boost_iostreams install fails unless it's built via dependence on a previous target, exclude it from the build unless TTG_PARSEC_USE_BOOST_SERIALIZATION=ON --- cmake/modules/ExternalDependenciesVersions.cmake | 2 +- cmake/modules/FindOrFetchBoost.cmake | 2 ++ 2 files changed, 3 insertions(+), 1 deletion(-) diff --git a/cmake/modules/ExternalDependenciesVersions.cmake b/cmake/modules/ExternalDependenciesVersions.cmake index 89654a2562..b408fac581 100644 --- a/cmake/modules/ExternalDependenciesVersions.cmake +++ b/cmake/modules/ExternalDependenciesVersions.cmake @@ -1,7 +1,7 @@ # for each dependency track both current and previous id (the variable for the latter must contain PREVIOUS) # to be able to auto-update them -set(TTG_TRACKED_VG_CMAKE_KIT_TAG 878654d0cb1904049fbd2c37b37d5385ae897658) # provides FindOrFetchLinalgPP and "real" FindOrFetchBoost +set(TTG_TRACKED_VG_CMAKE_KIT_TAG cda539db32be6e8171f5cbebdb1a7c38d5ab4b34) # provides FindOrFetchLinalgPP and "real" FindOrFetchBoost set(TTG_TRACKED_CATCH2_VERSION 3.5.0) set(TTG_TRACKED_MADNESS_TAG 93a9a5cec2a8fa87fba3afe8056607e6062a9058) set(TTG_TRACKED_PARSEC_TAG 996dda4c0ff3120bc65385f86e999befd4b3fe7a) diff --git a/cmake/modules/FindOrFetchBoost.cmake b/cmake/modules/FindOrFetchBoost.cmake index bab4eaa2a1..ab4c703b83 100644 --- a/cmake/modules/FindOrFetchBoost.cmake +++ b/cmake/modules/FindOrFetchBoost.cmake @@ -25,6 +25,8 @@ if (TTG_PARSEC_USE_BOOST_SERIALIZATION) serialization iostreams ) +else() + list(APPEND BOOST_EXCLUDE_LIBRARIES iostreams) # install of this library fails unless it's already built endif() if (BUILD_EXAMPLES) list(APPEND optional_components From 81868465ed685b616d817ef74ad4a3a26b63e1e5 Mon Sep 17 00:00:00 2001 From: Joseph Schuchart Date: Wed, 22 Jan 2025 10:34:07 -0500 Subject: [PATCH 33/39] Revert "Comment out Boost::serialization dependency" This reverts commit 65a61f0a12c5e3a0ffb523d50dd89d2da1a5f570. --- .github/workflows/cmake.yml | 1 + ttg/CMakeLists.txt | 32 ++++++++++++++++---------------- 2 files changed, 17 insertions(+), 16 deletions(-) diff --git a/.github/workflows/cmake.yml b/.github/workflows/cmake.yml index c338a87836..0da4a7a71a 100644 --- a/.github/workflows/cmake.yml +++ b/.github/workflows/cmake.yml @@ -42,6 +42,7 @@ jobs: -DMPIEXEC_PREFLAGS='--bind-to;none;--allow-run-as-root' -DCMAKE_INSTALL_PREFIX=${{github.workspace}}/install -DTTG_EXAMPLES=ON + -DCMAKE_CXX_STANDARD=20 steps: - uses: actions/checkout@v4 diff --git a/ttg/CMakeLists.txt b/ttg/CMakeLists.txt index c3c4d8dbbb..7e361802bf 100644 --- a/ttg/CMakeLists.txt +++ b/ttg/CMakeLists.txt @@ -171,15 +171,15 @@ if (TARGET MADworld) list(APPEND ttg-serialization-deps MADworld) list(APPEND ttg-serialization-compile-definitions TTG_SERIALIZATION_SUPPORTS_MADNESS=1) endif(TARGET MADworld) -#if (TARGET Boost::serialization) -# list(APPEND ttg-serialization-deps Boost::serialization) -# list(APPEND ttg-serialization-boost-deps Boost::serialization) -# if (TARGET Boost::iostreams) # using modularized Boost? -# list(APPEND ttg-serialization-deps Boost::iostreams) -# list(APPEND ttg-serialization-boost-deps Boost::iostreams) -# endif() -# list(APPEND ttg-serialization-compile-definitions TTG_SERIALIZATION_SUPPORTS_BOOST=1) -#endif (TARGET Boost::serialization) +if (TARGET Boost::serialization) + list(APPEND ttg-serialization-deps Boost::serialization) + list(APPEND ttg-serialization-boost-deps Boost::serialization) + if (TARGET Boost::iostreams) # using modularized Boost? + list(APPEND ttg-serialization-deps Boost::iostreams) + list(APPEND ttg-serialization-boost-deps Boost::iostreams) + endif() + list(APPEND ttg-serialization-compile-definitions TTG_SERIALIZATION_SUPPORTS_BOOST=1) +endif (TARGET Boost::serialization) add_ttg_library(ttg-serialization "${ttg-serialization-sources}" @@ -195,13 +195,13 @@ if (TARGET MADworld) COMPILE_DEFINITIONS "TTG_SERIALIZATION_SUPPORTS_MADNESS=1") endif(TARGET MADworld) # make boost-only serialization target -#if (TARGET Boost::serialization) -# add_ttg_library(ttg-serialization-boost -# "${ttg-serialization-sources}" -# PUBLIC_HEADER "${ttg-serialization-headers}" -# LINK_LIBRARIES "${ttg-serialization-boost-deps}" -# COMPILE_DEFINITIONS "TTG_SERIALIZATION_SUPPORTS_BOOST=1") -#endif(TARGET Boost::serialization) +if (TARGET Boost::serialization) + add_ttg_library(ttg-serialization-boost + "${ttg-serialization-sources}" + PUBLIC_HEADER "${ttg-serialization-headers}" + LINK_LIBRARIES "${ttg-serialization-boost-deps}" + COMPILE_DEFINITIONS "TTG_SERIALIZATION_SUPPORTS_BOOST=1") +endif(TARGET Boost::serialization) ######################### ####### MADNESS-specific From b440287667f2278c42cec88d901ffe27acf2ba92 Mon Sep 17 00:00:00 2001 From: Joseph Schuchart Date: Wed, 22 Jan 2025 10:47:28 -0500 Subject: [PATCH 34/39] CI: don't specify C++ standard We should specify a standard ourselves independent of CI. Signed-off-by: Joseph Schuchart --- .github/workflows/cmake.yml | 1 - 1 file changed, 1 deletion(-) diff --git a/.github/workflows/cmake.yml b/.github/workflows/cmake.yml index 0da4a7a71a..c338a87836 100644 --- a/.github/workflows/cmake.yml +++ b/.github/workflows/cmake.yml @@ -42,7 +42,6 @@ jobs: -DMPIEXEC_PREFLAGS='--bind-to;none;--allow-run-as-root' -DCMAKE_INSTALL_PREFIX=${{github.workspace}}/install -DTTG_EXAMPLES=ON - -DCMAKE_CXX_STANDARD=20 steps: - uses: actions/checkout@v4 From 87d0055580fe32888867ef6e83e5de472a442bfa Mon Sep 17 00:00:00 2001 From: Joseph Schuchart Date: Wed, 22 Jan 2025 11:19:28 -0500 Subject: [PATCH 35/39] Remove use of Boost::typeindex The detection is flaky and the benefit little. Signed-off-by: Joseph Schuchart --- cmake/modules/FindOrFetchBoost.cmake | 1 - ttg/CMakeLists.txt | 2 +- ttg/ttg/parsec/ttg.h | 8 +++----- 3 files changed, 4 insertions(+), 7 deletions(-) diff --git a/cmake/modules/FindOrFetchBoost.cmake b/cmake/modules/FindOrFetchBoost.cmake index ab4c703b83..4c6422400f 100644 --- a/cmake/modules/FindOrFetchBoost.cmake +++ b/cmake/modules/FindOrFetchBoost.cmake @@ -16,7 +16,6 @@ endif() set(required_components headers callable_traits - type_index ) set(optional_components ) diff --git a/ttg/CMakeLists.txt b/ttg/CMakeLists.txt index 7e361802bf..be834a6341 100644 --- a/ttg/CMakeLists.txt +++ b/ttg/CMakeLists.txt @@ -245,7 +245,7 @@ if (TARGET PaRSEC::parsec) ${CMAKE_CURRENT_SOURCE_DIR}/ttg/parsec/ttvalue.h ) find_package(MPI) - set(ttg-parsec-deps "ttg;MPI::MPI_CXX;PaRSEC::parsec;Boost::type_index") + set(ttg-parsec-deps "ttg;MPI::MPI_CXX;PaRSEC::parsec") # parsec depends on TTG's serialization layer since it does not provide its own if (TTG_PARSEC_USE_BOOST_SERIALIZATION AND TARGET ttg-serialization-boost) list(APPEND ttg-parsec-deps ttg-serialization-boost) diff --git a/ttg/ttg/parsec/ttg.h b/ttg/ttg/parsec/ttg.h index 14394cba9e..98974d5c59 100644 --- a/ttg/ttg/parsec/ttg.h +++ b/ttg/ttg/parsec/ttg.h @@ -121,8 +121,6 @@ #include "ttg/device/device.h" -#include - #undef TTG_PARSEC_DEBUG_TRACK_DATA_COPIES /* PaRSEC function declarations */ @@ -1042,7 +1040,7 @@ namespace ttg_parsec { } } else { - throw std::logic_error(std::string("TTG::PaRSEC: need to copy a datum of type") + boost::typeindex::type_id>().pretty_name() + " but the type is not copyable"); + throw std::logic_error(std::string("TTG::PaRSEC: need to copy a datum of type") + typeid(std::decay_t).name() + " but the type is not copyable"); } } return copy_res; @@ -2028,7 +2026,7 @@ namespace ttg_parsec { if (std::size(keylist) == 1) set_arg_local_impl(key, std::move(*reinterpret_cast(copy->get_ptr())), copy, &task_ring); else { - throw std::logic_error(std::string("TTG::PaRSEC: need to copy a datum of type") + boost::typeindex::type_id>().pretty_name() + " but the type is not copyable"); + throw std::logic_error(std::string("TTG::PaRSEC: need to copy a datum of type") + typeid(std::decay_t).name() + " but the type is not copyable"); } } } @@ -3661,7 +3659,7 @@ namespace ttg_parsec { set_arg(value); } else { - throw std::logic_error(std::string("TTG::PaRSEC: send_callback is invoked on datum of type ") + boost::typeindex::type_id().pretty_name() + " which is not copy constructible, std::move datum into send/broadcast statement"); + throw std::logic_error(std::string("TTG::PaRSEC: send_callback is invoked on datum of type ") + typeid(std::decay_t).name() + " which is not copy constructible, std::move datum into send/broadcast statement"); } }; auto setsize_callback = [this](std::size_t size) { set_argstream_size(size); }; From 4e6943bf3f2a257815c4b0500311f01326d3a771 Mon Sep 17 00:00:00 2001 From: Joseph Schuchart Date: Wed, 22 Jan 2025 11:30:18 -0500 Subject: [PATCH 36/39] More fun with Boost serialization Don't even bother trying to create a target if !TTG_PARSEC_USE_BOOST_SERIALIZATION Signed-off-by: Joseph Schuchart --- ttg/CMakeLists.txt | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/ttg/CMakeLists.txt b/ttg/CMakeLists.txt index be834a6341..64f167c8f8 100644 --- a/ttg/CMakeLists.txt +++ b/ttg/CMakeLists.txt @@ -171,7 +171,7 @@ if (TARGET MADworld) list(APPEND ttg-serialization-deps MADworld) list(APPEND ttg-serialization-compile-definitions TTG_SERIALIZATION_SUPPORTS_MADNESS=1) endif(TARGET MADworld) -if (TARGET Boost::serialization) +if (TTG_PARSEC_USE_BOOST_SERIALIZATION AND TARGET Boost::serialization) list(APPEND ttg-serialization-deps Boost::serialization) list(APPEND ttg-serialization-boost-deps Boost::serialization) if (TARGET Boost::iostreams) # using modularized Boost? @@ -179,7 +179,7 @@ if (TARGET Boost::serialization) list(APPEND ttg-serialization-boost-deps Boost::iostreams) endif() list(APPEND ttg-serialization-compile-definitions TTG_SERIALIZATION_SUPPORTS_BOOST=1) -endif (TARGET Boost::serialization) +endif (TTG_PARSEC_USE_BOOST_SERIALIZATION AND TARGET Boost::serialization) add_ttg_library(ttg-serialization "${ttg-serialization-sources}" @@ -195,13 +195,13 @@ if (TARGET MADworld) COMPILE_DEFINITIONS "TTG_SERIALIZATION_SUPPORTS_MADNESS=1") endif(TARGET MADworld) # make boost-only serialization target -if (TARGET Boost::serialization) +if (TTG_PARSEC_USE_BOOST_SERIALIZATION AND TARGET Boost::serialization) add_ttg_library(ttg-serialization-boost "${ttg-serialization-sources}" PUBLIC_HEADER "${ttg-serialization-headers}" LINK_LIBRARIES "${ttg-serialization-boost-deps}" COMPILE_DEFINITIONS "TTG_SERIALIZATION_SUPPORTS_BOOST=1") -endif(TARGET Boost::serialization) +endif(TTG_PARSEC_USE_BOOST_SERIALIZATION AND TARGET Boost::serialization) ######################### ####### MADNESS-specific From 914cc3d75385e129b3a2b3d910a41501300b5a47 Mon Sep 17 00:00:00 2001 From: Joseph Schuchart Date: Mon, 27 Jan 2025 16:13:35 -0500 Subject: [PATCH 37/39] Look for CUDA 12, not 12.6 NVIDIA released CUDA 12.8 which is now the default. Signed-off-by: Joseph Schuchart --- .github/workflows/cmake.yml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.github/workflows/cmake.yml b/.github/workflows/cmake.yml index c338a87836..6809ab9db9 100644 --- a/.github/workflows/cmake.yml +++ b/.github/workflows/cmake.yml @@ -68,7 +68,7 @@ jobs: sudo dpkg -i cuda-keyring_1.1-1_all.deb sudo apt update sudo apt install -y cuda-toolkit - echo "CUDA_BUILD_OPTS=-DENABLE_CUDA=ON -DTTG_ENABLE_CUDA=ON -DCUDA_TOOLKIT_ROOT_DIR=/usr/local/cuda-12.6 -DCMAKE_CUDA_COMPILER=/usr/local/cuda-12.6/bin/nvcc -DCMAKE_CUDA_HOST_COMPILER=${{ matrix.cxx }}" >> $GITHUB_ENV + echo "CUDA_BUILD_OPTS=-DENABLE_CUDA=ON -DTTG_ENABLE_CUDA=ON -DCUDA_TOOLKIT_ROOT_DIR=/usr/local/cuda-12 -DCMAKE_CUDA_COMPILER=/usr/local/cuda-12/bin/nvcc -DCMAKE_CUDA_HOST_COMPILER=${{ matrix.cxx }}" >> $GITHUB_ENV - name: Create Build Environment # Some projects don't allow in-source building, so create a separate build directory From f44407c90084ec89ece59a6c17dc582a3f21ea51 Mon Sep 17 00:00:00 2001 From: Joseph Schuchart Date: Wed, 29 Jan 2025 15:04:43 -0500 Subject: [PATCH 38/39] PaRSEC: fix handling of Buffer::scope() for empty buffers Signed-off-by: Joseph Schuchart --- ttg/ttg/devicescope.h | 1 + ttg/ttg/parsec/buffer.h | 1 + 2 files changed, 2 insertions(+) diff --git a/ttg/ttg/devicescope.h b/ttg/ttg/devicescope.h index 594e6db0b3..7184dafff9 100644 --- a/ttg/ttg/devicescope.h +++ b/ttg/ttg/devicescope.h @@ -5,6 +5,7 @@ namespace ttg { enum class scope { Allocate = 0x0, //< memory allocated as scratch, but not moved in or out SyncIn = 0x2, //< memory allocated as scratch and data transferred to device + Invalid = 0xF, //< invalid scope }; } // namespace ttg diff --git a/ttg/ttg/parsec/buffer.h b/ttg/ttg/parsec/buffer.h index 0563456c4d..62be4270ce 100644 --- a/ttg/ttg/parsec/buffer.h +++ b/ttg/ttg/parsec/buffer.h @@ -473,6 +473,7 @@ struct Buffer { ttg::scope scope() const { /* if the host owns the data and has a version of zero we only have to allocate data */ + if (nullptr != m_data) return ttg::scope::Invalid; return (m_data->device_copies[0]->version == 0 && m_data->owner_device == 0) ? ttg::scope::Allocate : ttg::scope::SyncIn; } From b502ac843cf6b08bb679486b8ad78bc9bc1a6c13 Mon Sep 17 00:00:00 2001 From: Joseph Schuchart Date: Wed, 29 Jan 2025 15:06:17 -0500 Subject: [PATCH 39/39] PaRSEC: don't inline data in device tasks Signed-off-by: Joseph Schuchart --- ttg/ttg/parsec/ttg.h | 5 +++++ 1 file changed, 5 insertions(+) diff --git a/ttg/ttg/parsec/ttg.h b/ttg/ttg/parsec/ttg.h index 98974d5c59..ae0bf557b7 100644 --- a/ttg/ttg/parsec/ttg.h +++ b/ttg/ttg/parsec/ttg.h @@ -2739,6 +2739,11 @@ namespace ttg_parsec { template bool can_inline_data(Value* value_ptr, detail::ttg_data_copy_t *copy, const Key& key, std::size_t num_keys) { + if constexpr (derived_has_device_op()) { + /* don't inline if data is possibly on the device */ + return false; + } + /* non-device data */ using decvalueT = std::decay_t; bool inline_data = false; /* check whether to send data in inline */