
Commit 60531f9

Authored on Apr 15, 2025
[SYCL] Remove OoO Emulation (#17943)
Remove the legacy implementation of emulation for OoO (out-of-order) queues. This code predates the PI and UR layers in DPC++ and existed because FPGA OpenCL did not support OoO queues. Today, this kind of handling belongs in the backend compiler or the UR layer.

Signed-off-by: James Brodman <[email protected]>
1 parent 97d56c1 commit 60531f9
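
For context, the emulation being removed only mattered on backends (such as FPGA OpenCL) whose native queues could not execute commands out of order; the application-facing API is unchanged. A minimal SYCL 2020 illustration, not taken from this patch, of the two queue flavors involved:

// Illustrative only: default (out-of-order) queue vs. an explicitly in-order one.
#include <sycl/sycl.hpp>

int main() {
  sycl::device Dev{sycl::default_selector_v};

  // Default queue: out-of-order; independent commands may overlap.
  sycl::queue OooQ{Dev};

  // In-order queue: commands complete in submission order.
  sycl::queue InOrderQ{Dev, sycl::property::queue::in_order{}};

  OooQ.single_task([] {}).wait();
  InOrderQ.single_task([] {}).wait();
}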

File tree: 5 files changed (+20, -273 lines)


sycl/source/detail/queue_impl.cpp (+10, -70)
@@ -60,8 +60,7 @@ template <>
 uint32_t queue_impl::get_info<info::queue::reference_count>() const {
   ur_result_t result = UR_RESULT_SUCCESS;
   getAdapter()->call<UrApiKind::urQueueGetInfo>(
-      MQueues[0], UR_QUEUE_INFO_REFERENCE_COUNT, sizeof(result), &result,
-      nullptr);
+      MQueue, UR_QUEUE_INFO_REFERENCE_COUNT, sizeof(result), &result, nullptr);
   return result;
 }
 

@@ -303,55 +302,14 @@ void queue_impl::addEvent(const event &Event) {
   const EventImplPtr &EImpl = getSyclObjImpl(Event);
   assert(EImpl && "Event implementation is missing");
   auto *Cmd = static_cast<Command *>(EImpl->getCommand());
-  if (!Cmd) {
-    // if there is no command on the event, we cannot track it with MEventsWeak
-    // as that will leave it with no owner. Track in MEventsShared only if we're
-    // unable to call urQueueFinish during wait.
-    if (MEmulateOOO)
-      addSharedEvent(Event);
-  }
-  // As long as the queue supports urQueueFinish we only need to store events
-  // for undiscarded, unenqueued commands and host tasks.
-  else if (MEmulateOOO ||
-           (EImpl->getHandle() == nullptr && !EImpl->isDiscarded())) {
+  if (Cmd != nullptr && EImpl->getHandle() == nullptr &&
+      !EImpl->isDiscarded()) {
     std::weak_ptr<event_impl> EventWeakPtr{EImpl};
     std::lock_guard<std::mutex> Lock{MMutex};
     MEventsWeak.push_back(std::move(EventWeakPtr));
   }
 }
 
-/// addSharedEvent - queue_impl tracks events with weak pointers
-/// but some events have no other owner. In this case,
-/// addSharedEvent will have the queue track the events via a shared pointer.
-void queue_impl::addSharedEvent(const event &Event) {
-  assert(MEmulateOOO);
-  std::lock_guard<std::mutex> Lock(MMutex);
-  // Events stored in MEventsShared are not released anywhere else aside from
-  // calls to queue::wait/wait_and_throw, which a user application might not
-  // make, and ~queue_impl(). If the number of events grows large enough,
-  // there's a good chance that most of them are already completed and ownership
-  // of them can be released.
-  const size_t EventThreshold = 128;
-  if (MEventsShared.size() >= EventThreshold) {
-    // Generally, the vector is ordered so that the oldest events are in the
-    // front and the newer events are in the end. So, search to find the first
-    // event that isn't yet complete. All the events prior to that can be
-    // erased. This could leave some few events further on that have completed
-    // not yet erased, but that is OK. This cleanup doesn't have to be perfect.
-    // This also keeps the algorithm linear rather than quadratic because it
-    // doesn't continually recheck things towards the back of the list that
-    // really haven't had time to complete.
-    MEventsShared.erase(
-        MEventsShared.begin(),
-        std::find_if(
-            MEventsShared.begin(), MEventsShared.end(), [](const event &E) {
-              return E.get_info<info::event::command_execution_status>() !=
-                     info::event_command_status::complete;
-            }));
-  }
-  MEventsShared.push_back(Event);
-}
-
 event queue_impl::submit_impl(const detail::type_erased_cgfo_ty &CGF,
                               const std::shared_ptr<queue_impl> &Self,
                               const std::shared_ptr<queue_impl> &PrimaryQueue,
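
The deleted addSharedEvent used a threshold-plus-linear-sweep idiom to keep MEventsShared bounded: once the vector reached 128 entries, it erased the already-completed prefix before appending. A standalone sketch of that idiom, with a simplified Event type standing in for sycl::event and its status query:

// Sketch of the cleanup idiom used by the removed addSharedEvent (simplified).
#include <algorithm>
#include <cstddef>
#include <vector>

struct Event {
  bool Completed = false; // stand-in for command_execution_status == complete
};

void addSharedEvent(std::vector<Event> &EventsShared, const Event &E) {
  constexpr std::size_t EventThreshold = 128;
  if (EventsShared.size() >= EventThreshold) {
    // Oldest events sit at the front, so erasing the completed prefix keeps
    // the sweep linear instead of re-checking recent events on every call.
    EventsShared.erase(
        EventsShared.begin(),
        std::find_if(EventsShared.begin(), EventsShared.end(),
                     [](const Event &Ev) { return !Ev.Completed; }));
  }
  EventsShared.push_back(E);
}
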
@@ -490,9 +448,7 @@ event queue_impl::submitMemOpHelper(const std::shared_ptr<queue_impl> &Self,
                               : MExtGraphDeps.LastEventPtr;
       EventToStoreIn = EventImpl;
     }
-    // Track only if we won't be able to handle it with urQueueFinish.
-    if (MEmulateOOO)
-      addSharedEvent(ResEvent);
+
     return discard_or_return(ResEvent);
   }
 }
@@ -600,10 +556,9 @@ void queue_impl::wait(const detail::code_location &CodeLoc) {
 
   // Additionally, we can clean up the event lists that we would have
   // otherwise cleared.
-  if (!MEventsWeak.empty() || !MEventsShared.empty()) {
+  if (!MEventsWeak.empty()) {
     std::lock_guard<std::mutex> Lock(MMutex);
     MEventsWeak.clear();
-    MEventsShared.clear();
   }
   if (!MStreamsServiceEvents.empty()) {
     std::lock_guard<std::mutex> Lock(MStreamsServiceEventsMutex);
@@ -612,11 +567,9 @@ void queue_impl::wait(const detail::code_location &CodeLoc) {
   }
 
   std::vector<std::weak_ptr<event_impl>> WeakEvents;
-  std::vector<event> SharedEvents;
   {
     std::lock_guard<std::mutex> Lock(MMutex);
     WeakEvents.swap(MEventsWeak);
-    SharedEvents.swap(MEventsShared);
 
     MMissedCleanupRequests.unset(
         [&](MissedCleanupRequestsType &MissedCleanupRequests) {
@@ -630,27 +583,19 @@ void queue_impl::wait(const detail::code_location &CodeLoc) {
   // directly. Otherwise, only wait for unenqueued or host task events, starting
   // from the latest submitted task in order to minimize total amount of calls,
   // then handle the rest with urQueueFinish.
-  const bool SupportsPiFinish = !MEmulateOOO;
   for (auto EventImplWeakPtrIt = WeakEvents.rbegin();
        EventImplWeakPtrIt != WeakEvents.rend(); ++EventImplWeakPtrIt) {
     if (std::shared_ptr<event_impl> EventImplSharedPtr =
             EventImplWeakPtrIt->lock()) {
       // A nullptr UR event indicates that urQueueFinish will not cover it,
       // either because it's a host task event or an unenqueued one.
-      if (!SupportsPiFinish || nullptr == EventImplSharedPtr->getHandle()) {
+      if (nullptr == EventImplSharedPtr->getHandle()) {
         EventImplSharedPtr->wait(EventImplSharedPtr);
       }
     }
   }
-  if (SupportsPiFinish) {
-    const AdapterPtr &Adapter = getAdapter();
-    Adapter->call<UrApiKind::urQueueFinish>(getHandleRef());
-    assert(SharedEvents.empty() && "Queues that support calling piQueueFinish "
-                                   "shouldn't have shared events");
-  } else {
-    for (event &Event : SharedEvents)
-      Event.wait();
-  }
+  const AdapterPtr &Adapter = getAdapter();
+  Adapter->call<UrApiKind::urQueueFinish>(getHandleRef());
 
   std::vector<EventImplPtr> StreamsServiceEvents;
   {
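
With the emulation gone, queue_impl::wait reduces to two steps: wait individually on events that urQueueFinish cannot cover (host tasks and commands not yet enqueued, recognizable by a null UR handle), then issue a single urQueueFinish for everything else. A simplified sketch of that flow; EventImpl and finishNativeQueue here are illustrative stand-ins, not the real DPC++ types:

// Simplified sketch of the post-change wait flow (stand-in types).
#include <memory>
#include <vector>

struct EventImpl {
  void *Handle = nullptr; // null => host task or not yet enqueued
  void wait() { /* block until this event completes */ }
};

void finishNativeQueue() { /* stands in for urQueueFinish on the native queue */ }

void waitOnQueue(std::vector<std::weak_ptr<EventImpl>> &WeakEvents) {
  // Walk newest-first so we only touch events the native finish cannot cover.
  for (auto It = WeakEvents.rbegin(); It != WeakEvents.rend(); ++It)
    if (auto E = It->lock())
      if (E->Handle == nullptr)
        E->wait();
  finishNativeQueue(); // one call covers every command enqueued to the backend
}
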
@@ -730,7 +675,7 @@ ur_native_handle_t queue_impl::getNative(int32_t &NativeHandleDesc) const {
                              nullptr, nullptr};
   UrNativeDesc.pNativeData = &NativeHandleDesc;
 
-  Adapter->call<UrApiKind::urQueueGetNativeHandle>(MQueues[0], &UrNativeDesc,
+  Adapter->call<UrApiKind::urQueueGetNativeHandle>(MQueue, &UrNativeDesc,
                                                    &Handle);
   if (getContextImplPtr()->getBackend() == backend::opencl)
     __SYCL_OCL_CALL(clRetainCommandQueue, ur::cast<cl_command_queue>(Handle));
@@ -759,18 +704,13 @@ bool queue_impl::ext_oneapi_empty() const {
   // Check the status of the backend queue if this is not a host queue.
   ur_bool_t IsReady = false;
   getAdapter()->call<UrApiKind::urQueueGetInfo>(
-      MQueues[0], UR_QUEUE_INFO_EMPTY, sizeof(IsReady), &IsReady, nullptr);
+      MQueue, UR_QUEUE_INFO_EMPTY, sizeof(IsReady), &IsReady, nullptr);
   if (!IsReady)
     return false;
 
   // We may have events like host tasks which are not submitted to the backend
   // queue so we need to get their status separately.
   std::lock_guard<std::mutex> Lock(MMutex);
-  for (event Event : MEventsShared)
-    if (Event.get_info<info::event::command_execution_status>() !=
-        info::event_command_status::complete)
-      return false;
-
   for (auto EventImplWeakPtrIt = MEventsWeak.begin();
        EventImplWeakPtrIt != MEventsWeak.end(); ++EventImplWeakPtrIt)
     if (std::shared_ptr<event_impl> EventImplSharedPtr =
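
The last hunk in this file trims ext_oneapi_empty(), which now checks only the native queue status plus the weakly tracked host-task events. The application-facing extension is unchanged; a small usage sketch (the helper name drainIfBusy is made up for illustration):

// Usage sketch of the sycl_ext_oneapi_queue_empty extension touched above.
#include <sycl/sycl.hpp>

bool drainIfBusy(sycl::queue &Q) {
  if (!Q.ext_oneapi_empty()) { // false while any submitted work is still pending
    Q.wait();                  // also covers host tasks tracked by the runtime
    return true;
  }
  return false;
}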

sycl/source/detail/queue_impl.hpp (+10, -66)
@@ -166,7 +166,7 @@ class queue_impl {
     }
     const QueueOrder QOrder =
         MIsInorder ? QueueOrder::Ordered : QueueOrder::OOO;
-    MQueues.push_back(createQueue(QOrder));
+    MQueue = createQueue(QOrder);
     // This section is the second part of the instrumentation that uses the
     // tracepoint information and notifies
 
@@ -191,13 +191,13 @@ class queue_impl {
                             "discard_events and enable_profiling.");
     }
 
-    MQueues.push_back(UrQueue);
+    MQueue = UrQueue;
 
     ur_device_handle_t DeviceUr{};
     const AdapterPtr &Adapter = getAdapter();
     // TODO catch an exception and put it to list of asynchronous exceptions
     Adapter->call<UrApiKind::urQueueGetInfo>(
-        MQueues[0], UR_QUEUE_INFO_DEVICE, sizeof(DeviceUr), &DeviceUr, nullptr);
+        MQueue, UR_QUEUE_INFO_DEVICE, sizeof(DeviceUr), &DeviceUr, nullptr);
     MDevice = MContext->findMatchingDeviceImpl(DeviceUr);
     if (MDevice == nullptr) {
       throw sycl::exception(
@@ -264,7 +264,7 @@ class queue_impl {
       destructorNotification();
 #endif
       throw_asynchronous();
-      getAdapter()->call<UrApiKind::urQueueRelease>(MQueues[0]);
+      getAdapter()->call<UrApiKind::urQueueRelease>(MQueue);
     } catch (std::exception &e) {
       __SYCL_REPORT_EXCEPTION_TO_STREAM("exception in ~queue_impl", e);
     }
@@ -274,7 +274,7 @@ class queue_impl {
 
   cl_command_queue get() {
     ur_native_handle_t nativeHandle = 0;
-    getAdapter()->call<UrApiKind::urQueueGetNativeHandle>(MQueues[0], nullptr,
+    getAdapter()->call<UrApiKind::urQueueGetNativeHandle>(MQueue, nullptr,
                                                           &nativeHandle);
     __SYCL_OCL_CALL(clRetainCommandQueue, ur::cast<cl_command_queue>(nativeHandle));
     return ur::cast<cl_command_queue>(nativeHandle);
@@ -322,9 +322,7 @@ class queue_impl {
                                 "flush cannot be called for a queue which is "
                                 "recording to a command graph.");
     }
-    for (const auto &queue : MQueues) {
-      getAdapter()->call<UrApiKind::urQueueFlush>(queue);
-    }
+    getAdapter()->call<UrApiKind::urQueueFlush>(MQueue);
   }
 
   /// Submits a command group function object to the queue, in order to be
@@ -540,62 +538,15 @@ class queue_impl {
                             .get_index();
       Properties.pNext = &IndexProperties;
     }
-    ur_result_t Error = Adapter->call_nocheck<UrApiKind::urQueueCreate>(
-        Context, Device, &Properties, &Queue);
-
-    // If creating out-of-order queue failed and this property is not
-    // supported (for example, on FPGA), it will return
-    // UR_RESULT_ERROR_INVALID_QUEUE_PROPERTIES and will try to create in-order
-    // queue.
-    if (!MEmulateOOO && Error == UR_RESULT_ERROR_INVALID_QUEUE_PROPERTIES) {
-      MEmulateOOO = true;
-      Queue = createQueue(QueueOrder::Ordered);
-    } else {
-      Adapter->checkUrResult(Error);
-    }
+    Adapter->call<UrApiKind::urQueueCreate>(Context, Device, &Properties,
+                                            &Queue);
 
     return Queue;
   }
 
-  /// \return a raw UR handle for a free queue. The returned handle is not
-  /// retained. It is caller responsibility to make sure queue is still alive.
-  ur_queue_handle_t &getExclusiveUrQueueHandleRef() {
-    ur_queue_handle_t *PIQ = nullptr;
-    bool ReuseQueue = false;
-    {
-      std::lock_guard<std::mutex> Lock(MMutex);
-
-      // To achieve parallelism for FPGA with in order execution model with
-      // possibility of two kernels to share data with each other we shall
-      // create a queue for every kernel enqueued.
-      if (MQueues.size() < MaxNumQueues) {
-        MQueues.push_back({});
-        PIQ = &MQueues.back();
-      } else {
-        // If the limit of OpenCL queues is going to be exceeded - take the
-        // earliest used queue, wait until it finished and then reuse it.
-        PIQ = &MQueues[MNextQueueIdx];
-        MNextQueueIdx = (MNextQueueIdx + 1) % MaxNumQueues;
-        ReuseQueue = true;
-      }
-    }
-
-    if (!ReuseQueue)
-      *PIQ = createQueue(QueueOrder::Ordered);
-    else
-      getAdapter()->call<UrApiKind::urQueueFinish>(*PIQ);
-
-    return *PIQ;
-  }
-
   /// \return a raw UR queue handle. The returned handle is not retained. It
   /// is caller responsibility to make sure queue is still alive.
-  ur_queue_handle_t &getHandleRef() {
-    if (!MEmulateOOO)
-      return MQueues[0];
-
-    return getExclusiveUrQueueHandleRef();
-  }
+  ur_queue_handle_t &getHandleRef() { return MQueue; }
 
   /// \return true if the queue was constructed with property specified by
   /// PropertyT.
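
Because createQueue no longer falls back to an in-order queue when the adapter rejects the out-of-order properties (the removed UR_RESULT_ERROR_INVALID_QUEUE_PROPERTIES path), a backend that cannot create such a queue presumably propagates the failure out of queue construction. A hedged application-side sketch, assuming (this patch does not spell it out) that the failure surfaces as a sycl::exception from the sycl::queue constructor:

// Illustrative caller-side fallback; the exception path is an assumption.
#include <sycl/sycl.hpp>

sycl::queue makeQueue(const sycl::device &Dev) {
  try {
    return sycl::queue{Dev}; // default request: out-of-order execution
  } catch (const sycl::exception &) {
    // Retry with an explicitly in-order queue instead of relying on the
    // removed runtime-side emulation.
    return sycl::queue{Dev, sycl::property::queue::in_order{}};
  }
}
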
@@ -998,19 +949,12 @@ class queue_impl {
   /// Events without data dependencies (such as USM) need an owner,
   /// additionally, USM operations are not added to the scheduler command graph,
   /// queue is the only owner on the runtime side.
-  std::vector<event> MEventsShared;
   exception_list MExceptions;
   const async_handler MAsyncHandler;
   const property_list MPropList;
 
   /// List of queues created for FPGA device from a single SYCL queue.
-  std::vector<ur_queue_handle_t> MQueues;
-  /// Iterator through MQueues.
-  size_t MNextQueueIdx = 0;
-
-  /// Indicates that a native out-of-order queue could not be created and we
-  /// need to emulate it with multiple native in-order queues.
-  bool MEmulateOOO = false;
+  ur_queue_handle_t MQueue;
 
   // Access should be guarded with MMutex
   struct DependencyTrackingItems {

sycl/unittests/queue/CMakeLists.txt (-1)
@@ -1,6 +1,5 @@
 add_sycl_unittest(QueueTests OBJECT
   DeviceCheck.cpp
-  EventClear.cpp
   Hash.cpp
   USM.cpp
   Wait.cpp
