diff --git a/sycl/CMakeLists.txt b/sycl/CMakeLists.txt index cd6daaaa8944..1a24c6fcda90 100644 --- a/sycl/CMakeLists.txt +++ b/sycl/CMakeLists.txt @@ -11,6 +11,7 @@ option(SYCL_ADD_DEV_VERSION_POSTFIX "Adds -V postfix to version string" ON) option(SYCL_ENABLE_COVERAGE "Enables code coverage for runtime and unit tests" OFF) option(SYCL_ENABLE_STACK_PRINTING "Enables stack printing on crashes of SYCL applications" OFF) option(SYCL_LIB_WITH_DEBUG_SYMBOLS "Builds SYCL runtime libraries with debug symbols" OFF) +option(SYCL_ENABLE_UNFINISHED_NO_CGH_SUBMIT "Builds SYCL runtime libraries with the support for Command Group Handler bypass kernel submit" OFF) if (NOT SYCL_COVERAGE_PATH) set(SYCL_COVERAGE_PATH "${CMAKE_CURRENT_BINARY_DIR}/profiles") diff --git a/sycl/cmake/modules/AddSYCLUnitTest.cmake b/sycl/cmake/modules/AddSYCLUnitTest.cmake index 3fdc6fde119b..39a479869642 100644 --- a/sycl/cmake/modules/AddSYCLUnitTest.cmake +++ b/sycl/cmake/modules/AddSYCLUnitTest.cmake @@ -115,6 +115,14 @@ macro(add_sycl_unittest test_dirname link_variant) -Wno-inconsistent-missing-override ) endif() - + target_compile_definitions(${test_dirname} PRIVATE SYCL_DISABLE_FSYCL_SYCLHPP_WARNING) + + if (SYCL_ENABLE_UNFINISHED_NO_CGH_SUBMIT) + target_compile_definitions( + ${test_dirname} + PRIVATE + __DPCPP_ENABLE_UNFINISHED_NO_CGH_SUBMIT + ) + endif() endmacro() diff --git a/sycl/include/sycl/ext/oneapi/experimental/enqueue_functions.hpp b/sycl/include/sycl/ext/oneapi/experimental/enqueue_functions.hpp index f599078a6769..123e883ccd10 100644 --- a/sycl/include/sycl/ext/oneapi/experimental/enqueue_functions.hpp +++ b/sycl/include/sycl/ext/oneapi/experimental/enqueue_functions.hpp @@ -110,6 +110,27 @@ event submit_with_event_impl(const queue &Q, PropertiesT Props, return Q.submit_with_event<__SYCL_USE_FALLBACK_ASSERT>( Props, detail::type_erased_cgfo_ty{CGF}, nullptr, CodeLoc); } + +#ifdef __DPCPP_ENABLE_UNFINISHED_NO_CGH_SUBMIT +template +void submit_direct_impl(const queue &Q, PropertiesT 
Props, + nd_range Range, + const KernelType &KernelFunc, + const sycl::detail::code_location &CodeLoc) { + Q.submit_direct_without_event(Props, Range, KernelFunc, CodeLoc); +} +template +event submit_direct_with_event_impl(const queue &Q, PropertiesT Props, + nd_range Range, + const KernelType &KernelFunc, + const sycl::detail::code_location &CodeLoc) { + return Q.submit_direct_with_event(Props, Range, KernelFunc, CodeLoc); +} +#endif //__DPCPP_ENABLE_UNFINISHED_NO_CGH_SUBMIT } // namespace detail template @@ -127,11 +148,24 @@ void submit(const queue &Q, CommandGroupFunc &&CGF, submit(Q, empty_properties_t{}, std::forward(CGF), CodeLoc); } +#ifdef __DPCPP_ENABLE_UNFINISHED_NO_CGH_SUBMIT +template +void submit(const queue &Q, PropertiesT Props, + nd_range Range, + const KernelType &KernelFunc, + const sycl::detail::code_location &CodeLoc = + sycl::detail::code_location::current()) { + sycl::ext::oneapi::experimental::detail::submit_direct_impl + (Q, Props, Range, KernelFunc, CodeLoc); +} +#endif //__DPCPP_ENABLE_UNFINISHED_NO_CGH_SUBMIT + template event submit_with_event(const queue &Q, PropertiesT Props, CommandGroupFunc &&CGF, const sycl::detail::code_location &CodeLoc = - sycl::detail::code_location::current()) { + sycl::detail::code_location::current()) { return sycl::ext::oneapi::experimental::detail::submit_with_event_impl( Q, Props, std::forward(CGF), CodeLoc); } @@ -144,6 +178,19 @@ event submit_with_event(const queue &Q, CommandGroupFunc &&CGF, std::forward(CGF), CodeLoc); } +#ifdef __DPCPP_ENABLE_UNFINISHED_NO_CGH_SUBMIT +template +event submit_with_event(const queue &Q, PropertiesT Props, + nd_range Range, + const KernelType &KernelFunc, + const sycl::detail::code_location &CodeLoc = + sycl::detail::code_location::current()) { + return sycl::ext::oneapi::experimental::detail::submit_direct_with_event_impl + (Q, Props, Range, KernelFunc, CodeLoc); +} +#endif //__DPCPP_ENABLE_UNFINISHED_NO_CGH_SUBMIT + template void single_task(handler &CGH, const 
KernelType &KernelObj) { CGH.single_task(KernelObj); @@ -256,6 +303,21 @@ void nd_launch(handler &CGH, nd_range Range, KernelObj); } +#ifdef __DPCPP_ENABLE_UNFINISHED_NO_CGH_SUBMIT +template +void nd_launch(queue Q, nd_range Range, const KernelType &KernelObj, + ReductionsT &&...Reductions) { + if constexpr (sizeof...(ReductionsT) == 0) { + submit(std::move(Q), empty_properties_t{}, Range, KernelObj); + } else { + submit(std::move(Q), [&](handler &CGH) { + nd_launch(CGH, Range, KernelObj, + std::forward(Reductions)...); + }); + } +} +#else template void nd_launch(queue Q, nd_range Range, const KernelType &KernelObj, @@ -265,6 +327,7 @@ void nd_launch(queue Q, nd_range Range, const KernelType &KernelObj, std::forward(Reductions)...); }); } +#endif template @@ -280,6 +343,25 @@ void nd_launch(handler &CGH, std::forward(Reductions)..., KernelObj); } +#ifdef __DPCPP_ENABLE_UNFINISHED_NO_CGH_SUBMIT +template +void nd_launch(queue Q, launch_config, Properties> Config, + const KernelType &KernelObj, ReductionsT &&...Reductions) { + if constexpr (sizeof...(ReductionsT) == 0) { + ext::oneapi::experimental::detail::LaunchConfigAccess, + Properties> + ConfigAccess(Config); + submit(std::move(Q), ConfigAccess.getProperties(), ConfigAccess.getRange(), + KernelObj); + } else { + submit(std::move(Q), [&](handler &CGH) { + nd_launch(CGH, Config, KernelObj, + std::forward(Reductions)...); + }); + } +} +#else template void nd_launch(queue Q, launch_config, Properties> Config, @@ -289,6 +371,7 @@ void nd_launch(queue Q, launch_config, Properties> Config, std::forward(Reductions)...); }); } +#endif template void nd_launch(handler &CGH, nd_range Range, diff --git a/sycl/include/sycl/khr/free_function_commands.hpp b/sycl/include/sycl/khr/free_function_commands.hpp index 4138edd5821e..e45bcdeca7d4 100644 --- a/sycl/include/sycl/khr/free_function_commands.hpp +++ b/sycl/include/sycl/khr/free_function_commands.hpp @@ -153,27 +153,42 @@ void launch_grouped(const queue &q, range<1> r, 
range<1> size, const KernelType &k, const sycl::detail::code_location &codeLoc = sycl::detail::code_location::current()) { +#ifdef __DPCPP_ENABLE_UNFINISHED_NO_CGH_SUBMIT + submit(q, ext::oneapi::experimental::empty_properties_t{}, + nd_range<1>(r, size), k, codeLoc); // forward the caller's code location +#else submit( q, [&](handler &h) { launch_grouped(h, r, size, k); }, codeLoc); +#endif } template void launch_grouped(const queue &q, range<2> r, range<2> size, const KernelType &k, const sycl::detail::code_location &codeLoc = sycl::detail::code_location::current()) { +#ifdef __DPCPP_ENABLE_UNFINISHED_NO_CGH_SUBMIT + submit(q, ext::oneapi::experimental::empty_properties_t{}, + nd_range<2>(r, size), k, codeLoc); // forward the caller's code location +#else submit( q, [&](handler &h) { launch_grouped(h, r, size, k); }, codeLoc); +#endif } template void launch_grouped(const queue &q, range<3> r, range<3> size, const KernelType &k, const sycl::detail::code_location &codeLoc = sycl::detail::code_location::current()) { +#ifdef __DPCPP_ENABLE_UNFINISHED_NO_CGH_SUBMIT + submit(q, ext::oneapi::experimental::empty_properties_t{}, + nd_range<3>(r, size), k, codeLoc); // forward the caller's code location +#else submit( q, [&](handler &h) { launch_grouped(h, r, size, k); }, codeLoc); +#endif } template diff --git a/sycl/include/sycl/queue.hpp b/sycl/include/sycl/queue.hpp index a0dbdf5c540e..cb0227e72141 100644 --- a/sycl/include/sycl/queue.hpp +++ b/sycl/include/sycl/queue.hpp @@ -149,6 +149,66 @@ class __SYCL_EXPORT SubmissionInfo { ext::oneapi::experimental::event_mode_enum::none; }; +#ifdef __DPCPP_ENABLE_UNFINISHED_NO_CGH_SUBMIT +using KernelParamDescGetterFuncPtr = detail::kernel_param_desc_t (*)(int); + +// This class is intended to store the kernel runtime information, +// extracted from the compile time kernel structures. 
+class __SYCL_EXPORT KernelRuntimeInfo { +public: + KernelRuntimeInfo() {} + + KernelRuntimeInfo(const KernelRuntimeInfo &rhs) = delete; + + KernelRuntimeInfo(KernelRuntimeInfo &&rhs) = delete; + + KernelRuntimeInfo &operator=(const KernelRuntimeInfo &rhs) = delete; + + KernelRuntimeInfo &operator=(KernelRuntimeInfo &&rhs) = delete; + + detail::ABINeutralKernelNameStrT &KernelName() { return MKernelName; } + const detail::ABINeutralKernelNameStrT &KernelName() const { return MKernelName; } + + std::shared_ptr &HostKernel() { return MHostKernel; } + const std::shared_ptr &HostKernel() const { return MHostKernel; } + + char *GetKernelFuncPtr() { return (*MHostKernel).getPtr(); } + char *GetKernelFuncPtr() const { return (*MHostKernel).getPtr(); } + + int &KernelNumArgs() { return MKernelNumArgs; } + const int &KernelNumArgs() const { return MKernelNumArgs; } + + KernelParamDescGetterFuncPtr &KernelParamDescGetter() { + return MKernelParamDescGetter; + } + const KernelParamDescGetterFuncPtr &KernelParamDescGetter() const { + return MKernelParamDescGetter; + } + + bool &KernelIsESIMD() { return MKernelIsESIMD; } + const bool &KernelIsESIMD() const { return MKernelIsESIMD; } + + bool &KernelHasSpecialCaptures() { return MKernelHasSpecialCaptures; } + const bool &KernelHasSpecialCaptures() const { return MKernelHasSpecialCaptures; } + + detail::KernelNameBasedCacheT *&KernelNameBasedCachePtr() { + return MKernelNameBasedCachePtr; + } + detail::KernelNameBasedCacheT *KernelNameBasedCachePtr() const { + return MKernelNameBasedCachePtr; + } + +private: + detail::ABINeutralKernelNameStrT MKernelName; + std::shared_ptr MHostKernel; + int MKernelNumArgs = 0; + KernelParamDescGetterFuncPtr MKernelParamDescGetter = nullptr; + bool MKernelIsESIMD = false; + bool MKernelHasSpecialCaptures = true; + detail::KernelNameBasedCacheT *MKernelNameBasedCachePtr = nullptr; +}; +#endif //__DPCPP_ENABLE_UNFINISHED_NO_CGH_SUBMIT + } // namespace v1 } // namespace detail @@ -167,6 +227,22 @@ 
template event submit_with_event_impl(const queue &Q, PropertiesT Props, CommandGroupFunc &&CGF, const sycl::detail::code_location &CodeLoc); + +#ifdef __DPCPP_ENABLE_UNFINISHED_NO_CGH_SUBMIT +template +void submit_direct_impl(const queue &Q, PropertiesT Props, + nd_range Range, + const KernelType &KernelFunc, + const sycl::detail::code_location &CodeLoc); + +template +event submit_direct_with_event_impl(const queue &Q, PropertiesT Props, + nd_range Range, + const KernelType &KernelFunc, + const sycl::detail::code_location &CodeLoc); +#endif //__DPCPP_ENABLE_UNFINISHED_NO_CGH_SUBMIT } // namespace detail } // namespace ext::oneapi::experimental @@ -3203,6 +3279,31 @@ class __SYCL_EXPORT queue : public detail::OwnerLessBase { TlsCodeLocCapture.query()); } +#ifdef __DPCPP_ENABLE_UNFINISHED_NO_CGH_SUBMIT + /// parallel_for version with a kernel represented as a lambda + nd_range that + /// specifies global, local sizes and offset. + /// + /// \param Range specifies the global and local work spaces of the kernel + /// \param Rest acts as-if: "ReductionTypes&&... Reductions, + /// const KernelType &KernelFunc". + template + std::enable_if_t::value, event> + parallel_for(nd_range Range, RestT &&...Rest) { + constexpr detail::code_location CodeLoc = getCodeLocation(); + detail::tls_code_loc_t TlsCodeLocCapture(CodeLoc); + if constexpr (sizeof...(RestT) == 1) { + return submit_direct_with_event(ext::oneapi::experimental::empty_properties_t{}, + Range, Rest...); + } else { + return submit( + [&](handler &CGH) { + CGH.template parallel_for(Range, Rest...); + }, + TlsCodeLocCapture.query()); + } + } +#else /// parallel_for version with a kernel represented as a lambda + nd_range that /// specifies global, local sizes and offset. 
/// @@ -3221,7 +3322,7 @@ class __SYCL_EXPORT queue : public detail::OwnerLessBase { }, TlsCodeLocCapture.query()); } - +#endif /// parallel_for version with a kernel represented as a lambda + nd_range that /// specifies global, local sizes and offset. /// @@ -3596,6 +3697,23 @@ class __SYCL_EXPORT queue : public detail::OwnerLessBase { const queue &Q, PropertiesT Props, CommandGroupFunc &&CGF, const sycl::detail::code_location &CodeLoc); +#ifdef __DPCPP_ENABLE_UNFINISHED_NO_CGH_SUBMIT + template + friend void ext::oneapi::experimental::detail::submit_direct_impl( + const queue &Q, PropertiesT Props, nd_range Range, + const KernelType &KernelFunc, + const sycl::detail::code_location &CodeLoc); + + template + friend event ext::oneapi::experimental::detail::submit_direct_with_event_impl( + const queue &Q, PropertiesT Props, nd_range Range, + const KernelType &KernelFunc, + const sycl::detail::code_location &CodeLoc); + +#endif //__DPCPP_ENABLE_UNFINISHED_NO_CGH_SUBMIT + template void ProcessSubmitProperties(PropertiesT Props, detail::v1::SubmissionInfo &SI) const { @@ -3609,6 +3727,37 @@ class __SYCL_EXPORT queue : public detail::OwnerLessBase { } } + template struct TransformUserItemType { + using type = std::conditional_t< + std::is_convertible_v, LambdaArgType>, nd_item, + std::conditional_t, LambdaArgType>, + item, LambdaArgType>>; + }; + +#ifdef __DPCPP_ENABLE_UNFINISHED_NO_CGH_SUBMIT + template + void ProcessKernelRuntimeInfo( + PropertiesT Props, const KernelType &KernelFunc, + detail::v1::KernelRuntimeInfo &KRInfo) const { + + using LambdaArgType = sycl::detail::lambda_arg_type>; + using TransformedArgType = std::conditional_t< + std::is_integral::value && Dims == 1, item, + typename TransformUserItemType::type>; + + KRInfo.HostKernel().reset( + new detail::HostKernel( + KernelFunc)); + KRInfo.KernelName() = detail::getKernelName(); + KRInfo.KernelNumArgs() = detail::getKernelNumParams(); + KRInfo.KernelParamDescGetter() = &(detail::getKernelParamDesc); + 
KRInfo.KernelIsESIMD() = detail::isKernelESIMD(); + KRInfo.KernelHasSpecialCaptures() = detail::hasSpecialCaptures(); + KRInfo.KernelNameBasedCachePtr() = detail::getKernelNameBasedCache(); + } +#endif //__DPCPP_ENABLE_UNFINISHED_NO_CGH_SUBMIT + #ifndef __INTEL_PREVIEW_BREAKING_CHANGES /// TODO: Unused. Remove these when ABI-break window is open. /// Not using `type_erased_cgfo_ty` on purpose. @@ -3680,6 +3829,44 @@ class __SYCL_EXPORT queue : public detail::OwnerLessBase { const detail::code_location &CodeLoc, bool IsTopCodeLoc) const; +#ifdef __DPCPP_ENABLE_UNFINISHED_NO_CGH_SUBMIT + event submit_direct_with_event_impl( + nd_range<1> Range, + const detail::v1::SubmissionInfo &SubmitInfo, + const detail::v1::KernelRuntimeInfo &KRInfo, + const detail::code_location &CodeLoc, bool IsTopCodeLoc) const; + + event submit_direct_with_event_impl( + nd_range<2> Range, + const detail::v1::SubmissionInfo &SubmitInfo, + const detail::v1::KernelRuntimeInfo &KRInfo, + const detail::code_location &CodeLoc, bool IsTopCodeLoc) const; + + event submit_direct_with_event_impl( + nd_range<3> Range, + const detail::v1::SubmissionInfo &SubmitInfo, + const detail::v1::KernelRuntimeInfo &KRInfo, + const detail::code_location &CodeLoc, bool IsTopCodeLoc) const; + + void submit_direct_without_event_impl( + nd_range<1> Range, + const detail::v1::SubmissionInfo &SubmitInfo, + const detail::v1::KernelRuntimeInfo &KRInfo, + const detail::code_location &CodeLoc, bool IsTopCodeLoc) const; + + void submit_direct_without_event_impl( + nd_range<2> Range, + const detail::v1::SubmissionInfo &SubmitInfo, + const detail::v1::KernelRuntimeInfo &KRInfo, + const detail::code_location &CodeLoc, bool IsTopCodeLoc) const; + + void submit_direct_without_event_impl( + nd_range<3> Range, + const detail::v1::SubmissionInfo &SubmitInfo, + const detail::v1::KernelRuntimeInfo &KRInfo, + const detail::code_location &CodeLoc, bool IsTopCodeLoc) const; +#endif //__DPCPP_ENABLE_UNFINISHED_NO_CGH_SUBMIT + /// A 
template-free version of submit_without_event as const member function. void submit_without_event_impl(const detail::type_erased_cgfo_ty &CGH, const detail::v1::SubmissionInfo &SubmitInfo, @@ -3763,6 +3950,95 @@ class __SYCL_EXPORT queue : public detail::OwnerLessBase { TlsCodeLocCapture.isToplevel()); } +#ifdef __DPCPP_ENABLE_UNFINISHED_NO_CGH_SUBMIT + +#ifdef SYCL_LANGUAGE_VERSION +#ifndef __INTEL_SYCL_USE_INTEGRATION_HEADERS +#define __SYCL_KERNEL_ATTR__ [[clang::sycl_kernel_entry_point(KernelName)]] +#else +#define __SYCL_KERNEL_ATTR__ [[clang::sycl_kernel]] +#endif // __INTEL_SYCL_USE_INTEGRATION_HEADERS +#else +#define __SYCL_KERNEL_ATTR__ +#endif // SYCL_LANGUAGE_VERSION + + // TODO The kernel wrapper functions have to be moved from the handler class + // to a place where they are accessible by both the handler and the queue class. + // For test purposes, this wrapper is a copy of the handler wrapper, + // but an application which would use both handler and no-handler APIs + // won't compile. + // + // NOTE: the name of these functions - "kernel_parallel_for" - is used by the + // Front End to determine kernel invocation kind. 
+ template +#ifdef __SYCL_DEVICE_ONLY__ + [[__sycl_detail__::add_ir_attributes_function( + ext::oneapi::experimental::detail::PropertyMetaInfo::name..., + ext::oneapi::experimental::detail::PropertyMetaInfo::value...)]] +#endif + __SYCL_KERNEL_ATTR__ static void + kernel_parallel_for(const KernelType &KernelFunc) { +#ifdef __SYCL_DEVICE_ONLY__ + KernelFunc(detail::Builder::getElement(detail::declptr())); +#else + (void)KernelFunc; +#endif + } + + template + event submit_direct_with_event(PropertiesT Props, nd_range Range, + const KernelType &KernelFunc, + const detail::code_location &CodeLoc = + detail::code_location::current()) const { + detail::tls_code_loc_t TlsCodeLocCapture(CodeLoc); + detail::v1::SubmissionInfo SI{}; + detail::v1::KernelRuntimeInfo KRInfo{}; + + using NameT = + typename detail::get_kernel_name_t::name; + + ProcessSubmitProperties(Props, SI); + ProcessKernelRuntimeInfo(Props, + KernelFunc, KRInfo); + + kernel_parallel_for, KernelType, + PropertiesT>(KernelFunc); + + // TODO UseFallbackAssert + + return submit_direct_with_event_impl(Range, SI, KRInfo, + TlsCodeLocCapture.query(), TlsCodeLocCapture.isToplevel()); + } + + template + void submit_direct_without_event(PropertiesT Props, nd_range Range, + const KernelType &KernelFunc, + const detail::code_location &CodeLoc = + detail::code_location::current()) const { + detail::tls_code_loc_t TlsCodeLocCapture(CodeLoc); + detail::v1::SubmissionInfo SI{}; + detail::v1::KernelRuntimeInfo KRInfo{}; + + using NameT = + typename detail::get_kernel_name_t::name; + + ProcessSubmitProperties(Props, SI); + ProcessKernelRuntimeInfo(Props, + KernelFunc, KRInfo); + + kernel_parallel_for, KernelType, + PropertiesT>(KernelFunc); + + // TODO UseFallbackAssert + + submit_direct_without_event_impl(Range, SI, KRInfo, + TlsCodeLocCapture.query(), TlsCodeLocCapture.isToplevel()); + } +#endif //__DPCPP_ENABLE_UNFINISHED_NO_CGH_SUBMIT + /// Submits a command group function object to the queue, in order to be /// scheduled 
for execution on the device. /// diff --git a/sycl/source/CMakeLists.txt b/sycl/source/CMakeLists.txt index dd1b83338305..0af3cf995b60 100644 --- a/sycl/source/CMakeLists.txt +++ b/sycl/source/CMakeLists.txt @@ -56,6 +56,14 @@ function(add_sycl_rt_library LIB_NAME LIB_OBJ_NAME) $<$:__SYCL_BUILD_SYCL_DLL> ) + if (SYCL_ENABLE_UNFINISHED_NO_CGH_SUBMIT) + target_compile_definitions( + ${LIB_OBJ_NAME} + PRIVATE + __DPCPP_ENABLE_UNFINISHED_NO_CGH_SUBMIT + ) + endif() + target_include_directories( ${LIB_OBJ_NAME} PRIVATE diff --git a/sycl/source/detail/queue_impl.cpp b/sycl/source/detail/queue_impl.cpp index c2a66f485d63..50ad60689a45 100644 --- a/sycl/source/detail/queue_impl.cpp +++ b/sycl/source/detail/queue_impl.cpp @@ -415,6 +415,103 @@ queue_impl::submit_impl(const detail::type_erased_cgfo_ty &CGF, } #endif +#ifdef __DPCPP_ENABLE_UNFINISHED_NO_CGH_SUBMIT + +std::vector queue_impl::extractArgsAndReqsFromLambda( + char *LambdaPtr, detail::kernel_param_desc_t (*ParamDescGetter)(int), + size_t NumKernelParams) { + + size_t IndexShift = 0; + std::vector Args; + + Args.reserve(NumKernelParams); + + for (size_t I = 0; I < NumKernelParams; ++I) { + detail::kernel_param_desc_t ParamDesc = ParamDescGetter(I); + void *Ptr = LambdaPtr + ParamDesc.offset; + const detail::kernel_param_kind_t &Kind = ParamDesc.kind; + const int &Size = ParamDesc.info; + + Args.emplace_back(Kind, Ptr, Size, I + IndexShift); + } + + return Args; +} + +detail::EventImplPtr +queue_impl::submit_direct_impl(const NDRDescT &NDRDesc, + const v1::SubmissionInfo &SubmitInfo, + const v1::KernelRuntimeInfo &KRInfo, + bool CallerNeedsEvent, + const detail::code_location &CodeLoc, + bool IsTopCodeLoc) { + + std::unique_ptr CommandGroup; + detail::CG::StorageInitHelper CGData; + std::vector Args; + std::vector> StreamStorage; + std::vector> AuxiliaryResources; + + std::unique_lock Lock(MMutex); + + // Graphs are not supported yet for the no-handler path + assert(!hasCommandGraph()); + + // Set the No Last Event 
Mode to false, since the no-handler path + // does not support it yet. + MNoLastEventMode.store(false, std::memory_order_relaxed); + + // Used by queue_empty() and getLastEvent() + MEmpty.store(false, std::memory_order_release); + + // Sync with an external event + std::optional ExternalEvent = popExternalEvent(); + if (ExternalEvent) { + CGData.MEvents.push_back(getSyclObjImpl(*ExternalEvent)); + } + + // Sync with the last event for in order queue + EventImplPtr &LastEvent = MDefaultGraphDeps.LastEventPtr; + if (isInOrder() && LastEvent) { + CGData.MEvents.push_back(LastEvent); + } + + // TODO UnenqueuedCmdEvents for out of order queue + + Args = extractArgsAndReqsFromLambda(KRInfo.GetKernelFuncPtr(), + KRInfo.KernelParamDescGetter(), KRInfo.KernelNumArgs()); + + CommandGroup.reset(new detail::CGExecKernel( + NDRDesc, // const reference parameter: std::move would not move it + KRInfo.HostKernel(), + nullptr, // MKernel + nullptr, // MKernelBundle + std::move(CGData), + std::move(Args), + toKernelNameStrT(KRInfo.KernelName()), + KRInfo.KernelNameBasedCachePtr(), + std::move(StreamStorage), + std::move(AuxiliaryResources), + detail::CGType::Kernel, + UR_KERNEL_CACHE_CONFIG_DEFAULT, + false, // MKernelIsCooperative + false, // MKernelUsesClusterLaunch + 0, // MKernelWorkGroupMemorySize + CodeLoc)); + + EventImplPtr EventImpl = detail::Scheduler::getInstance().addCG( + std::move(CommandGroup), *this, CallerNeedsEvent); + + // Sync with the last event for in order queue + if (isInOrder() && !EventImpl->isDiscarded()) { + LastEvent = EventImpl; + } + + return CallerNeedsEvent ? 
EventImpl : nullptr; +} + +#endif //__DPCPP_ENABLE_UNFINISHED_NO_CGH_SUBMIT + template event queue_impl::submitWithHandler(const std::vector &DepEvents, bool CallerNeedsEvent, diff --git a/sycl/source/detail/queue_impl.hpp b/sycl/source/detail/queue_impl.hpp index b3e7eb4bbe17..269ac27d8843 100644 --- a/sycl/source/detail/queue_impl.hpp +++ b/sycl/source/detail/queue_impl.hpp @@ -370,6 +370,68 @@ class queue_impl : public std::enable_shared_from_this { return createSyclObjFromImpl(ResEvent); } +#ifdef __DPCPP_ENABLE_UNFINISHED_NO_CGH_SUBMIT + event + submit_direct_with_event(nd_range<1> Range, + const detail::v1::SubmissionInfo &SubmitInfo, + const detail::v1::KernelRuntimeInfo &KRInfo, + const detail::code_location &CodeLoc, bool IsTopCodeLoc) { + detail::EventImplPtr EventImpl = + submit_direct_impl(NDRDescT{Range}, SubmitInfo, KRInfo, true, CodeLoc, + IsTopCodeLoc); + return createSyclObjFromImpl(EventImpl); + } + + event + submit_direct_with_event(nd_range<2> Range, + const detail::v1::SubmissionInfo &SubmitInfo, + const detail::v1::KernelRuntimeInfo &KRInfo, + const detail::code_location &CodeLoc, bool IsTopCodeLoc) { + detail::EventImplPtr EventImpl = + submit_direct_impl(NDRDescT{Range}, SubmitInfo, KRInfo, true, CodeLoc, + IsTopCodeLoc); + return createSyclObjFromImpl(EventImpl); + } + + event + submit_direct_with_event(nd_range<3> Range, + const detail::v1::SubmissionInfo &SubmitInfo, + const detail::v1::KernelRuntimeInfo &KRInfo, + const detail::code_location &CodeLoc, bool IsTopCodeLoc) { + detail::EventImplPtr EventImpl = + submit_direct_impl(NDRDescT{Range}, SubmitInfo, KRInfo, true, CodeLoc, + IsTopCodeLoc); + return createSyclObjFromImpl(EventImpl); + } + + void + submit_direct_without_event(nd_range<1> Range, + const detail::v1::SubmissionInfo &SubmitInfo, + const detail::v1::KernelRuntimeInfo &KRInfo, + const detail::code_location &CodeLoc, bool IsTopCodeLoc) { + submit_direct_impl(NDRDescT{Range}, SubmitInfo, KRInfo, false, CodeLoc, + 
IsTopCodeLoc); + } + + void + submit_direct_without_event(nd_range<2> Range, + const detail::v1::SubmissionInfo &SubmitInfo, + const detail::v1::KernelRuntimeInfo &KRInfo, + const detail::code_location &CodeLoc, bool IsTopCodeLoc) { + submit_direct_impl(NDRDescT{Range}, SubmitInfo, KRInfo, false, CodeLoc, + IsTopCodeLoc); + } + + void + submit_direct_without_event(nd_range<3> Range, + const detail::v1::SubmissionInfo &SubmitInfo, + const detail::v1::KernelRuntimeInfo &KRInfo, + const detail::code_location &CodeLoc, bool IsTopCodeLoc) { + submit_direct_impl(NDRDescT{Range}, SubmitInfo, KRInfo, false, CodeLoc, + IsTopCodeLoc); + } +#endif //__DPCPP_ENABLE_UNFINISHED_NO_CGH_SUBMIT + void submit_without_event(const detail::type_erased_cgfo_ty &CGF, const v1::SubmissionInfo &SubmitInfo, const detail::code_location &Loc, @@ -905,6 +967,10 @@ class queue_impl : public std::enable_shared_from_this { bool IsTopCodeLoc, const SubmissionInfo &SubmitInfo); #endif + std::vector extractArgsAndReqsFromLambda( + char *LambdaPtr, detail::kernel_param_desc_t (*ParamDescGetter)(int), + size_t NumKernelParams); + /// Performs command group submission to the queue. /// /// \param CGF is a function object containing command group. @@ -921,6 +987,14 @@ class queue_impl : public std::enable_shared_from_this { bool IsTopCodeLoc, const v1::SubmissionInfo &SubmitInfo); +#ifdef __DPCPP_ENABLE_UNFINISHED_NO_CGH_SUBMIT + detail::EventImplPtr submit_direct_impl(const NDRDescT &NDRDesc, + const v1::SubmissionInfo &SubmitInfo, + const v1::KernelRuntimeInfo &KRInfo, + bool CallerNeedsEvent, + const detail::code_location &CodeLoc, + bool IsTopCodeLoc); +#endif /// Helper function for submitting a memory operation with a handler. /// \param DepEvents is a vector of dependencies of the operation. 
/// \param HandlerFunc is a function that submits the operation with a diff --git a/sycl/source/queue.cpp b/sycl/source/queue.cpp index ed0b0e42e6e6..be599169b0d9 100644 --- a/sycl/source/queue.cpp +++ b/sycl/source/queue.cpp @@ -312,6 +312,63 @@ event queue::submit_with_event_impl( return impl->submit_with_event(CGH, SubmitInfo, CodeLoc, IsTopCodeLoc); } +#ifdef __DPCPP_ENABLE_UNFINISHED_NO_CGH_SUBMIT +event queue::submit_direct_with_event_impl( + nd_range<1> Range, + const detail::v1::SubmissionInfo &SubmitInfo, + const detail::v1::KernelRuntimeInfo &KRInfo, + const detail::code_location &CodeLoc, bool IsTopCodeLoc) const { + return impl->submit_direct_with_event(Range, SubmitInfo, KRInfo, + CodeLoc, IsTopCodeLoc); +} + +event queue::submit_direct_with_event_impl( + nd_range<2> Range, + const detail::v1::SubmissionInfo &SubmitInfo, + const detail::v1::KernelRuntimeInfo &KRInfo, + const detail::code_location &CodeLoc, bool IsTopCodeLoc) const { + return impl->submit_direct_with_event(Range, SubmitInfo, KRInfo, + CodeLoc, IsTopCodeLoc); +} + +event queue::submit_direct_with_event_impl( + nd_range<3> Range, + const detail::v1::SubmissionInfo &SubmitInfo, + const detail::v1::KernelRuntimeInfo &KRInfo, + const detail::code_location &CodeLoc, bool IsTopCodeLoc) const { + return impl->submit_direct_with_event(Range, SubmitInfo, KRInfo, + CodeLoc, IsTopCodeLoc); +} + +void queue::submit_direct_without_event_impl( + nd_range<1> Range, + const detail::v1::SubmissionInfo &SubmitInfo, + const detail::v1::KernelRuntimeInfo &KRInfo, + const detail::code_location &CodeLoc, bool IsTopCodeLoc) const { + impl->submit_direct_without_event(Range, SubmitInfo, KRInfo, + CodeLoc, IsTopCodeLoc); +} + +void queue::submit_direct_without_event_impl( + nd_range<2> Range, + const detail::v1::SubmissionInfo &SubmitInfo, + const detail::v1::KernelRuntimeInfo &KRInfo, + const detail::code_location &CodeLoc, bool IsTopCodeLoc) const { + impl->submit_direct_without_event(Range, SubmitInfo, 
KRInfo, + CodeLoc, IsTopCodeLoc); +} + +void queue::submit_direct_without_event_impl( + nd_range<3> Range, + const detail::v1::SubmissionInfo &SubmitInfo, + const detail::v1::KernelRuntimeInfo &KRInfo, + const detail::code_location &CodeLoc, bool IsTopCodeLoc) const { + impl->submit_direct_without_event(Range, SubmitInfo, KRInfo, + CodeLoc, IsTopCodeLoc); +} + +#endif //__DPCPP_ENABLE_UNFINISHED_NO_CGH_SUBMIT + void queue::submit_without_event_impl( const detail::type_erased_cgfo_ty &CGH, const detail::v1::SubmissionInfo &SubmitInfo, diff --git a/sycl/unittests/scheduler/InOrderQueueHostTaskDeps.cpp b/sycl/unittests/scheduler/InOrderQueueHostTaskDeps.cpp index 31a2914e2c80..b10ec4173e36 100644 --- a/sycl/unittests/scheduler/InOrderQueueHostTaskDeps.cpp +++ b/sycl/unittests/scheduler/InOrderQueueHostTaskDeps.cpp @@ -48,7 +48,7 @@ TEST_F(SchedulerTest, InOrderQueueHostTaskDeps) { EXPECT_EQ(GEventsWaitCounter, expectedCount); } -enum class CommandType { KERNEL = 1, MEMSET = 2 }; +enum class CommandType { KERNEL = 1, MEMSET = 2, HOST_TASK = 3 }; std::vector> ExecutedCommands; inline ur_result_t customEnqueueKernelLaunch(void *pParams) { @@ -162,3 +162,45 @@ TEST_F(SchedulerTest, InOrderQueueCrossDepsShortcutFuncs) { EXPECT_EQ(ExecutedCommands[1].first /*CommandType*/, CommandType::KERNEL); EXPECT_EQ(ExecutedCommands[1].second /*EventsCount*/, 0u); } + +TEST_F(SchedulerTest, InOrderQueueCrossDepsShortcutFuncsParallelFor) { + ExecutedCommands.clear(); + sycl::unittest::UrMock<> Mock; + mock::getCallbacks().set_before_callback("urEnqueueKernelLaunch", + &customEnqueueKernelLaunch); + + sycl::platform Plt = sycl::platform(); + + context Ctx{Plt}; + queue InOrderQueue{Ctx, default_selector_v, property::queue::in_order()}; + + std::mutex CvMutex; + std::condition_variable Cv; + bool ready = false; + + InOrderQueue.submit([&](sycl::handler &CGH) { + CGH.host_task([&] { + std::unique_lock lk(CvMutex); + Cv.wait(lk, [&ready] { return ready; }); + ExecutedCommands.push_back( + 
{CommandType::HOST_TASK, 0}); + }); + }); + + event Ev2 = InOrderQueue.parallel_for(nd_range<1>{range{32}, range{32}}, + [](nd_item<1>) {}); + + { + std::unique_lock lk(CvMutex); + ready = true; + } + Cv.notify_one(); + + InOrderQueue.wait(); + + ASSERT_EQ(ExecutedCommands.size(), 2u); + EXPECT_EQ(ExecutedCommands[0].first /*CommandType*/, CommandType::HOST_TASK); + EXPECT_EQ(ExecutedCommands[0].second /*EventsCount*/, 0u); + EXPECT_EQ(ExecutedCommands[1].first /*CommandType*/, CommandType::KERNEL); + EXPECT_EQ(ExecutedCommands[1].second /*EventsCount*/, 0u); +}