88
99#include " gloo/cuda_allreduce_bcube.h"
1010
11+ #include " gloo/common/log.h"
1112#include " gloo/cuda_collectives_device.h"
1213#include " gloo/cuda_collectives_host.h"
1314#include " gloo/cuda_private.h"
1415
1516#include < sstream>
17+ #include < string_view>
1618#ifndef _WIN32
1719#include < unistd.h>
1820#endif
@@ -226,35 +228,48 @@ bool CudaAllreduceBcube<T, W>::printCheck(int /* rank */) {
226228 return false ;
227229}
228230
229- template <typename T, typename W>
230- void CudaAllreduceBcube<T, W>::printBreak(T* p, int x) {
231- if (0 == x % wordsPerLine) {
232- std::cout << std::endl
233- << &p[x] << " " << std::setfill (' 0' ) << std::setw (5 ) << x << " : " ;
234- } else if (0 == x % wordsPerSection) {
235- std::cout << " - " ;
236- }
237- }
238-
239231template <typename T, typename W>
240232void CudaAllreduceBcube<T, W>::printElems(T* p, int count, int start) {
241- auto alignedStart = (start / wordsPerLine) * wordsPerLine;
242- for (int x = alignedStart; x < start + count; ++x) {
243- printBreak (p, x);
244- if (x < start) {
245- std::cout << " ..... " ;
246- } else {
247- std::cout << std::setfill (' 0' ) << std::setw (5 ) << p[x] << " " ;
233+ /* Early return if log level is not high enough, to prevent expensive code
234+ * running. */
235+ if (!spdlog::should_log (spdlog::level::trace))
236+ return ;
237+
238+ const std::size_t alignedStart = (start / wordsPerLine) * wordsPerLine;
239+ fmt::memory_buffer line{};
240+
241+ /* Logs/flushes the line buffer - starting a new line */
242+ auto printLine = [&]() {
243+ if (!line.size ())
244+ return ;
245+ std::string_view sv{line.data (), line.size ()};
246+ GLOO_TRACE (" {}" , sv);
247+ line.clear ();
248+ };
249+
250+ for (std::size_t x = alignedStart; x < start + count; ++x) {
251+ if (x % wordsPerLine == 0 ) {
252+ if (x != alignedStart)
253+ printLine ();
254+ fmt::format_to (
255+ std::back_inserter (line), " {} {:05}: " , fmt::ptr (&p[x]), x);
256+ } else if (x % wordsPerSection == 0 ) {
257+ fmt::format_to (std::back_inserter (line), " - " );
248258 }
259+
260+ if (x < start)
261+ fmt::format_to (std::back_inserter (line), " ..... " );
262+ else
263+ fmt::format_to (std::back_inserter (line), " {:05} " , p[x]);
249264 }
265+ printLine ();
250266}
251267
252268template <typename T, typename W>
253269void CudaAllreduceBcube<T, W>::printStageBuffer(const std::string& msg) {
254270 if (printCheck (myRank_)) {
255- std::cout << " rank (" << myRank_ << " ) " << msg << " : " ;
271+ GLOO_TRACE ( " rank ({}) {}: " , myRank_, msg) ;
256272 printElems (&scratch_[0 ], totalNumElems_);
257- std::cout << std::endl;
258273 }
259274}
260275
@@ -268,10 +283,13 @@ void CudaAllreduceBcube<T, W>::printStepBuffer(
268283 int count,
269284 int start) {
270285 if (printCheck (myRank_)) {
271- std::cout << stage << " : step (" << step << " ) " << " srcRank (" << srcRank
272- << " ) -> " << " destRank (" << destRank << " ): " ;
286+ GLOO_TRACE (
287+ " {}: step ({}) srcRank ({}) -> destRank ({}):" ,
288+ stage,
289+ step,
290+ srcRank,
291+ destRank);
273292 printElems (p, count, start);
274- std::cout << std::endl;
275293 }
276294}
277295
0 commit comments