Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Added Performance class for collecting task timing info. #3547

Closed
Closed
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
23 changes: 19 additions & 4 deletions include/picongpu/main.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -24,6 +24,7 @@
#include <pmacc/Environment.hpp>
#include <pmacc/types.hpp>

#include <pmacc/eventSystem/PerfInfo.hpp>
#include <picongpu/simulation_defines.hpp>

#include <cstdlib>
Expand Down Expand Up @@ -53,16 +54,23 @@ namespace
errorCode = EXIT_FAILURE;
break;
case ArgsParser::Status::success:
sim.load();
sim.start();
sim.unload();
{ pmacc::PerfTimed timer("Load", 0, 0);
sim.load();
}
{ pmacc::PerfTimed start("Start", 0, 0);
sim.start();
}
{ pmacc::PerfTimed timer("Unload", 0, 0);
sim.unload();
}
PMACC_FALLTHROUGH;
case ArgsParser::Status::successExit:
errorCode = 0;
break;
};

// finalize the pmacc context
pmacc::PerfTimed timer("Finalize", 0, 0);
pmacc::Environment<>::get().finalize();

return errorCode;
Expand All @@ -77,20 +85,27 @@ namespace
*/
int main(int argc, char** argv)
{
// Obtain the PerfInfo collector through the pmacc environment and switch
// recording on, so timers finished from here on are stored.
pmacc::PerfInfo& perf = pmacc::Environment<>::get().PerfInfo();
perf.on();

try
{
// NOTE(review): this hunk is shown without +/- markers; the next line appears
// to be the pre-change statement that the three lines after it replace — confirm.
return runSimulation(argc, argv);
auto ret = runSimulation(argc, argv);
// Print the collected timing summary to stdout, then hand back the
// simulation's exit code.
perf.show(std::cout);
return ret;
}
// A last-ditch effort to report exceptions to a user
catch(const std::exception& ex)
{
auto const typeName = std::string(typeid(ex).name());
std::cerr << "Unhandled exception of type '" + typeName + "' with message '" + ex.what() + "', terminating\n";
// Still print whatever timing data was gathered before the failure.
perf.show(std::cout);
return EXIT_FAILURE;
}
catch(...)
{
std::cerr << "Unhandled exception of unknown type, terminating\n";
// Still print whatever timing data was gathered before the failure.
perf.show(std::cout);
return EXIT_FAILURE;
}
}
9 changes: 9 additions & 0 deletions include/pmacc/Environment.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -231,6 +231,15 @@ namespace pmacc
return EventPool::getInstance();
}

/** Access the PerfInfo singleton that collects timing reports.
 *
 * The type is spelled with its namespace because this member function
 * carries the same name as the class it returns.
 *
 * @return reference to the single PerfInfo instance
 */
pmacc::PerfInfo& PerfInfo()
{
    return pmacc::PerfInfo::getInstance();
}

/** get the singleton ParticleFactory
*
* @return instance of ParticleFactory
Expand Down
1 change: 1 addition & 0 deletions include/pmacc/eventSystem/EventSystem.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -22,6 +22,7 @@
#pragma once

#include "pmacc/eventSystem/Manager.hpp"
#include "pmacc/eventSystem/PerfInfo.hpp"
#include "pmacc/eventSystem/tasks/StreamTask.hpp"
#include "pmacc/eventSystem/transactions/Transaction.hpp"
#include "pmacc/eventSystem/transactions/TransactionManager.hpp"
Expand Down
32 changes: 30 additions & 2 deletions include/pmacc/eventSystem/Manager.tpp
Original file line number Diff line number Diff line change
Expand Up @@ -24,6 +24,7 @@
#include "pmacc/eventSystem/streams/StreamController.hpp"
#include "pmacc/eventSystem/EventSystem.hpp"
#include "pmacc/eventSystem/Manager.hpp"
#include "pmacc/eventSystem/PerfInfo.hpp"
#include "pmacc/assert.hpp"

#include <cstdlib>
Expand All @@ -42,6 +43,31 @@ namespace pmacc
CUDA_CHECK_NO_EXCEPT(cuplaGetLastError());
}

/** Re-entrant function that calls task->execute() in sequence.
*
* 1. if "iter" is at end, rewind to beginning
* 2. repeatedly call execute() method of "iter" (= the next task in
* the `tasks` map) until hitting "end"
* ~> This creates recursion indirectly when tasks await one another.
* a. if the awaited task has completed:
* i. cleanup by deleting its taskPtr if necessary
* ii. if this task was taskToWait, move "iter" to end, then return "true"
* 3. If the end is reached, return "false"
*
* counter = number of calls to execute() since the last completed task
* deep = number of times execute() has been entered - 1
*
* Design notes: This system works well as a "reaper" to remove completed tasks.
* However, it cannot guarantee immediately returning "true" when called - even
* if the taskToWait has actually completed. A callback mechanism populating a list of
* completed tasks would work much better for that purpose.
*
* Potentially worthwhile callback mechanisms to investigate are
* a stack protected by a single lock
* or zeroMQ's "local" socket type
* or anything providing the equivalent of a golang channel
*
*/
inline bool Manager::execute(id_t taskToWait)
{
#ifdef DEBUG_EVENTS
Expand Down Expand Up @@ -73,8 +99,10 @@ namespace pmacc
if(counter == 500000)
std::cout << taskPtr->toString() << " " << passiveTasks.size() << std::endl;
#endif
if(taskPtr->execute())
if(taskPtr->execute()) // Looks like the only place in code calling taskPtr->execute().
{
taskPtr->perfInfo.stop();
Environment<>::get().PerfInfo().append(taskPtr->toString(), taskPtr->perfInfo);
/*test if task is deleted by other stackdeep*/
if(getActiveITaskIfNotFinished(id) == taskPtr)
{
Expand Down Expand Up @@ -135,7 +163,7 @@ namespace pmacc
return nullptr;
}

inline void Manager::waitForFinished(id_t taskId)
inline void Manager::waitForFinished(id_t taskId) // DMR: TODO
{
if(taskId == 0)
return;
Expand Down
203 changes: 203 additions & 0 deletions include/pmacc/eventSystem/PerfInfo.hpp
Original file line number Diff line number Diff line change
@@ -0,0 +1,203 @@
/* Copyright 2021 David M. Rogers
*
* This file is part of PMacc.
*
* PMacc is free software: you can redistribute it and/or modify
* it under the terms of either the GNU General Public License or
* the GNU Lesser General Public License as published by
* the Free Software Foundation, either version 3 of the License, or
* (at your option) any later version.
*
* PMacc is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
* GNU General Public License and the GNU Lesser General Public License
* for more details.
*
* You should have received a copy of the GNU General Public License
* and the GNU Lesser General Public License along with PMacc.
* If not, see <http://www.gnu.org/licenses/>.
*/

#pragma once

#include <stdint.h>
#include <iostream>
#include <vector>
#include <tuple>
#include <map>
#include <cmath>
#include <chrono>
#include <string>
#include "pmacc/Environment.def"

namespace pmacc
{
/* Internal helpers: clock aliases and a running-statistics accumulator. */
namespace
{
    /// Point in time as reported by the steady clock.
    using tpoint_t = std::chrono::time_point<std::chrono::steady_clock>;
    /// Duration expressed as a floating-point number of seconds.
    using secs = std::chrono::duration<double, std::ratio<1, 1>>;

    /** Read the steady clock.
     *
     * @return the current time point
     */
    inline tpoint_t walltime()
    {
        return std::chrono::steady_clock::now();
    }

    /** Time elapsed between two clock readings.
     *
     * @param t0 earlier time point
     * @param t1 later time point
     * @return t1 - t0 in seconds
     */
    inline double timeDelta(tpoint_t t0, tpoint_t t1)
    {
        const secs elapsed = t1 - t0;
        return elapsed.count();
    }

    /** Running accumulator over a sequence of samples x_i:
     *
     *   min = min_i x_i
     *   max = max_i x_i
     *   s   = \sum_i x_i
     *   v   = \sum_i (x_i - s/n)^2
     *   n   = \sum_i 1
     */
    struct PerfAvg
    {
        double min, max, s, v;
        int n;

        /** Start the accumulator from its first sample. */
        PerfAvg(double x) : min(x), max(x), s(x), v(0.0), n(1)
        {
        }

        /** Fold one more sample into the statistics.
         *
         * The sum of squared deviations is updated incrementally from the old
         * sum and count, see
         * https://manual.gromacs.org/2020/reference-manual/averages.html
         *
         * @param x new sample
         */
        void add(const double x)
        {
            min = (x < min) ? x : min;
            max = (x > max) ? x : max;
            // must use the sum and count from *before* this sample is added
            const double dev = s - n * x;
            v += dev * dev / (n * (n + 1.0));
            s += x;
            ++n;
        }
    };
}

/** Simple container for the bytes, flops, and start/end times of one code region.
 *
 * The clock starts when the object is constructed; stop() records the end
 * time exactly once.
 */
struct PerfData
{
    tpoint_t t0, t1; ///< start and end time; t1 equals t0 until the timer is stopped
    double bytes, flops; ///< caller-supplied annotations for the timed region

    /** Start timing a region.
     *
     * @param _bytes number of bytes attributed to the region
     * @param _flops number of floating-point operations attributed to the region
     */
    PerfData(double _bytes, double _flops) : t0(walltime()), t1(t0), bytes(_bytes), flops(_flops)
    {
    }

    /** Stop the clock if it has not been stopped already. */
    void stop()
    {
        if(!stopped())
        {
            t1 = walltime();
            // Remember the stop explicitly: if the region was shorter than one
            // clock tick, t1 still equals t0 and the time points alone cannot
            // tell "stopped" from "still running".
            halted = true;
        }
    }

    /** @return true once the timer has been stopped (or t1 was set by hand) */
    bool stopped() const
    {
        return halted || t1 != t0;
    }

    /** @return seconds elapsed between start and end time */
    double duration() const
    {
        return timeDelta(t0, t1);
    }

private:
    bool halted = false; ///< set by stop(); covers the zero-length measurement case
};

/** Singleton class collecting all PerfData reports.
* It's disabled by default.
*
* To use:
* 1. run (at code start) PerfInfo::getInstance().on();
* 2. construct PerfInfo section("section name", bytes, flops);
* within blocks you wish to time.
* 3. optionally (when done) run PerfInfo::getInstance().off();
* 4. print accumulated information using PerfInfo::getInstance().show(std::cout);
*
* Statistics (#calls, avg. time per call, etc.) are reported separately
* for each ("section name", bytes, flops) combination.
*
*/
class PerfInfo
{
public:
/** Performance Timers will save into collection when completed. (append is enabled)
*/
void on() { tracing = true; }

/** Performance Timers will not save into collection when completed. (append is disabled)
*/
void off() { tracing = false; }

/** Append the datum to the collection (under "section name" = label).
*/
void append(const std::string &label, const PerfData &datum) {
if(!tracing || !datum.stopped()) return;
auto it = events.find(label);
d2 key(datum.bytes, datum.flops);
if (it == events.end()) {
auto v = std::map<d2,PerfAvg>();
v.emplace(key, PerfAvg(datum.duration()));
events[label] = v;
} else {
auto &et = it->second;
auto ev = et.find(key);
if (ev == et.end()) {
et.emplace(key, PerfAvg(datum.duration()));
} else {
ev->second.add(datum.duration());
}
}
}

/** Print out a json-formatted summary of the PerfInfo data collected.
*/
void show(std::ostream &os) {
const char hdr1[] = "{ ";
const char hdr2[] = ", ";

const char *bhdr = hdr1;
for(auto et : events) { // all events for thread
os << bhdr << "\"" << et.first << "\" : [" << std::endl;
bhdr = hdr2;
int i = 0;
for(auto ev : et.second) { // all map values
if(i != 0)
std::cout << ",\n";
i = 1;
auto x = ev.second;
os << " { \"Bytes\": " << std::get<0>(ev.first) << std::endl
<< " , \"Flops\": " << std::get<1>(ev.first) << std::endl
<< " , \"Calls\": " << x.n << std::endl
<< " , \"Min\": " << x.min << std::endl
<< " , \"Max\": " << x.max << std::endl
<< " , \"Avg\": " << x.s/x.n << std::endl
<< " , \"Stddev\": " << std::sqrt(x.v/x.n) << " }";
}
os << "\n ]\n";
}
os << "}\n";
}

private:
friend struct detail::Environment;
friend class PerfTimed;

bool tracing = false;
using d2 = std::tuple<double,double>;
/** Data is gathered into name and then grouped by call size (bytes,flops)
*
* We could collapse these two into <name,bytes,flops> and then column-ize
* PerfAvg to make more concise output too...
*
* name : { <bytes,flops> : PerfAvg }
*/
std::map<std::string,std::map<d2,PerfAvg>> events;

PerfInfo() {}
PerfInfo(const PerfInfo &) = delete; // copy ctor
PerfInfo(PerfInfo&&) = delete; // move ctor
PerfInfo& operator=(const PerfInfo&) = delete; // assign ctor
PerfInfo& operator=(PerfInfo&) = delete;
static PerfInfo& getInstance()
{
static PerfInfo instance;
return instance;
}
};

/** Scoped class holding a name and a PerfData timer.
 * It's the user's responsibility to provide meaningful bytes and flops.
 * The timer starts in the constructor; the destructor stops it (unless
 * stop() was called earlier) and hands the measurement to PerfInfo.
 *
 * Objects are not copyable: every instance reports on destruction, so a
 * copy would record the same measurement twice.
 */
class PerfTimed
{
public:
    /** Start timing a section.
     *
     * @param _label section name the measurement is filed under
     * @param bytes number of bytes attributed to the section
     * @param flops number of floating-point operations attributed to the section
     */
    PerfTimed(const std::string& _label, double bytes, double flops) : label(_label), datum(bytes, flops)
    {
    }

    PerfTimed(const PerfTimed&) = delete;
    PerfTimed& operator=(const PerfTimed&) = delete;

    /** Stop the clock if still running and report the measurement. */
    ~PerfTimed()
    {
        datum.stop();
        PerfInfo::getInstance().append(label, datum);
    }

    /** Stop the underlying clock before the end of the scope. */
    void stop()
    {
        datum.stop();
    }

private:
    const std::string label;
    PerfData datum;
};
}
2 changes: 1 addition & 1 deletion include/pmacc/eventSystem/tasks/Factory.tpp
Original file line number Diff line number Diff line change
Expand Up @@ -189,7 +189,7 @@ namespace pmacc
}

/**
* Creates a new TaskGetCurrentSizeFromDevic.
* Creates a new TaskGetCurrentSizeFromDevice.
* @param buffer DeviceBuffer to get current size from
* @param registeringTask optional pointer to an ITask which should be registered at the new task as an observer
*/
Expand Down
9 changes: 7 additions & 2 deletions include/pmacc/eventSystem/tasks/ITask.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -26,11 +26,11 @@
#include "pmacc/eventSystem/events/IEvent.hpp"
#include "pmacc/types.hpp"
#include "pmacc/assert.hpp"
#include "pmacc/eventSystem/PerfInfo.hpp"

#include <string>
#include <set>


namespace pmacc
{
/**
Expand All @@ -49,16 +49,21 @@ namespace pmacc
TASK_HOST
};

/** Flops, Bytes, and timer for this task.
*/
PerfData perfInfo;

/**
* constructor
*/
ITask() : myType(ITask::TASK_UNKNOWN)
ITask() : myType(ITask::TASK_UNKNOWN), perfInfo(0, 0)
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

@j-stephan this is sexy to measure the lifetime of an object, that's maybe something for bactria too.

{
// task id 0 is reserved for invalid
static id_t globalId = 1;

myId = globalId++;
PMACC_ASSERT(myId > 0);

}


Expand Down