Skip to content

Commit 5fa3043

Browse files
committed
Make recovery.py report time without failure detection.
Even though failure detection time is more deterministic now, it is easier to compare changes to the recovery routines if the two times are reported separately. As a result, all recovery trends now record recovery time excluding the time spent on failure detection.
1 parent 0d2a4cd commit 5fa3043

File tree

4 files changed

+28
-4
lines changed

4 files changed

+28
-4
lines changed

scripts/recoverymetrics.py

+10-2
Original file line numberDiff line numberDiff line change
@@ -375,9 +375,13 @@ def parseRecovery(recovery_dir):
375375

376376
data.client = AttrDict()
377377
for line in open(glob('%s/client*.*.log' % recovery_dir)[0]):
378-
m = re.search(r'\bRecovery completed in (\d+) ns\b', line)
378+
m = re.search(
379+
r'\bRecovery completed in (\d+) ns, failure detected in (\d+) ns\b',
380+
line)
379381
if m:
380-
data.client.recoveryNs = int(m.group(1))
382+
failureDetectionNs = int(m.group(2))
383+
data.client.recoveryNs = int(m.group(1)) - failureDetectionNs
384+
data.client.failureDetectionNs = failureDetectionNs
381385
return data
382386

383387
def rawSample(data):
@@ -408,6 +412,7 @@ def makeReport(data):
408412
servers = data.servers
409413

410414
recoveryTime = data.client.recoveryNs / 1e9
415+
failureDetectionTime = data.client.failureDetectionNs / 1e9
411416
report = Report()
412417

413418
# TODO(ongaro): Size distributions of filtered segments
@@ -436,6 +441,9 @@ def on_backups(fun, fail=0):
436441

437442
summary = report.add(Section('Summary'))
438443
summary.line('Recovery time', recoveryTime, 's')
444+
summary.line('Failure detection time', failureDetectionTime, 's')
445+
summary.line('Recovery + detection time',
446+
recoveryTime + failureDetectionTime, 's')
439447
summary.line('Masters', len(masters))
440448
summary.line('Backups', len(backups))
441449
summary.line('Total nodes', data.totalNodes)

src/CoordinatorService.cc

+1-1
Original file line numberDiff line numberDiff line change
@@ -364,7 +364,7 @@ CoordinatorService::hintServerDown(const HintServerDownRpc::Request& reqHdr,
364364
return;
365365
}
366366

367-
LOG(NOTICE, "Server failure detected: id %lu (\"%s\")",
367+
LOG(NOTICE, "Verified host failure, removing from cluster: id %lu (\"%s\")",
368368
*serverId, serviceLocator.c_str());
369369

370370
/*

src/ServerMetrics.cc

+16-1
Original file line numberDiff line numberDiff line change
@@ -39,10 +39,25 @@ ServerMetrics::ServerMetrics() : metrics()
3939
void
4040
ServerMetrics::load(Buffer& buffer)
4141
{
42-
ProtoBuf::MetricList list;
4342
uint32_t bufferLength = buffer.getTotalLength();
4443
string s(static_cast<const char*>(buffer.getRange(0, bufferLength)),
4544
bufferLength);
45+
load(s);
46+
}
47+
48+
/**
49+
* Incorporate a server's metrics data into this object. Existing
50+
* entries are not deleted, but may be overridden by new data.
51+
*
52+
* \param s
53+
* Contains metrics data formatted as a binary string using Protocol
54+
* Buffers in the form of a MetricList, such as the result of a
55+
* GET_METRICS RPC.
56+
*/
57+
void
58+
ServerMetrics::load(const string& s)
59+
{
60+
ProtoBuf::MetricList list;
4661
if (!list.ParseFromString(s)) {
4762
throw FormatError(HERE);
4863
}

src/ServerMetrics.h

+1
Original file line numberDiff line numberDiff line change
@@ -36,6 +36,7 @@ class ServerMetrics {
3636
ServerMetrics();
3737
~ServerMetrics();
3838
void load(Buffer& buffer);
39+
void load(const string& s);
3940
ServerMetrics difference(ServerMetrics& other);
4041

4142
// The following methods all delegate directly to the corresponding

0 commit comments

Comments
 (0)