Skip to content

Commit 7f78712

Browse files
authored
Merge pull request #20307 from jackdelv/issue34809
HPCC-34809 Improve XRef status updates during long running scans
2 parents d9b8a50 + d006587 commit 7f78712

File tree

1 file changed

+110
-37
lines changed

1 file changed

+110
-37
lines changed

dali/sasha/saxref.cpp

Lines changed: 110 additions & 37 deletions
Original file line numberDiff line numberDiff line change
@@ -585,6 +585,57 @@ static void mergeDirPerPartDirs(cDirDesc *parent, cDirDesc *dir, const char *cur
585585
}
586586
}
587587

588+
constexpr int64_t oneSecondNS = 1000 * 1000 * 1000; // 1 second in nanoseconds
589+
constexpr int64_t oneHourNS = 60 * 60 * oneSecondNS; // 1 hour in nanoseconds
590+
class XRefPeriodicTimer : public PeriodicTimer
591+
{
592+
public:
593+
XRefPeriodicTimer() = default;
594+
XRefPeriodicTimer(unsigned seconds, bool suppressFirst, const char *_clustname)
595+
: clustname(_clustname) { reset(seconds, suppressFirst, clustname); }
596+
597+
unsigned calcElapsedMinutes() const
598+
{
599+
int64_t elapsedNS = cycle_to_nanosec(lastElapsedCycles - startCycles);
600+
return elapsedNS / oneSecondNS / 60;
601+
}
602+
603+
bool hasElapsed()
604+
{
605+
// MORE: Could make PeriodicTimer::hasElapsed thread safe and remove CriticalBlock
606+
CriticalBlock block(timerSect);
607+
return PeriodicTimer::hasElapsed();
608+
}
609+
610+
void reset(unsigned seconds, bool suppressFirst, const char *_clustname)
611+
{
612+
clustname = _clustname;
613+
PeriodicTimer::reset(seconds*1000, suppressFirst);
614+
startCycles = lastElapsedCycles;
615+
}
616+
617+
// Double the time period until it reaches 1 hour
618+
void updatePeriod()
619+
{
620+
int64_t timePeriodNS = cycle_to_nanosec(timePeriodCycles) + 1; // 1 second is lost converting to nanoseconds
621+
if (timePeriodNS < oneHourNS)
622+
{
623+
int64_t newTimePeriodNS = timePeriodNS >= oneHourNS / 2 ? oneHourNS : timePeriodNS * 2;
624+
timePeriodCycles = nanosec_to_cycle(newTimePeriodNS);
625+
626+
unsigned intervalMinutes = newTimePeriodNS / oneSecondNS / 60;
627+
if (clustname)
628+
DBGLOG(LOGPFX "[%s] Heartbeat interval increased to %u minutes", clustname, intervalMinutes);
629+
else
630+
DBGLOG(LOGPFX "Heartbeat interval increased to %u minutes", intervalMinutes);
631+
}
632+
}
633+
634+
private:
635+
CriticalSection timerSect;
636+
const char *clustname = nullptr; // Cluster name for logging
637+
cycle_t startCycles = 0;
638+
};
588639

589640
class CNewXRefManagerBase
590641
{
@@ -599,40 +650,30 @@ class CNewXRefManagerBase
599650
unsigned lastlog;
600651
unsigned sfnum;
601652
unsigned fnum;
653+
XRefPeriodicTimer heartbeatTimer;
654+
602655
Owned<IPropertyTree> foundbranch;
603656
Owned<IPropertyTree> lostbranch;
604657
Owned<IPropertyTree> orphansbranch;
605658
Owned<IPropertyTree> dirbranch;
606659

607-
void log(const char * format, ...) __attribute__((format(printf, 2, 3)))
660+
void log(bool forceStatusUpdate, const char * format, ...) __attribute__((format(printf, 3, 4)))
608661
{
609-
CriticalBlock block(logsect);
662+
StringBuffer line;
610663
va_list args;
611664
va_start(args, format);
612-
StringBuffer line;
613665
line.valist_appendf(format, args);
614666
va_end(args);
667+
615668
if (clustname.get())
616669
PROGLOG(LOGPFX "[%s] %s",clustname.get(),line.str());
617670
else
618671
PROGLOG(LOGPFX "%s",line.str());
619-
if (logconn) {
620-
logcache.set(line.str());
621-
updateStatus(false);
622-
}
623-
}
624672

625-
void statlog(const char * format, ...) __attribute__((format(printf, 2, 3)))
626-
{
627-
CriticalBlock block(logsect);
628-
va_list args;
629-
va_start(args, format);
630-
StringBuffer line;
631-
line.valist_appendf(format, args);
632-
va_end(args);
633673
if (logconn) {
674+
CriticalBlock block(logsect);
634675
logcache.set(line.str());
635-
updateStatus(false);
676+
updateStatus(forceStatusUpdate);
636677
}
637678
}
638679

@@ -684,6 +725,34 @@ class CNewXRefManagerBase
684725

685726
}
686727

728+
void startHeartbeat(const char * op)
729+
{
730+
heartbeatTimer.reset(60, true, clustname.get()); // 1 minute interval
731+
log(true, "%s heartbeat started (interval: 1 minute)", op);
732+
}
733+
734+
void checkHeartbeat(const char * op)
735+
{
736+
time_t now = time(NULL);
737+
if (!heartbeatTimer.hasElapsed())
738+
return;
739+
740+
unsigned elapsedMinutes = heartbeatTimer.calcElapsedMinutes();
741+
unsigned elapsedHours = elapsedMinutes / 60;
742+
unsigned remainingMinutes = elapsedMinutes % 60;
743+
744+
struct tm *utc_tm = gmtime(&now);
745+
char timestamp[32];
746+
strftime(timestamp, sizeof(timestamp), "%Y-%m-%d %H:%M:%S", utc_tm);
747+
748+
if (elapsedHours > 0)
749+
log(true, "%s - elapsed: %uh %um (%s UTC)", op, elapsedHours, remainingMinutes, timestamp);
750+
else
751+
log(true, "%s - elapsed: %um (%s UTC)", op, elapsedMinutes, timestamp);
752+
753+
heartbeatTimer.updatePeriod();
754+
}
755+
687756
void addBranch(IPropertyTree *root,const char *name,IPropertyTree *branch)
688757
{
689758
if (!branch)
@@ -716,13 +785,13 @@ class CNewXRefManagerBase
716785
xpath.insert(0,"/DFU/XREF/");
717786
logconn.setown(querySDS().connect(xpath.str(),myProcessSession(),0 ,INFINITE));
718787
}
719-
log("Starting");
788+
log(false, "Starting");
720789
}
721790

722791
void finish(bool aborted)
723792
{
724793
if (aborted)
725-
log("Aborted");
794+
log(false, "Aborted");
726795
logconn.clear(); // final message done by save to eclwatch
727796
}
728797

@@ -751,7 +820,7 @@ class CNewXRefManagerBase
751820
{
752821
if (abort)
753822
return;
754-
log("Saving information");
823+
log(false,"Saving information");
755824
Owned<IPropertyTree> croot = createPTree("Cluster");
756825
croot->setProp("@name",clustname);
757826
if (!rootdir.isEmpty())
@@ -825,7 +894,7 @@ class CNewXRefManager: public CNewXRefManagerBase
825894
lostbranch.setown(createPTree("Lost"));
826895
orphansbranch.setown(createPTree("Orphans"));
827896
dirbranch.setown(createPTree("Directories"));
828-
log("Max memory = %d MB", maxMb);
897+
log(false, "Max memory = %d MB", maxMb);
829898

830899
StringBuffer userName;
831900
serverConfig->getProp("@sashaUser", userName);
@@ -1037,6 +1106,7 @@ class CNewXRefManager: public CNewXRefManagerBase
10371106

10381107
bool scanDirectory(unsigned node,const SocketEndpoint &ep,StringBuffer &path, unsigned drv, cDirDesc *pdir, IFile *cachefile, unsigned level)
10391108
{
1109+
checkHeartbeat("Directory scan");
10401110
size32_t dsz = path.length();
10411111
if (pdir==NULL)
10421112
pdir = root;
@@ -1168,7 +1238,7 @@ class CNewXRefManager: public CNewXRefManagerBase
11681238
SocketEndpoint localEP;
11691239
localEP.setLocalHost(0);
11701240
addPathSepChar(path).append('d').append(i+1);
1171-
parent.log("Scanning %s directory %s",parent.storagePlane->queryProp("@name"),path.str());
1241+
parent.log(false,"Scanning %s directory %s",parent.storagePlane->queryProp("@name"),path.str());
11721242
if (!parent.scanDirectory(0,localEP,path,0,parent.root,NULL,1))
11731243
{
11741244
ok = false;
@@ -1178,7 +1248,7 @@ class CNewXRefManager: public CNewXRefManagerBase
11781248
else
11791249
{
11801250
SocketEndpoint ep = parent.rawgrp->queryNode(i).endpoint();
1181-
parent.log("Scanning %s directory %s",ep.getEndpointHostText(tmp).str(),path.str());
1251+
parent.log(false,"Scanning %s directory %s",ep.getEndpointHostText(tmp).str(),path.str());
11821252
if (!parent.scanDirectory(i,ep,path,0,NULL,NULL,0)) {
11831253
ok = false;
11841254
return;
@@ -1187,7 +1257,7 @@ class CNewXRefManager: public CNewXRefManagerBase
11871257
i = (i+r)%n;
11881258
setReplicateFilename(path,1);
11891259
ep = parent.rawgrp->queryNode(i).endpoint();
1190-
parent.log("Scanning %s directory %s",ep.getEndpointHostText(tmp.clear()).str(),path.str());
1260+
parent.log(false,"Scanning %s directory %s",ep.getEndpointHostText(tmp.clear()).str(),path.str());
11911261
if (!parent.scanDirectory(i,ep,path,1,NULL,NULL,0)) {
11921262
ok = false;
11931263
}
@@ -1203,11 +1273,12 @@ class CNewXRefManager: public CNewXRefManagerBase
12031273
numMaxThreads = numuniqnodes;
12041274
if (numThreads > numMaxThreads)
12051275
numThreads = numMaxThreads;
1276+
startHeartbeat("Directory scan"); // Initialize heartbeat mechanism
12061277
afor.For(numMaxThreads,numThreads,true,numThreads>1);
12071278
if (afor.ok)
1208-
log("Directory scan complete");
1279+
log(true,"Directory scan complete");
12091280
else
1210-
log("Errors occurred during scan");
1281+
log(true,"Errors occurred during scan");
12111282
return afor.ok;
12121283
}
12131284

@@ -1253,7 +1324,7 @@ class CNewXRefManager: public CNewXRefManagerBase
12531324
{
12541325
if (abort)
12551326
return;
1256-
parent.log("Process file %s",name.str());
1327+
parent.log(false,"Process file %s",name.str());
12571328
parent.fnum++;
12581329

12591330
Owned<IFileDescriptor> fdesc;
@@ -1354,7 +1425,7 @@ class CNewXRefManager: public CNewXRefManagerBase
13541425
} filescan(*this,abort);
13551426

13561427
filescan.scan();
1357-
log("File scan complete");
1428+
log(true,"File scan complete");
13581429

13591430
}
13601431

@@ -1688,6 +1759,7 @@ class CNewXRefManager: public CNewXRefManagerBase
16881759

16891760
void listOrphans(cDirDesc *d,StringBuffer &basedir,StringBuffer &scope,bool &abort,unsigned int recentCutoffDays)
16901761
{
1762+
checkHeartbeat("Orphan scan");
16911763
if (abort)
16921764
return;
16931765
if (!d) {
@@ -1755,23 +1827,24 @@ class CNewXRefManager: public CNewXRefManagerBase
17551827
void listOrphans(bool &abort,unsigned int recentCutoffDays)
17561828
{
17571829
// also does directories
1758-
log("Scanning for orphans");
1830+
log(true,"Scanning for orphans");
1831+
startHeartbeat("Orphan scan");
17591832
StringBuffer basedir;
17601833
StringBuffer scope;
17611834
listOrphans(NULL,basedir,scope,abort,recentCutoffDays);
17621835
if (abort)
17631836
return;
1764-
log("Orphan scan complete");
1837+
log(true,"Orphan scan complete");
17651838
sorteddirs.sort(compareDirs); // NB sort reverse
17661839
while (!abort&&sorteddirs.ordinality())
17671840
dirbranch->addPropTree("Directory",&sorteddirs.popGet());
1768-
log("Directories sorted");
1841+
log(true,"Directories sorted");
17691842
}
17701843

17711844

17721845
void listLost(bool &abort,bool ignorelazylost,unsigned int recentCutoffDays)
17731846
{
1774-
log("Scanning for lost files");
1847+
log(true,"Scanning for lost files");
17751848
StringBuffer tmp;
17761849
ForEachItemIn(i0,lostfiles) {
17771850
if (abort)
@@ -1911,7 +1984,7 @@ class CNewXRefManager: public CNewXRefManagerBase
19111984
lostbranch->addPropTree("File",ft.getClear());
19121985
}
19131986
}
1914-
log("Lost scan complete");
1987+
log(true,"Lost scan complete");
19151988
}
19161989

19171990

@@ -2048,7 +2121,7 @@ class CSuperfileCheckManager: public CNewXRefManagerBase
20482121
void processSuperFile(IPropertyTree &file,StringBuffer &name)
20492122
{
20502123
parent.sfnum++;
2051-
parent.log("Scanning SuperFile %s",name.str());
2124+
parent.log(false,"Scanning SuperFile %s",name.str());
20522125
unsigned numsub = file.getPropInt("@numsubfiles");
20532126
unsigned n = 0;
20542127
Owned<IPropertyTreeIterator> iter = file.getElements("SubFile");
@@ -2104,7 +2177,7 @@ class CSuperfileCheckManager: public CNewXRefManagerBase
21042177

21052178
bool fix = false;
21062179

2107-
log("Crossreferencing %d SuperFiles",superowner.ordinality());
2180+
log(false,"Crossreferencing %d SuperFiles",superowner.ordinality());
21082181
ForEachItemIn(i1,superowner) {
21092182
const char *owner = superowner.item(i1);
21102183
const char *owned = superowned.item(i1);
@@ -2147,8 +2220,8 @@ class CSuperfileCheckManager: public CNewXRefManagerBase
21472220
}
21482221
}
21492222
}
2150-
}
2151-
log("Crossreferencing %d Files",fileowned.ordinality());
2223+
}
2224+
log(false,"Crossreferencing %d Files",fileowned.ordinality());
21522225
ForEachItemIn(i3,fileowned) {
21532226
const char *fowner = fileowner.item(i3);
21542227
const char *fowned = fileowned.item(i3);

0 commit comments

Comments
 (0)