From d224c923fe1dc033dd467ed6cce5a446f6e5db1e Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Mislav=20Marohni=C4=87?= Date: Mon, 16 Jun 2025 17:45:25 +0200 Subject: [PATCH] Optimize logging HTML nodes by skipping costly compute when Debug is off By default, the `Parser.Debug` property is false and writing to the logger is disabled. However, invocations of `Parser.logf()` would still happen regardless of the Debug property, and their arguments included the results of `dom.OuterHTML()` invocations. OuterHTML is costly because it serializes the entire DOM into a string. For large HTML documents, this would cause unnecessary overhead in the default mode of operation, where logging is disabled. My benchmark environment: goos: darwin goarch: arm64 cpu: Apple M1 Benchmark results for parsing `test-pages/wikipedia-2/source.html`: | times | speed | memory | allocations -------+-------+------------------+-----------------+------------------ before | 25 | 46,007,988 ns/op | 73,742,416 B/op | 201,175 allocs/op after | 30 | 38,579,637 ns/op | 59,611,452 B/op | 199,875 allocs/op The reduction for a very large HTML document was significant, with no change to readability functionality. --- parser.go | 19 ++++++++++++++++--- 1 file changed, 16 insertions(+), 3 deletions(-) diff --git a/parser.go b/parser.go index 6e9ad45..8a8b950 100644 --- a/parser.go +++ b/parser.go @@ -976,7 +976,7 @@ func (ps *Parser) grabArticle() *html.Node { for i := 0; i < len(candidates); i++ { candidate := candidates[i] candidateScore := ps.getContentScore(candidate) * (1 - ps.getLinkDensity(candidate)) - ps.logf("candidate %q with score: %f\n", dom.OuterHTML(candidate), candidateScore) + ps.logf("candidate %q with score: %f\n", inspectNode(candidate), candidateScore) ps.setContentScore(candidate, candidateScore) } @@ -1009,7 +1009,7 @@ func (ps *Parser) grabArticle() *html.Node { // Move everything (not just elements, also text nodes etc.) 
// into the container so we even include text directly in the body: for page.FirstChild != nil { - ps.logf("moving child out: %q\n", dom.OuterHTML(page.FirstChild)) + ps.logf("moving child out: %q\n", inspectNode(page.FirstChild)) dom.AppendChild(topCandidate, page.FirstChild) } @@ -2124,7 +2124,7 @@ func (ps *Parser) cleanHeaders(e *html.Node) { ps.removeNodes(headingNodes, func(node *html.Node) bool { // Removing header with low class weight if ps.getClassWeight(node) < 0 { - ps.logf("removing header with low class weight: %q\n", dom.OuterHTML(node)) + ps.logf("removing header with low class weight: %q\n", inspectNode(node)) return true } return false @@ -2304,6 +2304,19 @@ func (ps *Parser) logf(format string, args ...interface{}) { } } +// inspectNode wraps an HTML node for use with printf-style functions; the node is only serialized when String() is called. +func inspectNode(node *html.Node) fmt.Stringer { + return &inspectedNode{node} +} + +type inspectedNode struct { + node *html.Node +} + +func (n *inspectedNode) String() string { + return dom.OuterHTML(n.node) +} + // UNUSED CODES // Codes below these points are defined in original Readability.js but not used, // so here we commented it out so it can be used later if necessary.