Skip to content

Commit a17b96e

Browse files
committed
fix: very slow tree generation
1 parent af30eb7 commit a17b96e

File tree

1 file changed

+10
-7
lines changed

1 file changed

+10
-7
lines changed

har2tree/har2tree.py

Lines changed: 10 additions & 7 deletions
Original file line numberDiff line numberDiff line change
@@ -333,8 +333,9 @@ def __init__(self, har_path: Path, capture_uuid: str):
333333
self.pages_root: dict[str, str] = {}
334334

335335
self.all_redirects: list[str] = []
336-
self.all_referer: dict[str, list[str]] = defaultdict(list)
337-
self.all_initiator_url: dict[str, list[str]] = defaultdict(list)
336+
# 2025-11-16: make values of referers and initiators sets because there will be duplicates
337+
self.all_referer: dict[str, set[str]] = defaultdict(set)
338+
self.all_initiator_url: dict[str, set[str]] = defaultdict(set)
338339
self._load_url_entries()
339340

340341
# Generate cookies lookup tables
@@ -441,14 +442,15 @@ def _load_iframes(self, current: URLNode, frames: FramesResponse) -> None:
441442
if (frames.get('url')
442443
and not (frames['url'] in ['about:blank'] # not loading anything, same as empty
443444
or frames['url'].startswith('data') # base64 encoded content
445+
or frames['url'].startswith('chrome-error') # not in the HAR/tree
444446
or frames['url'].startswith('blob') # blobs aren't URLs
445447
)):
446448
u = unquote_plus(frames['url'])
447-
possile_child_name = [u, u.split('#', 1)[0]]
449+
possible_child_name = {u, u.split('#', 1)[0]}
448450
# this url should be in a node directly attached to that one
449451
# we need to find that node
450452
for child in current.traverse():
451-
if child.name in possile_child_name:
453+
if child.name in possible_child_name:
452454
self.logger.debug(f'Found URL "{u}".')
453455
# Found the node, adding the content
454456
if not hasattr(child, 'rendered_frame'):
@@ -463,7 +465,7 @@ def _load_iframes(self, current: URLNode, frames: FramesResponse) -> None:
463465
break
464466
else:
465467
# Couldn't find the node Oo
466-
to_print = ', '.join(possile_child_name)
468+
to_print = ', '.join(possible_child_name)
467469
children_to_print = ', '.join([child.name for child in current.traverse()])
468470
self.logger.warning(f'Unable to find "{to_print}" in the children of "{current.name}" - {children_to_print}')
469471
else:
@@ -602,7 +604,7 @@ def _load_url_entries(self) -> None:
602604

603605
if hasattr(n, 'initiator_url'):
604606
# The HAR file was created by chrome/chromium and we got the _initiator key
605-
self.all_initiator_url[n.initiator_url].append(n.name)
607+
self.all_initiator_url[n.initiator_url].add(n.name)
606608

607609
if url_entry['startedDateTime'] in self.har.pages_start_times:
608610
for page in self.har.pages_start_times[url_entry['startedDateTime']]:
@@ -615,7 +617,7 @@ def _load_url_entries(self) -> None:
615617
if hasattr(n, 'referer') and i > 0:
616618
# NOTE 2021-05-14: referers to self are a real thing: url -> POST to self
617619
if n.name != n.referer or ('method' in n.request and n.request['method'] == 'POST'):
618-
self.all_referer[n.referer].append(n.name)
620+
self.all_referer[n.referer].add(n.name)
619621

620622
self._nodes_list.append(n)
621623
self.all_url_requests[n.name].append(n)
@@ -918,6 +920,7 @@ def _make_subtree(self, root: URLNode, nodes_to_attach: list[URLNode] | None=Non
918920
cu = unquote_plus(possible_url)
919921
for u in {cu, cu.split('#', 1)[0]}:
920922
if u not in self.all_url_requests:
923+
self.logger.info(f'"{u}" in the frames URLs, but not in the HAR.')
921924
continue
922925
matching_urls = [url_node for url_node in self.all_url_requests[u]
923926
if url_node in self._nodes_list]

0 commit comments

Comments
 (0)