@@ -333,8 +333,9 @@ def __init__(self, har_path: Path, capture_uuid: str):
333333 self .pages_root : dict [str , str ] = {}
334334
335335 self .all_redirects : list [str ] = []
336- self .all_referer : dict [str , list [str ]] = defaultdict (list )
337- self .all_initiator_url : dict [str , list [str ]] = defaultdict (list )
336+ # 2025-11-16: make values of referers and initiators sets because there will be duplicates
337+ self .all_referer : dict [str , set [str ]] = defaultdict (set )
338+ self .all_initiator_url : dict [str , set [str ]] = defaultdict (set )
338339 self ._load_url_entries ()
339340
340341 # Generate cookies lookup tables
@@ -441,14 +442,15 @@ def _load_iframes(self, current: URLNode, frames: FramesResponse) -> None:
441442 if (frames .get ('url' )
442443 and not (frames ['url' ] in ['about:blank' ] # not loading anything, same as empty
443444 or frames ['url' ].startswith ('data' ) # base64 encoded content
445+ or frames ['url' ].startswith ('chrome-error' ) # not in the HAR/tree
444446 or frames ['url' ].startswith ('blob' ) # blobs aren't URLs
445447 )):
446448 u = unquote_plus (frames ['url' ])
447- possile_child_name = [ u , u .split ('#' , 1 )[0 ]]
449+ possible_child_name = { u , u .split ('#' , 1 )[0 ]}
448450 # this url should be in a node directly attached to that one
449451 # we need to find that node
450452 for child in current .traverse ():
451- if child .name in possile_child_name :
453+ if child .name in possible_child_name :
452454 self .logger .debug (f'Found URL "{ u } ".' )
453455 # Found the node, adding the content
454456 if not hasattr (child , 'rendered_frame' ):
@@ -463,7 +465,7 @@ def _load_iframes(self, current: URLNode, frames: FramesResponse) -> None:
463465 break
464466 else :
465467 # Couldn't find the node Oo
466- to_print = ', ' .join (possile_child_name )
468+ to_print = ', ' .join (possible_child_name )
467469 children_to_print = ', ' .join ([child .name for child in current .traverse ()])
468470 self .logger .warning (f'Unable to find "{ to_print } " in the children of "{ current .name } " - { children_to_print } ' )
469471 else :
@@ -602,7 +604,7 @@ def _load_url_entries(self) -> None:
602604
603605 if hasattr (n , 'initiator_url' ):
604606 # The HAR file was created by chrome/chromium and we got the _initiator key
605- self .all_initiator_url [n .initiator_url ].append (n .name )
607+ self .all_initiator_url [n .initiator_url ].add (n .name )
606608
607609 if url_entry ['startedDateTime' ] in self .har .pages_start_times :
608610 for page in self .har .pages_start_times [url_entry ['startedDateTime' ]]:
@@ -615,7 +617,7 @@ def _load_url_entries(self) -> None:
615617 if hasattr (n , 'referer' ) and i > 0 :
616618 # NOTE 2021-05-14: referer to self are a real thing: url -> POST to self
617619 if n .name != n .referer or ('method' in n .request and n .request ['method' ] == 'POST' ):
618- self .all_referer [n .referer ].append (n .name )
620+ self .all_referer [n .referer ].add (n .name )
619621
620622 self ._nodes_list .append (n )
621623 self .all_url_requests [n .name ].append (n )
@@ -918,6 +920,7 @@ def _make_subtree(self, root: URLNode, nodes_to_attach: list[URLNode] | None=Non
918920 cu = unquote_plus (possible_url )
919921 for u in {cu , cu .split ('#' , 1 )[0 ]}:
920922 if u not in self .all_url_requests :
923+ self .logger .info (f'"{ u } " in the frames URLs, but not in the HAR.' )
921924 continue
922925 matching_urls = [url_node for url_node in self .all_url_requests [u ]
923926 if url_node in self ._nodes_list ]
0 commit comments