1414from operator import itemgetter
1515from pathlib import Path
1616from typing import Any , TypedDict
17+ from collections .abc import Iterator
1718from collections .abc import Callable
1819from urllib .parse import unquote_plus , urlparse
1920
@@ -443,10 +444,11 @@ def _load_iframes(self, current: URLNode, frames: FramesResponse) -> None:
443444 or frames ['url' ].startswith ('blob' ) # blobs aren't URLs
444445 )):
445446 u = unquote_plus (frames ['url' ])
447+ possile_child_name = [u , u .split ('#' , 1 )[0 ]]
446448 # this url should be in a node directly attached to that one
447449 # we need to find that node
448450 for child in current .traverse ():
449- if child .name in [ u , u . split ( '#' , 1 )[ 0 ]] :
451+ if child .name in possile_child_name :
450452 self .logger .debug (f'Found URL "{ u } ".' )
451453 # Found the node, adding the content
452454 if not hasattr (child , 'rendered_frame' ):
@@ -461,7 +463,9 @@ def _load_iframes(self, current: URLNode, frames: FramesResponse) -> None:
461463 break
462464 else :
463465 # Couldn't find the node Oo
464- self .logger .warning (f'Unable to find "{ u } " in the children of "{ current .name } "' )
466+ to_print = ', ' .join (possile_child_name )
467+ children_to_print = ', ' .join ([child .name for child in current .traverse ()])
468+ self .logger .warning (f'Unable to find "{ to_print } " in the children of "{ current .name } " - { children_to_print } ' )
465469 else :
466470 self .logger .debug (f'"{ current .name } " contains an iFrame.' )
467471 # No URL, this frame is directly in the parent frame.
@@ -813,6 +817,33 @@ def _make_subtree_fallback(self, node: URLNode, dev_debug: bool=False) -> None:
813817 # no way to attach it to anything else, attach to the root node
814818 self ._make_subtree (self .url_tree , [node ], fallback = True )
815819
def all_real_urls_in_children(self, frame: FramesResponse) -> Iterator[str]:
    """Yield the first real URL found on each branch of a frame tree.

    If ``frame`` itself has a real URL, that URL is yielded and the children
    of ``frame`` are not visited. Otherwise every child is searched
    recursively, in order, applying the same rule to each branch.

    A URL is not considered real when it is missing or empty, equal to
    ``about:blank``, or starts with ``data`` or ``blob``.

    :param frame: a frame entry; only its ``url`` and ``children`` keys are read.
    :return: an iterator over the URLs found, exactly as stored in the frames.
    """
    url = frame.get('url')
    # about:blank: not loading anything, same as empty
    # data: base64 encoded content - blob: blobs aren't URLs
    if url and url != 'about:blank' and not url.startswith(('data', 'blob')):
        yield url
        return

    # got no real URL, try the children (a missing, None or empty list yields nothing)
    for child in frame.get('children') or ():
        yield from self.all_real_urls_in_children(child)
832+
def search_in_frames(self, urls: set[str], frame: FramesResponse) -> Iterator[str]:
    """Yield the candidate iframe URLs loaded by frames whose URL is in ``urls``.

    The frame tree is walked depth-first. For every frame that has children
    and whose URL (after ``unquote_plus``, with or without its fragment)
    is in ``urls``, the real URLs of its children are yielded using
    ``all_real_urls_in_children``. The walk then continues into the children
    regardless of whether the frame matched, so the same URL can be yielded
    more than once.

    :param urls: the URLs to look for in the frame tree.
    :param frame: the frame to start from; only ``url`` and ``children`` are read.
    :return: an iterator over the URLs found, exactly as stored in the frames.
    """
    children = frame.get('children')
    if not children:
        # If the frame doesn't have children, there are no potential URLs to attach
        return

    url = frame.get('url')
    if url:
        unquoted = unquote_plus(url)
        if urls & {unquoted, unquoted.split('#', 1)[0]}:
            # got a matching URL, get list of potential iframes urls
            for child in children:
                yield from self.all_real_urls_in_children(child)

    # keep looking deeper in the tree, whether this frame matched or not
    for child in children:
        yield from self.search_in_frames(urls, child)
846+
816847 @trace_make_subtree
817848 def _make_subtree (self , root : URLNode , nodes_to_attach : list [URLNode ] | None = None ,
818849 dev_debug : bool = False , fallback : bool = False ) -> None :
@@ -877,6 +908,24 @@ def _make_subtree(self, root: URLNode, nodes_to_attach: list[URLNode] | None=Non
877908 if unode .empty_response :
878909 continue
879910
911+ # 2025-11-14
912+ # the referer of an iframe is the hostname of the parent, even if the parent
913+ # is a URL with a full path. Before using the referer, we need to check if we have
914+ # the current url in the frame tree. If we do, find nodes (in the remaining list)
915+ # with the URLs of the children - any fragment will be missing - and attach that node
916+ possible_iframe_urls = {unode .name , unode .name .split ('#' , 1 )[0 ]}
917+ for possible_url in self .search_in_frames (possible_iframe_urls , self .har .frames ):
918+ cu = unquote_plus (possible_url )
919+ for u in {cu , cu .split ('#' , 1 )[0 ]}:
920+ if u not in self .all_url_requests :
921+ continue
922+ matching_urls = [url_node for url_node in self .all_url_requests [u ]
923+ if url_node in self ._nodes_list ]
924+ self ._nodes_list = [node for node in self ._nodes_list if node not in matching_urls ]
925+ if dev_debug :
926+ self .logger .warning (f'Found via initiator from { unode .name } to { matching_urls } .' )
927+ self ._make_subtree (unode , matching_urls )
928+
880929 # The node can have a redirect, but also trigger resources referring to themselves; we need to trigger this code on each node.
881930 if self .all_initiator_url .get (unode .name ):
882931 # The URL (unode.name) is in the list of known urls initiating calls
0 commit comments