Skip to content

Commit af30eb7

Browse files
committed
new: Use frames URLs when building tree
1 parent 3320ef1 commit af30eb7

File tree

1 file changed

+51
-2
lines changed

1 file changed

+51
-2
lines changed

har2tree/har2tree.py

Lines changed: 51 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -14,6 +14,7 @@
1414
from operator import itemgetter
1515
from pathlib import Path
1616
from typing import Any, TypedDict
17+
from collections.abc import Iterator
1718
from collections.abc import Callable
1819
from urllib.parse import unquote_plus, urlparse
1920

@@ -443,10 +444,11 @@ def _load_iframes(self, current: URLNode, frames: FramesResponse) -> None:
443444
or frames['url'].startswith('blob') # blobs aren't URLs
444445
)):
445446
u = unquote_plus(frames['url'])
447+
possile_child_name = [u, u.split('#', 1)[0]]
446448
# this url should be in a node directly attached to that one
447449
# we need to find that node
448450
for child in current.traverse():
449-
if child.name in [u, u.split('#', 1)[0]]:
451+
if child.name in possile_child_name:
450452
self.logger.debug(f'Found URL "{u}".')
451453
# Found the node, adding the content
452454
if not hasattr(child, 'rendered_frame'):
@@ -461,7 +463,9 @@ def _load_iframes(self, current: URLNode, frames: FramesResponse) -> None:
461463
break
462464
else:
463465
# Couldn't find the node Oo
464-
self.logger.warning(f'Unable to find "{u}" in the children of "{current.name}"')
466+
to_print = ', '.join(possile_child_name)
467+
children_to_print = ', '.join([child.name for child in current.traverse()])
468+
self.logger.warning(f'Unable to find "{to_print}" in the children of "{current.name}" - {children_to_print}')
465469
else:
466470
self.logger.debug(f'"{current.name}" contains an iFrame.')
467471
# No URL, this frame is directly in the parent frame.
@@ -813,6 +817,33 @@ def _make_subtree_fallback(self, node: URLNode, dev_debug: bool=False) -> None:
813817
# no way to attach it to anything else, attach to the root node
814818
self._make_subtree(self.url_tree, [node], fallback=True)
815819

820+
def all_real_urls_in_children(self, frame: FramesResponse) -> Iterator[str]:
    """Yield the first "real" URL found on each branch of *frame*.

    A URL is considered real when it is non-empty, is not ``about:blank``
    and does not start with ``data`` or ``blob``.

    If *frame* itself carries a real URL, that URL is yielded and the
    descent stops there: its own children are not inspected. Otherwise
    every child is searched recursively with the same rule, so several
    URLs may be yielded overall (at most one per branch).

    :param frame: a frame entry; only its ``url`` and ``children`` keys are read.
    :return: an iterator over the URLs exactly as stored in the frames (not unquoted).
    """
    url = frame.get('url')
    # about:blank isn't loading anything (same as empty), data is base64
    # encoded content, and blobs aren't URLs.
    if url and url != 'about:blank' and not url.startswith(('data', 'blob')):
        yield url
        return

    # Got no real URL on this frame, try the children (missing, None and
    # empty are all treated as "no children").
    for child in frame.get('children') or []:
        yield from self.all_real_urls_in_children(child)
832+
833+
def search_in_frames(self, urls: set[str], frame: FramesResponse) -> Iterator[str]:
    """Yield the potential iframe URLs loaded by any frame whose URL is in *urls*.

    The frame tree is walked from *frame* downwards. For every frame that
    has children and whose unquoted URL (with or without its fragment) is
    in *urls*, the real URLs found below each child are yielded (see
    ``all_real_urls_in_children``). The walk then continues into the
    children, so matches nested deeper in the tree are reported as well.

    :param urls: the URLs to look for among the frames.
    :param frame: the frame entry to start from; only ``url`` and ``children`` are read.
    :return: an iterator over candidate URLs; the same URL can be yielded
             more than once if several frames match.
    """
    children = frame.get('children')
    # If the frame doesn't have children, there are no potential URLs to attach
    if not children:
        return

    frame_url = frame.get('url')
    if frame_url:
        u = unquote_plus(frame_url)
        if urls & {u, u.split('#', 1)[0]}:
            # Got a matching URL, get the potential iframes URLs
            for child in children:
                yield from self.all_real_urls_in_children(child)

    # Keep looking for matching frames further down, whether or not this one matched
    for child in children:
        yield from self.search_in_frames(urls, child)
846+
816847
@trace_make_subtree
817848
def _make_subtree(self, root: URLNode, nodes_to_attach: list[URLNode] | None=None,
818849
dev_debug: bool=False, fallback: bool=False) -> None:
@@ -877,6 +908,24 @@ def _make_subtree(self, root: URLNode, nodes_to_attach: list[URLNode] | None=Non
877908
if unode.empty_response:
878909
continue
879910

911+
# 2025-11-14
912+
# the referer of an iframe is the hostname of the parent, even if the parent
913+
# is a URL with a full path. Before using the referer, we need to check if we have
914+
# the current url in the frame tree. If we do, find nodes (in the remaining list)
915+
# with the URLs of the children - any fragment will be missing - and attach that node
916+
possible_iframe_urls = {unode.name, unode.name.split('#', 1)[0]}
917+
for possible_url in self.search_in_frames(possible_iframe_urls, self.har.frames):
918+
cu = unquote_plus(possible_url)
919+
for u in {cu, cu.split('#', 1)[0]}:
920+
if u not in self.all_url_requests:
921+
continue
922+
matching_urls = [url_node for url_node in self.all_url_requests[u]
923+
if url_node in self._nodes_list]
924+
self._nodes_list = [node for node in self._nodes_list if node not in matching_urls]
925+
if dev_debug:
926+
self.logger.warning(f'Found via initiator from {unode.name} to {matching_urls}.')
927+
self._make_subtree(unode, matching_urls)
928+
880929
# The node can have a redirect, but also trigger resources referring to themselves, we need to trigger this code on each node.
881930
if self.all_initiator_url.get(unode.name):
882931
# The URL (unode.name) is in the list of known urls initiating calls

0 commit comments

Comments
 (0)