From 4feb475c609937a464291eb6f64c34bc8b9855d9 Mon Sep 17 00:00:00 2001
From: zzjchen <chenzijie@westlake.edu.cn>
Date: Thu, 24 Oct 2024 16:26:21 +0800
Subject: [PATCH] use coord to help Element path exact match

---
 evaluate/evaluate_utils.py | 36 ++++++++++++++---------------
 evaluate/step_score.py     | 46 +++++++++++++++++++++-----------------
 2 files changed, 43 insertions(+), 39 deletions(-)

diff --git a/evaluate/evaluate_utils.py b/evaluate/evaluate_utils.py
index 195dfc3..0b7e205 100644
--- a/evaluate/evaluate_utils.py
+++ b/evaluate/evaluate_utils.py
@@ -118,7 +118,9 @@ def get_netloc(url: str) -> str:
     return netloc
 
 
-async def step_evaluate(page: Page, evaluate_steps=[], input_path=None, element_value=None, text_content=None):
+async def step_evaluate(page: Page, evaluate_steps=[], input_path=None, element_value=None, text_content=None,input_coords=None):
+    # input_coords should be (x,y) in pixels, if not None
+    # and will be used in ElementEvaluator.path_exact_match()
     """Evaluate step score"""
     step_score = 0
     match_result = []
@@ -135,15 +137,12 @@ async def step_evaluate(page: Page, evaluate_steps=[], input_path=None, element_
             elif match_function == "url_semantic_match":
                 score = await URLEvaluator.url_semantic_match(
                     page.url, evaluate["reference_answer"], evaluate["key"])
-                # print(score, "url_semantic_match")
             elif match_function == "element_path_exactly_match":
                 input_netloc = get_netloc(page.url)
                 method = evaluate["method"]
-                score = ElementEvaluator.path_exact_match(
-                    input_path, evaluate["reference_answer"], method, await page.content(), input_netloc,
-                    evaluate["netloc"])
-                # print(score, "path_exact_match:", input_path,
-                #       "***", evaluate["reference_answer"])
+                score = await ElementEvaluator.path_exact_match(
+                    input_path, evaluate["reference_answer"], method, page, input_netloc,
+                    evaluate["netloc"],input_coords=input_coords)
             elif match_function == "element_path_included_match":
                 pass
                 # * Temporarily not doing
@@ -155,9 +154,9 @@ async def step_evaluate(page: Page, evaluate_steps=[], input_path=None, element_
                     # print(element_value)
                     # print(await page.locator(input_path).input_value())
                     if "path" in evaluate.keys():
-                        path_score = ElementEvaluator.path_exact_match(input_path, evaluate["path"], "selector",
-                                                                       await page.content(), input_netloc,
-                                                                       evaluate["netloc"])
+                        path_score = await ElementEvaluator.path_exact_match(input_path, evaluate["path"], "selector",
+                                                                       page, input_netloc,
+                                                                       evaluate["netloc"],input_coords=input_coords)
                         if path_score == 0:
                             # print("Path mismatch in value evaluation")
                             score = 0
@@ -172,12 +171,12 @@ async def step_evaluate(page: Page, evaluate_steps=[], input_path=None, element_
                 else:
                     score = 0
             elif match_function == "element_value_included_match":
-                if input_path is not None and element_value is not None:
+                if (input_path is not None or input_coords is not None) and element_value is not None:
                     input_netloc = get_netloc(page.url)
                     if "path" in evaluate.keys():
-                        path_score = ElementEvaluator.path_exact_match(input_path, evaluate["path"], "selector",
-                                                                       await page.content(), input_netloc,
-                                                                       evaluate["netloc"])
+                        path_score = await ElementEvaluator.path_exact_match(input_path, evaluate["path"], "selector",
+                                                                       page, input_netloc,
+                                                                       evaluate["netloc"],input_coords=input_coords)
                         if path_score == 0:
                             # print("Path mismatch in value evaluation")
                             score = 0
@@ -192,14 +191,14 @@ async def step_evaluate(page: Page, evaluate_steps=[], input_path=None, element_
                 else:
                     score = 0
             elif match_function == "element_value_semantic_match":
-                if input_path is not None and element_value is not None:
+                if (input_path is not None or input_coords is not None) and element_value is not None:
                     input_netloc = get_netloc(page.url)
 
                     if len(element_value) > 0:
                         if "path" in evaluate.keys():
-                            path_score = ElementEvaluator.path_exact_match(input_path, evaluate["path"], "selector",
-                                                                           await page.content(), input_netloc,
-                                                                           evaluate["netloc"])
+                            path_score = await ElementEvaluator.path_exact_match(input_path, evaluate["path"], "selector",
+                                                                           page, input_netloc,
+                                                                           evaluate["netloc"],input_coords=input_coords)
                             if path_score == 0:
                                 # print("Path mismatch in value evaluation")
                                 score = 0
@@ -247,6 +246,7 @@ async def step_evaluate(page: Page, evaluate_steps=[], input_path=None, element_
     return evaluate_steps, match_result
 
 
+
 def parse_current_trace(response: dict, env: AsyncHTMLEnvironment, step_reward: dict):
     thought = response["description"].get("thought")
     action_type = response.get(
diff --git a/evaluate/step_score.py b/evaluate/step_score.py
index a20f77e..36f2414 100644
--- a/evaluate/step_score.py
+++ b/evaluate/step_score.py
@@ -77,7 +77,26 @@ async def url_semantic_match(input_url, semantic_method, key=False):
 class ElementEvaluator(StepEvaluator):
     '''Element evaluation and scoring'''
     @staticmethod
-    def path_exact_match(input_answer, reference_answer, method, html_content, input_netloc, reference_netloc):
+    async def is_same_element(page, input_coord, reference_element_handle):
+        """Return True if the pixel point input_coord=(x, y) lies inside the reference locator's bounding box."""
+        x, y = input_coord
+        # bounding_box() is a coroutine in async Playwright and must be awaited; it
+        # yields None for an invisible element. 2s is usually enough, allow 5s.
+        bounding_box = await reference_element_handle.bounding_box(timeout=5000)
+        if bounding_box:
+            left, top = bounding_box['x'], bounding_box['y']
+            right = left + bounding_box['width']
+            bottom = top + bounding_box['height']
+            # Edges are inclusive: a point exactly on the border still counts.
+            if left <= x <= right and top <= y <= bottom:
+                return True
+        return False
+
+    @staticmethod
+    async def path_exact_match(input_answer, reference_answer, method, page, input_netloc, reference_netloc,input_coords=None):
+        """Compare the input element against the reference element on `page`.
+        input_coords, if not None, is an (x, y) point in pixels; the "selector"
+        method hit-tests it against the reference element's bounding box."""
         score = 0
         if method == "xpath":
             if reference_netloc != input_netloc:
@@ -85,6 +104,7 @@ def path_exact_match(input_answer, reference_answer, method, html_content, input
                 #       "input_netloc:", input_netloc)
                 return 0
             try:
+                html_content = await page.content()
                 tree = html.fromstring(html_content)
                 input_elements = tree.xpath(input_answer)
                 reference_elements = tree.xpath(reference_answer)
@@ -105,32 +125,16 @@ def path_exact_match(input_answer, reference_answer, method, html_content, input
                     pass
             else:
                 score = 0
-        elif method == "selector":
+        elif method == "selector":  # hit-tests input_coords; input_answer is not used in this branch
             if reference_netloc != input_netloc:
                 # print("reference_netloc:", reference_netloc,
                 #       "input_netloc:", input_netloc)
                 return 0
             try:
-                soup = BeautifulSoup(html_content, 'html.parser')
-                input_element = soup.select_one(input_answer)
-                reference_element = soup.select_one(reference_answer)
+                input_element = input_coords  # the input "element" is an (x, y) point
+                reference_element = page.locator(reference_answer).first  # first match, like the old select_one()
                 if (input_element is not None) and (reference_element is not None):
-                    score = input_element is reference_element
-
-                    try:
-                        if reference_element.name in MapTagNameList:
-                            # parent_elements = reference_element.parent
-                            # score_parent = input_element is parent_elements
-                            # score = max(score, score_parent)
-                            trace_up_count = 0
-                            current_element = reference_element
-                            while trace_up_count < 3 and score == 0:
-                                trace_up_count += 1
-                                current_element = current_element.parent
-                                score_parent = input_element is current_element
-                                score = max(score, score_parent)
-                    except:
-                        pass
+                    score = await ElementEvaluator.is_same_element(page, input_coord=input_element, reference_element_handle=reference_element)
             except:
                 score = 0
         # result_score = MatchFunction.include_match(