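Fixed bugs (unset metric warning flag, unhandled invalid VREF rows). Refactored module-level globals into explicit function parameters. Expanded the CLI help descriptions.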

AmeWenJ · AmeWenJ · commit 04c3550a1ae8 · 2025-08-28T16:09:10.000Z
diff --git a/silnlp/nmt/exp_summary.py b/silnlp/nmt/exp_summary.py
@@ -11,18 +11,19 @@
 from .config import get_mt_exp_dir
 
 chap_num = 0
-trained_books = []
-target_book = ""
-all_books = []
-metrics = []
-key_word = ""
 
 
-def read_data(file_path: str, data: dict, chapters: set) -> None:
+def read_group_results(
+    file_path: str,
+    target_book: str,
+    all_books: list[str],
+    metrics: list[str],
+    key_word: str,
+) -> tuple[dict[str, dict[int, list[str]]], set[int]]:
     global chap_num
-    global all_books
-    global key_word
 
+    data = {}
+    chapter_groups = set()
     for lang_pair in os.listdir(file_path):
         lang_pattern = re.compile(r"([\w-]+)\-([\w-]+)")
         if not lang_pattern.match(lang_pair):
@@ -33,24 +34,22 @@ def read_data(file_path: str, data: dict, chapters: set) -> None:
         pattern = re.compile(rf"^{re.escape(prefix)}_{key_word}_order_(\d+)_ch$")
 
         for groups in os.listdir(os.path.join(file_path, lang_pair)):
-            m = pattern.match(os.path.basename(groups))
-            if m:
+            if m := pattern.match(os.path.basename(groups)):
                 folder_path = os.path.join(file_path, lang_pair, os.path.basename(groups))
                 diff_pred_file = glob.glob(os.path.join(folder_path, "diff_predictions*"))
                 if diff_pred_file:
-                    r = extract_data(diff_pred_file[0])
+                    r = extract_diff_pred_data(diff_pred_file[0], metrics, target_book)
                     data[lang_pair][int(m.group(1))] = r
-                    chapters.add(int(m.group(1)))
-                    if int(m.group(1)) > chap_num:
-                        chap_num = int(m.group(1))
                 else:
+                    data[lang_pair][int(m.group(1))] = {}
                     print(folder_path + " has no diff_predictions file.")
+                chapter_groups.add(int(m.group(1)))
+                chap_num = max(chap_num, int(m.group(1)))
+    return data, chapter_groups
 
 
-def extract_data(filename: str, header_row=5) -> dict:
+def extract_diff_pred_data(filename: str, metrics: list[str], target_book: str, header_row=5) -> dict[int, list[str]]:
     global chap_num
-    global metrics
-    global target_book
 
     metrics = [m.lower() for m in metrics]
     try:
@@ -67,47 +66,49 @@ def extract_data(filename: str, header_row=5) -> dict:
     for _, row in df.iterrows():
         vref = row["vref"]
         m = re.match(r"(\d?[A-Z]{2,3}) (\d+)", str(vref))
+        if not m:
+            print(f"Invalid VREF format: {str(vref)}")
+            return {}
 
         book_name, chap = m.groups()
         if book_name != target_book:
             continue
 
-        if int(chap) > chap_num:
-            chap_num = int(chap)
-
+        chap_num = max(chap_num, int(chap))
         values = []
         for metric in metrics:
             if metric in row:
                 values.append(row[metric])
             else:
-                metric = True
+                metric_warning = True
                 values.append(None)
 
         result[int(chap)] = values
 
     if metric_warning:
-        print("Warning: {metric} is not calculated in {filename}")
+        print("Warning: {metric} was not calculated in {filename}")
 
     return result
 
 
-def flatten_dict(data: dict, chapters: list, baseline={}) -> list:
+def flatten_dict(data: dict, chapter_groups: list[int], metrics: list[str], baseline={}) -> list[str]:
     global chap_num
-    global metrics
 
     rows = []
     if len(data) > 0:
         for lang_pair in data:
             for chap in range(1, chap_num + 1):
                 row = [lang_pair, chap]
                 row.extend([None, None, None] * len(metrics) * len(data[lang_pair]))
-                row.extend([None] * len(chapters))
+                row.extend([None] * len(chapter_groups))
                 row.extend([None] * (1 + len(metrics)))
 
                 for res_chap in data[lang_pair]:
                     if chap in data[lang_pair][res_chap]:
                         for m in range(len(metrics)):
-                            index_m = 3 + 1 + len(metrics) + chapters.index(res_chap) * (len(metrics) * 3 + 1) + m * 3
+                            index_m = (
+                                3 + 1 + len(metrics) + chapter_groups.index(res_chap) * (len(metrics) * 3 + 1) + m * 3
+                            )
                             row[index_m] = data[lang_pair][res_chap][chap][m]
                 if len(baseline) > 0:
                     for m in range(len(metrics)):
@@ -126,16 +127,15 @@ def flatten_dict(data: dict, chapters: list, baseline={}) -> list:
     return rows
 
 
-def create_xlsx(rows: list, chapters: list, output_path: str) -> None:
+def create_xlsx(rows: list[str], chapter_groups: list[str], output_path: str, metrics: list[str]) -> None:
     global chap_num
-    global metrics
 
     wb = Workbook()
     ws = wb.active
 
     num_col = len(metrics) * 3 + 1
     groups = [("language pair", 1), ("Chapter", 1), ("Baseline", (1 + len(metrics)))]
-    for chap in chapters:
+    for chap in chapter_groups:
         groups.append((chap, num_col))
 
     col = 1
@@ -239,16 +239,28 @@ def create_xlsx(rows: list, chapters: list, output_path: str) -> None:
 # --trained-books MRK --target-book MAT --metrics chrf3 confidence --key-word conf --baseline Catapult_Reloaded/2nd_book/MRK
 def main() -> None:
     global chap_num
-    global trained_books
-    global target_book
-    global all_books
-    global metrics
-    global key_word
 
     parser = argparse.ArgumentParser(
-        description="Pull results. At least one --exp or --baseline needs to be specified."
+        description="Pulling results from a single experiment and/or multiple experiment groups."
+        "A valid experiment should have the following format:"
+        "baseline/lang_pair/exp_group/diff_predictions or baseline/lang_pair/diff_predictions for a single experiment"
+        "or "
+        "exp/lang_pair/exp_groups/diff_predictions for multiple experiment groups"
+        "More information in --exp and --baseline."
+        "Use --exp for multiple experiment groups and --baseline for a single experiment."
+        "At least one --exp or --baseline needs to be specified."
+    )
+    parser.add_argument(
+        "--exp",
+        type=str,
+        help="Experiment folder with progression results. "
+        "A valid experiment groups should have the following format:"
+        "exp/lang_pair/exp_groups/diff_predictions"
+        "where there should be at least one exp_groups that naming in the following format:"
+        "*book*+*book*_*key-word*_order_*number*_ch"
+        "where *book*+*book*... are the combination of all --trained-books with the last one being --target-book."
+        "More information in --key-word.",
     )
-    parser.add_argument("--exp", type=str, help="Experiment folder with progression results")
     parser.add_argument(
         "--trained-books", nargs="*", required=True, type=str.upper, help="Books that are trained in the exp"
     )
@@ -261,8 +273,25 @@ def main() -> None:
         type=str.lower,
         help="Metrics that will be analyzed with",
     )
-    parser.add_argument("--key-word", type=str, default="conf", help="Key word in the filename for the exp group")
-    parser.add_argument("--baseline", type=str, help="Baseline or non-progression result for the exp group")
+    parser.add_argument(
+        "--key-word",
+        type=str,
+        default="conf",
+        help="Key word in the filename for the exp group to distinguish between the experiment purpose."
+        "For example, in LUK+ACT_conf_order_12_ch, the key-word should be conf."
+        "Another example, in LUK+ACT_standard_order_12_ch, the key-word should be standard.",
+    )
+    parser.add_argument(
+        "--baseline",
+        type=str,
+        help="A non-progression folder for a single experiment."
+        "A valid single experiment should have the following format:"
+        "baseline/lang_pair/exp_group/diff_predictions where exp_group will be in the following format:"
+        "*book*+*book*... as the combination of all --trained-books."
+        "or"
+        "baseline/lang_pair/diff_predictions "
+        "where the information of --trained-books should have already been indicated in the baseline name.",
+    )
     args = parser.parse_args()
 
     if not (args.exp or args.baseline):
@@ -274,46 +303,46 @@ def main() -> None:
     metrics = args.metrics
     key_word = args.key_word
 
-    exp1_name = args.exp
-    exp1_dir = get_mt_exp_dir(exp1_name) if exp1_name else None
+    multi_group_exp_name = args.exp
+    multi_group_exp_dir = get_mt_exp_dir(multi_group_exp_name) if multi_group_exp_name else None
 
-    exp2_name = args.baseline
-    exp2_dir = get_mt_exp_dir(exp2_name) if exp2_name else None
+    single_group_exp_name = args.baseline
+    single_group_exp_dir = get_mt_exp_dir(single_group_exp_name) if single_group_exp_name else None
 
-    folder_name = "+".join(all_books)
-    result_dir = exp1_dir if exp1_dir else exp2_dir
+    result_file_name = "+".join(all_books)
+    result_dir = multi_group_exp_dir if multi_group_exp_dir else single_group_exp_dir
     os.makedirs(os.path.join(result_dir, "a_result_folder"), exist_ok=True)
-    output_path = os.path.join(result_dir, "a_result_folder", f"{folder_name}.xlsx")
+    output_path = os.path.join(result_dir, "a_result_folder", f"{result_file_name}.xlsx")
 
     data = {}
-    chapters = set()
-    if exp1_dir:
-        read_data(exp1_dir, data, chapters)
-        chapters = sorted(chapters)
+    chapter_groups = set()
+    if multi_group_exp_dir:
+        data, chapter_groups = read_group_results(multi_group_exp_dir, target_book, all_books, metrics, key_word)
+        chapter_groups = sorted(chapter_groups)
 
     baseline_data = {}
-    if exp2_dir:
-        for lang_pair in os.listdir(exp2_dir):
+    if single_group_exp_dir:
+        for lang_pair in os.listdir(single_group_exp_dir):
             lang_pattern = re.compile(r"([\w-]+)\-([\w-]+)")
             if not lang_pattern.match(lang_pair):
                 continue
 
-            baseline_path = os.path.join(exp2_dir, lang_pair)
+            baseline_path = os.path.join(single_group_exp_dir, lang_pair)
             baseline_diff_pred = glob.glob(os.path.join(baseline_path, "diff_predictions*"))
             if baseline_diff_pred:
-                baseline_data[lang_pair] = extract_data(baseline_diff_pred[0])
+                baseline_data[lang_pair] = extract_diff_pred_data(baseline_diff_pred[0], metrics, target_book)
             else:
                 print(f"Checking experiments under {baseline_path}...")
                 sub_baseline_path = os.path.join(baseline_path, "+".join(trained_books))
                 baseline_diff_pred = glob.glob(os.path.join(sub_baseline_path, "diff_predictions*"))
                 if baseline_diff_pred:
-                    baseline_data[lang_pair] = extract_data(baseline_diff_pred[0])
+                    baseline_data[lang_pair] = extract_diff_pred_data(baseline_diff_pred[0], metrics, target_book)
                 else:
                     print(f"Baseline experiment has no diff_predictions file in {sub_baseline_path}")
 
     print("Writing data...")
-    rows = flatten_dict(data, chapters, baseline=baseline_data)
-    create_xlsx(rows, chapters, output_path)
+    rows = flatten_dict(data, chapter_groups, metrics, baseline=baseline_data)
+    create_xlsx(rows, chapter_groups, output_path, metrics)
     print(f"Result is in {output_path}")