Commit 4c5459c
Author: Winter Deng

Classify flags into 'general', 'linear', and 'nn' categories, and reorder some flags in main.py

1 parent c033a52 · commit 4c5459c

File tree

3 files changed: +242 −68 lines

docs/cli/classifier.py

Lines changed: 137 additions & 0 deletions
@@ -0,0 +1,137 @@
import os
import sys
import glob
import re
from pathlib import Path
from collections import defaultdict

current_dir = os.path.dirname(os.path.abspath(__file__))
lib_path = os.path.abspath(os.path.join(current_dir, "..", ".."))
sys.path.insert(0, lib_path)


def classify_file_category(path):
    """Map a source file to the 'linear', 'nn', or 'general' category by its path."""
    relative_path = Path(path).relative_to(lib_path)
    return_path = relative_path.as_posix()
    filename = Path(*relative_path.parts[1:]).as_posix() if len(relative_path.parts) > 1 else return_path

    if filename.startswith("linear"):
        category = "linear"
    elif filename.startswith("torch") or filename.startswith("nn"):
        category = "nn"
    else:
        category = "general"
    return category, return_path


def fetch_option_flags(flags):
    # flags = genflags.parser.flags
    flag_list = []

    for flag in flags:
        flag_list.append(
            {
                "name": flag["name"].replace("\\", ""),
                "instruction": flag["name"].split("-")[-1],
                "description": flag["description"]
            }
        )

    return flag_list


def fetch_all_files():
    main_files = [
        os.path.join(lib_path, "linear_trainer.py"),
        os.path.join(lib_path, "torch_trainer.py")
    ]
    lib_files = glob.glob(os.path.join(lib_path, "libmultilabel/**/*.py"), recursive=True)
    file_set = set(map(os.path.abspath, main_files + lib_files))
    return file_set


def find_config_usages_in_file(file_path, allowed_keys):
    # Match attribute accesses such as `config.seed` and capture the key name.
    pattern = re.compile(r'\bconfig\.([a-zA-Z_][a-zA-Z0-9_]*)')
    detailed_results = {}
    try:
        with open(file_path, "r", encoding="utf-8") as f:
            lines = f.readlines()
    except (IOError, UnicodeDecodeError):
        return {}  # empty mapping, consistent with the normal return type

    category, path = classify_file_category(file_path)

    for i, line in enumerate(lines, start=1):
        matches = pattern.findall(line)
        for key in matches:
            if key in allowed_keys:
                if key not in detailed_results:
                    detailed_results[key] = {"file": path, "lines": []}
                detailed_results[key]["lines"].append(str(i))

    return detailed_results


def move_duplicates_together(data, keep):
    # Keys that appear in more than one category are moved into `keep`.
    all_keys = list(data.keys())
    duplicates = set()

    for i, key1 in enumerate(all_keys):
        for key2 in all_keys[i+1:]:
            duplicates |= data[key1] & data[key2]

    data[keep] |= duplicates

    for key in all_keys:
        if key != keep:
            data[key] -= duplicates

    return data


def classify(raw_flags):
    category_set = {"general": set(), "linear": set(), "nn": set()}
    flags = fetch_option_flags(raw_flags)
    allowed_keys = set(flag["instruction"] for flag in flags)
    file_set = fetch_all_files()
    usage_map = defaultdict(set)
    collected = {}

    for file_path in file_set:
        detailed_results = find_config_usages_in_file(file_path, allowed_keys)
        if detailed_results:
            usage_map[file_path] = set(detailed_results.keys())
            for k, v in detailed_results.items():
                if k not in collected:
                    collected[k] = []
                collected[k].append(v)

    for path, keys in usage_map.items():
        category, path = classify_file_category(path)
        category_set[category] = category_set[category].union(keys)

    # Flags used by both trainers belong in the shared "general" category.
    category_set = move_duplicates_together(category_set, "general")

    for flag in flags:
        for k, v in category_set.items():
            for i in v:
                if flag["instruction"] == i:
                    flag["category"] = k
        if "category" not in flag:
            flag["category"] = "general"

    result = {}
    for flag in flags:
        if flag["category"] not in result:
            result[flag["category"]] = []
        result[flag["category"]].append({"name": flag["name"].replace("--", r"\-\-"), "description": flag["description"]})

    result["details"] = []
    for k, v in collected.items():
        result["details"].append({"name": k, "file": v[0]["file"], "location": ", ".join(v[0]["lines"])})
        if len(v) > 1:
            for i in v[1:]:
                result["details"].append({"name": "", "file": i["file"], "location": ", ".join(i["lines"])})

    return result
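
For reference, a minimal usage sketch of the new module (not part of this commit). It assumes a LibMultiLabel checkout so fetch_all_files() can find linear_trainer.py, torch_trainer.py, and libmultilabel/; the sample flag dicts mirror the shape FakeParser records in genflags.py:

# Hypothetical driver, run from docs/cli/ inside a LibMultiLabel checkout.
from classifier import classify

raw_flags = [
    {"name": "--linear", "description": "Train linear model"},
    {"name": "--seed", "description": "Random seed (default: %(default)s)"},
    {"name": "--epochs", "description": "The number of epochs to train (default: %(default)s)"},
]
classified = classify(raw_flags)
for category in ("general", "linear", "nn"):
    # Names come back with "--" escaped as "\-\-" for the reST docs.
    print(category, "->", [flag["name"] for flag in classified.get(category, [])])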

docs/cli/genflags.py

Lines changed: 37 additions & 13 deletions
@@ -2,8 +2,11 @@
 import os
 
 sys.path.insert(1, os.path.join(sys.path[0], "..", ".."))
+
 import main
 
+from classifier import classify
+
 
 class FakeParser(dict):
     def __init__(self):
@@ -29,21 +32,42 @@ def add_argument(
 parser.add_argument("-c", "--config", help="Path to configuration file")
 main.add_all_arguments(parser)
 
+classified = classify(parser.flags)
 
-def width(key):
-    return max(map(lambda f: len(f[key]), parser.flags))
+def width_title(key, title):
+    return max(map(lambda f: len(f[key]), classified[title]))
 
+def print_table(title, flags, intro):
+    print()
+    print(intro)
+    print()
 
-wn = width("name")
-wd = width("description")
+    wn = width_title("name", title)
+    wd = width_title("description", title)
 
-print(
-    """..
-    Do not modify this file. This file is generated by genflags.py.\n"""
+    print("=" * wn, "=" * wd)
+    print("Name".ljust(wn), "Description".ljust(wd))
+    print("=" * wn, "=" * wd)
+    for flag in flags:
+        print(flag["name"].ljust(wn), flag["description"].ljust(wd))
+    print("=" * wn, "=" * wd)
+    print()
+
+print_table(
+    "general",
+    classified["general"],
+    intro="**General options**:\n\
+Common configurations shared across both linear and neural network trainers."
+)
+print_table(
+    "linear",
+    classified["linear"],
+    intro="**Linear options**:\n\
+Configurations specific to linear trainer."
 )
-print("=" * wn, "=" * wd)
-print("Name".ljust(wn), "Description".ljust(wd))
-print("=" * wn, "=" * wd)
-for flag in parser.flags:
-    print(flag["name"].ljust(wn), flag["description"].ljust(wd))
-print("=" * wn, "=" * wd)
+print_table(
+    "nn",
+    classified["nn"],
+    intro="**Neural network options**:\n\
+Configurations specific to torch (neural networks) trainer."
+)
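
For illustration, one generated table would look roughly like this (hypothetical flag and widths; classify escapes "--" as "\-\-" so the names survive reStructuredText):

**General options**:
Common configurations shared across both linear and neural network trainers.

======== ==================================
Name     Description
======== ==================================
\-\-seed Random seed (default: %(default)s)
======== ==================================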

main.py

Lines changed: 68 additions & 55 deletions
@@ -11,21 +11,50 @@
 
 
 def add_all_arguments(parser):
-    # path / directory
+
     parser.add_argument(
-        "--result_dir", default="./runs", help="The directory to save checkpoints and logs (default: %(default)s)"
+        "-h",
+        "--help",
+        action="help",
+        help="Quickstart: https://www.csie.ntu.edu.tw/~cjlin/libmultilabel/cli/quickstart.html",
     )
 
+    parser.add_argument("--seed", type=int, help="Random seed (default: %(default)s)")
+
+    # choose model (linear / nn)
+    parser.add_argument("--linear", action="store_true", help="Train linear model")
+
+    # others
+    parser.add_argument("--cpu", action="store_true", help="Disable CUDA")
+    parser.add_argument("--silent", action="store_true", help="Enable silent mode")
+    parser.add_argument(
+        "--data_workers", type=int, default=4, help="Use multiple CPU cores for data pre-processing (default: %(default)s)"
+    )
+    parser.add_argument(
+        "--embed_cache_dir",
+        type=str,
+        help="For parameter search only: path to a directory for storing embeddings for multiple runs. (default: %(default)s)",
+    )
+    parser.add_argument(
+        "--eval", action="store_true", help="Only run evaluation on the test set (default: %(default)s)"
+    )
+    parser.add_argument("--checkpoint_path", help="The checkpoint to warm-up with (default: %(default)s)")
+
     # data
-    parser.add_argument("--data_name", default="unnamed_data", help="Dataset name (default: %(default)s)")
+    parser.add_argument(
+        "--data_name",
+        default="unnamed_data",
+        help="Dataset name for generating the output directory (default: %(default)s)",
+    )
     parser.add_argument("--training_file", help="Path to training data (default: %(default)s)")
     parser.add_argument("--val_file", help="Path to validation data (default: %(default)s)")
-    parser.add_argument("--test_file", help="Path to test data (default: %(default)s")
+    parser.add_argument("--test_file", help="Path to test data (default: %(default)s)")
+    parser.add_argument("--label_file", type=str, help="Path to a file holding all labels (default: %(default)s)")
     parser.add_argument(
         "--val_size",
         type=float,
         default=0.2,
-        help="Training-validation split: a ratio in [0, 1] or an integer for the size of the validation set (default: %(default)s).",
+        help="Training-validation split: a ratio in [0, 1] or an integer for the size of the validation set (default: %(default)s)",
     )
     parser.add_argument(
         "--min_vocab_freq",
@@ -67,8 +96,24 @@ def add_all_arguments(parser):
         help="Whether to add the special tokens for inputs of the transformer-based language model. (default: %(default)s)",
     )
 
+    # model
+    parser.add_argument("--model_name", default="unnamed_model", help="Model to be used (default: %(default)s)")
+    parser.add_argument(
+        "--init_weight", default="kaiming_uniform", help="Weight initialization to be used (default: %(default)s)"
+    )
+    parser.add_argument(
+        "--loss_function", default="binary_cross_entropy_with_logits", help="Loss function (default: %(default)s)"
+    )
+
+    # pretrained vocab / embeddings
+    parser.add_argument("--vocab_file", type=str, help="Path to a file holding vocabularies (default: %(default)s)")
+    parser.add_argument(
+        "--embed_file",
+        type=str,
+        help="Path to a file holding pre-trained embeddings or the name of the pretrained GloVe embedding (default: %(default)s)",
+    )
+
     # train
-    parser.add_argument("--seed", type=int, help="Random seed (default: %(default)s)")
     parser.add_argument(
         "--epochs", type=int, default=10000, help="The number of epochs to train (default: %(default)s)"
     )
@@ -109,15 +154,6 @@ def add_all_arguments(parser):
         help="Whether the embeddings of each word is normalized to a unit vector (default: %(default)s)",
     )
 
-    # model
-    parser.add_argument("--model_name", default="unnamed_model", help="Model to be used (default: %(default)s)")
-    parser.add_argument(
-        "--init_weight", default="kaiming_uniform", help="Weight initialization to be used (default: %(default)s)"
-    )
-    parser.add_argument(
-        "--loss_function", default="binary_cross_entropy_with_logits", help="Loss function (default: %(default)s)"
-    )
-
     # eval
     parser.add_argument(
         "--eval_batch_size", type=int, default=256, help="Size of evaluating batches (default: %(default)s)"
@@ -138,28 +174,6 @@ def add_all_arguments(parser):
         "--val_metric", default="P@1", help="The metric to select the best model for testing (default: %(default)s)"
     )
 
-    # pretrained vocab / embeddings
-    parser.add_argument("--vocab_file", type=str, help="Path to a file holding vocabuaries (default: %(default)s)")
-    parser.add_argument(
-        "--embed_file", type=str, help="Path to a file holding pre-trained embeddings or the name of the pretrained GloVe embedding (default: %(default)s)"
-    )
-    parser.add_argument("--label_file", type=str, help="Path to a file holding all labels (default: %(default)s)")
-
-    # log
-    parser.add_argument(
-        "--save_k_predictions",
-        type=int,
-        nargs="?",
-        const=100,
-        default=0,
-        help="Save top k predictions on test set. k=%(const)s if not specified. (default: %(default)s)",
-    )
-    parser.add_argument(
-        "--predict_out_path",
-        default="./predictions.txt",
-        help="Path to the output file holding label results (default: %(default)s)",
-    )
-
     # auto-test
     parser.add_argument(
         "--limit_train_batches",
@@ -180,24 +194,27 @@ def add_all_arguments(parser):
         help="Percentage of test dataset to use for auto-testing (default: %(default)s)",
    )
 
-    # others
-    parser.add_argument("--cpu", action="store_true", help="Disable CUDA")
-    parser.add_argument("--silent", action="store_true", help="Enable silent mode")
+    # log
     parser.add_argument(
-        "--data_workers", type=int, default=4, help="Use multi-cpu core for data pre-processing (default: %(default)s)"
+        "--save_k_predictions",
+        type=int,
+        nargs="?",
+        const=100,
+        default=0,
+        help="Save top k predictions on test set. k=%(const)s if not specified. (default: %(default)s)",
     )
     parser.add_argument(
-        "--embed_cache_dir",
-        type=str,
-        help="For parameter search only: path to a directory for storing embeddings for multiple runs. (default: %(default)s)",
+        "--predict_out_path",
+        default="./predictions.txt",
+        help="Path to the output file holding label results (default: %(default)s)",
     )
+
+    # path / directory
     parser.add_argument(
-        "--eval", action="store_true", help="Only run evaluation on the test set (default: %(default)s)"
+        "--result_dir", default="./runs", help="The directory to save checkpoints and logs (default: %(default)s)"
     )
-    parser.add_argument("--checkpoint_path", help="The checkpoint to warm-up with (default: %(default)s)")
 
     # linear options
-    parser.add_argument("--linear", action="store_true", help="Train linear model")
     parser.add_argument(
         "--data_format",
         type=str,
@@ -224,7 +241,10 @@ def add_all_arguments(parser):
         "--tree_max_depth", type=int, default=10, help="Maximum depth of the tree (default: %(default)s)"
     )
     parser.add_argument(
-        "--tree_ensemble_models", type=int, default=1, help="Number of models in the tree ensemble (default: %(default)s)"
+        "--tree_ensemble_models",
+        type=int,
+        default=1,
+        help="Number of models in the tree ensemble (default: %(default)s)",
     )
     parser.add_argument(
         "--beam_width",
@@ -239,13 +259,6 @@ def add_all_arguments(parser):
         default=8,
         help="the maximal number of labels inside a cluster (default: %(default)s)",
     )
-    parser.add_argument(
-        "-h",
-        "--help",
-        action="help",
-        help="If you are trying to specify network config such as dropout or activation or config of the learning rate scheduler, use a yaml file instead. "
-        "See example configs in example_config",
-    )
 
 
 def get_config():
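
One detail the diff relies on but does not show: argparse raises a conflict if "-h"/"--help" is registered while the parser's built-in help is active, so the manual registration at the top of add_all_arguments implies the parser is created with add_help=False. A minimal standalone sketch of that pattern (the parser construction here is an assumption, not code from this commit):

import argparse

# Assumed construction: built-in help suppressed so "-h"/"--help" can be
# registered manually as the first flag, as in add_all_arguments above.
parser = argparse.ArgumentParser(add_help=False)
parser.add_argument(
    "-h",
    "--help",
    action="help",
    help="Quickstart: https://www.csie.ntu.edu.tw/~cjlin/libmultilabel/cli/quickstart.html",
)
parser.add_argument("--seed", type=int, help="Random seed (default: %(default)s)")

args = parser.parse_args(["--seed", "42"])
print(args.seed)  # 42; "-h, --help" now appears first in the usage message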
