WAT-ai · hpatel0816 · Sep 27, 2025 · Sep 27, 2025 · Sep 27, 2025
diff --git a/.gitignore b/.gitignore
@@ -4,3 +4,5 @@ venv/
 *.DS_STORE
 sklearn-env/
 .ruff_cache/
+.env
+artifacts/
diff --git a/Makefile b/Makefile
@@ -0,0 +1,17 @@
+.PHONY: up down mlflow-ui prefect-ui install
+
+up:
+	docker compose up -d mlflow prefect
+
+down:
+	docker compose down
+
+mlflow-ui:
+	@echo "MLflow -> http://localhost:5050"
+
+prefect-ui:
+	@echo "Prefect -> http://localhost:4200"
+
+install:
+	python -m pip install -U pip
+	python -m pip install -r requirements.txt
diff --git a/config/defaults.yaml b/config/defaults.yaml
@@ -0,0 +1,53 @@
+experiment_name: bbox  # change to "bbox" | "seg" as needed
+
+mlflow:
+  tracking_uri: "http://localhost:5050"
+  experiment: "${experiment_name}"
+
+data:
+  raw_root: "data/raw_data/STARCOP_train_easy"
+  processed_root: "artifacts/datasets"
+  shard_format: "hdf5"           # zarr | hdf5
+  image_exts: [".tif", ".tiff"]
+  label_binary_name: "labelbinary.tif"
+  label_rgba_name: "label_rgba.tif"
+  normalize: {enabled: true, mean: 0.5, std: 0.25, clip_min: 0.0, clip_max: 1.0}
+  input_size: {bbox: [256,256]}
+  max_boxes: 10
+  pad_mode: "constant"
+  split: {train_ratio: 0.8, val_ratio: 0.2, seed: 42}
+  aug:
+    bbox: {hflip_prob: 0.5, vflip_prob: 0.0, rotate_deg: 10}
+
+train:
+  seed: 42
+  device: "auto"           # auto | cpu | cuda
+  epochs: 20
+  batch_size: 16
+  num_workers: 4
+  optimizer: {name: "adamw", lr: 3.0e-4, weight_decay: 1.0e-2}
+  scheduler: {name: "cosine", warmup_epochs: 2}
+  checkpointing:
+    dir: "artifacts/models/${experiment_name}"
+    save_top_k: 1
+  logging: {log_every_n_steps: 50}
+
+infer:
+  input_dir: "data/raw_data/STARCOP_train_easy"
+  output_dir: "artifacts/preds/${experiment_name}"
+  bbox: {score_thresh: 0.35, nms_iou_thresh: 0.5, max_dets: 50}
+  models: {bbox_path: null, seg_path: null}
+
+eval:
+  bbox:     {ap50_target: 0.50}
+  stitched: {miou_target: 0.55}
+  overlays: {save: true, limit: 50, out_dir: "artifacts/reports/${experiment_name}"}
+
+model:
+  bbox:
+    name: "bbox_cnn"
+    in_channels: 1
+    num_classes: 1
+    backbone: "resnet18"
+    pretrained: false
+    head: {type: "conv", hidden_dim: 128}
diff --git a/docker-compose.yml b/docker-compose.yml
@@ -15,3 +15,29 @@ services:
             - capabilities: [gpu]
     stdin_open: true
     tty: true
+
+  mlflow:
+    image: ghcr.io/mlflow/mlflow:v2.16.0
+    command: >
+      mlflow server
+        --host 0.0.0.0
+        --port 5000
+        --backend-store-uri sqlite:////db/mlflow.db
+        --artifacts-destination /mlruns/artifacts
+    ports:
+      - "5050:5000"  # host 5050 -> container 5000, the port `mlflow server` listens on
+    volumes:
+      - ./artifacts/mlruns:/mlruns
+      - ./artifacts/mlflow_db:/db
+    user: "0:0"
+
+  prefect:
+    image: prefecthq/prefect:2-latest
+    command: >
+      bash -lc "prefect server start --host 0.0.0.0"
+    ports:
+      - "4200:4200"
+    environment:
+      PREFECT_API_URL: "http://localhost:4200/api"
+    volumes:
+      - ./artifacts/prefect:/root/.prefect
diff --git a/requirements.txt b/requirements.txt
@@ -3,7 +3,6 @@ ipython==8.27.0
 jupyter_client==8.6.3
 jupyter_core==5.7.2
 matplotlib-inline==0.1.7
-numpy
 opencv-python==4.10.0.84
 pandas==2.2.3
 ipywidgets==8.1.5
@@ -16,4 +15,19 @@ scikit-image==0.25.0
 ruff==0.9.9
 pytest
 imagecodecs
-gradio==5.36.2
+gradio==5.36.2
+mlflow>=2.12
+prefect>=2.19
+fastapi>=0.109
+uvicorn[standard]>=0.27
+pydantic>=2.6
+torch>=2.3
+torchvision>=0.18
+torchaudio>=2.3
+albumentations>=1.4
+opencv-python-headless>=4.9  # NOTE(review): provides the same cv2 module as the pinned opencv-python above - keep only one
+tifffile>=2024.2
+numpy>=1.26
+pyarrow>=15.0
+h5py>=3.11
+torchmetrics>=1.3
diff --git a/src/config.py b/src/config.py
@@ -0,0 +1,68 @@
+import os, yaml
+from copy import deepcopy
+import argparse, pprint
+
+def _set_by_dotted_key(d, dotted, value):
+    """Set ``value`` at the dotted path in ``d`` (in place); non-dict hops become {}."""
+    *parents, leaf = dotted.split(".")
+    node = d
+    for part in parents:
+        node[part] = node[part] if isinstance(node.get(part), dict) else {}
+        node = node[part]
+    node[leaf] = value
+
+def apply_overrides(cfg: dict, pairs: list[str]) -> dict:
+    """Return a deep copy of ``cfg`` with dotted-key overrides applied.
+
+    ``pairs`` looks like ['train.epochs=10', 'train.optimizer.lr=1e-4']."""
+    result = deepcopy(cfg)
+    for pair in pairs or []:
+        key, sep, raw = pair.partition("=")
+        if not sep:
+            continue  # no "=" in the entry: silently ignored
+        if raw.lower() in ("true", "false"):
+            parsed = raw.lower() == "true"
+        else:
+            parsed = raw  # stays a string unless a numeric cast succeeds
+            for cast in (int, float):  # int first so "10" does not become 10.0
+                try:
+                    parsed = cast(raw)
+                except ValueError:
+                    continue
+                break
+        _set_by_dotted_key(result, key, parsed)
+    return result
+
+def load_config(path: str, overrides: list[str] | None = None) -> dict:
+    """Load YAML from ``path``, expand ``${experiment_name}`` and apply overrides.
+
+    ``mlflow.tracking_uri`` precedence: CLI override > MLFLOW_TRACKING_URI > file.
+    """
+    with open(path, "r", encoding="utf-8") as f:
+        cfg = yaml.safe_load(f) or {}  # an empty file parses to None
+    # Resolve the name with overrides applied so that `experiment_name=seg`
+    # also renames every path derived from it; str() guards non-string names.
+    exp = str(apply_overrides(cfg, overrides).get("experiment_name", "exp"))
+    def walk(x):
+        if isinstance(x, dict):
+            return {k: walk(v) for k, v in x.items()}
+        if isinstance(x, list):
+            return [walk(v) for v in x]
+        return x.replace("${experiment_name}", exp) if isinstance(x, str) else x
+    cfg = walk(cfg)
+    ml_uri_env = os.getenv("MLFLOW_TRACKING_URI")
+    if ml_uri_env:
+        cfg.setdefault("mlflow", {})["tracking_uri"] = ml_uri_env
+    return apply_overrides(cfg, overrides)  # CLI overrides are applied last, so they win
+
+
+if __name__ == "__main__":
+    # Smoke test: load the config, apply any --set overrides, pretty-print the result.
+    parser = argparse.ArgumentParser()
+    parser.add_argument("--config", default="config/defaults.yaml")
+    parser.add_argument("--set", dest="sets", action="append", default=[],
+                        help="override like key.sub=val")
+    cli = parser.parse_args()
+    resolved = load_config(cli.config, cli.sets)
+    pprint.pp(resolved)
+    print("\nOK: loaded config and overrides.")
diff --git a/src/preprocessing/build_hdf5.py b/src/preprocessing/build_hdf5.py
@@ -0,0 +1,134 @@
+# src/preprocessing/build_hdf5.py
+from __future__ import annotations
+import argparse
+from pathlib import Path
+import os
+import numpy as np
+import pandas as pd
+import h5py
+import cv2
+
+from src.config import load_config
+from src.image_utils import load_image_set, extract_bounding_boxes
+from src.constants import IMAGE_FILE_NAMES
+
+def list_sample_dirs(root: Path, label_binary_name: str) -> list[Path]:  # sorted dirs holding the label file
+    return sorted(d for d in root.iterdir() if d.is_dir() and (d / label_binary_name).exists())
+
+def to_chw(x: np.ndarray) -> np.ndarray:
+    """Return ``x`` as float32 with its last axis moved first (HWC -> CHW)."""
+    return x.astype(np.float32, copy=False).transpose(2, 0, 1)
+
+def mask_to_chw(m: np.ndarray) -> np.ndarray:
+    """Binarize ``m`` (>0 becomes 1, uint8) and move the last axis first; 2-D gains a channel."""
+    hwc = m[..., None] if m.ndim == 2 else m
+    binary = (hwc > 0).astype(np.uint8)
+    return np.transpose(binary, (2, 0, 1))
+
+def minmax01_inplace(x: np.ndarray):
+    """Rescale each channel x[c] to [0, 1] in place; constant channels become 0."""
+    for ch in x:
+        mn, mx = float(ch.min()), float(ch.max())
+        ch[:] = (ch - mn) / (mx - mn) if mx > mn else 0.0
+
+def standardize_clip_inplace(x: np.ndarray, mean: float, std: float, cmin: float, cmax: float):
+    """In place: x = clip((x - mean) / (std + 1e-8), cmin, cmax)."""
+    np.divide(np.subtract(x, mean, out=x), std + 1e-8, out=x)
+    np.clip(x, cmin, cmax, out=x)  # NOTE(review): with cmin=0 every input below `mean` ends up 0 - confirm intended
+
+def resize_chw(x: np.ndarray, out_hw: tuple[int, int]) -> np.ndarray:
+    """Bilinearly resize each channel of a (C, H, W) array to out_hw = (height, width)."""
+    (n_ch, _, _), (out_h, out_w) = x.shape, out_hw
+    resized = np.empty((n_ch, out_h, out_w), dtype=x.dtype)
+    for c in range(n_ch):
+        resized[c] = cv2.resize(x[c], (out_w, out_h), interpolation=cv2.INTER_LINEAR)  # dsize is (w, h)
+    return resized
+
+def write_h5_bbox(out_path: Path, imgs: list[np.ndarray], boxes5: list[np.ndarray], dirs: list[str]):
+    """Write images (float16), bboxes (float32) and their source dirs to one HDF5 file."""
+    out_path.parent.mkdir(parents=True, exist_ok=True)
+    if not imgs:
+        print(f"[warn] no samples for {out_path}; file not written")
+        return
+    gz = {"compression": "gzip", "compression_opts": 4}
+    with h5py.File(out_path, "w") as f:
+        f.create_dataset("images", data=np.stack(imgs, 0).astype(np.float16),
+                         chunks=(1, *imgs[0].shape), **gz)  # one image per chunk
+        f.create_dataset("bboxes", data=np.stack(boxes5, 0).astype(np.float32), **gz)
+        f.create_dataset("dirs", data=np.array(dirs, dtype=object), dtype=h5py.string_dtype(encoding="utf-8"))
+
+def main():
+    """Build the bbox train/val HDF5 shards and CSV sidecars described by the config."""
+    ap = argparse.ArgumentParser()
+    ap.add_argument("--config", default="config/defaults.yaml")
+    args = ap.parse_args()
+
+    dc = load_config(args.config, [])["data"]
+    raw_root = Path(dc["raw_root"])
+    out_root = Path(dc["processed_root"])
+    out_root.mkdir(parents=True, exist_ok=True)
+    Hb, Wb = dc["input_size"]["bbox"]
+    label_binary_name = dc.get("label_binary_name", "labelbinary.tif")
+    max_boxes = int(dc.get("max_boxes", 10))
+
+    nc = dc["normalize"]
+    do_norm = bool(nc.get("enabled", True))
+    mean, std = float(nc.get("mean", 0.5)), float(nc.get("std", 0.25))
+    cmin, cmax = float(nc.get("clip_min", 0.0)), float(nc.get("clip_max", 1.0))
+    sp = dc["split"]
+    train_ratio, seed = float(sp["train_ratio"]), int(sp["seed"])
+
+    tiles = list_sample_dirs(raw_root, label_binary_name)
+    if not tiles:  # explicit raise: an assert would be stripped under `python -O`
+        raise FileNotFoundError(f"No tiles under {raw_root} (looked for '{label_binary_name}')")
+
+    rng = np.random.default_rng(seed)
+    idx = np.arange(len(tiles))
+    rng.shuffle(idx)  # seeded, so the train/val split is reproducible
+    ntr = int(len(idx) * train_ratio)
+    train_idx, val_idx = idx[:ntr], idx[ntr:]
+
+    def process(indices):
+        """Load, normalize and resize the tiles at ``indices``; unreadable tiles are skipped."""
+        imgs, boxes, dirs, meta_rows = [], [], [], []
+        for tid in indices:
+            tdir = tiles[tid]
+            try:
+                img_hwc, mask_hw1 = load_image_set(tdir.as_posix(), tuple(IMAGE_FILE_NAMES))
+            except Exception as e:
+                print(f"[warn] skip {tdir}: {e}")
+                continue
+            H0, W0, C0 = img_hwc.shape
+            if C0 != len(IMAGE_FILE_NAMES):
+                raise ValueError(f"expected {len(IMAGE_FILE_NAMES)} channels, got {C0} @ {tdir}")
+            x = to_chw(img_hwc)
+            _ = mask_to_chw(mask_hw1)  # result unused: only rejects masks that are not 2-D/3-D
+            minmax01_inplace(x)
+            if do_norm:
+                standardize_clip_inplace(x, mean, std, cmin, cmax)
+            x_resized = resize_chw(x, (Hb, Wb))
+
+            # NOTE(review): boxes come from the full-size mask while the image is resized to
+            # (Hb, Wb) - confirm extract_bounding_boxes returns size-independent coordinates.
+            boxes5 = list(extract_bounding_boxes(mask_hw1.squeeze().astype(np.uint8),
+                                                 num_boxes=max_boxes, force_square=False))
+            # Truncate or zero-pad to exactly max_boxes rows (a negative repeat yields []).
+            boxes5 = boxes5[:max_boxes] + [(0.0, 0.0, 0.0, 0.0, 0.0)] * (max_boxes - len(boxes5))
+            boxes5 = np.array(boxes5, dtype=np.float32)
+            imgs.append(x_resized)
+            boxes.append(boxes5)
+            dirs.append(tdir.as_posix())
+            meta_rows.append({"tile_idx": len(imgs)-1, "tile_dir": tdir.as_posix(),
+                              "H0": H0, "W0": W0, "num_boxes": int((boxes5[:,0] > 0).sum())})
+        return imgs, boxes, dirs, pd.DataFrame(meta_rows)
+
+    tr_imgs, tr_boxes, tr_dirs, tr_meta = process(train_idx)
+    va_imgs, va_boxes, va_dirs, va_meta = process(val_idx)
+    write_h5_bbox(out_root / "bbox_train.h5", tr_imgs, tr_boxes, tr_dirs)
+    write_h5_bbox(out_root / "bbox_val.h5",   va_imgs, va_boxes, va_dirs)
+    tr_meta.to_csv(out_root / "bbox_train_meta.csv", index=False)
+    va_meta.to_csv(out_root / "bbox_val_meta.csv",   index=False)
+    print("[ok] Wrote bbox HDF5 + sidecars to", out_root)
+
+if __name__ == "__main__":  # run the dataset build only when executed as a script
+    main()