diff --git a/.gitignore b/.gitignore
index d843b2a..b0681e6 100644
--- a/.gitignore
+++ b/.gitignore
@@ -4,3 +4,5 @@ venv/
 *.DS_STORE
 sklearn-env/
 .ruff_cache/
+.env
+artifacts/
diff --git a/Makefile b/Makefile
new file mode 100644
index 0000000..3664429
--- /dev/null
+++ b/Makefile
@@ -0,0 +1,17 @@
+.PHONY: up down mlflow-ui prefect-ui install
+
+up:
+	docker compose up -d mlflow prefect
+
+down:
+	docker compose down
+
+mlflow-ui:
+	@echo "MLflow -> http://localhost:5050"
+
+prefect-ui:
+	@echo "Prefect -> http://localhost:4200"
+
+install:
+	python -m pip install -U pip
+	pip install -r requirements.txt
diff --git a/config/defaults.yaml b/config/defaults.yaml
new file mode 100644
index 0000000..eb401cb
--- /dev/null
+++ b/config/defaults.yaml
@@ -0,0 +1,53 @@
+experiment_name: bbox  # change to "bbox" | "seg" as needed
+
+mlflow:
+  tracking_uri: "http://localhost:5050"
+  experiment: "${experiment_name}"
+
+data:
+  raw_root: "data/raw_data/STARCOP_train_easy"
+  processed_root: "artifacts/datasets"
+  shard_format: "hdf5"  # zarr | hdf5
+  image_exts: [".tif", ".tiff"]
+  label_binary_name: "labelbinary.tif"
+  label_rgba_name: "label_rgba.tif"
+  normalize: {enabled: true, mean: 0.5, std: 0.25, clip_min: 0.0, clip_max: 1.0}
+  input_size: {bbox: [256, 256]}
+  max_boxes: 10
+  pad_mode: "constant"
+  split: {train_ratio: 0.8, val_ratio: 0.2, seed: 42}
+  aug:
+    bbox: {hflip_prob: 0.5, vflip_prob: 0.0, rotate_deg: 10}
+
+train:
+  seed: 42
+  device: "auto"  # auto | cpu | cuda
+  epochs: 20
+  batch_size: 16
+  num_workers: 4
+  optimizer: {name: "adamw", lr: 3.0e-4, weight_decay: 1.0e-2}
+  scheduler: {name: "cosine", warmup_epochs: 2}
+  checkpointing:
+    dir: "artifacts/models/${experiment_name}"
+    save_top_k: 1
+  logging: {log_every_n_steps: 50}
+
+infer:
+  input_dir: "data/raw_data/STARCOP_train_easy"
+  output_dir: "artifacts/preds/${experiment_name}"
+  bbox: {score_thresh: 0.35, nms_iou_thresh: 0.5, max_dets: 50}
+  models: {bbox_path: null, seg_path: null}
+
+eval:
+  bbox: {ap50_target: 0.50}
+  stitched: {miou_target: 0.55}
+  overlays: {save: true, limit: 50, out_dir: "artifacts/reports/${experiment_name}"}
+
+model:
+  bbox:
+    name: "bbox_cnn"
+    in_channels: 1
+    num_classes: 1
+    backbone: "resnet18"
+    pretrained: false
+    head: {type: "conv", hidden_dim: 128}
diff --git a/docker-compose.yml b/docker-compose.yml
index de2804f..220069a 100644
--- a/docker-compose.yml
+++ b/docker-compose.yml
@@ -15,3 +15,29 @@ services:
           - capabilities: [gpu]
     stdin_open: true
     tty: true
+
+  mlflow:
+    image: ghcr.io/mlflow/mlflow:v2.16.0
+    command: >
+      mlflow server
+      --host 0.0.0.0
+      --port 5000
+      --backend-store-uri sqlite:////db/mlflow.db
+      --artifacts-destination /mlruns/artifacts
+    ports:
+      - "5050:5000"
+    volumes:
+      - ./artifacts/mlruns:/mlruns
+      - ./artifacts/mlflow_db:/db
+    user: "0:0"
+
+  prefect:
+    image: prefecthq/prefect:2-latest
+    command: >
+      bash -lc "prefect server start --host 0.0.0.0"
+    ports:
+      - "4200:4200"
+    environment:
+      PREFECT_API_URL: "http://localhost:4200/api"
+    volumes:
+      - ./artifacts/prefect:/root/.prefect
\ No newline at end of file
diff --git a/requirements.txt b/requirements.txt
index 153434b..5320404 100644
--- a/requirements.txt
+++ b/requirements.txt
@@ -3,7 +3,6 @@ ipython==8.27.0
 jupyter_client==8.6.3
 jupyter_core==5.7.2
 matplotlib-inline==0.1.7
-numpy
 opencv-python==4.10.0.84
 pandas==2.2.3
 ipywidgets==8.1.5
@@ -16,4 +15,19 @@ scikit-image==0.25.0
 ruff==0.9.9
 pytest
 imagecodecs
-gradio==5.36.2
\ No newline at end of file
+gradio==5.36.2
+mlflow>=2.12
+prefect>=2.19
+fastapi>=0.109
+uvicorn[standard]>=0.27
+pydantic>=2.6
+torch>=2.3
+torchvision>=0.18
+torchaudio>=2.3
+albumentations>=1.4
+opencv-python-headless>=4.9
+tifffile>=2024.2
+numpy>=1.26
+pyarrow>=15.0
+h5py>=3.11
+torchmetrics>=1.3
\ No newline at end of file
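After `make up`, the tracking stack can be smoke-tested from the host. A minimal sketch (not part of this diff; `smoke_mlflow.py` is a hypothetical helper, and it assumes the `mlflow` client from requirements.txt is installed locally) that confirms the server published on host port 5050 answers with the same URI used in `config/defaults.yaml`:

```python
# smoke_mlflow.py -- hypothetical helper, not included in this diff.
# The MLflow server listens on container port 5000; compose publishes it on host 5050.
import mlflow
from mlflow.tracking import MlflowClient

mlflow.set_tracking_uri("http://localhost:5050")  # same URI as config/defaults.yaml
client = MlflowClient()
print([e.name for e in client.search_experiments()])  # a fresh server lists "Default"
```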
diff --git a/src/config.py b/src/config.py
new file mode 100644
index 0000000..2cdc8af
--- /dev/null
+++ b/src/config.py
@@ -0,0 +1,68 @@
+import os, yaml
+from copy import deepcopy
+import argparse, pprint
+
+def _set_by_dotted_key(d, dotted, value):
+    keys = dotted.split(".")
+    cur = d
+    for k in keys[:-1]:
+        if k not in cur or not isinstance(cur[k], dict):
+            cur[k] = {}
+        cur = cur[k]
+    cur[keys[-1]] = value
+
+def apply_overrides(cfg: dict, pairs: list[str]) -> dict:
+    """pairs like ['train.epochs=10', 'train.optimizer.lr=1e-4']"""
+    out = deepcopy(cfg)
+    for p in pairs or []:
+        if "=" not in p:
+            continue
+        k, v = p.split("=", 1)
+        # try to cast numbers/bools
+        vv = v
+        if v.lower() in ("true", "false"):
+            vv = v.lower() == "true"
+        else:
+            try:
+                vv = int(v)
+            except ValueError:
+                try:
+                    vv = float(v)
+                except ValueError:
+                    pass
+        _set_by_dotted_key(out, k, vv)
+    return out
+
+def load_config(path: str, overrides: list[str] | None = None) -> dict:
+    with open(path, "r") as f:
+        cfg = yaml.safe_load(f)
+    # simple ${var} interpolation for two common cases
+    exp = cfg.get("experiment_name", "exp")
+    def subst(s: str) -> str:
+        return (s.replace("${experiment_name}", exp)
+                if isinstance(s, str) else s)
+    def walk(x):
+        if isinstance(x, dict):
+            return {k: walk(subst(v)) for k, v in x.items()}
+        if isinstance(x, list):
+            return [walk(subst(v)) for v in x]
+        return subst(x)
+    cfg = walk(cfg)
+    # MLflow URI: env wins
+    ml_uri_env = os.getenv("MLFLOW_TRACKING_URI")
+    if ml_uri_env:
+        cfg.setdefault("mlflow", {})["tracking_uri"] = ml_uri_env
+    # cli overrides
+    return apply_overrides(cfg, overrides)
+
+
+if __name__ == "__main__":
+    ap = argparse.ArgumentParser()
+    ap.add_argument("--config", default="config/defaults.yaml")
+    ap.add_argument("--set", action="append", dest="sets",
+                    help="override like key.sub=val", default=[])
+    args = ap.parse_args()
+
+    cfg = load_config(args.config, args.sets)
+    pprint.pp(cfg)
+    print("\nOK: loaded config and overrides.")
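For reference, a minimal usage sketch of the loader above (assumes it runs from the repo root so `src` is importable). The override strings use the same dotted-key syntax as the `--set` CLI flag, and values are cast to bool/int/float where possible:

```python
from src.config import load_config

cfg = load_config("config/defaults.yaml",
                  ["train.epochs=5", "train.optimizer.lr=1e-4", "data.max_boxes=20"])
assert cfg["train"]["epochs"] == 5               # "5" cast to int
assert cfg["train"]["optimizer"]["lr"] == 1e-4   # "1e-4" cast to float
# ${experiment_name} is interpolated, so with experiment_name: bbox:
print(cfg["infer"]["output_dir"])                # artifacts/preds/bbox
```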
diff --git a/src/preprocessing/build_hdf5.py b/src/preprocessing/build_hdf5.py
new file mode 100644
index 0000000..655a8eb
--- /dev/null
+++ b/src/preprocessing/build_hdf5.py
@@ -0,0 +1,134 @@
+# src/preprocessing/build_hdf5.py
+from __future__ import annotations
+import argparse
+from pathlib import Path
+import os
+import numpy as np
+import pandas as pd
+import h5py
+import cv2
+
+from src.config import load_config
+from src.image_utils import load_image_set, extract_bounding_boxes
+from src.constants import IMAGE_FILE_NAMES
+
+def list_sample_dirs(root: Path, label_binary_name: str) -> list[Path]:
+    return sorted([p for p in root.iterdir() if p.is_dir() and (p / label_binary_name).exists()])
+
+def to_chw(x: np.ndarray) -> np.ndarray:
+    x = x.astype(np.float32, copy=False)
+    return np.transpose(x, (2, 0, 1))
+
+def mask_to_chw(m: np.ndarray) -> np.ndarray:
+    if m.ndim == 2:
+        m = m[..., None]
+    m = (m > 0).astype(np.uint8)
+    return np.transpose(m, (2, 0, 1))
+
+def minmax01_inplace(x: np.ndarray):
+    for c in range(x.shape[0]):
+        ch = x[c]; mn, mx = float(ch.min()), float(ch.max())
+        if mx > mn: ch[:] = (ch - mn) / (mx - mn)
+        else: ch[:] = 0.0
+
+def standardize_clip_inplace(x: np.ndarray, mean: float, std: float, cmin: float, cmax: float):
+    x -= mean
+    x /= (std + 1e-8)
+    np.clip(x, cmin, cmax, out=x)
+
+def resize_chw(x: np.ndarray, out_hw: tuple[int, int]) -> np.ndarray:
+    C, H, W = x.shape
+    oh, ow = out_hw
+    y = np.empty((C, oh, ow), dtype=x.dtype)
+    for c in range(C):
+        y[c] = cv2.resize(x[c], (ow, oh), interpolation=cv2.INTER_LINEAR)
+    return y
+
+def write_h5_bbox(out_path: Path, imgs: list[np.ndarray], boxes5: list[np.ndarray], dirs: list[str]):
+    out_path.parent.mkdir(parents=True, exist_ok=True)
+    if not imgs:
+        return
+    C, H, W = imgs[0].shape
+    with h5py.File(out_path, "w") as f:
+        f.create_dataset("images", data=np.stack(imgs, 0).astype(np.float16),
+                         compression="gzip", compression_opts=4, chunks=(1, C, H, W))
+        f.create_dataset("bboxes", data=np.stack(boxes5, 0).astype(np.float32),
+                         compression="gzip", compression_opts=4)
+        dt = h5py.string_dtype(encoding="utf-8")
+        f.create_dataset("dirs", data=np.array(dirs, dtype=object), dtype=dt)
+
+def main():
+    ap = argparse.ArgumentParser()
+    ap.add_argument("--config", default="config/defaults.yaml")
+    args = ap.parse_args()
+
+    cfg = load_config(args.config, [])
+    dc = cfg["data"]
+
+    raw_root = Path(dc["raw_root"])
+    out_root = Path(dc["processed_root"]); out_root.mkdir(parents=True, exist_ok=True)
+    Hb, Wb = dc["input_size"]["bbox"]
+    label_binary_name = dc.get("label_binary_name", "labelbinary.tif")
+    max_boxes = int(dc.get("max_boxes", 10))
+
+    nc = dc["normalize"]; do_norm = bool(nc.get("enabled", True))
+    mean, std = float(nc.get("mean", 0.5)), float(nc.get("std", 0.25))
+    cmin, cmax = float(nc.get("clip_min", 0.0)), float(nc.get("clip_max", 1.0))
+    sp = dc["split"]; train_ratio, seed = float(sp["train_ratio"]), int(sp["seed"])
+
+    tiles = list_sample_dirs(raw_root, label_binary_name)
+    assert tiles, f"No tiles under {raw_root} (looked for '{label_binary_name}')"
+
+    rng = np.random.default_rng(seed)
+    idx = np.arange(len(tiles)); rng.shuffle(idx)
+    ntr = int(len(idx) * train_ratio)
+    train_idx, val_idx = idx[:ntr], idx[ntr:]
+
+    def process(indices):
+        imgs, boxes, dirs, meta_rows = [], [], [], []
+        for tid in indices:
+            tdir = tiles[tid]
+            try:
+                img_hwc, mask_hw1 = load_image_set(tdir.as_posix(), tuple(IMAGE_FILE_NAMES))
+            except Exception as e:
+                print(f"[warn] skip {tdir}: {e}")
+                continue
+
+            H0, W0, C0 = img_hwc.shape
+            assert C0 == len(IMAGE_FILE_NAMES), f"expected {len(IMAGE_FILE_NAMES)} channels, got {C0} @ {tdir}"
+
+            x = to_chw(img_hwc)
+            _ = mask_to_chw(mask_hw1)  # result unused: just validates the binary label; boxes are extracted below
+
+            minmax01_inplace(x)
+            if do_norm:
+                standardize_clip_inplace(x, mean, std, cmin, cmax)
+
+            x_resized = resize_chw(x, (Hb, Wb))
+
+            boxes5 = extract_bounding_boxes(mask_hw1.squeeze().astype(np.uint8),
+                                            num_boxes=max_boxes, force_square=False)
+            if len(boxes5) > max_boxes:
+                boxes5 = boxes5[:max_boxes]
+            elif len(boxes5) < max_boxes:
+                boxes5 += [(0.0, 0.0, 0.0, 0.0, 0.0)] * (max_boxes - len(boxes5))  # pad with all-zero rows
+            boxes5 = np.array(boxes5, dtype=np.float32)
+
+            imgs.append(x_resized)
+            boxes.append(boxes5)
+            dirs.append(tdir.as_posix())
+            meta_rows.append({"tile_idx": len(imgs) - 1, "tile_dir": tdir.as_posix(),
+                              "H0": H0, "W0": W0, "num_boxes": int((boxes5[:, 0] > 0).sum())})
+        return imgs, boxes, dirs, pd.DataFrame(meta_rows)
+
+    tr_imgs, tr_boxes, tr_dirs, tr_meta = process(train_idx)
+    va_imgs, va_boxes, va_dirs, va_meta = process(val_idx)
+
+    write_h5_bbox(out_root / "bbox_train.h5", tr_imgs, tr_boxes, tr_dirs)
+    write_h5_bbox(out_root / "bbox_val.h5", va_imgs, va_boxes, va_dirs)
+    tr_meta.to_csv(out_root / "bbox_train_meta.csv", index=False)
+    va_meta.to_csv(out_root / "bbox_val_meta.csv", index=False)
"bbox_val_meta.csv", index=False) + print("[ok] Wrote bbox HDF5 + sidecars to", out_root) + +if __name__ == "__main__": + main()