Skip to content
Draft
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
2 changes: 2 additions & 0 deletions .gitignore
Original file line number Diff line number Diff line change
Expand Up @@ -4,3 +4,5 @@ venv/
*.DS_STORE
sklearn-env/
.ruff_cache/
.env
artifacts/
17 changes: 17 additions & 0 deletions Makefile
Original file line number Diff line number Diff line change
@@ -0,0 +1,17 @@
# Developer shortcuts for the local MLflow / Prefect stack and Python deps.
.PHONY: up down mlflow-ui prefect-ui install

# Start the MLflow and Prefect services in the background.
up:
	docker compose up -d mlflow prefect

# Stop and remove every service defined in the compose file.
down:
	docker compose down

# Print the URL where the MLflow UI is published on the host.
mlflow-ui:
	@echo "MLflow -> http://localhost:5050"

# Print the URL where the Prefect UI is published on the host.
prefect-ui:
	@echo "Prefect -> http://localhost:4200"

# Install Python dependencies. Both steps go through `python -m pip` so the
# requirements land in the same interpreter whose pip was just upgraded.
install:
	python -m pip install -U pip
	python -m pip install -r requirements.txt
53 changes: 53 additions & 0 deletions config/defaults.yaml
Original file line number Diff line number Diff line change
@@ -0,0 +1,53 @@
experiment_name: bbox # one of: "bbox" | "seg"

mlflow:
tracking_uri: "http://localhost:5050"
experiment: "${experiment_name}"

data:
raw_root: "data/raw_data/STARCOP_train_easy"
processed_root: "artifacts/datasets"
shard_format: "hdf5" # zarr | hdf5
image_exts: [".tif", ".tiff"]
label_binary_name: "labelbinary.tif"
label_rgba_name: "label_rgba.tif"
normalize: {enabled: true, mean: 0.5, std: 0.25, clip_min: 0.0, clip_max: 1.0}
input_size: {bbox: [256,256]}
max_boxes: 10
pad_mode: "constant"
split: {train_ratio: 0.8, val_ratio: 0.2, seed: 42}
aug:
bbox: {hflip_prob: 0.5, vflip_prob: 0.0, rotate_deg: 10}

train:
seed: 42
device: "auto" # auto | cpu | cuda
epochs: 20
batch_size: 16
num_workers: 4
optimizer: {name: "adamw", lr: 3.0e-4, weight_decay: 1.0e-2}
scheduler: {name: "cosine", warmup_epochs: 2}
checkpointing:
dir: "artifacts/models/${experiment_name}"
save_top_k: 1
logging: {log_every_n_steps: 50}

infer:
input_dir: "data/raw_data/STARCOP_train_easy"
output_dir: "artifacts/preds/${experiment_name}"
bbox: {score_thresh: 0.35, nms_iou_thresh: 0.5, max_dets: 50}
models: {bbox_path: null, seg_path: null}

eval:
bbox: {ap50_target: 0.50}
stitched: {miou_target: 0.55}
overlays: {save: true, limit: 50, out_dir: "artifacts/reports/${experiment_name}"}

model:
bbox:
name: "bbox_cnn"
in_channels: 1
num_classes: 1
backbone: "resnet18"
pretrained: false
head: {type: "conv", hidden_dim: 128}
26 changes: 26 additions & 0 deletions docker-compose.yml
Original file line number Diff line number Diff line change
Expand Up @@ -15,3 +15,29 @@ services:
- capabilities: [gpu]
stdin_open: true
tty: true

mlflow:
  image: ghcr.io/mlflow/mlflow:v2.16.0
  command: >
    mlflow server
    --host 0.0.0.0
    --port 5000
    --backend-store-uri sqlite:////db/mlflow.db
    --artifacts-destination /mlruns/artifacts
  ports:
    # host 5050 -> container 5000: the container side must match the
    # `--port 5000` the server listens on, otherwise localhost:5050 is dead.
    - "5050:5000"
  volumes:
    - ./artifacts/mlruns:/mlruns
    - ./artifacts/mlflow_db:/db
  user: "0:0"

prefect:
image: prefecthq/prefect:2-latest
command: >
bash -lc "prefect server start --host 0.0.0.0"
ports:
- "4200:4200"
environment:
PREFECT_API_URL: "http://localhost:4200/api"
volumes:
- ./artifacts/prefect:/root/.prefect
18 changes: 16 additions & 2 deletions requirements.txt
Original file line number Diff line number Diff line change
Expand Up @@ -3,7 +3,6 @@ ipython==8.27.0
jupyter_client==8.6.3
jupyter_core==5.7.2
matplotlib-inline==0.1.7
numpy
opencv-python==4.10.0.84
pandas==2.2.3
ipywidgets==8.1.5
Expand All @@ -16,4 +15,19 @@ scikit-image==0.25.0
ruff==0.9.9
pytest
imagecodecs
gradio==5.36.2
gradio==5.36.2
mlflow>=2.12
prefect>=2.19
fastapi>=0.109
uvicorn[standard]>=0.27
pydantic>=2.6
torch>=2.3
torchvision>=0.18
torchaudio>=2.3
albumentations>=1.4
opencv-python-headless>=4.9
tifffile>=2024.2
numpy>=1.26
pyarrow>=15.0
h5py>=3.11
torchmetrics>=1.3
68 changes: 68 additions & 0 deletions src/config.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,68 @@
import os, yaml
from copy import deepcopy
import argparse, pprint

def _set_by_dotted_key(d, dotted, value):
    """Assign ``value`` inside the nested dict ``d`` at a dot-separated path.

    Every path component except the last is descended into. A component that
    is missing, or whose current value is not a dict, is first replaced by a
    fresh empty dict. ``d`` is modified in place; nothing is returned.
    """
    *parents, leaf = dotted.split(".")
    node = d
    for name in parents:
        child = node.get(name)
        if not isinstance(child, dict):
            # Missing or non-dict intermediate: overwrite it with a new level.
            child = {}
            node[name] = child
        node = child
    node[leaf] = value

def apply_overrides(cfg: dict, pairs: list[str]) -> dict:
    """Return a deep copy of ``cfg`` with ``key.sub=value`` overrides applied.

    Args:
        cfg: Base configuration; it is never modified.
        pairs: Strings such as ``['train.epochs=10', 'train.optimizer.lr=1e-4']``.
            ``None`` or an empty list yields a plain deep copy.

    Returns:
        A new dict in which every well-formed pair has been written at its
        dotted path. Values are cast to bool (``true``/``false``, any case),
        then int, then float; anything else stays a string.

    A pair with no ``=`` or with an empty key cannot be applied. It is
    reported through ``warnings.warn`` and skipped, so a typo on the command
    line is visible instead of being dropped silently.
    """
    import warnings  # local import: the module does not import it at file level

    out = deepcopy(cfg)
    for pair in pairs or []:
        key, sep, raw = pair.partition("=")
        # Stray spaces around the key would otherwise create a new bogus key.
        key = key.strip()
        if not sep or not key:
            warnings.warn(
                f"ignoring malformed override {pair!r}; expected 'key.sub=value'",
                stacklevel=2,
            )
            continue

        value = raw
        lowered = raw.lower()
        if lowered in ("true", "false"):
            value = lowered == "true"
        else:
            # Try the narrowest numeric type first; fall back to the raw string.
            for cast in (int, float):
                try:
                    value = cast(raw)
                except ValueError:
                    continue
                break
        _set_by_dotted_key(out, key, value)
    return out

def load_config(path: str, overrides: list[str] | None = None) -> dict:
    """Load the YAML config at ``path`` and resolve overrides and placeholders.

    Precedence, lowest to highest: values in the file, the
    ``MLFLOW_TRACKING_URI`` environment variable (for ``mlflow.tracking_uri``
    only), then ``overrides``.

    ``${experiment_name}`` placeholders are substituted *after* overrides are
    applied, so ``experiment_name=seg`` on the command line also reaches every
    path derived from it.

    Args:
        path: Location of the YAML file.
        overrides: Optional ``key.sub=value`` strings, see ``apply_overrides``.

    Returns:
        The fully resolved configuration as a new dict.

    Raises:
        ValueError: If the YAML document's top level is not a mapping.
    """
    with open(path, "r", encoding="utf-8") as f:
        # An empty file parses to None; treat it as an empty config.
        cfg = yaml.safe_load(f) or {}
    if not isinstance(cfg, dict):
        raise ValueError(f"config root must be a mapping, got {type(cfg).__name__}: {path}")

    # MLflow URI: env wins over the file (CLI overrides below still win over env).
    ml_uri_env = os.getenv("MLFLOW_TRACKING_URI")
    if ml_uri_env:
        section = cfg.get("mlflow")
        if not isinstance(section, dict):
            # Covers both a missing section and a bare `mlflow:` (null) entry.
            section = cfg["mlflow"] = {}
        section["tracking_uri"] = ml_uri_env

    # CLI overrides.
    cfg = apply_overrides(cfg, overrides)

    # Simple ${experiment_name} interpolation. The name is stringified because
    # YAML or a numeric override may yield a non-string value.
    exp = cfg.get("experiment_name")
    exp = "exp" if exp is None else str(exp)

    def interpolate(node):
        if isinstance(node, dict):
            return {k: interpolate(v) for k, v in node.items()}
        if isinstance(node, list):
            return [interpolate(v) for v in node]
        if isinstance(node, str):
            return node.replace("${experiment_name}", exp)
        return node

    return interpolate(cfg)


if __name__ == "__main__":
    # Smoke test: load the config, apply any --set overrides, and dump the result.
    parser = argparse.ArgumentParser()
    parser.add_argument("--config", default="config/defaults.yaml")
    parser.add_argument(
        "--set",
        dest="sets",
        action="append",
        default=[],
        help="override like key.sub=val",
    )
    cli = parser.parse_args()

    pprint.pp(load_config(cli.config, cli.sets))
    print("\nOK: loaded config and overrides.")
134 changes: 134 additions & 0 deletions src/preprocessing/build_hdf5.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,134 @@
# src/preprocessing/build_hdf5.py
from __future__ import annotations
import argparse
from pathlib import Path
import os
import numpy as np
import pandas as pd
import h5py
import cv2

from src.config import load_config
from src.image_utils import load_image_set, extract_bounding_boxes
from src.constants import IMAGE_FILE_NAMES

def list_sample_dirs(root: Path, label_binary_name: str) -> list[Path]:
    """Return the sorted immediate subdirectories of ``root`` that contain ``label_binary_name``."""
    candidates = (
        entry
        for entry in root.iterdir()
        if entry.is_dir() and (entry / label_binary_name).exists()
    )
    return sorted(candidates)

def to_chw(x: np.ndarray) -> np.ndarray:
    """Cast ``x`` to float32 and move its last axis to the front (axes 2, 0, 1).

    The cast avoids a copy when ``x`` is already float32, so the result may
    share memory with the input.
    """
    as_float = x.astype(np.float32, copy=False)
    return as_float.transpose(2, 0, 1)

def mask_to_chw(m: np.ndarray) -> np.ndarray:
    """Binarize a mask (``> 0`` becomes 1) as uint8 and move its last axis to the front.

    A 2-D mask first gains a trailing singleton axis, so its result has a
    leading dimension of 1.
    """
    if m.ndim == 2:
        m = m[:, :, np.newaxis]
    binary = np.greater(m, 0).astype(np.uint8)
    return binary.transpose(2, 0, 1)

def minmax01_inplace(x: np.ndarray):
    """Rescale each slice along axis 0 of ``x`` to [0, 1], in place.

    A slice whose maximum is not greater than its minimum (i.e. a constant
    slice) is filled with zeros instead. Nothing is returned.
    """
    for channel in x:
        lo = float(channel.min())
        hi = float(channel.max())
        if hi > lo:
            channel[:] = (channel - lo) / (hi - lo)
        else:
            channel[:] = 0.0

def standardize_clip_inplace(x: np.ndarray, mean: float, std: float, cmin: float, cmax: float):
    """Shift by ``mean``, divide by ``std``, then clip to [``cmin``, ``cmax``], all in place.

    A small epsilon (1e-8) is added to ``std`` so a zero ``std`` does not
    divide by zero. Nothing is returned; ``x`` itself is overwritten.
    """
    np.subtract(x, mean, out=x)
    np.divide(x, std + 1e-8, out=x)
    np.clip(x, cmin, cmax, out=x)

def resize_chw(x: np.ndarray, out_hw: tuple[int, int]) -> np.ndarray:
    """Resize every channel of a 3-D array to ``out_hw`` with bilinear interpolation.

    Args:
        x: Array indexed as (channel, row, column).
        out_hw: Target ``(height, width)``.

    Returns:
        A new array of shape ``(channels, height, width)`` with ``x``'s dtype.
    """
    n_channels, _, _ = x.shape  # the unpack also rejects non-3-D input
    out_h, out_w = out_hw
    resized = np.empty((n_channels, out_h, out_w), dtype=x.dtype)
    for i, plane in enumerate(x):
        # cv2.resize takes its target size as (width, height).
        resized[i] = cv2.resize(plane, (out_w, out_h), interpolation=cv2.INTER_LINEAR)
    return resized

def write_h5_bbox(out_path: Path, imgs: list[np.ndarray], boxes5: list[np.ndarray], dirs: list[str]):
    """Write stacked images, boxes and source directory names to one HDF5 file.

    The parent directory of ``out_path`` is always created. When ``imgs`` is
    empty the function returns right after that and no file is written.

    Datasets written:
        ``images``: ``imgs`` stacked on a new first axis, stored as float16,
            gzip level 4, chunked one image per chunk.
        ``bboxes``: ``boxes5`` stacked on a new first axis, stored as float32,
            gzip level 4.
        ``dirs``: ``dirs`` as variable-length UTF-8 strings.
    """
    out_path.parent.mkdir(parents=True, exist_ok=True)
    if not imgs:
        return

    n_ch, height, width = imgs[0].shape
    gzip4 = {"compression": "gzip", "compression_opts": 4}
    with h5py.File(out_path, "w") as h5:
        image_stack = np.stack(imgs, axis=0).astype(np.float16)
        h5.create_dataset("images", data=image_stack,
                          chunks=(1, n_ch, height, width), **gzip4)

        box_stack = np.stack(boxes5, axis=0).astype(np.float32)
        h5.create_dataset("bboxes", data=box_stack, **gzip4)

        utf8 = h5py.string_dtype(encoding="utf-8")
        h5.create_dataset("dirs", data=np.array(dirs, dtype=object), dtype=utf8)

def main():
    """Build the bbox HDF5 train/val shards and CSV sidecars from raw tiles.

    Reads the ``data`` section of the config (``--config``, with optional
    ``--set key.sub=val`` overrides), shuffles the tile directories with the
    configured seed, splits them by ``train_ratio``, and for each tile:
    loads the image stack and mask, min-max scales each channel, optionally
    standardizes and clips, resizes to the configured bbox input size, and
    extracts up to ``max_boxes`` boxes (zero-padded to exactly ``max_boxes``).

    Outputs, under ``data.processed_root``: ``bbox_train.h5``, ``bbox_val.h5``,
    ``bbox_train_meta.csv`` and ``bbox_val_meta.csv``.

    Raises:
        FileNotFoundError: If no tile directory containing the label file is found.
        ValueError: If a tile's channel count differs from ``IMAGE_FILE_NAMES``.
    """
    ap = argparse.ArgumentParser()
    ap.add_argument("--config", default="config/defaults.yaml")
    # Same override flag as the config module's own CLI.
    ap.add_argument("--set", action="append", dest="sets", default=[],
                    help="override like key.sub=val")
    args = ap.parse_args()

    cfg = load_config(args.config, args.sets)
    dc = cfg["data"]

    raw_root = Path(dc["raw_root"])
    out_root = Path(dc["processed_root"])
    out_root.mkdir(parents=True, exist_ok=True)
    Hb, Wb = dc["input_size"]["bbox"]
    label_binary_name = dc.get("label_binary_name", "labelbinary.tif")
    max_boxes = int(dc.get("max_boxes", 10))

    nc = dc["normalize"]
    do_norm = bool(nc.get("enabled", True))
    mean, std = float(nc.get("mean", 0.5)), float(nc.get("std", 0.25))
    cmin, cmax = float(nc.get("clip_min", 0.0)), float(nc.get("clip_max", 1.0))
    sp = dc["split"]
    train_ratio, seed = float(sp["train_ratio"]), int(sp["seed"])

    tiles = list_sample_dirs(raw_root, label_binary_name)
    # Explicit raise instead of assert: asserts vanish under `python -O`.
    if not tiles:
        raise FileNotFoundError(f"No tiles under {raw_root} (looked for '{label_binary_name}')")

    # Seeded shuffle so the train/val split is reproducible across runs.
    rng = np.random.default_rng(seed)
    idx = np.arange(len(tiles))
    rng.shuffle(idx)
    ntr = int(len(idx) * train_ratio)
    train_idx, val_idx = idx[:ntr], idx[ntr:]

    image_names = tuple(IMAGE_FILE_NAMES)
    n_channels = len(image_names)
    pad_row = (0.0, 0.0, 0.0, 0.0, 0.0)

    def process(indices):
        """Load, normalize, resize and box every tile in ``indices``."""
        imgs, boxes, dirs, meta_rows = [], [], [], []
        for tid in indices:
            tdir = tiles[tid]
            tdir_str = tdir.as_posix()
            try:
                img_hwc, mask_hw1 = load_image_set(tdir_str, image_names)
            except Exception as e:
                # Deliberate best-effort: one unreadable tile must not abort the build.
                print(f"[warn] skip {tdir}: {e}")
                continue

            H0, W0, C0 = img_hwc.shape
            if C0 != n_channels:
                raise ValueError(f"expected {n_channels} channels, got {C0} @ {tdir}")

            x = to_chw(img_hwc)
            # Result unused: kept only as a sanity pass over the mask; the
            # boxes below are computed from the raw mask by the helper.
            mask_to_chw(mask_hw1)

            minmax01_inplace(x)
            if do_norm:
                standardize_clip_inplace(x, mean, std, cmin, cmax)

            x_resized = resize_chw(x, (Hb, Wb))

            # NOTE(review): boxes come from the original-resolution mask while
            # the image is resized to (Hb, Wb) -- confirm extract_bounding_boxes
            # returns resolution-independent coordinates.
            found = extract_bounding_boxes(mask_hw1.squeeze().astype(np.uint8),
                                           num_boxes=max_boxes, force_square=False)
            # Copy before truncating/padding so the helper's return value is
            # never mutated and any sequence type is accepted.
            rows = list(found)[:max_boxes]
            rows += [pad_row] * (max_boxes - len(rows))
            boxes5 = np.array(rows, dtype=np.float32)

            imgs.append(x_resized)
            boxes.append(boxes5)
            dirs.append(tdir_str)
            # num_boxes counts rows whose first column is positive; padding
            # rows are all zero. Assumes real boxes have a positive first
            # column -- TODO confirm against extract_bounding_boxes.
            meta_rows.append({"tile_idx": len(imgs)-1, "tile_dir": tdir_str,
                              "H0": H0, "W0": W0, "num_boxes": int((boxes5[:, 0] > 0).sum())})
        return imgs, boxes, dirs, pd.DataFrame(meta_rows)

    tr_imgs, tr_boxes, tr_dirs, tr_meta = process(train_idx)
    va_imgs, va_boxes, va_dirs, va_meta = process(val_idx)

    # write_h5_bbox returns without creating a file for an empty split; say so
    # rather than letting the final "[ok]" line imply both shards exist.
    for split_name, split_imgs in (("train", tr_imgs), ("val", va_imgs)):
        if not split_imgs:
            print(f"[warn] {split_name} split has no usable tiles; "
                  f"bbox_{split_name}.h5 will not be written")

    write_h5_bbox(out_root / "bbox_train.h5", tr_imgs, tr_boxes, tr_dirs)
    write_h5_bbox(out_root / "bbox_val.h5", va_imgs, va_boxes, va_dirs)
    tr_meta.to_csv(out_root / "bbox_train_meta.csv", index=False)
    va_meta.to_csv(out_root / "bbox_val_meta.csv", index=False)
    print("[ok] Wrote bbox HDF5 + sidecars to", out_root)

# Run the dataset build only when this file is executed as a script, not on import.
if __name__ == "__main__":
    main()