
Commit 793d714

computer vision project improvements (#117)

* computer vision project improvements: added examples of how to use AWS/EKS/S3 instead of GCP, and fixed an issue where the train_model step was failing to save the model's new weights after training.
* Update end-to-end-computer-vision/steps/train_model.py
* add clarity to commented-out config parameter
* added missing volume_mounts configuration inside the AWS example config file

Co-authored-by: Alexej Penner <[email protected]>
1 parent 59182fa · commit 793d714

13 files changed · +105 −24 lines changed

end-to-end-computer-vision/.dockerignore (+2 −1)

```diff
@@ -3,4 +3,5 @@
 **/tmp*
 data/*
 *.jpg
-*.pt
+*.pt
+notebooks/*
```

end-to-end-computer-vision/.gitignore (+3 −1)

```diff
@@ -1,6 +1,8 @@
 *.pt
+data/*
 !data/.gitkeep
-data/
+images
+loaded-images
 runs/
 **/tmp*
 runs_dir
```

end-to-end-computer-vision/configs/inference_pipeline.yaml (+1 −1)

```diff
@@ -8,7 +8,7 @@ settings:
       - libxext6
       - libcurl4
     required_integrations:
-      - gcp
+      - gcp # For AWS use "s3" instead of "gcp"
      - github
    requirements:
      - ultralytics
```

end-to-end-computer-vision/configs/ingest_data.yaml (+2 −2)

```diff
@@ -1,4 +1,3 @@
-
 steps:
   process_hf_dataset:
     enable_cache: True
@@ -9,4 +8,5 @@ steps:
   upload_labels_to_label_studio:
     enable_cache: False
     parameters:
-      ls_project_id: 8
+      ls_project_id: 1
+      storage_type: gcp # For AWS use "s3" instead of "gcp" (the default value)
```
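The `storage_type` parameter tells the upload step which cloud-storage backend the Label Studio project syncs tasks from; the inline comment documents `gcp` and `s3` as the options used in this example. `ls_project_id` must match the numeric project ID shown in your Label Studio instance.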
end-to-end-computer-vision/configs/ (new file: AWS example training config; filename not captured in this view) (+72)

```diff
@@ -0,0 +1,72 @@
+parameters:
+  model_checkpoint: yolov8l.pt
+
+settings:
+  docker:
+    apt_packages:
+      - ffmpeg
+      - libsm6
+      - libxext6
+    required_integrations:
+      - s3
+      - github
+    requirements:
+      - ultralytics
+
+steps:
+  load_model:
+    enable_cache: True
+  train_model:
+    enable_cache: False
+    enable_step_logs: False
+    parameters:
+      data_source: "s3://<bucket-name>/<source-dir>" # Insert your bucket path here where the training images live, e.g. "s3://foo/bar"
+      batch_size: 8
+      imgsz: 736
+      epochs: 300
+      is_single_gpu_env: True
+    settings:
+      orchestrator.kubernetes:
+        pod_settings:
+          affinity:
+            nodeAffinity:
+              requiredDuringSchedulingIgnoredDuringExecution:
+                nodeSelectorTerms:
+                  - matchExpressions:
+                      - key: eks.amazonaws.com/nodegroup
+                        operator: In
+                        values:
+                          - aws-nvidia-single-gpu-nodes
+          annotations:
+            cluster-autoscaler.kubernetes.io/safe-to-evict: "false"
+          tolerations:
+            - key: "eks.amazonaws.com/nodegroup"
+              operator: "Equal"
+              value: ""
+              effect: "NoSchedule"
+          volumes:
+            - emptyDir:
+                medium: Memory
+                sizeLimit: 1024Mi
+              name: dshm
+          volume_mounts:
+            - mountPath: /dev/shm
+              name: dshm
+      docker:
+        parent_image: pytorch/pytorch:2.2.0-cuda11.8-cudnn8-runtime
+        required_integrations:
+          - s3
+          - github
+        requirements:
+          - ultralytics
+          - zenml
+          - numpy
+
+
+# configuration of the Model Control Plane
+model:
+  name: ShipDetector
+  license: Apache 2.0
+  description: Object Detection Model.
+  tags: ["object detection"]
+  version: staging
```
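The `volumes`/`volume_mounts` pair is the piece this commit adds: PyTorch DataLoader workers exchange batches through `/dev/shm`, which Kubernetes caps at 64Mi by default, so the config mounts a memory-backed `emptyDir` there. If you prefer configuring this in code rather than YAML, ZenML's Kubernetes orchestrator accepts the same pod settings; the following is a minimal sketch, where the class and field names follow the pattern in the ZenML Kubernetes-orchestrator docs and should be verified against your ZenML version:

```python
# Sketch: the pod_settings from the YAML above, expressed in Python.
# Assumes zenml with the kubernetes integration installed; names are
# taken from the ZenML docs and may differ across versions.
from zenml import pipeline
from zenml.integrations.kubernetes.flavors import KubernetesOrchestratorSettings

k8s_settings = KubernetesOrchestratorSettings(
    pod_settings={
        "annotations": {
            "cluster-autoscaler.kubernetes.io/safe-to-evict": "false"
        },
        "tolerations": [
            {
                "key": "eks.amazonaws.com/nodegroup",
                "operator": "Equal",
                "value": "",
                "effect": "NoSchedule",
            }
        ],
        # Memory-backed /dev/shm so DataLoader workers are not limited
        # to the 64Mi container default.
        "volumes": [
            {"name": "dshm", "emptyDir": {"medium": "Memory", "sizeLimit": "1024Mi"}}
        ],
        "volume_mounts": [{"name": "dshm", "mountPath": "/dev/shm"}],
    }
)

@pipeline(settings={"orchestrator.kubernetes": k8s_settings})
def training_pipeline() -> None:
    ...
```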
(unnamed directory .gitignore; filename not captured in this view) (+2 −1)

```diff
@@ -1,4 +1,5 @@
 # Ignore everything in this directory
 *
 # Except this file
-!.gitignore
+!.gitignore
+!README.md
```

end-to-end-computer-vision/materializers/ultralytics_materializer.py (+2 −5)

```diff
@@ -61,9 +61,6 @@ def save(self, model: YOLO) -> None:
             model: A ultralytics YOLO model.
         """
         filepath = os.path.join(self.uri, DEFAULT_FILENAME)
+        modelpath = "runs/detect/train/weights/best.pt"
 
-        # Make a temporary phantom artifact
-        with tempfile.NamedTemporaryFile(mode="w", suffix=".json") as f:
-            model.save(f.name)
-            # Copy it into artifact store
-            fileio.copy(f.name, filepath)
+        fileio.copy(modelpath, filepath)
```
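This is the weight-saving fix called out in the commit message: the removed code saved the model through a temporary file, which did not capture the newly trained weights, so the materializer now copies the `best.pt` checkpoint that the Ultralytics trainer writes to disk. The path is hardcoded to the default run directory; a slightly more defensive variant (a sketch, assuming Ultralytics populates `model.trainer.best` after `train()`, and reusing this module's imports) could resolve it from the trainer:

```python
# Sketch: resolve the checkpoint path from the trainer instead of hardcoding
# the default run directory. Assumes model.train() already ran, so
# model.trainer.best points at the best checkpoint on disk.
def save(self, model: YOLO) -> None:
    """Copies the best trained checkpoint into the artifact store."""
    filepath = os.path.join(self.uri, DEFAULT_FILENAME)
    best = getattr(model.trainer, "best", None) if model.trainer else None
    modelpath = str(best) if best else "runs/detect/train/weights/best.pt"
    fileio.copy(modelpath, filepath)
```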

end-to-end-computer-vision/materializers/yolo_materializer.py (+2 −5)

```diff
@@ -61,9 +61,6 @@ def save(self, model: YOLO) -> None:
             model: A ultralytics YOLO model.
         """
         filepath = os.path.join(self.uri, DEFAULT_FILENAME)
+        modelpath = "runs/detect/train/weights/best.pt"
 
-        # Make a temporary phantom artifact
-        with tempfile.NamedTemporaryFile(mode="w", suffix=".json") as f:
-            model.save(f.name)
-            # Copy it into artifact store
-            fileio.copy(f.name, filepath)
+        fileio.copy(modelpath, filepath)
```

end-to-end-computer-vision/pipelines/training.py (+1 −1)

```diff
@@ -37,7 +37,7 @@ def training_pipeline(model_checkpoint: str = "yolov8l.pt"):
     mv = get_pipeline_context().model
     dataset = mv.get_artifact(LABELED_DATASET_NAME)
 
-    trained_model, metrics = train_model(
+    trained_model, metrics, names = train_model(
         model=model,
         dataset=dataset,
     )
```
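The unpacking here matches the three outputs that `train_model` now returns (see the step diff further down): the trained model, the validation metrics, and the class-name mapping.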

end-to-end-computer-vision/requirements.txt (+1 −1)

```diff
@@ -10,5 +10,5 @@ huggingface_hub>=0.20.0
 fiftyone
 datasets
 albumentations
-pillow>=10.*.*
+pillow>=10.0.0
 dill
```
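`pillow>=10.*.*` is not a valid PEP 440 version specifier (wildcards are only permitted with `==` and `!=`), so pip rejects the requirements file; `pillow>=10.0.0` expresses the intended minimum version correctly.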

end-to-end-computer-vision/steps/export_label_studio.py (+1 −1)

```diff
@@ -58,7 +58,7 @@ def load_data_from_label_studio(
             "This step can only be used with the Label Studio annotator."
         )
 
-    if annotator and annotator._connection_available():
+    if annotator:
         try:
             dataset = annotator.get_dataset(dataset_name=dataset_name)
             ls_dataset = LabelStudioAnnotationExport()
```
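`_connection_available()` is a private annotator method; with the check removed, an unreachable Label Studio instance now surfaces through the `try` block around `get_dataset` instead of being silently skipped.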

end-to-end-computer-vision/steps/fiftyone_inference.py (+1 −1)

```diff
@@ -33,7 +33,7 @@
 
 os.environ["YOLO_VERBOSE"] = "False"
 
-INFERENCE_BATCH = 5
+INFERENCE_BATCH = 20
 
 
 @step
```

end-to-end-computer-vision/steps/train_model.py (+15 −4)

```diff
@@ -41,12 +41,14 @@ def train_model(
     batch_size: int = 16,
     imgsz: int = 640,
     is_quad_gpu_env: bool = False,
+    is_single_gpu_env: bool = False,
     is_apple_silicon_env: bool = False,
 ) -> Tuple[
     Annotated[
         YOLO, ArtifactConfig(name="Trained_YOLO", is_model_artifact=True)
     ],
     Annotated[Dict[str, Any], "validation_metrics"],
+    Annotated[Dict[str, Any], "model_names"],
 ]:
     """Trains a model on a dataset.
 
@@ -58,6 +60,7 @@ def train_model(
         dataset: Dataset to train the model on.
         data_source: Source where the data lives
         is_quad_gpu_env: Whether we are in an env with 4 gpus
+        is_single_gpu_env: Whether we are in an env with a single gpu
         is_apple_silicon_env: In case we are running on Apple compute
 
     Returns:
@@ -77,6 +80,14 @@ def train_model(
             imgsz=imgsz,
             device=[0, 1, 2, 3],
         )
+    elif is_single_gpu_env:
+        model.train(
+            data=data_path,
+            epochs=epochs,
+            batch=batch_size,
+            imgsz=imgsz,
+            device=[0],
+        )
     elif is_apple_silicon_env:
         model.train(
             data=data_path,
@@ -95,10 +106,10 @@ def train_model(
 
     logger.info("Evaluating model...")
     metrics = model.val()  # evaluate model performance on the validation set
-
+
     log_artifact_metadata(
         artifact_name="Trained_YOLO",
-        metadata={"metrics": metrics.results_dict},
+        metadata={"metrics": metrics.results_dict, "names": model.names},
     )
-
-    return model, metrics.results_dict
+
+    return model, metrics.results_dict, model.names
```
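The new `is_single_gpu_env` branch pins training to `device=[0]`, which keeps Ultralytics from attempting multi-GPU DDP on a single-GPU node such as the `aws-nvidia-single-gpu-nodes` group targeted by the new config. Stripped of the ZenML step machinery, the same call sequence looks like this (a minimal sketch, assuming an `ultralytics` install, one CUDA device, and a local dataset file; `data.yaml` is a placeholder name):

```python
from ultralytics import YOLO

# Fine-tune on a single GPU, mirroring the parameters in the AWS config.
model = YOLO("yolov8l.pt")
model.train(data="data.yaml", epochs=300, batch=8, imgsz=736, device=[0])

# Evaluate on the validation split and collect what the step now returns.
metrics = model.val()
print(metrics.results_dict)  # validation metrics keyed by name
print(model.names)           # class-index -> class-name mapping
```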
