
Commit 793d714

computer vision project improvements (#117)

* computer vision project improvements: added examples of how to use AWS/EKS/S3 instead of GCP, and fixed an issue where the train_model step was failing to save the model's new weights after training.
* Update end-to-end-computer-vision/steps/train_model.py
* add clarity to commented-out config parameter
* added missing volume_mounts configuration inside the AWS example config file

Co-authored-by: Alexej Penner <[email protected]>
1 parent 59182fa · commit 793d714

13 files changed · +105 −24 lines changed

end-to-end-computer-vision/.dockerignore (+2 −1)

```diff
@@ -3,4 +3,5 @@
 **/tmp*
 data/*
 *.jpg
-*.pt
+*.pt
+notebooks/*
```

end-to-end-computer-vision/.gitignore (+3 −1)

```diff
@@ -1,6 +1,8 @@
 *.pt
+data/*
 !data/.gitkeep
-data/
+images
+loaded-images
 runs/
 **/tmp*
 runs_dir
```

end-to-end-computer-vision/configs/inference_pipeline.yaml (+1 −1)

```diff
@@ -8,7 +8,7 @@ settings:
       - libxext6
       - libcurl4
     required_integrations:
-      - gcp
+      - gcp # For AWS use "s3" instead of "gcp"
      - github
    requirements:
      - ultralytics
```

end-to-end-computer-vision/configs/ingest_data.yaml (+2 −2)

```diff
@@ -1,4 +1,3 @@
-
 steps:
   process_hf_dataset:
     enable_cache: True
@@ -9,4 +8,5 @@ steps:
   upload_labels_to_label_studio:
     enable_cache: False
     parameters:
-      ls_project_id: 8
+      ls_project_id: 1
+      storage_type: gcp # For AWS use "s3" instead of "gcp" (the default value)
```
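The `storage_type` parameter tells the upload step which cloud-storage backend the Label Studio project syncs tasks from; the inline comment documents `gcp` and `s3` as the options used in this example. `ls_project_id` must match the numeric project ID shown in your Label Studio instance.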
end-to-end-computer-vision/configs/ (new file: AWS example training config; filename not captured in this view) (+72)

```diff
@@ -0,0 +1,72 @@
+parameters:
+  model_checkpoint: yolov8l.pt
+
+settings:
+  docker:
+    apt_packages:
+      - ffmpeg
+      - libsm6
+      - libxext6
+    required_integrations:
+      - s3
+      - github
+    requirements:
+      - ultralytics
+
+steps:
+  load_model:
+    enable_cache: True
+  train_model:
+    enable_cache: False
+    enable_step_logs: False
+    parameters:
+      data_source: "s3://<bucket-name>/<source-dir>" # Insert your bucket path here where the training images live, e.g. "s3://foo/bar"
+      batch_size: 8
+      imgsz: 736
+      epochs: 300
+      is_single_gpu_env: True
+    settings:
+      orchestrator.kubernetes:
+        pod_settings:
+          affinity:
+            nodeAffinity:
+              requiredDuringSchedulingIgnoredDuringExecution:
+                nodeSelectorTerms:
+                  - matchExpressions:
+                      - key: eks.amazonaws.com/nodegroup
+                        operator: In
+                        values:
+                          - aws-nvidia-single-gpu-nodes
+          annotations:
+            cluster-autoscaler.kubernetes.io/safe-to-evict: "false"
+          tolerations:
+            - key: "eks.amazonaws.com/nodegroup"
+              operator: "Equal"
+              value: ""
+              effect: "NoSchedule"
+          volumes:
+            - emptyDir:
+                medium: Memory
+                sizeLimit: 1024Mi
+              name: dshm
+          volume_mounts:
+            - mountPath: /dev/shm
+              name: dshm
+      docker:
+        parent_image: pytorch/pytorch:2.2.0-cuda11.8-cudnn8-runtime
+        required_integrations:
+          - s3
+          - github
+        requirements:
+          - ultralytics
+          - zenml
+          - numpy
+
+
+# configuration of the Model Control Plane
+model:
+  name: ShipDetector
+  license: Apache 2.0
+  description: Object Detection Model.
+  tags: ["object detection"]
+  version: staging
```
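The `volumes`/`volume_mounts` pair is the piece this commit adds: PyTorch DataLoader workers exchange batches through `/dev/shm`, which Kubernetes caps at 64Mi by default, so the config mounts a memory-backed `emptyDir` there. If you prefer configuring this in code rather than YAML, ZenML's Kubernetes orchestrator accepts the same pod settings; the following is a minimal sketch, where the class and field names follow the pattern in the ZenML Kubernetes-orchestrator docs and should be verified against your ZenML version:

```python
# Sketch: the pod_settings from the YAML above, expressed in Python.
# Assumes zenml with the kubernetes integration installed; names are
# taken from the ZenML docs and may differ across versions.
from zenml import pipeline
from zenml.integrations.kubernetes.flavors import KubernetesOrchestratorSettings

k8s_settings = KubernetesOrchestratorSettings(
    pod_settings={
        "annotations": {
            "cluster-autoscaler.kubernetes.io/safe-to-evict": "false"
        },
        "tolerations": [
            {
                "key": "eks.amazonaws.com/nodegroup",
                "operator": "Equal",
                "value": "",
                "effect": "NoSchedule",
            }
        ],
        # Memory-backed /dev/shm so DataLoader workers are not limited
        # to the 64Mi container default.
        "volumes": [
            {"name": "dshm", "emptyDir": {"medium": "Memory", "sizeLimit": "1024Mi"}}
        ],
        "volume_mounts": [{"name": "dshm", "mountPath": "/dev/shm"}],
    }
)

@pipeline(settings={"orchestrator.kubernetes": k8s_settings})
def training_pipeline() -> None:
    ...
```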
(unnamed directory .gitignore; filename not captured in this view) (+2 −1)

```diff
@@ -1,4 +1,5 @@
 # Ignore everything in this directory
 *
 # Except this file
-!.gitignore
+!.gitignore
+!README.md
```

end-to-end-computer-vision/materializers/ultralytics_materializer.py (+2 −5)

```diff
@@ -61,9 +61,6 @@ def save(self, model: YOLO) -> None:
             model: A ultralytics YOLO model.
         """
         filepath = os.path.join(self.uri, DEFAULT_FILENAME)
+        modelpath = "runs/detect/train/weights/best.pt"
 
-        # Make a temporary phantom artifact
-        with tempfile.NamedTemporaryFile(mode="w", suffix=".json") as f:
-            model.save(f.name)
-            # Copy it into artifact store
-            fileio.copy(f.name, filepath)
+        fileio.copy(modelpath, filepath)
```
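This is the weight-saving fix called out in the commit message: the removed code saved the model through a temporary file, which did not capture the newly trained weights, so the materializer now copies the `best.pt` checkpoint that the Ultralytics trainer writes to disk. The path is hardcoded to the default run directory; a slightly more defensive variant (a sketch, assuming Ultralytics populates `model.trainer.best` after `train()`, and reusing this module's imports) could resolve it from the trainer:

```python
# Sketch: resolve the checkpoint path from the trainer instead of hardcoding
# the default run directory. Assumes model.train() already ran, so
# model.trainer.best points at the best checkpoint on disk.
def save(self, model: YOLO) -> None:
    """Copies the best trained checkpoint into the artifact store."""
    filepath = os.path.join(self.uri, DEFAULT_FILENAME)
    best = getattr(model.trainer, "best", None) if model.trainer else None
    modelpath = str(best) if best else "runs/detect/train/weights/best.pt"
    fileio.copy(modelpath, filepath)
```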

end-to-end-computer-vision/materializers/yolo_materializer.py (+2 −5)

```diff
@@ -61,9 +61,6 @@ def save(self, model: YOLO) -> None:
             model: A ultralytics YOLO model.
         """
         filepath = os.path.join(self.uri, DEFAULT_FILENAME)
+        modelpath = "runs/detect/train/weights/best.pt"
 
-        # Make a temporary phantom artifact
-        with tempfile.NamedTemporaryFile(mode="w", suffix=".json") as f:
-            model.save(f.name)
-            # Copy it into artifact store
-            fileio.copy(f.name, filepath)
+        fileio.copy(modelpath, filepath)
```

end-to-end-computer-vision/pipelines/training.py (+1 −1)

```diff
@@ -37,7 +37,7 @@ def training_pipeline(model_checkpoint: str = "yolov8l.pt"):
     mv = get_pipeline_context().model
     dataset = mv.get_artifact(LABELED_DATASET_NAME)
 
-    trained_model, metrics = train_model(
+    trained_model, metrics, names = train_model(
         model=model,
         dataset=dataset,
     )
```
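The unpacking here matches the three outputs that `train_model` now returns (see the step diff further down): the trained model, the validation metrics, and the class-name mapping.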

end-to-end-computer-vision/requirements.txt (+1 −1)

```diff
@@ -10,5 +10,5 @@ huggingface_hub>=0.20.0
 fiftyone
 datasets
 albumentations
-pillow>=10.*.*
+pillow>=10.0.0
 dill
```
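`pillow>=10.*.*` is not a valid PEP 440 version specifier (wildcards are only permitted with `==` and `!=`), so pip rejects the requirements file; `pillow>=10.0.0` expresses the intended minimum version correctly.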

end-to-end-computer-vision/steps/export_label_studio.py (+1 −1)

```diff
@@ -58,7 +58,7 @@ def load_data_from_label_studio(
             "This step can only be used with the Label Studio annotator."
         )
 
-    if annotator and annotator._connection_available():
+    if annotator:
         try:
             dataset = annotator.get_dataset(dataset_name=dataset_name)
             ls_dataset = LabelStudioAnnotationExport()
```
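`_connection_available()` is a private annotator method; with the check removed, an unreachable Label Studio instance now surfaces through the `try` block around `get_dataset` instead of being silently skipped.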

end-to-end-computer-vision/steps/fiftyone_inference.py (+1 −1)

```diff
@@ -33,7 +33,7 @@
 
 os.environ["YOLO_VERBOSE"] = "False"
 
-INFERENCE_BATCH = 5
+INFERENCE_BATCH = 20
 
 
 @step
```

end-to-end-computer-vision/steps/train_model.py (+15 −4)

```diff
@@ -41,12 +41,14 @@ def train_model(
     batch_size: int = 16,
     imgsz: int = 640,
     is_quad_gpu_env: bool = False,
+    is_single_gpu_env: bool = False,
     is_apple_silicon_env: bool = False,
 ) -> Tuple[
     Annotated[
         YOLO, ArtifactConfig(name="Trained_YOLO", is_model_artifact=True)
     ],
     Annotated[Dict[str, Any], "validation_metrics"],
+    Annotated[Dict[str, Any], "model_names"],
 ]:
     """Trains a model on a dataset.
 
@@ -58,6 +60,7 @@ def train_model(
         dataset: Dataset to train the model on.
         data_source: Source where the data lives
         is_quad_gpu_env: Whether we are in an env with 4 gpus
+        is_single_gpu_env: Whether we are in an env with a single gpu
         is_apple_silicon_env: In case we are running on Apple compute
 
     Returns:
@@ -77,6 +80,14 @@ def train_model(
             imgsz=imgsz,
             device=[0, 1, 2, 3],
         )
+    elif is_single_gpu_env:
+        model.train(
+            data=data_path,
+            epochs=epochs,
+            batch=batch_size,
+            imgsz=imgsz,
+            device=[0],
+        )
     elif is_apple_silicon_env:
         model.train(
             data=data_path,
@@ -95,10 +106,10 @@ def train_model(
 
     logger.info("Evaluating model...")
     metrics = model.val()  # evaluate model performance on the validation set
-
+
     log_artifact_metadata(
         artifact_name="Trained_YOLO",
-        metadata={"metrics": metrics.results_dict},
+        metadata={"metrics": metrics.results_dict, "names": model.names},
     )
-
-    return model, metrics.results_dict
+
+    return model, metrics.results_dict, model.names
```
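The new `is_single_gpu_env` branch pins training to `device=[0]`, which keeps Ultralytics from attempting multi-GPU DDP on a single-GPU node such as the `aws-nvidia-single-gpu-nodes` group targeted by the new config. Stripped of the ZenML step machinery, the same call sequence looks like this (a minimal sketch, assuming an `ultralytics` install, one CUDA device, and a local dataset file; `data.yaml` is a placeholder name):

```python
from ultralytics import YOLO

# Fine-tune on a single GPU, mirroring the parameters in the AWS config.
model = YOLO("yolov8l.pt")
model.train(data="data.yaml", epochs=300, batch=8, imgsz=736, device=[0])

# Evaluate on the validation split and collect what the step now returns.
metrics = model.val()
print(metrics.results_dict)  # validation metrics keyed by name
print(model.names)           # class-index -> class-name mapping
```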
