diff --git a/keras_ssd/.DS_Store b/keras_ssd/.DS_Store new file mode 100644 index 0000000..fcc3484 Binary files /dev/null and b/keras_ssd/.DS_Store differ diff --git a/keras_ssd/.gitattributes b/keras_ssd/.gitattributes new file mode 100755 index 0000000..f4c7e5f --- /dev/null +++ b/keras_ssd/.gitattributes @@ -0,0 +1 @@ +*.ipynb linguist-language=Python diff --git a/keras_ssd/.github/stale.yml b/keras_ssd/.github/stale.yml new file mode 100644 index 0000000..73cb6b9 --- /dev/null +++ b/keras_ssd/.github/stale.yml @@ -0,0 +1,24 @@ +# Configuration for probot-stale - https://github.com/probot/stale + +# Number of days of inactivity before an Issue or Pull Request becomes stale +daysUntilStale: 7 +# Number of days of inactivity before a stale Issue or Pull Request is closed +daysUntilClose: 7 +# Issues or Pull Requests with these labels will never be considered stale. Set to `[]` to disable +exemptLabels: + - pinned + - security + - "[Status] Maybe Later" +# Label to use when marking as stale +staleLabel: stale +# Comment to post when marking as stale. Set to `false` to disable +markComment: > + This issue has been automatically marked as stale because it has not had + recent activity. It will be closed if no further activity occurs. Thank you + for your contributions. +# Comment to post when removing the stale label. Set to `false` to disable +unmarkComment: false +# Comment to post when closing a stale Issue or Pull Request. Set to `false` to disable +closeComment: false +# Limit to only `issues` or `pulls` +# only: issues diff --git a/keras_ssd/.gitignore b/keras_ssd/.gitignore new file mode 100755 index 0000000..9531469 --- /dev/null +++ b/keras_ssd/.gitignore @@ -0,0 +1,98 @@ +# Byte-compiled / optimized / DLL files +__pycache__/ +*.py[cod] +*$py.class + +# C extensions +*.so + +# Distribution / packaging +.Python +env/ +build/ +develop-eggs/ +dist/ +downloads/ +eggs/ +.eggs/ +lib/ +lib64/ +parts/ +sdist/ +var/ +wheels/ +*.egg-info/ +.installed.cfg +*.egg + +# PyInstaller +# Usually these files are written by a python script from a template +# before PyInstaller builds the exe, so as to inject date/other infos into it. 
+*.manifest +*.spec + +# Installer logs +pip-log.txt +pip-delete-this-directory.txt + +# Unit test / coverage reports +htmlcov/ +.tox/ +.coverage +.coverage.* +.cache +nosetests.xml +coverage.xml +*,cover +.hypothesis/ + +# Translations +*.mo +*.pot + +# Django stuff: +*.log +local_settings.py + +# Flask stuff: +instance/ +.webassets-cache + +# Scrapy stuff: +.scrapy + +# Sphinx documentation +docs/_build/ + +# PyBuilder +target/ + +# Jupyter Notebook +.ipynb_checkpoints +.ipynb_checkpoints/ + +# pyenv +.python-version + +# celery beat schedule file +celerybeat-schedule + +# SageMath parsed files +*.sage.py + +# dotenv +.env + +# virtualenv +.venv +venv/ +ENV/ + +# Spyder project settings +.spyderproject + +# Rope project settings +.ropeproject + +# Ignore any files and directories that begin with the word "local" +local* diff --git a/keras_ssd/__init__.py b/keras_ssd/__init__.py new file mode 100644 index 0000000..e69de29 diff --git a/keras_ssd/bounding_box_utils/__init__.py b/keras_ssd/bounding_box_utils/__init__.py new file mode 100644 index 0000000..e69de29 diff --git a/keras_ssd/bounding_box_utils/bounding_box_utils.py b/keras_ssd/bounding_box_utils/bounding_box_utils.py new file mode 100644 index 0000000..36ce3dc --- /dev/null +++ b/keras_ssd/bounding_box_utils/bounding_box_utils.py @@ -0,0 +1,383 @@ +''' +Includes: +* Function to compute the IoU similarity for axis-aligned, rectangular, 2D bounding boxes +* Function for coordinate conversion for axis-aligned, rectangular, 2D bounding boxes + +Copyright (C) 2018 Pierluigi Ferrari + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +''' + +from __future__ import division +import numpy as np + +def convert_coordinates(tensor, start_index, conversion, border_pixels='half'): + ''' + Convert coordinates for axis-aligned 2D boxes between two coordinate formats. + + Creates a copy of `tensor`, i.e. does not operate in place. Currently there are + three supported coordinate formats that can be converted from and to each other: + 1) (xmin, xmax, ymin, ymax) - the 'minmax' format + 2) (xmin, ymin, xmax, ymax) - the 'corners' format + 2) (cx, cy, w, h) - the 'centroids' format + + Arguments: + tensor (array): A Numpy nD array containing the four consecutive coordinates + to be converted somewhere in the last axis. + start_index (int): The index of the first coordinate in the last axis of `tensor`. + conversion (str, optional): The conversion direction. Can be 'minmax2centroids', + 'centroids2minmax', 'corners2centroids', 'centroids2corners', 'minmax2corners', + or 'corners2minmax'. + border_pixels (str, optional): How to treat the border pixels of the bounding boxes. + Can be 'include', 'exclude', or 'half'. If 'include', the border pixels belong + to the boxes. If 'exclude', the border pixels do not belong to the boxes. + If 'half', then one of each of the two horizontal and vertical borders belong + to the boxex, but not the other. 
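+
+    Illustrative example (values chosen for demonstration only), converting a single
+    box from the 'corners' to the 'centroids' format with the default `border_pixels='half'`:
+
+        box = np.array([10, 20, 50, 80])  # (xmin, ymin, xmax, ymax)
+        convert_coordinates(box, start_index=0, conversion='corners2centroids')
+        # -> array([30., 50., 40., 60.]), i.e. (cx, cy, w, h)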
+ + Returns: + A Numpy nD array, a copy of the input tensor with the converted coordinates + in place of the original coordinates and the unaltered elements of the original + tensor elsewhere. + ''' + if border_pixels == 'half': + d = 0 + elif border_pixels == 'include': + d = 1 + elif border_pixels == 'exclude': + d = -1 + + ind = start_index + tensor1 = np.copy(tensor).astype(np.float) + if conversion == 'minmax2centroids': + tensor1[..., ind] = (tensor[..., ind] + tensor[..., ind+1]) / 2.0 # Set cx + tensor1[..., ind+1] = (tensor[..., ind+2] + tensor[..., ind+3]) / 2.0 # Set cy + tensor1[..., ind+2] = tensor[..., ind+1] - tensor[..., ind] + d # Set w + tensor1[..., ind+3] = tensor[..., ind+3] - tensor[..., ind+2] + d # Set h + elif conversion == 'centroids2minmax': + tensor1[..., ind] = tensor[..., ind] - tensor[..., ind+2] / 2.0 # Set xmin + tensor1[..., ind+1] = tensor[..., ind] + tensor[..., ind+2] / 2.0 # Set xmax + tensor1[..., ind+2] = tensor[..., ind+1] - tensor[..., ind+3] / 2.0 # Set ymin + tensor1[..., ind+3] = tensor[..., ind+1] + tensor[..., ind+3] / 2.0 # Set ymax + elif conversion == 'corners2centroids': + tensor1[..., ind] = (tensor[..., ind] + tensor[..., ind+2]) / 2.0 # Set cx + tensor1[..., ind+1] = (tensor[..., ind+1] + tensor[..., ind+3]) / 2.0 # Set cy + tensor1[..., ind+2] = tensor[..., ind+2] - tensor[..., ind] + d # Set w + tensor1[..., ind+3] = tensor[..., ind+3] - tensor[..., ind+1] + d # Set h + elif conversion == 'centroids2corners': + tensor1[..., ind] = tensor[..., ind] - tensor[..., ind+2] / 2.0 # Set xmin + tensor1[..., ind+1] = tensor[..., ind+1] - tensor[..., ind+3] / 2.0 # Set ymin + tensor1[..., ind+2] = tensor[..., ind] + tensor[..., ind+2] / 2.0 # Set xmax + tensor1[..., ind+3] = tensor[..., ind+1] + tensor[..., ind+3] / 2.0 # Set ymax + elif (conversion == 'minmax2corners') or (conversion == 'corners2minmax'): + tensor1[..., ind+1] = tensor[..., ind+2] + tensor1[..., ind+2] = tensor[..., ind+1] + else: + raise ValueError("Unexpected conversion value. Supported values are 'minmax2centroids', 'centroids2minmax', 'corners2centroids', 'centroids2corners', 'minmax2corners', and 'corners2minmax'.") + + return tensor1 + +def convert_coordinates2(tensor, start_index, conversion): + ''' + A matrix multiplication implementation of `convert_coordinates()`. + Supports only conversion between the 'centroids' and 'minmax' formats. + + This function is marginally slower on average than `convert_coordinates()`, + probably because it involves more (unnecessary) arithmetic operations (unnecessary + because the two matrices are sparse). + + For details please refer to the documentation of `convert_coordinates()`. + ''' + ind = start_index + tensor1 = np.copy(tensor).astype(np.float) + if conversion == 'minmax2centroids': + M = np.array([[0.5, 0. , -1., 0.], + [0.5, 0. , 1., 0.], + [0. , 0.5, 0., -1.], + [0. , 0.5, 0., 1.]]) + tensor1[..., ind:ind+4] = np.dot(tensor1[..., ind:ind+4], M) + elif conversion == 'centroids2minmax': + M = np.array([[ 1. , 1. , 0. , 0. ], + [ 0. , 0. , 1. , 1. ], + [-0.5, 0.5, 0. , 0. ], + [ 0. , 0. , -0.5, 0.5]]) # The multiplicative inverse of the matrix above + tensor1[..., ind:ind+4] = np.dot(tensor1[..., ind:ind+4], M) + else: + raise ValueError("Unexpected conversion value. 
Supported values are 'minmax2centroids' and 'centroids2minmax'.") + + return tensor1 + +def intersection_area(boxes1, boxes2, coords='centroids', mode='outer_product', border_pixels='half'): + ''' + Computes the intersection areas of two sets of axis-aligned 2D rectangular boxes. + + Let `boxes1` and `boxes2` contain `m` and `n` boxes, respectively. + + In 'outer_product' mode, returns an `(m,n)` matrix with the intersection areas for all possible + combinations of the boxes in `boxes1` and `boxes2`. + + In 'element-wise' mode, `m` and `n` must be broadcast-compatible. Refer to the explanation + of the `mode` argument for details. + + Arguments: + boxes1 (array): Either a 1D Numpy array of shape `(4, )` containing the coordinates for one box in the + format specified by `coords` or a 2D Numpy array of shape `(m, 4)` containing the coordinates for `m` boxes. + If `mode` is set to 'element_wise', the shape must be broadcast-compatible with `boxes2`. + boxes2 (array): Either a 1D Numpy array of shape `(4, )` containing the coordinates for one box in the + format specified by `coords` or a 2D Numpy array of shape `(n, 4)` containing the coordinates for `n` boxes. + If `mode` is set to 'element_wise', the shape must be broadcast-compatible with `boxes1`. + coords (str, optional): The coordinate format in the input arrays. Can be either 'centroids' for the format + `(cx, cy, w, h)`, 'minmax' for the format `(xmin, xmax, ymin, ymax)`, or 'corners' for the format + `(xmin, ymin, xmax, ymax)`. + mode (str, optional): Can be one of 'outer_product' and 'element-wise'. In 'outer_product' mode, returns an + `(m,n)` matrix with the intersection areas for all possible combinations of the `m` boxes in `boxes1` with the + `n` boxes in `boxes2`. In 'element-wise' mode, returns a 1D array and the shapes of `boxes1` and `boxes2` + must be boadcast-compatible. If both `boxes1` and `boxes2` have `m` boxes, then this returns an array of + length `m` where the i-th position contains the intersection area of `boxes1[i]` with `boxes2[i]`. + border_pixels (str, optional): How to treat the border pixels of the bounding boxes. + Can be 'include', 'exclude', or 'half'. If 'include', the border pixels belong + to the boxes. If 'exclude', the border pixels do not belong to the boxes. + If 'half', then one of each of the two horizontal and vertical borders belong + to the boxex, but not the other. + + Returns: + A 1D or 2D Numpy array (refer to the `mode` argument for details) of dtype float containing values with + the intersection areas of the boxes in `boxes1` and `boxes2`. + ''' + + # Make sure the boxes have the right shapes. + if boxes1.ndim > 2: raise ValueError("boxes1 must have rank either 1 or 2, but has rank {}.".format(boxes1.ndim)) + if boxes2.ndim > 2: raise ValueError("boxes2 must have rank either 1 or 2, but has rank {}.".format(boxes2.ndim)) + + if boxes1.ndim == 1: boxes1 = np.expand_dims(boxes1, axis=0) + if boxes2.ndim == 1: boxes2 = np.expand_dims(boxes2, axis=0) + + if not (boxes1.shape[1] == boxes2.shape[1] == 4): raise ValueError("All boxes must consist of 4 coordinates, but the boxes in `boxes1` and `boxes2` have {} and {} coordinates, respectively.".format(boxes1.shape[1], boxes2.shape[1])) + if not mode in {'outer_product', 'element-wise'}: raise ValueError("`mode` must be one of 'outer_product' and 'element-wise', but got '{}'.",format(mode)) + + # Convert the coordinates if necessary. 
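+    # All computations further down assume either the 'corners' or the 'minmax' layout,
+    # so boxes given in the 'centroids' format are converted to 'corners' first.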
+ if coords == 'centroids': + boxes1 = convert_coordinates(boxes1, start_index=0, conversion='centroids2corners') + boxes2 = convert_coordinates(boxes2, start_index=0, conversion='centroids2corners') + coords = 'corners' + elif not (coords in {'minmax', 'corners'}): + raise ValueError("Unexpected value for `coords`. Supported values are 'minmax', 'corners' and 'centroids'.") + + m = boxes1.shape[0] # The number of boxes in `boxes1` + n = boxes2.shape[0] # The number of boxes in `boxes2` + + # Set the correct coordinate indices for the respective formats. + if coords == 'corners': + xmin = 0 + ymin = 1 + xmax = 2 + ymax = 3 + elif coords == 'minmax': + xmin = 0 + xmax = 1 + ymin = 2 + ymax = 3 + + if border_pixels == 'half': + d = 0 + elif border_pixels == 'include': + d = 1 # If border pixels are supposed to belong to the bounding boxes, we have to add one pixel to any difference `xmax - xmin` or `ymax - ymin`. + elif border_pixels == 'exclude': + d = -1 # If border pixels are not supposed to belong to the bounding boxes, we have to subtract one pixel from any difference `xmax - xmin` or `ymax - ymin`. + + # Compute the intersection areas. + + if mode == 'outer_product': + + # For all possible box combinations, get the greater xmin and ymin values. + # This is a tensor of shape (m,n,2). + min_xy = np.maximum(np.tile(np.expand_dims(boxes1[:,[xmin,ymin]], axis=1), reps=(1, n, 1)), + np.tile(np.expand_dims(boxes2[:,[xmin,ymin]], axis=0), reps=(m, 1, 1))) + + # For all possible box combinations, get the smaller xmax and ymax values. + # This is a tensor of shape (m,n,2). + max_xy = np.minimum(np.tile(np.expand_dims(boxes1[:,[xmax,ymax]], axis=1), reps=(1, n, 1)), + np.tile(np.expand_dims(boxes2[:,[xmax,ymax]], axis=0), reps=(m, 1, 1))) + + # Compute the side lengths of the intersection rectangles. + side_lengths = np.maximum(0, max_xy - min_xy + d) + + return side_lengths[:,:,0] * side_lengths[:,:,1] + + elif mode == 'element-wise': + + min_xy = np.maximum(boxes1[:,[xmin,ymin]], boxes2[:,[xmin,ymin]]) + max_xy = np.minimum(boxes1[:,[xmax,ymax]], boxes2[:,[xmax,ymax]]) + + # Compute the side lengths of the intersection rectangles. + side_lengths = np.maximum(0, max_xy - min_xy + d) + + return side_lengths[:,0] * side_lengths[:,1] + +def intersection_area_(boxes1, boxes2, coords='corners', mode='outer_product', border_pixels='half'): + ''' + The same as 'intersection_area()' but for internal use, i.e. without all the safety checks. + ''' + + m = boxes1.shape[0] # The number of boxes in `boxes1` + n = boxes2.shape[0] # The number of boxes in `boxes2` + + # Set the correct coordinate indices for the respective formats. + if coords == 'corners': + xmin = 0 + ymin = 1 + xmax = 2 + ymax = 3 + elif coords == 'minmax': + xmin = 0 + xmax = 1 + ymin = 2 + ymax = 3 + + if border_pixels == 'half': + d = 0 + elif border_pixels == 'include': + d = 1 # If border pixels are supposed to belong to the bounding boxes, we have to add one pixel to any difference `xmax - xmin` or `ymax - ymin`. + elif border_pixels == 'exclude': + d = -1 # If border pixels are not supposed to belong to the bounding boxes, we have to subtract one pixel from any difference `xmax - xmin` or `ymax - ymin`. + + # Compute the intersection areas. + + if mode == 'outer_product': + + # For all possible box combinations, get the greater xmin and ymin values. + # This is a tensor of shape (m,n,2). 
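+        # Expanding and tiling both arrays pairs every box in `boxes1` with every box in `boxes2`.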
+ min_xy = np.maximum(np.tile(np.expand_dims(boxes1[:,[xmin,ymin]], axis=1), reps=(1, n, 1)), + np.tile(np.expand_dims(boxes2[:,[xmin,ymin]], axis=0), reps=(m, 1, 1))) + + # For all possible box combinations, get the smaller xmax and ymax values. + # This is a tensor of shape (m,n,2). + max_xy = np.minimum(np.tile(np.expand_dims(boxes1[:,[xmax,ymax]], axis=1), reps=(1, n, 1)), + np.tile(np.expand_dims(boxes2[:,[xmax,ymax]], axis=0), reps=(m, 1, 1))) + + # Compute the side lengths of the intersection rectangles. + side_lengths = np.maximum(0, max_xy - min_xy + d) + + return side_lengths[:,:,0] * side_lengths[:,:,1] + + elif mode == 'element-wise': + + min_xy = np.maximum(boxes1[:,[xmin,ymin]], boxes2[:,[xmin,ymin]]) + max_xy = np.minimum(boxes1[:,[xmax,ymax]], boxes2[:,[xmax,ymax]]) + + # Compute the side lengths of the intersection rectangles. + side_lengths = np.maximum(0, max_xy - min_xy + d) + + return side_lengths[:,0] * side_lengths[:,1] + + +def iou(boxes1, boxes2, coords='centroids', mode='outer_product', border_pixels='half'): + ''' + Computes the intersection-over-union similarity (also known as Jaccard similarity) + of two sets of axis-aligned 2D rectangular boxes. + + Let `boxes1` and `boxes2` contain `m` and `n` boxes, respectively. + + In 'outer_product' mode, returns an `(m,n)` matrix with the IoUs for all possible + combinations of the boxes in `boxes1` and `boxes2`. + + In 'element-wise' mode, `m` and `n` must be broadcast-compatible. Refer to the explanation + of the `mode` argument for details. + + Arguments: + boxes1 (array): Either a 1D Numpy array of shape `(4, )` containing the coordinates for one box in the + format specified by `coords` or a 2D Numpy array of shape `(m, 4)` containing the coordinates for `m` boxes. + If `mode` is set to 'element_wise', the shape must be broadcast-compatible with `boxes2`. + boxes2 (array): Either a 1D Numpy array of shape `(4, )` containing the coordinates for one box in the + format specified by `coords` or a 2D Numpy array of shape `(n, 4)` containing the coordinates for `n` boxes. + If `mode` is set to 'element_wise', the shape must be broadcast-compatible with `boxes1`. + coords (str, optional): The coordinate format in the input arrays. Can be either 'centroids' for the format + `(cx, cy, w, h)`, 'minmax' for the format `(xmin, xmax, ymin, ymax)`, or 'corners' for the format + `(xmin, ymin, xmax, ymax)`. + mode (str, optional): Can be one of 'outer_product' and 'element-wise'. In 'outer_product' mode, returns an + `(m,n)` matrix with the IoU overlaps for all possible combinations of the `m` boxes in `boxes1` with the + `n` boxes in `boxes2`. In 'element-wise' mode, returns a 1D array and the shapes of `boxes1` and `boxes2` + must be boadcast-compatible. If both `boxes1` and `boxes2` have `m` boxes, then this returns an array of + length `m` where the i-th position contains the IoU overlap of `boxes1[i]` with `boxes2[i]`. + border_pixels (str, optional): How to treat the border pixels of the bounding boxes. + Can be 'include', 'exclude', or 'half'. If 'include', the border pixels belong + to the boxes. If 'exclude', the border pixels do not belong to the boxes. + If 'half', then one of each of the two horizontal and vertical borders belong + to the boxex, but not the other. + + Returns: + A 1D or 2D Numpy array (refer to the `mode` argument for details) of dtype float containing values in [0,1], + the Jaccard similarity of the boxes in `boxes1` and `boxes2`. 
0 means there is no overlap between two given + boxes, 1 means their coordinates are identical. + ''' + + # Make sure the boxes have the right shapes. + if boxes1.ndim > 2: raise ValueError("boxes1 must have rank either 1 or 2, but has rank {}.".format(boxes1.ndim)) + if boxes2.ndim > 2: raise ValueError("boxes2 must have rank either 1 or 2, but has rank {}.".format(boxes2.ndim)) + + if boxes1.ndim == 1: boxes1 = np.expand_dims(boxes1, axis=0) + if boxes2.ndim == 1: boxes2 = np.expand_dims(boxes2, axis=0) + + if not (boxes1.shape[1] == boxes2.shape[1] == 4): raise ValueError("All boxes must consist of 4 coordinates, but the boxes in `boxes1` and `boxes2` have {} and {} coordinates, respectively.".format(boxes1.shape[1], boxes2.shape[1])) + if not mode in {'outer_product', 'element-wise'}: raise ValueError("`mode` must be one of 'outer_product' and 'element-wise', but got '{}'.".format(mode)) + + # Convert the coordinates if necessary. + if coords == 'centroids': + boxes1 = convert_coordinates(boxes1, start_index=0, conversion='centroids2corners') + boxes2 = convert_coordinates(boxes2, start_index=0, conversion='centroids2corners') + coords = 'corners' + elif not (coords in {'minmax', 'corners'}): + raise ValueError("Unexpected value for `coords`. Supported values are 'minmax', 'corners' and 'centroids'.") + + # Compute the IoU. + + # Compute the interesection areas. + + intersection_areas = intersection_area_(boxes1, boxes2, coords=coords, mode=mode) + + m = boxes1.shape[0] # The number of boxes in `boxes1` + n = boxes2.shape[0] # The number of boxes in `boxes2` + + # Compute the union areas. + + # Set the correct coordinate indices for the respective formats. + if coords == 'corners': + xmin = 0 + ymin = 1 + xmax = 2 + ymax = 3 + elif coords == 'minmax': + xmin = 0 + xmax = 1 + ymin = 2 + ymax = 3 + + if border_pixels == 'half': + d = 0 + elif border_pixels == 'include': + d = 1 # If border pixels are supposed to belong to the bounding boxes, we have to add one pixel to any difference `xmax - xmin` or `ymax - ymin`. + elif border_pixels == 'exclude': + d = -1 # If border pixels are not supposed to belong to the bounding boxes, we have to subtract one pixel from any difference `xmax - xmin` or `ymax - ymin`. + + if mode == 'outer_product': + + boxes1_areas = np.tile(np.expand_dims((boxes1[:,xmax] - boxes1[:,xmin] + d) * (boxes1[:,ymax] - boxes1[:,ymin] + d), axis=1), reps=(1,n)) + boxes2_areas = np.tile(np.expand_dims((boxes2[:,xmax] - boxes2[:,xmin] + d) * (boxes2[:,ymax] - boxes2[:,ymin] + d), axis=0), reps=(m,1)) + + elif mode == 'element-wise': + + boxes1_areas = (boxes1[:,xmax] - boxes1[:,xmin] + d) * (boxes1[:,ymax] - boxes1[:,ymin] + d) + boxes2_areas = (boxes2[:,xmax] - boxes2[:,xmin] + d) * (boxes2[:,ymax] - boxes2[:,ymin] + d) + + union_areas = boxes1_areas + boxes2_areas - intersection_areas + + return intersection_areas / union_areas diff --git a/keras_ssd/data_generator/__init__.py b/keras_ssd/data_generator/__init__.py new file mode 100644 index 0000000..e69de29 diff --git a/keras_ssd/data_generator/data_augmentation_chain_constant_input_size.py b/keras_ssd/data_generator/data_augmentation_chain_constant_input_size.py new file mode 100644 index 0000000..2c18a98 --- /dev/null +++ b/keras_ssd/data_generator/data_augmentation_chain_constant_input_size.py @@ -0,0 +1,183 @@ +''' +The data augmentation operations of the original SSD implementation. 
+ +Copyright (C) 2018 Pierluigi Ferrari + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +''' + +from __future__ import division +import numpy as np + +from data_generator.object_detection_2d_photometric_ops import ConvertColor, ConvertDataType, ConvertTo3Channels, RandomBrightness, RandomContrast, RandomHue, RandomSaturation +from data_generator.object_detection_2d_geometric_ops import RandomFlip, RandomTranslate, RandomScale +from data_generator.object_detection_2d_image_boxes_validation_utils import BoundGenerator, BoxFilter, ImageValidator + +class DataAugmentationConstantInputSize: + ''' + Applies a chain of photometric and geometric image transformations. For documentation, please refer + to the documentation of the individual transformations involved. + + Important: This augmentation chain is suitable for constant-size images only. + ''' + + def __init__(self, + random_brightness=(-48, 48, 0.5), + random_contrast=(0.5, 1.8, 0.5), + random_saturation=(0.5, 1.8, 0.5), + random_hue=(18, 0.5), + random_flip=0.5, + random_translate=((0.03,0.5), (0.03,0.5), 0.5), + random_scale=(0.5, 2.0, 0.5), + n_trials_max=3, + clip_boxes=True, + overlap_criterion='area', + bounds_box_filter=(0.3, 1.0), + bounds_validator=(0.5, 1.0), + n_boxes_min=1, + background=(0,0,0), + labels_format={'class_id': 0, 'xmin': 1, 'ymin': 2, 'xmax': 3, 'ymax': 4}): + + if (random_scale[0] >= 1) or (random_scale[1] <= 1): + raise ValueError("This sequence of transformations only makes sense if the minimum scaling factor is <1 and the maximum scaling factor is >1.") + + self.n_trials_max = n_trials_max + self.clip_boxes = clip_boxes + self.overlap_criterion = overlap_criterion + self.bounds_box_filter = bounds_box_filter + self.bounds_validator = bounds_validator + self.n_boxes_min = n_boxes_min + self.background = background + self.labels_format = labels_format + + # Determines which boxes are kept in an image after the transformations have been applied. + self.box_filter = BoxFilter(check_overlap=True, + check_min_area=True, + check_degenerate=True, + overlap_criterion=self.overlap_criterion, + overlap_bounds=self.bounds_box_filter, + min_area=16, + labels_format=self.labels_format) + + # Determines whether the result of the transformations is a valid training image. + self.image_validator = ImageValidator(overlap_criterion=self.overlap_criterion, + bounds=self.bounds_validator, + n_boxes_min=self.n_boxes_min, + labels_format=self.labels_format) + + # Utility distortions + self.convert_RGB_to_HSV = ConvertColor(current='RGB', to='HSV') + self.convert_HSV_to_RGB = ConvertColor(current='HSV', to='RGB') + self.convert_to_float32 = ConvertDataType(to='float32') + self.convert_to_uint8 = ConvertDataType(to='uint8') + self.convert_to_3_channels = ConvertTo3Channels() # Make sure all images end up having 3 channels. 
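+
+        # Note: In the two sequences defined below, brightness and contrast are applied to
+        # float32 RGB data, while saturation and hue are applied to float32 HSV data; the
+        # dtype and color-space conversions bracket the photometric transformations accordingly.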
+ + # Photometric transformations + self.random_brightness = RandomBrightness(lower=random_brightness[0], upper=random_brightness[1], prob=random_brightness[2]) + self.random_contrast = RandomContrast(lower=random_contrast[0], upper=random_contrast[1], prob=random_contrast[2]) + self.random_saturation = RandomSaturation(lower=random_saturation[0], upper=random_saturation[1], prob=random_saturation[2]) + self.random_hue = RandomHue(max_delta=random_hue[0], prob=random_hue[1]) + + # Geometric transformations + self.random_flip = RandomFlip(dim='horizontal', prob=random_flip, labels_format=self.labels_format) + self.random_translate = RandomTranslate(dy_minmax=random_translate[0], + dx_minmax=random_translate[1], + prob=random_translate[2], + clip_boxes=self.clip_boxes, + box_filter=self.box_filter, + image_validator=self.image_validator, + n_trials_max=self.n_trials_max, + background=self.background, + labels_format=self.labels_format) + self.random_zoom_in = RandomScale(min_factor=1.0, + max_factor=random_scale[1], + prob=random_scale[2], + clip_boxes=self.clip_boxes, + box_filter=self.box_filter, + image_validator=self.image_validator, + n_trials_max=self.n_trials_max, + background=self.background, + labels_format=self.labels_format) + self.random_zoom_out = RandomScale(min_factor=random_scale[0], + max_factor=1.0, + prob=random_scale[2], + clip_boxes=self.clip_boxes, + box_filter=self.box_filter, + image_validator=self.image_validator, + n_trials_max=self.n_trials_max, + background=self.background, + labels_format=self.labels_format) + + # If we zoom in, do translation before scaling. + self.sequence1 = [self.convert_to_3_channels, + self.convert_to_float32, + self.random_brightness, + self.random_contrast, + self.convert_to_uint8, + self.convert_RGB_to_HSV, + self.convert_to_float32, + self.random_saturation, + self.random_hue, + self.convert_to_uint8, + self.convert_HSV_to_RGB, + self.random_translate, + self.random_zoom_in, + self.random_flip] + + # If we zoom out, do scaling before translation. + self.sequence2 = [self.convert_to_3_channels, + self.convert_to_float32, + self.random_brightness, + self.convert_to_uint8, + self.convert_RGB_to_HSV, + self.convert_to_float32, + self.random_saturation, + self.random_hue, + self.convert_to_uint8, + self.convert_HSV_to_RGB, + self.convert_to_float32, + self.random_contrast, + self.convert_to_uint8, + self.random_zoom_out, + self.random_translate, + self.random_flip] + + def __call__(self, image, labels=None): + + self.random_translate.labels_format = self.labels_format + self.random_zoom_in.labels_format = self.labels_format + self.random_zoom_out.labels_format = self.labels_format + self.random_flip.labels_format = self.labels_format + + # Choose sequence 1 with probability 0.5. + if np.random.choice(2): + + if not (labels is None): + for transform in self.sequence1: + image, labels = transform(image, labels) + return image, labels + else: + for transform in self.sequence1: + image = transform(image) + return image + # Choose sequence 2 with probability 0.5. 
+ else: + + if not (labels is None): + for transform in self.sequence2: + image, labels = transform(image, labels) + return image, labels + else: + for transform in self.sequence2: + image = transform(image) + return image diff --git a/keras_ssd/data_generator/data_augmentation_chain_original_ssd.py b/keras_ssd/data_generator/data_augmentation_chain_original_ssd.py new file mode 100644 index 0000000..af8d498 --- /dev/null +++ b/keras_ssd/data_generator/data_augmentation_chain_original_ssd.py @@ -0,0 +1,280 @@ +''' +The data augmentation operations of the original SSD implementation. + +Copyright (C) 2018 Pierluigi Ferrari + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +''' + +from __future__ import division +import numpy as np +import cv2 +import inspect + +from data_generator.object_detection_2d_photometric_ops import ConvertColor, ConvertDataType, ConvertTo3Channels, RandomBrightness, RandomContrast, RandomHue, RandomSaturation, RandomChannelSwap +from data_generator.object_detection_2d_patch_sampling_ops import PatchCoordinateGenerator, RandomPatch, RandomPatchInf +from data_generator.object_detection_2d_geometric_ops import ResizeRandomInterp, RandomFlip +from data_generator.object_detection_2d_image_boxes_validation_utils import BoundGenerator, BoxFilter, ImageValidator + +class SSDRandomCrop: + ''' + Performs the same random crops as defined by the `batch_sampler` instructions + of the original Caffe implementation of SSD. A description of this random cropping + strategy can also be found in the data augmentation section of the paper: + https://arxiv.org/abs/1512.02325 + ''' + + def __init__(self, labels_format={'class_id': 0, 'xmin': 1, 'ymin': 2, 'xmax': 3, 'ymax': 4}): + ''' + Arguments: + labels_format (dict, optional): A dictionary that defines which index in the last axis of the labels + of an image contains which bounding box coordinate. The dictionary maps at least the keywords + 'xmin', 'ymin', 'xmax', and 'ymax' to their respective indices within last axis of the labels array. + ''' + + self.labels_format = labels_format + + # This randomly samples one of the lower IoU bounds defined + # by the `sample_space` every time it is called. + self.bound_generator = BoundGenerator(sample_space=((None, None), + (0.1, None), + (0.3, None), + (0.5, None), + (0.7, None), + (0.9, None)), + weights=None) + + # Produces coordinates for candidate patches such that the height + # and width of the patches are between 0.3 and 1.0 of the height + # and width of the respective image and the aspect ratio of the + # patches is between 0.5 and 2.0. + self.patch_coord_generator = PatchCoordinateGenerator(must_match='h_w', + min_scale=0.3, + max_scale=1.0, + scale_uniformly=False, + min_aspect_ratio = 0.5, + max_aspect_ratio = 2.0) + + # Filters out boxes whose center point does not lie within the + # chosen patches. 
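+        # Boxes that pass this filter are subsequently clipped to the patch boundaries,
+        # since the random crop below is configured with `clip_boxes=True`.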
+ self.box_filter = BoxFilter(check_overlap=True, + check_min_area=False, + check_degenerate=False, + overlap_criterion='center_point', + labels_format=self.labels_format) + + # Determines whether a given patch is considered a valid patch. + # Defines a patch to be valid if at least one ground truth bounding box + # (n_boxes_min == 1) has an IoU overlap with the patch that + # meets the requirements defined by `bound_generator`. + self.image_validator = ImageValidator(overlap_criterion='iou', + n_boxes_min=1, + labels_format=self.labels_format, + border_pixels='half') + + # Performs crops according to the parameters set in the objects above. + # Runs until either a valid patch is found or the original input image + # is returned unaltered. Runs a maximum of 50 trials to find a valid + # patch for each new sampled IoU threshold. Every 50 trials, the original + # image is returned as is with probability (1 - prob) = 0.143. + self.random_crop = RandomPatchInf(patch_coord_generator=self.patch_coord_generator, + box_filter=self.box_filter, + image_validator=self.image_validator, + bound_generator=self.bound_generator, + n_trials_max=50, + clip_boxes=True, + prob=0.857, + labels_format=self.labels_format) + + def __call__(self, image, labels=None, return_inverter=False): + self.random_crop.labels_format = self.labels_format + return self.random_crop(image, labels, return_inverter) + +class SSDExpand: + ''' + Performs the random image expansion as defined by the `train_transform_param` instructions + of the original Caffe implementation of SSD. A description of this expansion strategy + can also be found in section 3.6 ("Data Augmentation for Small Object Accuracy") of the paper: + https://arxiv.org/abs/1512.02325 + ''' + + def __init__(self, background=(123, 117, 104), labels_format={'class_id': 0, 'xmin': 1, 'ymin': 2, 'xmax': 3, 'ymax': 4}): + ''' + Arguments: + background (list/tuple, optional): A 3-tuple specifying the RGB color value of the + background pixels of the translated images. + labels_format (dict, optional): A dictionary that defines which index in the last axis of the labels + of an image contains which bounding box coordinate. The dictionary maps at least the keywords + 'xmin', 'ymin', 'xmax', and 'ymax' to their respective indices within last axis of the labels array. + ''' + + self.labels_format = labels_format + + # Generate coordinates for patches that are between 1.0 and 4.0 times + # the size of the input image in both spatial dimensions. + self.patch_coord_generator = PatchCoordinateGenerator(must_match='h_w', + min_scale=1.0, + max_scale=4.0, + scale_uniformly=True) + + # With probability 0.5, place the input image randomly on a canvas filled with + # mean color values according to the parameters set above. With probability 0.5, + # return the input image unaltered. + self.expand = RandomPatch(patch_coord_generator=self.patch_coord_generator, + box_filter=None, + image_validator=None, + n_trials_max=1, + clip_boxes=False, + prob=0.5, + background=background, + labels_format=self.labels_format) + + def __call__(self, image, labels=None, return_inverter=False): + self.expand.labels_format = self.labels_format + return self.expand(image, labels, return_inverter) + +class SSDPhotometricDistortions: + ''' + Performs the photometric distortions defined by the `train_transform_param` instructions + of the original Caffe implementation of SSD. 
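+
+    A rough usage sketch (assuming `image` is an HxWx3 array and `labels` an array
+    of ground truth boxes in the expected labels format):
+
+        distortions = SSDPhotometricDistortions()
+        image, labels = distortions(image, labels)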
+ ''' + + def __init__(self): + + self.convert_RGB_to_HSV = ConvertColor(current='RGB', to='HSV') + self.convert_HSV_to_RGB = ConvertColor(current='HSV', to='RGB') + self.convert_to_float32 = ConvertDataType(to='float32') + self.convert_to_uint8 = ConvertDataType(to='uint8') + self.convert_to_3_channels = ConvertTo3Channels() + self.random_brightness = RandomBrightness(lower=-32, upper=32, prob=0.5) + self.random_contrast = RandomContrast(lower=0.5, upper=1.5, prob=0.5) + self.random_saturation = RandomSaturation(lower=0.5, upper=1.5, prob=0.5) + self.random_hue = RandomHue(max_delta=18, prob=0.5) + self.random_channel_swap = RandomChannelSwap(prob=0.0) + + self.sequence1 = [self.convert_to_3_channels, + self.convert_to_float32, + self.random_brightness, + self.random_contrast, + self.convert_to_uint8, + self.convert_RGB_to_HSV, + self.convert_to_float32, + self.random_saturation, + self.random_hue, + self.convert_to_uint8, + self.convert_HSV_to_RGB, + self.random_channel_swap] + + self.sequence2 = [self.convert_to_3_channels, + self.convert_to_float32, + self.random_brightness, + self.convert_to_uint8, + self.convert_RGB_to_HSV, + self.convert_to_float32, + self.random_saturation, + self.random_hue, + self.convert_to_uint8, + self.convert_HSV_to_RGB, + self.convert_to_float32, + self.random_contrast, + self.convert_to_uint8, + self.random_channel_swap] + + def __call__(self, image, labels): + + # Choose sequence 1 with probability 0.5. + if np.random.choice(2): + + for transform in self.sequence1: + image, labels = transform(image, labels) + return image, labels + # Choose sequence 2 with probability 0.5. + else: + + for transform in self.sequence2: + image, labels = transform(image, labels) + return image, labels + +class SSDDataAugmentation: + ''' + Reproduces the data augmentation pipeline used in the training of the original + Caffe implementation of SSD. + ''' + + def __init__(self, + img_height=300, + img_width=300, + background=(123, 117, 104), + labels_format={'class_id': 0, 'xmin': 1, 'ymin': 2, 'xmax': 3, 'ymax': 4}): + ''' + Arguments: + height (int): The desired height of the output images in pixels. + width (int): The desired width of the output images in pixels. + background (list/tuple, optional): A 3-tuple specifying the RGB color value of the + background pixels of the translated images. + labels_format (dict, optional): A dictionary that defines which index in the last axis of the labels + of an image contains which bounding box coordinate. The dictionary maps at least the keywords + 'xmin', 'ymin', 'xmax', and 'ymax' to their respective indices within last axis of the labels array. + ''' + + self.labels_format = labels_format + + self.photometric_distortions = SSDPhotometricDistortions() + self.expand = SSDExpand(background=background, labels_format=self.labels_format) + self.random_crop = SSDRandomCrop(labels_format=self.labels_format) + self.random_flip = RandomFlip(dim='horizontal', prob=0.5, labels_format=self.labels_format) + + # This box filter makes sure that the resized images don't contain any degenerate boxes. + # Resizing the images could lead the boxes to becomes smaller. For boxes that are already + # pretty small, that might result in boxes with height and/or width zero, which we obviously + # cannot allow. 
+ self.box_filter = BoxFilter(check_overlap=False, + check_min_area=False, + check_degenerate=True, + labels_format=self.labels_format) + + self.resize = ResizeRandomInterp(height=img_height, + width=img_width, + interpolation_modes=[cv2.INTER_NEAREST, + cv2.INTER_LINEAR, + cv2.INTER_CUBIC, + cv2.INTER_AREA, + cv2.INTER_LANCZOS4], + box_filter=self.box_filter, + labels_format=self.labels_format) + + self.sequence = [self.photometric_distortions, + self.expand, + self.random_crop, + self.random_flip, + self.resize] + + def __call__(self, image, labels, return_inverter=False): + self.expand.labels_format = self.labels_format + self.random_crop.labels_format = self.labels_format + self.random_flip.labels_format = self.labels_format + self.resize.labels_format = self.labels_format + + inverters = [] + + for transform in self.sequence: + if return_inverter and ('return_inverter' in inspect.signature(transform).parameters): + image, labels, inverter = transform(image, labels, return_inverter=True) + inverters.append(inverter) + else: + image, labels = transform(image, labels) + + if return_inverter: + return image, labels, inverters[::-1] + else: + return image, labels diff --git a/keras_ssd/data_generator/data_augmentation_chain_satellite.py b/keras_ssd/data_generator/data_augmentation_chain_satellite.py new file mode 100644 index 0000000..c2e2cb9 --- /dev/null +++ b/keras_ssd/data_generator/data_augmentation_chain_satellite.py @@ -0,0 +1,157 @@ +''' +A data augmentation pipeline for datasets in bird's eye view, i.e. where there is +no "up" or "down" in the images. + +Copyright (C) 2018 Pierluigi Ferrari + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +''' + +from __future__ import division +import numpy as np + +from data_generator.object_detection_2d_photometric_ops import ConvertColor, ConvertDataType, ConvertTo3Channels, RandomBrightness, RandomContrast, RandomHue, RandomSaturation +from data_generator.object_detection_2d_geometric_ops import Resize, RandomFlip, RandomRotate +from data_generator.object_detection_2d_patch_sampling_ops import PatchCoordinateGenerator, RandomPatch +from data_generator.object_detection_2d_image_boxes_validation_utils import BoxFilter, ImageValidator + +class DataAugmentationSatellite: + ''' + A data augmentation pipeline for datasets in bird's eye view, i.e. where there is + no "up" or "down" in the images. + + Applies a chain of photometric and geometric image transformations. For documentation, please refer + to the documentation of the individual transformations involved. 
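+
+    A minimal usage sketch (the resize values are illustrative; `image` and `labels`
+    are assumed to be a single image array and its ground truth boxes):
+
+        augmentation = DataAugmentationSatellite(resize_height=300, resize_width=300)
+        image, labels = augmentation(image, labels)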
+ ''' + + def __init__(self, + resize_height, + resize_width, + random_brightness=(-48, 48, 0.5), + random_contrast=(0.5, 1.8, 0.5), + random_saturation=(0.5, 1.8, 0.5), + random_hue=(18, 0.5), + random_flip=0.5, + random_rotate=([90, 180, 270], 0.5), + min_scale=0.3, + max_scale=2.0, + min_aspect_ratio = 0.8, + max_aspect_ratio = 1.25, + n_trials_max=3, + clip_boxes=True, + overlap_criterion='area', + bounds_box_filter=(0.3, 1.0), + bounds_validator=(0.5, 1.0), + n_boxes_min=1, + background=(0,0,0), + labels_format={'class_id': 0, 'xmin': 1, 'ymin': 2, 'xmax': 3, 'ymax': 4}): + + self.n_trials_max = n_trials_max + self.clip_boxes = clip_boxes + self.overlap_criterion = overlap_criterion + self.bounds_box_filter = bounds_box_filter + self.bounds_validator = bounds_validator + self.n_boxes_min = n_boxes_min + self.background = background + self.labels_format = labels_format + + # Determines which boxes are kept in an image after the transformations have been applied. + self.box_filter_patch = BoxFilter(check_overlap=True, + check_min_area=False, + check_degenerate=False, + overlap_criterion=self.overlap_criterion, + overlap_bounds=self.bounds_box_filter, + labels_format=self.labels_format) + + self.box_filter_resize = BoxFilter(check_overlap=False, + check_min_area=True, + check_degenerate=True, + min_area=16, + labels_format=self.labels_format) + + # Determines whether the result of the transformations is a valid training image. + self.image_validator = ImageValidator(overlap_criterion=self.overlap_criterion, + bounds=self.bounds_validator, + n_boxes_min=self.n_boxes_min, + labels_format=self.labels_format) + + # Utility transformations + self.convert_to_3_channels = ConvertTo3Channels() # Make sure all images end up having 3 channels. + self.convert_RGB_to_HSV = ConvertColor(current='RGB', to='HSV') + self.convert_HSV_to_RGB = ConvertColor(current='HSV', to='RGB') + self.convert_to_float32 = ConvertDataType(to='float32') + self.convert_to_uint8 = ConvertDataType(to='uint8') + self.resize = Resize(height=resize_height, + width=resize_width, + box_filter=self.box_filter_resize, + labels_format=self.labels_format) + + # Photometric transformations + self.random_brightness = RandomBrightness(lower=random_brightness[0], upper=random_brightness[1], prob=random_brightness[2]) + self.random_contrast = RandomContrast(lower=random_contrast[0], upper=random_contrast[1], prob=random_contrast[2]) + self.random_saturation = RandomSaturation(lower=random_saturation[0], upper=random_saturation[1], prob=random_saturation[2]) + self.random_hue = RandomHue(max_delta=random_hue[0], prob=random_hue[1]) + + # Geometric transformations + self.random_horizontal_flip = RandomFlip(dim='horizontal', prob=random_flip, labels_format=self.labels_format) + self.random_vertical_flip = RandomFlip(dim='vertical', prob=random_flip, labels_format=self.labels_format) + self.random_rotate = RandomRotate(angles=random_rotate[0], prob=random_rotate[1], labels_format=self.labels_format) + self.patch_coord_generator = PatchCoordinateGenerator(must_match='w_ar', + min_scale=min_scale, + max_scale=max_scale, + scale_uniformly=False, + min_aspect_ratio = min_aspect_ratio, + max_aspect_ratio = max_aspect_ratio) + self.random_patch = RandomPatch(patch_coord_generator=self.patch_coord_generator, + box_filter=self.box_filter_patch, + image_validator=self.image_validator, + n_trials_max=self.n_trials_max, + clip_boxes=self.clip_boxes, + prob=1.0, + can_fail=False, + labels_format=self.labels_format) + + # Define the processing chain. 
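+        # Ordering note: the photometric ops run on the full-size image, the random patch
+        # sampling filters and validates the boxes against the sampled patch, and the
+        # resize to the target size happens last.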
+ self.transformations = [self.convert_to_3_channels, + self.convert_to_float32, + self.random_brightness, + self.random_contrast, + self.convert_to_uint8, + self.convert_RGB_to_HSV, + self.convert_to_float32, + self.random_saturation, + self.random_hue, + self.convert_to_uint8, + self.convert_HSV_to_RGB, + self.random_horizontal_flip, + self.random_vertical_flip, + self.random_rotate, + self.random_patch, + self.resize] + + def __call__(self, image, labels=None): + + self.random_patch.labels_format = self.labels_format + self.random_horizontal_flip.labels_format = self.labels_format + self.random_vertical_flip.labels_format = self.labels_format + self.random_rotate.labels_format = self.labels_format + self.resize.labels_format = self.labels_format + + if not (labels is None): + for transform in self.transformations: + image, labels = transform(image, labels) + return image, labels + else: + for transform in self.sequence1: + image = transform(image) + return image diff --git a/keras_ssd/data_generator/data_augmentation_chain_variable_input_size.py b/keras_ssd/data_generator/data_augmentation_chain_variable_input_size.py new file mode 100644 index 0000000..7d9f2b4 --- /dev/null +++ b/keras_ssd/data_generator/data_augmentation_chain_variable_input_size.py @@ -0,0 +1,152 @@ +''' +A data augmentation pipeline suitable for variable-size images that produces effects +that are similar (but not identical) to those of the original SSD data augmentation +pipeline while being faster. + +Copyright (C) 2018 Pierluigi Ferrari + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +''' + +from __future__ import division +import numpy as np + +from data_generator.object_detection_2d_photometric_ops import ConvertColor, ConvertDataType, ConvertTo3Channels, RandomBrightness, RandomContrast, RandomHue, RandomSaturation +from data_generator.object_detection_2d_geometric_ops import Resize, RandomFlip +from data_generator.object_detection_2d_patch_sampling_ops import PatchCoordinateGenerator, RandomPatch +from data_generator.object_detection_2d_image_boxes_validation_utils import BoxFilter, ImageValidator + +class DataAugmentationVariableInputSize: + ''' + A data augmentation pipeline suitable for variable-size images that produces effects + that are similar (but not identical!) to those of the original SSD data augmentation + pipeline while being faster. + + Applies a chain of photometric and geometric image transformations. For documentation, please refer + to the documentation of the individual transformations involved. 
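+
+    A minimal usage sketch (the resize values are illustrative; `image` and `labels`
+    are assumed to be a single image array and its ground truth boxes):
+
+        augmentation = DataAugmentationVariableInputSize(resize_height=300, resize_width=300)
+        image, labels = augmentation(image, labels)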
+ ''' + + def __init__(self, + resize_height, + resize_width, + random_brightness=(-48, 48, 0.5), + random_contrast=(0.5, 1.8, 0.5), + random_saturation=(0.5, 1.8, 0.5), + random_hue=(18, 0.5), + random_flip=0.5, + min_scale=0.3, + max_scale=2.0, + min_aspect_ratio = 0.5, + max_aspect_ratio = 2.0, + n_trials_max=3, + clip_boxes=True, + overlap_criterion='area', + bounds_box_filter=(0.3, 1.0), + bounds_validator=(0.5, 1.0), + n_boxes_min=1, + background=(0,0,0), + labels_format={'class_id': 0, 'xmin': 1, 'ymin': 2, 'xmax': 3, 'ymax': 4}): + + self.n_trials_max = n_trials_max + self.clip_boxes = clip_boxes + self.overlap_criterion = overlap_criterion + self.bounds_box_filter = bounds_box_filter + self.bounds_validator = bounds_validator + self.n_boxes_min = n_boxes_min + self.background = background + self.labels_format = labels_format + + # Determines which boxes are kept in an image after the transformations have been applied. + self.box_filter_patch = BoxFilter(check_overlap=True, + check_min_area=False, + check_degenerate=False, + overlap_criterion=self.overlap_criterion, + overlap_bounds=self.bounds_box_filter, + labels_format=self.labels_format) + + self.box_filter_resize = BoxFilter(check_overlap=False, + check_min_area=True, + check_degenerate=True, + min_area=16, + labels_format=self.labels_format) + + # Determines whether the result of the transformations is a valid training image. + self.image_validator = ImageValidator(overlap_criterion=self.overlap_criterion, + bounds=self.bounds_validator, + n_boxes_min=self.n_boxes_min, + labels_format=self.labels_format) + + # Utility transformations + self.convert_to_3_channels = ConvertTo3Channels() # Make sure all images end up having 3 channels. + self.convert_RGB_to_HSV = ConvertColor(current='RGB', to='HSV') + self.convert_HSV_to_RGB = ConvertColor(current='HSV', to='RGB') + self.convert_to_float32 = ConvertDataType(to='float32') + self.convert_to_uint8 = ConvertDataType(to='uint8') + self.resize = Resize(height=resize_height, + width=resize_width, + box_filter=self.box_filter_resize, + labels_format=self.labels_format) + + # Photometric transformations + self.random_brightness = RandomBrightness(lower=random_brightness[0], upper=random_brightness[1], prob=random_brightness[2]) + self.random_contrast = RandomContrast(lower=random_contrast[0], upper=random_contrast[1], prob=random_contrast[2]) + self.random_saturation = RandomSaturation(lower=random_saturation[0], upper=random_saturation[1], prob=random_saturation[2]) + self.random_hue = RandomHue(max_delta=random_hue[0], prob=random_hue[1]) + + # Geometric transformations + self.random_flip = RandomFlip(dim='horizontal', prob=random_flip, labels_format=self.labels_format) + self.patch_coord_generator = PatchCoordinateGenerator(must_match='w_ar', + min_scale=min_scale, + max_scale=max_scale, + scale_uniformly=False, + min_aspect_ratio = min_aspect_ratio, + max_aspect_ratio = max_aspect_ratio) + self.random_patch = RandomPatch(patch_coord_generator=self.patch_coord_generator, + box_filter=self.box_filter_patch, + image_validator=self.image_validator, + n_trials_max=self.n_trials_max, + clip_boxes=self.clip_boxes, + prob=1.0, + can_fail=False, + labels_format=self.labels_format) + + # Define the processing chain + self.transformations = [self.convert_to_3_channels, + self.convert_to_float32, + self.random_brightness, + self.random_contrast, + self.convert_to_uint8, + self.convert_RGB_to_HSV, + self.convert_to_float32, + self.random_saturation, + self.random_hue, + self.convert_to_uint8, + 
self.convert_HSV_to_RGB, + self.random_patch, + self.random_flip, + self.resize] + + def __call__(self, image, labels=None): + + self.random_patch.labels_format = self.labels_format + self.random_flip.labels_format = self.labels_format + self.resize.labels_format = self.labels_format + + if not (labels is None): + for transform in self.transformations: + image, labels = transform(image, labels) + return image, labels + else: + for transform in self.sequence1: + image = transform(image) + return image diff --git a/keras_ssd/data_generator/object_detection_2d_data_generator.py b/keras_ssd/data_generator/object_detection_2d_data_generator.py new file mode 100644 index 0000000..e5e6526 --- /dev/null +++ b/keras_ssd/data_generator/object_detection_2d_data_generator.py @@ -0,0 +1,1220 @@ +''' +A data generator for 2D object detection. + +Copyright (C) 2018 Pierluigi Ferrari + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +''' + +from __future__ import division +import numpy as np +import inspect +from collections import defaultdict +import warnings +import sklearn.utils +from copy import deepcopy +from PIL import Image +import cv2 +import csv +import os +import sys +from tqdm import tqdm, trange +try: + import h5py +except ImportError: + warnings.warn("'h5py' module is missing. The fast HDF5 dataset option will be unavailable.") +try: + import json +except ImportError: + warnings.warn("'json' module is missing. The JSON-parser will be unavailable.") +try: + from bs4 import BeautifulSoup +except ImportError: + warnings.warn("'BeautifulSoup' module is missing. The XML-parser will be unavailable.") +try: + import pickle +except ImportError: + warnings.warn("'pickle' module is missing. You won't be able to save parsed file lists and annotations as pickled files.") + +from ssd_encoder_decoder.ssd_input_encoder import SSDInputEncoder +from data_generator.object_detection_2d_image_boxes_validation_utils import BoxFilter + +class DegenerateBatchError(Exception): + ''' + An exception class to be raised if a generated batch ends up being degenerate, + e.g. if a generated batch is empty. + ''' + pass + +class DatasetError(Exception): + ''' + An exception class to be raised if a anything is wrong with the dataset, + in particular if you try to generate batches when no dataset was loaded. + ''' + pass + +class DataGenerator: + ''' + A generator to generate batches of samples and corresponding labels indefinitely. + + Can shuffle the dataset consistently after each complete pass. + + Currently provides three methods to parse annotation data: A general-purpose CSV parser, + an XML parser for the Pascal VOC datasets, and a JSON parser for the MS COCO datasets. + If the annotations of your dataset are in a format that is not supported by these parsers, + you could just add another parser method and still use this generator. + + Can perform image transformations for data conversion and data augmentation, + for details please refer to the documentation of the `generate()` method. 
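+
+    A minimal usage sketch (the file path is hypothetical, and the `generate()` arguments
+    shown are assumptions documented in that method rather than in this constructor):
+
+        dataset = DataGenerator(hdf5_dataset_path='path/to/dataset.h5')
+        generator = dataset.generate(batch_size=32, shuffle=True,
+                                     returns=('processed_images', 'processed_labels'))
+        batch_images, batch_labels = next(generator)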
+ ''' + + def __init__(self, + load_images_into_memory=False, + hdf5_dataset_path=None, + filenames=None, + filenames_type='text', + images_dir=None, + labels=None, + image_ids=None, + eval_neutral=None, + labels_output_format=('class_id', 'xmin', 'ymin', 'xmax', 'ymax'), + verbose=True): + ''' + Initializes the data generator. You can either load a dataset directly here in the constructor, + e.g. an HDF5 dataset, or you can use one of the parser methods to read in a dataset. + + Arguments: + load_images_into_memory (bool, optional): If `True`, the entire dataset will be loaded into memory. + This enables noticeably faster data generation than loading batches of images into memory ad hoc. + Be sure that you have enough memory before you activate this option. + hdf5_dataset_path (str, optional): The full file path of an HDF5 file that contains a dataset in the + format that the `create_hdf5_dataset()` method produces. If you load such an HDF5 dataset, you + don't need to use any of the parser methods anymore, the HDF5 dataset already contains all relevant + data. + filenames (string or list, optional): `None` or either a Python list/tuple or a string representing + a filepath. If a list/tuple is passed, it must contain the file names (full paths) of the + images to be used. Note that the list/tuple must contain the paths to the images, + not the images themselves. If a filepath string is passed, it must point either to + (1) a pickled file containing a list/tuple as described above. In this case the `filenames_type` + argument must be set to `pickle`. + Or + (2) a text file. Each line of the text file contains the file name (basename of the file only, + not the full directory path) to one image and nothing else. In this case the `filenames_type` + argument must be set to `text` and you must pass the path to the directory that contains the + images in `images_dir`. + filenames_type (string, optional): In case a string is passed for `filenames`, this indicates what + type of file `filenames` is. It can be either 'pickle' for a pickled file or 'text' for a + plain text file. + images_dir (string, optional): In case a text file is passed for `filenames`, the full paths to + the images will be composed from `images_dir` and the names in the text file, i.e. this + should be the directory that contains the images to which the text file refers. + If `filenames_type` is not 'text', then this argument is irrelevant. + labels (string or list, optional): `None` or either a Python list/tuple or a string representing + the path to a pickled file containing a list/tuple. The list/tuple must contain Numpy arrays + that represent the labels of the dataset. + image_ids (string or list, optional): `None` or either a Python list/tuple or a string representing + the path to a pickled file containing a list/tuple. The list/tuple must contain the image + IDs of the images in the dataset. + eval_neutral (string or list, optional): `None` or either a Python list/tuple or a string representing + the path to a pickled file containing a list/tuple. The list/tuple must contain for each image + a list that indicates for each ground truth object in the image whether that object is supposed + to be treated as neutral during an evaluation. + labels_output_format (list, optional): A list of five strings representing the desired order of the five + items class ID, xmin, ymin, xmax, ymax in the generated ground truth data (if any). The expected + strings are 'xmin', 'ymin', 'xmax', 'ymax', 'class_id'. 
+ verbose (bool, optional): If `True`, prints out the progress for some constructor operations that may
+ take a bit longer.
+ '''
+ self.labels_output_format = labels_output_format
+ self.labels_format={'class_id': labels_output_format.index('class_id'),
+ 'xmin': labels_output_format.index('xmin'),
+ 'ymin': labels_output_format.index('ymin'),
+ 'xmax': labels_output_format.index('xmax'),
+ 'ymax': labels_output_format.index('ymax')} # This dictionary is for internal use.
+
+ self.dataset_size = 0 # As long as we haven't loaded anything yet, the dataset size is zero.
+ self.load_images_into_memory = load_images_into_memory
+ self.images = None # The only way that this list will not stay `None` is if `load_images_into_memory == True`.
+
+ # `self.filenames` is a list containing all file names of the image samples (full paths).
+ # Note that it does not contain the actual image files themselves. This list is one of the outputs of the parser methods.
+ # In case you are loading an HDF5 dataset, this list will be `None`.
+ if not filenames is None:
+ if isinstance(filenames, (list, tuple)):
+ self.filenames = filenames
+ elif isinstance(filenames, str):
+ with open(filenames, 'rb') as f:
+ if filenames_type == 'pickle':
+ self.filenames = pickle.load(f)
+ elif filenames_type == 'text':
+ self.filenames = [os.path.join(images_dir, line.strip()) for line in f]
+ else:
+ raise ValueError("`filenames_type` can be either 'text' or 'pickle'.")
+ else:
+ raise ValueError("`filenames` must be either a Python list/tuple or a string representing a filepath (to a pickled or text file). The value you passed is neither of the two.")
+ self.dataset_size = len(self.filenames)
+ self.dataset_indices = np.arange(self.dataset_size, dtype=np.int32)
+ if load_images_into_memory:
+ self.images = []
+ if verbose: it = tqdm(self.filenames, desc='Loading images into memory', file=sys.stdout)
+ else: it = self.filenames
+ for filename in it:
+ with Image.open(filename) as image:
+ self.images.append(np.array(image, dtype=np.uint8))
+ else:
+ self.filenames = None
+
+ # In case ground truth is available, `self.labels` is a list containing for each image a list (or NumPy array)
+ # of ground truth bounding boxes for that image.
+ if not labels is None:
+ if isinstance(labels, str):
+ with open(labels, 'rb') as f:
+ self.labels = pickle.load(f)
+ elif isinstance(labels, (list, tuple)):
+ self.labels = labels
+ else:
+ raise ValueError("`labels` must be either a Python list/tuple or a string representing the path to a pickled file containing a list/tuple. The value you passed is neither of the two.")
+ else:
+ self.labels = None
+
+ if not image_ids is None:
+ if isinstance(image_ids, str):
+ with open(image_ids, 'rb') as f:
+ self.image_ids = pickle.load(f)
+ elif isinstance(image_ids, (list, tuple)):
+ self.image_ids = image_ids
+ else:
+ raise ValueError("`image_ids` must be either a Python list/tuple or a string representing the path to a pickled file containing a list/tuple. The value you passed is neither of the two.")
+ else:
+ self.image_ids = None
+
+ if not eval_neutral is None:
+ if isinstance(eval_neutral, str):
+ with open(eval_neutral, 'rb') as f:
+ self.eval_neutral = pickle.load(f)
+ elif isinstance(eval_neutral, (list, tuple)):
+ self.eval_neutral = eval_neutral
+ else:
+ raise ValueError("`eval_neutral` must be either a Python list/tuple or a string representing the path to a pickled file containing a list/tuple.
The value you passed is neither of the two.") + else: + self.eval_neutral = None + + if not hdf5_dataset_path is None: + self.hdf5_dataset_path = hdf5_dataset_path + self.load_hdf5_dataset(verbose=verbose) + else: + self.hdf5_dataset = None + + def load_hdf5_dataset(self, verbose=True): + ''' + Loads an HDF5 dataset that is in the format that the `create_hdf5_dataset()` method + produces. + + Arguments: + verbose (bool, optional): If `True`, prints out the progress while loading + the dataset. + + Returns: + None. + ''' + + self.hdf5_dataset = h5py.File(self.hdf5_dataset_path, 'r') + self.dataset_size = len(self.hdf5_dataset['images']) + self.dataset_indices = np.arange(self.dataset_size, dtype=np.int32) # Instead of shuffling the HDF5 dataset or images in memory, we will shuffle this index list. + + if self.load_images_into_memory: + self.images = [] + if verbose: tr = trange(self.dataset_size, desc='Loading images into memory', file=sys.stdout) + else: tr = range(self.dataset_size) + for i in tr: + self.images.append(self.hdf5_dataset['images'][i].reshape(self.hdf5_dataset['image_shapes'][i])) + + if self.hdf5_dataset.attrs['has_labels']: + self.labels = [] + labels = self.hdf5_dataset['labels'] + label_shapes = self.hdf5_dataset['label_shapes'] + if verbose: tr = trange(self.dataset_size, desc='Loading labels', file=sys.stdout) + else: tr = range(self.dataset_size) + for i in tr: + self.labels.append(labels[i].reshape(label_shapes[i])) + + if self.hdf5_dataset.attrs['has_image_ids']: + self.image_ids = [] + image_ids = self.hdf5_dataset['image_ids'] + if verbose: tr = trange(self.dataset_size, desc='Loading image IDs', file=sys.stdout) + else: tr = range(self.dataset_size) + for i in tr: + self.image_ids.append(image_ids[i]) + + if self.hdf5_dataset.attrs['has_eval_neutral']: + self.eval_neutral = [] + eval_neutral = self.hdf5_dataset['eval_neutral'] + if verbose: tr = trange(self.dataset_size, desc='Loading evaluation-neutrality annotations', file=sys.stdout) + else: tr = range(self.dataset_size) + for i in tr: + self.eval_neutral.append(eval_neutral[i]) + + def parse_csv(self, + images_dir, + labels_filename, + input_format, + include_classes='all', + random_sample=False, + ret=False, + verbose=True): + ''' + Arguments: + images_dir (str): The path to the directory that contains the images. + labels_filename (str): The filepath to a CSV file that contains one ground truth bounding box per line + and each line contains the following six items: image file name, class ID, xmin, xmax, ymin, ymax. + The six items do not have to be in a specific order, but they must be the first six columns of + each line. The order of these items in the CSV file must be specified in `input_format`. + The class ID is an integer greater than zero. Class ID 0 is reserved for the background class. + `xmin` and `xmax` are the left-most and right-most absolute horizontal coordinates of the box, + `ymin` and `ymax` are the top-most and bottom-most absolute vertical coordinates of the box. + The image name is expected to be just the name of the image file without the directory path + at which the image is located. + input_format (list): A list of six strings representing the order of the six items + image file name, class ID, xmin, xmax, ymin, ymax in the input CSV file. The expected strings + are 'image_name', 'xmin', 'xmax', 'ymin', 'ymax', 'class_id'. + include_classes (list, optional): Either 'all' or a list of integers containing the class IDs that + are to be included in the dataset. 
If 'all', all ground truth boxes will be included in the dataset.
+ random_sample (float, optional): Either `False` or a float in `[0,1]`. If this is `False`, the
+ full dataset will be used by the generator. If this is a float in `[0,1]`, a randomly sampled
+ fraction of the dataset will be used, where `random_sample` is the fraction of the dataset
+ to be used. For example, if `random_sample = 0.2`, 20 percent of the dataset will be randomly selected,
+ the rest will be omitted. The fraction refers to the number of images, not to the number
+ of boxes, i.e. each image that will be added to the dataset will always be added with all
+ of its boxes.
+ ret (bool, optional): Whether or not to return the outputs of the parser.
+ verbose (bool, optional): If `True`, prints out the progress for operations that may take a bit longer.
+
+ Returns:
+ None by default, optionally lists for whichever are available of images, image filenames, labels, and image IDs.
+ '''
+
+ # Set class members.
+ self.images_dir = images_dir
+ self.labels_filename = labels_filename
+ self.input_format = input_format
+ self.include_classes = include_classes
+
+ # Before we begin, make sure that we have a labels_filename and an input_format
+ if self.labels_filename is None or self.input_format is None:
+ raise ValueError("`labels_filename` and/or `input_format` have not been set yet. You need to pass them as arguments.")
+
+ # Erase data that might have been parsed before
+ self.filenames = []
+ self.image_ids = []
+ self.labels = []
+
+ # First, just read in the CSV file lines and sort them.
+
+ data = []
+
+ with open(self.labels_filename, newline='') as csvfile:
+ csvread = csv.reader(csvfile, delimiter=',')
+ next(csvread) # Skip the header row.
+ for row in csvread: # For every line (i.e. for every bounding box) in the CSV file...
+ if self.include_classes == 'all' or int(row[self.input_format.index('class_id')].strip()) in self.include_classes: # If the class_id is among the classes that are to be included in the dataset...
+ box = [] # Store the box class and coordinates here
+ box.append(row[self.input_format.index('image_name')].strip()) # Select the image name column in the input format and append its content to `box`
+ for element in self.labels_output_format: # For each element in the output format (where the elements are the class ID and the four box coordinates)...
+ box.append(int(row[self.input_format.index(element)].strip())) # ...select the respective column in the input format and append it to `box`.
+ data.append(box)
+
+ data = sorted(data) # The data needs to be sorted, otherwise the next step won't give the correct result
+
+ # Now that we've made sure that the data is sorted by file names,
+ # we can compile the actual samples and labels lists
+
+ current_file = data[0][0] # The current image for which we're collecting the ground truth boxes
+ current_image_id = data[0][0].split('.')[0] # The image ID will be the portion of the image name before the first dot.
+ current_labels = [] # The list where we collect all ground truth boxes for a given image
+ add_to_dataset = False
+ for i, box in enumerate(data):
+
+ if box[0] == current_file: # If this box (i.e. this line of the CSV file) belongs to the current image file
+ current_labels.append(box[1:])
+ if i == len(data)-1: # If this is the last line of the CSV file
+ if random_sample: # In case we're not using the full dataset, but a random sample of it.
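+ # Draw a uniform random number and keep this image (together with all of its
+ # boxes) only if the draw falls within the sampled fraction of the dataset.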
+ p = np.random.uniform(0,1) + if p >= (1-random_sample): + self.labels.append(np.stack(current_labels, axis=0)) + self.filenames.append(os.path.join(self.images_dir, current_file)) + self.image_ids.append(current_image_id) + else: + self.labels.append(np.stack(current_labels, axis=0)) + self.filenames.append(os.path.join(self.images_dir, current_file)) + self.image_ids.append(current_image_id) + else: # If this box belongs to a new image file + if random_sample: # In case we're not using the full dataset, but a random sample of it. + p = np.random.uniform(0,1) + if p >= (1-random_sample): + self.labels.append(np.stack(current_labels, axis=0)) + self.filenames.append(os.path.join(self.images_dir, current_file)) + self.image_ids.append(current_image_id) + else: + self.labels.append(np.stack(current_labels, axis=0)) + self.filenames.append(os.path.join(self.images_dir, current_file)) + self.image_ids.append(current_image_id) + current_labels = [] # Reset the labels list because this is a new file. + current_file = box[0] + current_image_id = box[0].split('.')[0] + current_labels.append(box[1:]) + if i == len(data)-1: # If this is the last line of the CSV file + if random_sample: # In case we're not using the full dataset, but a random sample of it. + p = np.random.uniform(0,1) + if p >= (1-random_sample): + self.labels.append(np.stack(current_labels, axis=0)) + self.filenames.append(os.path.join(self.images_dir, current_file)) + self.image_ids.append(current_image_id) + else: + self.labels.append(np.stack(current_labels, axis=0)) + self.filenames.append(os.path.join(self.images_dir, current_file)) + self.image_ids.append(current_image_id) + + self.dataset_size = len(self.filenames) + self.dataset_indices = np.arange(self.dataset_size, dtype=np.int32) + if self.load_images_into_memory: + self.images = [] + if verbose: it = tqdm(self.filenames, desc='Loading images into memory', file=sys.stdout) + else: it = self.filenames + for filename in it: + with Image.open(filename) as image: + self.images.append(np.array(image, dtype=np.uint8)) + + if ret: # In case we want to return these + return self.images, self.filenames, self.labels, self.image_ids + + def parse_xml(self, + images_dirs, + image_set_filenames, + annotations_dirs=[], + classes=['background', + 'aeroplane', 'bicycle', 'bird', 'boat', + 'bottle', 'bus', 'car', 'cat', + 'chair', 'cow', 'diningtable', 'dog', + 'horse', 'motorbike', 'person', 'pottedplant', + 'sheep', 'sofa', 'train', 'tvmonitor'], + include_classes = 'all', + exclude_truncated=False, + exclude_difficult=False, + ret=False, + verbose=True): + ''' + This is an XML parser for the Pascal VOC datasets. It might be applicable to other datasets with minor changes to + the code, but in its current form it expects the data format and XML tags of the Pascal VOC datasets. + + Arguments: + images_dirs (list): A list of strings, where each string is the path of a directory that + contains images that are to be part of the dataset. This allows you to aggregate multiple datasets + into one (e.g. one directory that contains the images for Pascal VOC 2007, another that contains + the images for Pascal VOC 2012, etc.). + image_set_filenames (list): A list of strings, where each string is the path of the text file with the image + set to be loaded. Must be one file per image directory given. These text files define what images in the + respective image directories are to be part of the dataset and simply contains one image ID per line + and nothing else. 
+ annotations_dirs (list, optional): A list of strings, where each string is the path of a directory that + contains the annotations (XML files) that belong to the images in the respective image directories given. + The directories must contain one XML file per image and the name of an XML file must be the image ID + of the image it belongs to. The content of the XML files must be in the Pascal VOC format. + classes (list, optional): A list containing the names of the object classes as found in the + `name` XML tags. Must include the class `background` as the first list item. The order of this list + defines the class IDs. + include_classes (list, optional): Either 'all' or a list of integers containing the class IDs that + are to be included in the dataset. If 'all', all ground truth boxes will be included in the dataset. + exclude_truncated (bool, optional): If `True`, excludes boxes that are labeled as 'truncated'. + exclude_difficult (bool, optional): If `True`, excludes boxes that are labeled as 'difficult'. + ret (bool, optional): Whether or not to return the outputs of the parser. + verbose (bool, optional): If `True`, prints out the progress for operations that may take a bit longer. + + Returns: + None by default, optionally lists for whichever are available of images, image filenames, labels, image IDs, + and a list indicating which boxes are annotated with the label "difficult". + ''' + # Set class members. + self.images_dirs = images_dirs + self.annotations_dirs = annotations_dirs + self.image_set_filenames = image_set_filenames + self.classes = classes + self.include_classes = include_classes + + # Erase data that might have been parsed before. + self.filenames = [] + self.image_ids = [] + self.labels = [] + self.eval_neutral = [] + if not annotations_dirs: + self.labels = None + self.eval_neutral = None + annotations_dirs = [None] * len(images_dirs) + + for images_dir, image_set_filename, annotations_dir in zip(images_dirs, image_set_filenames, annotations_dirs): + # Read the image set file that so that we know all the IDs of all the images to be included in the dataset. + with open(image_set_filename) as f: + image_ids = [line.strip() for line in f] # Note: These are strings, not integers. + self.image_ids += image_ids + + if verbose: it = tqdm(image_ids, desc="Processing image set '{}'".format(os.path.basename(image_set_filename)), file=sys.stdout) + else: it = image_ids + + # Loop over all images in this dataset. + for image_id in it: + + filename = '{}'.format(image_id) + '.jpg' + self.filenames.append(os.path.join(images_dir, filename)) + + if not annotations_dir is None: + # Parse the XML file for this image. + with open(os.path.join(annotations_dir, image_id + '.xml')) as f: + soup = BeautifulSoup(f, 'xml') + + folder = soup.folder.text # In case we want to return the folder in addition to the image file name. Relevant for determining which dataset an image belongs to. + #filename = soup.filename.text + + boxes = [] # We'll store all boxes for this image here. + eval_neutr = [] # We'll store whether a box is annotated as "difficult" here. + objects = soup.find_all('object') # Get a list of all objects in this image. + + # Parse the data for each object. + for obj in objects: + class_name = obj.find('name', recursive=False).text + class_id = self.classes.index(class_name) + # Check whether this class is supposed to be included in the dataset. 
+ if (not self.include_classes == 'all') and (not class_id in self.include_classes): continue + pose = obj.find('pose', recursive=False).text + truncated = int(obj.find('truncated', recursive=False).text) + if exclude_truncated and (truncated == 1): continue + difficult = int(obj.find('difficult', recursive=False).text) + if exclude_difficult and (difficult == 1): continue + # Get the bounding box coordinates. + bndbox = obj.find('bndbox', recursive=False) + xmin = int(bndbox.xmin.text) + ymin = int(bndbox.ymin.text) + xmax = int(bndbox.xmax.text) + ymax = int(bndbox.ymax.text) + item_dict = {'folder': folder, + 'image_name': filename, + 'image_id': image_id, + 'class_name': class_name, + 'class_id': class_id, + 'pose': pose, + 'truncated': truncated, + 'difficult': difficult, + 'xmin': xmin, + 'ymin': ymin, + 'xmax': xmax, + 'ymax': ymax} + box = [] + for item in self.labels_output_format: + box.append(item_dict[item]) + boxes.append(box) + if difficult: eval_neutr.append(True) + else: eval_neutr.append(False) + + self.labels.append(boxes) + self.eval_neutral.append(eval_neutr) + + self.dataset_size = len(self.filenames) + self.dataset_indices = np.arange(self.dataset_size, dtype=np.int32) + if self.load_images_into_memory: + self.images = [] + if verbose: it = tqdm(self.filenames, desc='Loading images into memory', file=sys.stdout) + else: it = self.filenames + for filename in it: + with Image.open(filename) as image: + self.images.append(np.array(image, dtype=np.uint8)) + + if ret: + return self.images, self.filenames, self.labels, self.image_ids, self.eval_neutral + + def parse_json(self, + images_dirs, + annotations_filenames, + ground_truth_available=False, + include_classes='all', + ret=False, + verbose=True): + ''' + This is an JSON parser for the MS COCO datasets. It might be applicable to other datasets with minor changes to + the code, but in its current form it expects the JSON format of the MS COCO datasets. + + Arguments: + images_dirs (list, optional): A list of strings, where each string is the path of a directory that + contains images that are to be part of the dataset. This allows you to aggregate multiple datasets + into one (e.g. one directory that contains the images for MS COCO Train 2014, another one for MS COCO + Val 2014, another one for MS COCO Train 2017 etc.). + annotations_filenames (list): A list of strings, where each string is the path of the JSON file + that contains the annotations for the images in the respective image directories given, i.e. one + JSON file per image directory that contains the annotations for all images in that directory. + The content of the JSON files must be in MS COCO object detection format. Note that these annotations + files do not necessarily need to contain ground truth information. MS COCO also provides annotations + files without ground truth information for the test datasets, called `image_info_[...].json`. + ground_truth_available (bool, optional): Set `True` if the annotations files contain ground truth information. + include_classes (list, optional): Either 'all' or a list of integers containing the class IDs that + are to be included in the dataset. If 'all', all ground truth boxes will be included in the dataset. + ret (bool, optional): Whether or not to return the outputs of the parser. + verbose (bool, optional): If `True`, prints out the progress for operations that may take a bit longer. + + Returns: + None by default, optionally lists for whichever are available of images, image filenames, labels and image IDs. 
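+
+ Example (an illustrative sketch; the MS COCO directory layout and file names
+ below are hypothetical placeholders):
+
+     dataset = DataGenerator()
+     dataset.parse_json(images_dirs=['data/coco/train2017'],
+                        annotations_filenames=['data/coco/annotations/instances_train2017.json'],
+                        ground_truth_available=True,
+                        include_classes='all')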
+ ''' + self.images_dirs = images_dirs + self.annotations_filenames = annotations_filenames + self.include_classes = include_classes + # Erase data that might have been parsed before. + self.filenames = [] + self.image_ids = [] + self.labels = [] + if not ground_truth_available: + self.labels = None + + # Build the dictionaries that map between class names and class IDs. + with open(annotations_filenames[0], 'r') as f: + annotations = json.load(f) + # Unfortunately the 80 MS COCO class IDs are not all consecutive. They go + # from 1 to 90 and some numbers are skipped. Since the IDs that we feed + # into a neural network must be consecutive, we'll save both the original + # (non-consecutive) IDs as well as transformed maps. + # We'll save both the map between the original + self.cats_to_names = {} # The map between class names (values) and their original IDs (keys) + self.classes_to_names = [] # A list of the class names with their indices representing the transformed IDs + self.classes_to_names.append('background') # Need to add the background class first so that the indexing is right. + self.cats_to_classes = {} # A dictionary that maps between the original (keys) and the transformed IDs (values) + self.classes_to_cats = {} # A dictionary that maps between the transformed (keys) and the original IDs (values) + for i, cat in enumerate(annotations['categories']): + self.cats_to_names[cat['id']] = cat['name'] + self.classes_to_names.append(cat['name']) + self.cats_to_classes[cat['id']] = i + 1 + self.classes_to_cats[i + 1] = cat['id'] + + # Iterate over all datasets. + for images_dir, annotations_filename in zip(self.images_dirs, self.annotations_filenames): + # Load the JSON file. + with open(annotations_filename, 'r') as f: + annotations = json.load(f) + + if ground_truth_available: + # Create the annotations map, a dictionary whose keys are the image IDs + # and whose values are the annotations for the respective image ID. + image_ids_to_annotations = defaultdict(list) + for annotation in annotations['annotations']: + image_ids_to_annotations[annotation['image_id']].append(annotation) + + if verbose: it = tqdm(annotations['images'], desc="Processing '{}'".format(os.path.basename(annotations_filename)), file=sys.stdout) + else: it = annotations['images'] + + # Loop over all images in this dataset. + for img in it: + + self.filenames.append(os.path.join(images_dir, img['file_name'])) + self.image_ids.append(img['id']) + + if ground_truth_available: + # Get all annotations for this image. + annotations = image_ids_to_annotations[img['id']] + boxes = [] + for annotation in annotations: + cat_id = annotation['category_id'] + # Check if this class is supposed to be included in the dataset. + if (not self.include_classes == 'all') and (not cat_id in self.include_classes): continue + # Transform the original class ID to fit in the sequence of consecutive IDs. + class_id = self.cats_to_classes[cat_id] + xmin = annotation['bbox'][0] + ymin = annotation['bbox'][1] + width = annotation['bbox'][2] + height = annotation['bbox'][3] + # Compute `xmax` and `ymax`. 
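+ # MS COCO boxes are given as `[xmin, ymin, width, height]`, so the right and
+ # bottom box coordinates follow by adding the width and height to the top-left corner.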
+ xmax = xmin + width + ymax = ymin + height + item_dict = {'image_name': img['file_name'], + 'image_id': img['id'], + 'class_id': class_id, + 'xmin': xmin, + 'ymin': ymin, + 'xmax': xmax, + 'ymax': ymax} + box = [] + for item in self.labels_output_format: + box.append(item_dict[item]) + boxes.append(box) + self.labels.append(boxes) + + self.dataset_size = len(self.filenames) + self.dataset_indices = np.arange(self.dataset_size, dtype=np.int32) + if self.load_images_into_memory: + self.images = [] + if verbose: it = tqdm(self.filenames, desc='Loading images into memory', file=sys.stdout) + else: it = self.filenames + for filename in it: + with Image.open(filename) as image: + self.images.append(np.array(image, dtype=np.uint8)) + + if ret: + return self.images, self.filenames, self.labels, self.image_ids + + def create_hdf5_dataset(self, + file_path='dataset.h5', + resize=False, + variable_image_size=True, + verbose=True): + ''' + Converts the currently loaded dataset into a HDF5 file. This HDF5 file contains all + images as uncompressed arrays in a contiguous block of memory, which allows for them + to be loaded faster. Such an uncompressed dataset, however, may take up considerably + more space on your hard drive than the sum of the source images in a compressed format + such as JPG or PNG. + + It is recommended that you always convert the dataset into an HDF5 dataset if you + have enugh hard drive space since loading from an HDF5 dataset accelerates the data + generation noticeably. + + Note that you must load a dataset (e.g. via one of the parser methods) before creating + an HDF5 dataset from it. + + The created HDF5 dataset will remain open upon its creation so that it can be used right + away. + + Arguments: + file_path (str, optional): The full file path under which to store the HDF5 dataset. + You can load this output file via the `DataGenerator` constructor in the future. + resize (tuple, optional): `False` or a 2-tuple `(height, width)` that represents the + target size for the images. All images in the dataset will be resized to this + target size before they will be written to the HDF5 file. If `False`, no resizing + will be performed. + variable_image_size (bool, optional): The only purpose of this argument is that its + value will be stored in the HDF5 dataset in order to be able to quickly find out + whether the images in the dataset all have the same size or not. + verbose (bool, optional): Whether or not prit out the progress of the dataset creation. + + Returns: + None. + ''' + + self.hdf5_dataset_path = file_path + + dataset_size = len(self.filenames) + + # Create the HDF5 file. + hdf5_dataset = h5py.File(file_path, 'w') + + # Create a few attributes that tell us what this dataset contains. + # The dataset will obviously always contain images, but maybe it will + # also contain labels, image IDs, etc. + hdf5_dataset.attrs.create(name='has_labels', data=False, shape=None, dtype=np.bool_) + hdf5_dataset.attrs.create(name='has_image_ids', data=False, shape=None, dtype=np.bool_) + hdf5_dataset.attrs.create(name='has_eval_neutral', data=False, shape=None, dtype=np.bool_) + # It's useful to be able to quickly check whether the images in a dataset all + # have the same size or not, so add a boolean attribute for that. 
+ if variable_image_size and not resize: + hdf5_dataset.attrs.create(name='variable_image_size', data=True, shape=None, dtype=np.bool_) + else: + hdf5_dataset.attrs.create(name='variable_image_size', data=False, shape=None, dtype=np.bool_) + + # Create the dataset in which the images will be stored as flattened arrays. + # This allows us, among other things, to store images of variable size. + hdf5_images = hdf5_dataset.create_dataset(name='images', + shape=(dataset_size,), + maxshape=(None), + dtype=h5py.special_dtype(vlen=np.uint8)) + + # Create the dataset that will hold the image heights, widths and channels that + # we need in order to reconstruct the images from the flattened arrays later. + hdf5_image_shapes = hdf5_dataset.create_dataset(name='image_shapes', + shape=(dataset_size, 3), + maxshape=(None, 3), + dtype=np.int32) + + if not (self.labels is None): + + # Create the dataset in which the labels will be stored as flattened arrays. + hdf5_labels = hdf5_dataset.create_dataset(name='labels', + shape=(dataset_size,), + maxshape=(None), + dtype=h5py.special_dtype(vlen=np.int32)) + + # Create the dataset that will hold the dimensions of the labels arrays for + # each image so that we can restore the labels from the flattened arrays later. + hdf5_label_shapes = hdf5_dataset.create_dataset(name='label_shapes', + shape=(dataset_size, 2), + maxshape=(None, 2), + dtype=np.int32) + + hdf5_dataset.attrs.modify(name='has_labels', value=True) + + if not (self.image_ids is None): + + hdf5_image_ids = hdf5_dataset.create_dataset(name='image_ids', + shape=(dataset_size,), + maxshape=(None), + dtype=h5py.special_dtype(vlen=str)) + + hdf5_dataset.attrs.modify(name='has_image_ids', value=True) + + if not (self.eval_neutral is None): + + # Create the dataset in which the labels will be stored as flattened arrays. + hdf5_eval_neutral = hdf5_dataset.create_dataset(name='eval_neutral', + shape=(dataset_size,), + maxshape=(None), + dtype=h5py.special_dtype(vlen=np.bool_)) + + hdf5_dataset.attrs.modify(name='has_eval_neutral', value=True) + + if verbose: + tr = trange(dataset_size, desc='Creating HDF5 dataset', file=sys.stdout) + else: + tr = range(dataset_size) + + # Iterate over all images in the dataset. + for i in tr: + + # Store the image. + with Image.open(self.filenames[i]) as image: + + image = np.asarray(image, dtype=np.uint8) + + # Make sure all images end up having three channels. + if image.ndim == 2: + image = np.stack([image] * 3, axis=-1) + elif image.ndim == 3: + if image.shape[2] == 1: + image = np.concatenate([image] * 3, axis=-1) + elif image.shape[2] == 4: + image = image[:,:,:3] + + if resize: + image = cv2.resize(image, dsize=(resize[1], resize[0])) + + # Flatten the image array and write it to the images dataset. + hdf5_images[i] = image.reshape(-1) + # Write the image's shape to the image shapes dataset. + hdf5_image_shapes[i] = image.shape + + # Store the ground truth if we have any. + if not (self.labels is None): + + labels = np.asarray(self.labels[i]) + # Flatten the labels array and write it to the labels dataset. + hdf5_labels[i] = labels.reshape(-1) + # Write the labels' shape to the label shapes dataset. + hdf5_label_shapes[i] = labels.shape + + # Store the image ID if we have one. + if not (self.image_ids is None): + + hdf5_image_ids[i] = self.image_ids[i] + + # Store the evaluation-neutrality annotations if we have any. 
+ if not (self.eval_neutral is None): + + hdf5_eval_neutral[i] = self.eval_neutral[i] + + hdf5_dataset.close() + self.hdf5_dataset = h5py.File(file_path, 'r') + self.hdf5_dataset_path = file_path + self.dataset_size = len(self.hdf5_dataset['images']) + self.dataset_indices = np.arange(self.dataset_size, dtype=np.int32) # Instead of shuffling the HDF5 dataset, we will shuffle this index list. + + def generate(self, + batch_size=32, + shuffle=True, + transformations=[], + label_encoder=None, + returns={'processed_images', 'encoded_labels'}, + keep_images_without_gt=False, + degenerate_box_handling='remove'): + ''' + Generates batches of samples and (optionally) corresponding labels indefinitely. + + Can shuffle the samples consistently after each complete pass. + + Optionally takes a list of arbitrary image transformations to apply to the + samples ad hoc. + + Arguments: + batch_size (int, optional): The size of the batches to be generated. + shuffle (bool, optional): Whether or not to shuffle the dataset before each pass. + This option should always be `True` during training, but it can be useful to turn shuffling off + for debugging or if you're using the generator for prediction. + transformations (list, optional): A list of transformations that will be applied to the images and labels + in the given order. Each transformation is a callable that takes as input an image (as a Numpy array) + and optionally labels (also as a Numpy array) and returns an image and optionally labels in the same + format. + label_encoder (callable, optional): Only relevant if labels are given. A callable that takes as input the + labels of a batch (as a list of Numpy arrays) and returns some structure that represents those labels. + The general use case for this is to convert labels from their input format to a format that a given object + detection model needs as its training targets. + returns (set, optional): A set of strings that determines what outputs the generator yields. The generator's output + is always a tuple that contains the outputs specified in this set and only those. If an output is not available, + it will be `None`. The output tuple can contain the following outputs according to the specified keyword strings: + * 'processed_images': An array containing the processed images. Will always be in the outputs, so it doesn't + matter whether or not you include this keyword in the set. + * 'encoded_labels': The encoded labels tensor. Will always be in the outputs if a label encoder is given, + so it doesn't matter whether or not you include this keyword in the set if you pass a label encoder. + * 'matched_anchors': Only available if `labels_encoder` is an `SSDInputEncoder` object. The same as 'encoded_labels', + but containing anchor box coordinates for all matched anchor boxes instead of ground truth coordinates. + This can be useful to visualize what anchor boxes are being matched to each ground truth box. Only available + in training mode. + * 'processed_labels': The processed, but not yet encoded labels. This is a list that contains for each + batch image a Numpy array with all ground truth boxes for that image. Only available if ground truth is available. + * 'filenames': A list containing the file names (full paths) of the images in the batch. + * 'image_ids': A list containing the integer IDs of the images in the batch. Only available if there + are image IDs available. + * 'evaluation-neutral': A nested list of lists of booleans. 
Each list contains `True` or `False` for every ground truth + bounding box of the respective image depending on whether that bounding box is supposed to be evaluation-neutral (`True`) + or not (`False`). May return `None` if there exists no such concept for a given dataset. An example for + evaluation-neutrality are the ground truth boxes annotated as "difficult" in the Pascal VOC datasets, which are + usually treated to be neutral in a model evaluation. + * 'inverse_transform': A nested list that contains a list of "inverter" functions for each item in the batch. + These inverter functions take (predicted) labels for an image as input and apply the inverse of the transformations + that were applied to the original image to them. This makes it possible to let the model make predictions on a + transformed image and then convert these predictions back to the original image. This is mostly relevant for + evaluation: If you want to evaluate your model on a dataset with varying image sizes, then you are forced to + transform the images somehow (e.g. by resizing or cropping) to make them all the same size. Your model will then + predict boxes for those transformed images, but for the evaluation you will need predictions with respect to the + original images, not with respect to the transformed images. This means you will have to transform the predicted + box coordinates back to the original image sizes. Note that for each image, the inverter functions for that + image need to be applied in the order in which they are given in the respective list for that image. + * 'original_images': A list containing the original images in the batch before any processing. + * 'original_labels': A list containing the original ground truth boxes for the images in this batch before any + processing. Only available if ground truth is available. + The order of the outputs in the tuple is the order of the list above. If `returns` contains a keyword for an + output that is unavailable, that output omitted in the yielded tuples and a warning will be raised. + keep_images_without_gt (bool, optional): If `False`, images for which there aren't any ground truth boxes before + any transformations have been applied will be removed from the batch. If `True`, such images will be kept + in the batch. + degenerate_box_handling (str, optional): How to handle degenerate boxes, which are boxes that have `xmax <= xmin` and/or + `ymax <= ymin`. Degenerate boxes can sometimes be in the dataset, or non-degenerate boxes can become degenerate + after they were processed by transformations. Note that the generator checks for degenerate boxes after all + transformations have been applied (if any), but before the labels were passed to the `label_encoder` (if one was given). + Can be one of 'warn' or 'remove'. If 'warn', the generator will merely print a warning to let you know that there + are degenerate boxes in a batch. If 'remove', the generator will remove degenerate boxes from the batch silently. + + Yields: + The next batch as a tuple of items as defined by the `returns` argument. + ''' + + if self.dataset_size == 0: + raise DatasetError("Cannot generate batches because you did not load a dataset.") + + ############################################################################################# + # Warn if any of the set returns aren't possible. 
+ ############################################################################################# + + if self.labels is None: + if any([ret in returns for ret in ['original_labels', 'processed_labels', 'encoded_labels', 'matched_anchors', 'evaluation-neutral']]): + warnings.warn("Since no labels were given, none of 'original_labels', 'processed_labels', 'evaluation-neutral', 'encoded_labels', and 'matched_anchors' " + + "are possible returns, but you set `returns = {}`. The impossible returns will be `None`.".format(returns)) + elif label_encoder is None: + if any([ret in returns for ret in ['encoded_labels', 'matched_anchors']]): + warnings.warn("Since no label encoder was given, 'encoded_labels' and 'matched_anchors' aren't possible returns, " + + "but you set `returns = {}`. The impossible returns will be `None`.".format(returns)) + elif not isinstance(label_encoder, SSDInputEncoder): + if 'matched_anchors' in returns: + warnings.warn("`label_encoder` is not an `SSDInputEncoder` object, therefore 'matched_anchors' is not a possible return, " + + "but you set `returns = {}`. The impossible returns will be `None`.".format(returns)) + + ############################################################################################# + # Do a few preparatory things like maybe shuffling the dataset initially. + ############################################################################################# + + if shuffle: + objects_to_shuffle = [self.dataset_indices] + if not (self.filenames is None): + objects_to_shuffle.append(self.filenames) + if not (self.labels is None): + objects_to_shuffle.append(self.labels) + if not (self.image_ids is None): + objects_to_shuffle.append(self.image_ids) + if not (self.eval_neutral is None): + objects_to_shuffle.append(self.eval_neutral) + shuffled_objects = sklearn.utils.shuffle(*objects_to_shuffle) + for i in range(len(objects_to_shuffle)): + objects_to_shuffle[i][:] = shuffled_objects[i] + + if degenerate_box_handling == 'remove': + box_filter = BoxFilter(check_overlap=False, + check_min_area=False, + check_degenerate=True, + labels_format=self.labels_format) + + # Override the labels formats of all the transformations to make sure they are set correctly. + if not (self.labels is None): + for transform in transformations: + transform.labels_format = self.labels_format + + ############################################################################################# + # Generate mini batches. + ############################################################################################# + + current = 0 + + while True: + + batch_X, batch_y = [], [] + + if current >= self.dataset_size: + current = 0 + + ######################################################################################### + # Maybe shuffle the dataset if a full pass over the dataset has finished. 
+ ######################################################################################### + + if shuffle: + objects_to_shuffle = [self.dataset_indices] + if not (self.filenames is None): + objects_to_shuffle.append(self.filenames) + if not (self.labels is None): + objects_to_shuffle.append(self.labels) + if not (self.image_ids is None): + objects_to_shuffle.append(self.image_ids) + if not (self.eval_neutral is None): + objects_to_shuffle.append(self.eval_neutral) + shuffled_objects = sklearn.utils.shuffle(*objects_to_shuffle) + for i in range(len(objects_to_shuffle)): + objects_to_shuffle[i][:] = shuffled_objects[i] + + ######################################################################################### + # Get the images, (maybe) image IDs, (maybe) labels, etc. for this batch. + ######################################################################################### + + # We prioritize our options in the following order: + # 1) If we have the images already loaded in memory, get them from there. + # 2) Else, if we have an HDF5 dataset, get the images from there. + # 3) Else, if we have neither of the above, we'll have to load the individual image + # files from disk. + batch_indices = self.dataset_indices[current:current+batch_size] + if not (self.images is None): + for i in batch_indices: + batch_X.append(self.images[i]) + if not (self.filenames is None): + batch_filenames = self.filenames[current:current+batch_size] + else: + batch_filenames = None + elif not (self.hdf5_dataset is None): + for i in batch_indices: + batch_X.append(self.hdf5_dataset['images'][i].reshape(self.hdf5_dataset['image_shapes'][i])) + if not (self.filenames is None): + batch_filenames = self.filenames[current:current+batch_size] + else: + batch_filenames = None + else: + batch_filenames = self.filenames[current:current+batch_size] + for filename in batch_filenames: + with Image.open(filename) as image: + batch_X.append(np.array(image, dtype=np.uint8)) + + # Get the labels for this batch (if there are any). + if not (self.labels is None): + batch_y = deepcopy(self.labels[current:current+batch_size]) + else: + batch_y = None + + if not (self.eval_neutral is None): + batch_eval_neutral = self.eval_neutral[current:current+batch_size] + else: + batch_eval_neutral = None + + # Get the image IDs for this batch (if there are any). + if not (self.image_ids is None): + batch_image_ids = self.image_ids[current:current+batch_size] + else: + batch_image_ids = None + + if 'original_images' in returns: + batch_original_images = deepcopy(batch_X) # The original, unaltered images + if 'original_labels' in returns: + batch_original_labels = deepcopy(batch_y) # The original, unaltered labels + + current += batch_size + + ######################################################################################### + # Maybe perform image transformations. + ######################################################################################### + + batch_items_to_remove = [] # In case we need to remove any images from the batch, store their indices in this list. + batch_inverse_transforms = [] + + for i in range(len(batch_X)): + + if not (self.labels is None): + # Convert the labels for this image to an array (in case they aren't already). + batch_y[i] = np.array(batch_y[i]) + # If this image has no ground truth boxes, maybe we don't want to keep it in the batch. 
+ if (batch_y[i].size == 0) and not keep_images_without_gt: + batch_items_to_remove.append(i) + batch_inverse_transforms.append([]) + continue + + # Apply any image transformations we may have received. + if transformations: + + inverse_transforms = [] + + for transform in transformations: + + if not (self.labels is None): + + if ('inverse_transform' in returns) and ('return_inverter' in inspect.signature(transform).parameters): + batch_X[i], batch_y[i], inverse_transform = transform(batch_X[i], batch_y[i], return_inverter=True) + inverse_transforms.append(inverse_transform) + else: + batch_X[i], batch_y[i] = transform(batch_X[i], batch_y[i]) + + if batch_X[i] is None: # In case the transform failed to produce an output image, which is possible for some random transforms. + batch_items_to_remove.append(i) + batch_inverse_transforms.append([]) + continue + + else: + + if ('inverse_transform' in returns) and ('return_inverter' in inspect.signature(transform).parameters): + batch_X[i], inverse_transform = transform(batch_X[i], return_inverter=True) + inverse_transforms.append(inverse_transform) + else: + batch_X[i] = transform(batch_X[i]) + + batch_inverse_transforms.append(inverse_transforms[::-1]) + + ######################################################################################### + # Check for degenerate boxes in this batch item. + ######################################################################################### + + if not (self.labels is None): + + xmin = self.labels_format['xmin'] + ymin = self.labels_format['ymin'] + xmax = self.labels_format['xmax'] + ymax = self.labels_format['ymax'] + + if np.any(batch_y[i][:,xmax] - batch_y[i][:,xmin] <= 0) or np.any(batch_y[i][:,ymax] - batch_y[i][:,ymin] <= 0): + if degenerate_box_handling == 'warn': + warnings.warn("Detected degenerate ground truth bounding boxes for batch item {} with bounding boxes {}, ".format(i, batch_y[i]) + + "i.e. bounding boxes where xmax <= xmin and/or ymax <= ymin. " + + "This could mean that your dataset contains degenerate ground truth boxes, or that any image transformations you may apply might " + + "result in degenerate ground truth boxes, or that you are parsing the ground truth in the wrong coordinate format." + + "Degenerate ground truth bounding boxes may lead to NaN errors during the training.") + elif degenerate_box_handling == 'remove': + batch_y[i] = box_filter(batch_y[i]) + if (batch_y[i].size == 0) and not keep_images_without_gt: + batch_items_to_remove.append(i) + + ######################################################################################### + # Remove any items we might not want to keep from the batch. + ######################################################################################### + + if batch_items_to_remove: + for j in sorted(batch_items_to_remove, reverse=True): + # This isn't efficient, but it hopefully shouldn't need to be done often anyway. 
+ batch_X.pop(j) + batch_filenames.pop(j) + if batch_inverse_transforms: batch_inverse_transforms.pop(j) + if not (self.labels is None): batch_y.pop(j) + if not (self.image_ids is None): batch_image_ids.pop(j) + if not (self.eval_neutral is None): batch_eval_neutral.pop(j) + if 'original_images' in returns: batch_original_images.pop(j) + if 'original_labels' in returns and not (self.labels is None): batch_original_labels.pop(j) + + ######################################################################################### + + # CAUTION: Converting `batch_X` into an array will result in an empty batch if the images have varying sizes + # or varying numbers of channels. At this point, all images must have the same size and the same + # number of channels. + batch_X = np.array(batch_X) + if (batch_X.size == 0): + raise DegenerateBatchError("You produced an empty batch. This might be because the images in the batch vary " + + "in their size and/or number of channels. Note that after all transformations " + + "(if any were given) have been applied to all images in the batch, all images " + + "must be homogenous in size along all axes.") + + ######################################################################################### + # If we have a label encoder, encode our labels. + ######################################################################################### + + if not (label_encoder is None or self.labels is None): + + if ('matched_anchors' in returns) and isinstance(label_encoder, SSDInputEncoder): + batch_y_encoded, batch_matched_anchors = label_encoder(batch_y, diagnostics=True) + else: + batch_y_encoded = label_encoder(batch_y, diagnostics=False) + batch_matched_anchors = None + + else: + batch_y_encoded = None + batch_matched_anchors = None + + ######################################################################################### + # Compose the output. + ######################################################################################### + + ret = [] + if 'processed_images' in returns: ret.append(batch_X) + if 'encoded_labels' in returns: ret.append(batch_y_encoded) + if 'matched_anchors' in returns: ret.append(batch_matched_anchors) + if 'processed_labels' in returns: ret.append(batch_y) + if 'filenames' in returns: ret.append(batch_filenames) + if 'image_ids' in returns: ret.append(batch_image_ids) + if 'evaluation-neutral' in returns: ret.append(batch_eval_neutral) + if 'inverse_transform' in returns: ret.append(batch_inverse_transforms) + if 'original_images' in returns: ret.append(batch_original_images) + if 'original_labels' in returns: ret.append(batch_original_labels) + + yield ret + + def save_dataset(self, + filenames_path='filenames.pkl', + labels_path=None, + image_ids_path=None, + eval_neutral_path=None): + ''' + Writes the current `filenames`, `labels`, and `image_ids` lists to the specified files. + This is particularly useful for large datasets with annotations that are + parsed from XML files, which can take quite long. If you'll be using the + same dataset repeatedly, you don't want to have to parse the XML label + files every time. + + Arguments: + filenames_path (str): The path under which to save the filenames pickle. + labels_path (str): The path under which to save the labels pickle. + image_ids_path (str, optional): The path under which to save the image IDs pickle. + eval_neutral_path (str, optional): The path under which to save the pickle for + the evaluation-neutrality annotations. 
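+
+ Example (an illustrative sketch; the output file names are arbitrary placeholders):
+
+     dataset.save_dataset(filenames_path='voc_filenames.pkl',
+                          labels_path='voc_labels.pkl',
+                          image_ids_path='voc_image_ids.pkl',
+                          eval_neutral_path='voc_eval_neutral.pkl')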
+ ''' + with open(filenames_path, 'wb') as f: + pickle.dump(self.filenames, f) + if not labels_path is None: + with open(labels_path, 'wb') as f: + pickle.dump(self.labels, f) + if not image_ids_path is None: + with open(image_ids_path, 'wb') as f: + pickle.dump(self.image_ids, f) + if not eval_neutral_path is None: + with open(eval_neutral_path, 'wb') as f: + pickle.dump(self.eval_neutral, f) + + def get_dataset(self): + ''' + Returns: + 4-tuple containing lists and/or `None` for the filenames, labels, image IDs, + and evaluation-neutrality annotations. + ''' + return self.filenames, self.labels, self.image_ids, self.eval_neutral + + def get_dataset_size(self): + ''' + Returns: + The number of images in the dataset. + ''' + return self.dataset_size diff --git a/keras_ssd/data_generator/object_detection_2d_geometric_ops.py b/keras_ssd/data_generator/object_detection_2d_geometric_ops.py new file mode 100644 index 0000000..1b36815 --- /dev/null +++ b/keras_ssd/data_generator/object_detection_2d_geometric_ops.py @@ -0,0 +1,779 @@ +''' +Various geometric image transformations for 2D object detection, both deterministic +and probabilistic. + +Copyright (C) 2018 Pierluigi Ferrari + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +''' + +from __future__ import division +import numpy as np +import cv2 +import random + +from data_generator.object_detection_2d_image_boxes_validation_utils import BoxFilter, ImageValidator + +class Resize: + ''' + Resizes images to a specified height and width in pixels. + ''' + + def __init__(self, + height, + width, + interpolation_mode=cv2.INTER_LINEAR, + box_filter=None, + labels_format={'class_id': 0, 'xmin': 1, 'ymin': 2, 'xmax': 3, 'ymax': 4}): + ''' + Arguments: + height (int): The desired height of the output images in pixels. + width (int): The desired width of the output images in pixels. + interpolation_mode (int, optional): An integer that denotes a valid + OpenCV interpolation mode. For example, integers 0 through 5 are + valid interpolation modes. + box_filter (BoxFilter, optional): Only relevant if ground truth bounding boxes are given. + A `BoxFilter` object to filter out bounding boxes that don't meet the given criteria + after the transformation. Refer to the `BoxFilter` documentation for details. If `None`, + the validity of the bounding boxes is not checked. + labels_format (dict, optional): A dictionary that defines which index in the last axis of the labels + of an image contains which bounding box coordinate. The dictionary maps at least the keywords + 'xmin', 'ymin', 'xmax', and 'ymax' to their respective indices within last axis of the labels array. 
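+
+ Example (an illustrative sketch; assumes `image` is a Numpy array of shape
+ `(height, width, 3)` and `labels` is a Numpy array in the default
+ `(class_id, xmin, ymin, xmax, ymax)` format):
+
+     resize = Resize(height=300, width=300)
+     image, labels = resize(image, labels)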
+ ''' + if not (isinstance(box_filter, BoxFilter) or box_filter is None): + raise ValueError("`box_filter` must be either `None` or a `BoxFilter` object.") + self.out_height = height + self.out_width = width + self.interpolation_mode = interpolation_mode + self.box_filter = box_filter + self.labels_format = labels_format + + def __call__(self, image, labels=None, return_inverter=False): + + img_height, img_width = image.shape[:2] + + xmin = self.labels_format['xmin'] + ymin = self.labels_format['ymin'] + xmax = self.labels_format['xmax'] + ymax = self.labels_format['ymax'] + + image = cv2.resize(image, + dsize=(self.out_width, self.out_height), + interpolation=self.interpolation_mode) + + if return_inverter: + def inverter(labels): + labels = np.copy(labels) + labels[:, [ymin+1, ymax+1]] = np.round(labels[:, [ymin+1, ymax+1]] * (img_height / self.out_height), decimals=0) + labels[:, [xmin+1, xmax+1]] = np.round(labels[:, [xmin+1, xmax+1]] * (img_width / self.out_width), decimals=0) + return labels + + if labels is None: + if return_inverter: + return image, inverter + else: + return image + else: + labels = np.copy(labels) + labels[:, [ymin, ymax]] = np.round(labels[:, [ymin, ymax]] * (self.out_height / img_height), decimals=0) + labels[:, [xmin, xmax]] = np.round(labels[:, [xmin, xmax]] * (self.out_width / img_width), decimals=0) + + if not (self.box_filter is None): + self.box_filter.labels_format = self.labels_format + labels = self.box_filter(labels=labels, + image_height=self.out_height, + image_width=self.out_width) + + if return_inverter: + return image, labels, inverter + else: + return image, labels + +class ResizeRandomInterp: + ''' + Resizes images to a specified height and width in pixels using a radnomly + selected interpolation mode. + ''' + + def __init__(self, + height, + width, + interpolation_modes=[cv2.INTER_NEAREST, + cv2.INTER_LINEAR, + cv2.INTER_CUBIC, + cv2.INTER_AREA, + cv2.INTER_LANCZOS4], + box_filter=None, + labels_format={'class_id': 0, 'xmin': 1, 'ymin': 2, 'xmax': 3, 'ymax': 4}): + ''' + Arguments: + height (int): The desired height of the output image in pixels. + width (int): The desired width of the output image in pixels. + interpolation_modes (list/tuple, optional): A list/tuple of integers + that represent valid OpenCV interpolation modes. For example, + integers 0 through 5 are valid interpolation modes. + box_filter (BoxFilter, optional): Only relevant if ground truth bounding boxes are given. + A `BoxFilter` object to filter out bounding boxes that don't meet the given criteria + after the transformation. Refer to the `BoxFilter` documentation for details. If `None`, + the validity of the bounding boxes is not checked. + labels_format (dict, optional): A dictionary that defines which index in the last axis of the labels + of an image contains which bounding box coordinate. The dictionary maps at least the keywords + 'xmin', 'ymin', 'xmax', and 'ymax' to their respective indices within last axis of the labels array. 
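+
+ Example (an illustrative sketch): restrict the random choice to nearest-neighbor
+ and bilinear interpolation; a mode is drawn anew on every call:
+
+     resize = ResizeRandomInterp(height=300, width=300,
+                                 interpolation_modes=[cv2.INTER_NEAREST, cv2.INTER_LINEAR])
+     image, labels = resize(image, labels)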
+ '''
+ if not (isinstance(interpolation_modes, (list, tuple))):
+ raise ValueError("`interpolation_modes` must be a list or tuple.")
+ self.height = height
+ self.width = width
+ self.interpolation_modes = interpolation_modes
+ self.box_filter = box_filter
+ self.labels_format = labels_format
+ self.resize = Resize(height=self.height,
+ width=self.width,
+ box_filter=self.box_filter,
+ labels_format=self.labels_format)
+
+ def __call__(self, image, labels=None, return_inverter=False):
+ self.resize.interpolation_mode = np.random.choice(self.interpolation_modes)
+ self.resize.labels_format = self.labels_format
+ return self.resize(image, labels, return_inverter)
+
+class Flip:
+ '''
+ Flips images horizontally or vertically.
+ '''
+ def __init__(self,
+ dim='horizontal',
+ labels_format={'class_id': 0, 'xmin': 1, 'ymin': 2, 'xmax': 3, 'ymax': 4}):
+ '''
+ Arguments:
+ dim (str, optional): Can be either of 'horizontal' and 'vertical'.
+ If 'horizontal', images will be flipped horizontally, i.e. along
+ the vertical axis. If 'vertical', images will be flipped vertically,
+ i.e. along the horizontal axis.
+ labels_format (dict, optional): A dictionary that defines which index in the last axis of the labels
+ of an image contains which bounding box coordinate. The dictionary maps at least the keywords
+ 'xmin', 'ymin', 'xmax', and 'ymax' to their respective indices within last axis of the labels array.
+ '''
+ if not (dim in {'horizontal', 'vertical'}): raise ValueError("`dim` can be one of 'horizontal' and 'vertical'.")
+ self.dim = dim
+ self.labels_format = labels_format
+
+ def __call__(self, image, labels=None, return_inverter=False):
+
+ img_height, img_width = image.shape[:2]
+
+ xmin = self.labels_format['xmin']
+ ymin = self.labels_format['ymin']
+ xmax = self.labels_format['xmax']
+ ymax = self.labels_format['ymax']
+
+ if self.dim == 'horizontal':
+ image = image[:,::-1]
+ if labels is None:
+ return image
+ else:
+ labels = np.copy(labels)
+ labels[:, [xmin, xmax]] = img_width - labels[:, [xmax, xmin]]
+ return image, labels
+ else:
+ image = image[::-1]
+ if labels is None:
+ return image
+ else:
+ labels = np.copy(labels)
+ labels[:, [ymin, ymax]] = img_height - labels[:, [ymax, ymin]]
+ return image, labels
+
+class RandomFlip:
+ '''
+ Randomly flips images horizontally or vertically. The randomness only refers
+ to whether or not the image will be flipped.
+ '''
+ def __init__(self,
+ dim='horizontal',
+ prob=0.5,
+ labels_format={'class_id': 0, 'xmin': 1, 'ymin': 2, 'xmax': 3, 'ymax': 4}):
+ '''
+ Arguments:
+ dim (str, optional): Can be either of 'horizontal' and 'vertical'.
+ If 'horizontal', images will be flipped horizontally, i.e. along
+ the vertical axis. If 'vertical', images will be flipped vertically,
+ i.e. along the horizontal axis.
+ prob (float, optional): `(1 - prob)` determines the probability with which the original,
+ unaltered image is returned.
+ labels_format (dict, optional): A dictionary that defines which index in the last axis of the labels
+ of an image contains which bounding box coordinate. The dictionary maps at least the keywords
+ 'xmin', 'ymin', 'xmax', and 'ymax' to their respective indices within last axis of the labels array.
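+
+ Example (an illustrative sketch): flip an image and its boxes horizontally
+ with a probability of 0.5:
+
+     flip = RandomFlip(dim='horizontal', prob=0.5)
+     image, labels = flip(image, labels)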
+ ''' + self.dim = dim + self.prob = prob + self.labels_format = labels_format + self.flip = Flip(dim=self.dim, labels_format=self.labels_format) + + def __call__(self, image, labels=None): + p = np.random.uniform(0,1) + if p >= (1.0-self.prob): + self.flip.labels_format = self.labels_format + return self.flip(image, labels) + elif labels is None: + return image + else: + return image, labels + +class Translate: + ''' + Translates images horizontally and/or vertically. + ''' + + def __init__(self, + dy, + dx, + clip_boxes=True, + box_filter=None, + background=(0,0,0), + labels_format={'class_id': 0, 'xmin': 1, 'ymin': 2, 'xmax': 3, 'ymax': 4}): + ''' + Arguments: + dy (float): The fraction of the image height by which to translate images along the + vertical axis. Positive values translate images downwards, negative values + translate images upwards. + dx (float): The fraction of the image width by which to translate images along the + horizontal axis. Positive values translate images to the right, negative values + translate images to the left. + clip_boxes (bool, optional): Only relevant if ground truth bounding boxes are given. + If `True`, any ground truth bounding boxes will be clipped to lie entirely within the + image after the translation. + box_filter (BoxFilter, optional): Only relevant if ground truth bounding boxes are given. + A `BoxFilter` object to filter out bounding boxes that don't meet the given criteria + after the transformation. Refer to the `BoxFilter` documentation for details. If `None`, + the validity of the bounding boxes is not checked. + background (list/tuple, optional): A 3-tuple specifying the RGB color value of the + background pixels of the translated images. + labels_format (dict, optional): A dictionary that defines which index in the last axis of the labels + of an image contains which bounding box coordinate. The dictionary maps at least the keywords + 'xmin', 'ymin', 'xmax', and 'ymax' to their respective indices within last axis of the labels array. + ''' + + if not (isinstance(box_filter, BoxFilter) or box_filter is None): + raise ValueError("`box_filter` must be either `None` or a `BoxFilter` object.") + self.dy_rel = dy + self.dx_rel = dx + self.clip_boxes = clip_boxes + self.box_filter = box_filter + self.background = background + self.labels_format = labels_format + + def __call__(self, image, labels=None): + + img_height, img_width = image.shape[:2] + + # Compute the translation matrix. + dy_abs = int(round(img_height * self.dy_rel)) + dx_abs = int(round(img_width * self.dx_rel)) + M = np.float32([[1, 0, dx_abs], + [0, 1, dy_abs]]) + + # Translate the image. + image = cv2.warpAffine(image, + M=M, + dsize=(img_width, img_height), + borderMode=cv2.BORDER_CONSTANT, + borderValue=self.background) + + if labels is None: + return image + else: + xmin = self.labels_format['xmin'] + ymin = self.labels_format['ymin'] + xmax = self.labels_format['xmax'] + ymax = self.labels_format['ymax'] + + labels = np.copy(labels) + # Translate the box coordinates to the translated image's coordinate system. + labels[:,[xmin,xmax]] += dx_abs + labels[:,[ymin,ymax]] += dy_abs + + # Compute all valid boxes for this patch. 
+ if not (self.box_filter is None): + self.box_filter.labels_format = self.labels_format + labels = self.box_filter(labels=labels, + image_height=img_height, + image_width=img_width) + + if self.clip_boxes: + labels[:,[ymin,ymax]] = np.clip(labels[:,[ymin,ymax]], a_min=0, a_max=img_height-1) + labels[:,[xmin,xmax]] = np.clip(labels[:,[xmin,xmax]], a_min=0, a_max=img_width-1) + + return image, labels + +class RandomTranslate: + ''' + Randomly translates images horizontally and/or vertically. + ''' + + def __init__(self, + dy_minmax=(0.03,0.3), + dx_minmax=(0.03,0.3), + prob=0.5, + clip_boxes=True, + box_filter=None, + image_validator=None, + n_trials_max=3, + background=(0,0,0), + labels_format={'class_id': 0, 'xmin': 1, 'ymin': 2, 'xmax': 3, 'ymax': 4}): + ''' + Arguments: + dy_minmax (list/tuple, optional): A 2-tuple `(min, max)` of non-negative floats that + determines the minimum and maximum relative translation of images along the vertical + axis both upward and downward. That is, images will be randomly translated by at least + `min` and at most `max` either upward or downward. For example, if `dy_minmax == (0.05,0.3)`, + an image of size `(100,100)` will be translated by at least 5 and at most 30 pixels + either upward or downward. The translation direction is chosen randomly. + dx_minmax (list/tuple, optional): A 2-tuple `(min, max)` of non-negative floats that + determines the minimum and maximum relative translation of images along the horizontal + axis both to the left and right. That is, images will be randomly translated by at least + `min` and at most `max` either left or right. For example, if `dx_minmax == (0.05,0.3)`, + an image of size `(100,100)` will be translated by at least 5 and at most 30 pixels + either left or right. The translation direction is chosen randomly. + prob (float, optional): `(1 - prob)` determines the probability with which the original, + unaltered image is returned. + clip_boxes (bool, optional): Only relevant if ground truth bounding boxes are given. + If `True`, any ground truth bounding boxes will be clipped to lie entirely within the + image after the translation. + box_filter (BoxFilter, optional): Only relevant if ground truth bounding boxes are given. + A `BoxFilter` object to filter out bounding boxes that don't meet the given criteria + after the transformation. Refer to the `BoxFilter` documentation for details. If `None`, + the validity of the bounding boxes is not checked. + image_validator (ImageValidator, optional): Only relevant if ground truth bounding boxes are given. + An `ImageValidator` object to determine whether a translated image is valid. If `None`, + any outcome is valid. + n_trials_max (int, optional): Only relevant if ground truth bounding boxes are given. + Determines the maxmial number of trials to produce a valid image. If no valid image could + be produced in `n_trials_max` trials, returns the unaltered input image. + background (list/tuple, optional): A 3-tuple specifying the RGB color value of the + background pixels of the translated images. + labels_format (dict, optional): A dictionary that defines which index in the last axis of the labels + of an image contains which bounding box coordinate. The dictionary maps at least the keywords + 'xmin', 'ymin', 'xmax', and 'ymax' to their respective indices within last axis of the labels array. 
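A hedged usage sketch of the `Translate` transform above, assuming this file is importable as `data_generator.object_detection_2d_geometric_ops` (the module name is not visible in this hunk) and using invented image and box values:

import numpy as np
from data_generator.object_detection_2d_geometric_ops import Translate

image = np.zeros((480, 640, 3), dtype=np.uint8)  # dummy 640x480 image
labels = np.array([[1, 100, 50, 200, 150]])       # (class_id, xmin, ymin, xmax, ymax)

# Shift the image 10% of its height downwards and 20% of its width to the right.
translate = Translate(dy=0.1, dx=0.2)
translated_image, translated_labels = translate(image, labels)
print(translated_labels)  # [[  1 228  98 328 198]]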
+ ''' + if dy_minmax[0] > dy_minmax[1]: + raise ValueError("It must be `dy_minmax[0] <= dy_minmax[1]`.") + if dx_minmax[0] > dx_minmax[1]: + raise ValueError("It must be `dx_minmax[0] <= dx_minmax[1]`.") + if dy_minmax[0] < 0 or dx_minmax[0] < 0: + raise ValueError("It must be `dy_minmax[0] >= 0` and `dx_minmax[0] >= 0`.") + if not (isinstance(image_validator, ImageValidator) or image_validator is None): + raise ValueError("`image_validator` must be either `None` or an `ImageValidator` object.") + self.dy_minmax = dy_minmax + self.dx_minmax = dx_minmax + self.prob = prob + self.clip_boxes = clip_boxes + self.box_filter = box_filter + self.image_validator = image_validator + self.n_trials_max = n_trials_max + self.background = background + self.labels_format = labels_format + self.translate = Translate(dy=0, + dx=0, + clip_boxes=self.clip_boxes, + box_filter=self.box_filter, + background=self.background, + labels_format=self.labels_format) + + def __call__(self, image, labels=None): + + p = np.random.uniform(0,1) + if p >= (1.0-self.prob): + + img_height, img_width = image.shape[:2] + + xmin = self.labels_format['xmin'] + ymin = self.labels_format['ymin'] + xmax = self.labels_format['xmax'] + ymax = self.labels_format['ymax'] + + # Override the preset labels format. + if not self.image_validator is None: + self.image_validator.labels_format = self.labels_format + self.translate.labels_format = self.labels_format + + for _ in range(max(1, self.n_trials_max)): + + # Pick the relative amount by which to translate. + dy_abs = np.random.uniform(self.dy_minmax[0], self.dy_minmax[1]) + dx_abs = np.random.uniform(self.dx_minmax[0], self.dx_minmax[1]) + # Pick the direction in which to translate. + dy = np.random.choice([-dy_abs, dy_abs]) + dx = np.random.choice([-dx_abs, dx_abs]) + self.translate.dy_rel = dy + self.translate.dx_rel = dx + + if (labels is None) or (self.image_validator is None): + # We either don't have any boxes or if we do, we will accept any outcome as valid. + return self.translate(image, labels) + else: + # Translate the box coordinates to the translated image's coordinate system. + new_labels = np.copy(labels) + new_labels[:, [ymin, ymax]] += int(round(img_height * dy)) + new_labels[:, [xmin, xmax]] += int(round(img_width * dx)) + + # Check if the patch is valid. + if self.image_validator(labels=new_labels, + image_height=img_height, + image_width=img_width): + return self.translate(image, labels) + + # If all attempts failed, return the unaltered input image. + if labels is None: + return image + + else: + return image, labels + + elif labels is None: + return image + + else: + return image, labels + +class Scale: + ''' + Scales images, i.e. zooms in or out. + ''' + + def __init__(self, + factor, + clip_boxes=True, + box_filter=None, + background=(0,0,0), + labels_format={'class_id': 0, 'xmin': 1, 'ymin': 2, 'xmax': 3, 'ymax': 4}): + ''' + Arguments: + factor (float): The fraction of the image size by which to scale images. Must be positive. + clip_boxes (bool, optional): Only relevant if ground truth bounding boxes are given. + If `True`, any ground truth bounding boxes will be clipped to lie entirely within the + image after the translation. + box_filter (BoxFilter, optional): Only relevant if ground truth bounding boxes are given. + A `BoxFilter` object to filter out bounding boxes that don't meet the given criteria + after the transformation. Refer to the `BoxFilter` documentation for details. If `None`, + the validity of the bounding boxes is not checked. 
+ background (list/tuple, optional): A 3-tuple specifying the RGB color value of the potential + background pixels of the scaled images. + labels_format (dict, optional): A dictionary that defines which index in the last axis of the labels + of an image contains which bounding box coordinate. The dictionary maps at least the keywords + 'xmin', 'ymin', 'xmax', and 'ymax' to their respective indices within last axis of the labels array. + ''' + + if factor <= 0: + raise ValueError("It must be `factor > 0`.") + if not (isinstance(box_filter, BoxFilter) or box_filter is None): + raise ValueError("`box_filter` must be either `None` or a `BoxFilter` object.") + self.factor = factor + self.clip_boxes = clip_boxes + self.box_filter = box_filter + self.background = background + self.labels_format = labels_format + + def __call__(self, image, labels=None): + + img_height, img_width = image.shape[:2] + + # Compute the rotation matrix. + M = cv2.getRotationMatrix2D(center=(img_width / 2, img_height / 2), + angle=0, + scale=self.factor) + + # Scale the image. + image = cv2.warpAffine(image, + M=M, + dsize=(img_width, img_height), + borderMode=cv2.BORDER_CONSTANT, + borderValue=self.background) + + if labels is None: + return image + else: + xmin = self.labels_format['xmin'] + ymin = self.labels_format['ymin'] + xmax = self.labels_format['xmax'] + ymax = self.labels_format['ymax'] + + labels = np.copy(labels) + # Scale the bounding boxes accordingly. + # Transform two opposite corner points of the rectangular boxes using the rotation matrix `M`. + toplefts = np.array([labels[:,xmin], labels[:,ymin], np.ones(labels.shape[0])]) + bottomrights = np.array([labels[:,xmax], labels[:,ymax], np.ones(labels.shape[0])]) + new_toplefts = (np.dot(M, toplefts)).T + new_bottomrights = (np.dot(M, bottomrights)).T + labels[:,[xmin,ymin]] = np.round(new_toplefts, decimals=0).astype(np.int) + labels[:,[xmax,ymax]] = np.round(new_bottomrights, decimals=0).astype(np.int) + + # Compute all valid boxes for this patch. + if not (self.box_filter is None): + self.box_filter.labels_format = self.labels_format + labels = self.box_filter(labels=labels, + image_height=img_height, + image_width=img_width) + + if self.clip_boxes: + labels[:,[ymin,ymax]] = np.clip(labels[:,[ymin,ymax]], a_min=0, a_max=img_height-1) + labels[:,[xmin,xmax]] = np.clip(labels[:,[xmin,xmax]], a_min=0, a_max=img_width-1) + + return image, labels + +class RandomScale: + ''' + Randomly scales images. + ''' + + def __init__(self, + min_factor=0.5, + max_factor=1.5, + prob=0.5, + clip_boxes=True, + box_filter=None, + image_validator=None, + n_trials_max=3, + background=(0,0,0), + labels_format={'class_id': 0, 'xmin': 1, 'ymin': 2, 'xmax': 3, 'ymax': 4}): + ''' + Arguments: + min_factor (float, optional): The minimum fraction of the image size by which to scale images. + Must be positive. + max_factor (float, optional): The maximum fraction of the image size by which to scale images. + Must be positive. + prob (float, optional): `(1 - prob)` determines the probability with which the original, + unaltered image is returned. + clip_boxes (bool, optional): Only relevant if ground truth bounding boxes are given. + If `True`, any ground truth bounding boxes will be clipped to lie entirely within the + image after the translation. + box_filter (BoxFilter, optional): Only relevant if ground truth bounding boxes are given. + A `BoxFilter` object to filter out bounding boxes that don't meet the given criteria + after the transformation. 
Refer to the `BoxFilter` documentation for details. If `None`, + the validity of the bounding boxes is not checked. + image_validator (ImageValidator, optional): Only relevant if ground truth bounding boxes are given. + An `ImageValidator` object to determine whether a scaled image is valid. If `None`, + any outcome is valid. + n_trials_max (int, optional): Only relevant if ground truth bounding boxes are given. + Determines the maxmial number of trials to produce a valid image. If no valid image could + be produced in `n_trials_max` trials, returns the unaltered input image. + background (list/tuple, optional): A 3-tuple specifying the RGB color value of the potential + background pixels of the scaled images. + labels_format (dict, optional): A dictionary that defines which index in the last axis of the labels + of an image contains which bounding box coordinate. The dictionary maps at least the keywords + 'xmin', 'ymin', 'xmax', and 'ymax' to their respective indices within last axis of the labels array. + ''' + + if not (0 < min_factor <= max_factor): + raise ValueError("It must be `0 < min_factor <= max_factor`.") + if not (isinstance(image_validator, ImageValidator) or image_validator is None): + raise ValueError("`image_validator` must be either `None` or an `ImageValidator` object.") + self.min_factor = min_factor + self.max_factor = max_factor + self.prob = prob + self.clip_boxes = clip_boxes + self.box_filter = box_filter + self.image_validator = image_validator + self.n_trials_max = n_trials_max + self.background = background + self.labels_format = labels_format + self.scale = Scale(factor=1.0, + clip_boxes=self.clip_boxes, + box_filter=self.box_filter, + background=self.background, + labels_format=self.labels_format) + + def __call__(self, image, labels=None): + + p = np.random.uniform(0,1) + if p >= (1.0-self.prob): + + img_height, img_width = image.shape[:2] + + xmin = self.labels_format['xmin'] + ymin = self.labels_format['ymin'] + xmax = self.labels_format['xmax'] + ymax = self.labels_format['ymax'] + + # Override the preset labels format. + if not self.image_validator is None: + self.image_validator.labels_format = self.labels_format + self.scale.labels_format = self.labels_format + + for _ in range(max(1, self.n_trials_max)): + + # Pick a scaling factor. + factor = np.random.uniform(self.min_factor, self.max_factor) + self.scale.factor = factor + + if (labels is None) or (self.image_validator is None): + # We either don't have any boxes or if we do, we will accept any outcome as valid. + return self.scale(image, labels) + else: + # Scale the bounding boxes accordingly. + # Transform two opposite corner points of the rectangular boxes using the rotation matrix `M`. + toplefts = np.array([labels[:,xmin], labels[:,ymin], np.ones(labels.shape[0])]) + bottomrights = np.array([labels[:,xmax], labels[:,ymax], np.ones(labels.shape[0])]) + + # Compute the rotation matrix. + M = cv2.getRotationMatrix2D(center=(img_width / 2, img_height / 2), + angle=0, + scale=factor) + + new_toplefts = (np.dot(M, toplefts)).T + new_bottomrights = (np.dot(M, bottomrights)).T + + new_labels = np.copy(labels) + new_labels[:,[xmin,ymin]] = np.around(new_toplefts, decimals=0).astype(np.int) + new_labels[:,[xmax,ymax]] = np.around(new_bottomrights, decimals=0).astype(np.int) + + # Check if the patch is valid. + if self.image_validator(labels=new_labels, + image_height=img_height, + image_width=img_width): + return self.scale(image, labels) + + # If all attempts failed, return the unaltered input image. 
+ if labels is None: + return image + + else: + return image, labels + + elif labels is None: + return image + + else: + return image, labels + +class Rotate: + ''' + Rotates images counter-clockwise by 90, 180, or 270 degrees. + ''' + + def __init__(self, + angle, + labels_format={'class_id': 0, 'xmin': 1, 'ymin': 2, 'xmax': 3, 'ymax': 4}): + ''' + Arguments: + angle (int): The angle in degrees by which to rotate the images counter-clockwise. + Only 90, 180, and 270 are valid values. + labels_format (dict, optional): A dictionary that defines which index in the last axis of the labels + of an image contains which bounding box coordinate. The dictionary maps at least the keywords + 'xmin', 'ymin', 'xmax', and 'ymax' to their respective indices within last axis of the labels array. + ''' + + if not angle in {90, 180, 270}: + raise ValueError("`angle` must be in the set {90, 180, 270}.") + self.angle = angle + self.labels_format = labels_format + + def __call__(self, image, labels=None): + + img_height, img_width = image.shape[:2] + + # Compute the rotation matrix. + M = cv2.getRotationMatrix2D(center=(img_width / 2, img_height / 2), + angle=self.angle, + scale=1) + + # Get the sine and cosine from the rotation matrix. + cos_angle = np.abs(M[0, 0]) + sin_angle = np.abs(M[0, 1]) + + # Compute the new bounding dimensions of the image. + img_width_new = int(img_height * sin_angle + img_width * cos_angle) + img_height_new = int(img_height * cos_angle + img_width * sin_angle) + + # Adjust the rotation matrix to take into account the translation. + M[1, 2] += (img_height_new - img_height) / 2 + M[0, 2] += (img_width_new - img_width) / 2 + + # Rotate the image. + image = cv2.warpAffine(image, + M=M, + dsize=(img_width_new, img_height_new)) + + if labels is None: + return image + else: + xmin = self.labels_format['xmin'] + ymin = self.labels_format['ymin'] + xmax = self.labels_format['xmax'] + ymax = self.labels_format['ymax'] + + labels = np.copy(labels) + # Rotate the bounding boxes accordingly. + # Transform two opposite corner points of the rectangular boxes using the rotation matrix `M`. + toplefts = np.array([labels[:,xmin], labels[:,ymin], np.ones(labels.shape[0])]) + bottomrights = np.array([labels[:,xmax], labels[:,ymax], np.ones(labels.shape[0])]) + new_toplefts = (np.dot(M, toplefts)).T + new_bottomrights = (np.dot(M, bottomrights)).T + labels[:,[xmin,ymin]] = np.round(new_toplefts, decimals=0).astype(np.int) + labels[:,[xmax,ymax]] = np.round(new_bottomrights, decimals=0).astype(np.int) + + if self.angle == 90: + # ymin and ymax were switched by the rotation. + labels[:,[ymax,ymin]] = labels[:,[ymin,ymax]] + elif self.angle == 180: + # ymin and ymax were switched by the rotation, + # and also xmin and xmax were switched. + labels[:,[ymax,ymin]] = labels[:,[ymin,ymax]] + labels[:,[xmax,xmin]] = labels[:,[xmin,xmax]] + elif self.angle == 270: + # xmin and xmax were switched by the rotation. + labels[:,[xmax,xmin]] = labels[:,[xmin,xmax]] + + return image, labels + +class RandomRotate: + ''' + Randomly rotates images counter-clockwise. + ''' + + def __init__(self, + angles=[90, 180, 270], + prob=0.5, + labels_format={'class_id': 0, 'xmin': 1, 'ymin': 2, 'xmax': 3, 'ymax': 4}): + ''' + Arguments: + angle (list): The list of angles in degrees from which one is randomly selected to rotate + the images counter-clockwise. Only 90, 180, and 270 are valid values. + prob (float, optional): `(1 - prob)` determines the probability with which the original, + unaltered image is returned. 
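A hedged usage sketch of the `Rotate` transform above on a synthetic image (invented shapes and boxes; assuming the module is importable as `data_generator.object_detection_2d_geometric_ops`, and a NumPy version that still provides the legacy `np.int` alias used by the implementation):

import numpy as np
from data_generator.object_detection_2d_geometric_ops import Rotate

image = np.zeros((480, 640, 3), dtype=np.uint8)  # dummy 640x480 image
labels = np.array([[1, 100, 50, 200, 150]])       # (class_id, xmin, ymin, xmax, ymax)

rotate = Rotate(angle=90)
rotated_image, rotated_labels = rotate(image, labels)

print(rotated_image.shape)  # (640, 480, 3): height and width swap for a 90 degree turn
print(rotated_labels)       # the box corners re-expressed in the rotated image's coordinates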
+ labels_format (dict, optional): A dictionary that defines which index in the last axis of the labels + of an image contains which bounding box coordinate. The dictionary maps at least the keywords + 'xmin', 'ymin', 'xmax', and 'ymax' to their respective indices within last axis of the labels array. + ''' + for angle in angles: + if not angle in {90, 180, 270}: + raise ValueError("`angles` can only contain the values 90, 180, and 270.") + self.angles = angles + self.prob = prob + self.labels_format = labels_format + self.rotate = Rotate(angle=90, labels_format=self.labels_format) + + def __call__(self, image, labels=None): + + p = np.random.uniform(0,1) + if p >= (1.0-self.prob): + # Pick a rotation angle. + self.rotate.angle = random.choice(self.angles) + self.rotate.labels_format = self.labels_format + return self.rotate(image, labels) + + elif labels is None: + return image + + else: + return image, labels diff --git a/keras_ssd/data_generator/object_detection_2d_image_boxes_validation_utils.py b/keras_ssd/data_generator/object_detection_2d_image_boxes_validation_utils.py new file mode 100644 index 0000000..8338fd7 --- /dev/null +++ b/keras_ssd/data_generator/object_detection_2d_image_boxes_validation_utils.py @@ -0,0 +1,322 @@ +''' +Utilities for 2D object detection related to answering the following questions: +1. Given an image size and bounding boxes, which bounding boxes meet certain + requirements with respect to the image size? +2. Given an image size and bounding boxes, is an image of that size valid with + respect to the bounding boxes according to certain requirements? + +Copyright (C) 2018 Pierluigi Ferrari + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +''' + +from __future__ import division +import numpy as np + +from bounding_box_utils.bounding_box_utils import iou + +class BoundGenerator: + ''' + Generates pairs of floating point values that represent lower and upper bounds + from a given sample space. + ''' + def __init__(self, + sample_space=((0.1, None), + (0.3, None), + (0.5, None), + (0.7, None), + (0.9, None), + (None, None)), + weights=None): + ''' + Arguments: + sample_space (list or tuple): A list, tuple, or array-like object of shape + `(n, 2)` that contains `n` samples to choose from, where each sample + is a 2-tuple of scalars and/or `None` values. + weights (list or tuple, optional): A list or tuple representing the distribution + over the sample space. If `None`, a uniform distribution will be assumed. 
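A hedged usage sketch of `BoundGenerator` (its implementation follows below): it is typically used to draw a random minimum-overlap requirement per sampled patch, in the spirit of the SSD data augmentation. The custom weights shown are invented and are passed to `np.random.choice` as probabilities, so they must sum to one; the repository root is assumed to be on the Python path.

from data_generator.object_detection_2d_image_boxes_validation_utils import BoundGenerator

# Default sample space: lower IoU bounds of 0.1, 0.3, 0.5, 0.7, 0.9 plus "no requirement".
bound_generator = BoundGenerator()
lower, upper = bound_generator()
print(lower, upper)  # e.g. 0.3 1.0 -- `None` entries are replaced by 0.0 and 1.0

# A biased variant with invented weights that must sum to 1.0.
biased = BoundGenerator(sample_space=((0.5, None), (None, None)), weights=[0.8, 0.2])
print(biased())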
+ ''' + + if (not (weights is None)) and len(weights) != len(sample_space): + raise ValueError("`weights` must either be `None` for uniform distribution or have the same length as `sample_space`.") + + self.sample_space = [] + for bound_pair in sample_space: + if len(bound_pair) != 2: + raise ValueError("All elements of the sample space must be 2-tuples.") + bound_pair = list(bound_pair) + if bound_pair[0] is None: bound_pair[0] = 0.0 + if bound_pair[1] is None: bound_pair[1] = 1.0 + if bound_pair[0] > bound_pair[1]: + raise ValueError("For all sample space elements, the lower bound cannot be greater than the upper bound.") + self.sample_space.append(bound_pair) + + self.sample_space_size = len(self.sample_space) + + if weights is None: + self.weights = [1.0/self.sample_space_size] * self.sample_space_size + else: + self.weights = weights + + def __call__(self): + ''' + Returns: + An item of the sample space, i.e. a 2-tuple of scalars. + ''' + i = np.random.choice(self.sample_space_size, p=self.weights) + return self.sample_space[i] + +class BoxFilter: + ''' + Returns all bounding boxes that are valid with respect to a the defined criteria. + ''' + + def __init__(self, + check_overlap=True, + check_min_area=True, + check_degenerate=True, + overlap_criterion='center_point', + overlap_bounds=(0.3, 1.0), + min_area=16, + labels_format={'class_id': 0, 'xmin': 1, 'ymin': 2, 'xmax': 3, 'ymax': 4}, + border_pixels='half'): + ''' + Arguments: + check_overlap (bool, optional): Whether or not to enforce the overlap requirements defined by + `overlap_criterion` and `overlap_bounds`. Sometimes you might want to use the box filter only + to enforce a certain minimum area for all boxes (see next argument), in such cases you can + turn the overlap requirements off. + check_min_area (bool, optional): Whether or not to enforce the minimum area requirement defined + by `min_area`. If `True`, any boxes that have an area (in pixels) that is smaller than `min_area` + will be removed from the labels of an image. Bounding boxes below a certain area aren't useful + training examples. An object that takes up only, say, 5 pixels in an image is probably not + recognizable anymore, neither for a human, nor for an object detection model. It makes sense + to remove such boxes. + check_degenerate (bool, optional): Whether or not to check for and remove degenerate bounding boxes. + Degenerate bounding boxes are boxes that have `xmax <= xmin` and/or `ymax <= ymin`. In particular, + boxes with a width and/or height of zero are degenerate. It is obviously important to filter out + such boxes, so you should only set this option to `False` if you are certain that degenerate + boxes are not possible in your data and processing chain. + overlap_criterion (str, optional): Can be either of 'center_point', 'iou', or 'area'. Determines + which boxes are considered valid with respect to a given image. If set to 'center_point', + a given bounding box is considered valid if its center point lies within the image. + If set to 'area', a given bounding box is considered valid if the quotient of its intersection + area with the image and its own area is within the given `overlap_bounds`. If set to 'iou', a given + bounding box is considered valid if its IoU with the image is within the given `overlap_bounds`. + overlap_bounds (list or BoundGenerator, optional): Only relevant if `overlap_criterion` is 'area' or 'iou'. + Determines the lower and upper bounds for `overlap_criterion`. 
Can be either a 2-tuple of scalars
+                representing a lower bound and an upper bound, or a `BoundGenerator` object, which provides
+                the possibility to generate bounds randomly.
+            min_area (int, optional): Only relevant if `check_min_area` is `True`. Defines the minimum area in
+                pixels that a bounding box must have in order to be valid. Boxes with an area smaller than this
+                will be removed.
+            labels_format (dict, optional): A dictionary that defines which index in the last axis of the labels
+                of an image contains which bounding box coordinate. The dictionary maps at least the keywords
+                'xmin', 'ymin', 'xmax', and 'ymax' to their respective indices within last axis of the labels array.
+            border_pixels (str, optional): How to treat the border pixels of the bounding boxes.
+                Can be 'include', 'exclude', or 'half'. If 'include', the border pixels belong
+                to the boxes. If 'exclude', the border pixels do not belong to the boxes.
+                If 'half', then one of each of the two horizontal and vertical borders belong
+                to the boxes, but not the other.
+        '''
+        if not isinstance(overlap_bounds, (list, tuple, BoundGenerator)):
+            raise ValueError("`overlap_bounds` must be either a 2-tuple of scalars or a `BoundGenerator` object.")
+        if isinstance(overlap_bounds, (list, tuple)) and (overlap_bounds[0] > overlap_bounds[1]):
+            raise ValueError("The lower bound must not be greater than the upper bound.")
+        if not (overlap_criterion in {'iou', 'area', 'center_point'}):
+            raise ValueError("`overlap_criterion` must be one of 'iou', 'area', or 'center_point'.")
+        self.overlap_criterion = overlap_criterion
+        self.overlap_bounds = overlap_bounds
+        self.min_area = min_area
+        self.check_overlap = check_overlap
+        self.check_min_area = check_min_area
+        self.check_degenerate = check_degenerate
+        self.labels_format = labels_format
+        self.border_pixels = border_pixels
+
+    def __call__(self,
+                 labels,
+                 image_height=None,
+                 image_width=None):
+        '''
+        Arguments:
+            labels (array): The labels to be filtered. This is an array with shape `(m,n)`, where
+                `m` is the number of bounding boxes and `n` is the number of elements that defines
+                each bounding box (box coordinates, class ID, etc.). The box coordinates are expected
+                to be in the image's coordinate system.
+            image_height (int): Only relevant if `check_overlap == True`. The height of the image
+                (in pixels) to compare the box coordinates to.
+            image_width (int): Only relevant if `check_overlap == True`. The width of the image
+                (in pixels) to compare the box coordinates to.
+
+        Returns:
+            An array containing the labels of all boxes that are valid.
+        '''
+
+        labels = np.copy(labels)
+
+        xmin = self.labels_format['xmin']
+        ymin = self.labels_format['ymin']
+        xmax = self.labels_format['xmax']
+        ymax = self.labels_format['ymax']
+
+        # Record the boxes that pass all checks here.
+        requirements_met = np.ones(shape=labels.shape[0], dtype=bool)
+
+        if self.check_degenerate:
+
+            non_degenerate = (labels[:,xmax] > labels[:,xmin]) * (labels[:,ymax] > labels[:,ymin])
+            requirements_met *= non_degenerate
+
+        if self.check_min_area:
+
+            min_area_met = (labels[:,xmax] - labels[:,xmin]) * (labels[:,ymax] - labels[:,ymin]) >= self.min_area
+            requirements_met *= min_area_met
+
+        if self.check_overlap:
+
+            # Get the lower and upper bounds.
+            if isinstance(self.overlap_bounds, BoundGenerator):
+                lower, upper = self.overlap_bounds()
+            else:
+                lower, upper = self.overlap_bounds
+
+            # Compute which boxes are valid.
+
+            if self.overlap_criterion == 'iou':
+                # Compute the patch coordinates.
+ image_coords = np.array([0, 0, image_width, image_height]) + # Compute the IoU between the patch and all of the ground truth boxes. + image_boxes_iou = iou(image_coords, labels[:, [xmin, ymin, xmax, ymax]], coords='corners', mode='element-wise', border_pixels=self.border_pixels) + requirements_met *= (image_boxes_iou > lower) * (image_boxes_iou <= upper) + + elif self.overlap_criterion == 'area': + if self.border_pixels == 'half': + d = 0 + elif self.border_pixels == 'include': + d = 1 # If border pixels are supposed to belong to the bounding boxes, we have to add one pixel to any difference `xmax - xmin` or `ymax - ymin`. + elif self.border_pixels == 'exclude': + d = -1 # If border pixels are not supposed to belong to the bounding boxes, we have to subtract one pixel from any difference `xmax - xmin` or `ymax - ymin`. + # Compute the areas of the boxes. + box_areas = (labels[:,xmax] - labels[:,xmin] + d) * (labels[:,ymax] - labels[:,ymin] + d) + # Compute the intersection area between the patch and all of the ground truth boxes. + clipped_boxes = np.copy(labels) + clipped_boxes[:,[ymin,ymax]] = np.clip(labels[:,[ymin,ymax]], a_min=0, a_max=image_height-1) + clipped_boxes[:,[xmin,xmax]] = np.clip(labels[:,[xmin,xmax]], a_min=0, a_max=image_width-1) + intersection_areas = (clipped_boxes[:,xmax] - clipped_boxes[:,xmin] + d) * (clipped_boxes[:,ymax] - clipped_boxes[:,ymin] + d) # +1 because the border pixels belong to the box areas. + # Check which boxes meet the overlap requirements. + if lower == 0.0: + mask_lower = intersection_areas > lower * box_areas # If `self.lower == 0`, we want to make sure that boxes with area 0 don't count, hence the ">" sign instead of the ">=" sign. + else: + mask_lower = intersection_areas >= lower * box_areas # Especially for the case `self.lower == 1` we want the ">=" sign, otherwise no boxes would count at all. + mask_upper = intersection_areas <= upper * box_areas + requirements_met *= mask_lower * mask_upper + + elif self.overlap_criterion == 'center_point': + # Compute the center points of the boxes. + cy = (labels[:,ymin] + labels[:,ymax]) / 2 + cx = (labels[:,xmin] + labels[:,xmax]) / 2 + # Check which of the boxes have center points within the cropped patch remove those that don't. + requirements_met *= (cy >= 0.0) * (cy <= image_height-1) * (cx >= 0.0) * (cx <= image_width-1) + + return labels[requirements_met] + +class ImageValidator: + ''' + Returns `True` if a given minimum number of bounding boxes meets given overlap + requirements with an image of a given height and width. + ''' + + def __init__(self, + overlap_criterion='center_point', + bounds=(0.3, 1.0), + n_boxes_min=1, + labels_format={'class_id': 0, 'xmin': 1, 'ymin': 2, 'xmax': 3, 'ymax': 4}, + border_pixels='half'): + ''' + Arguments: + overlap_criterion (str, optional): Can be either of 'center_point', 'iou', or 'area'. Determines + which boxes are considered valid with respect to a given image. If set to 'center_point', + a given bounding box is considered valid if its center point lies within the image. + If set to 'area', a given bounding box is considered valid if the quotient of its intersection + area with the image and its own area is within `lower` and `upper`. If set to 'iou', a given + bounding box is considered valid if its IoU with the image is within `lower` and `upper`. + bounds (list or BoundGenerator, optional): Only relevant if `overlap_criterion` is 'area' or 'iou'. + Determines the lower and upper bounds for `overlap_criterion`. 
Can be either a 2-tuple of scalars
+                representing a lower bound and an upper bound, or a `BoundGenerator` object, which provides
+                the possibility to generate bounds randomly.
+            n_boxes_min (int or str, optional): Either a positive integer or the string 'all'.
+                Determines the minimum number of boxes that must meet the `overlap_criterion` with respect to
+                an image of the given height and width in order for the image to be a valid image.
+                If set to 'all', an image is considered valid if all given boxes meet the `overlap_criterion`.
+            labels_format (dict, optional): A dictionary that defines which index in the last axis of the labels
+                of an image contains which bounding box coordinate. The dictionary maps at least the keywords
+                'xmin', 'ymin', 'xmax', and 'ymax' to their respective indices within last axis of the labels array.
+            border_pixels (str, optional): How to treat the border pixels of the bounding boxes.
+                Can be 'include', 'exclude', or 'half'. If 'include', the border pixels belong
+                to the boxes. If 'exclude', the border pixels do not belong to the boxes.
+                If 'half', then one of each of the two horizontal and vertical borders belong
+                to the boxes, but not the other.
+        '''
+        if not ((isinstance(n_boxes_min, int) and n_boxes_min > 0) or n_boxes_min == 'all'):
+            raise ValueError("`n_boxes_min` must be a positive integer or 'all'.")
+        self.overlap_criterion = overlap_criterion
+        self.bounds = bounds
+        self.n_boxes_min = n_boxes_min
+        self.labels_format = labels_format
+        self.border_pixels = border_pixels
+        self.box_filter = BoxFilter(check_overlap=True,
+                                    check_min_area=False,
+                                    check_degenerate=False,
+                                    overlap_criterion=self.overlap_criterion,
+                                    overlap_bounds=self.bounds,
+                                    labels_format=self.labels_format,
+                                    border_pixels=self.border_pixels)
+
+    def __call__(self,
+                 labels,
+                 image_height,
+                 image_width):
+        '''
+        Arguments:
+            labels (array): The labels to be tested. The box coordinates are expected
+                to be in the image's coordinate system.
+            image_height (int): The height of the image to compare the box coordinates to.
+            image_width (int): The width of the image to compare the box coordinates to.
+
+        Returns:
+            A boolean indicating whether an image of the given height and width is
+            valid with respect to the given bounding boxes.
+        '''
+
+        self.box_filter.overlap_bounds = self.bounds
+        self.box_filter.labels_format = self.labels_format
+
+        # Get all boxes that meet the overlap requirements.
+        valid_labels = self.box_filter(labels=labels,
+                                       image_height=image_height,
+                                       image_width=image_width)
+
+        # Check whether enough boxes meet the requirements.
+        if isinstance(self.n_boxes_min, int):
+            # The image is valid if at least `self.n_boxes_min` ground truth boxes meet the requirements.
+            if len(valid_labels) >= self.n_boxes_min:
+                return True
+            else:
+                return False
+        elif self.n_boxes_min == 'all':
+            # The image is valid if all ground truth boxes meet the requirements.
+            if len(valid_labels) == len(labels):
+                return True
+            else:
+                return False
diff --git a/keras_ssd/data_generator/object_detection_2d_misc_utils.py b/keras_ssd/data_generator/object_detection_2d_misc_utils.py
new file mode 100644
index 0000000..1a4397f
--- /dev/null
+++ b/keras_ssd/data_generator/object_detection_2d_misc_utils.py
@@ -0,0 +1,73 @@
+'''
+Miscellaneous data generator utilities.
+
+Copyright (C) 2018 Pierluigi Ferrari
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
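Before moving on, a hedged usage sketch of `BoxFilter` and `ImageValidator` from the validation utilities above, using invented boxes and patch sizes (assumes the repository root is on the Python path):

import numpy as np
from data_generator.object_detection_2d_image_boxes_validation_utils import BoxFilter, ImageValidator

# (class_id, xmin, ymin, xmax, ymax): box 2's center lies outside a 300x300 patch,
# box 3 is degenerate (xmax == xmin).
labels = np.array([[1,  20,  30, 120, 130],
                   [2, 300, 100, 400, 200],
                   [3,  50,  50,  50,  80]])

box_filter = BoxFilter(overlap_criterion='center_point', min_area=16)
print(box_filter(labels, image_height=300, image_width=300))  # only box 1 survives

validator = ImageValidator(overlap_criterion='center_point', n_boxes_min=1)
print(validator(labels, image_height=300, image_width=300))   # True: one box remains valid
print(validator(labels, image_height=10, image_width=10))     # False: no box center fits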
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+'''
+
+from __future__ import division
+import numpy as np
+
+def apply_inverse_transforms(y_pred_decoded, inverse_transforms):
+    '''
+    Takes a list or Numpy array of decoded predictions and applies a given list of
+    inverse transforms to them. The list of inverse transforms would usually contain the
+    inverter functions that some of the image transformations that come with this
+    data generator return. This function would normally be used to transform predictions
+    that were made on a transformed image back to the original image.
+
+    Arguments:
+        y_pred_decoded (list or array): Either a list of length `batch_size` that
+            contains Numpy arrays that contain the predictions for each batch item
+            or a Numpy array. If this is a list of Numpy arrays, the arrays would
+            usually have the shape `(num_predictions, 6)`, where `num_predictions`
+            is different for each batch item. If this is a Numpy array, it would
+            usually have the shape `(batch_size, num_predictions, 6)`. The last axis
+            would usually contain the class ID, confidence score, and four bounding
+            box coordinates for each prediction.
+        inverse_transforms (list): A nested list of length `batch_size` that contains
+            for each batch item a list of functions that take one argument (one element
+            of `y_pred_decoded` if it is a list or one slice along the first axis of
+            `y_pred_decoded` if it is an array) and return an output of the same shape
+            and data type.
+
+    Returns:
+        The transformed predictions, which have the same structure as `y_pred_decoded`.
+    '''
+
+    if isinstance(y_pred_decoded, list):
+
+        y_pred_decoded_inv = []
+
+        for i in range(len(y_pred_decoded)):
+            y_pred_decoded_inv.append(np.copy(y_pred_decoded[i]))
+            if y_pred_decoded_inv[i].size > 0: # If there are any predictions for this batch item.
+                for inverter in inverse_transforms[i]:
+                    if not (inverter is None):
+                        y_pred_decoded_inv[i] = inverter(y_pred_decoded_inv[i])
+
+    elif isinstance(y_pred_decoded, np.ndarray):
+
+        y_pred_decoded_inv = np.copy(y_pred_decoded)
+
+        for i in range(len(y_pred_decoded)):
+            if y_pred_decoded_inv[i].size > 0: # If there are any predictions for this batch item.
+                for inverter in inverse_transforms[i]:
+                    if not (inverter is None):
+                        y_pred_decoded_inv[i] = inverter(y_pred_decoded_inv[i])
+
+    else:
+        raise ValueError("`y_pred_decoded` must be either a list or a Numpy array.")
+
+    return y_pred_decoded_inv
diff --git a/keras_ssd/data_generator/object_detection_2d_patch_sampling_ops.py b/keras_ssd/data_generator/object_detection_2d_patch_sampling_ops.py
new file mode 100644
index 0000000..bec7002
--- /dev/null
+++ b/keras_ssd/data_generator/object_detection_2d_patch_sampling_ops.py
@@ -0,0 +1,881 @@
+'''
+Various patch sampling operations for data augmentation in 2D object detection.
+
+Copyright (C) 2018 Pierluigi Ferrari
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
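A hedged end-to-end sketch of `apply_inverse_transforms` above combined with the inverter returned by the `Resize` transform from earlier in this diff. It assumes that transform is importable as `data_generator.object_detection_2d_geometric_ops` and that `Resize` can be constructed from just the target height and width (its remaining arguments appear to be optional); the prediction values are invented. Decoded predictions are taken to be rows of (class_id, confidence, xmin, ymin, xmax, ymax), which is why the inverter skips the two leading columns.

import numpy as np
from data_generator.object_detection_2d_geometric_ops import Resize
from data_generator.object_detection_2d_misc_utils import apply_inverse_transforms

resize = Resize(height=300, width=300)
original_image = np.zeros((480, 640, 3), dtype=np.uint8)   # dummy 640x480 image
resized_image, inverter = resize(original_image, return_inverter=True)

# One batch item with one decoded prediction, in the 300x300 coordinate system:
# (class_id, confidence, xmin, ymin, xmax, ymax)
y_pred_decoded = [np.array([[1, 0.9, 47, 125, 141, 250]])]

y_pred_original = apply_inverse_transforms(y_pred_decoded, [[inverter]])
print(y_pred_original[0])  # boxes mapped back to the 640x480 original image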
+You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +''' + +from __future__ import division +import numpy as np + +from data_generator.object_detection_2d_image_boxes_validation_utils import BoundGenerator, BoxFilter, ImageValidator + +class PatchCoordinateGenerator: + ''' + Generates random patch coordinates that meet specified requirements. + ''' + + def __init__(self, + img_height=None, + img_width=None, + must_match='h_w', + min_scale=0.3, + max_scale=1.0, + scale_uniformly=False, + min_aspect_ratio = 0.5, + max_aspect_ratio = 2.0, + patch_ymin=None, + patch_xmin=None, + patch_height=None, + patch_width=None, + patch_aspect_ratio=None): + ''' + Arguments: + img_height (int): The height of the image for which the patch coordinates + shall be generated. Doesn't have to be known upon construction. + img_width (int): The width of the image for which the patch coordinates + shall be generated. Doesn't have to be known upon construction. + must_match (str, optional): Can be either of 'h_w', 'h_ar', and 'w_ar'. + Specifies which two of the three quantities height, width, and aspect + ratio determine the shape of the generated patch. The respective third + quantity will be computed from the other two. For example, + if `must_match == 'h_w'`, then the patch's height and width will be + set to lie within [min_scale, max_scale] of the image size or to + `patch_height` and/or `patch_width`, if given. The patch's aspect ratio + is the dependent variable in this case, it will be computed from the + height and width. Any given values for `patch_aspect_ratio`, + `min_aspect_ratio`, or `max_aspect_ratio` will be ignored. + min_scale (float, optional): The minimum size of a dimension of the patch + as a fraction of the respective dimension of the image. Can be greater + than 1. For example, if the image width is 200 and `min_scale == 0.5`, + then the width of the generated patch will be at least 100. If `min_scale == 1.5`, + the width of the generated patch will be at least 300. + max_scale (float, optional): The maximum size of a dimension of the patch + as a fraction of the respective dimension of the image. Can be greater + than 1. For example, if the image width is 200 and `max_scale == 1.0`, + then the width of the generated patch will be at most 200. If `max_scale == 1.5`, + the width of the generated patch will be at most 300. Must be greater than + `min_scale`. + scale_uniformly (bool, optional): If `True` and if `must_match == 'h_w'`, + the patch height and width will be scaled uniformly, otherwise they will + be scaled independently. + min_aspect_ratio (float, optional): Determines the minimum aspect ratio + for the generated patches. + max_aspect_ratio (float, optional): Determines the maximum aspect ratio + for the generated patches. + patch_ymin (int, optional): `None` or the vertical coordinate of the top left + corner of the generated patches. If this is not `None`, the position of the + patches along the vertical axis is fixed. If this is `None`, then the + vertical position of generated patches will be chosen randomly such that + the overlap of a patch and the image along the vertical dimension is + always maximal. 
+ patch_xmin (int, optional): `None` or the horizontal coordinate of the top left + corner of the generated patches. If this is not `None`, the position of the + patches along the horizontal axis is fixed. If this is `None`, then the + horizontal position of generated patches will be chosen randomly such that + the overlap of a patch and the image along the horizontal dimension is + always maximal. + patch_height (int, optional): `None` or the fixed height of the generated patches. + patch_width (int, optional): `None` or the fixed width of the generated patches. + patch_aspect_ratio (float, optional): `None` or the fixed aspect ratio of the + generated patches. + ''' + + if not (must_match in {'h_w', 'h_ar', 'w_ar'}): + raise ValueError("`must_match` must be either of 'h_w', 'h_ar' and 'w_ar'.") + if min_scale >= max_scale: + raise ValueError("It must be `min_scale < max_scale`.") + if min_aspect_ratio >= max_aspect_ratio: + raise ValueError("It must be `min_aspect_ratio < max_aspect_ratio`.") + if scale_uniformly and not ((patch_height is None) and (patch_width is None)): + raise ValueError("If `scale_uniformly == True`, `patch_height` and `patch_width` must both be `None`.") + self.img_height = img_height + self.img_width = img_width + self.must_match = must_match + self.min_scale = min_scale + self.max_scale = max_scale + self.scale_uniformly = scale_uniformly + self.min_aspect_ratio = min_aspect_ratio + self.max_aspect_ratio = max_aspect_ratio + self.patch_ymin = patch_ymin + self.patch_xmin = patch_xmin + self.patch_height = patch_height + self.patch_width = patch_width + self.patch_aspect_ratio = patch_aspect_ratio + + def __call__(self): + ''' + Returns: + A 4-tuple `(ymin, xmin, height, width)` that represents the coordinates + of the generated patch. + ''' + + # Get the patch height and width. + + if self.must_match == 'h_w': # Aspect is the dependent variable. + if not self.scale_uniformly: + # Get the height. + if self.patch_height is None: + patch_height = int(np.random.uniform(self.min_scale, self.max_scale) * self.img_height) + else: + patch_height = self.patch_height + # Get the width. + if self.patch_width is None: + patch_width = int(np.random.uniform(self.min_scale, self.max_scale) * self.img_width) + else: + patch_width = self.patch_width + else: + scaling_factor = np.random.uniform(self.min_scale, self.max_scale) + patch_height = int(scaling_factor * self.img_height) + patch_width = int(scaling_factor * self.img_width) + + elif self.must_match == 'h_ar': # Width is the dependent variable. + # Get the height. + if self.patch_height is None: + patch_height = int(np.random.uniform(self.min_scale, self.max_scale) * self.img_height) + else: + patch_height = self.patch_height + # Get the aspect ratio. + if self.patch_aspect_ratio is None: + patch_aspect_ratio = np.random.uniform(self.min_aspect_ratio, self.max_aspect_ratio) + else: + patch_aspect_ratio = self.patch_aspect_ratio + # Get the width. + patch_width = int(patch_height * patch_aspect_ratio) + + elif self.must_match == 'w_ar': # Height is the dependent variable. + # Get the width. + if self.patch_width is None: + patch_width = int(np.random.uniform(self.min_scale, self.max_scale) * self.img_width) + else: + patch_width = self.patch_width + # Get the aspect ratio. + if self.patch_aspect_ratio is None: + patch_aspect_ratio = np.random.uniform(self.min_aspect_ratio, self.max_aspect_ratio) + else: + patch_aspect_ratio = self.patch_aspect_ratio + # Get the height. 
+                patch_height = int(patch_width / patch_aspect_ratio)
+
+        # Get the top left corner coordinates of the patch.
+
+        if self.patch_ymin is None:
+            # Compute how much room we have along the vertical axis to place the patch.
+            # A negative number here means that we want to sample a patch that is larger than the original image
+            # in the vertical dimension, in which case the patch will be placed such that it fully contains the
+            # image in the vertical dimension.
+            y_range = self.img_height - patch_height
+            # Select a random top left corner for the sample position from the possible positions.
+            if y_range >= 0: patch_ymin = np.random.randint(0, y_range + 1) # There are y_range + 1 possible positions for the crop in the vertical dimension.
+            else: patch_ymin = np.random.randint(y_range, 1) # The possible positions for the image on the background canvas in the vertical dimension.
+        else:
+            patch_ymin = self.patch_ymin
+
+        if self.patch_xmin is None:
+            # Compute how much room we have along the horizontal axis to place the patch.
+            # A negative number here means that we want to sample a patch that is larger than the original image
+            # in the horizontal dimension, in which case the patch will be placed such that it fully contains the
+            # image in the horizontal dimension.
+            x_range = self.img_width - patch_width
+            # Select a random top left corner for the sample position from the possible positions.
+            if x_range >= 0: patch_xmin = np.random.randint(0, x_range + 1) # There are x_range + 1 possible positions for the crop in the horizontal dimension.
+            else: patch_xmin = np.random.randint(x_range, 1) # The possible positions for the image on the background canvas in the horizontal dimension.
+        else:
+            patch_xmin = self.patch_xmin
+
+        return (patch_ymin, patch_xmin, patch_height, patch_width)
+
+class CropPad:
+    '''
+    Crops and/or pads an image deterministically.
+
+    Depending on the given output patch size and the position (top left corner) relative
+    to the input image, the image will be cropped and/or padded along one or both spatial
+    dimensions.
+
+    For example, if the output patch lies entirely within the input image, this will result
+    in a regular crop. If the input image lies entirely within the output patch, this will
+    result in the image being padded in every direction. All other cases are mixed cases
+    where the image might be cropped in some directions and padded in others.
+
+    The output patch can be arbitrary in both size and position as long as it overlaps
+    with the input image.
+    '''
+
+    def __init__(self,
+                 patch_ymin,
+                 patch_xmin,
+                 patch_height,
+                 patch_width,
+                 clip_boxes=True,
+                 box_filter=None,
+                 background=(0,0,0),
+                 labels_format={'class_id': 0, 'xmin': 1, 'ymin': 2, 'xmax': 3, 'ymax': 4}):
+        '''
+        Arguments:
+            patch_ymin (int, optional): The vertical coordinate of the top left corner of the output
+                patch relative to the image coordinate system. Can be negative (i.e. lie outside the image)
+                as long as the resulting patch still overlaps with the image.
+            patch_xmin (int, optional): The horizontal coordinate of the top left corner of the output
+                patch relative to the image coordinate system. Can be negative (i.e. lie outside the image)
+                as long as the resulting patch still overlaps with the image.
+            patch_height (int): The height of the patch to be sampled from the image. Can be greater
+                than the height of the input image.
+            patch_width (int): The width of the patch to be sampled from the image. Can be greater
+                than the width of the input image.
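A hedged usage sketch of `PatchCoordinateGenerator` as completed above, with invented image dimensions (assumes the repository root is on the Python path). With `max_scale <= 1.0` the generated patch always fits inside the image, so the returned corner coordinates are non-negative.

from data_generator.object_detection_2d_patch_sampling_ops import PatchCoordinateGenerator

# Patches whose height and width are each drawn as 30-100% of the image size ('h_w' mode).
patch_gen = PatchCoordinateGenerator(img_height=480,
                                     img_width=640,
                                     must_match='h_w',
                                     min_scale=0.3,
                                     max_scale=1.0)

patch_ymin, patch_xmin, patch_height, patch_width = patch_gen()
print(patch_ymin, patch_xmin, patch_height, patch_width)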
+ clip_boxes (bool, optional): Only relevant if ground truth bounding boxes are given. + If `True`, any ground truth bounding boxes will be clipped to lie entirely within the + sampled patch. + box_filter (BoxFilter, optional): Only relevant if ground truth bounding boxes are given. + A `BoxFilter` object to filter out bounding boxes that don't meet the given criteria + after the transformation. Refer to the `BoxFilter` documentation for details. If `None`, + the validity of the bounding boxes is not checked. + background (list/tuple, optional): A 3-tuple specifying the RGB color value of the potential + background pixels of the scaled images. In the case of single-channel images, + the first element of `background` will be used as the background pixel value. + labels_format (dict, optional): A dictionary that defines which index in the last axis of the labels + of an image contains which bounding box coordinate. The dictionary maps at least the keywords + 'xmin', 'ymin', 'xmax', and 'ymax' to their respective indices within last axis of the labels array. + ''' + #if (patch_height <= 0) or (patch_width <= 0): + # raise ValueError("Patch height and width must both be positive.") + #if (patch_ymin + patch_height < 0) or (patch_xmin + patch_width < 0): + # raise ValueError("A patch with the given coordinates cannot overlap with an input image.") + if not (isinstance(box_filter, BoxFilter) or box_filter is None): + raise ValueError("`box_filter` must be either `None` or a `BoxFilter` object.") + self.patch_height = patch_height + self.patch_width = patch_width + self.patch_ymin = patch_ymin + self.patch_xmin = patch_xmin + self.clip_boxes = clip_boxes + self.box_filter = box_filter + self.background = background + self.labels_format = labels_format + + def __call__(self, image, labels=None, return_inverter=False): + + img_height, img_width = image.shape[:2] + + if (self.patch_ymin > img_height) or (self.patch_xmin > img_width): + raise ValueError("The given patch doesn't overlap with the input image.") + + labels = np.copy(labels) + + xmin = self.labels_format['xmin'] + ymin = self.labels_format['ymin'] + xmax = self.labels_format['xmax'] + ymax = self.labels_format['ymax'] + + # Top left corner of the patch relative to the image coordinate system: + patch_ymin = self.patch_ymin + patch_xmin = self.patch_xmin + + # Create a canvas of the size of the patch we want to end up with. + if image.ndim == 3: + canvas = np.zeros(shape=(self.patch_height, self.patch_width, 3), dtype=np.uint8) + canvas[:, :] = self.background + elif image.ndim == 2: + canvas = np.zeros(shape=(self.patch_height, self.patch_width), dtype=np.uint8) + canvas[:, :] = self.background[0] + + # Perform the crop. + if patch_ymin < 0 and patch_xmin < 0: # Pad the image at the top and on the left. + image_crop_height = min(img_height, self.patch_height + patch_ymin) # The number of pixels of the image that will end up on the canvas in the vertical direction. + image_crop_width = min(img_width, self.patch_width + patch_xmin) # The number of pixels of the image that will end up on the canvas in the horizontal direction. + canvas[-patch_ymin:-patch_ymin + image_crop_height, -patch_xmin:-patch_xmin + image_crop_width] = image[:image_crop_height, :image_crop_width] + + elif patch_ymin < 0 and patch_xmin >= 0: # Pad the image at the top and crop it on the left. + image_crop_height = min(img_height, self.patch_height + patch_ymin) # The number of pixels of the image that will end up on the canvas in the vertical direction. 
+ image_crop_width = min(self.patch_width, img_width - patch_xmin) # The number of pixels of the image that will end up on the canvas in the horizontal direction. + canvas[-patch_ymin:-patch_ymin + image_crop_height, :image_crop_width] = image[:image_crop_height, patch_xmin:patch_xmin + image_crop_width] + + elif patch_ymin >= 0 and patch_xmin < 0: # Crop the image at the top and pad it on the left. + image_crop_height = min(self.patch_height, img_height - patch_ymin) # The number of pixels of the image that will end up on the canvas in the vertical direction. + image_crop_width = min(img_width, self.patch_width + patch_xmin) # The number of pixels of the image that will end up on the canvas in the horizontal direction. + canvas[:image_crop_height, -patch_xmin:-patch_xmin + image_crop_width] = image[patch_ymin:patch_ymin + image_crop_height, :image_crop_width] + + elif patch_ymin >= 0 and patch_xmin >= 0: # Crop the image at the top and on the left. + image_crop_height = min(self.patch_height, img_height - patch_ymin) # The number of pixels of the image that will end up on the canvas in the vertical direction. + image_crop_width = min(self.patch_width, img_width - patch_xmin) # The number of pixels of the image that will end up on the canvas in the horizontal direction. + canvas[:image_crop_height, :image_crop_width] = image[patch_ymin:patch_ymin + image_crop_height, patch_xmin:patch_xmin + image_crop_width] + + image = canvas + + if return_inverter: + def inverter(labels): + labels = np.copy(labels) + labels[:, [ymin+1, ymax+1]] += patch_ymin + labels[:, [xmin+1, xmax+1]] += patch_xmin + return labels + + if not (labels is None): + + # Translate the box coordinates to the patch's coordinate system. + labels[:, [ymin, ymax]] -= patch_ymin + labels[:, [xmin, xmax]] -= patch_xmin + + # Compute all valid boxes for this patch. + if not (self.box_filter is None): + self.box_filter.labels_format = self.labels_format + labels = self.box_filter(labels=labels, + image_height=self.patch_height, + image_width=self.patch_width) + + if self.clip_boxes: + labels[:,[ymin,ymax]] = np.clip(labels[:,[ymin,ymax]], a_min=0, a_max=self.patch_height-1) + labels[:,[xmin,xmax]] = np.clip(labels[:,[xmin,xmax]], a_min=0, a_max=self.patch_width-1) + + if return_inverter: + return image, labels, inverter + else: + return image, labels + + else: + if return_inverter: + return image, inverter + else: + return image + +class Crop: + ''' + Crops off the specified numbers of pixels from the borders of images. + + This is just a convenience interface for `CropPad`. 
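A hedged usage sketch of `CropPad` above with invented values: a patch anchored at (-50, 100) pads 50 rows of background at the top while cropping 100 columns off the left.

import numpy as np
from data_generator.object_detection_2d_patch_sampling_ops import CropPad

image = np.full((480, 640, 3), 255, dtype=np.uint8)  # dummy white 640x480 image
labels = np.array([[1, 150, 40, 300, 200]])           # (class_id, xmin, ymin, xmax, ymax)

crop_pad = CropPad(patch_ymin=-50,      # negative: 50 px of background padding at the top
                   patch_xmin=100,      # positive: crop 100 px off the left
                   patch_height=480,
                   patch_width=480,
                   background=(0, 0, 0))

patch, patch_labels = crop_pad(image, labels)
print(patch.shape)   # (480, 480, 3)
print(patch_labels)  # [[  1  50  90 200 250]]: the box shifted into patch coordinates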
+ ''' + + def __init__(self, + crop_top, + crop_bottom, + crop_left, + crop_right, + clip_boxes=True, + box_filter=None, + labels_format={'class_id': 0, 'xmin': 1, 'ymin': 2, 'xmax': 3, 'ymax': 4}): + self.crop_top = crop_top + self.crop_bottom = crop_bottom + self.crop_left = crop_left + self.crop_right = crop_right + self.clip_boxes = clip_boxes + self.box_filter = box_filter + self.labels_format = labels_format + self.crop = CropPad(patch_ymin=self.crop_top, + patch_xmin=self.crop_left, + patch_height=None, + patch_width=None, + clip_boxes=self.clip_boxes, + box_filter=self.box_filter, + labels_format=self.labels_format) + + def __call__(self, image, labels=None, return_inverter=False): + + img_height, img_width = image.shape[:2] + + self.crop.patch_height = img_height - self.crop_top - self.crop_bottom + self.crop.patch_width = img_width - self.crop_left - self.crop_right + self.crop.labels_format = self.labels_format + + return self.crop(image, labels, return_inverter) + +class Pad: + ''' + Pads images by the specified numbers of pixels on each side. + + This is just a convenience interface for `CropPad`. + ''' + + def __init__(self, + pad_top, + pad_bottom, + pad_left, + pad_right, + background=(0,0,0), + labels_format={'class_id': 0, 'xmin': 1, 'ymin': 2, 'xmax': 3, 'ymax': 4}): + self.pad_top = pad_top + self.pad_bottom = pad_bottom + self.pad_left = pad_left + self.pad_right = pad_right + self.background = background + self.labels_format = labels_format + self.pad = CropPad(patch_ymin=-self.pad_top, + patch_xmin=-self.pad_left, + patch_height=None, + patch_width=None, + clip_boxes=False, + box_filter=None, + background=self.background, + labels_format=self.labels_format) + + def __call__(self, image, labels=None, return_inverter=False): + + img_height, img_width = image.shape[:2] + + self.pad.patch_height = img_height + self.pad_top + self.pad_bottom + self.pad.patch_width = img_width + self.pad_left + self.pad_right + self.pad.labels_format = self.labels_format + + return self.pad(image, labels, return_inverter) + +class RandomPatch: + ''' + Randomly samples a patch from an image. The randomness refers to whatever + randomness may be introduced by the patch coordinate generator, the box filter, + and the patch validator. + + Input images may be cropped and/or padded along either or both of the two + spatial dimensions as necessary in order to obtain the required patch. + + As opposed to `RandomPatchInf`, it is possible for this transform to fail to produce + an output image at all, in which case it will return `None`. This is useful, because + if this transform is used to generate patches of a fixed size or aspect ratio, then + the caller needs to be able to rely on the output image satisfying the set size or + aspect ratio. It might therefore not be an option to return the unaltered input image + as other random transforms do when they fail to produce a valid transformed image. + ''' + + def __init__(self, + patch_coord_generator, + box_filter=None, + image_validator=None, + n_trials_max=3, + clip_boxes=True, + prob=1.0, + background=(0,0,0), + can_fail=False, + labels_format={'class_id': 0, 'xmin': 1, 'ymin': 2, 'xmax': 3, 'ymax': 4}): + ''' + Arguments: + patch_coord_generator (PatchCoordinateGenerator): A `PatchCoordinateGenerator` object + to generate the positions and sizes of the patches to be sampled from the input images. + box_filter (BoxFilter, optional): Only relevant if ground truth bounding boxes are given. 
+ A `BoxFilter` object to filter out bounding boxes that don't meet the given criteria + after the transformation. Refer to the `BoxFilter` documentation for details. If `None`, + the validity of the bounding boxes is not checked. + image_validator (ImageValidator, optional): Only relevant if ground truth bounding boxes are given. + An `ImageValidator` object to determine whether a sampled patch is valid. If `None`, + any outcome is valid. + n_trials_max (int, optional): Only relevant if ground truth bounding boxes are given. + Determines the maxmial number of trials to sample a valid patch. If no valid patch could + be sampled in `n_trials_max` trials, returns one `None` in place of each regular output. + clip_boxes (bool, optional): Only relevant if ground truth bounding boxes are given. + If `True`, any ground truth bounding boxes will be clipped to lie entirely within the + sampled patch. + prob (float, optional): `(1 - prob)` determines the probability with which the original, + unaltered image is returned. + background (list/tuple, optional): A 3-tuple specifying the RGB color value of the potential + background pixels of the scaled images. In the case of single-channel images, + the first element of `background` will be used as the background pixel value. + can_fail (bool, optional): If `True`, will return `None` if no valid patch could be found after + `n_trials_max` trials. If `False`, will return the unaltered input image in such a case. + labels_format (dict, optional): A dictionary that defines which index in the last axis of the labels + of an image contains which bounding box coordinate. The dictionary maps at least the keywords + 'xmin', 'ymin', 'xmax', and 'ymax' to their respective indices within last axis of the labels array. + ''' + if not isinstance(patch_coord_generator, PatchCoordinateGenerator): + raise ValueError("`patch_coord_generator` must be an instance of `PatchCoordinateGenerator`.") + if not (isinstance(image_validator, ImageValidator) or image_validator is None): + raise ValueError("`image_validator` must be either `None` or an `ImageValidator` object.") + self.patch_coord_generator = patch_coord_generator + self.box_filter = box_filter + self.image_validator = image_validator + self.n_trials_max = n_trials_max + self.clip_boxes = clip_boxes + self.prob = prob + self.background = background + self.can_fail = can_fail + self.labels_format = labels_format + self.sample_patch = CropPad(patch_ymin=None, + patch_xmin=None, + patch_height=None, + patch_width=None, + clip_boxes=self.clip_boxes, + box_filter=self.box_filter, + background=self.background, + labels_format=self.labels_format) + + def __call__(self, image, labels=None, return_inverter=False): + + p = np.random.uniform(0,1) + if p >= (1.0-self.prob): + + img_height, img_width = image.shape[:2] + self.patch_coord_generator.img_height = img_height + self.patch_coord_generator.img_width = img_width + + xmin = self.labels_format['xmin'] + ymin = self.labels_format['ymin'] + xmax = self.labels_format['xmax'] + ymax = self.labels_format['ymax'] + + # Override the preset labels format. + if not self.image_validator is None: + self.image_validator.labels_format = self.labels_format + self.sample_patch.labels_format = self.labels_format + + for _ in range(max(1, self.n_trials_max)): + + # Generate patch coordinates. 
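+                # The coordinate generator is expected to return the patch's top-left corner and size
+                # as a `(patch_ymin, patch_xmin, patch_height, patch_width)` tuple.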
+ patch_ymin, patch_xmin, patch_height, patch_width = self.patch_coord_generator() + + self.sample_patch.patch_ymin = patch_ymin + self.sample_patch.patch_xmin = patch_xmin + self.sample_patch.patch_height = patch_height + self.sample_patch.patch_width = patch_width + + if (labels is None) or (self.image_validator is None): + # We either don't have any boxes or if we do, we will accept any outcome as valid. + return self.sample_patch(image, labels, return_inverter) + else: + # Translate the box coordinates to the patch's coordinate system. + new_labels = np.copy(labels) + new_labels[:, [ymin, ymax]] -= patch_ymin + new_labels[:, [xmin, xmax]] -= patch_xmin + # Check if the patch is valid. + if self.image_validator(labels=new_labels, + image_height=patch_height, + image_width=patch_width): + return self.sample_patch(image, labels, return_inverter) + + # If we weren't able to sample a valid patch... + if self.can_fail: + # ...return `None`. + if labels is None: + if return_inverter: + return None, None + else: + return None + else: + if return_inverter: + return None, None, None + else: + return None, None + else: + # ...return the unaltered input image. + if labels is None: + if return_inverter: + return image, None + else: + return image + else: + if return_inverter: + return image, labels, None + else: + return image, labels + + else: + if return_inverter: + def inverter(labels): + return labels + + if labels is None: + if return_inverter: + return image, inverter + else: + return image + else: + if return_inverter: + return image, labels, inverter + else: + return image, labels + +class RandomPatchInf: + ''' + Randomly samples a patch from an image. The randomness refers to whatever + randomness may be introduced by the patch coordinate generator, the box filter, + and the patch validator. + + Input images may be cropped and/or padded along either or both of the two + spatial dimensions as necessary in order to obtain the required patch. + + This operation is very similar to `RandomPatch`, except that: + 1. This operation runs indefinitely until either a valid patch is found or + the input image is returned unaltered, i.e. it cannot fail. + 2. If a bound generator is given, a new pair of bounds will be generated + every `n_trials_max` iterations. + ''' + + def __init__(self, + patch_coord_generator, + box_filter=None, + image_validator=None, + bound_generator=None, + n_trials_max=50, + clip_boxes=True, + prob=0.857, + background=(0,0,0), + labels_format={'class_id': 0, 'xmin': 1, 'ymin': 2, 'xmax': 3, 'ymax': 4}): + ''' + Arguments: + patch_coord_generator (PatchCoordinateGenerator): A `PatchCoordinateGenerator` object + to generate the positions and sizes of the patches to be sampled from the input images. + box_filter (BoxFilter, optional): Only relevant if ground truth bounding boxes are given. + A `BoxFilter` object to filter out bounding boxes that don't meet the given criteria + after the transformation. Refer to the `BoxFilter` documentation for details. If `None`, + the validity of the bounding boxes is not checked. + image_validator (ImageValidator, optional): Only relevant if ground truth bounding boxes are given. + An `ImageValidator` object to determine whether a sampled patch is valid. If `None`, + any outcome is valid. + bound_generator (BoundGenerator, optional): A `BoundGenerator` object to generate upper and + lower bound values for the patch validator. 
Every `n_trials_max` trials, a new pair of + upper and lower bounds will be generated until a valid patch is found or the original image + is returned. This bound generator overrides the bound generator of the patch validator. + n_trials_max (int, optional): Only relevant if ground truth bounding boxes are given. + The sampler will run indefinitely until either a valid patch is found or the original image + is returned, but this determines the maxmial number of trials to sample a valid patch for each + selected pair of lower and upper bounds before a new pair is picked. + clip_boxes (bool, optional): Only relevant if ground truth bounding boxes are given. + If `True`, any ground truth bounding boxes will be clipped to lie entirely within the + sampled patch. + prob (float, optional): `(1 - prob)` determines the probability with which the original, + unaltered image is returned. + background (list/tuple, optional): A 3-tuple specifying the RGB color value of the potential + background pixels of the scaled images. In the case of single-channel images, + the first element of `background` will be used as the background pixel value. + labels_format (dict, optional): A dictionary that defines which index in the last axis of the labels + of an image contains which bounding box coordinate. The dictionary maps at least the keywords + 'xmin', 'ymin', 'xmax', and 'ymax' to their respective indices within last axis of the labels array. + ''' + + if not isinstance(patch_coord_generator, PatchCoordinateGenerator): + raise ValueError("`patch_coord_generator` must be an instance of `PatchCoordinateGenerator`.") + if not (isinstance(image_validator, ImageValidator) or image_validator is None): + raise ValueError("`image_validator` must be either `None` or an `ImageValidator` object.") + if not (isinstance(bound_generator, BoundGenerator) or bound_generator is None): + raise ValueError("`bound_generator` must be either `None` or a `BoundGenerator` object.") + self.patch_coord_generator = patch_coord_generator + self.box_filter = box_filter + self.image_validator = image_validator + self.bound_generator = bound_generator + self.n_trials_max = n_trials_max + self.clip_boxes = clip_boxes + self.prob = prob + self.background = background + self.labels_format = labels_format + self.sample_patch = CropPad(patch_ymin=None, + patch_xmin=None, + patch_height=None, + patch_width=None, + clip_boxes=self.clip_boxes, + box_filter=self.box_filter, + background=self.background, + labels_format=self.labels_format) + + def __call__(self, image, labels=None, return_inverter=False): + + img_height, img_width = image.shape[:2] + self.patch_coord_generator.img_height = img_height + self.patch_coord_generator.img_width = img_width + + xmin = self.labels_format['xmin'] + ymin = self.labels_format['ymin'] + xmax = self.labels_format['xmax'] + ymax = self.labels_format['ymax'] + + # Override the preset labels format. + if not self.image_validator is None: + self.image_validator.labels_format = self.labels_format + self.sample_patch.labels_format = self.labels_format + + while True: # Keep going until we either find a valid patch or return the original image. + + p = np.random.uniform(0,1) + if p >= (1.0-self.prob): + + # In case we have a bound generator, pick a lower and upper bound for the patch validator. + if not ((self.image_validator is None) or (self.bound_generator is None)): + self.image_validator.bounds = self.bound_generator() + + # Use at most `self.n_trials_max` attempts to find a crop + # that meets our requirements. 
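+                # If none of these trials yields a valid patch, the surrounding while-loop simply
+                # starts over, possibly with a newly drawn pair of bounds.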
+ for _ in range(max(1, self.n_trials_max)): + + # Generate patch coordinates. + patch_ymin, patch_xmin, patch_height, patch_width = self.patch_coord_generator() + + self.sample_patch.patch_ymin = patch_ymin + self.sample_patch.patch_xmin = patch_xmin + self.sample_patch.patch_height = patch_height + self.sample_patch.patch_width = patch_width + + # Check if the resulting patch meets the aspect ratio requirements. + aspect_ratio = patch_width / patch_height + if not (self.patch_coord_generator.min_aspect_ratio <= aspect_ratio <= self.patch_coord_generator.max_aspect_ratio): + continue + + if (labels is None) or (self.image_validator is None): + # We either don't have any boxes or if we do, we will accept any outcome as valid. + return self.sample_patch(image, labels, return_inverter) + else: + # Translate the box coordinates to the patch's coordinate system. + new_labels = np.copy(labels) + new_labels[:, [ymin, ymax]] -= patch_ymin + new_labels[:, [xmin, xmax]] -= patch_xmin + # Check if the patch contains the minimum number of boxes we require. + if self.image_validator(labels=new_labels, + image_height=patch_height, + image_width=patch_width): + return self.sample_patch(image, labels, return_inverter) + else: + if return_inverter: + def inverter(labels): + return labels + + if labels is None: + if return_inverter: + return image, inverter + else: + return image + else: + if return_inverter: + return image, labels, inverter + else: + return image, labels + +class RandomMaxCropFixedAR: + ''' + Crops the largest possible patch of a given fixed aspect ratio + from an image. + + Since the aspect ratio of the sampled patches is constant, they + can subsequently be resized to the same size without distortion. + ''' + + def __init__(self, + patch_aspect_ratio, + box_filter=None, + image_validator=None, + n_trials_max=3, + clip_boxes=True, + labels_format={'class_id': 0, 'xmin': 1, 'ymin': 2, 'xmax': 3, 'ymax': 4}): + ''' + Arguments: + patch_aspect_ratio (float): The fixed aspect ratio that all sampled patches will have. + box_filter (BoxFilter, optional): Only relevant if ground truth bounding boxes are given. + A `BoxFilter` object to filter out bounding boxes that don't meet the given criteria + after the transformation. Refer to the `BoxFilter` documentation for details. If `None`, + the validity of the bounding boxes is not checked. + image_validator (ImageValidator, optional): Only relevant if ground truth bounding boxes are given. + An `ImageValidator` object to determine whether a sampled patch is valid. If `None`, + any outcome is valid. + n_trials_max (int, optional): Only relevant if ground truth bounding boxes are given. + Determines the maxmial number of trials to sample a valid patch. If no valid patch could + be sampled in `n_trials_max` trials, returns `None`. + clip_boxes (bool, optional): Only relevant if ground truth bounding boxes are given. + If `True`, any ground truth bounding boxes will be clipped to lie entirely within the + sampled patch. + labels_format (dict, optional): A dictionary that defines which index in the last axis of the labels + of an image contains which bounding box coordinate. The dictionary maps at least the keywords + 'xmin', 'ymin', 'xmax', and 'ymax' to their respective indices within last axis of the labels array. 
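+
+        Example (a minimal usage sketch; `image` and `labels` are assumed to already exist in the
+        default `labels_format`):
+            crop = RandomMaxCropFixedAR(patch_aspect_ratio=16/9)
+            patch, patch_labels = crop(image, labels)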
+ ''' + + self.patch_aspect_ratio = patch_aspect_ratio + self.box_filter = box_filter + self.image_validator = image_validator + self.n_trials_max = n_trials_max + self.clip_boxes = clip_boxes + self.labels_format = labels_format + self.random_patch = RandomPatch(patch_coord_generator=PatchCoordinateGenerator(), # Just a dummy object + box_filter=self.box_filter, + image_validator=self.image_validator, + n_trials_max=self.n_trials_max, + clip_boxes=self.clip_boxes, + prob=1.0, + can_fail=False, + labels_format=self.labels_format) + + def __call__(self, image, labels=None, return_inverter=False): + + img_height, img_width = image.shape[:2] + + # The ratio of the input image aspect ratio and patch aspect ratio determines the maximal possible crop. + image_aspect_ratio = img_width / img_height + + if image_aspect_ratio < self.patch_aspect_ratio: + patch_width = img_width + patch_height = int(round(patch_width / self.patch_aspect_ratio)) + else: + patch_height = img_height + patch_width = int(round(patch_height * self.patch_aspect_ratio)) + + # Now that we know the desired height and width for the patch, + # instantiate an appropriate patch coordinate generator. + patch_coord_generator = PatchCoordinateGenerator(img_height=img_height, + img_width=img_width, + must_match='h_w', + patch_height=patch_height, + patch_width=patch_width) + + # The rest of the work is done by `RandomPatch`. + self.random_patch.patch_coord_generator = patch_coord_generator + self.random_patch.labels_format = self.labels_format + return self.random_patch(image, labels, return_inverter) + +class RandomPadFixedAR: + ''' + Adds the minimal possible padding to an image that results in a patch + of the given fixed aspect ratio that contains the entire image. + + Since the aspect ratio of the resulting images is constant, they + can subsequently be resized to the same size without distortion. + ''' + + def __init__(self, + patch_aspect_ratio, + background=(0,0,0), + labels_format={'class_id': 0, 'xmin': 1, 'ymin': 2, 'xmax': 3, 'ymax': 4}): + ''' + Arguments: + patch_aspect_ratio (float): The fixed aspect ratio that all sampled patches will have. + background (list/tuple, optional): A 3-tuple specifying the RGB color value of the potential + background pixels of the scaled images. In the case of single-channel images, + the first element of `background` will be used as the background pixel value. + labels_format (dict, optional): A dictionary that defines which index in the last axis of the labels + of an image contains which bounding box coordinate. The dictionary maps at least the keywords + 'xmin', 'ymin', 'xmax', and 'ymax' to their respective indices within last axis of the labels array. 
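+
+        Example (a minimal usage sketch; `image` and `labels` are assumed to already exist in the
+        default `labels_format`; only the placement of the image on the padded canvas is random):
+            pad = RandomPadFixedAR(patch_aspect_ratio=16/9)
+            padded_image, padded_labels = pad(image, labels)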
+ ''' + + self.patch_aspect_ratio = patch_aspect_ratio + self.background = background + self.labels_format = labels_format + self.random_patch = RandomPatch(patch_coord_generator=PatchCoordinateGenerator(), # Just a dummy object + box_filter=None, + image_validator=None, + n_trials_max=1, + clip_boxes=False, + background=self.background, + prob=1.0, + labels_format=self.labels_format) + + def __call__(self, image, labels=None, return_inverter=False): + + img_height, img_width = image.shape[:2] + + if img_width < img_height: + patch_height = img_height + patch_width = int(round(patch_height * self.patch_aspect_ratio)) + else: + patch_width = img_width + patch_height = int(round(patch_width / self.patch_aspect_ratio)) + + # Now that we know the desired height and width for the patch, + # instantiate an appropriate patch coordinate generator. + patch_coord_generator = PatchCoordinateGenerator(img_height=img_height, + img_width=img_width, + must_match='h_w', + patch_height=patch_height, + patch_width=patch_width) + + # The rest of the work is done by `RandomPatch`. + self.random_patch.patch_coord_generator = patch_coord_generator + self.random_patch.labels_format = self.labels_format + return self.random_patch(image, labels, return_inverter) diff --git a/keras_ssd/data_generator/object_detection_2d_photometric_ops.py b/keras_ssd/data_generator/object_detection_2d_photometric_ops.py new file mode 100644 index 0000000..375b7aa --- /dev/null +++ b/keras_ssd/data_generator/object_detection_2d_photometric_ops.py @@ -0,0 +1,485 @@ +''' +Various photometric image transformations, both deterministic and probabilistic. + +Copyright (C) 2018 Pierluigi Ferrari + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +''' + +from __future__ import division +import numpy as np +import cv2 + +class ConvertColor: + ''' + Converts images between RGB, HSV and grayscale color spaces. This is just a wrapper + around `cv2.cvtColor()`. + ''' + def __init__(self, current='RGB', to='HSV', keep_3ch=True): + ''' + Arguments: + current (str, optional): The current color space of the images. Can be + one of 'RGB' and 'HSV'. + to (str, optional): The target color space of the images. Can be one of + 'RGB', 'HSV', and 'GRAY'. + keep_3ch (bool, optional): Only relevant if `to == GRAY`. + If `True`, the resulting grayscale images will have three channels. 
+ ''' + if not ((current in {'RGB', 'HSV'}) and (to in {'RGB', 'HSV', 'GRAY'})): + raise NotImplementedError + self.current = current + self.to = to + self.keep_3ch = keep_3ch + + def __call__(self, image, labels=None): + if self.current == 'RGB' and self.to == 'HSV': + image = cv2.cvtColor(image, cv2.COLOR_RGB2HSV) + elif self.current == 'RGB' and self.to == 'GRAY': + image = cv2.cvtColor(image, cv2.COLOR_RGB2GRAY) + if self.keep_3ch: + image = np.stack([image] * 3, axis=-1) + elif self.current == 'HSV' and self.to == 'RGB': + image = cv2.cvtColor(image, cv2.COLOR_HSV2RGB) + elif self.current == 'HSV' and self.to == 'GRAY': + image = cv2.cvtColor(image, cv2.COLOR_HSV2GRAY) + if self.keep_3ch: + image = np.stack([image] * 3, axis=-1) + if labels is None: + return image + else: + return image, labels + +class ConvertDataType: + ''' + Converts images represented as Numpy arrays between `uint8` and `float32`. + Serves as a helper for certain photometric distortions. This is just a wrapper + around `np.ndarray.astype()`. + ''' + def __init__(self, to='uint8'): + ''' + Arguments: + to (string, optional): To which datatype to convert the input images. + Can be either of 'uint8' and 'float32'. + ''' + if not (to == 'uint8' or to == 'float32'): + raise ValueError("`to` can be either of 'uint8' or 'float32'.") + self.to = to + + def __call__(self, image, labels=None): + if self.to == 'uint8': + image = np.round(image, decimals=0).astype(np.uint8) + else: + image = image.astype(np.float32) + if labels is None: + return image + else: + return image, labels + +class ConvertTo3Channels: + ''' + Converts 1-channel and 4-channel images to 3-channel images. Does nothing to images that + already have 3 channels. In the case of 4-channel images, the fourth channel will be + discarded. + ''' + def __init__(self): + pass + + def __call__(self, image, labels=None): + if image.ndim == 2: + image = np.stack([image] * 3, axis=-1) + elif image.ndim == 3: + if image.shape[2] == 1: + image = np.concatenate([image] * 3, axis=-1) + elif image.shape[2] == 4: + image = image[:,:,:3] + if labels is None: + return image + else: + return image, labels + +class Hue: + ''' + Changes the hue of HSV images. + + Important: + - Expects HSV input. + - Expects input array to be of `dtype` `float`. + ''' + def __init__(self, delta): + ''' + Arguments: + delta (int): An integer in the closed interval `[-180, 180]` that determines the hue change, where + a change by integer `delta` means a change by `2 * delta` degrees. Read up on the HSV color format + if you need more information. + ''' + if not (-180 <= delta <= 180): raise ValueError("`delta` must be in the closed interval `[-180, 180]`.") + self.delta = delta + + def __call__(self, image, labels=None): + image[:, :, 0] = (image[:, :, 0] + self.delta) % 180.0 + if labels is None: + return image + else: + return image, labels + +class RandomHue: + ''' + Randomly changes the hue of HSV images. + + Important: + - Expects HSV input. + - Expects input array to be of `dtype` `float`. + ''' + def __init__(self, max_delta=18, prob=0.5): + ''' + Arguments: + max_delta (int): An integer in the closed interval `[0, 180]` that determines the maximal absolute + hue change. + prob (float, optional): `(1 - prob)` determines the probability with which the original, + unaltered image is returned. 
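+
+        Example (a minimal sketch of how this transform is typically chained, since it expects
+        HSV images of a float dtype; `image` is assumed to be an RGB uint8 array):
+            photometric_chain = [ConvertColor(current='RGB', to='HSV'),
+                                 ConvertDataType(to='float32'),
+                                 RandomHue(max_delta=18, prob=0.5),
+                                 ConvertDataType(to='uint8'),
+                                 ConvertColor(current='HSV', to='RGB')]
+            for transform in photometric_chain:
+                image = transform(image)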
+ ''' + if not (0 <= max_delta <= 180): raise ValueError("`max_delta` must be in the closed interval `[0, 180]`.") + self.max_delta = max_delta + self.prob = prob + self.change_hue = Hue(delta=0) + + def __call__(self, image, labels=None): + p = np.random.uniform(0,1) + if p >= (1.0-self.prob): + self.change_hue.delta = np.random.uniform(-self.max_delta, self.max_delta) + return self.change_hue(image, labels) + elif labels is None: + return image + else: + return image, labels + +class Saturation: + ''' + Changes the saturation of HSV images. + + Important: + - Expects HSV input. + - Expects input array to be of `dtype` `float`. + ''' + def __init__(self, factor): + ''' + Arguments: + factor (float): A float greater than zero that determines saturation change, where + values less than one result in less saturation and values greater than one result + in more saturation. + ''' + if factor <= 0.0: raise ValueError("It must be `factor > 0`.") + self.factor = factor + + def __call__(self, image, labels=None): + image[:,:,1] = np.clip(image[:,:,1] * self.factor, 0, 255) + if labels is None: + return image + else: + return image, labels + +class RandomSaturation: + ''' + Randomly changes the saturation of HSV images. + + Important: + - Expects HSV input. + - Expects input array to be of `dtype` `float`. + ''' + def __init__(self, lower=0.3, upper=2.0, prob=0.5): + ''' + Arguments: + lower (float, optional): A float greater than zero, the lower bound for the random + saturation change. + upper (float, optional): A float greater than zero, the upper bound for the random + saturation change. Must be greater than `lower`. + prob (float, optional): `(1 - prob)` determines the probability with which the original, + unaltered image is returned. + ''' + if lower >= upper: raise ValueError("`upper` must be greater than `lower`.") + self.lower = lower + self.upper = upper + self.prob = prob + self.change_saturation = Saturation(factor=1.0) + + def __call__(self, image, labels=None): + p = np.random.uniform(0,1) + if p >= (1.0-self.prob): + self.change_saturation.factor = np.random.uniform(self.lower, self.upper) + return self.change_saturation(image, labels) + elif labels is None: + return image + else: + return image, labels + +class Brightness: + ''' + Changes the brightness of RGB images. + + Important: + - Expects RGB input. + - Expects input array to be of `dtype` `float`. + ''' + def __init__(self, delta): + ''' + Arguments: + delta (int): An integer, the amount to add to or subtract from the intensity + of every pixel. + ''' + self.delta = delta + + def __call__(self, image, labels=None): + image = np.clip(image + self.delta, 0, 255) + if labels is None: + return image + else: + return image, labels + +class RandomBrightness: + ''' + Randomly changes the brightness of RGB images. + + Important: + - Expects RGB input. + - Expects input array to be of `dtype` `float`. + ''' + def __init__(self, lower=-84, upper=84, prob=0.5): + ''' + Arguments: + lower (int, optional): An integer, the lower bound for the random brightness change. + upper (int, optional): An integer, the upper bound for the random brightness change. + Must be greater than `lower`. + prob (float, optional): `(1 - prob)` determines the probability with which the original, + unaltered image is returned. 
+        '''
+        if lower >= upper: raise ValueError("`upper` must be greater than `lower`.")
+        self.lower = float(lower)
+        self.upper = float(upper)
+        self.prob = prob
+        self.change_brightness = Brightness(delta=0)
+
+    def __call__(self, image, labels=None):
+        p = np.random.uniform(0,1)
+        if p >= (1.0-self.prob):
+            self.change_brightness.delta = np.random.uniform(self.lower, self.upper)
+            return self.change_brightness(image, labels)
+        elif labels is None:
+            return image
+        else:
+            return image, labels
+
+class Contrast:
+    '''
+    Changes the contrast of RGB images.
+
+    Important:
+        - Expects RGB input.
+        - Expects input array to be of `dtype` `float`.
+    '''
+    def __init__(self, factor):
+        '''
+        Arguments:
+            factor (float): A float greater than zero that determines contrast change, where
+                values less than one result in less contrast and values greater than one result
+                in more contrast.
+        '''
+        if factor <= 0.0: raise ValueError("It must be `factor > 0`.")
+        self.factor = factor
+
+    def __call__(self, image, labels=None):
+        image = np.clip(127.5 + self.factor * (image - 127.5), 0, 255)
+        if labels is None:
+            return image
+        else:
+            return image, labels
+
+class RandomContrast:
+    '''
+    Randomly changes the contrast of RGB images.
+
+    Important:
+        - Expects RGB input.
+        - Expects input array to be of `dtype` `float`.
+    '''
+    def __init__(self, lower=0.5, upper=1.5, prob=0.5):
+        '''
+        Arguments:
+            lower (float, optional): A float greater than zero, the lower bound for the random
+                contrast change.
+            upper (float, optional): A float greater than zero, the upper bound for the random
+                contrast change. Must be greater than `lower`.
+            prob (float, optional): `(1 - prob)` determines the probability with which the original,
+                unaltered image is returned.
+        '''
+        if lower >= upper: raise ValueError("`upper` must be greater than `lower`.")
+        self.lower = lower
+        self.upper = upper
+        self.prob = prob
+        self.change_contrast = Contrast(factor=1.0)
+
+    def __call__(self, image, labels=None):
+        p = np.random.uniform(0,1)
+        if p >= (1.0-self.prob):
+            self.change_contrast.factor = np.random.uniform(self.lower, self.upper)
+            return self.change_contrast(image, labels)
+        elif labels is None:
+            return image
+        else:
+            return image, labels
+
+class Gamma:
+    '''
+    Changes the gamma value of RGB images.
+
+    Important: Expects RGB input.
+    '''
+    def __init__(self, gamma):
+        '''
+        Arguments:
+            gamma (float): A float greater than zero that determines gamma change.
+        '''
+        if gamma <= 0.0: raise ValueError("It must be `gamma > 0`.")
+        self.gamma = gamma
+        self.gamma_inv = 1.0 / gamma
+        # Build a lookup table mapping the pixel values [0, 255] to
+        # their adjusted gamma values.
+        self.table = np.array([((i / 255.0) ** self.gamma_inv) * 255 for i in np.arange(0, 256)]).astype("uint8")
+
+    def __call__(self, image, labels=None):
+        # Apply the precomputed lookup table stored on the instance.
+        image = cv2.LUT(image, self.table)
+        if labels is None:
+            return image
+        else:
+            return image, labels
+
+class RandomGamma:
+    '''
+    Randomly changes the gamma value of RGB images.
+
+    Important: Expects RGB input.
+    '''
+    def __init__(self, lower=0.25, upper=2.0, prob=0.5):
+        '''
+        Arguments:
+            lower (float, optional): A float greater than zero, the lower bound for the random
+                gamma change.
+            upper (float, optional): A float greater than zero, the upper bound for the random
+                gamma change. Must be greater than `lower`.
+            prob (float, optional): `(1 - prob)` determines the probability with which the original,
+                unaltered image is returned.
+ ''' + if lower >= upper: raise ValueError("`upper` must be greater than `lower`.") + self.lower = lower + self.upper = upper + self.prob = prob + + def __call__(self, image, labels=None): + p = np.random.uniform(0,1) + if p >= (1.0-self.prob): + gamma = np.random.uniform(self.lower, self.upper) + change_gamma = Gamma(gamma=gamma) + return change_gamma(image, labels) + elif labels is None: + return image + else: + return image, labels + +class HistogramEqualization: + ''' + Performs histogram equalization on HSV images. + + Importat: Expects HSV input. + ''' + def __init__(self): + pass + + def __call__(self, image, labels=None): + image[:,:,2] = cv2.equalizeHist(image[:,:,2]) + if labels is None: + return image + else: + return image, labels + +class RandomHistogramEqualization: + ''' + Randomly performs histogram equalization on HSV images. The randomness only refers + to whether or not the equalization is performed. + + Importat: Expects HSV input. + ''' + def __init__(self, prob=0.5): + ''' + Arguments: + prob (float, optional): `(1 - prob)` determines the probability with which the original, + unaltered image is returned. + ''' + self.prob = prob + self.equalize = HistogramEqualization() + + def __call__(self, image, labels=None): + p = np.random.uniform(0,1) + if p >= (1.0-self.prob): + return self.equalize(image, labels) + elif labels is None: + return image + else: + return image, labels + +class ChannelSwap: + ''' + Swaps the channels of images. + ''' + def __init__(self, order): + ''' + Arguments: + order (tuple): A tuple of integers that defines the desired channel order + of the input images after the channel swap. + ''' + self.order = order + + def __call__(self, image, labels=None): + image = image[:,:,self.order] + if labels is None: + return image + else: + return image, labels + +class RandomChannelSwap: + ''' + Randomly swaps the channels of RGB images. + + Important: Expects RGB input. + ''' + def __init__(self, prob=0.5): + ''' + Arguments: + prob (float, optional): `(1 - prob)` determines the probability with which the original, + unaltered image is returned. + ''' + self.prob = prob + # All possible permutations of the three image channels except the original order. + self.permutations = ((0, 2, 1), + (1, 0, 2), (1, 2, 0), + (2, 0, 1), (2, 1, 0)) + self.swap_channels = ChannelSwap(order=(0, 1, 2)) + + def __call__(self, image, labels=None): + p = np.random.uniform(0,1) + if p >= (1.0-self.prob): + i = np.random.randint(5) # There are 6 possible permutations. + self.swap_channels.order = self.permutations[i] + return self.swap_channels(image, labels) + elif labels is None: + return image + else: + return image, labels diff --git a/keras_ssd/eval_utils/__init__.py b/keras_ssd/eval_utils/__init__.py new file mode 100644 index 0000000..e69de29 diff --git a/keras_ssd/eval_utils/average_precision_evaluator.py b/keras_ssd/eval_utils/average_precision_evaluator.py new file mode 100644 index 0000000..e1c52f9 --- /dev/null +++ b/keras_ssd/eval_utils/average_precision_evaluator.py @@ -0,0 +1,906 @@ +''' +An evaluator to compute the Pascal VOC-style mean average precision (both the pre-2010 +and post-2010 algorithm versions) of a given Keras SSD model on a given dataset. + +Copyright (C) 2018 Pierluigi Ferrari + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. 
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+'''
+
+from __future__ import division
+import numpy as np
+from math import ceil
+from tqdm import trange
+import sys
+import warnings
+
+from data_generator.object_detection_2d_data_generator import DataGenerator
+from data_generator.object_detection_2d_geometric_ops import Resize
+from data_generator.object_detection_2d_patch_sampling_ops import RandomPadFixedAR
+from data_generator.object_detection_2d_photometric_ops import ConvertTo3Channels
+from ssd_encoder_decoder.ssd_output_decoder import decode_detections
+from data_generator.object_detection_2d_misc_utils import apply_inverse_transforms
+
+from bounding_box_utils.bounding_box_utils import iou
+
+class Evaluator:
+    '''
+    Computes the mean average precision of the given Keras SSD model on the given dataset.
+
+    Can compute the Pascal-VOC-style average precision in both the pre-2010 (k-point sampling)
+    and post-2010 (integration) algorithm versions.
+
+    Optionally also returns the average precisions, precisions, and recalls.
+
+    The algorithm is identical to the official Pascal VOC pre-2010 detection evaluation algorithm
+    in its default settings, but can be customized in a number of ways.
+    '''
+
+    def __init__(self,
+                 model,
+                 n_classes,
+                 data_generator,
+                 model_mode='inference',
+                 pred_format={'class_id': 0, 'conf': 1, 'xmin': 2, 'ymin': 3, 'xmax': 4, 'ymax': 5},
+                 gt_format={'class_id': 0, 'xmin': 1, 'ymin': 2, 'xmax': 3, 'ymax': 4}):
+        '''
+        Arguments:
+            model (Keras model): A Keras SSD model object.
+            n_classes (int): The number of positive classes, e.g. 20 for Pascal VOC, 80 for MS COCO.
+            data_generator (DataGenerator): A `DataGenerator` object with the evaluation dataset.
+            model_mode (str, optional): The mode in which the model was created, i.e. 'training', 'inference' or 'inference_fast'.
+                This is needed in order to know whether the model output is already decoded or still needs to be decoded. Refer to
+                the model documentation for the meaning of the individual modes.
+            pred_format (dict, optional): A dictionary that defines which index in the last axis of the model's decoded predictions
+                contains which bounding box coordinate. The dictionary must map the keywords 'class_id', 'conf' (for the confidence),
+                'xmin', 'ymin', 'xmax', and 'ymax' to their respective indices within the last axis.
+            gt_format (dict, optional): A dictionary that defines which index of a ground truth bounding box contains which of the five
+                items class ID, xmin, ymin, xmax, ymax. The expected keys are 'xmin', 'ymin', 'xmax', 'ymax', 'class_id'.
+        '''
+
+        if not isinstance(data_generator, DataGenerator):
+            warnings.warn("`data_generator` is not a `DataGenerator` object, which will cause undefined behavior.")
+
+        self.model = model
+        self.data_generator = data_generator
+        self.n_classes = n_classes
+        self.model_mode = model_mode
+        self.pred_format = pred_format
+        self.gt_format = gt_format
+
+        # The following lists all contain per-class data, i.e. all lists have the length `n_classes + 1`,
+        # where one element is for the background class, i.e. that element is just a dummy entry.
+        self.prediction_results = None
+        self.num_gt_per_class = None
+        self.true_positives = None
+        self.false_positives = None
+        self.cumulative_true_positives = None
+        self.cumulative_false_positives = None
+        self.cumulative_precisions = None # "Cumulative" means that the i-th element in each list represents the precision for the first i highest confidence predictions for that class.
+        self.cumulative_recalls = None # "Cumulative" means that the i-th element in each list represents the recall for the first i highest confidence predictions for that class.
+        self.average_precisions = None
+        self.mean_average_precision = None
+
+    def __call__(self,
+                 img_height,
+                 img_width,
+                 batch_size,
+                 data_generator_mode='resize',
+                 round_confidences=False,
+                 matching_iou_threshold=0.5,
+                 border_pixels='include',
+                 sorting_algorithm='quicksort',
+                 average_precision_mode='sample',
+                 num_recall_points=11,
+                 ignore_neutral_boxes=True,
+                 return_precisions=False,
+                 return_recalls=False,
+                 return_average_precisions=False,
+                 verbose=True,
+                 decoding_confidence_thresh=0.01,
+                 decoding_iou_threshold=0.45,
+                 decoding_top_k=200,
+                 decoding_pred_coords='centroids',
+                 decoding_normalize_coords=True):
+        '''
+        Computes the mean average precision of the given Keras SSD model on the given dataset.
+
+        Optionally also returns the average precisions, precisions, and recalls.
+
+        All the individual steps of the overall evaluation algorithm can also be called separately
+        (check out the other methods of this class), but this runs the overall algorithm all at once.
+
+        Arguments:
+            img_height (int): The input image height for the model.
+            img_width (int): The input image width for the model.
+            batch_size (int): The batch size for the evaluation.
+            data_generator_mode (str, optional): Either of 'resize' and 'pad'. If 'resize', the input images will
+                be resized (i.e. warped) to `(img_height, img_width)`. This mode does not preserve the aspect ratios of the images.
+                If 'pad', the input images will be first padded so that they have the aspect ratio defined by `img_height`
+                and `img_width` and then resized to `(img_height, img_width)`. This mode preserves the aspect ratios of the images.
+            round_confidences (int, optional): `False` or an integer that is the number of decimals that the prediction
+                confidences will be rounded to. If `False`, the confidences will not be rounded.
+            matching_iou_threshold (float, optional): A prediction will be considered a true positive if it has a Jaccard overlap
+                of at least `matching_iou_threshold` with any ground truth bounding box of the same class.
+            border_pixels (str, optional): How to treat the border pixels of the bounding boxes.
+                Can be 'include', 'exclude', or 'half'. If 'include', the border pixels belong
+                to the boxes. If 'exclude', the border pixels do not belong to the boxes.
+                If 'half', then one of each of the two horizontal and vertical borders belongs
+                to the boxes, but not the other.
+            sorting_algorithm (str, optional): Which sorting algorithm the matching algorithm should use. This argument accepts
+                any valid sorting algorithm for Numpy's `argsort()` function. You will usually want to choose between 'quicksort'
+                (fastest and most memory efficient, but not stable) and 'mergesort' (slightly slower and less memory efficient, but stable).
+                The official Matlab evaluation algorithm uses a stable sorting algorithm, so this algorithm is only guaranteed
+                to behave identically if you choose 'mergesort' as the sorting algorithm, but it will almost always behave identically
+                even if you choose 'quicksort' (but no guarantees).
+            average_precision_mode (str, optional): Can be either 'sample' or 'integrate'. In the case of 'sample', the average precision
+                will be computed according to the Pascal VOC formula that was used up until VOC 2009, where the precision will be sampled
+                for `num_recall_points` recall values. In the case of 'integrate', the average precision will be computed according to the
+                Pascal VOC formula that was used from VOC 2010 onward, where the average precision will be computed by numerically integrating
+                over the whole precision-recall curve instead of sampling individual points from it. 'integrate' mode is basically just
+                the limit case of 'sample' mode as the number of sample points increases.
+            num_recall_points (int, optional): The number of points to sample from the precision-recall curve to compute the average
+                precisions. In other words, this is the number of equidistant recall values for which the resulting precision will be
+                computed. 11 points is the value used in the official Pascal VOC 2007 detection evaluation algorithm.
+            ignore_neutral_boxes (bool, optional): In case the data generator provides annotations indicating whether a ground truth
+                bounding box is supposed to either count or be neutral for the evaluation, this argument decides what to do with these
+                annotations. If `False`, even boxes that are annotated as neutral will be counted into the evaluation. If `True`,
+                neutral boxes will be ignored for the evaluation. Examples of evaluation-neutrality are the ground truth boxes
+                annotated as "difficult" in the Pascal VOC datasets, which are usually treated as neutral for the evaluation.
+            return_precisions (bool, optional): If `True`, returns a nested list containing the cumulative precisions for each class.
+            return_recalls (bool, optional): If `True`, returns a nested list containing the cumulative recalls for each class.
+            return_average_precisions (bool, optional): If `True`, returns a list containing the average precision for each class.
+            verbose (bool, optional): If `True`, will print out the progress during runtime.
+            decoding_confidence_thresh (float, optional): Only relevant if the model is in 'training' mode.
+                A float in [0,1), the minimum classification confidence in a specific positive class in order to be considered
+                for the non-maximum suppression stage for the respective class. A lower value will result in a larger part of the
+                selection process being done by the non-maximum suppression stage, while a larger value will result in a larger
+                part of the selection process happening in the confidence thresholding stage.
+            decoding_iou_threshold (float, optional): Only relevant if the model is in 'training' mode. A float in [0,1].
+                All boxes with a Jaccard similarity of greater than `iou_threshold` with a locally maximal box will be removed
+                from the set of predictions for a given class, where 'maximal' refers to the box score.
+            decoding_top_k (int, optional): Only relevant if the model is in 'training' mode. The number of highest scoring
+                predictions to be kept for each batch item after the non-maximum suppression stage.
+            decoding_pred_coords (str, optional): Only relevant if the model is in 'training' mode. The box coordinate format
+                that the model outputs.
Can be either 'centroids' for the format `(cx, cy, w, h)` (box center coordinates, width, and height), + 'minmax' for the format `(xmin, xmax, ymin, ymax)`, or 'corners' for the format `(xmin, ymin, xmax, ymax)`. + decoding_normalize_coords (bool, optional): Only relevant if the model is in 'training' mode. Set to `True` if the model + outputs relative coordinates. Do not set this to `True` if the model already outputs absolute coordinates, + as that would result in incorrect coordinates. + + Returns: + A float, the mean average precision, plus any optional returns specified in the arguments. + ''' + + ############################################################################################# + # Predict on the entire dataset. + ############################################################################################# + + self.predict_on_dataset(img_height=img_height, + img_width=img_width, + batch_size=batch_size, + data_generator_mode=data_generator_mode, + decoding_confidence_thresh=decoding_confidence_thresh, + decoding_iou_threshold=decoding_iou_threshold, + decoding_top_k=decoding_top_k, + decoding_pred_coords=decoding_pred_coords, + decoding_normalize_coords=decoding_normalize_coords, + decoding_border_pixels=border_pixels, + round_confidences=round_confidences, + verbose=verbose, + ret=False) + + ############################################################################################# + # Get the total number of ground truth boxes for each class. + ############################################################################################# + + self.get_num_gt_per_class(ignore_neutral_boxes=ignore_neutral_boxes, + verbose=False, + ret=False) + + ############################################################################################# + # Match predictions to ground truth boxes for all classes. + ############################################################################################# + + self.match_predictions(ignore_neutral_boxes=ignore_neutral_boxes, + matching_iou_threshold=matching_iou_threshold, + border_pixels=border_pixels, + sorting_algorithm=sorting_algorithm, + verbose=verbose, + ret=False) + + ############################################################################################# + # Compute the cumulative precision and recall for all classes. + ############################################################################################# + + self.compute_precision_recall(verbose=verbose, ret=False) + + ############################################################################################# + # Compute the average precision for this class. + ############################################################################################# + + self.compute_average_precisions(mode=average_precision_mode, + num_recall_points=num_recall_points, + verbose=verbose, + ret=False) + + ############################################################################################# + # Compute the mean average precision. + ############################################################################################# + + mean_average_precision = self.compute_mean_average_precision(ret=True) + + ############################################################################################# + + # Compile the returns. 
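+        # The optional quantities below are appended after the mean average precision in this
+        # fixed order: average precisions, cumulative precisions, cumulative recalls.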
+        if return_precisions or return_recalls or return_average_precisions:
+            ret = [mean_average_precision]
+            if return_average_precisions:
+                ret.append(self.average_precisions)
+            if return_precisions:
+                ret.append(self.cumulative_precisions)
+            if return_recalls:
+                ret.append(self.cumulative_recalls)
+            return ret
+        else:
+            return mean_average_precision
+
+    def predict_on_dataset(self,
+                           img_height,
+                           img_width,
+                           batch_size,
+                           data_generator_mode='resize',
+                           decoding_confidence_thresh=0.01,
+                           decoding_iou_threshold=0.45,
+                           decoding_top_k=200,
+                           decoding_pred_coords='centroids',
+                           decoding_normalize_coords=True,
+                           decoding_border_pixels='include',
+                           round_confidences=False,
+                           verbose=True,
+                           ret=False):
+        '''
+        Runs predictions for the given model over the entire dataset given by `data_generator`.
+
+        Arguments:
+            img_height (int): The input image height for the model.
+            img_width (int): The input image width for the model.
+            batch_size (int): The batch size for the evaluation.
+            data_generator_mode (str, optional): Either of 'resize' and 'pad'. If 'resize', the input images will
+                be resized (i.e. warped) to `(img_height, img_width)`. This mode does not preserve the aspect ratios of the images.
+                If 'pad', the input images will be first padded so that they have the aspect ratio defined by `img_height`
+                and `img_width` and then resized to `(img_height, img_width)`. This mode preserves the aspect ratios of the images.
+            decoding_confidence_thresh (float, optional): Only relevant if the model is in 'training' mode.
+                A float in [0,1), the minimum classification confidence in a specific positive class in order to be considered
+                for the non-maximum suppression stage for the respective class. A lower value will result in a larger part of the
+                selection process being done by the non-maximum suppression stage, while a larger value will result in a larger
+                part of the selection process happening in the confidence thresholding stage.
+            decoding_iou_threshold (float, optional): Only relevant if the model is in 'training' mode. A float in [0,1].
+                All boxes with a Jaccard similarity of greater than `iou_threshold` with a locally maximal box will be removed
+                from the set of predictions for a given class, where 'maximal' refers to the box score.
+            decoding_top_k (int, optional): Only relevant if the model is in 'training' mode. The number of highest scoring
+                predictions to be kept for each batch item after the non-maximum suppression stage.
+            decoding_pred_coords (str, optional): Only relevant if the model is in 'training' mode. The box coordinate format
+                that the model outputs. Can be either 'centroids' for the format `(cx, cy, w, h)` (box center coordinates, width, and height),
+                'minmax' for the format `(xmin, xmax, ymin, ymax)`, or 'corners' for the format `(xmin, ymin, xmax, ymax)`.
+            decoding_normalize_coords (bool, optional): Only relevant if the model is in 'training' mode. Set to `True` if the model
+                outputs relative coordinates. Do not set this to `True` if the model already outputs absolute coordinates,
+                as that would result in incorrect coordinates.
+            decoding_border_pixels (str, optional): Only relevant if the model is in 'training' mode. How the decoder treats
+                the border pixels of the bounding boxes; one of 'include', 'exclude', or 'half'.
+            round_confidences (int, optional): `False` or an integer that is the number of decimals that the prediction
+                confidences will be rounded to. If `False`, the confidences will not be rounded.
+            verbose (bool, optional): If `True`, will print out the progress during runtime.
+            ret (bool, optional): If `True`, returns the predictions.
+
+        Returns:
+            None by default. Optionally, a nested list containing the predictions for each class.
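+            Each prediction is stored as an `(image_id, confidence, xmin, ymin, xmax, ymax)` tuple,
+            and the list at index `k` holds the predictions for class ID `k` (index 0, the background
+            class, remains an empty dummy entry).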
+ ''' + + class_id_pred = self.pred_format['class_id'] + conf_pred = self.pred_format['conf'] + xmin_pred = self.pred_format['xmin'] + ymin_pred = self.pred_format['ymin'] + xmax_pred = self.pred_format['xmax'] + ymax_pred = self.pred_format['ymax'] + + ############################################################################################# + # Configure the data generator for the evaluation. + ############################################################################################# + + convert_to_3_channels = ConvertTo3Channels() + resize = Resize(height=img_height,width=img_width, labels_format=self.gt_format) + if data_generator_mode == 'resize': + transformations = [convert_to_3_channels, + resize] + elif data_generator_mode == 'pad': + random_pad = RandomPadFixedAR(patch_aspect_ratio=img_width/img_height, labels_format=self.gt_format) + transformations = [convert_to_3_channels, + random_pad, + resize] + else: + raise ValueError("`data_generator_mode` can be either of 'resize' or 'pad', but received '{}'.".format(data_generator_mode)) + + # Set the generator parameters. + generator = self.data_generator.generate(batch_size=batch_size, + shuffle=False, + transformations=transformations, + label_encoder=None, + returns={'processed_images', + 'image_ids', + 'evaluation-neutral', + 'inverse_transform', + 'original_labels'}, + keep_images_without_gt=True, + degenerate_box_handling='remove') + + # If we don't have any real image IDs, generate pseudo-image IDs. + # This is just to make the evaluator compatible both with datasets that do and don't + # have image IDs. + if self.data_generator.image_ids is None: + self.data_generator.image_ids = list(range(self.data_generator.get_dataset_size())) + + ############################################################################################# + # Predict over all batches of the dataset and store the predictions. + ############################################################################################# + + # We have to generate a separate results list for each class. + results = [list() for _ in range(self.n_classes + 1)] + + # Create a dictionary that maps image IDs to ground truth annotations. + # We'll need it below. + image_ids_to_labels = {} + + # Compute the number of batches to iterate over the entire dataset. + n_images = self.data_generator.get_dataset_size() + n_batches = int(ceil(n_images / batch_size)) + if verbose: + print("Number of images in the evaluation dataset: {}".format(n_images)) + print() + tr = trange(n_batches, file=sys.stdout) + tr.set_description('Producing predictions batch-wise') + else: + tr = range(n_batches) + + # Loop over all batches. + for j in tr: + # Generate batch. + batch_X, batch_image_ids, batch_eval_neutral, batch_inverse_transforms, batch_orig_labels = next(generator) + # Predict. + y_pred = self.model.predict(batch_X) + # If the model was created in 'training' mode, the raw predictions need to + # be decoded and filtered, otherwise that's already taken care of. + if self.model_mode == 'training': + # Decode. + y_pred = decode_detections(y_pred, + confidence_thresh=decoding_confidence_thresh, + iou_threshold=decoding_iou_threshold, + top_k=decoding_top_k, + input_coords=decoding_pred_coords, + normalize_coords=decoding_normalize_coords, + img_height=img_height, + img_width=img_width, + border_pixels=decoding_border_pixels) + else: + # Filter out the all-zeros dummy elements of `y_pred`. 
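+                # The decoded output of an 'inference'/'inference_fast' model is padded with all-zero
+                # rows, so keeping only the rows with a non-zero class ID removes that padding.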
+ y_pred_filtered = [] + for i in range(len(y_pred)): + y_pred_filtered.append(y_pred[i][y_pred[i,:,0] != 0]) + y_pred = y_pred_filtered + # Convert the predicted box coordinates for the original images. + y_pred = apply_inverse_transforms(y_pred, batch_inverse_transforms) + + # Iterate over all batch items. + for k, batch_item in enumerate(y_pred): + + image_id = batch_image_ids[k] + + for box in batch_item: + class_id = int(box[class_id_pred]) + # Round the box coordinates to reduce the required memory. + if round_confidences: + confidence = round(box[conf_pred], round_confidences) + else: + confidence = box[conf_pred] + xmin = round(box[xmin_pred], 1) + ymin = round(box[ymin_pred], 1) + xmax = round(box[xmax_pred], 1) + ymax = round(box[ymax_pred], 1) + prediction = (image_id, confidence, xmin, ymin, xmax, ymax) + # Append the predicted box to the results list for its class. + results[class_id].append(prediction) + + self.prediction_results = results + + if ret: + return results + + def write_predictions_to_txt(self, + classes=None, + out_file_prefix='comp3_det_test_', + verbose=True): + ''' + Writes the predictions for all classes to separate text files according to the Pascal VOC results format. + + Arguments: + classes (list, optional): `None` or a list of strings containing the class names of all classes in the dataset, + including some arbitrary name for the background class. This list will be used to name the output text files. + The ordering of the names in the list represents the ordering of the classes as they are predicted by the model, + i.e. the element with index 3 in this list should correspond to the class with class ID 3 in the model's predictions. + If `None`, the output text files will be named by their class IDs. + out_file_prefix (str, optional): A prefix for the output text file names. The suffix to each output text file name will + be the respective class name followed by the `.txt` file extension. This string is also how you specify the directory + in which the results are to be saved. + verbose (bool, optional): If `True`, will print out the progress during runtime. + + Returns: + None. + ''' + + if self.prediction_results is None: + raise ValueError("There are no prediction results. You must run `predict_on_dataset()` before calling this method.") + + # We generate a separate results file for each class. + for class_id in range(1, self.n_classes + 1): + + if verbose: + print("Writing results file for class {}/{}.".format(class_id, self.n_classes)) + + if classes is None: + class_suffix = '{:04d}'.format(class_id) + else: + class_suffix = classes[class_id] + + results_file = open('{}{}.txt'.format(out_file_prefix, class_suffix), 'w') + + for prediction in self.prediction_results[class_id]: + + prediction_list = list(prediction) + prediction_list[0] = '{:06d}'.format(int(prediction_list[0])) + prediction_list[1] = round(prediction_list[1], 4) + prediction_txt = ' '.join(map(str, prediction_list)) + '\n' + results_file.write(prediction_txt) + + results_file.close() + + if verbose: + print("All results files saved.") + + def get_num_gt_per_class(self, + ignore_neutral_boxes=True, + verbose=True, + ret=False): + ''' + Counts the number of ground truth boxes for each class across the dataset. + + Arguments: + ignore_neutral_boxes (bool, optional): In case the data generator provides annotations indicating whether a ground truth + bounding box is supposed to either count or be neutral for the evaluation, this argument decides what to do with these + annotations. 
If `True`, only non-neutral ground truth boxes will be counted, otherwise all ground truth boxes will + be counted. + verbose (bool, optional): If `True`, will print out the progress during runtime. + ret (bool, optional): If `True`, returns the list of counts. + + Returns: + None by default. Optionally, a list containing a count of the number of ground truth boxes for each class across the + entire dataset. + ''' + + if self.data_generator.labels is None: + raise ValueError("Computing the number of ground truth boxes per class not possible, no ground truth given.") + + num_gt_per_class = np.zeros(shape=(self.n_classes+1), dtype=np.int) + + class_id_index = self.gt_format['class_id'] + + ground_truth = self.data_generator.labels + + if verbose: + print('Computing the number of positive ground truth boxes per class.') + tr = trange(len(ground_truth), file=sys.stdout) + else: + tr = range(len(ground_truth)) + + # Iterate over the ground truth for all images in the dataset. + for i in tr: + + boxes = np.asarray(ground_truth[i]) + + # Iterate over all ground truth boxes for the current image. + for j in range(boxes.shape[0]): + + if ignore_neutral_boxes and not (self.data_generator.eval_neutral is None): + if not self.data_generator.eval_neutral[i][j]: + # If this box is not supposed to be evaluation-neutral, + # increment the counter for the respective class ID. + class_id = boxes[j, class_id_index] + num_gt_per_class[class_id] += 1 + else: + # If there is no such thing as evaluation-neutral boxes for + # our dataset, always increment the counter for the respective + # class ID. + class_id = boxes[j, class_id_index] + num_gt_per_class[class_id] += 1 + + self.num_gt_per_class = num_gt_per_class + + if ret: + return num_gt_per_class + + def match_predictions(self, + ignore_neutral_boxes=True, + matching_iou_threshold=0.5, + border_pixels='include', + sorting_algorithm='quicksort', + verbose=True, + ret=False): + ''' + Matches predictions to ground truth boxes. + + Note that `predict_on_dataset()` must be called before calling this method. + + Arguments: + ignore_neutral_boxes (bool, optional): In case the data generator provides annotations indicating whether a ground truth + bounding box is supposed to either count or be neutral for the evaluation, this argument decides what to do with these + annotations. If `False`, even boxes that are annotated as neutral will be counted into the evaluation. If `True`, + neutral boxes will be ignored for the evaluation. An example for evaluation-neutrality are the ground truth boxes + annotated as "difficult" in the Pascal VOC datasets, which are usually treated as neutral for the evaluation. + matching_iou_threshold (float, optional): A prediction will be considered a true positive if it has a Jaccard overlap + of at least `matching_iou_threshold` with any ground truth bounding box of the same class. + border_pixels (str, optional): How to treat the border pixels of the bounding boxes. + Can be 'include', 'exclude', or 'half'. If 'include', the border pixels belong + to the boxes. If 'exclude', the border pixels do not belong to the boxes. + If 'half', then one of each of the two horizontal and vertical borders belong + to the boxex, but not the other. + sorting_algorithm (str, optional): Which sorting algorithm the matching algorithm should use. This argument accepts + any valid sorting algorithm for Numpy's `argsort()` function. 
You will usually want to choose between 'quicksort'
+ (fastest and most memory efficient, but not stable) and 'mergesort' (slightly slower and less memory efficient, but stable).
+ The official Matlab evaluation algorithm uses a stable sorting algorithm, so this algorithm is only guaranteed
+ to behave identically if you choose 'mergesort' as the sorting algorithm, but it will almost always behave identically
+ even if you choose 'quicksort' (but no guarantees).
+ verbose (bool, optional): If `True`, will print out the progress during runtime.
+ ret (bool, optional): If `True`, returns the true and false positives.
+
+ Returns:
+ None by default. Optionally, four nested lists containing the true positives, false positives, cumulative true positives,
+ and cumulative false positives for each class.
+ '''
+
+ if self.data_generator.labels is None:
+ raise ValueError("Matching predictions to ground truth boxes not possible, no ground truth given.")
+
+ if self.prediction_results is None:
+ raise ValueError("There are no prediction results. You must run `predict_on_dataset()` before calling this method.")
+
+ class_id_gt = self.gt_format['class_id']
+ xmin_gt = self.gt_format['xmin']
+ ymin_gt = self.gt_format['ymin']
+ xmax_gt = self.gt_format['xmax']
+ ymax_gt = self.gt_format['ymax']
+
+ # Convert the ground truth to a more efficient format for what we need
+ # to do, which is access ground truth by image ID repeatedly.
+ ground_truth = {}
+ eval_neutral_available = not (self.data_generator.eval_neutral is None) # Whether or not we have annotations to decide whether ground truth boxes should be neutral or not.
+ for i in range(len(self.data_generator.image_ids)):
+ image_id = str(self.data_generator.image_ids[i])
+ labels = self.data_generator.labels[i]
+ if ignore_neutral_boxes and eval_neutral_available:
+ ground_truth[image_id] = (np.asarray(labels), np.asarray(self.data_generator.eval_neutral[i]))
+ else:
+ ground_truth[image_id] = np.asarray(labels)
+
+ true_positives = [[]] # The true positives for each class, sorted by descending confidence.
+ false_positives = [[]] # The false positives for each class, sorted by descending confidence.
+ cumulative_true_positives = [[]]
+ cumulative_false_positives = [[]]
+
+ # Iterate over all classes.
+ for class_id in range(1, self.n_classes + 1):
+
+ predictions = self.prediction_results[class_id]
+
+ # Store the matching results in these lists:
+ true_pos = np.zeros(len(predictions), dtype=np.int) # 1 for every prediction that is a true positive, 0 otherwise
+ false_pos = np.zeros(len(predictions), dtype=np.int) # 1 for every prediction that is a false positive, 0 otherwise
+
+ # In case there are no predictions at all for this class, we're done here.
+ if len(predictions) == 0:
+ print("No predictions for class {}/{}".format(class_id, self.n_classes))
+ true_positives.append(true_pos)
+ false_positives.append(false_pos)
+ continue
+
+ # Convert the predictions list for this class into a structured array so that we can sort it by confidence.
+
+ # Get the number of characters needed to store the image ID strings in the structured array.
+ num_chars_per_image_id = len(str(predictions[0][0])) + 6 # Keep a few characters buffer in case some image IDs are longer than others.
+ # Create the data type for the structured array.
+ preds_data_type = np.dtype([('image_id', 'U{}'.format(num_chars_per_image_id)),
+ ('confidence', 'f4'),
+ ('xmin', 'f4'),
+ ('ymin', 'f4'),
+ ('xmax', 'f4'),
+ ('ymax', 'f4')])
+ # Create the structured array
+ predictions = np.array(predictions, dtype=preds_data_type)
+
+ # Sort the detections by decreasing confidence.
+ descending_indices = np.argsort(-predictions['confidence'], kind=sorting_algorithm)
+ predictions_sorted = predictions[descending_indices]
+
+ if verbose:
+ tr = trange(len(predictions), file=sys.stdout)
+ tr.set_description("Matching predictions to ground truth, class {}/{}.".format(class_id, self.n_classes))
+ else:
+ tr = range(len(predictions))
+
+ # Keep track of which ground truth boxes were already matched to a detection.
+ gt_matched = {}
+
+ # Iterate over all predictions.
+ for i in tr:
+
+ prediction = predictions_sorted[i]
+ image_id = prediction['image_id']
+ pred_box = np.asarray(list(prediction[['xmin', 'ymin', 'xmax', 'ymax']])) # Convert the structured array element to a regular array.
+
+ # Get the relevant ground truth boxes for this prediction,
+ # i.e. all ground truth boxes that match the prediction's
+ # image ID and class ID.
+
+ # The ground truth could either be a tuple with `(ground_truth_boxes, eval_neutral_boxes)`
+ # or only `ground_truth_boxes`.
+ if ignore_neutral_boxes and eval_neutral_available:
+ gt, eval_neutral = ground_truth[image_id]
+ else:
+ gt = ground_truth[image_id]
+ gt = np.asarray(gt)
+ class_mask = gt[:,class_id_gt] == class_id
+ gt = gt[class_mask]
+ if ignore_neutral_boxes and eval_neutral_available:
+ eval_neutral = eval_neutral[class_mask]
+
+ if gt.size == 0:
+ # If the image doesn't contain any objects of this class,
+ # the prediction becomes a false positive.
+ false_pos[i] = 1
+ continue
+
+ # Compute the IoU of this prediction with all ground truth boxes of the same class.
+ overlaps = iou(boxes1=gt[:,[xmin_gt, ymin_gt, xmax_gt, ymax_gt]],
+ boxes2=pred_box,
+ coords='corners',
+ mode='element-wise',
+ border_pixels=border_pixels)
+
+ # For each detection, match the ground truth box with the highest overlap.
+ # It's possible that the same ground truth box will be matched to multiple
+ # detections.
+ gt_match_index = np.argmax(overlaps)
+ gt_match_overlap = overlaps[gt_match_index]
+
+ if gt_match_overlap < matching_iou_threshold:
+ # False positive, IoU threshold violated:
+ # Those predictions whose matched overlap is below the threshold become
+ # false positives.
+ false_pos[i] = 1
+ else:
+ if not (ignore_neutral_boxes and eval_neutral_available) or (eval_neutral[gt_match_index] == False):
+ # If this is not a ground truth that is supposed to be evaluation-neutral
+ # (i.e. should be skipped for the evaluation) or if we don't even have the
+ # concept of neutral boxes.
+ if not (image_id in gt_matched):
+ # True positive:
+ # If the matched ground truth box for this prediction hasn't been matched to a
+ # different prediction already, we have a true positive.
+ true_pos[i] = 1
+ gt_matched[image_id] = np.zeros(shape=(gt.shape[0]), dtype=np.bool)
+ gt_matched[image_id][gt_match_index] = True
+ elif not gt_matched[image_id][gt_match_index]:
+ # True positive:
+ # If the matched ground truth box for this prediction hasn't been matched to a
+ # different prediction already, we have a true positive.
+ true_pos[i] = 1 + gt_matched[image_id][gt_match_index] = True + else: + # False positive, duplicate detection: + # If the matched ground truth box for this prediction has already been matched + # to a different prediction previously, it is a duplicate detection for an + # already detected object, which counts as a false positive. + false_pos[i] = 1 + + true_positives.append(true_pos) + false_positives.append(false_pos) + + cumulative_true_pos = np.cumsum(true_pos) # Cumulative sums of the true positives + cumulative_false_pos = np.cumsum(false_pos) # Cumulative sums of the false positives + + cumulative_true_positives.append(cumulative_true_pos) + cumulative_false_positives.append(cumulative_false_pos) + + self.true_positives = true_positives + self.false_positives = false_positives + self.cumulative_true_positives = cumulative_true_positives + self.cumulative_false_positives = cumulative_false_positives + + if ret: + return true_positives, false_positives, cumulative_true_positives, cumulative_false_positives + + def compute_precision_recall(self, verbose=True, ret=False): + ''' + Computes the precisions and recalls for all classes. + + Note that `match_predictions()` must be called before calling this method. + + Arguments: + verbose (bool, optional): If `True`, will print out the progress during runtime. + ret (bool, optional): If `True`, returns the precisions and recalls. + + Returns: + None by default. Optionally, two nested lists containing the cumulative precisions and recalls for each class. + ''' + + if (self.cumulative_true_positives is None) or (self.cumulative_false_positives is None): + raise ValueError("True and false positives not available. You must run `match_predictions()` before you call this method.") + + if (self.num_gt_per_class is None): + raise ValueError("Number of ground truth boxes per class not available. You must run `get_num_gt_per_class()` before you call this method.") + + cumulative_precisions = [[]] + cumulative_recalls = [[]] + + # Iterate over all classes. + for class_id in range(1, self.n_classes + 1): + + if verbose: + print("Computing precisions and recalls, class {}/{}".format(class_id, self.n_classes)) + + tp = self.cumulative_true_positives[class_id] + fp = self.cumulative_false_positives[class_id] + + + cumulative_precision = np.where(tp + fp > 0, tp / (tp + fp), 0) # 1D array with shape `(num_predictions,)` + cumulative_recall = tp / self.num_gt_per_class[class_id] # 1D array with shape `(num_predictions,)` + + cumulative_precisions.append(cumulative_precision) + cumulative_recalls.append(cumulative_recall) + + self.cumulative_precisions = cumulative_precisions + self.cumulative_recalls = cumulative_recalls + + if ret: + return cumulative_precisions, cumulative_recalls + + def compute_average_precisions(self, mode='sample', num_recall_points=11, verbose=True, ret=False): + ''' + Computes the average precision for each class. + + Can compute the Pascal-VOC-style average precision in both the pre-2010 (k-point sampling) + and post-2010 (integration) algorithm versions. + + Note that `compute_precision_recall()` must be called before calling this method. + + Arguments: + mode (str, optional): Can be either 'sample' or 'integrate'. In the case of 'sample', the average precision will be computed + according to the Pascal VOC formula that was used up until VOC 2009, where the precision will be sampled for `num_recall_points` + recall values. 
In the case of 'integrate', the average precision will be computed according to the Pascal VOC formula that
+ was used from VOC 2010 onward, where the average precision will be computed by numerically integrating over the whole
+ precision-recall curve instead of sampling individual points from it. 'integrate' mode is basically just the limit case
+ of 'sample' mode as the number of sample points increases. For details, see the references below.
+ num_recall_points (int, optional): Only relevant if mode is 'sample'. The number of points to sample from the precision-recall curve
+ to compute the average precisions. In other words, this is the number of equidistant recall values for which the resulting
+ precision will be computed. 11 points is the value used in the official Pascal VOC pre-2010 detection evaluation algorithm.
+ verbose (bool, optional): If `True`, will print out the progress during runtime.
+ ret (bool, optional): If `True`, returns the average precisions.
+
+ Returns:
+ None by default. Optionally, a list containing the average precision for each class.
+
+ References:
+ http://host.robots.ox.ac.uk/pascal/VOC/voc2012/htmldoc/devkit_doc.html#sec:ap
+ '''
+
+ if (self.cumulative_precisions is None) or (self.cumulative_recalls is None):
+ raise ValueError("Precisions and recalls not available. You must run `compute_precision_recall()` before you call this method.")
+
+ if not (mode in {'sample', 'integrate'}):
+ raise ValueError("`mode` can be either 'sample' or 'integrate', but received '{}'".format(mode))
+
+ average_precisions = [0.0]
+
+ # Iterate over all classes.
+ for class_id in range(1, self.n_classes + 1):
+
+ if verbose:
+ print("Computing average precision, class {}/{}".format(class_id, self.n_classes))
+
+ cumulative_precision = self.cumulative_precisions[class_id]
+ cumulative_recall = self.cumulative_recalls[class_id]
+ average_precision = 0.0
+
+ if mode == 'sample':
+
+ for t in np.linspace(start=0, stop=1, num=num_recall_points, endpoint=True):
+
+ cum_prec_recall_greater_t = cumulative_precision[cumulative_recall >= t]
+
+ if cum_prec_recall_greater_t.size == 0:
+ precision = 0.0
+ else:
+ precision = np.amax(cum_prec_recall_greater_t)
+
+ average_precision += precision
+
+ average_precision /= num_recall_points
+
+ elif mode == 'integrate':
+
+ # We will compute the precision at all unique recall values.
+ unique_recalls, unique_recall_indices, unique_recall_counts = np.unique(cumulative_recall, return_index=True, return_counts=True)
+
+ # Store the maximal precision for each recall value and the absolute difference
+ # between any two unique recall values in the lists below. The products of these
+ # two numbers constitute the rectangular areas whose sum will be our numerical
+ # integral.
+ maximal_precisions = np.zeros_like(unique_recalls)
+ recall_deltas = np.zeros_like(unique_recalls)
+
+ # Iterate over all unique recall values in reverse order. This saves a lot of computation:
+ # For each unique recall value `r`, we want to get the maximal precision value obtained
+ # for any recall value `r* >= r`. Once we know the maximal precision for the last `k` recall
+ # values after a given iteration, then in the next iteration, in order to compute the maximal
+ # precisions for the last `l > k` recall values, we only need to compute the maximal precision
+ # for `l - k` recall values and then take the maximum between that and the previously computed
+ # maximum instead of computing the maximum over all `l` values.
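+ # Illustrative example with made-up numbers: if the unique recall values were [0.2, 0.5, 1.0] and the
+ # maximal precisions computed below came out as [1.0, 0.8, 0.0], the integral would evaluate to
+ # (0.5 - 0.2) * 1.0 + (1.0 - 0.5) * 0.8 = 0.7.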
+ # We skip the very last recall value, since the precision after between the last recall value + # recall 1.0 is defined to be zero. + for i in range(len(unique_recalls)-2, -1, -1): + begin = unique_recall_indices[i] + end = unique_recall_indices[i + 1] + # When computing the maximal precisions, use the maximum of the previous iteration to + # avoid unnecessary repeated computation over the same precision values. + # The maximal precisions are the heights of the rectangle areas of our integral under + # the precision-recall curve. + maximal_precisions[i] = np.maximum(np.amax(cumulative_precision[begin:end]), maximal_precisions[i + 1]) + # The differences between two adjacent recall values are the widths of our rectangle areas. + recall_deltas[i] = unique_recalls[i + 1] - unique_recalls[i] + + average_precision = np.sum(maximal_precisions * recall_deltas) + + average_precisions.append(average_precision) + + self.average_precisions = average_precisions + + if ret: + return average_precisions + + def compute_mean_average_precision(self, ret=True): + ''' + Computes the mean average precision over all classes. + + Note that `compute_average_precisions()` must be called before calling this method. + + Arguments: + ret (bool, optional): If `True`, returns the mean average precision. + + Returns: + A float, the mean average precision, by default. Optionally, None. + ''' + + if self.average_precisions is None: + raise ValueError("Average precisions not available. You must run `compute_average_precisions()` before you call this method.") + + mean_average_precision = np.average(self.average_precisions[1:]) # The first element is for the background class, so skip it. + self.mean_average_precision = mean_average_precision + + if ret: + return mean_average_precision diff --git a/keras_ssd/eval_utils/coco_utils.py b/keras_ssd/eval_utils/coco_utils.py new file mode 100644 index 0000000..b0e88f8 --- /dev/null +++ b/keras_ssd/eval_utils/coco_utils.py @@ -0,0 +1,200 @@ +''' +A few utilities that are useful when working with the MS COCO datasets. + +Copyright (C) 2018 Pierluigi Ferrari + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +''' + +import json +from tqdm import trange +from math import ceil +import sys + +from data_generator.object_detection_2d_geometric_ops import Resize +from data_generator.object_detection_2d_patch_sampling_ops import RandomPadFixedAR +from data_generator.object_detection_2d_photometric_ops import ConvertTo3Channels +from ssd_encoder_decoder.ssd_output_decoder import decode_detections +from data_generator.object_detection_2d_misc_utils import apply_inverse_transforms + +def get_coco_category_maps(annotations_file): + ''' + Builds dictionaries that map between MS COCO category IDs, transformed category IDs, and category names. + The original MS COCO category IDs are not consecutive unfortunately: The 80 category IDs are spread + across the integers 1 through 90 with some integers skipped. 
Since we usually use a one-hot + class representation in neural networks, we need to map these non-consecutive original COCO category + IDs (let's call them 'cats') to consecutive category IDs (let's call them 'classes'). + + Arguments: + annotations_file (str): The filepath to any MS COCO annotations JSON file. + + Returns: + 1) cats_to_classes: A dictionary that maps between the original (keys) and the transformed category IDs (values). + 2) classes_to_cats: A dictionary that maps between the transformed (keys) and the original category IDs (values). + 3) cats_to_names: A dictionary that maps between original category IDs (keys) and the respective category names (values). + 4) classes_to_names: A list of the category names (values) with their indices representing the transformed IDs. + ''' + with open(annotations_file, 'r') as f: + annotations = json.load(f) + cats_to_classes = {} + classes_to_cats = {} + cats_to_names = {} + classes_to_names = [] + classes_to_names.append('background') # Need to add the background class first so that the indexing is right. + for i, cat in enumerate(annotations['categories']): + cats_to_classes[cat['id']] = i + 1 + classes_to_cats[i + 1] = cat['id'] + cats_to_names[cat['id']] = cat['name'] + classes_to_names.append(cat['name']) + + return cats_to_classes, classes_to_cats, cats_to_names, classes_to_names + +def predict_all_to_json(out_file, + model, + img_height, + img_width, + classes_to_cats, + data_generator, + batch_size, + data_generator_mode='resize', + model_mode='training', + confidence_thresh=0.01, + iou_threshold=0.45, + top_k=200, + pred_coords='centroids', + normalize_coords=True): + ''' + Runs detection predictions over the whole dataset given a model and saves them in a JSON file + in the MS COCO detection results format. + + Arguments: + out_file (str): The file name (full path) under which to save the results JSON file. + model (Keras model): A Keras SSD model object. + img_height (int): The input image height for the model. + img_width (int): The input image width for the model. + classes_to_cats (dict): A dictionary that maps the consecutive class IDs predicted by the model + to the non-consecutive original MS COCO category IDs. + data_generator (DataGenerator): A `DataGenerator` object with the evaluation dataset. + batch_size (int): The batch size for the evaluation. + data_generator_mode (str, optional): Either of 'resize' or 'pad'. If 'resize', the input images will + be resized (i.e. warped) to `(img_height, img_width)`. This mode does not preserve the aspect ratios of the images. + If 'pad', the input images will be first padded so that they have the aspect ratio defined by `img_height` + and `img_width` and then resized to `(img_height, img_width)`. This mode preserves the aspect ratios of the images. + model_mode (str, optional): The mode in which the model was created, i.e. 'training', 'inference' or 'inference_fast'. + This is needed in order to know whether the model output is already decoded or still needs to be decoded. Refer to + the model documentation for the meaning of the individual modes. + confidence_thresh (float, optional): A float in [0,1), the minimum classification confidence in a specific + positive class in order to be considered for the non-maximum suppression stage for the respective class. 
+ A lower value will result in a larger part of the selection process being done by the non-maximum suppression + stage, while a larger value will result in a larger part of the selection process happening in the confidence + thresholding stage. + iou_threshold (float, optional): A float in [0,1]. All boxes with a Jaccard similarity of greater than `iou_threshold` + with a locally maximal box will be removed from the set of predictions for a given class, where 'maximal' refers + to the box score. + top_k (int, optional): The number of highest scoring predictions to be kept for each batch item after the + non-maximum suppression stage. Defaults to 200, following the paper. + input_coords (str, optional): The box coordinate format that the model outputs. Can be either 'centroids' + for the format `(cx, cy, w, h)` (box center coordinates, width, and height), 'minmax' for the format + `(xmin, xmax, ymin, ymax)`, or 'corners' for the format `(xmin, ymin, xmax, ymax)`. + normalize_coords (bool, optional): Set to `True` if the model outputs relative coordinates (i.e. coordinates in [0,1]) + and you wish to transform these relative coordinates back to absolute coordinates. If the model outputs + relative coordinates, but you do not want to convert them back to absolute coordinates, set this to `False`. + Do not set this to `True` if the model already outputs absolute coordinates, as that would result in incorrect + coordinates. Requires `img_height` and `img_width` if set to `True`. + + Returns: + None. + ''' + + convert_to_3_channels = ConvertTo3Channels() + resize = Resize(height=img_height,width=img_width) + if data_generator_mode == 'resize': + transformations = [convert_to_3_channels, + resize] + elif data_generator_mode == 'pad': + random_pad = RandomPadFixedAR(patch_aspect_ratio=img_width/img_height, clip_boxes=False) + transformations = [convert_to_3_channels, + random_pad, + resize] + else: + raise ValueError("Unexpected argument value: `data_generator_mode` can be either of 'resize' or 'pad', but received '{}'.".format(data_generator_mode)) + + # Set the generator parameters. + generator = data_generator.generate(batch_size=batch_size, + shuffle=False, + transformations=transformations, + label_encoder=None, + returns={'processed_images', + 'image_ids', + 'inverse_transform'}, + keep_images_without_gt=True) + # Put the results in this list. + results = [] + # Compute the number of batches to iterate over the entire dataset. + n_images = data_generator.get_dataset_size() + print("Number of images in the evaluation dataset: {}".format(n_images)) + n_batches = int(ceil(n_images / batch_size)) + # Loop over all batches. + tr = trange(n_batches, file=sys.stdout) + tr.set_description('Producing results file') + for i in tr: + # Generate batch. + batch_X, batch_image_ids, batch_inverse_transforms = next(generator) + # Predict. + y_pred = model.predict(batch_X) + # If the model was created in 'training' mode, the raw predictions need to + # be decoded and filtered, otherwise that's already taken care of. + if model_mode == 'training': + # Decode. + y_pred = decode_detections(y_pred, + confidence_thresh=confidence_thresh, + iou_threshold=iou_threshold, + top_k=top_k, + input_coords=pred_coords, + normalize_coords=normalize_coords, + img_height=img_height, + img_width=img_width) + else: + # Filter out the all-zeros dummy elements of `y_pred`. 
+ y_pred_filtered = [] + for i in range(len(y_pred)): + y_pred_filtered.append(y_pred[i][y_pred[i,:,0] != 0]) + y_pred = y_pred_filtered + # Convert the predicted box coordinates for the original images. + y_pred = apply_inverse_transforms(y_pred, batch_inverse_transforms) + + # Convert each predicted box into the results format. + for k, batch_item in enumerate(y_pred): + for box in batch_item: + class_id = box[0] + # Transform the consecutive class IDs back to the original COCO category IDs. + cat_id = classes_to_cats[class_id] + # Round the box coordinates to reduce the JSON file size. + xmin = float(round(box[2], 1)) + ymin = float(round(box[3], 1)) + xmax = float(round(box[4], 1)) + ymax = float(round(box[5], 1)) + width = xmax - xmin + height = ymax - ymin + bbox = [xmin, ymin, width, height] + result = {} + result['image_id'] = batch_image_ids[k] + result['category_id'] = cat_id + result['score'] = float(round(box[1], 3)) + result['bbox'] = bbox + results.append(result) + + with open(out_file, 'w') as f: + json.dump(results, f) + + print("Prediction results saved in '{}'".format(out_file)) diff --git a/keras_ssd/keras_layers/__init__.py b/keras_ssd/keras_layers/__init__.py new file mode 100644 index 0000000..e69de29 diff --git a/keras_ssd/keras_layers/keras_layer_AnchorBoxes.py b/keras_ssd/keras_layers/keras_layer_AnchorBoxes.py new file mode 100644 index 0000000..83a7ab5 --- /dev/null +++ b/keras_ssd/keras_layers/keras_layer_AnchorBoxes.py @@ -0,0 +1,278 @@ +''' +A custom Keras layer to generate anchor boxes. + +Copyright (C) 2018 Pierluigi Ferrari + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +''' + +from __future__ import division +import numpy as np +import keras.backend as K +from keras.engine.topology import InputSpec +from keras.engine.topology import Layer + +from bounding_box_utils.bounding_box_utils import convert_coordinates + +class AnchorBoxes(Layer): + ''' + A Keras layer to create an output tensor containing anchor box coordinates + and variances based on the input tensor and the passed arguments. + + A set of 2D anchor boxes of different aspect ratios is created for each spatial unit of + the input tensor. The number of anchor boxes created per unit depends on the arguments + `aspect_ratios` and `two_boxes_for_ar1`, in the default case it is 4. The boxes + are parameterized by the coordinate tuple `(xmin, xmax, ymin, ymax)`. + + The logic implemented by this layer is identical to the logic in the module + `ssd_box_encode_decode_utils.py`. + + The purpose of having this layer in the network is to make the model self-sufficient + at inference time. Since the model is predicting offsets to the anchor boxes + (rather than predicting absolute box coordinates directly), one needs to know the anchor + box coordinates in order to construct the final prediction boxes from the predicted offsets. + If the model's output tensor did not contain the anchor box coordinates, the necessary + information to convert the predicted offsets back to absolute coordinates would be missing + in the model output. 
The reason why it is necessary to predict offsets to the anchor boxes + rather than to predict absolute box coordinates directly is explained in `README.md`. + + Input shape: + 4D tensor of shape `(batch, channels, height, width)` if `dim_ordering = 'th'` + or `(batch, height, width, channels)` if `dim_ordering = 'tf'`. + + Output shape: + 5D tensor of shape `(batch, height, width, n_boxes, 8)`. The last axis contains + the four anchor box coordinates and the four variance values for each box. + ''' + + def __init__(self, + img_height, + img_width, + this_scale, + next_scale, + aspect_ratios=[0.5, 1.0, 2.0], + two_boxes_for_ar1=True, + this_steps=None, + this_offsets=None, + clip_boxes=False, + variances=[0.1, 0.1, 0.2, 0.2], + coords='centroids', + normalize_coords=False, + **kwargs): + ''' + All arguments need to be set to the same values as in the box encoding process, otherwise the behavior is undefined. + Some of these arguments are explained in more detail in the documentation of the `SSDBoxEncoder` class. + + Arguments: + img_height (int): The height of the input images. + img_width (int): The width of the input images. + this_scale (float): A float in [0, 1], the scaling factor for the size of the generated anchor boxes + as a fraction of the shorter side of the input image. + next_scale (float): A float in [0, 1], the next larger scaling factor. Only relevant if + `self.two_boxes_for_ar1 == True`. + aspect_ratios (list, optional): The list of aspect ratios for which default boxes are to be + generated for this layer. + two_boxes_for_ar1 (bool, optional): Only relevant if `aspect_ratios` contains 1. + If `True`, two default boxes will be generated for aspect ratio 1. The first will be generated + using the scaling factor for the respective layer, the second one will be generated using + geometric mean of said scaling factor and next bigger scaling factor. + clip_boxes (bool, optional): If `True`, clips the anchor box coordinates to stay within image boundaries. + variances (list, optional): A list of 4 floats >0. The anchor box offset for each coordinate will be divided by + its respective variance value. + coords (str, optional): The box coordinate format to be used internally in the model (i.e. this is not the input format + of the ground truth labels). Can be either 'centroids' for the format `(cx, cy, w, h)` (box center coordinates, width, and height), + 'corners' for the format `(xmin, ymin, xmax, ymax)`, or 'minmax' for the format `(xmin, xmax, ymin, ymax)`. + normalize_coords (bool, optional): Set to `True` if the model uses relative instead of absolute coordinates, + i.e. if the model predicts box coordinates within [0,1] instead of absolute coordinates. 
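+
+ Illustrative example (default values): with `aspect_ratios=[0.5, 1.0, 2.0]` and `two_boxes_for_ar1=True`,
+ the layer generates `n_boxes = 4` anchor boxes per spatial cell of its input feature map: one box per
+ aspect ratio plus one extra box for aspect ratio 1.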
+ ''' + if K.backend() != 'tensorflow': + raise TypeError("This layer only supports TensorFlow at the moment, but you are using the {} backend.".format(K.backend())) + + if (this_scale < 0) or (next_scale < 0) or (this_scale > 1): + raise ValueError("`this_scale` must be in [0, 1] and `next_scale` must be >0, but `this_scale` == {}, `next_scale` == {}".format(this_scale, next_scale)) + + if len(variances) != 4: + raise ValueError("4 variance values must be pased, but {} values were received.".format(len(variances))) + variances = np.array(variances) + if np.any(variances <= 0): + raise ValueError("All variances must be >0, but the variances given are {}".format(variances)) + + self.img_height = img_height + self.img_width = img_width + self.this_scale = this_scale + self.next_scale = next_scale + self.aspect_ratios = aspect_ratios + self.two_boxes_for_ar1 = two_boxes_for_ar1 + self.this_steps = this_steps + self.this_offsets = this_offsets + self.clip_boxes = clip_boxes + self.variances = variances + self.coords = coords + self.normalize_coords = normalize_coords + # Compute the number of boxes per cell + if (1 in aspect_ratios) and two_boxes_for_ar1: + self.n_boxes = len(aspect_ratios) + 1 + else: + self.n_boxes = len(aspect_ratios) + super(AnchorBoxes, self).__init__(**kwargs) + + def build(self, input_shape): + self.input_spec = [InputSpec(shape=input_shape)] + super(AnchorBoxes, self).build(input_shape) + + def call(self, x, mask=None): + ''' + Return an anchor box tensor based on the shape of the input tensor. + + The logic implemented here is identical to the logic in the module `ssd_box_encode_decode_utils.py`. + + Note that this tensor does not participate in any graph computations at runtime. It is being created + as a constant once during graph creation and is just being output along with the rest of the model output + during runtime. Because of this, all logic is implemented as Numpy array operations and it is sufficient + to convert the resulting Numpy array into a Keras tensor at the very end before outputting it. + + Arguments: + x (tensor): 4D tensor of shape `(batch, channels, height, width)` if `dim_ordering = 'th'` + or `(batch, height, width, channels)` if `dim_ordering = 'tf'`. The input for this + layer must be the output of the localization predictor layer. + ''' + + # Compute box width and height for each aspect ratio + # The shorter side of the image will be used to compute `w` and `h` using `scale` and `aspect_ratios`. + size = min(self.img_height, self.img_width) + # Compute the box widths and and heights for all aspect ratios + wh_list = [] + for ar in self.aspect_ratios: + if (ar == 1): + # Compute the regular anchor box for aspect ratio 1. + box_height = box_width = self.this_scale * size + wh_list.append((box_width, box_height)) + if self.two_boxes_for_ar1: + # Compute one slightly larger version using the geometric mean of this scale value and the next. 
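+ # (Purely illustrative numbers: with this_scale=0.2, next_scale=0.37 and a 300 px shorter image side,
+ # this box would be sqrt(0.2 * 0.37) * 300 ≈ 81.6 px on a side.)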
+ box_height = box_width = np.sqrt(self.this_scale * self.next_scale) * size + wh_list.append((box_width, box_height)) + else: + box_height = self.this_scale * size / np.sqrt(ar) + box_width = self.this_scale * size * np.sqrt(ar) + wh_list.append((box_width, box_height)) + wh_list = np.array(wh_list) + + # We need the shape of the input tensor + if K.image_dim_ordering() == 'tf': + batch_size, feature_map_height, feature_map_width, feature_map_channels = x._keras_shape + else: # Not yet relevant since TensorFlow is the only supported backend right now, but it can't harm to have this in here for the future + batch_size, feature_map_channels, feature_map_height, feature_map_width = x._keras_shape + + # Compute the grid of box center points. They are identical for all aspect ratios. + + # Compute the step sizes, i.e. how far apart the anchor box center points will be vertically and horizontally. + if (self.this_steps is None): + step_height = self.img_height / feature_map_height + step_width = self.img_width / feature_map_width + else: + if isinstance(self.this_steps, (list, tuple)) and (len(self.this_steps) == 2): + step_height = self.this_steps[0] + step_width = self.this_steps[1] + elif isinstance(self.this_steps, (int, float)): + step_height = self.this_steps + step_width = self.this_steps + # Compute the offsets, i.e. at what pixel values the first anchor box center point will be from the top and from the left of the image. + if (self.this_offsets is None): + offset_height = 0.5 + offset_width = 0.5 + else: + if isinstance(self.this_offsets, (list, tuple)) and (len(self.this_offsets) == 2): + offset_height = self.this_offsets[0] + offset_width = self.this_offsets[1] + elif isinstance(self.this_offsets, (int, float)): + offset_height = self.this_offsets + offset_width = self.this_offsets + # Now that we have the offsets and step sizes, compute the grid of anchor box center points. 
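+ # (Illustrative numbers: for a 300x300 input and a 38x38 feature map with the default offset of 0.5,
+ # the step size is 300/38 ≈ 7.89 px, so the center points lie at roughly 3.95, 11.84, ..., 296.05 px.)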
+ cy = np.linspace(offset_height * step_height, (offset_height + feature_map_height - 1) * step_height, feature_map_height) + cx = np.linspace(offset_width * step_width, (offset_width + feature_map_width - 1) * step_width, feature_map_width) + cx_grid, cy_grid = np.meshgrid(cx, cy) + cx_grid = np.expand_dims(cx_grid, -1) # This is necessary for np.tile() to do what we want further down + cy_grid = np.expand_dims(cy_grid, -1) # This is necessary for np.tile() to do what we want further down + + # Create a 4D tensor template of shape `(feature_map_height, feature_map_width, n_boxes, 4)` + # where the last dimension will contain `(cx, cy, w, h)` + boxes_tensor = np.zeros((feature_map_height, feature_map_width, self.n_boxes, 4)) + + boxes_tensor[:, :, :, 0] = np.tile(cx_grid, (1, 1, self.n_boxes)) # Set cx + boxes_tensor[:, :, :, 1] = np.tile(cy_grid, (1, 1, self.n_boxes)) # Set cy + boxes_tensor[:, :, :, 2] = wh_list[:, 0] # Set w + boxes_tensor[:, :, :, 3] = wh_list[:, 1] # Set h + + # Convert `(cx, cy, w, h)` to `(xmin, xmax, ymin, ymax)` + boxes_tensor = convert_coordinates(boxes_tensor, start_index=0, conversion='centroids2corners') + + # If `clip_boxes` is enabled, clip the coordinates to lie within the image boundaries + if self.clip_boxes: + x_coords = boxes_tensor[:,:,:,[0, 2]] + x_coords[x_coords >= self.img_width] = self.img_width - 1 + x_coords[x_coords < 0] = 0 + boxes_tensor[:,:,:,[0, 2]] = x_coords + y_coords = boxes_tensor[:,:,:,[1, 3]] + y_coords[y_coords >= self.img_height] = self.img_height - 1 + y_coords[y_coords < 0] = 0 + boxes_tensor[:,:,:,[1, 3]] = y_coords + + # If `normalize_coords` is enabled, normalize the coordinates to be within [0,1] + if self.normalize_coords: + boxes_tensor[:, :, :, [0, 2]] /= self.img_width + boxes_tensor[:, :, :, [1, 3]] /= self.img_height + + # TODO: Implement box limiting directly for `(cx, cy, w, h)` so that we don't have to unnecessarily convert back and forth. + if self.coords == 'centroids': + # Convert `(xmin, ymin, xmax, ymax)` back to `(cx, cy, w, h)`. + boxes_tensor = convert_coordinates(boxes_tensor, start_index=0, conversion='corners2centroids', border_pixels='half') + elif self.coords == 'minmax': + # Convert `(xmin, ymin, xmax, ymax)` to `(xmin, xmax, ymin, ymax). + boxes_tensor = convert_coordinates(boxes_tensor, start_index=0, conversion='corners2minmax', border_pixels='half') + + # Create a tensor to contain the variances and append it to `boxes_tensor`. This tensor has the same shape + # as `boxes_tensor` and simply contains the same 4 variance values for every position in the last axis. 
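+ # (With the default `coords='centroids'` and `variances=[0.1, 0.1, 0.2, 0.2]`, the last axis of every
+ # anchor thus becomes `(cx, cy, w, h, 0.1, 0.1, 0.2, 0.2)` after the concatenation below.)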
+ variances_tensor = np.zeros_like(boxes_tensor) # Has shape `(feature_map_height, feature_map_width, n_boxes, 4)` + variances_tensor += self.variances # Long live broadcasting + # Now `boxes_tensor` becomes a tensor of shape `(feature_map_height, feature_map_width, n_boxes, 8)` + boxes_tensor = np.concatenate((boxes_tensor, variances_tensor), axis=-1) + + # Now prepend one dimension to `boxes_tensor` to account for the batch size and tile it along + # The result will be a 5D tensor of shape `(batch_size, feature_map_height, feature_map_width, n_boxes, 8)` + boxes_tensor = np.expand_dims(boxes_tensor, axis=0) + boxes_tensor = K.tile(K.constant(boxes_tensor, dtype='float32'), (K.shape(x)[0], 1, 1, 1, 1)) + + return boxes_tensor + + def compute_output_shape(self, input_shape): + if K.image_dim_ordering() == 'tf': + batch_size, feature_map_height, feature_map_width, feature_map_channels = input_shape + else: # Not yet relevant since TensorFlow is the only supported backend right now, but it can't harm to have this in here for the future + batch_size, feature_map_channels, feature_map_height, feature_map_width = input_shape + return (batch_size, feature_map_height, feature_map_width, self.n_boxes, 8) + + def get_config(self): + config = { + 'img_height': self.img_height, + 'img_width': self.img_width, + 'this_scale': self.this_scale, + 'next_scale': self.next_scale, + 'aspect_ratios': list(self.aspect_ratios), + 'two_boxes_for_ar1': self.two_boxes_for_ar1, + 'clip_boxes': self.clip_boxes, + 'variances': list(self.variances), + 'coords': self.coords, + 'normalize_coords': self.normalize_coords + } + base_config = super(AnchorBoxes, self).get_config() + return dict(list(base_config.items()) + list(config.items())) diff --git a/keras_ssd/keras_layers/keras_layer_DecodeDetections.py b/keras_ssd/keras_layers/keras_layer_DecodeDetections.py new file mode 100644 index 0000000..3fc4d57 --- /dev/null +++ b/keras_ssd/keras_layers/keras_layer_DecodeDetections.py @@ -0,0 +1,283 @@ +''' +A custom Keras layer to decode the raw SSD prediction output. Corresponds to the +`DetectionOutput` layer type in the original Caffe implementation of SSD. + +Copyright (C) 2018 Pierluigi Ferrari + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +''' + +from __future__ import division +import numpy as np +import tensorflow as tf +import keras.backend as K +from keras.engine.topology import InputSpec +from keras.engine.topology import Layer + +class DecodeDetections(Layer): + ''' + A Keras layer to decode the raw SSD prediction output. + + Input shape: + 3D tensor of shape `(batch_size, n_boxes, n_classes + 12)`. + + Output shape: + 3D tensor of shape `(batch_size, top_k, 6)`. + ''' + + def __init__(self, + confidence_thresh=0.01, + iou_threshold=0.45, + top_k=200, + nms_max_output_size=400, + coords='centroids', + normalize_coords=True, + img_height=None, + img_width=None, + **kwargs): + ''' + All default argument values follow the Caffe implementation. 
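+
+ In the 'centroids' format, the layer reverses the SSD offset encoding using the anchor box coordinates
+ and variances carried in the last eight elements of each prediction vector. A sketch of the decoding
+ formulas implemented in `call()` below:
+
+ cx = cx_pred * cx_variance * w_anchor + cx_anchor
+ cy = cy_pred * cy_variance * h_anchor + cy_anchor
+ w = exp(w_pred * w_variance) * w_anchor
+ h = exp(h_pred * h_variance) * h_anchor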
+ + Arguments: + confidence_thresh (float, optional): A float in [0,1), the minimum classification confidence in a specific + positive class in order to be considered for the non-maximum suppression stage for the respective class. + A lower value will result in a larger part of the selection process being done by the non-maximum suppression + stage, while a larger value will result in a larger part of the selection process happening in the confidence + thresholding stage. + iou_threshold (float, optional): A float in [0,1]. All boxes with a Jaccard similarity of greater than `iou_threshold` + with a locally maximal box will be removed from the set of predictions for a given class, where 'maximal' refers + to the box score. + top_k (int, optional): The number of highest scoring predictions to be kept for each batch item after the + non-maximum suppression stage. + nms_max_output_size (int, optional): The maximum number of predictions that will be left after performing non-maximum + suppression. + coords (str, optional): The box coordinate format that the model outputs. Must be 'centroids' + i.e. the format `(cx, cy, w, h)` (box center coordinates, width, and height). Other coordinate formats are + currently not supported. + normalize_coords (bool, optional): Set to `True` if the model outputs relative coordinates (i.e. coordinates in [0,1]) + and you wish to transform these relative coordinates back to absolute coordinates. If the model outputs + relative coordinates, but you do not want to convert them back to absolute coordinates, set this to `False`. + Do not set this to `True` if the model already outputs absolute coordinates, as that would result in incorrect + coordinates. Requires `img_height` and `img_width` if set to `True`. + img_height (int, optional): The height of the input images. Only needed if `normalize_coords` is `True`. + img_width (int, optional): The width of the input images. Only needed if `normalize_coords` is `True`. + ''' + if K.backend() != 'tensorflow': + raise TypeError("This layer only supports TensorFlow at the moment, but you are using the {} backend.".format(K.backend())) + + if normalize_coords and ((img_height is None) or (img_width is None)): + raise ValueError("If relative box coordinates are supposed to be converted to absolute coordinates, the decoder needs the image size in order to decode the predictions, but `img_height == {}` and `img_width == {}`".format(img_height, img_width)) + + if coords != 'centroids': + raise ValueError("The DetectionOutput layer currently only supports the 'centroids' coordinate format.") + + # We need these members for the config. + self.confidence_thresh = confidence_thresh + self.iou_threshold = iou_threshold + self.top_k = top_k + self.normalize_coords = normalize_coords + self.img_height = img_height + self.img_width = img_width + self.coords = coords + self.nms_max_output_size = nms_max_output_size + + # We need these members for TensorFlow. 
+ self.tf_confidence_thresh = tf.constant(self.confidence_thresh, name='confidence_thresh') + self.tf_iou_threshold = tf.constant(self.iou_threshold, name='iou_threshold') + self.tf_top_k = tf.constant(self.top_k, name='top_k') + self.tf_normalize_coords = tf.constant(self.normalize_coords, name='normalize_coords') + self.tf_img_height = tf.constant(self.img_height, dtype=tf.float32, name='img_height') + self.tf_img_width = tf.constant(self.img_width, dtype=tf.float32, name='img_width') + self.tf_nms_max_output_size = tf.constant(self.nms_max_output_size, name='nms_max_output_size') + + super(DecodeDetections, self).__init__(**kwargs) + + def build(self, input_shape): + self.input_spec = [InputSpec(shape=input_shape)] + super(DecodeDetections, self).build(input_shape) + + def call(self, y_pred, mask=None): + ''' + Returns: + 3D tensor of shape `(batch_size, top_k, 6)`. The second axis is zero-padded + to always yield `top_k` predictions per batch item. The last axis contains + the coordinates for each predicted box in the format + `[class_id, confidence, xmin, ymin, xmax, ymax]`. + ''' + + ##################################################################################### + # 1. Convert the box coordinates from predicted anchor box offsets to predicted + # absolute coordinates + ##################################################################################### + + # Convert anchor box offsets to image offsets. + cx = y_pred[...,-12] * y_pred[...,-4] * y_pred[...,-6] + y_pred[...,-8] # cx = cx_pred * cx_variance * w_anchor + cx_anchor + cy = y_pred[...,-11] * y_pred[...,-3] * y_pred[...,-5] + y_pred[...,-7] # cy = cy_pred * cy_variance * h_anchor + cy_anchor + w = tf.exp(y_pred[...,-10] * y_pred[...,-2]) * y_pred[...,-6] # w = exp(w_pred * variance_w) * w_anchor + h = tf.exp(y_pred[...,-9] * y_pred[...,-1]) * y_pred[...,-5] # h = exp(h_pred * variance_h) * h_anchor + + # Convert 'centroids' to 'corners'. + xmin = cx - 0.5 * w + ymin = cy - 0.5 * h + xmax = cx + 0.5 * w + ymax = cy + 0.5 * h + + # If the model predicts box coordinates relative to the image dimensions and they are supposed + # to be converted back to absolute coordinates, do that. + def normalized_coords(): + xmin1 = tf.expand_dims(xmin * self.tf_img_width, axis=-1) + ymin1 = tf.expand_dims(ymin * self.tf_img_height, axis=-1) + xmax1 = tf.expand_dims(xmax * self.tf_img_width, axis=-1) + ymax1 = tf.expand_dims(ymax * self.tf_img_height, axis=-1) + return xmin1, ymin1, xmax1, ymax1 + def non_normalized_coords(): + return tf.expand_dims(xmin, axis=-1), tf.expand_dims(ymin, axis=-1), tf.expand_dims(xmax, axis=-1), tf.expand_dims(ymax, axis=-1) + + xmin, ymin, xmax, ymax = tf.cond(self.tf_normalize_coords, normalized_coords, non_normalized_coords) + + # Concatenate the one-hot class confidences and the converted box coordinates to form the decoded predictions tensor. + y_pred = tf.concat(values=[y_pred[...,:-12], xmin, ymin, xmax, ymax], axis=-1) + + ##################################################################################### + # 2. Perform confidence thresholding, per-class non-maximum suppression, and + # top-k filtering. + ##################################################################################### + + batch_size = tf.shape(y_pred)[0] # Output dtype: tf.int32 + n_boxes = tf.shape(y_pred)[1] + n_classes = y_pred.shape[2] - 4 + class_indices = tf.range(1, n_classes) + + # Create a function that filters the predictions for the given batch item. 
Specifically, it performs: + # - confidence thresholding + # - non-maximum suppression (NMS) + # - top-k filtering + def filter_predictions(batch_item): + + # Create a function that filters the predictions for one single class. + def filter_single_class(index): + + # From a tensor of shape (n_boxes, n_classes + 4 coordinates) extract + # a tensor of shape (n_boxes, 1 + 4 coordinates) that contains the + # confidnece values for just one class, determined by `index`. + confidences = tf.expand_dims(batch_item[..., index], axis=-1) + class_id = tf.fill(dims=tf.shape(confidences), value=tf.to_float(index)) + box_coordinates = batch_item[...,-4:] + + single_class = tf.concat([class_id, confidences, box_coordinates], axis=-1) + + # Apply confidence thresholding with respect to the class defined by `index`. + threshold_met = single_class[:,1] > self.tf_confidence_thresh + single_class = tf.boolean_mask(tensor=single_class, + mask=threshold_met) + + # If any boxes made the threshold, perform NMS. + def perform_nms(): + scores = single_class[...,1] + + # `tf.image.non_max_suppression()` needs the box coordinates in the format `(ymin, xmin, ymax, xmax)`. + xmin = tf.expand_dims(single_class[...,-4], axis=-1) + ymin = tf.expand_dims(single_class[...,-3], axis=-1) + xmax = tf.expand_dims(single_class[...,-2], axis=-1) + ymax = tf.expand_dims(single_class[...,-1], axis=-1) + boxes = tf.concat(values=[ymin, xmin, ymax, xmax], axis=-1) + + maxima_indices = tf.image.non_max_suppression(boxes=boxes, + scores=scores, + max_output_size=self.tf_nms_max_output_size, + iou_threshold=self.iou_threshold, + name='non_maximum_suppresion') + maxima = tf.gather(params=single_class, + indices=maxima_indices, + axis=0) + return maxima + + def no_confident_predictions(): + return tf.constant(value=0.0, shape=(1,6)) + + single_class_nms = tf.cond(tf.equal(tf.size(single_class), 0), no_confident_predictions, perform_nms) + + # Make sure `single_class` is exactly `self.nms_max_output_size` elements long. + padded_single_class = tf.pad(tensor=single_class_nms, + paddings=[[0, self.tf_nms_max_output_size - tf.shape(single_class_nms)[0]], [0, 0]], + mode='CONSTANT', + constant_values=0.0) + + return padded_single_class + + # Iterate `filter_single_class()` over all class indices. + filtered_single_classes = tf.map_fn(fn=lambda i: filter_single_class(i), + elems=tf.range(1,n_classes), + dtype=tf.float32, + parallel_iterations=128, + back_prop=False, + swap_memory=False, + infer_shape=True, + name='loop_over_classes') + + # Concatenate the filtered results for all individual classes to one tensor. + filtered_predictions = tf.reshape(tensor=filtered_single_classes, shape=(-1,6)) + + # Perform top-k filtering for this batch item or pad it in case there are + # fewer than `self.top_k` boxes left at this point. Either way, produce a + # tensor of length `self.top_k`. By the time we return the final results tensor + # for the whole batch, all batch items must have the same number of predicted + # boxes so that the tensor dimensions are homogenous. If fewer than `self.top_k` + # predictions are left after the filtering process above, we pad the missing + # predictions with zeros as dummy entries. 
+ def top_k(): + return tf.gather(params=filtered_predictions, + indices=tf.nn.top_k(filtered_predictions[:, 1], k=self.tf_top_k, sorted=True).indices, + axis=0) + def pad_and_top_k(): + padded_predictions = tf.pad(tensor=filtered_predictions, + paddings=[[0, self.tf_top_k - tf.shape(filtered_predictions)[0]], [0, 0]], + mode='CONSTANT', + constant_values=0.0) + return tf.gather(params=padded_predictions, + indices=tf.nn.top_k(padded_predictions[:, 1], k=self.tf_top_k, sorted=True).indices, + axis=0) + + top_k_boxes = tf.cond(tf.greater_equal(tf.shape(filtered_predictions)[0], self.tf_top_k), top_k, pad_and_top_k) + + return top_k_boxes + + # Iterate `filter_predictions()` over all batch items. + output_tensor = tf.map_fn(fn=lambda x: filter_predictions(x), + elems=y_pred, + dtype=None, + parallel_iterations=128, + back_prop=False, + swap_memory=False, + infer_shape=True, + name='loop_over_batch') + + return output_tensor + + def compute_output_shape(self, input_shape): + batch_size, n_boxes, last_axis = input_shape + return (batch_size, self.tf_top_k, 6) # Last axis: (class_ID, confidence, 4 box coordinates) + + def get_config(self): + config = { + 'confidence_thresh': self.confidence_thresh, + 'iou_threshold': self.iou_threshold, + 'top_k': self.top_k, + 'nms_max_output_size': self.nms_max_output_size, + 'coords': self.coords, + 'normalize_coords': self.normalize_coords, + 'img_height': self.img_height, + 'img_width': self.img_width, + } + base_config = super(DecodeDetections, self).get_config() + return dict(list(base_config.items()) + list(config.items())) diff --git a/keras_ssd/keras_layers/keras_layer_DecodeDetectionsFast.py b/keras_ssd/keras_layers/keras_layer_DecodeDetectionsFast.py new file mode 100644 index 0000000..f8ab221 --- /dev/null +++ b/keras_ssd/keras_layers/keras_layer_DecodeDetectionsFast.py @@ -0,0 +1,266 @@ +''' +A custom Keras layer to decode the raw SSD prediction output. This is a modified +and more efficient version of the `DetectionOutput` layer type in the original Caffe +implementation of SSD. For a faithful replication of the original layer, please +refer to the `DecodeDetections` layer. + +Copyright (C) 2018 Pierluigi Ferrari + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +''' + +from __future__ import division +import numpy as np +import tensorflow as tf +import keras.backend as K +from keras.engine.topology import InputSpec +from keras.engine.topology import Layer + +class DecodeDetectionsFast(Layer): + ''' + A Keras layer to decode the raw SSD prediction output. + + Input shape: + 3D tensor of shape `(batch_size, n_boxes, n_classes + 12)`. + + Output shape: + 3D tensor of shape `(batch_size, top_k, 6)`. + ''' + + def __init__(self, + confidence_thresh=0.01, + iou_threshold=0.45, + top_k=200, + nms_max_output_size=400, + coords='centroids', + normalize_coords=True, + img_height=None, + img_width=None, + **kwargs): + ''' + All default argument values follow the Caffe implementation. 
+ + Arguments: + confidence_thresh (float, optional): A float in [0,1), the minimum classification confidence in a specific + positive class in order to be considered for the non-maximum suppression stage for the respective class. + A lower value will result in a larger part of the selection process being done by the non-maximum suppression + stage, while a larger value will result in a larger part of the selection process happening in the confidence + thresholding stage. + iou_threshold (float, optional): A float in [0,1]. All boxes with a Jaccard similarity of greater than `iou_threshold` + with a locally maximal box will be removed from the set of predictions for a given class, where 'maximal' refers + to the box score. + top_k (int, optional): The number of highest scoring predictions to be kept for each batch item after the + non-maximum suppression stage. + nms_max_output_size (int, optional): The maximum number of predictions that will be left after performing non-maximum + suppression. + coords (str, optional): The box coordinate format that the model outputs. Must be 'centroids' + i.e. the format `(cx, cy, w, h)` (box center coordinates, width, and height). Other coordinate formats are + currently not supported. + normalize_coords (bool, optional): Set to `True` if the model outputs relative coordinates (i.e. coordinates in [0,1]) + and you wish to transform these relative coordinates back to absolute coordinates. If the model outputs + relative coordinates, but you do not want to convert them back to absolute coordinates, set this to `False`. + Do not set this to `True` if the model already outputs absolute coordinates, as that would result in incorrect + coordinates. Requires `img_height` and `img_width` if set to `True`. + img_height (int, optional): The height of the input images. Only needed if `normalize_coords` is `True`. + img_width (int, optional): The width of the input images. Only needed if `normalize_coords` is `True`. + ''' + if K.backend() != 'tensorflow': + raise TypeError("This layer only supports TensorFlow at the moment, but you are using the {} backend.".format(K.backend())) + + if normalize_coords and ((img_height is None) or (img_width is None)): + raise ValueError("If relative box coordinates are supposed to be converted to absolute coordinates, the decoder needs the image size in order to decode the predictions, but `img_height == {}` and `img_width == {}`".format(img_height, img_width)) + + if coords != 'centroids': + raise ValueError("The DetectionOutput layer currently only supports the 'centroids' coordinate format.") + + # We need these members for the config. + self.confidence_thresh = confidence_thresh + self.iou_threshold = iou_threshold + self.top_k = top_k + self.normalize_coords = normalize_coords + self.img_height = img_height + self.img_width = img_width + self.coords = coords + self.nms_max_output_size = nms_max_output_size + + # We need these members for TensorFlow. 
+ self.tf_confidence_thresh = tf.constant(self.confidence_thresh, name='confidence_thresh') + self.tf_iou_threshold = tf.constant(self.iou_threshold, name='iou_threshold') + self.tf_top_k = tf.constant(self.top_k, name='top_k') + self.tf_normalize_coords = tf.constant(self.normalize_coords, name='normalize_coords') + self.tf_img_height = tf.constant(self.img_height, dtype=tf.float32, name='img_height') + self.tf_img_width = tf.constant(self.img_width, dtype=tf.float32, name='img_width') + self.tf_nms_max_output_size = tf.constant(self.nms_max_output_size, name='nms_max_output_size') + + super(DecodeDetectionsFast, self).__init__(**kwargs) + + def build(self, input_shape): + self.input_spec = [InputSpec(shape=input_shape)] + super(DecodeDetectionsFast, self).build(input_shape) + + def call(self, y_pred, mask=None): + ''' + Returns: + 3D tensor of shape `(batch_size, top_k, 6)`. The second axis is zero-padded + to always yield `top_k` predictions per batch item. The last axis contains + the coordinates for each predicted box in the format + `[class_id, confidence, xmin, ymin, xmax, ymax]`. + ''' + + ##################################################################################### + # 1. Convert the box coordinates from predicted anchor box offsets to predicted + # absolute coordinates + ##################################################################################### + + # Extract the predicted class IDs as the indices of the highest confidence values. + class_ids = tf.expand_dims(tf.to_float(tf.argmax(y_pred[...,:-12], axis=-1)), axis=-1) + # Extract the confidences of the maximal classes. + confidences = tf.reduce_max(y_pred[...,:-12], axis=-1, keep_dims=True) + + # Convert anchor box offsets to image offsets. + cx = y_pred[...,-12] * y_pred[...,-4] * y_pred[...,-6] + y_pred[...,-8] # cx = cx_pred * cx_variance * w_anchor + cx_anchor + cy = y_pred[...,-11] * y_pred[...,-3] * y_pred[...,-5] + y_pred[...,-7] # cy = cy_pred * cy_variance * h_anchor + cy_anchor + w = tf.exp(y_pred[...,-10] * y_pred[...,-2]) * y_pred[...,-6] # w = exp(w_pred * variance_w) * w_anchor + h = tf.exp(y_pred[...,-9] * y_pred[...,-1]) * y_pred[...,-5] # h = exp(h_pred * variance_h) * h_anchor + + # Convert 'centroids' to 'corners'. + xmin = cx - 0.5 * w + ymin = cy - 0.5 * h + xmax = cx + 0.5 * w + ymax = cy + 0.5 * h + + # If the model predicts box coordinates relative to the image dimensions and they are supposed + # to be converted back to absolute coordinates, do that. + def normalized_coords(): + xmin1 = tf.expand_dims(xmin * self.tf_img_width, axis=-1) + ymin1 = tf.expand_dims(ymin * self.tf_img_height, axis=-1) + xmax1 = tf.expand_dims(xmax * self.tf_img_width, axis=-1) + ymax1 = tf.expand_dims(ymax * self.tf_img_height, axis=-1) + return xmin1, ymin1, xmax1, ymax1 + def non_normalized_coords(): + return tf.expand_dims(xmin, axis=-1), tf.expand_dims(ymin, axis=-1), tf.expand_dims(xmax, axis=-1), tf.expand_dims(ymax, axis=-1) + + xmin, ymin, xmax, ymax = tf.cond(self.tf_normalize_coords, normalized_coords, non_normalized_coords) + + # Concatenate the one-hot class confidences and the converted box coordinates to form the decoded predictions tensor. + y_pred = tf.concat(values=[class_ids, confidences, xmin, ymin, xmax, ymax], axis=-1) + + ##################################################################################### + # 2. Perform confidence thresholding, non-maximum suppression, and top-k filtering. 
+ ##################################################################################### + + batch_size = tf.shape(y_pred)[0] # Output dtype: tf.int32 + n_boxes = tf.shape(y_pred)[1] + n_classes = y_pred.shape[2] - 4 + class_indices = tf.range(1, n_classes) + + # Create a function that filters the predictions for the given batch item. Specifically, it performs: + # - confidence thresholding + # - non-maximum suppression (NMS) + # - top-k filtering + def filter_predictions(batch_item): + + # Keep only the non-background boxes. + positive_boxes = tf.not_equal(batch_item[...,0], 0.0) + predictions = tf.boolean_mask(tensor=batch_item, + mask=positive_boxes) + + def perform_confidence_thresholding(): + # Apply confidence thresholding. + threshold_met = predictions[:,1] > self.tf_confidence_thresh + return tf.boolean_mask(tensor=predictions, + mask=threshold_met) + def no_positive_boxes(): + return tf.constant(value=0.0, shape=(1,6)) + + # If there are any positive predictions, perform confidence thresholding. + predictions_conf_thresh = tf.cond(tf.equal(tf.size(predictions), 0), no_positive_boxes, perform_confidence_thresholding) + + def perform_nms(): + scores = predictions_conf_thresh[...,1] + + # `tf.image.non_max_suppression()` needs the box coordinates in the format `(ymin, xmin, ymax, xmax)`. + xmin = tf.expand_dims(predictions_conf_thresh[...,-4], axis=-1) + ymin = tf.expand_dims(predictions_conf_thresh[...,-3], axis=-1) + xmax = tf.expand_dims(predictions_conf_thresh[...,-2], axis=-1) + ymax = tf.expand_dims(predictions_conf_thresh[...,-1], axis=-1) + boxes = tf.concat(values=[ymin, xmin, ymax, xmax], axis=-1) + + maxima_indices = tf.image.non_max_suppression(boxes=boxes, + scores=scores, + max_output_size=self.tf_nms_max_output_size, + iou_threshold=self.iou_threshold, + name='non_maximum_suppression') + maxima = tf.gather(params=predictions_conf_thresh, + indices=maxima_indices, + axis=0) + return maxima + def no_confident_predictions(): + return tf.constant(value=0.0, shape=(1,6)) + + # If any boxes made the threshold, perform NMS. + predictions_nms = tf.cond(tf.equal(tf.size(predictions_conf_thresh), 0), no_confident_predictions, perform_nms) + + # Perform top-k filtering for this batch item or pad it in case there are + # fewer than `self.top_k` boxes left at this point. Either way, produce a + # tensor of length `self.top_k`. By the time we return the final results tensor + # for the whole batch, all batch items must have the same number of predicted + # boxes so that the tensor dimensions are homogeneous. If fewer than `self.top_k` + # predictions are left after the filtering process above, we pad the missing + # predictions with zeros as dummy entries. + def top_k(): + return tf.gather(params=predictions_nms, + indices=tf.nn.top_k(predictions_nms[:, 1], k=self.tf_top_k, sorted=True).indices, + axis=0) + def pad_and_top_k(): + padded_predictions = tf.pad(tensor=predictions_nms, + paddings=[[0, self.tf_top_k - tf.shape(predictions_nms)[0]], [0, 0]], + mode='CONSTANT', + constant_values=0.0) + return tf.gather(params=padded_predictions, + indices=tf.nn.top_k(padded_predictions[:, 1], k=self.tf_top_k, sorted=True).indices, + axis=0) + + top_k_boxes = tf.cond(tf.greater_equal(tf.shape(predictions_nms)[0], self.tf_top_k), top_k, pad_and_top_k) + + return top_k_boxes + + # Iterate `filter_predictions()` over all batch items.
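The comments in step 1 above spell out how the predicted offsets, the anchor boxes, and the variances combine. The following stand-alone NumPy sketch (not part of the patch; all numbers are invented) traces that arithmetic for a single box in the 'centroids' format and then converts it to absolute corner coordinates, mirroring the `normalized_coords()` branch.

import numpy as np

offsets   = np.array([0.2, -0.1, 0.05, 0.3])   # predicted (cx, cy, w, h) offsets for one box
anchor    = np.array([0.5, 0.5, 0.2, 0.4])     # matching anchor box (cx, cy, w, h), relative coordinates
variances = np.array([0.1, 0.1, 0.2, 0.2])     # the variance values the offsets were divided by during encoding

cx = offsets[0] * variances[0] * anchor[2] + anchor[0]   # cx = cx_pred * cx_variance * w_anchor + cx_anchor
cy = offsets[1] * variances[1] * anchor[3] + anchor[1]   # cy = cy_pred * cy_variance * h_anchor + cy_anchor
w  = np.exp(offsets[2] * variances[2]) * anchor[2]       # w = exp(w_pred * variance_w) * w_anchor
h  = np.exp(offsets[3] * variances[3]) * anchor[3]       # h = exp(h_pred * variance_h) * h_anchor

# 'centroids' -> 'corners', then back to absolute pixel coordinates, which is
# what the `normalized_coords()` branch does when `normalize_coords=True`.
img_height, img_width = 300, 300
xmin, xmax = (cx - 0.5 * w) * img_width,  (cx + 0.5 * w) * img_width
ymin, ymax = (cy - 0.5 * h) * img_height, (cy + 0.5 * h) * img_height
print(xmin, ymin, xmax, ymax)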
+ output_tensor = tf.map_fn(fn=lambda x: filter_predictions(x), + elems=y_pred, + dtype=None, + parallel_iterations=128, + back_prop=False, + swap_memory=False, + infer_shape=True, + name='loop_over_batch') + + return output_tensor + + def compute_output_shape(self, input_shape): + batch_size, n_boxes, last_axis = input_shape + return (batch_size, self.tf_top_k, 6) # Last axis: (class_ID, confidence, 4 box coordinates) + + def get_config(self): + config = { + 'confidence_thresh': self.confidence_thresh, + 'iou_threshold': self.iou_threshold, + 'top_k': self.top_k, + 'nms_max_output_size': self.nms_max_output_size, + 'coords': self.coords, + 'normalize_coords': self.normalize_coords, + 'img_height': self.img_height, + 'img_width': self.img_width, + } + base_config = super(DecodeDetectionsFast, self).get_config() + return dict(list(base_config.items()) + list(config.items())) diff --git a/keras_ssd/keras_layers/keras_layer_L2Normalization.py b/keras_ssd/keras_layers/keras_layer_L2Normalization.py new file mode 100644 index 0000000..e2c71bf --- /dev/null +++ b/keras_ssd/keras_layers/keras_layer_L2Normalization.py @@ -0,0 +1,70 @@ +''' +A custom Keras layer to perform L2-normalization. + +Copyright (C) 2018 Pierluigi Ferrari + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +''' + +from __future__ import division +import numpy as np +import keras.backend as K +from keras.engine.topology import InputSpec +from keras.engine.topology import Layer + +class L2Normalization(Layer): + ''' + Performs L2 normalization on the input tensor with a learnable scaling parameter + as described in the paper "Parsenet: Looking Wider to See Better" (see references) + and as used in the original SSD model. + + Arguments: + gamma_init (int): The initial scaling parameter. Defaults to 20 following the + SSD paper. + + Input shape: + 4D tensor of shape `(batch, channels, height, width)` if `dim_ordering = 'th'` + or `(batch, height, width, channels)` if `dim_ordering = 'tf'`. + + Returns: + The scaled tensor. Same shape as the input tensor. 
+ + References: + http://cs.unc.edu/~wliu/papers/parsenet.pdf + ''' + + def __init__(self, gamma_init=20, **kwargs): + if K.image_dim_ordering() == 'tf': + self.axis = 3 + else: + self.axis = 1 + self.gamma_init = gamma_init + super(L2Normalization, self).__init__(**kwargs) + + def build(self, input_shape): + self.input_spec = [InputSpec(shape=input_shape)] + gamma = self.gamma_init * np.ones((input_shape[self.axis],)) + self.gamma = K.variable(gamma, name='{}_gamma'.format(self.name)) + self.trainable_weights = [self.gamma] + super(L2Normalization, self).build(input_shape) + + def call(self, x, mask=None): + output = K.l2_normalize(x, self.axis) + return output * self.gamma + + def get_config(self): + config = { + 'gamma_init': self.gamma_init + } + base_config = super(L2Normalization, self).get_config() + return dict(list(base_config.items()) + list(config.items())) diff --git a/keras_ssd/keras_loss_function/__init__.py b/keras_ssd/keras_loss_function/__init__.py new file mode 100644 index 0000000..e69de29 diff --git a/keras_ssd/keras_loss_function/keras_ssd_loss.py b/keras_ssd/keras_loss_function/keras_ssd_loss.py new file mode 100644 index 0000000..83567f5 --- /dev/null +++ b/keras_ssd/keras_loss_function/keras_ssd_loss.py @@ -0,0 +1,211 @@ +''' +The Keras-compatible loss function for the SSD model. Currently supports TensorFlow only. + +Copyright (C) 2018 Pierluigi Ferrari + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +''' + +from __future__ import division +import tensorflow as tf + +class SSDLoss: + ''' + The SSD loss, see https://arxiv.org/abs/1512.02325. + ''' + + def __init__(self, + neg_pos_ratio=3, + n_neg_min=0, + alpha=1.0): + ''' + Arguments: + neg_pos_ratio (int, optional): The maximum ratio of negative (i.e. background) + to positive ground truth boxes to include in the loss computation. + There are no actual background ground truth boxes of course, but `y_true` + contains anchor boxes labeled with the background class. Since + the number of background boxes in `y_true` will usually exceed + the number of positive boxes by far, it is necessary to balance + their influence on the loss. Defaults to 3 following the paper. + n_neg_min (int, optional): The minimum number of negative ground truth boxes to + enter the loss computation *per batch*. This argument can be used to make + sure that the model learns from a minimum number of negatives in batches + in which there are very few, or even none at all, positive ground truth + boxes. It defaults to 0 and if used, it should be set to a value that + stands in reasonable proportion to the batch size used for training. + alpha (float, optional): A factor to weight the localization loss in the + computation of the total loss. Defaults to 1.0 following the paper. + ''' + self.neg_pos_ratio = neg_pos_ratio + self.n_neg_min = n_neg_min + self.alpha = alpha + + def smooth_L1_loss(self, y_true, y_pred): + ''' + Compute smooth L1 loss, see references. + + Arguments: + y_true (nD tensor): A TensorFlow tensor of any shape containing the ground truth data. 
+ In this context, the expected tensor has shape `(batch_size, #boxes, 4)` and + contains the ground truth bounding box coordinates, where the last dimension + contains `(xmin, xmax, ymin, ymax)`. + y_pred (nD tensor): A TensorFlow tensor of identical structure to `y_true` containing + the predicted data, in this context the predicted bounding box coordinates. + + Returns: + The smooth L1 loss, a nD-1 Tensorflow tensor. In this context a 2D tensor + of shape (batch, n_boxes_total). + + References: + https://arxiv.org/abs/1504.08083 + ''' + absolute_loss = tf.abs(y_true - y_pred) + square_loss = 0.5 * (y_true - y_pred)**2 + l1_loss = tf.where(tf.less(absolute_loss, 1.0), square_loss, absolute_loss - 0.5) + return tf.reduce_sum(l1_loss, axis=-1) + + def log_loss(self, y_true, y_pred): + ''' + Compute the softmax log loss. + + Arguments: + y_true (nD tensor): A TensorFlow tensor of any shape containing the ground truth data. + In this context, the expected tensor has shape (batch_size, #boxes, #classes) + and contains the ground truth bounding box categories. + y_pred (nD tensor): A TensorFlow tensor of identical structure to `y_true` containing + the predicted data, in this context the predicted bounding box categories. + + Returns: + The softmax log loss, a nD-1 Tensorflow tensor. In this context a 2D tensor + of shape (batch, n_boxes_total). + ''' + # Make sure that `y_pred` doesn't contain any zeros (which would break the log function) + y_pred = tf.maximum(y_pred, 1e-15) + # Compute the log loss + log_loss = -tf.reduce_sum(y_true * tf.log(y_pred), axis=-1) + return log_loss + + def compute_loss(self, y_true, y_pred): + ''' + Compute the loss of the SSD model prediction against the ground truth. + + Arguments: + y_true (array): A Numpy array of shape `(batch_size, #boxes, #classes + 12)`, + where `#boxes` is the total number of boxes that the model predicts + per image. Be careful to make sure that the index of each given + box in `y_true` is the same as the index for the corresponding + box in `y_pred`. The last axis must have length `#classes + 12` and contain + `[classes one-hot encoded, 4 ground truth box coordinate offsets, 8 arbitrary entries]` + in this order, including the background class. The last eight entries of the + last axis are not used by this function and therefore their contents are + irrelevant, they only exist so that `y_true` has the same shape as `y_pred`, + where the last four entries of the last axis contain the anchor box + coordinates, which are needed during inference. Important: Boxes that + you want the cost function to ignore need to have a one-hot + class vector of all zeros. + y_pred (Keras tensor): The model prediction. The shape is identical + to that of `y_true`, i.e. `(batch_size, #boxes, #classes + 12)`. + The last axis must contain entries in the format + `[classes one-hot encoded, 4 predicted box coordinate offsets, 8 arbitrary entries]`. + + Returns: + A scalar, the total multitask loss for classification and localization. + ''' + self.neg_pos_ratio = tf.constant(self.neg_pos_ratio) + self.n_neg_min = tf.constant(self.n_neg_min) + self.alpha = tf.constant(self.alpha) + + batch_size = tf.shape(y_pred)[0] # Output dtype: tf.int32 + n_boxes = tf.shape(y_pred)[1] # Output dtype: tf.int32, note that `n_boxes` in this context denotes the total number of boxes per image, not the number of boxes per cell. + + # 1: Compute the losses for class and box predictions for every box. 
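To make the two per-box losses defined above concrete, here is a stand-alone NumPy evaluation on toy values (one box, three classes); the numbers are invented and the snippet is illustration only.

import numpy as np

# Smooth L1 on the four box offsets: 0.5*x^2 where |x| < 1, and |x| - 0.5 otherwise, summed over the coordinates.
y_true_loc = np.array([0.5, -0.2, 1.8, 0.0])
y_pred_loc = np.array([0.3,  0.1, 0.2, 0.1])
diff = np.abs(y_true_loc - y_pred_loc)
smooth_l1 = np.sum(np.where(diff < 1.0, 0.5 * diff ** 2, diff - 0.5))

# Softmax log loss on the class confidences: -sum(y_true * log(y_pred)), with y_pred clipped away from zero.
y_true_cls = np.array([0.0, 1.0, 0.0])           # one-hot ground truth
y_pred_cls = np.array([0.1, 0.7, 0.2])           # softmax output
log_loss = -np.sum(y_true_cls * np.log(np.maximum(y_pred_cls, 1e-15)))

print(smooth_l1, log_loss)                       # roughly 1.17 and 0.36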
+ + classification_loss = tf.to_float(self.log_loss(y_true[:,:,:-12], y_pred[:,:,:-12])) # Output shape: (batch_size, n_boxes) + localization_loss = tf.to_float(self.smooth_L1_loss(y_true[:,:,-12:-8], y_pred[:,:,-12:-8])) # Output shape: (batch_size, n_boxes) + + # 2: Compute the classification losses for the positive and negative targets. + + # Create masks for the positive and negative ground truth classes. + negatives = y_true[:,:,0] # Tensor of shape (batch_size, n_boxes) + positives = tf.to_float(tf.reduce_max(y_true[:,:,1:-12], axis=-1)) # Tensor of shape (batch_size, n_boxes) + + # Count the number of positive boxes (classes 1 to n) in y_true across the whole batch. + n_positive = tf.reduce_sum(positives) + + # Now mask all negative boxes and sum up the losses for the positive boxes PER batch item + # (Keras loss functions must output one scalar loss value PER batch item, rather than just + # one scalar for the entire batch, that's why we're not summing across all axes). + pos_class_loss = tf.reduce_sum(classification_loss * positives, axis=-1) # Tensor of shape (batch_size,) + + # Compute the classification loss for the negative default boxes (if there are any). + + # First, compute the classification loss for all negative boxes. + neg_class_loss_all = classification_loss * negatives # Tensor of shape (batch_size, n_boxes) + n_neg_losses = tf.count_nonzero(neg_class_loss_all, dtype=tf.int32) # The number of non-zero loss entries in `neg_class_loss_all` + # What's the point of `n_neg_losses`? For the next step, which will be to compute which negative boxes enter the classification + # loss, we don't just want to know how many negative ground truth boxes there are, but for how many of those there actually is + # a positive (i.e. non-zero) loss. This is necessary because `tf.nn.top_k()` in the function below will pick the top k boxes with + # the highest losses no matter what, even if it receives a vector where all losses are zero. In the unlikely event that all negative + # classification losses ARE actually zero though, this behavior might lead to `tf.nn.top_k()` returning the indices of positive + # boxes, leading to an incorrect negative classification loss computation, and hence an incorrect overall loss computation. + # We therefore need to make sure that `n_negative_keep`, which assumes the role of the `k` argument in `tf.nn.top_k()`, + # is at most the number of negative boxes for which there is a positive classification loss. + + # Compute the number of negative examples we want to account for in the loss. + # We'll keep at most `self.neg_pos_ratio` times the number of positives in `y_true`, but at least `self.n_neg_min` (unless `n_neg_losses` is smaller). + n_negative_keep = tf.minimum(tf.maximum(self.neg_pos_ratio * tf.to_int32(n_positive), self.n_neg_min), n_neg_losses) + + # In the unlikely case when either (1) there are no negative ground truth boxes at all + # or (2) the classification loss for all negative boxes is zero, return zero as the `neg_class_loss`. + def f1(): + return tf.zeros([batch_size]) + # Otherwise compute the negative loss. + def f2(): + # Now we'll identify the top-k (where k == `n_negative_keep`) boxes with the highest confidence loss that + # belong to the background class in the ground truth data. Note that this doesn't necessarily mean that the model + # predicted the wrong class for those boxes, it just means that the loss for those boxes is the highest. + + # To do this, we reshape `neg_class_loss_all` to 1D...
+ neg_class_loss_all_1D = tf.reshape(neg_class_loss_all, [-1]) # Tensor of shape (batch_size * n_boxes,) + # ...and then we get the indices for the `n_negative_keep` boxes with the highest loss out of those... + values, indices = tf.nn.top_k(neg_class_loss_all_1D, + k=n_negative_keep, + sorted=False) # We don't need them sorted. + # ...and with these indices we'll create a mask... + negatives_keep = tf.scatter_nd(indices=tf.expand_dims(indices, axis=1), + updates=tf.ones_like(indices, dtype=tf.int32), + shape=tf.shape(neg_class_loss_all_1D)) # Tensor of shape (batch_size * n_boxes,) + negatives_keep = tf.to_float(tf.reshape(negatives_keep, [batch_size, n_boxes])) # Tensor of shape (batch_size, n_boxes) + # ...and use it to keep only those boxes and mask all other classification losses + neg_class_loss = tf.reduce_sum(classification_loss * negatives_keep, axis=-1) # Tensor of shape (batch_size,) + return neg_class_loss + + neg_class_loss = tf.cond(tf.equal(n_neg_losses, tf.constant(0)), f1, f2) + + class_loss = pos_class_loss + neg_class_loss # Tensor of shape (batch_size,) + + # 3: Compute the localization loss for the positive targets. + # We don't compute a localization loss for negative predicted boxes (obviously: there are no ground truth boxes they would correspond to). + + loc_loss = tf.reduce_sum(localization_loss * positives, axis=-1) # Tensor of shape (batch_size,) + + # 4: Compute the total loss. + + total_loss = (class_loss + self.alpha * loc_loss) / tf.maximum(1.0, n_positive) # In case `n_positive == 0` + # Keras has the annoying habit of dividing the loss by the batch size, which sucks in our case + # because the relevant criterion to average our loss over is the number of positive boxes in the batch + # (by which we're dividing in the line above), not the batch size. So in order to revert Keras' averaging + # over the batch size, we'll have to multiply by it. + total_loss = total_loss * tf.to_float(batch_size) + + return total_loss diff --git a/keras_ssd/misc_utils/__init__.py b/keras_ssd/misc_utils/__init__.py new file mode 100644 index 0000000..e69de29 diff --git a/keras_ssd/misc_utils/tensor_sampling_utils.py b/keras_ssd/misc_utils/tensor_sampling_utils.py new file mode 100644 index 0000000..a27ce1d --- /dev/null +++ b/keras_ssd/misc_utils/tensor_sampling_utils.py @@ -0,0 +1,177 @@ +''' +Utilities that are useful to sub- or up-sample weights tensors. + +Copyright (C) 2018 Pierluigi Ferrari + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +''' + +import numpy as np + +def sample_tensors(weights_list, sampling_instructions, axes=None, init=None, mean=0.0, stddev=0.005): + ''' + Can sub-sample and/or up-sample individual dimensions of the tensors in the given list + of input tensors. + + It is possible to sub-sample some dimensions and up-sample other dimensions at the same time. + + The tensors in the list will be sampled consistently, i.e. for any given dimension that + corresponds among all tensors in the list, the same elements will be picked for every tensor + along that dimension. 
+ + For dimensions that are being sub-sampled, you can either provide a list of the indices + that should be picked, or you can provide the number of elements to be sub-sampled, in which + case the elements will be chosen at random. + + For dimensions that are being up-sampled, "filler" elements will be inserted at random + positions along the respective dimension. These filler elements will be initialized either + with zero or from a normal distribution with selectable mean and standard deviation. + + Arguments: + weights_list (list): A list of Numpy arrays. Each array represents one of the tensors + to be sampled. The tensor with the greatest number of dimensions must be the first + element in the list. For example, in the case of the weights of a 2D convolutional + layer, the kernel must be the first element in the list and the bias the second, + not the other way around. For all tensors in the list after the first tensor, the + lengths of each of their axes must be identical to the length of some axis of the + first tensor. + sampling_instructions (list): A list that contains the sampling instructions for each + dimension of the first tensor. If the first tensor has `n` dimensions, then this + must be a list of length `n`. That means sampling instructions for every dimension + of the first tensor must still be given even if not all dimensions should be changed. + The elements of this list can be either lists of integers or integers. If the sampling + instruction for a given dimension is a list of integers, then these integers represent + the indices of the elements of that dimension that will be sub-sampled. If the sampling + instruction for a given dimension is an integer, then that number of elements will be + sampled along said dimension. If the integer is greater than the number of elements + of the input tensors in that dimension, that dimension will be up-sampled. If the integer + is smaller than the number of elements of the input tensors in that dimension, that + dimension will be sub-sampled. If the integer is equal to the number of elements + of the input tensors in that dimension, that dimension will remain the same. + axes (list, optional): Only relevant if `weights_list` contains more than one tensor. + This list contains a list for each additional tensor in `weights_list` beyond the first. + Each of these lists contains integers that determine to which axes of the first tensor + the axes of the respective tensor correspond. For example, let the first tensor be a + 4D tensor and the second tensor in the list be a 2D tensor. If the first element of + `axes` is the list `[2,3]`, then that means that the two axes of the second tensor + correspond to the last two axes of the first tensor, in the same order. The point of + this list is for the program to know, if a given dimension of the first tensor is to + be sub- or up-sampled, which dimensions of the other tensors in the list must be + sub- or up-sampled accordingly. + init (list, optional): Only relevant for up-sampling. Must be `None` or a list of strings + that determines for each tensor in `weights_list` how the newly inserted values should + be initialized. The possible values are 'gaussian' for initialization from a normal + distribution with the selected mean and standard deviation (see the following two arguments), + or 'zeros' for zero-initialization. If `None`, all initializations default to + 'gaussian'. + mean (float, optional): Only relevant for up-sampling.
The mean of the values that will + be inserted into the tensors at random in the case of up-sampling. + stddev (float, optional): Only relevant for up-sampling. The standard deviation of the + values that will be inserted into the tensors at random in the case of up-sampling. + + Returns: + A list containing the sampled tensors in the same order in which they were given. + ''' + + first_tensor = weights_list[0] + + if (not isinstance(sampling_instructions, (list, tuple))) or (len(sampling_instructions) != first_tensor.ndim): + raise ValueError("The sampling instructions must be a list whose length is the number of dimensions of the first tensor in `weights_list`.") + + if (not init is None) and len(init) != len(weights_list): + raise ValueError("`init` must either be `None` or a list of strings that has the same length as `weights_list`.") + + up_sample = [] # Store the dimensions along which we need to up-sample. + out_shape = [] # Store the shape of the output tensor here. + # Store two stages of the new (sub-sampled and/or up-sampled) weights tensors in the following two lists. + subsampled_weights_list = [] # Tensors after sub-sampling, but before up-sampling (if any). + upsampled_weights_list = [] # Sub-sampled tensors after up-sampling (if any), i.e. final output tensors. + + # Create the slicing arrays from the sampling instructions. + sampling_slices = [] + for i, sampling_inst in enumerate(sampling_instructions): + if isinstance(sampling_inst, (list, tuple)): + amax = np.amax(np.array(sampling_inst)) + if amax >= first_tensor.shape[i]: + raise ValueError("The sample instructions for dimension {} contain index {}, which is greater than the length of that dimension.".format(i, amax)) + sampling_slices.append(np.array(sampling_inst)) + out_shape.append(len(sampling_inst)) + elif isinstance(sampling_inst, int): + out_shape.append(sampling_inst) + if sampling_inst == first_tensor.shape[i]: + # Nothing to sample here, we're keeping the original number of elements along this axis. + sampling_slice = np.arange(sampling_inst) + sampling_slices.append(sampling_slice) + elif sampling_inst < first_tensor.shape[i]: + # We want to SUB-sample this dimension. Randomly pick `sample_inst` many elements from it. + sampling_slice1 = np.array([0]) # We will always sample class 0, the background class. + # Sample the rest of the classes. + sampling_slice2 = np.sort(np.random.choice(np.arange(1, first_tensor.shape[i]), sampling_inst - 1, replace=False)) + sampling_slice = np.concatenate([sampling_slice1, sampling_slice2]) + sampling_slices.append(sampling_slice) + else: + # We want to UP-sample. Pick all elements from this dimension. + sampling_slice = np.arange(first_tensor.shape[i]) + sampling_slices.append(sampling_slice) + up_sample.append(i) + else: + raise ValueError("Each element of the sampling instructions must be either an integer or a list/tuple of integers, but received `{}`".format(type(sampling_inst))) + + # Process the first tensor. + subsampled_first_tensor = np.copy(first_tensor[np.ix_(*sampling_slices)]) + subsampled_weights_list.append(subsampled_first_tensor) + + # Process the other tensors. + if len(weights_list) > 1: + for j in range(1, len(weights_list)): + this_sampling_slices = [sampling_slices[i] for i in axes[j-1]] # Get the sampling slices for this tensor. + subsampled_weights_list.append(np.copy(weights_list[j][np.ix_(*this_sampling_slices)])) + + if up_sample: + # Take care of the dimensions that are to be up-sampled. 
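Since the sub-sampling path of the function is complete at this point, a typical call looks like the following stand-alone sketch. The shapes are invented, and the import path assumes the repository root (the `keras_ssd` directory) is on the Python path.

import numpy as np
from misc_utils.tensor_sampling_utils import sample_tensors

kernel = np.random.rand(3, 3, 64, 100)   # hypothetical conv kernel: (kh, kw, in_channels, out_channels)
bias = np.random.rand(100)               # one bias per output channel

# Keep the first three axes as they are and sub-sample the last axis down to 40 channels.
# `axes=[[3]]` tells the function that the bias' single axis corresponds to the kernel's axis 3,
# so the same 40 channel indices are picked for both tensors.
sampled_kernel, sampled_bias = sample_tensors(weights_list=[kernel, bias],
                                              sampling_instructions=[3, 3, 64, 40],
                                              axes=[[3]],
                                              init=None)

print(sampled_kernel.shape, sampled_bias.shape)   # (3, 3, 64, 40) (40,)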
+ + out_shape = np.array(out_shape) + + # Process the first tensor. + if init is None or init[0] == 'gaussian': + upsampled_first_tensor = np.random.normal(loc=mean, scale=stddev, size=out_shape) + elif init[0] == 'zeros': + upsampled_first_tensor = np.zeros(out_shape) + else: + raise ValueError("Valid initializations are 'gaussian' and 'zeros', but received '{}'.".format(init[0])) + # Pick the indices of the elements in `upsampled_first_tensor` that should be occupied by `subsampled_first_tensor`. + up_sample_slices = [np.arange(k) for k in subsampled_first_tensor.shape] + for i in up_sample: + # Randomly select across which indices of this dimension to scatter the elements of `new_weights_tensor` in this dimension. + up_sample_slice1 = np.array([0]) + up_sample_slice2 = np.sort(np.random.choice(np.arange(1, upsampled_first_tensor.shape[i]), subsampled_first_tensor.shape[i] - 1, replace=False)) + up_sample_slices[i] = np.concatenate([up_sample_slice1, up_sample_slice2]) + upsampled_first_tensor[np.ix_(*up_sample_slices)] = subsampled_first_tensor + upsampled_weights_list.append(upsampled_first_tensor) + + # Process the other tensors + if len(weights_list) > 1: + for j in range(1, len(weights_list)): + if init is None or init[j] == 'gaussian': + upsampled_tensor = np.random.normal(loc=mean, scale=stddev, size=out_shape[axes[j-1]]) + elif init[j] == 'zeros': + upsampled_tensor = np.zeros(out_shape[axes[j-1]]) + else: + raise ValueError("Valid initializations are 'gaussian' and 'zeros', but received '{}'.".format(init[j])) + this_up_sample_slices = [up_sample_slices[i] for i in axes[j-1]] # Get the up-sampling slices for this tensor. + upsampled_tensor[np.ix_(*this_up_sample_slices)] = subsampled_weights_list[j] + upsampled_weights_list.append(upsampled_tensor) + + return upsampled_weights_list + else: + return subsampled_weights_list diff --git a/keras_ssd/models/__init__.py b/keras_ssd/models/__init__.py new file mode 100644 index 0000000..e69de29 diff --git a/keras_ssd/models/keras_ssd300.py b/keras_ssd/models/keras_ssd300.py new file mode 100644 index 0000000..6aed701 --- /dev/null +++ b/keras_ssd/models/keras_ssd300.py @@ -0,0 +1,457 @@ +''' +A Keras port of the original Caffe SSD300 network. + +Copyright (C) 2018 Pierluigi Ferrari + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
+''' + +from __future__ import division +import numpy as np +from keras.models import Model +from keras.layers import Input, Lambda, Activation, Conv2D, MaxPooling2D, ZeroPadding2D, Reshape, Concatenate +from keras.regularizers import l2 +import keras.backend as K + +from keras_layers.keras_layer_AnchorBoxes import AnchorBoxes +from keras_layers.keras_layer_L2Normalization import L2Normalization +from keras_layers.keras_layer_DecodeDetections import DecodeDetections +from keras_layers.keras_layer_DecodeDetectionsFast import DecodeDetectionsFast + +def ssd_300(image_size, + n_classes, + mode='training', + l2_regularization=0.0005, + min_scale=None, + max_scale=None, + scales=None, + aspect_ratios_global=None, + aspect_ratios_per_layer=[[1.0, 2.0, 0.5], + [1.0, 2.0, 0.5, 3.0, 1.0/3.0], + [1.0, 2.0, 0.5, 3.0, 1.0/3.0], + [1.0, 2.0, 0.5, 3.0, 1.0/3.0], + [1.0, 2.0, 0.5], + [1.0, 2.0, 0.5]], + two_boxes_for_ar1=True, + steps=[8, 16, 32, 64, 100, 300], + offsets=None, + clip_boxes=False, + variances=[0.1, 0.1, 0.2, 0.2], + coords='centroids', + normalize_coords=True, + subtract_mean=[123, 117, 104], + divide_by_stddev=None, + swap_channels=[2, 1, 0], + confidence_thresh=0.01, + iou_threshold=0.45, + top_k=200, + nms_max_output_size=400, + return_predictor_sizes=False): + ''' + Build a Keras model with SSD300 architecture, see references. + + The base network is a reduced atrous VGG-16, extended by the SSD architecture, + as described in the paper. + + Most of the arguments that this function takes are only needed for the anchor + box layers. In case you're training the network, the parameters passed here must + be the same as the ones used to set up `SSDBoxEncoder`. In case you're loading + trained weights, the parameters passed here must be the same as the ones used + to produce the trained weights. + + Some of these arguments are explained in more detail in the documentation of the + `SSDBoxEncoder` class. + + Note: Requires Keras v2.0 or later. Currently works only with the + TensorFlow backend (v1.0 or later). + + Arguments: + image_size (tuple): The input image size in the format `(height, width, channels)`. + n_classes (int): The number of positive classes, e.g. 20 for Pascal VOC, 80 for MS COCO. + mode (str, optional): One of 'training', 'inference' and 'inference_fast'. In 'training' mode, + the model outputs the raw prediction tensor, while in 'inference' and 'inference_fast' modes, + the raw predictions are decoded into absolute coordinates and filtered via confidence thresholding, + non-maximum suppression, and top-k filtering. The difference between latter two modes is that + 'inference' follows the exact procedure of the original Caffe implementation, while + 'inference_fast' uses a faster prediction decoding procedure. + l2_regularization (float, optional): The L2-regularization rate. Applies to all convolutional layers. + Set to zero to deactivate L2-regularization. + min_scale (float, optional): The smallest scaling factor for the size of the anchor boxes as a fraction + of the shorter side of the input images. + max_scale (float, optional): The largest scaling factor for the size of the anchor boxes as a fraction + of the shorter side of the input images. All scaling factors between the smallest and the + largest will be linearly interpolated. 
Note that the second to last of the linearly interpolated + scaling factors will actually be the scaling factor for the last predictor layer, while the last + scaling factor is used for the second box for aspect ratio 1 in the last predictor layer + if `two_boxes_for_ar1` is `True`. + scales (list, optional): A list of floats containing scaling factors per convolutional predictor layer. + This list must be one element longer than the number of predictor layers. The first `k` elements are the + scaling factors for the `k` predictor layers, while the last element is used for the second box + for aspect ratio 1 in the last predictor layer if `two_boxes_for_ar1` is `True`. This additional + last scaling factor must be passed either way, even if it is not being used. If a list is passed, + this argument overrides `min_scale` and `max_scale`. All scaling factors must be greater than zero. + aspect_ratios_global (list, optional): The list of aspect ratios for which anchor boxes are to be + generated. This list is valid for all prediction layers. + aspect_ratios_per_layer (list, optional): A list containing one aspect ratio list for each prediction layer. + This allows you to set the aspect ratios for each predictor layer individually, which is the case for the + original SSD300 implementation. If a list is passed, it overrides `aspect_ratios_global`. + two_boxes_for_ar1 (bool, optional): Only relevant for aspect ratio lists that contain 1. Will be ignored otherwise. + If `True`, two anchor boxes will be generated for aspect ratio 1. The first will be generated + using the scaling factor for the respective layer, the second one will be generated using + the geometric mean of said scaling factor and the next bigger scaling factor. + steps (list, optional): `None` or a list with as many elements as there are predictor layers. The elements can be + either ints/floats or tuples of two ints/floats. These numbers represent for each predictor layer how many + pixels apart the anchor box center points should be vertically and horizontally along the spatial grid over + the image. If the list contains ints/floats, then that value will be used for both spatial dimensions. + If the list contains tuples of two ints/floats, then they represent `(step_height, step_width)`. + If no steps are provided, then they will be computed such that the anchor box center points will form an + equidistant grid within the image dimensions. + offsets (list, optional): `None` or a list with as many elements as there are predictor layers. The elements can be + either floats or tuples of two floats. These numbers represent for each predictor layer how many + pixels from the top and left borders of the image the top-most and left-most anchor box center points should be + as a fraction of `steps`. The last bit is important: The offsets are not absolute pixel values, but fractions + of the step size specified in the `steps` argument. If the list contains floats, then that value will + be used for both spatial dimensions. If the list contains tuples of two floats, then they represent + `(vertical_offset, horizontal_offset)`. If no offsets are provided, then they will default to 0.5 of the step size. + clip_boxes (bool, optional): If `True`, clips the anchor box coordinates to stay within image boundaries. + variances (list, optional): A list of 4 floats >0. The anchor box offset for each coordinate will be divided by + its respective variance value. + coords (str, optional): The box coordinate format to be used internally by the model (i.e.
this is not the input format + of the ground truth labels). Can be either 'centroids' for the format `(cx, cy, w, h)` (box center coordinates, width, + and height), 'minmax' for the format `(xmin, xmax, ymin, ymax)`, or 'corners' for the format `(xmin, ymin, xmax, ymax)`. + normalize_coords (bool, optional): Set to `True` if the model is supposed to use relative instead of absolute coordinates, + i.e. if the model predicts box coordinates within [0,1] instead of absolute coordinates. + subtract_mean (array-like, optional): `None` or an array-like object of integers or floating point values + of any shape that is broadcast-compatible with the image shape. The elements of this array will be + subtracted from the image pixel intensity values. For example, pass a list of three integers + to perform per-channel mean normalization for color images. + divide_by_stddev (array-like, optional): `None` or an array-like object of non-zero integers or + floating point values of any shape that is broadcast-compatible with the image shape. The image pixel + intensity values will be divided by the elements of this array. For example, pass a list + of three integers to perform per-channel standard deviation normalization for color images. + swap_channels (list, optional): Either `False` or a list of integers representing the desired order in which the input + image channels should be swapped. + confidence_thresh (float, optional): A float in [0,1), the minimum classification confidence in a specific + positive class in order to be considered for the non-maximum suppression stage for the respective class. + A lower value will result in a larger part of the selection process being done by the non-maximum suppression + stage, while a larger value will result in a larger part of the selection process happening in the confidence + thresholding stage. + iou_threshold (float, optional): A float in [0,1]. All boxes that have a Jaccard similarity of greater than `iou_threshold` + with a locally maximal box will be removed from the set of predictions for a given class, where 'maximal' refers + to the box's confidence score. + top_k (int, optional): The number of highest scoring predictions to be kept for each batch item after the + non-maximum suppression stage. + nms_max_output_size (int, optional): The maximal number of predictions that will be left over after the NMS stage. + return_predictor_sizes (bool, optional): If `True`, this function not only returns the model, but also + a list containing the spatial dimensions of the predictor layers. This isn't strictly necessary since + you can always get their sizes easily via the Keras API, but it's convenient and less error-prone + to get them this way. They are only relevant for training anyway (SSDBoxEncoder needs to know the + spatial dimensions of the predictor layers), for inference you don't need them. + + Returns: + model: The Keras SSD300 model. + predictor_sizes (optional): A Numpy array containing the `(height, width)` portion + of the output tensor shape for each convolutional predictor layer. During + training, the generator function needs this in order to transform + the ground truth labels into tensors of identical structure as the + output tensors of the model, which is in turn needed for the cost + function. + + References: + https://arxiv.org/abs/1512.02325v5 + ''' + + n_predictor_layers = 6 # The number of predictor conv layers in the network is 6 for the original SSD300. + n_classes += 1 # Account for the background class. 
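As a worked example of the anchor-box bookkeeping that follows (illustration only; the `min_scale`/`max_scale` values are made up, while the aspect ratio lists are the defaults from the signature above):

import numpy as np

n_predictor_layers = 6
min_scale, max_scale = 0.1, 0.9                  # hypothetical values; any valid pair works the same way
scales = np.linspace(min_scale, max_scale, n_predictor_layers + 1)
# -> [0.1, 0.233, 0.367, 0.5, 0.633, 0.767, 0.9]: one scale per predictor layer plus the extra
#    scale needed for the second aspect-ratio-1 box of the last layer.

aspect_ratios_per_layer = [[1.0, 2.0, 0.5],
                           [1.0, 2.0, 0.5, 3.0, 1.0/3.0],
                           [1.0, 2.0, 0.5, 3.0, 1.0/3.0],
                           [1.0, 2.0, 0.5, 3.0, 1.0/3.0],
                           [1.0, 2.0, 0.5],
                           [1.0, 2.0, 0.5]]
two_boxes_for_ar1 = True
n_boxes = [len(ar) + 1 if (1.0 in ar and two_boxes_for_ar1) else len(ar)
           for ar in aspect_ratios_per_layer]
print(n_boxes)                                   # [4, 6, 6, 6, 4, 4] boxes per cell, as in the original SSD300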
+ l2_reg = l2_regularization # Make the internal name shorter. + img_height, img_width, img_channels = image_size[0], image_size[1], image_size[2] + + ############################################################################ + # Get a few exceptions out of the way. + ############################################################################ + + if aspect_ratios_global is None and aspect_ratios_per_layer is None: + raise ValueError("`aspect_ratios_global` and `aspect_ratios_per_layer` cannot both be None. At least one needs to be specified.") + if aspect_ratios_per_layer: + if len(aspect_ratios_per_layer) != n_predictor_layers: + raise ValueError("It must be either aspect_ratios_per_layer is None or len(aspect_ratios_per_layer) == {}, but len(aspect_ratios_per_layer) == {}.".format(n_predictor_layers, len(aspect_ratios_per_layer))) + + if (min_scale is None or max_scale is None) and scales is None: + raise ValueError("Either `min_scale` and `max_scale` or `scales` need to be specified.") + if scales: + if len(scales) != n_predictor_layers+1: + raise ValueError("It must be either scales is None or len(scales) == {}, but len(scales) == {}.".format(n_predictor_layers+1, len(scales))) + else: # If no explicit list of scaling factors was passed, compute the list of scaling factors from `min_scale` and `max_scale` + scales = np.linspace(min_scale, max_scale, n_predictor_layers+1) + + if len(variances) != 4: + raise ValueError("4 variance values must be passed, but {} values were received.".format(len(variances))) + variances = np.array(variances) + if np.any(variances <= 0): + raise ValueError("All variances must be >0, but the variances given are {}".format(variances)) + + if (not (steps is None)) and (len(steps) != n_predictor_layers): + raise ValueError("You must provide exactly one step value per predictor layer.") + + if (not (offsets is None)) and (len(offsets) != n_predictor_layers): + raise ValueError("You must provide exactly one offset value per predictor layer.") + + ############################################################################ + # Compute the anchor box parameters. + ############################################################################ + + # Set the aspect ratios for each predictor layer. These are only needed for the anchor box layers. + if aspect_ratios_per_layer: + aspect_ratios = aspect_ratios_per_layer + else: + aspect_ratios = [aspect_ratios_global] * n_predictor_layers + + # Compute the number of boxes to be predicted per cell for each predictor layer. + # We need this so that we know how many channels the predictor layers need to have. + if aspect_ratios_per_layer: + n_boxes = [] + for ar in aspect_ratios_per_layer: + if (1 in ar) & two_boxes_for_ar1: + n_boxes.append(len(ar) + 1) # +1 for the second box for aspect ratio 1 + else: + n_boxes.append(len(ar)) + else: # If only a global aspect ratio list was passed, then the number of boxes is the same for each predictor layer + if (1 in aspect_ratios_global) & two_boxes_for_ar1: + n_boxes = len(aspect_ratios_global) + 1 + else: + n_boxes = len(aspect_ratios_global) + n_boxes = [n_boxes] * n_predictor_layers + + if steps is None: + steps = [None] * n_predictor_layers + if offsets is None: + offsets = [None] * n_predictor_layers + + ############################################################################ + # Define functions for the Lambda layers below.
+ ############################################################################ + + def identity_layer(tensor): + return tensor + + def input_mean_normalization(tensor): + return tensor - np.array(subtract_mean) + + def input_stddev_normalization(tensor): + return tensor / np.array(divide_by_stddev) + + def input_channel_swap(tensor): + if len(swap_channels) == 3: + return K.stack([tensor[...,swap_channels[0]], tensor[...,swap_channels[1]], tensor[...,swap_channels[2]]], axis=-1) + elif len(swap_channels) == 4: + return K.stack([tensor[...,swap_channels[0]], tensor[...,swap_channels[1]], tensor[...,swap_channels[2]], tensor[...,swap_channels[3]]], axis=-1) + + ############################################################################ + # Build the network. + ############################################################################ + + x = Input(shape=(img_height, img_width, img_channels)) + + # The following identity layer is only needed so that the subsequent lambda layers can be optional. + x1 = Lambda(identity_layer, output_shape=(img_height, img_width, img_channels), name='identity_layer')(x) + if not (subtract_mean is None): + x1 = Lambda(input_mean_normalization, output_shape=(img_height, img_width, img_channels), name='input_mean_normalization')(x1) + if not (divide_by_stddev is None): + x1 = Lambda(input_stddev_normalization, output_shape=(img_height, img_width, img_channels), name='input_stddev_normalization')(x1) + if swap_channels: + x1 = Lambda(input_channel_swap, output_shape=(img_height, img_width, img_channels), name='input_channel_swap')(x1) + + conv1_1 = Conv2D(64, (3, 3), activation='relu', padding='same', kernel_initializer='he_normal', kernel_regularizer=l2(l2_reg), name='conv1_1')(x1) + conv1_2 = Conv2D(64, (3, 3), activation='relu', padding='same', kernel_initializer='he_normal', kernel_regularizer=l2(l2_reg), name='conv1_2')(conv1_1) + pool1 = MaxPooling2D(pool_size=(2, 2), strides=(2, 2), padding='same', name='pool1')(conv1_2) + + conv2_1 = Conv2D(128, (3, 3), activation='relu', padding='same', kernel_initializer='he_normal', kernel_regularizer=l2(l2_reg), name='conv2_1')(pool1) + conv2_2 = Conv2D(128, (3, 3), activation='relu', padding='same', kernel_initializer='he_normal', kernel_regularizer=l2(l2_reg), name='conv2_2')(conv2_1) + pool2 = MaxPooling2D(pool_size=(2, 2), strides=(2, 2), padding='same', name='pool2')(conv2_2) + + conv3_1 = Conv2D(256, (3, 3), activation='relu', padding='same', kernel_initializer='he_normal', kernel_regularizer=l2(l2_reg), name='conv3_1')(pool2) + conv3_2 = Conv2D(256, (3, 3), activation='relu', padding='same', kernel_initializer='he_normal', kernel_regularizer=l2(l2_reg), name='conv3_2')(conv3_1) + conv3_3 = Conv2D(256, (3, 3), activation='relu', padding='same', kernel_initializer='he_normal', kernel_regularizer=l2(l2_reg), name='conv3_3')(conv3_2) + pool3 = MaxPooling2D(pool_size=(2, 2), strides=(2, 2), padding='same', name='pool3')(conv3_3) + + conv4_1 = Conv2D(512, (3, 3), activation='relu', padding='same', kernel_initializer='he_normal', kernel_regularizer=l2(l2_reg), name='conv4_1')(pool3) + conv4_2 = Conv2D(512, (3, 3), activation='relu', padding='same', kernel_initializer='he_normal', kernel_regularizer=l2(l2_reg), name='conv4_2')(conv4_1) + conv4_3 = Conv2D(512, (3, 3), activation='relu', padding='same', kernel_initializer='he_normal', kernel_regularizer=l2(l2_reg), name='conv4_3')(conv4_2) + pool4 = MaxPooling2D(pool_size=(2, 2), strides=(2, 2), padding='same', name='pool4')(conv4_3) + + conv5_1 = Conv2D(512, (3, 3), 
activation='relu', padding='same', kernel_initializer='he_normal', kernel_regularizer=l2(l2_reg), name='conv5_1')(pool4) + conv5_2 = Conv2D(512, (3, 3), activation='relu', padding='same', kernel_initializer='he_normal', kernel_regularizer=l2(l2_reg), name='conv5_2')(conv5_1) + conv5_3 = Conv2D(512, (3, 3), activation='relu', padding='same', kernel_initializer='he_normal', kernel_regularizer=l2(l2_reg), name='conv5_3')(conv5_2) + pool5 = MaxPooling2D(pool_size=(3, 3), strides=(1, 1), padding='same', name='pool5')(conv5_3) + + fc6 = Conv2D(1024, (3, 3), dilation_rate=(6, 6), activation='relu', padding='same', kernel_initializer='he_normal', kernel_regularizer=l2(l2_reg), name='fc6')(pool5) + + fc7 = Conv2D(1024, (1, 1), activation='relu', padding='same', kernel_initializer='he_normal', kernel_regularizer=l2(l2_reg), name='fc7')(fc6) + + conv6_1 = Conv2D(256, (1, 1), activation='relu', padding='same', kernel_initializer='he_normal', kernel_regularizer=l2(l2_reg), name='conv6_1')(fc7) + conv6_1 = ZeroPadding2D(padding=((1, 1), (1, 1)), name='conv6_padding')(conv6_1) + conv6_2 = Conv2D(512, (3, 3), strides=(2, 2), activation='relu', padding='valid', kernel_initializer='he_normal', kernel_regularizer=l2(l2_reg), name='conv6_2')(conv6_1) + + conv7_1 = Conv2D(128, (1, 1), activation='relu', padding='same', kernel_initializer='he_normal', kernel_regularizer=l2(l2_reg), name='conv7_1')(conv6_2) + conv7_1 = ZeroPadding2D(padding=((1, 1), (1, 1)), name='conv7_padding')(conv7_1) + conv7_2 = Conv2D(256, (3, 3), strides=(2, 2), activation='relu', padding='valid', kernel_initializer='he_normal', kernel_regularizer=l2(l2_reg), name='conv7_2')(conv7_1) + + conv8_1 = Conv2D(128, (1, 1), activation='relu', padding='same', kernel_initializer='he_normal', kernel_regularizer=l2(l2_reg), name='conv8_1')(conv7_2) + conv8_2 = Conv2D(256, (3, 3), strides=(1, 1), activation='relu', padding='valid', kernel_initializer='he_normal', kernel_regularizer=l2(l2_reg), name='conv8_2')(conv8_1) + + conv9_1 = Conv2D(128, (1, 1), activation='relu', padding='same', kernel_initializer='he_normal', kernel_regularizer=l2(l2_reg), name='conv9_1')(conv8_2) + conv9_2 = Conv2D(256, (3, 3), strides=(1, 1), activation='relu', padding='valid', kernel_initializer='he_normal', kernel_regularizer=l2(l2_reg), name='conv9_2')(conv9_1) + + # Feed conv4_3 into the L2 normalization layer + conv4_3_norm = L2Normalization(gamma_init=20, name='conv4_3_norm')(conv4_3) + + ### Build the convolutional predictor layers on top of the base network + + # We precidt `n_classes` confidence values for each box, hence the confidence predictors have depth `n_boxes * n_classes` + # Output shape of the confidence layers: `(batch, height, width, n_boxes * n_classes)` + conv4_3_norm_mbox_conf = Conv2D(n_boxes[0] * n_classes, (3, 3), padding='same', kernel_initializer='he_normal', kernel_regularizer=l2(l2_reg), name='conv4_3_norm_mbox_conf')(conv4_3_norm) + fc7_mbox_conf = Conv2D(n_boxes[1] * n_classes, (3, 3), padding='same', kernel_initializer='he_normal', kernel_regularizer=l2(l2_reg), name='fc7_mbox_conf')(fc7) + conv6_2_mbox_conf = Conv2D(n_boxes[2] * n_classes, (3, 3), padding='same', kernel_initializer='he_normal', kernel_regularizer=l2(l2_reg), name='conv6_2_mbox_conf')(conv6_2) + conv7_2_mbox_conf = Conv2D(n_boxes[3] * n_classes, (3, 3), padding='same', kernel_initializer='he_normal', kernel_regularizer=l2(l2_reg), name='conv7_2_mbox_conf')(conv7_2) + conv8_2_mbox_conf = Conv2D(n_boxes[4] * n_classes, (3, 3), padding='same', kernel_initializer='he_normal', 
kernel_regularizer=l2(l2_reg), name='conv8_2_mbox_conf')(conv8_2) + conv9_2_mbox_conf = Conv2D(n_boxes[5] * n_classes, (3, 3), padding='same', kernel_initializer='he_normal', kernel_regularizer=l2(l2_reg), name='conv9_2_mbox_conf')(conv9_2) + # We predict 4 box coordinates for each box, hence the localization predictors have depth `n_boxes * 4` + # Output shape of the localization layers: `(batch, height, width, n_boxes * 4)` + conv4_3_norm_mbox_loc = Conv2D(n_boxes[0] * 4, (3, 3), padding='same', kernel_initializer='he_normal', kernel_regularizer=l2(l2_reg), name='conv4_3_norm_mbox_loc')(conv4_3_norm) + fc7_mbox_loc = Conv2D(n_boxes[1] * 4, (3, 3), padding='same', kernel_initializer='he_normal', kernel_regularizer=l2(l2_reg), name='fc7_mbox_loc')(fc7) + conv6_2_mbox_loc = Conv2D(n_boxes[2] * 4, (3, 3), padding='same', kernel_initializer='he_normal', kernel_regularizer=l2(l2_reg), name='conv6_2_mbox_loc')(conv6_2) + conv7_2_mbox_loc = Conv2D(n_boxes[3] * 4, (3, 3), padding='same', kernel_initializer='he_normal', kernel_regularizer=l2(l2_reg), name='conv7_2_mbox_loc')(conv7_2) + conv8_2_mbox_loc = Conv2D(n_boxes[4] * 4, (3, 3), padding='same', kernel_initializer='he_normal', kernel_regularizer=l2(l2_reg), name='conv8_2_mbox_loc')(conv8_2) + conv9_2_mbox_loc = Conv2D(n_boxes[5] * 4, (3, 3), padding='same', kernel_initializer='he_normal', kernel_regularizer=l2(l2_reg), name='conv9_2_mbox_loc')(conv9_2) + + ### Generate the anchor boxes (called "priors" in the original Caffe/C++ implementation, so I'll keep their layer names) + + # Output shape of anchors: `(batch, height, width, n_boxes, 8)` + conv4_3_norm_mbox_priorbox = AnchorBoxes(img_height, img_width, this_scale=scales[0], next_scale=scales[1], aspect_ratios=aspect_ratios[0], + two_boxes_for_ar1=two_boxes_for_ar1, this_steps=steps[0], this_offsets=offsets[0], clip_boxes=clip_boxes, + variances=variances, coords=coords, normalize_coords=normalize_coords, name='conv4_3_norm_mbox_priorbox')(conv4_3_norm_mbox_loc) + fc7_mbox_priorbox = AnchorBoxes(img_height, img_width, this_scale=scales[1], next_scale=scales[2], aspect_ratios=aspect_ratios[1], + two_boxes_for_ar1=two_boxes_for_ar1, this_steps=steps[1], this_offsets=offsets[1], clip_boxes=clip_boxes, + variances=variances, coords=coords, normalize_coords=normalize_coords, name='fc7_mbox_priorbox')(fc7_mbox_loc) + conv6_2_mbox_priorbox = AnchorBoxes(img_height, img_width, this_scale=scales[2], next_scale=scales[3], aspect_ratios=aspect_ratios[2], + two_boxes_for_ar1=two_boxes_for_ar1, this_steps=steps[2], this_offsets=offsets[2], clip_boxes=clip_boxes, + variances=variances, coords=coords, normalize_coords=normalize_coords, name='conv6_2_mbox_priorbox')(conv6_2_mbox_loc) + conv7_2_mbox_priorbox = AnchorBoxes(img_height, img_width, this_scale=scales[3], next_scale=scales[4], aspect_ratios=aspect_ratios[3], + two_boxes_for_ar1=two_boxes_for_ar1, this_steps=steps[3], this_offsets=offsets[3], clip_boxes=clip_boxes, + variances=variances, coords=coords, normalize_coords=normalize_coords, name='conv7_2_mbox_priorbox')(conv7_2_mbox_loc) + conv8_2_mbox_priorbox = AnchorBoxes(img_height, img_width, this_scale=scales[4], next_scale=scales[5], aspect_ratios=aspect_ratios[4], + two_boxes_for_ar1=two_boxes_for_ar1, this_steps=steps[4], this_offsets=offsets[4], clip_boxes=clip_boxes, + variances=variances, coords=coords, normalize_coords=normalize_coords, name='conv8_2_mbox_priorbox')(conv8_2_mbox_loc) + conv9_2_mbox_priorbox = AnchorBoxes(img_height, img_width, this_scale=scales[5], next_scale=scales[6], 
aspect_ratios=aspect_ratios[5], + two_boxes_for_ar1=two_boxes_for_ar1, this_steps=steps[5], this_offsets=offsets[5], clip_boxes=clip_boxes, + variances=variances, coords=coords, normalize_coords=normalize_coords, name='conv9_2_mbox_priorbox')(conv9_2_mbox_loc) + + ### Reshape + + # Reshape the class predictions, yielding 3D tensors of shape `(batch, height * width * n_boxes, n_classes)` + # We want the classes isolated in the last axis to perform softmax on them + conv4_3_norm_mbox_conf_reshape = Reshape((-1, n_classes), name='conv4_3_norm_mbox_conf_reshape')(conv4_3_norm_mbox_conf) + fc7_mbox_conf_reshape = Reshape((-1, n_classes), name='fc7_mbox_conf_reshape')(fc7_mbox_conf) + conv6_2_mbox_conf_reshape = Reshape((-1, n_classes), name='conv6_2_mbox_conf_reshape')(conv6_2_mbox_conf) + conv7_2_mbox_conf_reshape = Reshape((-1, n_classes), name='conv7_2_mbox_conf_reshape')(conv7_2_mbox_conf) + conv8_2_mbox_conf_reshape = Reshape((-1, n_classes), name='conv8_2_mbox_conf_reshape')(conv8_2_mbox_conf) + conv9_2_mbox_conf_reshape = Reshape((-1, n_classes), name='conv9_2_mbox_conf_reshape')(conv9_2_mbox_conf) + # Reshape the box predictions, yielding 3D tensors of shape `(batch, height * width * n_boxes, 4)` + # We want the four box coordinates isolated in the last axis to compute the smooth L1 loss + conv4_3_norm_mbox_loc_reshape = Reshape((-1, 4), name='conv4_3_norm_mbox_loc_reshape')(conv4_3_norm_mbox_loc) + fc7_mbox_loc_reshape = Reshape((-1, 4), name='fc7_mbox_loc_reshape')(fc7_mbox_loc) + conv6_2_mbox_loc_reshape = Reshape((-1, 4), name='conv6_2_mbox_loc_reshape')(conv6_2_mbox_loc) + conv7_2_mbox_loc_reshape = Reshape((-1, 4), name='conv7_2_mbox_loc_reshape')(conv7_2_mbox_loc) + conv8_2_mbox_loc_reshape = Reshape((-1, 4), name='conv8_2_mbox_loc_reshape')(conv8_2_mbox_loc) + conv9_2_mbox_loc_reshape = Reshape((-1, 4), name='conv9_2_mbox_loc_reshape')(conv9_2_mbox_loc) + # Reshape the anchor box tensors, yielding 3D tensors of shape `(batch, height * width * n_boxes, 8)` + conv4_3_norm_mbox_priorbox_reshape = Reshape((-1, 8), name='conv4_3_norm_mbox_priorbox_reshape')(conv4_3_norm_mbox_priorbox) + fc7_mbox_priorbox_reshape = Reshape((-1, 8), name='fc7_mbox_priorbox_reshape')(fc7_mbox_priorbox) + conv6_2_mbox_priorbox_reshape = Reshape((-1, 8), name='conv6_2_mbox_priorbox_reshape')(conv6_2_mbox_priorbox) + conv7_2_mbox_priorbox_reshape = Reshape((-1, 8), name='conv7_2_mbox_priorbox_reshape')(conv7_2_mbox_priorbox) + conv8_2_mbox_priorbox_reshape = Reshape((-1, 8), name='conv8_2_mbox_priorbox_reshape')(conv8_2_mbox_priorbox) + conv9_2_mbox_priorbox_reshape = Reshape((-1, 8), name='conv9_2_mbox_priorbox_reshape')(conv9_2_mbox_priorbox) + + ### Concatenate the predictions from the different layers + + # Axis 0 (batch) and axis 2 (n_classes or 4, respectively) are identical for all layer predictions, + # so we want to concatenate along axis 1, the number of boxes per layer + # Output shape of `mbox_conf`: (batch, n_boxes_total, n_classes) + mbox_conf = Concatenate(axis=1, name='mbox_conf')([conv4_3_norm_mbox_conf_reshape, + fc7_mbox_conf_reshape, + conv6_2_mbox_conf_reshape, + conv7_2_mbox_conf_reshape, + conv8_2_mbox_conf_reshape, + conv9_2_mbox_conf_reshape]) + + # Output shape of `mbox_loc`: (batch, n_boxes_total, 4) + mbox_loc = Concatenate(axis=1, name='mbox_loc')([conv4_3_norm_mbox_loc_reshape, + fc7_mbox_loc_reshape, + conv6_2_mbox_loc_reshape, + conv7_2_mbox_loc_reshape, + conv8_2_mbox_loc_reshape, + conv9_2_mbox_loc_reshape]) + + # Output shape of `mbox_priorbox`: (batch, n_boxes_total, 8) 
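+ # Editor's note (illustrative, not part of the original code): for the canonical 300x300 input,
+ # the six predictor grids are 38x38, 19x19, 10x10, 5x5, 3x3 and 1x1, so the total number of
+ # anchor boxes per image works out to
+ # n_boxes_total = 38*38*4 + 19*19*6 + 10*10*6 + 5*5*6 + 3*3*4 + 1*1*4  # = 8732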
+ mbox_priorbox = Concatenate(axis=1, name='mbox_priorbox')([conv4_3_norm_mbox_priorbox_reshape, + fc7_mbox_priorbox_reshape, + conv6_2_mbox_priorbox_reshape, + conv7_2_mbox_priorbox_reshape, + conv8_2_mbox_priorbox_reshape, + conv9_2_mbox_priorbox_reshape]) + + # The box coordinate predictions will go into the loss function just the way they are, + # but for the class predictions, we'll apply a softmax activation layer first + mbox_conf_softmax = Activation('softmax', name='mbox_conf_softmax')(mbox_conf) + + # Concatenate the class and box predictions and the anchors to one large predictions vector + # Output shape of `predictions`: (batch, n_boxes_total, n_classes + 4 + 8) + predictions = Concatenate(axis=2, name='predictions')([mbox_conf_softmax, mbox_loc, mbox_priorbox]) + + if mode == 'training': + model = Model(inputs=x, outputs=predictions) + elif mode == 'inference': + decoded_predictions = DecodeDetections(confidence_thresh=confidence_thresh, + iou_threshold=iou_threshold, + top_k=top_k, + nms_max_output_size=nms_max_output_size, + coords=coords, + normalize_coords=normalize_coords, + img_height=img_height, + img_width=img_width, + name='decoded_predictions')(predictions) + model = Model(inputs=x, outputs=decoded_predictions) + elif mode == 'inference_fast': + decoded_predictions = DecodeDetectionsFast(confidence_thresh=confidence_thresh, + iou_threshold=iou_threshold, + top_k=top_k, + nms_max_output_size=nms_max_output_size, + coords=coords, + normalize_coords=normalize_coords, + img_height=img_height, + img_width=img_width, + name='decoded_predictions')(predictions) + model = Model(inputs=x, outputs=decoded_predictions) + else: + raise ValueError("`mode` must be one of 'training', 'inference' or 'inference_fast', but received '{}'.".format(mode)) + + if return_predictor_sizes: + predictor_sizes = np.array([conv4_3_norm_mbox_conf._keras_shape[1:3], + fc7_mbox_conf._keras_shape[1:3], + conv6_2_mbox_conf._keras_shape[1:3], + conv7_2_mbox_conf._keras_shape[1:3], + conv8_2_mbox_conf._keras_shape[1:3], + conv9_2_mbox_conf._keras_shape[1:3]]) + return model, predictor_sizes + else: + return model diff --git a/keras_ssd/models/keras_ssd512.py b/keras_ssd/models/keras_ssd512.py new file mode 100644 index 0000000..3f69ac6 --- /dev/null +++ b/keras_ssd/models/keras_ssd512.py @@ -0,0 +1,477 @@ +''' +A Keras port of the original Caffe SSD512 network. + +Copyright (C) 2018 Pierluigi Ferrari + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
+''' + +from __future__ import division +import numpy as np +from keras.models import Model +from keras.layers import Input, Lambda, Activation, Conv2D, MaxPooling2D, ZeroPadding2D, Reshape, Concatenate +from keras.regularizers import l2 +import keras.backend as K + +from keras_layers.keras_layer_AnchorBoxes import AnchorBoxes +from keras_layers.keras_layer_L2Normalization import L2Normalization +from keras_layers.keras_layer_DecodeDetections import DecodeDetections +from keras_layers.keras_layer_DecodeDetectionsFast import DecodeDetectionsFast + +def ssd_512(image_size, + n_classes, + mode='training', + l2_regularization=0.0005, + min_scale=None, + max_scale=None, + scales=None, + aspect_ratios_global=None, + aspect_ratios_per_layer=[[1.0, 2.0, 0.5], + [1.0, 2.0, 0.5, 3.0, 1.0/3.0], + [1.0, 2.0, 0.5, 3.0, 1.0/3.0], + [1.0, 2.0, 0.5, 3.0, 1.0/3.0], + [1.0, 2.0, 0.5, 3.0, 1.0/3.0], + [1.0, 2.0, 0.5], + [1.0, 2.0, 0.5]], + two_boxes_for_ar1=True, + steps=[8, 16, 32, 64, 128, 256, 512], + offsets=None, + clip_boxes=False, + variances=[0.1, 0.1, 0.2, 0.2], + coords='centroids', + normalize_coords=True, + subtract_mean=[123, 117, 104], + divide_by_stddev=None, + swap_channels=[2, 1, 0], + confidence_thresh=0.01, + iou_threshold=0.45, + top_k=200, + nms_max_output_size=400, + return_predictor_sizes=False): + ''' + Build a Keras model with SSD512 architecture, see references. + + The base network is a reduced atrous VGG-16, extended by the SSD architecture, + as described in the paper. + + Most of the arguments that this function takes are only needed for the anchor + box layers. In case you're training the network, the parameters passed here must + be the same as the ones used to set up `SSDBoxEncoder`. In case you're loading + trained weights, the parameters passed here must be the same as the ones used + to produce the trained weights. + + Some of these arguments are explained in more detail in the documentation of the + `SSDBoxEncoder` class. + + Note: Requires Keras v2.0 or later. Currently works only with the + TensorFlow backend (v1.0 or later). + + Arguments: + image_size (tuple): The input image size in the format `(height, width, channels)`. + n_classes (int): The number of positive classes, e.g. 20 for Pascal VOC, 80 for MS COCO. + mode (str, optional): One of 'training', 'inference' and 'inference_fast'. In 'training' mode, + the model outputs the raw prediction tensor, while in 'inference' and 'inference_fast' modes, + the raw predictions are decoded into absolute coordinates and filtered via confidence thresholding, + non-maximum suppression, and top-k filtering. The difference between latter two modes is that + 'inference' follows the exact procedure of the original Caffe implementation, while + 'inference_fast' uses a faster prediction decoding procedure. + l2_regularization (float, optional): The L2-regularization rate. Applies to all convolutional layers. + Set to zero to deactivate L2-regularization. + min_scale (float, optional): The smallest scaling factor for the size of the anchor boxes as a fraction + of the shorter side of the input images. + max_scale (float, optional): The largest scaling factor for the size of the anchor boxes as a fraction + of the shorter side of the input images. All scaling factors between the smallest and the + largest will be linearly interpolated. 
Note that the second to last of the linearly interpolated + scaling factors will actually be the scaling factor for the last predictor layer, while the last + scaling factor is used for the second box for aspect ratio 1 in the last predictor layer + if `two_boxes_for_ar1` is `True`. + scales (list, optional): A list of floats containing scaling factors per convolutional predictor layer. + This list must be one element longer than the number of predictor layers. The first `k` elements are the + scaling factors for the `k` predictor layers, while the last element is used for the second box + for aspect ratio 1 in the last predictor layer if `two_boxes_for_ar1` is `True`. This additional + last scaling factor must be passed either way, even if it is not being used. + If a list is passed, this argument overrides `min_scale` and `max_scale`. All scaling factors + must be greater than zero. + aspect_ratios_global (list, optional): The list of aspect ratios for which anchor boxes are to be + generated. This list is valid for all prediction layers. + aspect_ratios_per_layer (list, optional): A list containing one aspect ratio list for each prediction layer. + This allows you to set the aspect ratios for each predictor layer individually, which is the case for the + original SSD512 implementation. If a list is passed, it overrides `aspect_ratios_global`. + two_boxes_for_ar1 (bool, optional): Only relevant for aspect ratio lists that contain 1. Will be ignored otherwise. + If `True`, two anchor boxes will be generated for aspect ratio 1. The first will be generated + using the scaling factor for the respective layer, the second one will be generated using + geometric mean of said scaling factor and next bigger scaling factor. + steps (list, optional): `None` or a list with as many elements as there are predictor layers. The elements can be + either ints/floats or tuples of two ints/floats. These numbers represent for each predictor layer how many + pixels apart the anchor box center points should be vertically and horizontally along the spatial grid over + the image. If the list contains ints/floats, then that value will be used for both spatial dimensions. + If the list contains tuples of two ints/floats, then they represent `(step_height, step_width)`. + If no steps are provided, then they will be computed such that the anchor box center points will form an + equidistant grid within the image dimensions. + offsets (list, optional): `None` or a list with as many elements as there are predictor layers. The elements can be + either floats or tuples of two floats. These numbers represent for each predictor layer how many + pixels from the top and left boarders of the image the top-most and left-most anchor box center points should be + as a fraction of `steps`. The last bit is important: The offsets are not absolute pixel values, but fractions + of the step size specified in the `steps` argument. If the list contains floats, then that value will + be used for both spatial dimensions. If the list contains tuples of two floats, then they represent + `(vertical_offset, horizontal_offset)`. If no offsets are provided, then they will default to 0.5 of the step size. + clip_boxes (bool, optional): If `True`, clips the anchor box coordinates to stay within image boundaries. + variances (list, optional): A list of 4 floats >0. The anchor box offset for each coordinate will be divided by + its respective variance value. 
+ coords (str, optional): The box coordinate format to be used internally by the model (i.e. this is not the input format + of the ground truth labels). Can be either 'centroids' for the format `(cx, cy, w, h)` (box center coordinates, width, + and height), 'minmax' for the format `(xmin, xmax, ymin, ymax)`, or 'corners' for the format `(xmin, ymin, xmax, ymax)`. + normalize_coords (bool, optional): Set to `True` if the model is supposed to use relative instead of absolute coordinates, + i.e. if the model predicts box coordinates within [0,1] instead of absolute coordinates. + subtract_mean (array-like, optional): `None` or an array-like object of integers or floating point values + of any shape that is broadcast-compatible with the image shape. The elements of this array will be + subtracted from the image pixel intensity values. For example, pass a list of three integers + to perform per-channel mean normalization for color images. + divide_by_stddev (array-like, optional): `None` or an array-like object of non-zero integers or + floating point values of any shape that is broadcast-compatible with the image shape. The image pixel + intensity values will be divided by the elements of this array. For example, pass a list + of three integers to perform per-channel standard deviation normalization for color images. + swap_channels (list, optional): Either `False` or a list of integers representing the desired order in which the input + image channels should be swapped. + confidence_thresh (float, optional): A float in [0,1), the minimum classification confidence in a specific + positive class in order to be considered for the non-maximum suppression stage for the respective class. + A lower value will result in a larger part of the selection process being done by the non-maximum suppression + stage, while a larger value will result in a larger part of the selection process happening in the confidence + thresholding stage. + iou_threshold (float, optional): A float in [0,1]. All boxes that have a Jaccard similarity of greater than `iou_threshold` + with a locally maximal box will be removed from the set of predictions for a given class, where 'maximal' refers + to the box's confidence score. + top_k (int, optional): The number of highest scoring predictions to be kept for each batch item after the + non-maximum suppression stage. + nms_max_output_size (int, optional): The maximal number of predictions that will be left over after the NMS stage. + return_predictor_sizes (bool, optional): If `True`, this function not only returns the model, but also + a list containing the spatial dimensions of the predictor layers. This isn't strictly necessary since + you can always get their sizes easily via the Keras API, but it's convenient and less error-prone + to get them this way. They are only relevant for training anyway (SSDBoxEncoder needs to know the + spatial dimensions of the predictor layers), for inference you don't need them. + + Returns: + model: The Keras SSD512 model. + predictor_sizes (optional): A Numpy array containing the `(height, width)` portion + of the output tensor shape for each convolutional predictor layer. During + training, the generator function needs this in order to transform + the ground truth labels into tensors of identical structure as the + output tensors of the model, which is in turn needed for the cost + function. 
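+
+ Example (editor's illustrative sketch, not from the original documentation; the chosen
+ argument values are assumptions, with `n_classes=20` corresponding to Pascal VOC):
+     model = ssd_512(image_size=(512, 512, 3), n_classes=20, mode='training',
+                     min_scale=0.1, max_scale=0.9)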
+
+ References:
+ https://arxiv.org/abs/1512.02325v5
+ '''
+
+ n_predictor_layers = 7 # The number of predictor conv layers in the network is 7 for the original SSD512
+ n_classes += 1 # Account for the background class.
+ l2_reg = l2_regularization # Make the internal name shorter.
+ img_height, img_width, img_channels = image_size[0], image_size[1], image_size[2]
+
+ ############################################################################
+ # Get a few exceptions out of the way.
+ ############################################################################
+
+ if aspect_ratios_global is None and aspect_ratios_per_layer is None:
+ raise ValueError("`aspect_ratios_global` and `aspect_ratios_per_layer` cannot both be None. At least one needs to be specified.")
+ if aspect_ratios_per_layer:
+ if len(aspect_ratios_per_layer) != n_predictor_layers:
+ raise ValueError("Either `aspect_ratios_per_layer` must be None or it must contain exactly {} aspect ratio lists (one per predictor layer), but it contains {}.".format(n_predictor_layers, len(aspect_ratios_per_layer)))
+
+ if (min_scale is None or max_scale is None) and scales is None:
+ raise ValueError("Either `min_scale` and `max_scale` or `scales` need to be specified.")
+ if scales:
+ if len(scales) != n_predictor_layers+1:
+ raise ValueError("Either `scales` must be None or it must contain exactly {} elements (one per predictor layer plus one), but it contains {}.".format(n_predictor_layers+1, len(scales)))
+ else: # If no explicit list of scaling factors was passed, compute the list of scaling factors from `min_scale` and `max_scale`
+ scales = np.linspace(min_scale, max_scale, n_predictor_layers+1)
+
+ if len(variances) != 4:
+ raise ValueError("Exactly 4 variance values must be passed, but {} values were received.".format(len(variances)))
+ variances = np.array(variances)
+ if np.any(variances <= 0):
+ raise ValueError("All variances must be >0, but the variances given are {}".format(variances))
+
+ if (not (steps is None)) and (len(steps) != n_predictor_layers):
+ raise ValueError("You must provide exactly one step value per predictor layer.")
+
+ if (not (offsets is None)) and (len(offsets) != n_predictor_layers):
+ raise ValueError("You must provide exactly one offset value per predictor layer.")
+
+ ############################################################################
+ # Compute the anchor box parameters.
+ ############################################################################
+
+ # Set the aspect ratios for each predictor layer. These are only needed for the anchor box layers.
+ if aspect_ratios_per_layer:
+ aspect_ratios = aspect_ratios_per_layer
+ else:
+ aspect_ratios = [aspect_ratios_global] * n_predictor_layers
+
+ # Compute the number of boxes to be predicted per cell for each predictor layer.
+ # We need this so that we know how many channels the predictor layers need to have.
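+ # Editor's note (illustrative, not part of the original code): with the default
+ # `aspect_ratios_per_layer` above and `two_boxes_for_ar1=True`, the branch below yields
+ # n_boxes == [4, 6, 6, 6, 6, 4, 4], i.e.
+ # [len(ar) + 1 if (1 in ar) and two_boxes_for_ar1 else len(ar) for ar in aspect_ratios_per_layer]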
+ if aspect_ratios_per_layer: + n_boxes = [] + for ar in aspect_ratios_per_layer: + if (1 in ar) & two_boxes_for_ar1: + n_boxes.append(len(ar) + 1) # +1 for the second box for aspect ratio 1 + else: + n_boxes.append(len(ar)) + else: # If only a global aspect ratio list was passed, then the number of boxes is the same for each predictor layer + if (1 in aspect_ratios_global) & two_boxes_for_ar1: + n_boxes = len(aspect_ratios_global) + 1 + else: + n_boxes = len(aspect_ratios_global) + n_boxes = [n_boxes] * n_predictor_layers + + if steps is None: + steps = [None] * n_predictor_layers + if offsets is None: + offsets = [None] * n_predictor_layers + + ############################################################################ + # Define functions for the Lambda layers below. + ############################################################################ + + def identity_layer(tensor): + return tensor + + def input_mean_normalization(tensor): + return tensor - np.array(subtract_mean) + + def input_stddev_normalization(tensor): + return tensor / np.array(divide_by_stddev) + + def input_channel_swap(tensor): + if len(swap_channels) == 3: + return K.stack([tensor[...,swap_channels[0]], tensor[...,swap_channels[1]], tensor[...,swap_channels[2]]], axis=-1) + elif len(swap_channels) == 4: + return K.stack([tensor[...,swap_channels[0]], tensor[...,swap_channels[1]], tensor[...,swap_channels[2]], tensor[...,swap_channels[3]]], axis=-1) + + ############################################################################ + # Build the network. + ############################################################################ + + x = Input(shape=(img_height, img_width, img_channels)) + + # The following identity layer is only needed so that the subsequent lambda layers can be optional. 
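+ # Editor's note (illustrative sketch, not part of the original code): with the default
+ # arguments, the optional Lambda layers below amount to preprocessing each image as
+ #   image = image - np.array([123, 117, 104])   # per-channel mean subtraction
+ #   image = image[..., [2, 1, 0]]                # channel swap, e.g. RGB -> BGR
+ # inside the graph; `divide_by_stddev` is None by default, so no scaling layer is added.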
+ x1 = Lambda(identity_layer, output_shape=(img_height, img_width, img_channels), name='identity_layer')(x) + if not (subtract_mean is None): + x1 = Lambda(input_mean_normalization, output_shape=(img_height, img_width, img_channels), name='input_mean_normalization')(x1) + if not (divide_by_stddev is None): + x1 = Lambda(input_stddev_normalization, output_shape=(img_height, img_width, img_channels), name='input_stddev_normalization')(x1) + if swap_channels: + x1 = Lambda(input_channel_swap, output_shape=(img_height, img_width, img_channels), name='input_channel_swap')(x1) + + conv1_1 = Conv2D(64, (3, 3), activation='relu', padding='same', kernel_initializer='he_normal', kernel_regularizer=l2(l2_reg), name='conv1_1')(x1) + conv1_2 = Conv2D(64, (3, 3), activation='relu', padding='same', kernel_initializer='he_normal', kernel_regularizer=l2(l2_reg), name='conv1_2')(conv1_1) + pool1 = MaxPooling2D(pool_size=(2, 2), strides=(2, 2), padding='same', name='pool1')(conv1_2) + + conv2_1 = Conv2D(128, (3, 3), activation='relu', padding='same', kernel_initializer='he_normal', kernel_regularizer=l2(l2_reg), name='conv2_1')(pool1) + conv2_2 = Conv2D(128, (3, 3), activation='relu', padding='same', kernel_initializer='he_normal', kernel_regularizer=l2(l2_reg), name='conv2_2')(conv2_1) + pool2 = MaxPooling2D(pool_size=(2, 2), strides=(2, 2), padding='same', name='pool2')(conv2_2) + + conv3_1 = Conv2D(256, (3, 3), activation='relu', padding='same', kernel_initializer='he_normal', kernel_regularizer=l2(l2_reg), name='conv3_1')(pool2) + conv3_2 = Conv2D(256, (3, 3), activation='relu', padding='same', kernel_initializer='he_normal', kernel_regularizer=l2(l2_reg), name='conv3_2')(conv3_1) + conv3_3 = Conv2D(256, (3, 3), activation='relu', padding='same', kernel_initializer='he_normal', kernel_regularizer=l2(l2_reg), name='conv3_3')(conv3_2) + pool3 = MaxPooling2D(pool_size=(2, 2), strides=(2, 2), padding='same', name='pool3')(conv3_3) + + conv4_1 = Conv2D(512, (3, 3), activation='relu', padding='same', kernel_initializer='he_normal', kernel_regularizer=l2(l2_reg), name='conv4_1')(pool3) + conv4_2 = Conv2D(512, (3, 3), activation='relu', padding='same', kernel_initializer='he_normal', kernel_regularizer=l2(l2_reg), name='conv4_2')(conv4_1) + conv4_3 = Conv2D(512, (3, 3), activation='relu', padding='same', kernel_initializer='he_normal', kernel_regularizer=l2(l2_reg), name='conv4_3')(conv4_2) + pool4 = MaxPooling2D(pool_size=(2, 2), strides=(2, 2), padding='same', name='pool4')(conv4_3) + + conv5_1 = Conv2D(512, (3, 3), activation='relu', padding='same', kernel_initializer='he_normal', kernel_regularizer=l2(l2_reg), name='conv5_1')(pool4) + conv5_2 = Conv2D(512, (3, 3), activation='relu', padding='same', kernel_initializer='he_normal', kernel_regularizer=l2(l2_reg), name='conv5_2')(conv5_1) + conv5_3 = Conv2D(512, (3, 3), activation='relu', padding='same', kernel_initializer='he_normal', kernel_regularizer=l2(l2_reg), name='conv5_3')(conv5_2) + pool5 = MaxPooling2D(pool_size=(3, 3), strides=(1, 1), padding='same', name='pool5')(conv5_3) + + fc6 = Conv2D(1024, (3, 3), dilation_rate=(6, 6), activation='relu', padding='same', kernel_initializer='he_normal', kernel_regularizer=l2(l2_reg), name='fc6')(pool5) + + fc7 = Conv2D(1024, (1, 1), activation='relu', padding='same', kernel_initializer='he_normal', kernel_regularizer=l2(l2_reg), name='fc7')(fc6) + + conv6_1 = Conv2D(256, (1, 1), activation='relu', padding='same', kernel_initializer='he_normal', kernel_regularizer=l2(l2_reg), name='conv6_1')(fc7) + conv6_1 = 
ZeroPadding2D(padding=((1, 1), (1, 1)), name='conv6_padding')(conv6_1) + conv6_2 = Conv2D(512, (3, 3), strides=(2, 2), activation='relu', padding='valid', kernel_initializer='he_normal', kernel_regularizer=l2(l2_reg), name='conv6_2')(conv6_1) + + conv7_1 = Conv2D(128, (1, 1), activation='relu', padding='same', kernel_initializer='he_normal', kernel_regularizer=l2(l2_reg), name='conv7_1')(conv6_2) + conv7_1 = ZeroPadding2D(padding=((1, 1), (1, 1)), name='conv7_padding')(conv7_1) + conv7_2 = Conv2D(256, (3, 3), strides=(2, 2), activation='relu', padding='valid', kernel_initializer='he_normal', kernel_regularizer=l2(l2_reg), name='conv7_2')(conv7_1) + + conv8_1 = Conv2D(128, (1, 1), activation='relu', padding='same', kernel_initializer='he_normal', kernel_regularizer=l2(l2_reg), name='conv8_1')(conv7_2) + conv8_1 = ZeroPadding2D(padding=((1, 1), (1, 1)), name='conv8_padding')(conv8_1) + conv8_2 = Conv2D(256, (3, 3), strides=(2, 2), activation='relu', padding='valid', kernel_initializer='he_normal', kernel_regularizer=l2(l2_reg), name='conv8_2')(conv8_1) + + conv9_1 = Conv2D(128, (1, 1), activation='relu', padding='same', kernel_initializer='he_normal', kernel_regularizer=l2(l2_reg), name='conv9_1')(conv8_2) + conv9_1 = ZeroPadding2D(padding=((1, 1), (1, 1)), name='conv9_padding')(conv9_1) + conv9_2 = Conv2D(256, (3, 3), strides=(2, 2), activation='relu', padding='valid', kernel_initializer='he_normal', kernel_regularizer=l2(l2_reg), name='conv9_2')(conv9_1) + + conv10_1 = Conv2D(128, (1, 1), activation='relu', padding='same', kernel_initializer='he_normal', kernel_regularizer=l2(l2_reg), name='conv10_1')(conv9_2) + conv10_1 = ZeroPadding2D(padding=((1, 1), (1, 1)), name='conv10_padding')(conv10_1) + conv10_2 = Conv2D(256, (4, 4), strides=(1, 1), activation='relu', padding='valid', kernel_initializer='he_normal', kernel_regularizer=l2(l2_reg), name='conv10_2')(conv10_1) + + # Feed conv4_3 into the L2 normalization layer + conv4_3_norm = L2Normalization(gamma_init=20, name='conv4_3_norm')(conv4_3) + + ### Build the convolutional predictor layers on top of the base network + + # We precidt `n_classes` confidence values for each box, hence the confidence predictors have depth `n_boxes * n_classes` + # Output shape of the confidence layers: `(batch, height, width, n_boxes * n_classes)` + conv4_3_norm_mbox_conf = Conv2D(n_boxes[0] * n_classes, (3, 3), padding='same', kernel_initializer='he_normal', kernel_regularizer=l2(l2_reg), name='conv4_3_norm_mbox_conf')(conv4_3_norm) + fc7_mbox_conf = Conv2D(n_boxes[1] * n_classes, (3, 3), padding='same', kernel_initializer='he_normal', kernel_regularizer=l2(l2_reg), name='fc7_mbox_conf')(fc7) + conv6_2_mbox_conf = Conv2D(n_boxes[2] * n_classes, (3, 3), padding='same', kernel_initializer='he_normal', kernel_regularizer=l2(l2_reg), name='conv6_2_mbox_conf')(conv6_2) + conv7_2_mbox_conf = Conv2D(n_boxes[3] * n_classes, (3, 3), padding='same', kernel_initializer='he_normal', kernel_regularizer=l2(l2_reg), name='conv7_2_mbox_conf')(conv7_2) + conv8_2_mbox_conf = Conv2D(n_boxes[4] * n_classes, (3, 3), padding='same', kernel_initializer='he_normal', kernel_regularizer=l2(l2_reg), name='conv8_2_mbox_conf')(conv8_2) + conv9_2_mbox_conf = Conv2D(n_boxes[5] * n_classes, (3, 3), padding='same', kernel_initializer='he_normal', kernel_regularizer=l2(l2_reg), name='conv9_2_mbox_conf')(conv9_2) + conv10_2_mbox_conf = Conv2D(n_boxes[6] * n_classes, (3, 3), padding='same', kernel_initializer='he_normal', kernel_regularizer=l2(l2_reg), name='conv10_2_mbox_conf')(conv10_2) + # 
We predict 4 box coordinates for each box, hence the localization predictors have depth `n_boxes * 4` + # Output shape of the localization layers: `(batch, height, width, n_boxes * 4)` + conv4_3_norm_mbox_loc = Conv2D(n_boxes[0] * 4, (3, 3), padding='same', kernel_initializer='he_normal', kernel_regularizer=l2(l2_reg), name='conv4_3_norm_mbox_loc')(conv4_3_norm) + fc7_mbox_loc = Conv2D(n_boxes[1] * 4, (3, 3), padding='same', kernel_initializer='he_normal', kernel_regularizer=l2(l2_reg), name='fc7_mbox_loc')(fc7) + conv6_2_mbox_loc = Conv2D(n_boxes[2] * 4, (3, 3), padding='same', kernel_initializer='he_normal', kernel_regularizer=l2(l2_reg), name='conv6_2_mbox_loc')(conv6_2) + conv7_2_mbox_loc = Conv2D(n_boxes[3] * 4, (3, 3), padding='same', kernel_initializer='he_normal', kernel_regularizer=l2(l2_reg), name='conv7_2_mbox_loc')(conv7_2) + conv8_2_mbox_loc = Conv2D(n_boxes[4] * 4, (3, 3), padding='same', kernel_initializer='he_normal', kernel_regularizer=l2(l2_reg), name='conv8_2_mbox_loc')(conv8_2) + conv9_2_mbox_loc = Conv2D(n_boxes[5] * 4, (3, 3), padding='same', kernel_initializer='he_normal', kernel_regularizer=l2(l2_reg), name='conv9_2_mbox_loc')(conv9_2) + conv10_2_mbox_loc = Conv2D(n_boxes[6] * 4, (3, 3), padding='same', kernel_initializer='he_normal', kernel_regularizer=l2(l2_reg), name='conv10_2_mbox_loc')(conv10_2) + + ### Generate the anchor boxes (called "priors" in the original Caffe/C++ implementation, so I'll keep their layer names) + + # Output shape of anchors: `(batch, height, width, n_boxes, 8)` + conv4_3_norm_mbox_priorbox = AnchorBoxes(img_height, img_width, this_scale=scales[0], next_scale=scales[1], aspect_ratios=aspect_ratios[0], + two_boxes_for_ar1=two_boxes_for_ar1, this_steps=steps[0], this_offsets=offsets[0], clip_boxes=clip_boxes, + variances=variances, coords=coords, normalize_coords=normalize_coords, name='conv4_3_norm_mbox_priorbox')(conv4_3_norm_mbox_loc) + fc7_mbox_priorbox = AnchorBoxes(img_height, img_width, this_scale=scales[1], next_scale=scales[2], aspect_ratios=aspect_ratios[1], + two_boxes_for_ar1=two_boxes_for_ar1, this_steps=steps[1], this_offsets=offsets[1], clip_boxes=clip_boxes, + variances=variances, coords=coords, normalize_coords=normalize_coords, name='fc7_mbox_priorbox')(fc7_mbox_loc) + conv6_2_mbox_priorbox = AnchorBoxes(img_height, img_width, this_scale=scales[2], next_scale=scales[3], aspect_ratios=aspect_ratios[2], + two_boxes_for_ar1=two_boxes_for_ar1, this_steps=steps[2], this_offsets=offsets[2], clip_boxes=clip_boxes, + variances=variances, coords=coords, normalize_coords=normalize_coords, name='conv6_2_mbox_priorbox')(conv6_2_mbox_loc) + conv7_2_mbox_priorbox = AnchorBoxes(img_height, img_width, this_scale=scales[3], next_scale=scales[4], aspect_ratios=aspect_ratios[3], + two_boxes_for_ar1=two_boxes_for_ar1, this_steps=steps[3], this_offsets=offsets[3], clip_boxes=clip_boxes, + variances=variances, coords=coords, normalize_coords=normalize_coords, name='conv7_2_mbox_priorbox')(conv7_2_mbox_loc) + conv8_2_mbox_priorbox = AnchorBoxes(img_height, img_width, this_scale=scales[4], next_scale=scales[5], aspect_ratios=aspect_ratios[4], + two_boxes_for_ar1=two_boxes_for_ar1, this_steps=steps[4], this_offsets=offsets[4], clip_boxes=clip_boxes, + variances=variances, coords=coords, normalize_coords=normalize_coords, name='conv8_2_mbox_priorbox')(conv8_2_mbox_loc) + conv9_2_mbox_priorbox = AnchorBoxes(img_height, img_width, this_scale=scales[5], next_scale=scales[6], aspect_ratios=aspect_ratios[5], + two_boxes_for_ar1=two_boxes_for_ar1, 
this_steps=steps[5], this_offsets=offsets[5], clip_boxes=clip_boxes, + variances=variances, coords=coords, normalize_coords=normalize_coords, name='conv9_2_mbox_priorbox')(conv9_2_mbox_loc) + conv10_2_mbox_priorbox = AnchorBoxes(img_height, img_width, this_scale=scales[6], next_scale=scales[7], aspect_ratios=aspect_ratios[6], + two_boxes_for_ar1=two_boxes_for_ar1, this_steps=steps[6], this_offsets=offsets[6], clip_boxes=clip_boxes, + variances=variances, coords=coords, normalize_coords=normalize_coords, name='conv10_2_mbox_priorbox')(conv10_2_mbox_loc) + + ### Reshape + + # Reshape the class predictions, yielding 3D tensors of shape `(batch, height * width * n_boxes, n_classes)` + # We want the classes isolated in the last axis to perform softmax on them + conv4_3_norm_mbox_conf_reshape = Reshape((-1, n_classes), name='conv4_3_norm_mbox_conf_reshape')(conv4_3_norm_mbox_conf) + fc7_mbox_conf_reshape = Reshape((-1, n_classes), name='fc7_mbox_conf_reshape')(fc7_mbox_conf) + conv6_2_mbox_conf_reshape = Reshape((-1, n_classes), name='conv6_2_mbox_conf_reshape')(conv6_2_mbox_conf) + conv7_2_mbox_conf_reshape = Reshape((-1, n_classes), name='conv7_2_mbox_conf_reshape')(conv7_2_mbox_conf) + conv8_2_mbox_conf_reshape = Reshape((-1, n_classes), name='conv8_2_mbox_conf_reshape')(conv8_2_mbox_conf) + conv9_2_mbox_conf_reshape = Reshape((-1, n_classes), name='conv9_2_mbox_conf_reshape')(conv9_2_mbox_conf) + conv10_2_mbox_conf_reshape = Reshape((-1, n_classes), name='conv10_2_mbox_conf_reshape')(conv10_2_mbox_conf) + # Reshape the box predictions, yielding 3D tensors of shape `(batch, height * width * n_boxes, 4)` + # We want the four box coordinates isolated in the last axis to compute the smooth L1 loss + conv4_3_norm_mbox_loc_reshape = Reshape((-1, 4), name='conv4_3_norm_mbox_loc_reshape')(conv4_3_norm_mbox_loc) + fc7_mbox_loc_reshape = Reshape((-1, 4), name='fc7_mbox_loc_reshape')(fc7_mbox_loc) + conv6_2_mbox_loc_reshape = Reshape((-1, 4), name='conv6_2_mbox_loc_reshape')(conv6_2_mbox_loc) + conv7_2_mbox_loc_reshape = Reshape((-1, 4), name='conv7_2_mbox_loc_reshape')(conv7_2_mbox_loc) + conv8_2_mbox_loc_reshape = Reshape((-1, 4), name='conv8_2_mbox_loc_reshape')(conv8_2_mbox_loc) + conv9_2_mbox_loc_reshape = Reshape((-1, 4), name='conv9_2_mbox_loc_reshape')(conv9_2_mbox_loc) + conv10_2_mbox_loc_reshape = Reshape((-1, 4), name='conv10_2_mbox_loc_reshape')(conv10_2_mbox_loc) + # Reshape the anchor box tensors, yielding 3D tensors of shape `(batch, height * width * n_boxes, 8)` + conv4_3_norm_mbox_priorbox_reshape = Reshape((-1, 8), name='conv4_3_norm_mbox_priorbox_reshape')(conv4_3_norm_mbox_priorbox) + fc7_mbox_priorbox_reshape = Reshape((-1, 8), name='fc7_mbox_priorbox_reshape')(fc7_mbox_priorbox) + conv6_2_mbox_priorbox_reshape = Reshape((-1, 8), name='conv6_2_mbox_priorbox_reshape')(conv6_2_mbox_priorbox) + conv7_2_mbox_priorbox_reshape = Reshape((-1, 8), name='conv7_2_mbox_priorbox_reshape')(conv7_2_mbox_priorbox) + conv8_2_mbox_priorbox_reshape = Reshape((-1, 8), name='conv8_2_mbox_priorbox_reshape')(conv8_2_mbox_priorbox) + conv9_2_mbox_priorbox_reshape = Reshape((-1, 8), name='conv9_2_mbox_priorbox_reshape')(conv9_2_mbox_priorbox) + conv10_2_mbox_priorbox_reshape = Reshape((-1, 8), name='conv10_2_mbox_priorbox_reshape')(conv10_2_mbox_priorbox) + + ### Concatenate the predictions from the different layers + + # Axis 0 (batch) and axis 2 (n_classes or 4, respectively) are identical for all layer predictions, + # so we want to concatenate along axis 1, the number of boxes per layer + # Output 
shape of `mbox_conf`: (batch, n_boxes_total, n_classes) + mbox_conf = Concatenate(axis=1, name='mbox_conf')([conv4_3_norm_mbox_conf_reshape, + fc7_mbox_conf_reshape, + conv6_2_mbox_conf_reshape, + conv7_2_mbox_conf_reshape, + conv8_2_mbox_conf_reshape, + conv9_2_mbox_conf_reshape, + conv10_2_mbox_conf_reshape]) + + # Output shape of `mbox_loc`: (batch, n_boxes_total, 4) + mbox_loc = Concatenate(axis=1, name='mbox_loc')([conv4_3_norm_mbox_loc_reshape, + fc7_mbox_loc_reshape, + conv6_2_mbox_loc_reshape, + conv7_2_mbox_loc_reshape, + conv8_2_mbox_loc_reshape, + conv9_2_mbox_loc_reshape, + conv10_2_mbox_loc_reshape]) + + # Output shape of `mbox_priorbox`: (batch, n_boxes_total, 8) + mbox_priorbox = Concatenate(axis=1, name='mbox_priorbox')([conv4_3_norm_mbox_priorbox_reshape, + fc7_mbox_priorbox_reshape, + conv6_2_mbox_priorbox_reshape, + conv7_2_mbox_priorbox_reshape, + conv8_2_mbox_priorbox_reshape, + conv9_2_mbox_priorbox_reshape, + conv10_2_mbox_priorbox_reshape]) + + # The box coordinate predictions will go into the loss function just the way they are, + # but for the class predictions, we'll apply a softmax activation layer first + mbox_conf_softmax = Activation('softmax', name='mbox_conf_softmax')(mbox_conf) + + # Concatenate the class and box predictions and the anchors to one large predictions vector + # Output shape of `predictions`: (batch, n_boxes_total, n_classes + 4 + 8) + predictions = Concatenate(axis=2, name='predictions')([mbox_conf_softmax, mbox_loc, mbox_priorbox]) + + if mode == 'training': + model = Model(inputs=x, outputs=predictions) + elif mode == 'inference': + decoded_predictions = DecodeDetections(confidence_thresh=confidence_thresh, + iou_threshold=iou_threshold, + top_k=top_k, + nms_max_output_size=nms_max_output_size, + coords=coords, + normalize_coords=normalize_coords, + img_height=img_height, + img_width=img_width, + name='decoded_predictions')(predictions) + model = Model(inputs=x, outputs=decoded_predictions) + elif mode == 'inference_fast': + decoded_predictions = DecodeDetectionsFast(confidence_thresh=confidence_thresh, + iou_threshold=iou_threshold, + top_k=top_k, + nms_max_output_size=nms_max_output_size, + coords=coords, + normalize_coords=normalize_coords, + img_height=img_height, + img_width=img_width, + name='decoded_predictions')(predictions) + model = Model(inputs=x, outputs=decoded_predictions) + else: + raise ValueError("`mode` must be one of 'training', 'inference' or 'inference_fast', but received '{}'.".format(mode)) + + if return_predictor_sizes: + predictor_sizes = np.array([conv4_3_norm_mbox_conf._keras_shape[1:3], + fc7_mbox_conf._keras_shape[1:3], + conv6_2_mbox_conf._keras_shape[1:3], + conv7_2_mbox_conf._keras_shape[1:3], + conv8_2_mbox_conf._keras_shape[1:3], + conv9_2_mbox_conf._keras_shape[1:3], + conv10_2_mbox_conf._keras_shape[1:3]]) + return model, predictor_sizes + else: + return model diff --git a/keras_ssd/models/keras_ssd7.py b/keras_ssd/models/keras_ssd7.py new file mode 100644 index 0000000..5409599 --- /dev/null +++ b/keras_ssd/models/keras_ssd7.py @@ -0,0 +1,430 @@ +''' +A small 7-layer Keras model with SSD architecture. Also serves as a template to build arbitrary network architectures. + +Copyright (C) 2018 Pierluigi Ferrari + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. 
+You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +''' + +from __future__ import division +import numpy as np +from keras.models import Model +from keras.layers import Input, Lambda, Conv2D, MaxPooling2D, BatchNormalization, ELU, Reshape, Concatenate, Activation +from keras.regularizers import l2 +import keras.backend as K + +from keras_layers.keras_layer_AnchorBoxes import AnchorBoxes +from keras_layers.keras_layer_DecodeDetections import DecodeDetections +from keras_layers.keras_layer_DecodeDetectionsFast import DecodeDetectionsFast + +def build_model(image_size, + n_classes, + mode='training', + l2_regularization=0.0, + min_scale=0.1, + max_scale=0.9, + scales=None, + aspect_ratios_global=[0.5, 1.0, 2.0], + aspect_ratios_per_layer=None, + two_boxes_for_ar1=True, + steps=None, + offsets=None, + clip_boxes=False, + variances=[1.0, 1.0, 1.0, 1.0], + coords='centroids', + normalize_coords=False, + subtract_mean=None, + divide_by_stddev=None, + swap_channels=False, + confidence_thresh=0.01, + iou_threshold=0.45, + top_k=200, + nms_max_output_size=400, + return_predictor_sizes=False): + ''' + Build a Keras model with SSD architecture, see references. + + The model consists of convolutional feature layers and a number of convolutional + predictor layers that take their input from different feature layers. + The model is fully convolutional. + + The implementation found here is a smaller version of the original architecture + used in the paper (where the base network consists of a modified VGG-16 extended + by a few convolutional feature layers), but of course it could easily be changed to + an arbitrarily large SSD architecture by following the general design pattern used here. + This implementation has 7 convolutional layers and 4 convolutional predictor + layers that take their input from layers 4, 5, 6, and 7, respectively. + + Most of the arguments that this function takes are only needed for the anchor + box layers. In case you're training the network, the parameters passed here must + be the same as the ones used to set up `SSDBoxEncoder`. In case you're loading + trained weights, the parameters passed here must be the same as the ones used + to produce the trained weights. + + Some of these arguments are explained in more detail in the documentation of the + `SSDBoxEncoder` class. + + Note: Requires Keras v2.0 or later. Training currently works only with the + TensorFlow backend (v1.0 or later). + + Arguments: + image_size (tuple): The input image size in the format `(height, width, channels)`. + n_classes (int): The number of positive classes, e.g. 20 for Pascal VOC, 80 for MS COCO. + mode (str, optional): One of 'training', 'inference' and 'inference_fast'. In 'training' mode, + the model outputs the raw prediction tensor, while in 'inference' and 'inference_fast' modes, + the raw predictions are decoded into absolute coordinates and filtered via confidence thresholding, + non-maximum suppression, and top-k filtering. The difference between latter two modes is that + 'inference' follows the exact procedure of the original Caffe implementation, while + 'inference_fast' uses a faster prediction decoding procedure. 
+ l2_regularization (float, optional): The L2-regularization rate. Applies to all convolutional layers. + min_scale (float, optional): The smallest scaling factor for the size of the anchor boxes as a fraction + of the shorter side of the input images. + max_scale (float, optional): The largest scaling factor for the size of the anchor boxes as a fraction + of the shorter side of the input images. All scaling factors between the smallest and the + largest will be linearly interpolated. Note that the second to last of the linearly interpolated + scaling factors will actually be the scaling factor for the last predictor layer, while the last + scaling factor is used for the second box for aspect ratio 1 in the last predictor layer + if `two_boxes_for_ar1` is `True`. + scales (list, optional): A list of floats containing scaling factors per convolutional predictor layer. + This list must be one element longer than the number of predictor layers. The first `k` elements are the + scaling factors for the `k` predictor layers, while the last element is used for the second box + for aspect ratio 1 in the last predictor layer if `two_boxes_for_ar1` is `True`. This additional + last scaling factor must be passed either way, even if it is not being used. If a list is passed, + this argument overrides `min_scale` and `max_scale`. All scaling factors must be greater than zero. + aspect_ratios_global (list, optional): The list of aspect ratios for which anchor boxes are to be + generated. This list is valid for all predictor layers. The original implementation uses more aspect ratios + for some predictor layers and fewer for others. If you want to do that, too, then use the next argument instead. + aspect_ratios_per_layer (list, optional): A list containing one aspect ratio list for each predictor layer. + This allows you to set the aspect ratios for each predictor layer individually. If a list is passed, + it overrides `aspect_ratios_global`. + two_boxes_for_ar1 (bool, optional): Only relevant for aspect ratio lists that contain 1. Will be ignored otherwise. + If `True`, two anchor boxes will be generated for aspect ratio 1. The first will be generated + using the scaling factor for the respective layer, the second one will be generated using + geometric mean of said scaling factor and next bigger scaling factor. + steps (list, optional): `None` or a list with as many elements as there are predictor layers. The elements can be + either ints/floats or tuples of two ints/floats. These numbers represent for each predictor layer how many + pixels apart the anchor box center points should be vertically and horizontally along the spatial grid over + the image. If the list contains ints/floats, then that value will be used for both spatial dimensions. + If the list contains tuples of two ints/floats, then they represent `(step_height, step_width)`. + If no steps are provided, then they will be computed such that the anchor box center points will form an + equidistant grid within the image dimensions. + offsets (list, optional): `None` or a list with as many elements as there are predictor layers. The elements can be + either floats or tuples of two floats. These numbers represent for each predictor layer how many + pixels from the top and left boarders of the image the top-most and left-most anchor box center points should be + as a fraction of `steps`. The last bit is important: The offsets are not absolute pixel values, but fractions + of the step size specified in the `steps` argument. 
If the list contains floats, then that value will + be used for both spatial dimensions. If the list contains tuples of two floats, then they represent + `(vertical_offset, horizontal_offset)`. If no offsets are provided, then they will default to 0.5 of the step size, + which is also the recommended setting. + clip_boxes (bool, optional): If `True`, clips the anchor box coordinates to stay within image boundaries. + variances (list, optional): A list of 4 floats >0. The anchor box offset for each coordinate will be divided by + its respective variance value. + coords (str, optional): The box coordinate format to be used internally by the model (i.e. this is not the input format + of the ground truth labels). Can be either 'centroids' for the format `(cx, cy, w, h)` (box center coordinates, width, + and height), 'minmax' for the format `(xmin, xmax, ymin, ymax)`, or 'corners' for the format `(xmin, ymin, xmax, ymax)`. + normalize_coords (bool, optional): Set to `True` if the model is supposed to use relative instead of absolute coordinates, + i.e. if the model predicts box coordinates within [0,1] instead of absolute coordinates. + subtract_mean (array-like, optional): `None` or an array-like object of integers or floating point values + of any shape that is broadcast-compatible with the image shape. The elements of this array will be + subtracted from the image pixel intensity values. For example, pass a list of three integers + to perform per-channel mean normalization for color images. + divide_by_stddev (array-like, optional): `None` or an array-like object of non-zero integers or + floating point values of any shape that is broadcast-compatible with the image shape. The image pixel + intensity values will be divided by the elements of this array. For example, pass a list + of three integers to perform per-channel standard deviation normalization for color images. + swap_channels (list, optional): Either `False` or a list of integers representing the desired order in which the input + image channels should be swapped. + confidence_thresh (float, optional): A float in [0,1), the minimum classification confidence in a specific + positive class in order to be considered for the non-maximum suppression stage for the respective class. + A lower value will result in a larger part of the selection process being done by the non-maximum suppression + stage, while a larger value will result in a larger part of the selection process happening in the confidence + thresholding stage. + iou_threshold (float, optional): A float in [0,1]. All boxes that have a Jaccard similarity of greater than `iou_threshold` + with a locally maximal box will be removed from the set of predictions for a given class, where 'maximal' refers + to the box's confidence score. + top_k (int, optional): The number of highest scoring predictions to be kept for each batch item after the + non-maximum suppression stage. + nms_max_output_size (int, optional): The maximal number of predictions that will be left over after the NMS stage. + return_predictor_sizes (bool, optional): If `True`, this function not only returns the model, but also + a list containing the spatial dimensions of the predictor layers. This isn't strictly necessary since + you can always get their sizes easily via the Keras API, but it's convenient and less error-prone + to get them this way. They are only relevant for training anyway (SSDBoxEncoder needs to know the + spatial dimensions of the predictor layers), for inference you don't need them. 
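+
+ Example (editor's illustrative sketch, not from the original documentation; the image size
+ and class count are arbitrary assumptions):
+     model = build_model(image_size=(300, 480, 3), n_classes=5, mode='training')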
+
+ Returns:
+ model: The Keras SSD model.
+ predictor_sizes (optional): A Numpy array containing the `(height, width)` portion
+ of the output tensor shape for each convolutional predictor layer. During
+ training, the generator function needs this in order to transform
+ the ground truth labels into tensors of identical structure as the
+ output tensors of the model, which is in turn needed for the cost
+ function.
+
+ References:
+ https://arxiv.org/abs/1512.02325v5
+ '''
+
+ n_predictor_layers = 4 # The number of predictor conv layers in the network
+ n_classes += 1 # Account for the background class.
+ l2_reg = l2_regularization # Make the internal name shorter.
+ img_height, img_width, img_channels = image_size[0], image_size[1], image_size[2]
+
+ ############################################################################
+ # Get a few exceptions out of the way.
+ ############################################################################
+
+ if aspect_ratios_global is None and aspect_ratios_per_layer is None:
+ raise ValueError("`aspect_ratios_global` and `aspect_ratios_per_layer` cannot both be None. At least one needs to be specified.")
+ if aspect_ratios_per_layer:
+ if len(aspect_ratios_per_layer) != n_predictor_layers:
+ raise ValueError("Either `aspect_ratios_per_layer` must be None or it must contain exactly {} aspect ratio lists (one per predictor layer), but it contains {}.".format(n_predictor_layers, len(aspect_ratios_per_layer)))
+
+ if (min_scale is None or max_scale is None) and scales is None:
+ raise ValueError("Either `min_scale` and `max_scale` or `scales` need to be specified.")
+ if scales:
+ if len(scales) != n_predictor_layers+1:
+ raise ValueError("Either `scales` must be None or it must contain exactly {} elements (one per predictor layer plus one), but it contains {}.".format(n_predictor_layers+1, len(scales)))
+ else: # If no explicit list of scaling factors was passed, compute the list of scaling factors from `min_scale` and `max_scale`
+ scales = np.linspace(min_scale, max_scale, n_predictor_layers+1)
+
+ if len(variances) != 4: # We need one variance value for each of the four box coordinates
+ raise ValueError("Exactly 4 variance values must be passed, but {} values were received.".format(len(variances)))
+ variances = np.array(variances)
+ if np.any(variances <= 0):
+ raise ValueError("All variances must be >0, but the variances given are {}".format(variances))
+
+ if (not (steps is None)) and (len(steps) != n_predictor_layers):
+ raise ValueError("You must provide exactly one step value per predictor layer.")
+
+ if (not (offsets is None)) and (len(offsets) != n_predictor_layers):
+ raise ValueError("You must provide exactly one offset value per predictor layer.")
+
+ ############################################################################
+ # Compute the anchor box parameters.
+ ############################################################################
+
+ # Set the aspect ratios for each predictor layer. These are only needed for the anchor box layers.
+ if aspect_ratios_per_layer:
+ aspect_ratios = aspect_ratios_per_layer
+ else:
+ aspect_ratios = [aspect_ratios_global] * n_predictor_layers
+
+ # Compute the number of boxes to be predicted per cell for each predictor layer.
+ # We need this so that we know how many channels the predictor layers need to have.
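+ # Editor's note (illustrative, not part of the original code): with the defaults
+ # (min_scale=0.1, max_scale=0.9, aspect_ratios_global=[0.5, 1.0, 2.0], two_boxes_for_ar1=True),
+ # the `scales` computed above come out as [0.1, 0.3, 0.5, 0.7, 0.9] and the branch below
+ # yields n_boxes == [4, 4, 4, 4].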
+ if aspect_ratios_per_layer: + n_boxes = [] + for ar in aspect_ratios_per_layer: + if (1 in ar) & two_boxes_for_ar1: + n_boxes.append(len(ar) + 1) # +1 for the second box for aspect ratio 1 + else: + n_boxes.append(len(ar)) + else: # If only a global aspect ratio list was passed, then the number of boxes is the same for each predictor layer + if (1 in aspect_ratios_global) & two_boxes_for_ar1: + n_boxes = len(aspect_ratios_global) + 1 + else: + n_boxes = len(aspect_ratios_global) + n_boxes = [n_boxes] * n_predictor_layers + + if steps is None: + steps = [None] * n_predictor_layers + if offsets is None: + offsets = [None] * n_predictor_layers + + ############################################################################ + # Define functions for the Lambda layers below. + ############################################################################ + + def identity_layer(tensor): + return tensor + + def input_mean_normalization(tensor): + return tensor - np.array(subtract_mean) + + def input_stddev_normalization(tensor): + return tensor / np.array(divide_by_stddev) + + def input_channel_swap(tensor): + if len(swap_channels) == 3: + return K.stack([tensor[...,swap_channels[0]], tensor[...,swap_channels[1]], tensor[...,swap_channels[2]]], axis=-1) + elif len(swap_channels) == 4: + return K.stack([tensor[...,swap_channels[0]], tensor[...,swap_channels[1]], tensor[...,swap_channels[2]], tensor[...,swap_channels[3]]], axis=-1) + + ############################################################################ + # Build the network. + ############################################################################ + + x = Input(shape=(img_height, img_width, img_channels)) + + # The following identity layer is only needed so that the subsequent lambda layers can be optional. 
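To see what the `n_boxes` bookkeeping above produces, here is a self-contained sketch with made-up per-layer aspect ratio lists:

aspect_ratios_per_layer = [[1.0, 2.0, 0.5],
                           [1.0, 2.0, 0.5, 3.0, 1.0/3.0],
                           [1.0, 2.0, 0.5],
                           [1.0, 2.0, 0.5]]
two_boxes_for_ar1 = True

n_boxes = [len(ar) + 1 if (1 in ar and two_boxes_for_ar1) else len(ar)
           for ar in aspect_ratios_per_layer]
# n_boxes == [4, 6, 4, 4]: the first predictor layer then needs 4 * n_classes output
# channels for its `classes` head and 4 * 4 channels for its `boxes` head, and so on.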
+ x1 = Lambda(identity_layer, output_shape=(img_height, img_width, img_channels), name='identity_layer')(x) + if not (subtract_mean is None): + x1 = Lambda(input_mean_normalization, output_shape=(img_height, img_width, img_channels), name='input_mean_normalization')(x1) + if not (divide_by_stddev is None): + x1 = Lambda(input_stddev_normalization, output_shape=(img_height, img_width, img_channels), name='input_stddev_normalization')(x1) + if swap_channels: + x1 = Lambda(input_channel_swap, output_shape=(img_height, img_width, img_channels), name='input_channel_swap')(x1) + + conv1 = Conv2D(32, (5, 5), strides=(1, 1), padding="same", kernel_initializer='he_normal', kernel_regularizer=l2(l2_reg), name='conv1')(x1) + conv1 = BatchNormalization(axis=3, momentum=0.99, name='bn1')(conv1) # Tensorflow uses filter format [filter_height, filter_width, in_channels, out_channels], hence axis = 3 + conv1 = ELU(name='elu1')(conv1) + pool1 = MaxPooling2D(pool_size=(2, 2), name='pool1')(conv1) + + conv2 = Conv2D(48, (3, 3), strides=(1, 1), padding="same", kernel_initializer='he_normal', kernel_regularizer=l2(l2_reg), name='conv2')(pool1) + conv2 = BatchNormalization(axis=3, momentum=0.99, name='bn2')(conv2) + conv2 = ELU(name='elu2')(conv2) + pool2 = MaxPooling2D(pool_size=(2, 2), name='pool2')(conv2) + + conv3 = Conv2D(64, (3, 3), strides=(1, 1), padding="same", kernel_initializer='he_normal', kernel_regularizer=l2(l2_reg), name='conv3')(pool2) + conv3 = BatchNormalization(axis=3, momentum=0.99, name='bn3')(conv3) + conv3 = ELU(name='elu3')(conv3) + pool3 = MaxPooling2D(pool_size=(2, 2), name='pool3')(conv3) + + conv4 = Conv2D(64, (3, 3), strides=(1, 1), padding="same", kernel_initializer='he_normal', kernel_regularizer=l2(l2_reg), name='conv4')(pool3) + conv4 = BatchNormalization(axis=3, momentum=0.99, name='bn4')(conv4) + conv4 = ELU(name='elu4')(conv4) + pool4 = MaxPooling2D(pool_size=(2, 2), name='pool4')(conv4) + + conv5 = Conv2D(48, (3, 3), strides=(1, 1), padding="same", kernel_initializer='he_normal', kernel_regularizer=l2(l2_reg), name='conv5')(pool4) + conv5 = BatchNormalization(axis=3, momentum=0.99, name='bn5')(conv5) + conv5 = ELU(name='elu5')(conv5) + pool5 = MaxPooling2D(pool_size=(2, 2), name='pool5')(conv5) + + conv6 = Conv2D(48, (3, 3), strides=(1, 1), padding="same", kernel_initializer='he_normal', kernel_regularizer=l2(l2_reg), name='conv6')(pool5) + conv6 = BatchNormalization(axis=3, momentum=0.99, name='bn6')(conv6) + conv6 = ELU(name='elu6')(conv6) + pool6 = MaxPooling2D(pool_size=(2, 2), name='pool6')(conv6) + + conv7 = Conv2D(32, (3, 3), strides=(1, 1), padding="same", kernel_initializer='he_normal', kernel_regularizer=l2(l2_reg), name='conv7')(pool6) + conv7 = BatchNormalization(axis=3, momentum=0.99, name='bn7')(conv7) + conv7 = ELU(name='elu7')(conv7) + + # The next part is to add the convolutional predictor layers on top of the base network + # that we defined above. Note that I use the term "base network" differently than the paper does. + # To me, the base network is everything that is not convolutional predictor layers or anchor + # box layers. In this case we'll have four predictor layers, but of course you could + # easily rewrite this into an arbitrarily deep base network and add an arbitrary number of + # predictor layers on top of the base network by simply following the pattern shown here. + + # Build the convolutional predictor layers on top of conv layers 4, 5, 6, and 7. 
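Each of the six max-pooling layers above halves the spatial resolution, so the four predictor sources conv4 through conv7 see the input at 1/8, 1/16, 1/32, and 1/64 resolution respectively. A quick sketch for a hypothetical 480x640 input (the image size is chosen only for illustration):

img_height, img_width = 480, 640  # illustrative input size

def downsampled(size, n_poolings):
    # MaxPooling2D with pool size (2, 2) and the default 'valid' padding floors the division.
    for _ in range(n_poolings):
        size //= 2
    return size

predictor_grids = {name: (downsampled(img_height, n), downsampled(img_width, n))
                   for name, n in [('conv4', 3), ('conv5', 4), ('conv6', 5), ('conv7', 6)]}
# {'conv4': (60, 80), 'conv5': (30, 40), 'conv6': (15, 20), 'conv7': (7, 10)}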
+ # We build two predictor layers on top of each of these layers: One for class prediction (classification), one for box coordinate prediction (localization) + # We precidt `n_classes` confidence values for each box, hence the `classes` predictors have depth `n_boxes * n_classes` + # We predict 4 box coordinates for each box, hence the `boxes` predictors have depth `n_boxes * 4` + # Output shape of `classes`: `(batch, height, width, n_boxes * n_classes)` + classes4 = Conv2D(n_boxes[0] * n_classes, (3, 3), strides=(1, 1), padding="same", kernel_initializer='he_normal', kernel_regularizer=l2(l2_reg), name='classes4')(conv4) + classes5 = Conv2D(n_boxes[1] * n_classes, (3, 3), strides=(1, 1), padding="same", kernel_initializer='he_normal', kernel_regularizer=l2(l2_reg), name='classes5')(conv5) + classes6 = Conv2D(n_boxes[2] * n_classes, (3, 3), strides=(1, 1), padding="same", kernel_initializer='he_normal', kernel_regularizer=l2(l2_reg), name='classes6')(conv6) + classes7 = Conv2D(n_boxes[3] * n_classes, (3, 3), strides=(1, 1), padding="same", kernel_initializer='he_normal', kernel_regularizer=l2(l2_reg), name='classes7')(conv7) + # Output shape of `boxes`: `(batch, height, width, n_boxes * 4)` + boxes4 = Conv2D(n_boxes[0] * 4, (3, 3), strides=(1, 1), padding="same", kernel_initializer='he_normal', kernel_regularizer=l2(l2_reg), name='boxes4')(conv4) + boxes5 = Conv2D(n_boxes[1] * 4, (3, 3), strides=(1, 1), padding="same", kernel_initializer='he_normal', kernel_regularizer=l2(l2_reg), name='boxes5')(conv5) + boxes6 = Conv2D(n_boxes[2] * 4, (3, 3), strides=(1, 1), padding="same", kernel_initializer='he_normal', kernel_regularizer=l2(l2_reg), name='boxes6')(conv6) + boxes7 = Conv2D(n_boxes[3] * 4, (3, 3), strides=(1, 1), padding="same", kernel_initializer='he_normal', kernel_regularizer=l2(l2_reg), name='boxes7')(conv7) + + # Generate the anchor boxes + # Output shape of `anchors`: `(batch, height, width, n_boxes, 8)` + anchors4 = AnchorBoxes(img_height, img_width, this_scale=scales[0], next_scale=scales[1], aspect_ratios=aspect_ratios[0], + two_boxes_for_ar1=two_boxes_for_ar1, this_steps=steps[0], this_offsets=offsets[0], + clip_boxes=clip_boxes, variances=variances, coords=coords, normalize_coords=normalize_coords, name='anchors4')(boxes4) + anchors5 = AnchorBoxes(img_height, img_width, this_scale=scales[1], next_scale=scales[2], aspect_ratios=aspect_ratios[1], + two_boxes_for_ar1=two_boxes_for_ar1, this_steps=steps[1], this_offsets=offsets[1], + clip_boxes=clip_boxes, variances=variances, coords=coords, normalize_coords=normalize_coords, name='anchors5')(boxes5) + anchors6 = AnchorBoxes(img_height, img_width, this_scale=scales[2], next_scale=scales[3], aspect_ratios=aspect_ratios[2], + two_boxes_for_ar1=two_boxes_for_ar1, this_steps=steps[2], this_offsets=offsets[2], + clip_boxes=clip_boxes, variances=variances, coords=coords, normalize_coords=normalize_coords, name='anchors6')(boxes6) + anchors7 = AnchorBoxes(img_height, img_width, this_scale=scales[3], next_scale=scales[4], aspect_ratios=aspect_ratios[3], + two_boxes_for_ar1=two_boxes_for_ar1, this_steps=steps[3], this_offsets=offsets[3], + clip_boxes=clip_boxes, variances=variances, coords=coords, normalize_coords=normalize_coords, name='anchors7')(boxes7) + + # Reshape the class predictions, yielding 3D tensors of shape `(batch, height * width * n_boxes, n_classes)` + # We want the classes isolated in the last axis to perform softmax on them + classes4_reshaped = Reshape((-1, n_classes), name='classes4_reshape')(classes4) + 
classes5_reshaped = Reshape((-1, n_classes), name='classes5_reshape')(classes5) + classes6_reshaped = Reshape((-1, n_classes), name='classes6_reshape')(classes6) + classes7_reshaped = Reshape((-1, n_classes), name='classes7_reshape')(classes7) + # Reshape the box coordinate predictions, yielding 3D tensors of shape `(batch, height * width * n_boxes, 4)` + # We want the four box coordinates isolated in the last axis to compute the smooth L1 loss + boxes4_reshaped = Reshape((-1, 4), name='boxes4_reshape')(boxes4) + boxes5_reshaped = Reshape((-1, 4), name='boxes5_reshape')(boxes5) + boxes6_reshaped = Reshape((-1, 4), name='boxes6_reshape')(boxes6) + boxes7_reshaped = Reshape((-1, 4), name='boxes7_reshape')(boxes7) + # Reshape the anchor box tensors, yielding 3D tensors of shape `(batch, height * width * n_boxes, 8)` + anchors4_reshaped = Reshape((-1, 8), name='anchors4_reshape')(anchors4) + anchors5_reshaped = Reshape((-1, 8), name='anchors5_reshape')(anchors5) + anchors6_reshaped = Reshape((-1, 8), name='anchors6_reshape')(anchors6) + anchors7_reshaped = Reshape((-1, 8), name='anchors7_reshape')(anchors7) + + # Concatenate the predictions from the different layers and the assosciated anchor box tensors + # Axis 0 (batch) and axis 2 (n_classes or 4, respectively) are identical for all layer predictions, + # so we want to concatenate along axis 1 + # Output shape of `classes_concat`: (batch, n_boxes_total, n_classes) + classes_concat = Concatenate(axis=1, name='classes_concat')([classes4_reshaped, + classes5_reshaped, + classes6_reshaped, + classes7_reshaped]) + + # Output shape of `boxes_concat`: (batch, n_boxes_total, 4) + boxes_concat = Concatenate(axis=1, name='boxes_concat')([boxes4_reshaped, + boxes5_reshaped, + boxes6_reshaped, + boxes7_reshaped]) + + # Output shape of `anchors_concat`: (batch, n_boxes_total, 8) + anchors_concat = Concatenate(axis=1, name='anchors_concat')([anchors4_reshaped, + anchors5_reshaped, + anchors6_reshaped, + anchors7_reshaped]) + + # The box coordinate predictions will go into the loss function just the way they are, + # but for the class predictions, we'll apply a softmax activation layer first + classes_softmax = Activation('softmax', name='classes_softmax')(classes_concat) + + # Concatenate the class and box coordinate predictions and the anchors to one large predictions tensor + # Output shape of `predictions`: (batch, n_boxes_total, n_classes + 4 + 8) + predictions = Concatenate(axis=2, name='predictions')([classes_softmax, boxes_concat, anchors_concat]) + + if mode == 'training': + model = Model(inputs=x, outputs=predictions) + elif mode == 'inference': + decoded_predictions = DecodeDetections(confidence_thresh=confidence_thresh, + iou_threshold=iou_threshold, + top_k=top_k, + nms_max_output_size=nms_max_output_size, + coords=coords, + normalize_coords=normalize_coords, + img_height=img_height, + img_width=img_width, + name='decoded_predictions')(predictions) + model = Model(inputs=x, outputs=decoded_predictions) + elif mode == 'inference_fast': + decoded_predictions = DecodeDetectionsFast(confidence_thresh=confidence_thresh, + iou_threshold=iou_threshold, + top_k=top_k, + nms_max_output_size=nms_max_output_size, + coords=coords, + normalize_coords=normalize_coords, + img_height=img_height, + img_width=img_width, + name='decoded_predictions')(predictions) + model = Model(inputs=x, outputs=decoded_predictions) + else: + raise ValueError("`mode` must be one of 'training', 'inference' or 'inference_fast', but received '{}'.".format(mode)) + + if 
return_predictor_sizes: + # The spatial dimensions are the same for the `classes` and `boxes` predictor layers. + predictor_sizes = np.array([classes4._keras_shape[1:3], + classes5._keras_shape[1:3], + classes6._keras_shape[1:3], + classes7._keras_shape[1:3]]) + return model, predictor_sizes + else: + return model diff --git a/keras_ssd/ssd_encoder_decoder/__init__.py b/keras_ssd/ssd_encoder_decoder/__init__.py new file mode 100644 index 0000000..e69de29 diff --git a/keras_ssd/ssd_encoder_decoder/matching_utils.py b/keras_ssd/ssd_encoder_decoder/matching_utils.py new file mode 100644 index 0000000..f1fcc90 --- /dev/null +++ b/keras_ssd/ssd_encoder_decoder/matching_utils.py @@ -0,0 +1,116 @@ +''' +Utilities to match ground truth boxes to anchor boxes. + +Copyright (C) 2018 Pierluigi Ferrari + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +''' + +from __future__ import division +import numpy as np + +def match_bipartite_greedy(weight_matrix): + ''' + Returns a bipartite matching according to the given weight matrix. + + The algorithm works as follows: + + Let the first axis of `weight_matrix` represent ground truth boxes + and the second axis anchor boxes. + The ground truth box that has the greatest similarity with any + anchor box will be matched first, then out of the remaining ground + truth boxes, the ground truth box that has the greatest similarity + with any of the remaining anchor boxes will be matched second, and + so on. That is, the ground truth boxes will be matched in descending + order by maximum similarity with any of the respectively remaining + anchor boxes. + The runtime complexity is O(m^2 * n), where `m` is the number of + ground truth boxes and `n` is the number of anchor boxes. + + Arguments: + weight_matrix (array): A 2D Numpy array that represents the weight matrix + for the matching process. If `(m,n)` is the shape of the weight matrix, + it must be `m <= n`. The weights can be integers or floating point + numbers. The matching process will maximize, i.e. larger weights are + preferred over smaller weights. + + Returns: + A 1D Numpy array of length `weight_matrix.shape[0]` that represents + the matched index along the second axis of `weight_matrix` for each index + along the first axis. + ''' + + weight_matrix = np.copy(weight_matrix) # We'll modify this array. + num_ground_truth_boxes = weight_matrix.shape[0] + all_gt_indices = list(range(num_ground_truth_boxes)) # Only relevant for fancy-indexing below. + + # This 1D array will contain for each ground truth box the index of + # the matched anchor box. + matches = np.zeros(num_ground_truth_boxes, dtype=np.int) + + # In each iteration of the loop below, exactly one ground truth box + # will be matched to one anchor box. + for _ in range(num_ground_truth_boxes): + + # Find the maximal anchor-ground truth pair in two steps: First, reduce + # over the anchor boxes and then reduce over the ground truth boxes. + anchor_indices = np.argmax(weight_matrix, axis=1) # Reduce along the anchor box axis. 
+ overlaps = weight_matrix[all_gt_indices, anchor_indices] + ground_truth_index = np.argmax(overlaps) # Reduce along the ground truth box axis. + anchor_index = anchor_indices[ground_truth_index] + matches[ground_truth_index] = anchor_index # Set the match. + + # Set the row of the matched ground truth box and the column of the matched + # anchor box to all zeros. This ensures that those boxes will not be matched again, + # because they will never be the best matches for any other boxes. + weight_matrix[ground_truth_index] = 0 + weight_matrix[:,anchor_index] = 0 + + return matches + +def match_multi(weight_matrix, threshold): + ''' + Matches all elements along the second axis of `weight_matrix` to their best + matches along the first axis subject to the constraint that the weight of a match + must be greater than or equal to `threshold` in order to produce a match. + + If the weight matrix contains elements that should be ignored, the row or column + representing the respective elemet should be set to a value below `threshold`. + + Arguments: + weight_matrix (array): A 2D Numpy array that represents the weight matrix + for the matching process. If `(m,n)` is the shape of the weight matrix, + it must be `m <= n`. The weights can be integers or floating point + numbers. The matching process will maximize, i.e. larger weights are + preferred over smaller weights. + threshold (float): A float that represents the threshold (i.e. lower bound) + that must be met by a pair of elements to produce a match. + + Returns: + Two 1D Numpy arrays of equal length that represent the matched indices. The first + array contains the indices along the first axis of `weight_matrix`, the second array + contains the indices along the second axis. + ''' + + num_anchor_boxes = weight_matrix.shape[1] + all_anchor_indices = list(range(num_anchor_boxes)) # Only relevant for fancy-indexing below. + + # Find the best ground truth match for every anchor box. + ground_truth_indices = np.argmax(weight_matrix, axis=0) # Array of shape (weight_matrix.shape[1],) + overlaps = weight_matrix[ground_truth_indices, all_anchor_indices] # Array of shape (weight_matrix.shape[1],) + + # Filter out the matches with a weight below the threshold. + anchor_indices_thresh_met = np.nonzero(overlaps >= threshold)[0] + gt_indices_thresh_met = ground_truth_indices[anchor_indices_thresh_met] + + return gt_indices_thresh_met, anchor_indices_thresh_met diff --git a/keras_ssd/ssd_encoder_decoder/ssd_input_encoder.py b/keras_ssd/ssd_encoder_decoder/ssd_input_encoder.py new file mode 100644 index 0000000..15fbb53 --- /dev/null +++ b/keras_ssd/ssd_encoder_decoder/ssd_input_encoder.py @@ -0,0 +1,617 @@ +''' +An encoder that converts ground truth annotations to SSD-compatible training targets. + +Copyright (C) 2018 Pierluigi Ferrari + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
+''' + +from __future__ import division +import numpy as np + +from bounding_box_utils.bounding_box_utils import iou, convert_coordinates +from ssd_encoder_decoder.matching_utils import match_bipartite_greedy, match_multi + +class SSDInputEncoder: + ''' + Transforms ground truth labels for object detection in images + (2D bounding box coordinates and class labels) to the format required for + training an SSD model. + + In the process of encoding the ground truth labels, a template of anchor boxes + is being built, which are subsequently matched to the ground truth boxes + via an intersection-over-union threshold criterion. + ''' + + def __init__(self, + img_height, + img_width, + n_classes, + predictor_sizes, + min_scale=0.1, + max_scale=0.9, + scales=None, + aspect_ratios_global=[0.5, 1.0, 2.0], + aspect_ratios_per_layer=None, + two_boxes_for_ar1=True, + steps=None, + offsets=None, + clip_boxes=False, + variances=[0.1, 0.1, 0.2, 0.2], + matching_type='multi', + pos_iou_threshold=0.5, + neg_iou_limit=0.3, + border_pixels='half', + coords='centroids', + normalize_coords=True, + background_id=0): + ''' + Arguments: + img_height (int): The height of the input images. + img_width (int): The width of the input images. + n_classes (int): The number of positive classes, e.g. 20 for Pascal VOC, 80 for MS COCO. + predictor_sizes (list): A list of int-tuples of the format `(height, width)` + containing the output heights and widths of the convolutional predictor layers. + min_scale (float, optional): The smallest scaling factor for the size of the anchor boxes as a fraction + of the shorter side of the input images. Note that you should set the scaling factors + such that the resulting anchor box sizes correspond to the sizes of the objects you are trying + to detect. Must be >0. + max_scale (float, optional): The largest scaling factor for the size of the anchor boxes as a fraction + of the shorter side of the input images. All scaling factors between the smallest and the + largest will be linearly interpolated. Note that the second to last of the linearly interpolated + scaling factors will actually be the scaling factor for the last predictor layer, while the last + scaling factor is used for the second box for aspect ratio 1 in the last predictor layer + if `two_boxes_for_ar1` is `True`. Note that you should set the scaling factors + such that the resulting anchor box sizes correspond to the sizes of the objects you are trying + to detect. Must be greater than or equal to `min_scale`. + scales (list, optional): A list of floats >0 containing scaling factors per convolutional predictor layer. + This list must be one element longer than the number of predictor layers. The first `k` elements are the + scaling factors for the `k` predictor layers, while the last element is used for the second box + for aspect ratio 1 in the last predictor layer if `two_boxes_for_ar1` is `True`. This additional + last scaling factor must be passed either way, even if it is not being used. If a list is passed, + this argument overrides `min_scale` and `max_scale`. All scaling factors must be greater than zero. + Note that you should set the scaling factors such that the resulting anchor box sizes correspond to + the sizes of the objects you are trying to detect. + aspect_ratios_global (list, optional): The list of aspect ratios for which anchor boxes are to be + generated. This list is valid for all prediction layers. 
Note that you should set the aspect ratios such + that the resulting anchor box shapes roughly correspond to the shapes of the objects you are trying to detect. + aspect_ratios_per_layer (list, optional): A list containing one aspect ratio list for each prediction layer. + If a list is passed, it overrides `aspect_ratios_global`. Note that you should set the aspect ratios such + that the resulting anchor box shapes very roughly correspond to the shapes of the objects you are trying to detect. + two_boxes_for_ar1 (bool, optional): Only relevant for aspect ratios lists that contain 1. Will be ignored otherwise. + If `True`, two anchor boxes will be generated for aspect ratio 1. The first will be generated + using the scaling factor for the respective layer, the second one will be generated using + geometric mean of said scaling factor and next bigger scaling factor. + steps (list, optional): `None` or a list with as many elements as there are predictor layers. The elements can be + either ints/floats or tuples of two ints/floats. These numbers represent for each predictor layer how many + pixels apart the anchor box center points should be vertically and horizontally along the spatial grid over + the image. If the list contains ints/floats, then that value will be used for both spatial dimensions. + If the list contains tuples of two ints/floats, then they represent `(step_height, step_width)`. + If no steps are provided, then they will be computed such that the anchor box center points will form an + equidistant grid within the image dimensions. + offsets (list, optional): `None` or a list with as many elements as there are predictor layers. The elements can be + either floats or tuples of two floats. These numbers represent for each predictor layer how many + pixels from the top and left boarders of the image the top-most and left-most anchor box center points should be + as a fraction of `steps`. The last bit is important: The offsets are not absolute pixel values, but fractions + of the step size specified in the `steps` argument. If the list contains floats, then that value will + be used for both spatial dimensions. If the list contains tuples of two floats, then they represent + `(vertical_offset, horizontal_offset)`. If no offsets are provided, then they will default to 0.5 of the step size. + clip_boxes (bool, optional): If `True`, limits the anchor box coordinates to stay within image boundaries. + variances (list, optional): A list of 4 floats >0. The anchor box offset for each coordinate will be divided by + its respective variance value. + matching_type (str, optional): Can be either 'multi' or 'bipartite'. In 'bipartite' mode, each ground truth box will + be matched only to the one anchor box with the highest IoU overlap. In 'multi' mode, in addition to the aforementioned + bipartite matching, all anchor boxes with an IoU overlap greater than or equal to the `pos_iou_threshold` will be + matched to a given ground truth box. + pos_iou_threshold (float, optional): The intersection-over-union similarity threshold that must be + met in order to match a given ground truth box to a given anchor box. + neg_iou_limit (float, optional): The maximum allowed intersection-over-union similarity of an + anchor box with any ground truth box to be labeled a negative (i.e. background) box. If an + anchor box is neither a positive, nor a negative box, it will be ignored during training. + border_pixels (str, optional): How to treat the border pixels of the bounding boxes. 
+ Can be 'include', 'exclude', or 'half'. If 'include', the border pixels belong + to the boxes. If 'exclude', the border pixels do not belong to the boxes. + If 'half', then one of each of the two horizontal and vertical borders belong + to the boxex, but not the other. + coords (str, optional): The box coordinate format to be used internally by the model (i.e. this is not the input format + of the ground truth labels). Can be either 'centroids' for the format `(cx, cy, w, h)` (box center coordinates, width, + and height), 'minmax' for the format `(xmin, xmax, ymin, ymax)`, or 'corners' for the format `(xmin, ymin, xmax, ymax)`. + normalize_coords (bool, optional): If `True`, the encoder uses relative instead of absolute coordinates. + This means instead of using absolute tartget coordinates, the encoder will scale all coordinates to be within [0,1]. + This way learning becomes independent of the input image size. + background_id (int, optional): Determines which class ID is for the background class. + ''' + predictor_sizes = np.array(predictor_sizes) + if predictor_sizes.ndim == 1: + predictor_sizes = np.expand_dims(predictor_sizes, axis=0) + + ################################################################################## + # Handle exceptions. + ################################################################################## + + if (min_scale is None or max_scale is None) and scales is None: + raise ValueError("Either `min_scale` and `max_scale` or `scales` need to be specified.") + + if scales: + if (len(scales) != predictor_sizes.shape[0] + 1): # Must be two nested `if` statements since `list` and `bool` cannot be combined by `&` + raise ValueError("It must be either scales is None or len(scales) == len(predictor_sizes)+1, but len(scales) == {} and len(predictor_sizes)+1 == {}".format(len(scales), len(predictor_sizes)+1)) + scales = np.array(scales) + if np.any(scales <= 0): + raise ValueError("All values in `scales` must be greater than 0, but the passed list of scales is {}".format(scales)) + else: # If no list of scales was passed, we need to make sure that `min_scale` and `max_scale` are valid values. 
+ if not 0 < min_scale <= max_scale: + raise ValueError("It must be 0 < min_scale <= max_scale, but it is min_scale = {} and max_scale = {}".format(min_scale, max_scale)) + + if not (aspect_ratios_per_layer is None): + if (len(aspect_ratios_per_layer) != predictor_sizes.shape[0]): # Must be two nested `if` statements since `list` and `bool` cannot be combined by `&` + raise ValueError("It must be either aspect_ratios_per_layer is None or len(aspect_ratios_per_layer) == len(predictor_sizes), but len(aspect_ratios_per_layer) == {} and len(predictor_sizes) == {}".format(len(aspect_ratios_per_layer), len(predictor_sizes))) + for aspect_ratios in aspect_ratios_per_layer: + if np.any(np.array(aspect_ratios) <= 0): + raise ValueError("All aspect ratios must be greater than zero.") + else: + if (aspect_ratios_global is None): + raise ValueError("At least one of `aspect_ratios_global` and `aspect_ratios_per_layer` must not be `None`.") + if np.any(np.array(aspect_ratios_global) <= 0): + raise ValueError("All aspect ratios must be greater than zero.") + + if len(variances) != 4: + raise ValueError("4 variance values must be pased, but {} values were received.".format(len(variances))) + variances = np.array(variances) + if np.any(variances <= 0): + raise ValueError("All variances must be >0, but the variances given are {}".format(variances)) + + if not (coords == 'minmax' or coords == 'centroids' or coords == 'corners'): + raise ValueError("Unexpected value for `coords`. Supported values are 'minmax', 'corners' and 'centroids'.") + + if (not (steps is None)) and (len(steps) != predictor_sizes.shape[0]): + raise ValueError("You must provide at least one step value per predictor layer.") + + if (not (offsets is None)) and (len(offsets) != predictor_sizes.shape[0]): + raise ValueError("You must provide at least one offset value per predictor layer.") + + ################################################################################## + # Set or compute members. + ################################################################################## + + self.img_height = img_height + self.img_width = img_width + self.n_classes = n_classes + 1 # + 1 for the background class + self.predictor_sizes = predictor_sizes + self.min_scale = min_scale + self.max_scale = max_scale + # If `scales` is None, compute the scaling factors by linearly interpolating between + # `min_scale` and `max_scale`. If an explicit list of `scales` is given, however, + # then it takes precedent over `min_scale` and `max_scale`. + if (scales is None): + self.scales = np.linspace(self.min_scale, self.max_scale, len(self.predictor_sizes)+1) + else: + # If a list of scales is given explicitly, we'll use that instead of computing it from `min_scale` and `max_scale`. + self.scales = scales + # If `aspect_ratios_per_layer` is None, then we use the same list of aspect ratios + # `aspect_ratios_global` for all predictor layers. If `aspect_ratios_per_layer` is given, + # however, then it takes precedent over `aspect_ratios_global`. + if (aspect_ratios_per_layer is None): + self.aspect_ratios = [aspect_ratios_global] * predictor_sizes.shape[0] + else: + # If aspect ratios are given per layer, we'll use those. 
+ self.aspect_ratios = aspect_ratios_per_layer + self.two_boxes_for_ar1 = two_boxes_for_ar1 + if not (steps is None): + self.steps = steps + else: + self.steps = [None] * predictor_sizes.shape[0] + if not (offsets is None): + self.offsets = offsets + else: + self.offsets = [None] * predictor_sizes.shape[0] + self.clip_boxes = clip_boxes + self.variances = variances + self.matching_type = matching_type + self.pos_iou_threshold = pos_iou_threshold + self.neg_iou_limit = neg_iou_limit + self.border_pixels = border_pixels + self.coords = coords + self.normalize_coords = normalize_coords + self.background_id = background_id + + # Compute the number of boxes per spatial location for each predictor layer. + # For example, if a predictor layer has three different aspect ratios, [1.0, 0.5, 2.0], and is + # supposed to predict two boxes of slightly different size for aspect ratio 1.0, then that predictor + # layer predicts a total of four boxes at every spatial location across the feature map. + if not (aspect_ratios_per_layer is None): + self.n_boxes = [] + for aspect_ratios in aspect_ratios_per_layer: + if (1 in aspect_ratios) & two_boxes_for_ar1: + self.n_boxes.append(len(aspect_ratios) + 1) + else: + self.n_boxes.append(len(aspect_ratios)) + else: + if (1 in aspect_ratios_global) & two_boxes_for_ar1: + self.n_boxes = len(aspect_ratios_global) + 1 + else: + self.n_boxes = len(aspect_ratios_global) + + ################################################################################## + # Compute the anchor boxes for each predictor layer. + ################################################################################## + + # Compute the anchor boxes for each predictor layer. We only have to do this once + # since the anchor boxes depend only on the model configuration, not on the input data. + # For each predictor layer (i.e. for each scaling factor) the tensors for that layer's + # anchor boxes will have the shape `(feature_map_height, feature_map_width, n_boxes, 4)`. + + self.boxes_list = [] # This will store the anchor boxes for each predicotr layer. + + # The following lists just store diagnostic information. Sometimes it's handy to have the + # boxes' center points, heights, widths, etc. in a list. + self.wh_list_diag = [] # Box widths and heights for each predictor layer + self.steps_diag = [] # Horizontal and vertical distances between any two boxes for each predictor layer + self.offsets_diag = [] # Offsets for each predictor layer + self.centers_diag = [] # Anchor box center points as `(cy, cx)` for each predictor layer + + # Iterate over all predictor layers and compute the anchor boxes for each one. + for i in range(len(self.predictor_sizes)): + boxes, center, wh, step, offset = self.generate_anchor_boxes_for_layer(feature_map_size=self.predictor_sizes[i], + aspect_ratios=self.aspect_ratios[i], + this_scale=self.scales[i], + next_scale=self.scales[i+1], + this_steps=self.steps[i], + this_offsets=self.offsets[i], + diagnostics=True) + self.boxes_list.append(boxes) + self.wh_list_diag.append(wh) + self.steps_diag.append(step) + self.offsets_diag.append(offset) + self.centers_diag.append(center) + + def __call__(self, ground_truth_labels, diagnostics=False): + ''' + Converts ground truth bounding box data into a suitable format to train an SSD model. + + Arguments: + ground_truth_labels (list): A python list of length `batch_size` that contains one 2D Numpy array + for each batch image. 
Each such array has `k` rows for the `k` ground truth bounding boxes belonging + to the respective image, and the data for each ground truth bounding box has the format + `(class_id, xmin, ymin, xmax, ymax)` (i.e. the 'corners' coordinate format), and `class_id` must be + an integer greater than 0 for all boxes as class ID 0 is reserved for the background class. + diagnostics (bool, optional): If `True`, not only the encoded ground truth tensor will be returned, + but also a copy of it with anchor box coordinates in place of the ground truth coordinates. + This can be very useful if you want to visualize which anchor boxes got matched to which ground truth + boxes. + + Returns: + `y_encoded`, a 3D numpy array of shape `(batch_size, #boxes, #classes + 4 + 4 + 4)` that serves as the + ground truth label tensor for training, where `#boxes` is the total number of boxes predicted by the + model per image, and the classes are one-hot-encoded. The four elements after the class vecotrs in + the last axis are the box coordinates, the next four elements after that are just dummy elements, and + the last four elements are the variances. + ''' + + # Mapping to define which indices represent which coordinates in the ground truth. + class_id = 0 + xmin = 1 + ymin = 2 + xmax = 3 + ymax = 4 + + batch_size = len(ground_truth_labels) + + ################################################################################## + # Generate the template for y_encoded. + ################################################################################## + + y_encoded = self.generate_encoding_template(batch_size=batch_size, diagnostics=False) + + ################################################################################## + # Match ground truth boxes to anchor boxes. + ################################################################################## + + # Match the ground truth boxes to the anchor boxes. Every anchor box that does not have + # a ground truth match and for which the maximal IoU overlap with any ground truth box is less + # than or equal to `neg_iou_limit` will be a negative (background) box. + + y_encoded[:, :, self.background_id] = 1 # All boxes are background boxes by default. + n_boxes = y_encoded.shape[1] # The total number of boxes that the model predicts per batch item + class_vectors = np.eye(self.n_classes) # An identity matrix that we'll use as one-hot class vectors + + for i in range(batch_size): # For each batch item... + + if ground_truth_labels[i].size == 0: continue # If there is no ground truth for this batch item, there is nothing to match. + labels = ground_truth_labels[i].astype(np.float) # The labels for this batch item + + # Check for degenerate ground truth bounding boxes before attempting any computations. + if np.any(labels[:,[xmax]] - labels[:,[xmin]] <= 0) or np.any(labels[:,[ymax]] - labels[:,[ymin]] <= 0): + raise DegenerateBoxError("SSDInputEncoder detected degenerate ground truth bounding boxes for batch item {} with bounding boxes {}, ".format(i, labels) + + "i.e. bounding boxes where xmax <= xmin and/or ymax <= ymin. Degenerate ground truth " + + "bounding boxes will lead to NaN errors during the training.") + + # Maybe normalize the box coordinates. + if self.normalize_coords: + labels[:,[ymin,ymax]] /= self.img_height # Normalize ymin and ymax relative to the image height + labels[:,[xmin,xmax]] /= self.img_width # Normalize xmin and xmax relative to the image width + + # Maybe convert the box coordinate format. 
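A numeric illustration of the label preparation steps above, for a single made-up box on a made-up 300x300 image:

import numpy as np

img_height, img_width = 300, 300
label = np.array([[8., 30., 60., 150., 210.]])  # (class_id, xmin, ymin, xmax, ymax)

label[:, [2, 4]] /= img_height  # normalize ymin, ymax
label[:, [1, 3]] /= img_width   # normalize xmin, xmax
# label is now [[8, 0.1, 0.2, 0.5, 0.7]]; converted to centroids (cx, cy, w, h) this
# box becomes roughly (0.3, 0.45, 0.4, 0.5), which is what the
# `convert_coordinates(..., conversion='corners2centroids')` call below produces
# (up to the border-pixel convention).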
+ if self.coords == 'centroids': + labels = convert_coordinates(labels, start_index=xmin, conversion='corners2centroids', border_pixels=self.border_pixels) + elif self.coords == 'minmax': + labels = convert_coordinates(labels, start_index=xmin, conversion='corners2minmax') + + classes_one_hot = class_vectors[labels[:, class_id].astype(np.int)] # The one-hot class IDs for the ground truth boxes of this batch item + labels_one_hot = np.concatenate([classes_one_hot, labels[:, [xmin,ymin,xmax,ymax]]], axis=-1) # The one-hot version of the labels for this batch item + + # Compute the IoU similarities between all anchor boxes and all ground truth boxes for this batch item. + # This is a matrix of shape `(num_ground_truth_boxes, num_anchor_boxes)`. + similarities = iou(labels[:,[xmin,ymin,xmax,ymax]], y_encoded[i,:,-12:-8], coords=self.coords, mode='outer_product', border_pixels=self.border_pixels) + + # First: Do bipartite matching, i.e. match each ground truth box to the one anchor box with the highest IoU. + # This ensures that each ground truth box will have at least one good match. + + # For each ground truth box, get the anchor box to match with it. + bipartite_matches = match_bipartite_greedy(weight_matrix=similarities) + + # Write the ground truth data to the matched anchor boxes. + y_encoded[i, bipartite_matches, :-8] = labels_one_hot + + # Set the columns of the matched anchor boxes to zero to indicate that they were matched. + similarities[:, bipartite_matches] = 0 + + # Second: Maybe do 'multi' matching, where each remaining anchor box will be matched to its most similar + # ground truth box with an IoU of at least `pos_iou_threshold`, or not matched if there is no + # such ground truth box. + + if self.matching_type == 'multi': + + # Get all matches that satisfy the IoU threshold. + matches = match_multi(weight_matrix=similarities, threshold=self.pos_iou_threshold) + + # Write the ground truth data to the matched anchor boxes. + y_encoded[i, matches[1], :-8] = labels_one_hot[matches[0]] + + # Set the columns of the matched anchor boxes to zero to indicate that they were matched. + similarities[:, matches[1]] = 0 + + # Third: Now after the matching is done, all negative (background) anchor boxes that have + # an IoU of `neg_iou_limit` or more with any ground truth box will be set to netral, + # i.e. they will no longer be background boxes. These anchors are "too close" to a + # ground truth box to be valid background boxes. + + max_background_similarities = np.amax(similarities, axis=0) + neutral_boxes = np.nonzero(max_background_similarities >= self.neg_iou_limit)[0] + y_encoded[i, neutral_boxes, self.background_id] = 0 + + ################################################################################## + # Convert box coordinates to anchor box offsets. 
+ ################################################################################## + + if self.coords == 'centroids': + y_encoded[:,:,[-12,-11]] -= y_encoded[:,:,[-8,-7]] # cx(gt) - cx(anchor), cy(gt) - cy(anchor) + y_encoded[:,:,[-12,-11]] /= y_encoded[:,:,[-6,-5]] * y_encoded[:,:,[-4,-3]] # (cx(gt) - cx(anchor)) / w(anchor) / cx_variance, (cy(gt) - cy(anchor)) / h(anchor) / cy_variance + y_encoded[:,:,[-10,-9]] /= y_encoded[:,:,[-6,-5]] # w(gt) / w(anchor), h(gt) / h(anchor) + y_encoded[:,:,[-10,-9]] = np.log(y_encoded[:,:,[-10,-9]]) / y_encoded[:,:,[-2,-1]] # ln(w(gt) / w(anchor)) / w_variance, ln(h(gt) / h(anchor)) / h_variance (ln == natural logarithm) + elif self.coords == 'corners': + y_encoded[:,:,-12:-8] -= y_encoded[:,:,-8:-4] # (gt - anchor) for all four coordinates + y_encoded[:,:,[-12,-10]] /= np.expand_dims(y_encoded[:,:,-6] - y_encoded[:,:,-8], axis=-1) # (xmin(gt) - xmin(anchor)) / w(anchor), (xmax(gt) - xmax(anchor)) / w(anchor) + y_encoded[:,:,[-11,-9]] /= np.expand_dims(y_encoded[:,:,-5] - y_encoded[:,:,-7], axis=-1) # (ymin(gt) - ymin(anchor)) / h(anchor), (ymax(gt) - ymax(anchor)) / h(anchor) + y_encoded[:,:,-12:-8] /= y_encoded[:,:,-4:] # (gt - anchor) / size(anchor) / variance for all four coordinates, where 'size' refers to w and h respectively + elif self.coords == 'minmax': + y_encoded[:,:,-12:-8] -= y_encoded[:,:,-8:-4] # (gt - anchor) for all four coordinates + y_encoded[:,:,[-12,-11]] /= np.expand_dims(y_encoded[:,:,-7] - y_encoded[:,:,-8], axis=-1) # (xmin(gt) - xmin(anchor)) / w(anchor), (xmax(gt) - xmax(anchor)) / w(anchor) + y_encoded[:,:,[-10,-9]] /= np.expand_dims(y_encoded[:,:,-5] - y_encoded[:,:,-6], axis=-1) # (ymin(gt) - ymin(anchor)) / h(anchor), (ymax(gt) - ymax(anchor)) / h(anchor) + y_encoded[:,:,-12:-8] /= y_encoded[:,:,-4:] # (gt - anchor) / size(anchor) / variance for all four coordinates, where 'size' refers to w and h respectively + + if diagnostics: + # Here we'll save the matched anchor boxes (i.e. anchor boxes that were matched to a ground truth box, but keeping the anchor box coordinates). + y_matched_anchors = np.copy(y_encoded) + y_matched_anchors[:,:,-12:-8] = 0 # Keeping the anchor box coordinates means setting the offsets to zero. + return y_encoded, y_matched_anchors + else: + return y_encoded + + def generate_anchor_boxes_for_layer(self, + feature_map_size, + aspect_ratios, + this_scale, + next_scale, + this_steps=None, + this_offsets=None, + diagnostics=False): + ''' + Computes an array of the spatial positions and sizes of the anchor boxes for one predictor layer + of size `feature_map_size == [feature_map_height, feature_map_width]`. + + Arguments: + feature_map_size (tuple): A list or tuple `[feature_map_height, feature_map_width]` with the spatial + dimensions of the feature map for which to generate the anchor boxes. + aspect_ratios (list): A list of floats, the aspect ratios for which anchor boxes are to be generated. + All list elements must be unique. + this_scale (float): A float in [0, 1], the scaling factor for the size of the generate anchor boxes + as a fraction of the shorter side of the input image. + next_scale (float): A float in [0, 1], the next larger scaling factor. Only relevant if + `self.two_boxes_for_ar1 == True`. + diagnostics (bool, optional): If true, the following additional outputs will be returned: + 1) A list of the center point `x` and `y` coordinates for each spatial location. + 2) A list containing `(width, height)` for each box aspect ratio. 
+ 3) A tuple containing `(step_height, step_width)` + 4) A tuple containing `(offset_height, offset_width)` + This information can be useful to understand in just a few numbers what the generated grid of + anchor boxes actually looks like, i.e. how large the different boxes are and how dense + their spatial distribution is, in order to determine whether the box grid covers the input images + appropriately and whether the box sizes are appropriate to fit the sizes of the objects + to be detected. + + Returns: + A 4D Numpy tensor of shape `(feature_map_height, feature_map_width, n_boxes_per_cell, 4)` where the + last dimension contains `(xmin, xmax, ymin, ymax)` for each anchor box in each cell of the feature map. + ''' + # Compute box width and height for each aspect ratio. + + # The shorter side of the image will be used to compute `w` and `h` using `scale` and `aspect_ratios`. + size = min(self.img_height, self.img_width) + # Compute the box widths and and heights for all aspect ratios + wh_list = [] + for ar in aspect_ratios: + if (ar == 1): + # Compute the regular anchor box for aspect ratio 1. + box_height = box_width = this_scale * size + wh_list.append((box_width, box_height)) + if self.two_boxes_for_ar1: + # Compute one slightly larger version using the geometric mean of this scale value and the next. + box_height = box_width = np.sqrt(this_scale * next_scale) * size + wh_list.append((box_width, box_height)) + else: + box_width = this_scale * size * np.sqrt(ar) + box_height = this_scale * size / np.sqrt(ar) + wh_list.append((box_width, box_height)) + wh_list = np.array(wh_list) + n_boxes = len(wh_list) + + # Compute the grid of box center points. They are identical for all aspect ratios. + + # Compute the step sizes, i.e. how far apart the anchor box center points will be vertically and horizontally. + if (this_steps is None): + step_height = self.img_height / feature_map_size[0] + step_width = self.img_width / feature_map_size[1] + else: + if isinstance(this_steps, (list, tuple)) and (len(this_steps) == 2): + step_height = this_steps[0] + step_width = this_steps[1] + elif isinstance(this_steps, (int, float)): + step_height = this_steps + step_width = this_steps + # Compute the offsets, i.e. at what pixel values the first anchor box center point will be from the top and from the left of the image. + if (this_offsets is None): + offset_height = 0.5 + offset_width = 0.5 + else: + if isinstance(this_offsets, (list, tuple)) and (len(this_offsets) == 2): + offset_height = this_offsets[0] + offset_width = this_offsets[1] + elif isinstance(this_offsets, (int, float)): + offset_height = this_offsets + offset_width = this_offsets + # Now that we have the offsets and step sizes, compute the grid of anchor box center points. 
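Before moving on to the center-point grid, here is a worked example of the box width/height computation above; the scale values, image size, and aspect ratio list are illustrative only.

import numpy as np

size = 300                        # shorter image side, illustrative
this_scale, next_scale = 0.2, 0.37
aspect_ratios = [1.0, 2.0, 0.5]
two_boxes_for_ar1 = True

wh_list = []
for ar in aspect_ratios:
    if ar == 1:
        wh_list.append((this_scale * size, this_scale * size))    # (60.0, 60.0)
        if two_boxes_for_ar1:
            s = np.sqrt(this_scale * next_scale) * size           # geometric-mean scale
            wh_list.append((s, s))                                # (~81.6, ~81.6)
    else:
        wh_list.append((this_scale * size * np.sqrt(ar),
                        this_scale * size / np.sqrt(ar)))         # (~84.9, ~42.4) for ar=2, (~42.4, ~84.9) for ar=0.5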
+ cy = np.linspace(offset_height * step_height, (offset_height + feature_map_size[0] - 1) * step_height, feature_map_size[0]) + cx = np.linspace(offset_width * step_width, (offset_width + feature_map_size[1] - 1) * step_width, feature_map_size[1]) + cx_grid, cy_grid = np.meshgrid(cx, cy) + cx_grid = np.expand_dims(cx_grid, -1) # This is necessary for np.tile() to do what we want further down + cy_grid = np.expand_dims(cy_grid, -1) # This is necessary for np.tile() to do what we want further down + + # Create a 4D tensor template of shape `(feature_map_height, feature_map_width, n_boxes, 4)` + # where the last dimension will contain `(cx, cy, w, h)` + boxes_tensor = np.zeros((feature_map_size[0], feature_map_size[1], n_boxes, 4)) + + boxes_tensor[:, :, :, 0] = np.tile(cx_grid, (1, 1, n_boxes)) # Set cx + boxes_tensor[:, :, :, 1] = np.tile(cy_grid, (1, 1, n_boxes)) # Set cy + boxes_tensor[:, :, :, 2] = wh_list[:, 0] # Set w + boxes_tensor[:, :, :, 3] = wh_list[:, 1] # Set h + + # Convert `(cx, cy, w, h)` to `(xmin, ymin, xmax, ymax)` + boxes_tensor = convert_coordinates(boxes_tensor, start_index=0, conversion='centroids2corners') + + # If `clip_boxes` is enabled, clip the coordinates to lie within the image boundaries + if self.clip_boxes: + x_coords = boxes_tensor[:,:,:,[0, 2]] + x_coords[x_coords >= self.img_width] = self.img_width - 1 + x_coords[x_coords < 0] = 0 + boxes_tensor[:,:,:,[0, 2]] = x_coords + y_coords = boxes_tensor[:,:,:,[1, 3]] + y_coords[y_coords >= self.img_height] = self.img_height - 1 + y_coords[y_coords < 0] = 0 + boxes_tensor[:,:,:,[1, 3]] = y_coords + + # `normalize_coords` is enabled, normalize the coordinates to be within [0,1] + if self.normalize_coords: + boxes_tensor[:, :, :, [0, 2]] /= self.img_width + boxes_tensor[:, :, :, [1, 3]] /= self.img_height + + # TODO: Implement box limiting directly for `(cx, cy, w, h)` so that we don't have to unnecessarily convert back and forth. + if self.coords == 'centroids': + # Convert `(xmin, ymin, xmax, ymax)` back to `(cx, cy, w, h)`. + boxes_tensor = convert_coordinates(boxes_tensor, start_index=0, conversion='corners2centroids', border_pixels='half') + elif self.coords == 'minmax': + # Convert `(xmin, ymin, xmax, ymax)` to `(xmin, xmax, ymin, ymax). + boxes_tensor = convert_coordinates(boxes_tensor, start_index=0, conversion='corners2minmax', border_pixels='half') + + if diagnostics: + return boxes_tensor, (cy, cx), wh_list, (step_height, step_width), (offset_height, offset_width) + else: + return boxes_tensor + + def generate_encoding_template(self, batch_size, diagnostics=False): + ''' + Produces an encoding template for the ground truth label tensor for a given batch. + + Note that all tensor creation, reshaping and concatenation operations performed in this function + and the sub-functions it calls are identical to those performed inside the SSD model. This, of course, + must be the case in order to preserve the spatial meaning of each box prediction, but it's useful to make + yourself aware of this fact and why it is necessary. + + In other words, the boxes in `y_encoded` must have a specific order in order correspond to the right spatial + positions and scales of the boxes predicted by the model. The sequence of operations here ensures that `y_encoded` + has this specific form. + + Arguments: + batch_size (int): The batch size. + diagnostics (bool, optional): See the documnentation for `generate_anchor_boxes()`. The diagnostic output + here is similar, just for all predictor conv layers. 
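The negative-index slicing used throughout this class is easier to follow with the last-axis layout of the encoded tensor spelled out; a minimal sketch, with a made-up class count:

import numpy as np

n_classes = 6                       # 5 positive classes + background, illustrative
last_axis = n_classes + 4 + 4 + 4   # [one-hot classes | 4 box coordinates | 4 anchor coordinates | 4 variances]

template = np.zeros((1, 10, last_axis))  # (batch_size, #boxes, ...)
class_part    = template[..., :-12]      # one-hot class vector
coords_part   = template[..., -12:-8]    # ground truth coordinates / offsets after encoding
anchor_part   = template[..., -8:-4]     # anchor box coordinates
variance_part = template[..., -4:]       # the four variance values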
+ + Returns: + A Numpy array of shape `(batch_size, #boxes, #classes + 12)`, the template into which to encode + the ground truth labels for training. The last axis has length `#classes + 12` because the model + output contains not only the 4 predicted box coordinate offsets, but also the 4 coordinates for + the anchor boxes and the 4 variance values. + ''' + # Tile the anchor boxes for each predictor layer across all batch items. + boxes_batch = [] + for boxes in self.boxes_list: + # Prepend one dimension to `self.boxes_list` to account for the batch size and tile it along. + # The result will be a 5D tensor of shape `(batch_size, feature_map_height, feature_map_width, n_boxes, 4)` + boxes = np.expand_dims(boxes, axis=0) + boxes = np.tile(boxes, (batch_size, 1, 1, 1, 1)) + + # Now reshape the 5D tensor above into a 3D tensor of shape + # `(batch, feature_map_height * feature_map_width * n_boxes, 4)`. The resulting + # order of the tensor content will be identical to the order obtained from the reshaping operation + # in our Keras model (we're using the Tensorflow backend, and tf.reshape() and np.reshape() + # use the same default index order, which is C-like index ordering) + boxes = np.reshape(boxes, (batch_size, -1, 4)) + boxes_batch.append(boxes) + + # Concatenate the anchor tensors from the individual layers to one. + boxes_tensor = np.concatenate(boxes_batch, axis=1) + + # 3: Create a template tensor to hold the one-hot class encodings of shape `(batch, #boxes, #classes)` + # It will contain all zeros for now, the classes will be set in the matching process that follows + classes_tensor = np.zeros((batch_size, boxes_tensor.shape[1], self.n_classes)) + + # 4: Create a tensor to contain the variances. This tensor has the same shape as `boxes_tensor` and simply + # contains the same 4 variance values for every position in the last axis. + variances_tensor = np.zeros_like(boxes_tensor) + variances_tensor += self.variances # Long live broadcasting + + # 4: Concatenate the classes, boxes and variances tensors to get our final template for y_encoded. We also need + # another tensor of the shape of `boxes_tensor` as a space filler so that `y_encoding_template` has the same + # shape as the SSD model output tensor. The content of this tensor is irrelevant, we'll just use + # `boxes_tensor` a second time. + y_encoding_template = np.concatenate((classes_tensor, boxes_tensor, boxes_tensor, variances_tensor), axis=2) + + if diagnostics: + return y_encoding_template, self.centers_diag, self.wh_list_diag, self.steps_diag, self.offsets_diag + else: + return y_encoding_template + +class DegenerateBoxError(Exception): + ''' + An exception class to be raised if degenerate boxes are being detected. + ''' + pass diff --git a/keras_ssd/ssd_encoder_decoder/ssd_output_decoder.py b/keras_ssd/ssd_encoder_decoder/ssd_output_decoder.py new file mode 100644 index 0000000..e6dce6a --- /dev/null +++ b/keras_ssd/ssd_encoder_decoder/ssd_output_decoder.py @@ -0,0 +1,530 @@ +''' +Includes: +* Functions to decode and filter raw SSD model output. These are only needed if the + SSD model does not have a `DecodeDetections` layer. +* Functions to perform greedy non-maximum suppression + +Copyright (C) 2018 Pierluigi Ferrari + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. 
+You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +''' + +from __future__ import division +import numpy as np + +from bounding_box_utils.bounding_box_utils import iou, convert_coordinates + +def greedy_nms(y_pred_decoded, iou_threshold=0.45, coords='corners', border_pixels='half'): + ''' + Perform greedy non-maximum suppression on the input boxes. + + Greedy NMS works by selecting the box with the highest score and + removing all boxes around it that are too close to it measured by IoU-similarity. + Out of the boxes that are left over, once again the one with the highest + score is selected and so on, until no boxes with too much overlap are left. + + Arguments: + y_pred_decoded (list): A batch of decoded predictions. For a given batch size `n` this + is a list of length `n` where each list element is a 2D Numpy array. + For a batch item with `k` predicted boxes this 2D Numpy array has + shape `(k, 6)`, where each row contains the coordinates of the respective + box in the format `[class_id, score, xmin, xmax, ymin, ymax]`. + Technically, the number of columns doesn't have to be 6, it can be + arbitrary as long as the first four elements of each row are + `xmin`, `xmax`, `ymin`, `ymax` (in this order) and the last element + is the score assigned to the prediction. Note that this function is + agnostic to the scale of the score or what it represents. + iou_threshold (float, optional): All boxes with a Jaccard similarity of + greater than `iou_threshold` with a locally maximal box will be removed + from the set of predictions, where 'maximal' refers to the box score. + coords (str, optional): The coordinate format of `y_pred_decoded`. + Can be one of the formats supported by `iou()`. + border_pixels (str, optional): How to treat the border pixels of the bounding boxes. + Can be 'include', 'exclude', or 'half'. If 'include', the border pixels belong + to the boxes. If 'exclude', the border pixels do not belong to the boxes. + If 'half', then one of each of the two horizontal and vertical borders belong + to the boxex, but not the other. + + Returns: + The predictions after removing non-maxima. The format is the same as the input format. + ''' + y_pred_decoded_nms = [] + for batch_item in y_pred_decoded: # For the labels of each batch item... + boxes_left = np.copy(batch_item) + maxima = [] # This is where we store the boxes that make it through the non-maximum suppression + while boxes_left.shape[0] > 0: # While there are still boxes left to compare... + maximum_index = np.argmax(boxes_left[:,1]) # ...get the index of the next box with the highest confidence... + maximum_box = np.copy(boxes_left[maximum_index]) # ...copy that box and... + maxima.append(maximum_box) # ...append it to `maxima` because we'll definitely keep it + boxes_left = np.delete(boxes_left, maximum_index, axis=0) # Now remove the maximum box from `boxes_left` + if boxes_left.shape[0] == 0: break # If there are no boxes left after this step, break. Otherwise... + similarities = iou(boxes_left[:,2:], maximum_box[2:], coords=coords, mode='element-wise', border_pixels=border_pixels) # ...compare (IoU) the other left over boxes to the maximum box... 
+ boxes_left = boxes_left[similarities <= iou_threshold] # ...so that we can remove the ones that overlap too much with the maximum box + y_pred_decoded_nms.append(np.array(maxima)) + + return y_pred_decoded_nms + +def _greedy_nms(predictions, iou_threshold=0.45, coords='corners', border_pixels='half'): + ''' + The same greedy non-maximum suppression algorithm as above, but slightly modified for use as an internal + function for per-class NMS in `decode_detections()`. + ''' + boxes_left = np.copy(predictions) + maxima = [] # This is where we store the boxes that make it through the non-maximum suppression + while boxes_left.shape[0] > 0: # While there are still boxes left to compare... + maximum_index = np.argmax(boxes_left[:,0]) # ...get the index of the next box with the highest confidence... + maximum_box = np.copy(boxes_left[maximum_index]) # ...copy that box and... + maxima.append(maximum_box) # ...append it to `maxima` because we'll definitely keep it + boxes_left = np.delete(boxes_left, maximum_index, axis=0) # Now remove the maximum box from `boxes_left` + if boxes_left.shape[0] == 0: break # If there are no boxes left after this step, break. Otherwise... + similarities = iou(boxes_left[:,1:], maximum_box[1:], coords=coords, mode='element-wise', border_pixels=border_pixels) # ...compare (IoU) the other left over boxes to the maximum box... + boxes_left = boxes_left[similarities <= iou_threshold] # ...so that we can remove the ones that overlap too much with the maximum box + return np.array(maxima) + +def _greedy_nms2(predictions, iou_threshold=0.45, coords='corners', border_pixels='half'): + ''' + The same greedy non-maximum suppression algorithm as above, but slightly modified for use as an internal + function in `decode_detections_fast()`. + ''' + boxes_left = np.copy(predictions) + maxima = [] # This is where we store the boxes that make it through the non-maximum suppression + while boxes_left.shape[0] > 0: # While there are still boxes left to compare... + maximum_index = np.argmax(boxes_left[:,1]) # ...get the index of the next box with the highest confidence... + maximum_box = np.copy(boxes_left[maximum_index]) # ...copy that box and... + maxima.append(maximum_box) # ...append it to `maxima` because we'll definitely keep it + boxes_left = np.delete(boxes_left, maximum_index, axis=0) # Now remove the maximum box from `boxes_left` + if boxes_left.shape[0] == 0: break # If there are no boxes left after this step, break. Otherwise... + similarities = iou(boxes_left[:,2:], maximum_box[2:], coords=coords, mode='element-wise', border_pixels=border_pixels) # ...compare (IoU) the other left over boxes to the maximum box... + boxes_left = boxes_left[similarities <= iou_threshold] # ...so that we can remove the ones that overlap too much with the maximum box + return np.array(maxima) + +def decode_detections(y_pred, + confidence_thresh=0.01, + iou_threshold=0.45, + top_k=200, + input_coords='centroids', + normalize_coords=True, + img_height=None, + img_width=None, + border_pixels='half'): + ''' + Convert model prediction output back to a format that contains only the positive box predictions + (i.e. the same format that `SSDInputEncoder` takes as input). + + After the decoding, two stages of prediction filtering are performed for each class individually: + First confidence thresholding, then greedy non-maximum suppression. 
The filtering results for all
+ classes are concatenated and the `top_k` overall highest confidence results constitute the final
+ predictions for a given batch item. This procedure follows the original Caffe implementation.
+ For a slightly different and more efficient alternative to decode raw model output that performs
+ non-maximum suppression globally instead of per class, see `decode_detections_fast()` below.
+
+ Arguments:
+ y_pred (array): The prediction output of the SSD model, expected to be a Numpy array
+ of shape `(batch_size, #boxes, #classes + 4 + 4 + 4)`, where `#boxes` is the total number of
+ boxes predicted by the model per image and the last axis contains
+ `[one-hot vector for the classes, 4 predicted coordinate offsets, 4 anchor box coordinates, 4 variances]`.
+ confidence_thresh (float, optional): A float in [0,1), the minimum classification confidence in a specific
+ positive class required for a box to be considered for the non-maximum suppression stage for the respective class.
+ A lower value will result in a larger part of the selection process being done by the non-maximum suppression
+ stage, while a larger value will result in a larger part of the selection process happening in the confidence
+ thresholding stage.
+ iou_threshold (float, optional): A float in [0,1]. All boxes with a Jaccard similarity of greater than `iou_threshold`
+ with a locally maximal box will be removed from the set of predictions for a given class, where 'maximal' refers
+ to the box score.
+ top_k (int, optional): The number of highest scoring predictions to be kept for each batch item after the
+ non-maximum suppression stage.
+ input_coords (str, optional): The box coordinate format that the model outputs. Can be either 'centroids'
+ for the format `(cx, cy, w, h)` (box center coordinates, width, and height), 'minmax' for the format
+ `(xmin, xmax, ymin, ymax)`, or 'corners' for the format `(xmin, ymin, xmax, ymax)`.
+ normalize_coords (bool, optional): Set to `True` if the model outputs relative coordinates (i.e. coordinates in [0,1])
+ and you wish to transform these relative coordinates back to absolute coordinates. If the model outputs
+ relative coordinates, but you do not want to convert them back to absolute coordinates, set this to `False`.
+ Do not set this to `True` if the model already outputs absolute coordinates, as that would result in incorrect
+ coordinates. Requires `img_height` and `img_width` if set to `True`.
+ img_height (int, optional): The height of the input images. Only needed if `normalize_coords` is `True`.
+ img_width (int, optional): The width of the input images. Only needed if `normalize_coords` is `True`.
+ border_pixels (str, optional): How to treat the border pixels of the bounding boxes.
+ Can be 'include', 'exclude', or 'half'. If 'include', the border pixels belong
+ to the boxes. If 'exclude', the border pixels do not belong to the boxes.
+ If 'half', then one of each of the two horizontal and vertical borders belong
+ to the boxes, but not the other.
+
+ Returns:
+ A Python list of length `batch_size` where each list element represents the predicted boxes
+ for one image and contains a Numpy array of shape `(boxes, 6)` where each row is a box prediction for
+ a non-background class for the respective image in the format `[class_id, confidence, xmin, ymin, xmax, ymax]`.
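+
+ Example:
+ A minimal, illustrative call. The toy `y_pred` array below is random data used purely to
+ demonstrate the expected input and output structure (here: 3 classes including the background
+ class, so the last axis has length 3 + 4 + 4 + 4); it does not come from a real model, and the
+ threshold and image-size values are arbitrary placeholders.
+
+ import numpy as np
+ y_pred = np.random.rand(2, 8, 3 + 4 + 4 + 4) # (batch_size, #boxes, #classes + 12)
+ y_pred_decoded = decode_detections(y_pred,
+ confidence_thresh=0.5,
+ iou_threshold=0.45,
+ top_k=200,
+ input_coords='centroids',
+ normalize_coords=True,
+ img_height=300,
+ img_width=300)
+ # `y_pred_decoded` is a list of length 2 (one entry per batch item); each row of an entry
+ # is one kept box in the format `[class_id, confidence, xmin, ymin, xmax, ymax]`.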
+ ''' + if normalize_coords and ((img_height is None) or (img_width is None)): + raise ValueError("If relative box coordinates are supposed to be converted to absolute coordinates, the decoder needs the image size in order to decode the predictions, but `img_height == {}` and `img_width == {}`".format(img_height, img_width)) + + # 1: Convert the box coordinates from the predicted anchor box offsets to predicted absolute coordinates + + y_pred_decoded_raw = np.copy(y_pred[:,:,:-8]) # Slice out the classes and the four offsets, throw away the anchor coordinates and variances, resulting in a tensor of shape `[batch, n_boxes, n_classes + 4 coordinates]` + + if input_coords == 'centroids': + y_pred_decoded_raw[:,:,[-2,-1]] = np.exp(y_pred_decoded_raw[:,:,[-2,-1]] * y_pred[:,:,[-2,-1]]) # exp(ln(w(pred)/w(anchor)) / w_variance * w_variance) == w(pred) / w(anchor), exp(ln(h(pred)/h(anchor)) / h_variance * h_variance) == h(pred) / h(anchor) + y_pred_decoded_raw[:,:,[-2,-1]] *= y_pred[:,:,[-6,-5]] # (w(pred) / w(anchor)) * w(anchor) == w(pred), (h(pred) / h(anchor)) * h(anchor) == h(pred) + y_pred_decoded_raw[:,:,[-4,-3]] *= y_pred[:,:,[-4,-3]] * y_pred[:,:,[-6,-5]] # (delta_cx(pred) / w(anchor) / cx_variance) * cx_variance * w(anchor) == delta_cx(pred), (delta_cy(pred) / h(anchor) / cy_variance) * cy_variance * h(anchor) == delta_cy(pred) + y_pred_decoded_raw[:,:,[-4,-3]] += y_pred[:,:,[-8,-7]] # delta_cx(pred) + cx(anchor) == cx(pred), delta_cy(pred) + cy(anchor) == cy(pred) + y_pred_decoded_raw = convert_coordinates(y_pred_decoded_raw, start_index=-4, conversion='centroids2corners') + elif input_coords == 'minmax': + y_pred_decoded_raw[:,:,-4:] *= y_pred[:,:,-4:] # delta(pred) / size(anchor) / variance * variance == delta(pred) / size(anchor) for all four coordinates, where 'size' refers to w or h, respectively + y_pred_decoded_raw[:,:,[-4,-3]] *= np.expand_dims(y_pred[:,:,-7] - y_pred[:,:,-8], axis=-1) # delta_xmin(pred) / w(anchor) * w(anchor) == delta_xmin(pred), delta_xmax(pred) / w(anchor) * w(anchor) == delta_xmax(pred) + y_pred_decoded_raw[:,:,[-2,-1]] *= np.expand_dims(y_pred[:,:,-5] - y_pred[:,:,-6], axis=-1) # delta_ymin(pred) / h(anchor) * h(anchor) == delta_ymin(pred), delta_ymax(pred) / h(anchor) * h(anchor) == delta_ymax(pred) + y_pred_decoded_raw[:,:,-4:] += y_pred[:,:,-8:-4] # delta(pred) + anchor == pred for all four coordinates + y_pred_decoded_raw = convert_coordinates(y_pred_decoded_raw, start_index=-4, conversion='minmax2corners') + elif input_coords == 'corners': + y_pred_decoded_raw[:,:,-4:] *= y_pred[:,:,-4:] # delta(pred) / size(anchor) / variance * variance == delta(pred) / size(anchor) for all four coordinates, where 'size' refers to w or h, respectively + y_pred_decoded_raw[:,:,[-4,-2]] *= np.expand_dims(y_pred[:,:,-6] - y_pred[:,:,-8], axis=-1) # delta_xmin(pred) / w(anchor) * w(anchor) == delta_xmin(pred), delta_xmax(pred) / w(anchor) * w(anchor) == delta_xmax(pred) + y_pred_decoded_raw[:,:,[-3,-1]] *= np.expand_dims(y_pred[:,:,-5] - y_pred[:,:,-7], axis=-1) # delta_ymin(pred) / h(anchor) * h(anchor) == delta_ymin(pred), delta_ymax(pred) / h(anchor) * h(anchor) == delta_ymax(pred) + y_pred_decoded_raw[:,:,-4:] += y_pred[:,:,-8:-4] # delta(pred) + anchor == pred for all four coordinates + else: + raise ValueError("Unexpected value for `input_coords`. 
Supported input coordinate formats are 'minmax', 'corners' and 'centroids'.")
+
+ # 2: If the model predicts normalized box coordinates and they are supposed to be converted back to absolute coordinates, do that
+
+ if normalize_coords:
+ y_pred_decoded_raw[:,:,[-4,-2]] *= img_width # Convert xmin, xmax back to absolute coordinates
+ y_pred_decoded_raw[:,:,[-3,-1]] *= img_height # Convert ymin, ymax back to absolute coordinates
+
+ # 3: Apply confidence thresholding and non-maximum suppression per class
+
+ n_classes = y_pred_decoded_raw.shape[-1] - 4 # The number of classes is the length of the last axis minus the four box coordinates
+
+ y_pred_decoded = [] # Store the final predictions in this list
+ for batch_item in y_pred_decoded_raw: # `batch_item` has shape `[n_boxes, n_classes + 4 coords]`
+ pred = [] # Store the final predictions for this batch item here
+ for class_id in range(1, n_classes): # For each class except the background class (which has class ID 0)...
+ single_class = batch_item[:,[class_id, -4, -3, -2, -1]] # ...keep only the confidences for that class, making this an array of shape `[n_boxes, 5]` and...
+ threshold_met = single_class[single_class[:,0] > confidence_thresh] # ...keep only those boxes with a confidence above the set threshold.
+ if threshold_met.shape[0] > 0: # If any boxes made the threshold...
+ maxima = _greedy_nms(threshold_met, iou_threshold=iou_threshold, coords='corners', border_pixels=border_pixels) # ...perform NMS on them.
+ maxima_output = np.zeros((maxima.shape[0], maxima.shape[1] + 1)) # Expand the last dimension by one element to have room for the class ID. This is now an array of shape `[n_boxes, 6]`
+ maxima_output[:,0] = class_id # Write the class ID to the first column...
+ maxima_output[:,1:] = maxima # ...and write the maxima to the other columns...
+ pred.append(maxima_output) # ...and append the maxima for this class to the list of maxima for this batch item.
+ # Once we're through with all classes, keep only the `top_k` maxima with the highest scores
+ if pred: # If there are any predictions left after confidence-thresholding...
+ pred = np.concatenate(pred, axis=0)
+ if top_k != 'all' and pred.shape[0] > top_k: # If we have more than `top_k` results left at this point (otherwise there is nothing to filter)...
+ top_k_indices = np.argpartition(pred[:,1], kth=pred.shape[0]-top_k, axis=0)[pred.shape[0]-top_k:] # ...get the indices of the `top_k` highest-score maxima...
+ pred = pred[top_k_indices] # ...and keep only those entries of `pred`...
+ else:
+ pred = np.array(pred) # Even if empty, `pred` must become a Numpy array.
+ y_pred_decoded.append(pred) # ...and now that we're done, append the array of final predictions for this batch item to the output list
+
+ return y_pred_decoded
+
+def decode_detections_fast(y_pred,
+ confidence_thresh=0.5,
+ iou_threshold=0.45,
+ top_k='all',
+ input_coords='centroids',
+ normalize_coords=True,
+ img_height=None,
+ img_width=None,
+ border_pixels='half'):
+ '''
+ Convert model prediction output back to a format that contains only the positive box predictions
+ (i.e. the same format that `SSDInputEncoder` takes as input).
+
+ Optionally performs confidence thresholding and greedy non-maximum suppression after the decoding stage.
+
+ Note that the decoding procedure used here is not the same as the procedure used in the original Caffe implementation.
+ For each box, the procedure used here assigns the class with the highest confidence as the box's predicted class.
Then it removes
+ all boxes for which the highest confidence is the background class. This results in less work for the subsequent
+ non-maximum suppression, because the vast majority of the predictions will be filtered out just by the fact that
+ their highest confidence is for the background class. It is much more efficient than the procedure of the original
+ implementation, but the results may also differ.
+
+ Arguments:
+ y_pred (array): The prediction output of the SSD model, expected to be a Numpy array
+ of shape `(batch_size, #boxes, #classes + 4 + 4 + 4)`, where `#boxes` is the total number of
+ boxes predicted by the model per image and the last axis contains
+ `[one-hot vector for the classes, 4 predicted coordinate offsets, 4 anchor box coordinates, 4 variances]`.
+ confidence_thresh (float, optional): A float in [0,1), the minimum classification confidence in any positive
+ class required for a given box to be considered a positive prediction. A lower value will result
+ in better recall, while a higher value will result in better precision. Do not use this parameter
+ to combat the inevitably many duplicates that an SSD will produce; the subsequent non-maximum suppression
+ stage will take care of those.
+ iou_threshold (float, optional): `None` or a float in [0,1]. If `None`, no non-maximum suppression will be
+ performed. If not `None`, greedy NMS will be performed after the confidence thresholding stage, meaning
+ all boxes with a Jaccard similarity of greater than `iou_threshold` with a locally maximal box will be removed
+ from the set of predictions, where 'maximal' refers to the box score.
+ top_k (int, optional): 'all' or an integer giving the number of highest scoring predictions to be kept for each batch item
+ after the non-maximum suppression stage. If 'all', all predictions left after the NMS stage will be kept.
+ input_coords (str, optional): The box coordinate format that the model outputs. Can be either 'centroids'
+ for the format `(cx, cy, w, h)` (box center coordinates, width, and height), 'minmax' for the format
+ `(xmin, xmax, ymin, ymax)`, or 'corners' for the format `(xmin, ymin, xmax, ymax)`.
+ normalize_coords (bool, optional): Set to `True` if the model outputs relative coordinates (i.e. coordinates in [0,1])
+ and you wish to transform these relative coordinates back to absolute coordinates. If the model outputs
+ relative coordinates, but you do not want to convert them back to absolute coordinates, set this to `False`.
+ Do not set this to `True` if the model already outputs absolute coordinates, as that would result in incorrect
+ coordinates. Requires `img_height` and `img_width` if set to `True`.
+ img_height (int, optional): The height of the input images. Only needed if `normalize_coords` is `True`.
+ img_width (int, optional): The width of the input images. Only needed if `normalize_coords` is `True`.
+ border_pixels (str, optional): How to treat the border pixels of the bounding boxes.
+ Can be 'include', 'exclude', or 'half'. If 'include', the border pixels belong
+ to the boxes. If 'exclude', the border pixels do not belong to the boxes.
+ If 'half', then one of each of the two horizontal and vertical borders belong
+ to the boxes, but not the other.
+
+ Returns:
+ A Python list of length `batch_size` where each list element represents the predicted boxes
+ for one image and contains a Numpy array of shape `(boxes, 6)` where each row is a box prediction for
+ a non-background class for the respective image in the format `[class_id, confidence, xmin, ymin, xmax, ymax]`.
+ '''
+ if normalize_coords and ((img_height is None) or (img_width is None)):
+ raise ValueError("If relative box coordinates are supposed to be converted to absolute coordinates, the decoder needs the image size in order to decode the predictions, but `img_height == {}` and `img_width == {}`".format(img_height, img_width))
+
+ # 1: Convert the classes from one-hot encoding to their class ID
+ y_pred_converted = np.copy(y_pred[:,:,-14:-8]) # Slice out the four offset predictions plus two elements into which we'll write the class IDs and confidences in the next step
+ y_pred_converted[:,:,0] = np.argmax(y_pred[:,:,:-12], axis=-1) # The indices of the highest confidence values in the one-hot class vectors are the class ID
+ y_pred_converted[:,:,1] = np.amax(y_pred[:,:,:-12], axis=-1) # Store the confidence values themselves, too
+
+ # 2: Convert the box coordinates from the predicted anchor box offsets to predicted absolute coordinates
+ if input_coords == 'centroids':
+ y_pred_converted[:,:,[4,5]] = np.exp(y_pred_converted[:,:,[4,5]] * y_pred[:,:,[-2,-1]]) # exp(ln(w(pred)/w(anchor)) / w_variance * w_variance) == w(pred) / w(anchor), exp(ln(h(pred)/h(anchor)) / h_variance * h_variance) == h(pred) / h(anchor)
+ y_pred_converted[:,:,[4,5]] *= y_pred[:,:,[-6,-5]] # (w(pred) / w(anchor)) * w(anchor) == w(pred), (h(pred) / h(anchor)) * h(anchor) == h(pred)
+ y_pred_converted[:,:,[2,3]] *= y_pred[:,:,[-4,-3]] * y_pred[:,:,[-6,-5]] # (delta_cx(pred) / w(anchor) / cx_variance) * cx_variance * w(anchor) == delta_cx(pred), (delta_cy(pred) / h(anchor) / cy_variance) * cy_variance * h(anchor) == delta_cy(pred)
+ y_pred_converted[:,:,[2,3]] += y_pred[:,:,[-8,-7]] # delta_cx(pred) + cx(anchor) == cx(pred), delta_cy(pred) + cy(anchor) == cy(pred)
+ y_pred_converted = convert_coordinates(y_pred_converted, start_index=-4, conversion='centroids2corners')
+ elif input_coords == 'minmax':
+ y_pred_converted[:,:,2:] *= y_pred[:,:,-4:] # delta(pred) / size(anchor) / variance * variance == delta(pred) / size(anchor) for all four coordinates, where 'size' refers to w or h, respectively
+ y_pred_converted[:,:,[2,3]] *= np.expand_dims(y_pred[:,:,-7] - y_pred[:,:,-8], axis=-1) # delta_xmin(pred) / w(anchor) * w(anchor) == delta_xmin(pred), delta_xmax(pred) / w(anchor) * w(anchor) == delta_xmax(pred)
+ y_pred_converted[:,:,[4,5]] *= np.expand_dims(y_pred[:,:,-5] - y_pred[:,:,-6], axis=-1) # delta_ymin(pred) / h(anchor) * h(anchor) == delta_ymin(pred), delta_ymax(pred) / h(anchor) * h(anchor) == delta_ymax(pred)
+ y_pred_converted[:,:,2:] += y_pred[:,:,-8:-4] # delta(pred) + anchor == pred for all four coordinates
+ y_pred_converted = convert_coordinates(y_pred_converted, start_index=-4, conversion='minmax2corners')
+ elif input_coords == 'corners':
+ y_pred_converted[:,:,2:] *= y_pred[:,:,-4:] # delta(pred) / size(anchor) / variance * variance == delta(pred) / size(anchor) for all four coordinates, where 'size' refers to w or h, respectively
+ y_pred_converted[:,:,[2,4]] *= np.expand_dims(y_pred[:,:,-6] - y_pred[:,:,-8], axis=-1) # delta_xmin(pred) / w(anchor) * w(anchor) == delta_xmin(pred), delta_xmax(pred) / w(anchor) * w(anchor) == delta_xmax(pred)
+ y_pred_converted[:,:,[3,5]] *=
np.expand_dims(y_pred[:,:,-5] - y_pred[:,:,-7], axis=-1) # delta_ymin(pred) / h(anchor) * h(anchor) == delta_ymin(pred), delta_ymax(pred) / h(anchor) * h(anchor) == delta_ymax(pred) + y_pred_converted[:,:,2:] += y_pred[:,:,-8:-4] # delta(pred) + anchor == pred for all four coordinates + else: + raise ValueError("Unexpected value for `coords`. Supported values are 'minmax', 'corners' and 'centroids'.") + + # 3: If the model predicts normalized box coordinates and they are supposed to be converted back to absolute coordinates, do that + if normalize_coords: + y_pred_converted[:,:,[2,4]] *= img_width # Convert xmin, xmax back to absolute coordinates + y_pred_converted[:,:,[3,5]] *= img_height # Convert ymin, ymax back to absolute coordinates + + # 4: Decode our huge `(batch, #boxes, 6)` tensor into a list of length `batch` where each list entry is an array containing only the positive predictions + y_pred_decoded = [] + for batch_item in y_pred_converted: # For each image in the batch... + boxes = batch_item[np.nonzero(batch_item[:,0])] # ...get all boxes that don't belong to the background class,... + boxes = boxes[boxes[:,1] >= confidence_thresh] # ...then filter out those positive boxes for which the prediction confidence is too low and after that... + if iou_threshold: # ...if an IoU threshold is set... + boxes = _greedy_nms2(boxes, iou_threshold=iou_threshold, coords='corners', border_pixels=border_pixels) # ...perform NMS on the remaining boxes. + if top_k != 'all' and boxes.shape[0] > top_k: # If we have more than `top_k` results left at this point... + top_k_indices = np.argpartition(boxes[:,1], kth=boxes.shape[0]-top_k, axis=0)[boxes.shape[0]-top_k:] # ...get the indices of the `top_k` highest-scoring boxes... + boxes = boxes[top_k_indices] # ...and keep only those boxes... + y_pred_decoded.append(boxes) # ...and now that we're done, append the array of final predictions for this batch item to the output list + + return y_pred_decoded + +################################################################################################ +# Debugging tools, not relevant for normal use +################################################################################################ + +# The functions below are for debugging, so you won't normally need them. That is, +# unless you need to debug your model, of course. + +def decode_detections_debug(y_pred, + confidence_thresh=0.01, + iou_threshold=0.45, + top_k=200, + input_coords='centroids', + normalize_coords=True, + img_height=None, + img_width=None, + variance_encoded_in_target=False, + border_pixels='half'): + ''' + This decoder performs the same processing as `decode_detections()`, but the output format for each left-over + predicted box is `[box_id, class_id, confidence, xmin, ymin, xmax, ymax]`. + + That is, in addition to the usual data, each predicted box has the internal index of that box within + the model (`box_id`) prepended to it. This allows you to know exactly which part of the model made a given + box prediction; in particular, it allows you to know which predictor layer made a given prediction. + This can be useful for debugging. + + Arguments: + y_pred (array): The prediction output of the SSD model, expected to be a Numpy array + of shape `(batch_size, #boxes, #classes + 4 + 4 + 4)`, where `#boxes` is the total number of + boxes predicted by the model per image and the last axis contains + `[one-hot vector for the classes, 4 predicted coordinate offsets, 4 anchor box coordinates, 4 variances]`. 
+ confidence_thresh (float, optional): A float in [0,1), the minimum classification confidence in a specific
+ positive class required for a box to be considered for the non-maximum suppression stage for the respective class.
+ A lower value will result in a larger part of the selection process being done by the non-maximum suppression
+ stage, while a larger value will result in a larger part of the selection process happening in the confidence
+ thresholding stage.
+ iou_threshold (float, optional): A float in [0,1]. All boxes with a Jaccard similarity of greater than `iou_threshold`
+ with a locally maximal box will be removed from the set of predictions for a given class, where 'maximal' refers
+ to the box score.
+ top_k (int, optional): The number of highest scoring predictions to be kept for each batch item after the
+ non-maximum suppression stage.
+ input_coords (str, optional): The box coordinate format that the model outputs. Can be either 'centroids'
+ for the format `(cx, cy, w, h)` (box center coordinates, width, and height), 'minmax' for the format
+ `(xmin, xmax, ymin, ymax)`, or 'corners' for the format `(xmin, ymin, xmax, ymax)`.
+ normalize_coords (bool, optional): Set to `True` if the model outputs relative coordinates (i.e. coordinates in [0,1])
+ and you wish to transform these relative coordinates back to absolute coordinates. If the model outputs
+ relative coordinates, but you do not want to convert them back to absolute coordinates, set this to `False`.
+ Do not set this to `True` if the model already outputs absolute coordinates, as that would result in incorrect
+ coordinates. Requires `img_height` and `img_width` if set to `True`.
+ img_height (int, optional): The height of the input images. Only needed if `normalize_coords` is `True`.
+ img_width (int, optional): The width of the input images. Only needed if `normalize_coords` is `True`.
+ variance_encoded_in_target (bool, optional): Set to `True` if the variance values were encoded into the target
+ offsets, in which case the offsets are decoded without multiplying by the variance values.
+ border_pixels (str, optional): How to treat the border pixels of the bounding boxes.
+ Can be 'include', 'exclude', or 'half'. If 'include', the border pixels belong
+ to the boxes. If 'exclude', the border pixels do not belong to the boxes.
+ If 'half', then one of each of the two horizontal and vertical borders belong
+ to the boxes, but not the other.
+
+ Returns:
+ A Python list of length `batch_size` where each list element represents the predicted boxes
+ for one image and contains a Numpy array of shape `(boxes, 7)` where each row is a box prediction for
+ a non-background class for the respective image in the format `[box_id, class_id, confidence, xmin, ymin, xmax, ymax]`.
+ '''
+ if normalize_coords and ((img_height is None) or (img_width is None)):
+ raise ValueError("If relative box coordinates are supposed to be converted to absolute coordinates, the decoder needs the image size in order to decode the predictions, but `img_height == {}` and `img_width == {}`".format(img_height, img_width))
+
+ # 1: Convert the box coordinates from the predicted anchor box offsets to predicted absolute coordinates
+
+ y_pred_decoded_raw = np.copy(y_pred[:,:,:-8]) # Slice out the classes and the four offsets, throw away the anchor coordinates and variances, resulting in a tensor of shape `[batch, n_boxes, n_classes + 4 coordinates]`
+
+ if input_coords == 'centroids':
+ if variance_encoded_in_target:
+ # Decode the predicted box center x and y coordinates.
+ y_pred_decoded_raw[:,:,[-4,-3]] = y_pred_decoded_raw[:,:,[-4,-3]] * y_pred[:,:,[-6,-5]] + y_pred[:,:,[-8,-7]]
+ # Decode the predicted box width and height.
+ y_pred_decoded_raw[:,:,[-2,-1]] = np.exp(y_pred_decoded_raw[:,:,[-2,-1]]) * y_pred[:,:,[-6,-5]]
+ else:
+ # Decode the predicted box center x and y coordinates.
+ y_pred_decoded_raw[:,:,[-4,-3]] = y_pred_decoded_raw[:,:,[-4,-3]] * y_pred[:,:,[-6,-5]] * y_pred[:,:,[-4,-3]] + y_pred[:,:,[-8,-7]]
+ # Decode the predicted box width and height.
+ y_pred_decoded_raw[:,:,[-2,-1]] = np.exp(y_pred_decoded_raw[:,:,[-2,-1]] * y_pred[:,:,[-2,-1]]) * y_pred[:,:,[-6,-5]]
+ y_pred_decoded_raw = convert_coordinates(y_pred_decoded_raw, start_index=-4, conversion='centroids2corners')
+ elif input_coords == 'minmax':
+ y_pred_decoded_raw[:,:,-4:] *= y_pred[:,:,-4:] # delta(pred) / size(anchor) / variance * variance == delta(pred) / size(anchor) for all four coordinates, where 'size' refers to w or h, respectively
+ y_pred_decoded_raw[:,:,[-4,-3]] *= np.expand_dims(y_pred[:,:,-7] - y_pred[:,:,-8], axis=-1) # delta_xmin(pred) / w(anchor) * w(anchor) == delta_xmin(pred), delta_xmax(pred) / w(anchor) * w(anchor) == delta_xmax(pred)
+ y_pred_decoded_raw[:,:,[-2,-1]] *= np.expand_dims(y_pred[:,:,-5] - y_pred[:,:,-6], axis=-1) # delta_ymin(pred) / h(anchor) * h(anchor) == delta_ymin(pred), delta_ymax(pred) / h(anchor) * h(anchor) == delta_ymax(pred)
+ y_pred_decoded_raw[:,:,-4:] += y_pred[:,:,-8:-4] # delta(pred) + anchor == pred for all four coordinates
+ y_pred_decoded_raw = convert_coordinates(y_pred_decoded_raw, start_index=-4, conversion='minmax2corners')
+ elif input_coords == 'corners':
+ y_pred_decoded_raw[:,:,-4:] *= y_pred[:,:,-4:] # delta(pred) / size(anchor) / variance * variance == delta(pred) / size(anchor) for all four coordinates, where 'size' refers to w or h, respectively
+ y_pred_decoded_raw[:,:,[-4,-2]] *= np.expand_dims(y_pred[:,:,-6] - y_pred[:,:,-8], axis=-1) # delta_xmin(pred) / w(anchor) * w(anchor) == delta_xmin(pred), delta_xmax(pred) / w(anchor) * w(anchor) == delta_xmax(pred)
+ y_pred_decoded_raw[:,:,[-3,-1]] *= np.expand_dims(y_pred[:,:,-5] - y_pred[:,:,-7], axis=-1) # delta_ymin(pred) / h(anchor) * h(anchor) == delta_ymin(pred), delta_ymax(pred) / h(anchor) * h(anchor) == delta_ymax(pred)
+ y_pred_decoded_raw[:,:,-4:] += y_pred[:,:,-8:-4] # delta(pred) + anchor == pred for all four coordinates
+ else:
+ raise ValueError("Unexpected value for `input_coords`. Supported input coordinate formats are 'minmax', 'corners' and 'centroids'.")
+
+ # 2: If the model predicts normalized box coordinates and they are supposed to be converted back to absolute coordinates, do that
+
+ if normalize_coords:
+ y_pred_decoded_raw[:,:,[-4,-2]] *= img_width # Convert xmin, xmax back to absolute coordinates
+ y_pred_decoded_raw[:,:,[-3,-1]] *= img_height # Convert ymin, ymax back to absolute coordinates
+
+ # 3: For each batch item, prepend each box's internal index to its coordinates.
+
+ y_pred_decoded_raw2 = np.zeros((y_pred_decoded_raw.shape[0], y_pred_decoded_raw.shape[1], y_pred_decoded_raw.shape[2] + 1)) # Expand the last axis by one.
+ y_pred_decoded_raw2[:,:,1:] = y_pred_decoded_raw
+ y_pred_decoded_raw2[:,:,0] = np.arange(y_pred_decoded_raw.shape[1]) # Put the box indices as the first element for each box via broadcasting.
+ y_pred_decoded_raw = y_pred_decoded_raw2
+
+ # 4: Apply confidence thresholding and non-maximum suppression per class
+
+ n_classes = y_pred_decoded_raw.shape[-1] - 5 # The number of classes is the length of the last axis minus the four box coordinates and minus the index
+
+ y_pred_decoded = [] # Store the final predictions in this list
+ for batch_item in y_pred_decoded_raw: # `batch_item` has shape `[n_boxes, 1 + n_classes + 4 coords]`
+ pred = [] # Store the final predictions for this batch item here
+ for class_id in range(1, n_classes): # For each class except the background class (which has class ID 0)...
+ single_class = batch_item[:,[0, class_id + 1, -4, -3, -2, -1]] # ...keep only the confidences for that class, making this an array of shape `[n_boxes, 6]` and...
+ threshold_met = single_class[single_class[:,1] > confidence_thresh] # ...keep only those boxes with a confidence above the set threshold.
+ if threshold_met.shape[0] > 0: # If any boxes made the threshold...
+ maxima = _greedy_nms_debug(threshold_met, iou_threshold=iou_threshold, coords='corners', border_pixels=border_pixels) # ...perform NMS on them.
+ maxima_output = np.zeros((maxima.shape[0], maxima.shape[1] + 1)) # Expand the last dimension by one element to have room for the class ID. This is now an array of shape `[n_boxes, 7]`
+ maxima_output[:,0] = maxima[:,0] # Write the box index to the first column...
+ maxima_output[:,1] = class_id # ...and write the class ID to the second column...
+ maxima_output[:,2:] = maxima[:,1:] # ...and write the rest of the maxima data to the other columns...
+ pred.append(maxima_output) # ...and append the maxima for this class to the list of maxima for this batch item.
+ # Once we're through with all classes, keep only the `top_k` maxima with the highest scores
+ pred = np.concatenate(pred, axis=0)
+ if pred.shape[0] > top_k: # If we have more than `top_k` results left at this point (otherwise there is nothing to filter)...
+ top_k_indices = np.argpartition(pred[:,2], kth=pred.shape[0]-top_k, axis=0)[pred.shape[0]-top_k:] # ...get the indices of the `top_k` highest-score maxima...
+ pred = pred[top_k_indices] # ...and keep only those entries of `pred`...
+ y_pred_decoded.append(pred) # ...and now that we're done, append the array of final predictions for this batch item to the output list
+
+ return y_pred_decoded
+
+def _greedy_nms_debug(predictions, iou_threshold=0.45, coords='corners', border_pixels='half'):
+ '''
+ The same greedy non-maximum suppression algorithm as above, but slightly modified for use as an internal
+ function for per-class NMS in `decode_detections_debug()`. The difference is that it keeps the indices of all
+ left-over boxes for each batch item, which allows you to know which predictor layer predicted a given output
+ box and is thus useful for debugging.
+ '''
+ boxes_left = np.copy(predictions)
+ maxima = [] # This is where we store the boxes that make it through the non-maximum suppression
+ while boxes_left.shape[0] > 0: # While there are still boxes left to compare...
+ maximum_index = np.argmax(boxes_left[:,1]) # ...get the index of the next box with the highest confidence...
+ maximum_box = np.copy(boxes_left[maximum_index]) # ...copy that box and...
+ maxima.append(maximum_box) # ...append it to `maxima` because we'll definitely keep it
+ boxes_left = np.delete(boxes_left, maximum_index, axis=0) # Now remove the maximum box from `boxes_left`
+ if boxes_left.shape[0] == 0: break # If there are no boxes left after this step, break. Otherwise...
+ similarities = iou(boxes_left[:,2:], maximum_box[2:], coords=coords, mode='element-wise', border_pixels=border_pixels) # ...compare (IoU) the other left over boxes to the maximum box...
+ boxes_left = boxes_left[similarities <= iou_threshold] # ...so that we can remove the ones that overlap too much with the maximum box
+ return np.array(maxima)
+
+def get_num_boxes_per_pred_layer(predictor_sizes, aspect_ratios, two_boxes_for_ar1):
+ '''
+ Returns a list of the number of boxes that each predictor layer predicts.
+
+ `aspect_ratios` must be a nested list, containing a list of aspect ratios
+ for each predictor layer.
+ '''
+ num_boxes_per_pred_layer = []
+ for i in range(len(predictor_sizes)):
+ if two_boxes_for_ar1:
+ num_boxes_per_pred_layer.append(predictor_sizes[i][0] * predictor_sizes[i][1] * (len(aspect_ratios[i]) + 1))
+ else:
+ num_boxes_per_pred_layer.append(predictor_sizes[i][0] * predictor_sizes[i][1] * len(aspect_ratios[i]))
+ return num_boxes_per_pred_layer
+
+def get_pred_layers(y_pred_decoded, num_boxes_per_pred_layer):
+ '''
+ For a given prediction tensor decoded with `decode_detections_debug()`, returns a list
+ with the indices of the predictor layers that made each prediction.
+
+ That is, this function lets you know which predictor layer is responsible
+ for a given prediction.
+
+ Arguments:
+ y_pred_decoded (list): The decoded model output. Must have been
+ decoded with `decode_detections_debug()` so that it contains the internal box index
+ for each predicted box.
+ num_boxes_per_pred_layer (list): A list that contains the total number
+ of boxes that each predictor layer predicts.
+ '''
+ pred_layers_all = []
+ cum_boxes_per_pred_layer = np.cumsum(num_boxes_per_pred_layer)
+ for batch_item in y_pred_decoded:
+ pred_layers = []
+ for prediction in batch_item:
+ if (prediction[0] < 0) or (prediction[0] >= cum_boxes_per_pred_layer[-1]):
+ raise ValueError("Box index is out of bounds of the possible indices as given by the values in `num_boxes_per_pred_layer`.")
+ for i in range(len(cum_boxes_per_pred_layer)):
+ if prediction[0] < cum_boxes_per_pred_layer[i]:
+ pred_layers.append(i)
+ break
+ pred_layers_all.append(pred_layers)
+ return pred_layers_all
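+
+# Illustrative usage sketch for the debugging helpers above. The predictor layout, class count and
+# anchor values below are made-up placeholder assumptions, and the class scores are random numbers
+# rather than real model output; the block only demonstrates how the functions fit together.
+if __name__ == '__main__':
+    predictor_sizes = [(4, 4), (2, 2)]                  # hypothetical predictor layer sizes
+    aspect_ratios = [[1.0, 2.0, 0.5], [1.0, 2.0, 0.5]]  # hypothetical aspect ratios per layer
+    num_boxes = get_num_boxes_per_pred_layer(predictor_sizes, aspect_ratios, two_boxes_for_ar1=True)
+    n_boxes_total = sum(num_boxes)                      # 4*4*4 + 2*2*4 = 80 boxes
+
+    n_classes = 3                                       # including the background class
+    y_pred = np.zeros((1, n_boxes_total, n_classes + 12))
+    y_pred[:, :, :n_classes] = np.random.rand(1, n_boxes_total, n_classes)  # fake class confidences
+    y_pred[:, :, -8:-4] = [0.5, 0.5, 0.2, 0.2]          # fake anchor boxes (cx, cy, w, h)
+    y_pred[:, :, -4:] = [0.1, 0.1, 0.2, 0.2]            # variances
+    # The offset columns stay zero, so every box decodes exactly to its anchor box.
+
+    y_pred_decoded = decode_detections_debug(y_pred, confidence_thresh=0.5, iou_threshold=0.45,
+                                             top_k=200, input_coords='centroids',
+                                             normalize_coords=True, img_height=300, img_width=300)
+    # Each surviving row is `[box_id, class_id, confidence, xmin, ymin, xmax, ymax]`; `get_pred_layers()`
+    # maps each box_id to the predictor layer that produced it, e.g. `[[0, 0]]` here.
+    pred_layers = get_pred_layers(y_pred_decoded, num_boxes)
+    print(pred_layers)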