diff --git a/keras_ssd/.DS_Store b/keras_ssd/.DS_Store new file mode 100644 index 0000000..fcc3484 Binary files /dev/null and b/keras_ssd/.DS_Store differ diff --git a/keras_ssd/.gitattributes b/keras_ssd/.gitattributes new file mode 100755 index 0000000..f4c7e5f --- /dev/null +++ b/keras_ssd/.gitattributes @@ -0,0 +1 @@ +*.ipynb linguist-language=Python diff --git a/keras_ssd/.github/stale.yml b/keras_ssd/.github/stale.yml new file mode 100644 index 0000000..73cb6b9 --- /dev/null +++ b/keras_ssd/.github/stale.yml @@ -0,0 +1,24 @@ +# Configuration for probot-stale - https://github.com/probot/stale + +# Number of days of inactivity before an Issue or Pull Request becomes stale +daysUntilStale: 7 +# Number of days of inactivity before a stale Issue or Pull Request is closed +daysUntilClose: 7 +# Issues or Pull Requests with these labels will never be considered stale. Set to `[]` to disable +exemptLabels: + - pinned + - security + - "[Status] Maybe Later" +# Label to use when marking as stale +staleLabel: stale +# Comment to post when marking as stale. Set to `false` to disable +markComment: > + This issue has been automatically marked as stale because it has not had + recent activity. It will be closed if no further activity occurs. Thank you + for your contributions. +# Comment to post when removing the stale label. Set to `false` to disable +unmarkComment: false +# Comment to post when closing a stale Issue or Pull Request. Set to `false` to disable +closeComment: false +# Limit to only `issues` or `pulls` +# only: issues diff --git a/keras_ssd/.gitignore b/keras_ssd/.gitignore new file mode 100755 index 0000000..9531469 --- /dev/null +++ b/keras_ssd/.gitignore @@ -0,0 +1,98 @@ +# Byte-compiled / optimized / DLL files +__pycache__/ +*.py[cod] +*$py.class + +# C extensions +*.so + +# Distribution / packaging +.Python +env/ +build/ +develop-eggs/ +dist/ +downloads/ +eggs/ +.eggs/ +lib/ +lib64/ +parts/ +sdist/ +var/ +wheels/ +*.egg-info/ +.installed.cfg +*.egg + +# PyInstaller +# Usually these files are written by a python script from a template +# before PyInstaller builds the exe, so as to inject date/other infos into it. 
+*.manifest +*.spec + +# Installer logs +pip-log.txt +pip-delete-this-directory.txt + +# Unit test / coverage reports +htmlcov/ +.tox/ +.coverage +.coverage.* +.cache +nosetests.xml +coverage.xml +*,cover +.hypothesis/ + +# Translations +*.mo +*.pot + +# Django stuff: +*.log +local_settings.py + +# Flask stuff: +instance/ +.webassets-cache + +# Scrapy stuff: +.scrapy + +# Sphinx documentation +docs/_build/ + +# PyBuilder +target/ + +# Jupyter Notebook +.ipynb_checkpoints +.ipynb_checkpoints/ + +# pyenv +.python-version + +# celery beat schedule file +celerybeat-schedule + +# SageMath parsed files +*.sage.py + +# dotenv +.env + +# virtualenv +.venv +venv/ +ENV/ + +# Spyder project settings +.spyderproject + +# Rope project settings +.ropeproject + +# Ignore any files and directories that begin with the word "local" +local* diff --git a/keras_ssd/__init__.py b/keras_ssd/__init__.py new file mode 100644 index 0000000..e69de29 diff --git a/keras_ssd/bounding_box_utils/__init__.py b/keras_ssd/bounding_box_utils/__init__.py new file mode 100644 index 0000000..e69de29 diff --git a/keras_ssd/bounding_box_utils/bounding_box_utils.py b/keras_ssd/bounding_box_utils/bounding_box_utils.py new file mode 100644 index 0000000..36ce3dc --- /dev/null +++ b/keras_ssd/bounding_box_utils/bounding_box_utils.py @@ -0,0 +1,383 @@ +''' +Includes: +* Function to compute the IoU similarity for axis-aligned, rectangular, 2D bounding boxes +* Function for coordinate conversion for axis-aligned, rectangular, 2D bounding boxes + +Copyright (C) 2018 Pierluigi Ferrari + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +''' + +from __future__ import division +import numpy as np + +def convert_coordinates(tensor, start_index, conversion, border_pixels='half'): + ''' + Convert coordinates for axis-aligned 2D boxes between two coordinate formats. + + Creates a copy of `tensor`, i.e. does not operate in place. Currently there are + three supported coordinate formats that can be converted from and to each other: + 1) (xmin, xmax, ymin, ymax) - the 'minmax' format + 2) (xmin, ymin, xmax, ymax) - the 'corners' format + 2) (cx, cy, w, h) - the 'centroids' format + + Arguments: + tensor (array): A Numpy nD array containing the four consecutive coordinates + to be converted somewhere in the last axis. + start_index (int): The index of the first coordinate in the last axis of `tensor`. + conversion (str, optional): The conversion direction. Can be 'minmax2centroids', + 'centroids2minmax', 'corners2centroids', 'centroids2corners', 'minmax2corners', + or 'corners2minmax'. + border_pixels (str, optional): How to treat the border pixels of the bounding boxes. + Can be 'include', 'exclude', or 'half'. If 'include', the border pixels belong + to the boxes. If 'exclude', the border pixels do not belong to the boxes. + If 'half', then one of each of the two horizontal and vertical borders belong + to the boxex, but not the other. 
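+
+    Illustrative example (values chosen for demonstration only), converting a single
+    box from the 'corners' to the 'centroids' format with the default `border_pixels='half'`:
+
+        box = np.array([10, 20, 50, 80])  # (xmin, ymin, xmax, ymax)
+        convert_coordinates(box, start_index=0, conversion='corners2centroids')
+        # -> array([30., 50., 40., 60.]), i.e. (cx, cy, w, h)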
+ + Returns: + A Numpy nD array, a copy of the input tensor with the converted coordinates + in place of the original coordinates and the unaltered elements of the original + tensor elsewhere. + ''' + if border_pixels == 'half': + d = 0 + elif border_pixels == 'include': + d = 1 + elif border_pixels == 'exclude': + d = -1 + + ind = start_index + tensor1 = np.copy(tensor).astype(np.float) + if conversion == 'minmax2centroids': + tensor1[..., ind] = (tensor[..., ind] + tensor[..., ind+1]) / 2.0 # Set cx + tensor1[..., ind+1] = (tensor[..., ind+2] + tensor[..., ind+3]) / 2.0 # Set cy + tensor1[..., ind+2] = tensor[..., ind+1] - tensor[..., ind] + d # Set w + tensor1[..., ind+3] = tensor[..., ind+3] - tensor[..., ind+2] + d # Set h + elif conversion == 'centroids2minmax': + tensor1[..., ind] = tensor[..., ind] - tensor[..., ind+2] / 2.0 # Set xmin + tensor1[..., ind+1] = tensor[..., ind] + tensor[..., ind+2] / 2.0 # Set xmax + tensor1[..., ind+2] = tensor[..., ind+1] - tensor[..., ind+3] / 2.0 # Set ymin + tensor1[..., ind+3] = tensor[..., ind+1] + tensor[..., ind+3] / 2.0 # Set ymax + elif conversion == 'corners2centroids': + tensor1[..., ind] = (tensor[..., ind] + tensor[..., ind+2]) / 2.0 # Set cx + tensor1[..., ind+1] = (tensor[..., ind+1] + tensor[..., ind+3]) / 2.0 # Set cy + tensor1[..., ind+2] = tensor[..., ind+2] - tensor[..., ind] + d # Set w + tensor1[..., ind+3] = tensor[..., ind+3] - tensor[..., ind+1] + d # Set h + elif conversion == 'centroids2corners': + tensor1[..., ind] = tensor[..., ind] - tensor[..., ind+2] / 2.0 # Set xmin + tensor1[..., ind+1] = tensor[..., ind+1] - tensor[..., ind+3] / 2.0 # Set ymin + tensor1[..., ind+2] = tensor[..., ind] + tensor[..., ind+2] / 2.0 # Set xmax + tensor1[..., ind+3] = tensor[..., ind+1] + tensor[..., ind+3] / 2.0 # Set ymax + elif (conversion == 'minmax2corners') or (conversion == 'corners2minmax'): + tensor1[..., ind+1] = tensor[..., ind+2] + tensor1[..., ind+2] = tensor[..., ind+1] + else: + raise ValueError("Unexpected conversion value. Supported values are 'minmax2centroids', 'centroids2minmax', 'corners2centroids', 'centroids2corners', 'minmax2corners', and 'corners2minmax'.") + + return tensor1 + +def convert_coordinates2(tensor, start_index, conversion): + ''' + A matrix multiplication implementation of `convert_coordinates()`. + Supports only conversion between the 'centroids' and 'minmax' formats. + + This function is marginally slower on average than `convert_coordinates()`, + probably because it involves more (unnecessary) arithmetic operations (unnecessary + because the two matrices are sparse). + + For details please refer to the documentation of `convert_coordinates()`. + ''' + ind = start_index + tensor1 = np.copy(tensor).astype(np.float) + if conversion == 'minmax2centroids': + M = np.array([[0.5, 0. , -1., 0.], + [0.5, 0. , 1., 0.], + [0. , 0.5, 0., -1.], + [0. , 0.5, 0., 1.]]) + tensor1[..., ind:ind+4] = np.dot(tensor1[..., ind:ind+4], M) + elif conversion == 'centroids2minmax': + M = np.array([[ 1. , 1. , 0. , 0. ], + [ 0. , 0. , 1. , 1. ], + [-0.5, 0.5, 0. , 0. ], + [ 0. , 0. , -0.5, 0.5]]) # The multiplicative inverse of the matrix above + tensor1[..., ind:ind+4] = np.dot(tensor1[..., ind:ind+4], M) + else: + raise ValueError("Unexpected conversion value. 
Supported values are 'minmax2centroids' and 'centroids2minmax'.") + + return tensor1 + +def intersection_area(boxes1, boxes2, coords='centroids', mode='outer_product', border_pixels='half'): + ''' + Computes the intersection areas of two sets of axis-aligned 2D rectangular boxes. + + Let `boxes1` and `boxes2` contain `m` and `n` boxes, respectively. + + In 'outer_product' mode, returns an `(m,n)` matrix with the intersection areas for all possible + combinations of the boxes in `boxes1` and `boxes2`. + + In 'element-wise' mode, `m` and `n` must be broadcast-compatible. Refer to the explanation + of the `mode` argument for details. + + Arguments: + boxes1 (array): Either a 1D Numpy array of shape `(4, )` containing the coordinates for one box in the + format specified by `coords` or a 2D Numpy array of shape `(m, 4)` containing the coordinates for `m` boxes. + If `mode` is set to 'element_wise', the shape must be broadcast-compatible with `boxes2`. + boxes2 (array): Either a 1D Numpy array of shape `(4, )` containing the coordinates for one box in the + format specified by `coords` or a 2D Numpy array of shape `(n, 4)` containing the coordinates for `n` boxes. + If `mode` is set to 'element_wise', the shape must be broadcast-compatible with `boxes1`. + coords (str, optional): The coordinate format in the input arrays. Can be either 'centroids' for the format + `(cx, cy, w, h)`, 'minmax' for the format `(xmin, xmax, ymin, ymax)`, or 'corners' for the format + `(xmin, ymin, xmax, ymax)`. + mode (str, optional): Can be one of 'outer_product' and 'element-wise'. In 'outer_product' mode, returns an + `(m,n)` matrix with the intersection areas for all possible combinations of the `m` boxes in `boxes1` with the + `n` boxes in `boxes2`. In 'element-wise' mode, returns a 1D array and the shapes of `boxes1` and `boxes2` + must be boadcast-compatible. If both `boxes1` and `boxes2` have `m` boxes, then this returns an array of + length `m` where the i-th position contains the intersection area of `boxes1[i]` with `boxes2[i]`. + border_pixels (str, optional): How to treat the border pixels of the bounding boxes. + Can be 'include', 'exclude', or 'half'. If 'include', the border pixels belong + to the boxes. If 'exclude', the border pixels do not belong to the boxes. + If 'half', then one of each of the two horizontal and vertical borders belong + to the boxex, but not the other. + + Returns: + A 1D or 2D Numpy array (refer to the `mode` argument for details) of dtype float containing values with + the intersection areas of the boxes in `boxes1` and `boxes2`. + ''' + + # Make sure the boxes have the right shapes. + if boxes1.ndim > 2: raise ValueError("boxes1 must have rank either 1 or 2, but has rank {}.".format(boxes1.ndim)) + if boxes2.ndim > 2: raise ValueError("boxes2 must have rank either 1 or 2, but has rank {}.".format(boxes2.ndim)) + + if boxes1.ndim == 1: boxes1 = np.expand_dims(boxes1, axis=0) + if boxes2.ndim == 1: boxes2 = np.expand_dims(boxes2, axis=0) + + if not (boxes1.shape[1] == boxes2.shape[1] == 4): raise ValueError("All boxes must consist of 4 coordinates, but the boxes in `boxes1` and `boxes2` have {} and {} coordinates, respectively.".format(boxes1.shape[1], boxes2.shape[1])) + if not mode in {'outer_product', 'element-wise'}: raise ValueError("`mode` must be one of 'outer_product' and 'element-wise', but got '{}'.",format(mode)) + + # Convert the coordinates if necessary. 
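+    # All computations further down assume either the 'corners' or the 'minmax' layout,
+    # so boxes given in the 'centroids' format are converted to 'corners' first.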
+ if coords == 'centroids': + boxes1 = convert_coordinates(boxes1, start_index=0, conversion='centroids2corners') + boxes2 = convert_coordinates(boxes2, start_index=0, conversion='centroids2corners') + coords = 'corners' + elif not (coords in {'minmax', 'corners'}): + raise ValueError("Unexpected value for `coords`. Supported values are 'minmax', 'corners' and 'centroids'.") + + m = boxes1.shape[0] # The number of boxes in `boxes1` + n = boxes2.shape[0] # The number of boxes in `boxes2` + + # Set the correct coordinate indices for the respective formats. + if coords == 'corners': + xmin = 0 + ymin = 1 + xmax = 2 + ymax = 3 + elif coords == 'minmax': + xmin = 0 + xmax = 1 + ymin = 2 + ymax = 3 + + if border_pixels == 'half': + d = 0 + elif border_pixels == 'include': + d = 1 # If border pixels are supposed to belong to the bounding boxes, we have to add one pixel to any difference `xmax - xmin` or `ymax - ymin`. + elif border_pixels == 'exclude': + d = -1 # If border pixels are not supposed to belong to the bounding boxes, we have to subtract one pixel from any difference `xmax - xmin` or `ymax - ymin`. + + # Compute the intersection areas. + + if mode == 'outer_product': + + # For all possible box combinations, get the greater xmin and ymin values. + # This is a tensor of shape (m,n,2). + min_xy = np.maximum(np.tile(np.expand_dims(boxes1[:,[xmin,ymin]], axis=1), reps=(1, n, 1)), + np.tile(np.expand_dims(boxes2[:,[xmin,ymin]], axis=0), reps=(m, 1, 1))) + + # For all possible box combinations, get the smaller xmax and ymax values. + # This is a tensor of shape (m,n,2). + max_xy = np.minimum(np.tile(np.expand_dims(boxes1[:,[xmax,ymax]], axis=1), reps=(1, n, 1)), + np.tile(np.expand_dims(boxes2[:,[xmax,ymax]], axis=0), reps=(m, 1, 1))) + + # Compute the side lengths of the intersection rectangles. + side_lengths = np.maximum(0, max_xy - min_xy + d) + + return side_lengths[:,:,0] * side_lengths[:,:,1] + + elif mode == 'element-wise': + + min_xy = np.maximum(boxes1[:,[xmin,ymin]], boxes2[:,[xmin,ymin]]) + max_xy = np.minimum(boxes1[:,[xmax,ymax]], boxes2[:,[xmax,ymax]]) + + # Compute the side lengths of the intersection rectangles. + side_lengths = np.maximum(0, max_xy - min_xy + d) + + return side_lengths[:,0] * side_lengths[:,1] + +def intersection_area_(boxes1, boxes2, coords='corners', mode='outer_product', border_pixels='half'): + ''' + The same as 'intersection_area()' but for internal use, i.e. without all the safety checks. + ''' + + m = boxes1.shape[0] # The number of boxes in `boxes1` + n = boxes2.shape[0] # The number of boxes in `boxes2` + + # Set the correct coordinate indices for the respective formats. + if coords == 'corners': + xmin = 0 + ymin = 1 + xmax = 2 + ymax = 3 + elif coords == 'minmax': + xmin = 0 + xmax = 1 + ymin = 2 + ymax = 3 + + if border_pixels == 'half': + d = 0 + elif border_pixels == 'include': + d = 1 # If border pixels are supposed to belong to the bounding boxes, we have to add one pixel to any difference `xmax - xmin` or `ymax - ymin`. + elif border_pixels == 'exclude': + d = -1 # If border pixels are not supposed to belong to the bounding boxes, we have to subtract one pixel from any difference `xmax - xmin` or `ymax - ymin`. + + # Compute the intersection areas. + + if mode == 'outer_product': + + # For all possible box combinations, get the greater xmin and ymin values. + # This is a tensor of shape (m,n,2). 
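+        # Expanding and tiling both arrays pairs every box in `boxes1` with every box in `boxes2`.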
+ min_xy = np.maximum(np.tile(np.expand_dims(boxes1[:,[xmin,ymin]], axis=1), reps=(1, n, 1)), + np.tile(np.expand_dims(boxes2[:,[xmin,ymin]], axis=0), reps=(m, 1, 1))) + + # For all possible box combinations, get the smaller xmax and ymax values. + # This is a tensor of shape (m,n,2). + max_xy = np.minimum(np.tile(np.expand_dims(boxes1[:,[xmax,ymax]], axis=1), reps=(1, n, 1)), + np.tile(np.expand_dims(boxes2[:,[xmax,ymax]], axis=0), reps=(m, 1, 1))) + + # Compute the side lengths of the intersection rectangles. + side_lengths = np.maximum(0, max_xy - min_xy + d) + + return side_lengths[:,:,0] * side_lengths[:,:,1] + + elif mode == 'element-wise': + + min_xy = np.maximum(boxes1[:,[xmin,ymin]], boxes2[:,[xmin,ymin]]) + max_xy = np.minimum(boxes1[:,[xmax,ymax]], boxes2[:,[xmax,ymax]]) + + # Compute the side lengths of the intersection rectangles. + side_lengths = np.maximum(0, max_xy - min_xy + d) + + return side_lengths[:,0] * side_lengths[:,1] + + +def iou(boxes1, boxes2, coords='centroids', mode='outer_product', border_pixels='half'): + ''' + Computes the intersection-over-union similarity (also known as Jaccard similarity) + of two sets of axis-aligned 2D rectangular boxes. + + Let `boxes1` and `boxes2` contain `m` and `n` boxes, respectively. + + In 'outer_product' mode, returns an `(m,n)` matrix with the IoUs for all possible + combinations of the boxes in `boxes1` and `boxes2`. + + In 'element-wise' mode, `m` and `n` must be broadcast-compatible. Refer to the explanation + of the `mode` argument for details. + + Arguments: + boxes1 (array): Either a 1D Numpy array of shape `(4, )` containing the coordinates for one box in the + format specified by `coords` or a 2D Numpy array of shape `(m, 4)` containing the coordinates for `m` boxes. + If `mode` is set to 'element_wise', the shape must be broadcast-compatible with `boxes2`. + boxes2 (array): Either a 1D Numpy array of shape `(4, )` containing the coordinates for one box in the + format specified by `coords` or a 2D Numpy array of shape `(n, 4)` containing the coordinates for `n` boxes. + If `mode` is set to 'element_wise', the shape must be broadcast-compatible with `boxes1`. + coords (str, optional): The coordinate format in the input arrays. Can be either 'centroids' for the format + `(cx, cy, w, h)`, 'minmax' for the format `(xmin, xmax, ymin, ymax)`, or 'corners' for the format + `(xmin, ymin, xmax, ymax)`. + mode (str, optional): Can be one of 'outer_product' and 'element-wise'. In 'outer_product' mode, returns an + `(m,n)` matrix with the IoU overlaps for all possible combinations of the `m` boxes in `boxes1` with the + `n` boxes in `boxes2`. In 'element-wise' mode, returns a 1D array and the shapes of `boxes1` and `boxes2` + must be boadcast-compatible. If both `boxes1` and `boxes2` have `m` boxes, then this returns an array of + length `m` where the i-th position contains the IoU overlap of `boxes1[i]` with `boxes2[i]`. + border_pixels (str, optional): How to treat the border pixels of the bounding boxes. + Can be 'include', 'exclude', or 'half'. If 'include', the border pixels belong + to the boxes. If 'exclude', the border pixels do not belong to the boxes. + If 'half', then one of each of the two horizontal and vertical borders belong + to the boxex, but not the other. + + Returns: + A 1D or 2D Numpy array (refer to the `mode` argument for details) of dtype float containing values in [0,1], + the Jaccard similarity of the boxes in `boxes1` and `boxes2`. 
0 means there is no overlap between two given + boxes, 1 means their coordinates are identical. + ''' + + # Make sure the boxes have the right shapes. + if boxes1.ndim > 2: raise ValueError("boxes1 must have rank either 1 or 2, but has rank {}.".format(boxes1.ndim)) + if boxes2.ndim > 2: raise ValueError("boxes2 must have rank either 1 or 2, but has rank {}.".format(boxes2.ndim)) + + if boxes1.ndim == 1: boxes1 = np.expand_dims(boxes1, axis=0) + if boxes2.ndim == 1: boxes2 = np.expand_dims(boxes2, axis=0) + + if not (boxes1.shape[1] == boxes2.shape[1] == 4): raise ValueError("All boxes must consist of 4 coordinates, but the boxes in `boxes1` and `boxes2` have {} and {} coordinates, respectively.".format(boxes1.shape[1], boxes2.shape[1])) + if not mode in {'outer_product', 'element-wise'}: raise ValueError("`mode` must be one of 'outer_product' and 'element-wise', but got '{}'.".format(mode)) + + # Convert the coordinates if necessary. + if coords == 'centroids': + boxes1 = convert_coordinates(boxes1, start_index=0, conversion='centroids2corners') + boxes2 = convert_coordinates(boxes2, start_index=0, conversion='centroids2corners') + coords = 'corners' + elif not (coords in {'minmax', 'corners'}): + raise ValueError("Unexpected value for `coords`. Supported values are 'minmax', 'corners' and 'centroids'.") + + # Compute the IoU. + + # Compute the interesection areas. + + intersection_areas = intersection_area_(boxes1, boxes2, coords=coords, mode=mode) + + m = boxes1.shape[0] # The number of boxes in `boxes1` + n = boxes2.shape[0] # The number of boxes in `boxes2` + + # Compute the union areas. + + # Set the correct coordinate indices for the respective formats. + if coords == 'corners': + xmin = 0 + ymin = 1 + xmax = 2 + ymax = 3 + elif coords == 'minmax': + xmin = 0 + xmax = 1 + ymin = 2 + ymax = 3 + + if border_pixels == 'half': + d = 0 + elif border_pixels == 'include': + d = 1 # If border pixels are supposed to belong to the bounding boxes, we have to add one pixel to any difference `xmax - xmin` or `ymax - ymin`. + elif border_pixels == 'exclude': + d = -1 # If border pixels are not supposed to belong to the bounding boxes, we have to subtract one pixel from any difference `xmax - xmin` or `ymax - ymin`. + + if mode == 'outer_product': + + boxes1_areas = np.tile(np.expand_dims((boxes1[:,xmax] - boxes1[:,xmin] + d) * (boxes1[:,ymax] - boxes1[:,ymin] + d), axis=1), reps=(1,n)) + boxes2_areas = np.tile(np.expand_dims((boxes2[:,xmax] - boxes2[:,xmin] + d) * (boxes2[:,ymax] - boxes2[:,ymin] + d), axis=0), reps=(m,1)) + + elif mode == 'element-wise': + + boxes1_areas = (boxes1[:,xmax] - boxes1[:,xmin] + d) * (boxes1[:,ymax] - boxes1[:,ymin] + d) + boxes2_areas = (boxes2[:,xmax] - boxes2[:,xmin] + d) * (boxes2[:,ymax] - boxes2[:,ymin] + d) + + union_areas = boxes1_areas + boxes2_areas - intersection_areas + + return intersection_areas / union_areas diff --git a/keras_ssd/data_generator/__init__.py b/keras_ssd/data_generator/__init__.py new file mode 100644 index 0000000..e69de29 diff --git a/keras_ssd/data_generator/data_augmentation_chain_constant_input_size.py b/keras_ssd/data_generator/data_augmentation_chain_constant_input_size.py new file mode 100644 index 0000000..2c18a98 --- /dev/null +++ b/keras_ssd/data_generator/data_augmentation_chain_constant_input_size.py @@ -0,0 +1,183 @@ +''' +The data augmentation operations of the original SSD implementation. 
+ +Copyright (C) 2018 Pierluigi Ferrari + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +''' + +from __future__ import division +import numpy as np + +from data_generator.object_detection_2d_photometric_ops import ConvertColor, ConvertDataType, ConvertTo3Channels, RandomBrightness, RandomContrast, RandomHue, RandomSaturation +from data_generator.object_detection_2d_geometric_ops import RandomFlip, RandomTranslate, RandomScale +from data_generator.object_detection_2d_image_boxes_validation_utils import BoundGenerator, BoxFilter, ImageValidator + +class DataAugmentationConstantInputSize: + ''' + Applies a chain of photometric and geometric image transformations. For documentation, please refer + to the documentation of the individual transformations involved. + + Important: This augmentation chain is suitable for constant-size images only. + ''' + + def __init__(self, + random_brightness=(-48, 48, 0.5), + random_contrast=(0.5, 1.8, 0.5), + random_saturation=(0.5, 1.8, 0.5), + random_hue=(18, 0.5), + random_flip=0.5, + random_translate=((0.03,0.5), (0.03,0.5), 0.5), + random_scale=(0.5, 2.0, 0.5), + n_trials_max=3, + clip_boxes=True, + overlap_criterion='area', + bounds_box_filter=(0.3, 1.0), + bounds_validator=(0.5, 1.0), + n_boxes_min=1, + background=(0,0,0), + labels_format={'class_id': 0, 'xmin': 1, 'ymin': 2, 'xmax': 3, 'ymax': 4}): + + if (random_scale[0] >= 1) or (random_scale[1] <= 1): + raise ValueError("This sequence of transformations only makes sense if the minimum scaling factor is <1 and the maximum scaling factor is >1.") + + self.n_trials_max = n_trials_max + self.clip_boxes = clip_boxes + self.overlap_criterion = overlap_criterion + self.bounds_box_filter = bounds_box_filter + self.bounds_validator = bounds_validator + self.n_boxes_min = n_boxes_min + self.background = background + self.labels_format = labels_format + + # Determines which boxes are kept in an image after the transformations have been applied. + self.box_filter = BoxFilter(check_overlap=True, + check_min_area=True, + check_degenerate=True, + overlap_criterion=self.overlap_criterion, + overlap_bounds=self.bounds_box_filter, + min_area=16, + labels_format=self.labels_format) + + # Determines whether the result of the transformations is a valid training image. + self.image_validator = ImageValidator(overlap_criterion=self.overlap_criterion, + bounds=self.bounds_validator, + n_boxes_min=self.n_boxes_min, + labels_format=self.labels_format) + + # Utility distortions + self.convert_RGB_to_HSV = ConvertColor(current='RGB', to='HSV') + self.convert_HSV_to_RGB = ConvertColor(current='HSV', to='RGB') + self.convert_to_float32 = ConvertDataType(to='float32') + self.convert_to_uint8 = ConvertDataType(to='uint8') + self.convert_to_3_channels = ConvertTo3Channels() # Make sure all images end up having 3 channels. 
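+
+        # Note: In the two sequences defined below, brightness and contrast are applied to
+        # float32 RGB data, while saturation and hue are applied to float32 HSV data; the
+        # dtype and color-space conversions bracket the photometric transformations accordingly.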
+ + # Photometric transformations + self.random_brightness = RandomBrightness(lower=random_brightness[0], upper=random_brightness[1], prob=random_brightness[2]) + self.random_contrast = RandomContrast(lower=random_contrast[0], upper=random_contrast[1], prob=random_contrast[2]) + self.random_saturation = RandomSaturation(lower=random_saturation[0], upper=random_saturation[1], prob=random_saturation[2]) + self.random_hue = RandomHue(max_delta=random_hue[0], prob=random_hue[1]) + + # Geometric transformations + self.random_flip = RandomFlip(dim='horizontal', prob=random_flip, labels_format=self.labels_format) + self.random_translate = RandomTranslate(dy_minmax=random_translate[0], + dx_minmax=random_translate[1], + prob=random_translate[2], + clip_boxes=self.clip_boxes, + box_filter=self.box_filter, + image_validator=self.image_validator, + n_trials_max=self.n_trials_max, + background=self.background, + labels_format=self.labels_format) + self.random_zoom_in = RandomScale(min_factor=1.0, + max_factor=random_scale[1], + prob=random_scale[2], + clip_boxes=self.clip_boxes, + box_filter=self.box_filter, + image_validator=self.image_validator, + n_trials_max=self.n_trials_max, + background=self.background, + labels_format=self.labels_format) + self.random_zoom_out = RandomScale(min_factor=random_scale[0], + max_factor=1.0, + prob=random_scale[2], + clip_boxes=self.clip_boxes, + box_filter=self.box_filter, + image_validator=self.image_validator, + n_trials_max=self.n_trials_max, + background=self.background, + labels_format=self.labels_format) + + # If we zoom in, do translation before scaling. + self.sequence1 = [self.convert_to_3_channels, + self.convert_to_float32, + self.random_brightness, + self.random_contrast, + self.convert_to_uint8, + self.convert_RGB_to_HSV, + self.convert_to_float32, + self.random_saturation, + self.random_hue, + self.convert_to_uint8, + self.convert_HSV_to_RGB, + self.random_translate, + self.random_zoom_in, + self.random_flip] + + # If we zoom out, do scaling before translation. + self.sequence2 = [self.convert_to_3_channels, + self.convert_to_float32, + self.random_brightness, + self.convert_to_uint8, + self.convert_RGB_to_HSV, + self.convert_to_float32, + self.random_saturation, + self.random_hue, + self.convert_to_uint8, + self.convert_HSV_to_RGB, + self.convert_to_float32, + self.random_contrast, + self.convert_to_uint8, + self.random_zoom_out, + self.random_translate, + self.random_flip] + + def __call__(self, image, labels=None): + + self.random_translate.labels_format = self.labels_format + self.random_zoom_in.labels_format = self.labels_format + self.random_zoom_out.labels_format = self.labels_format + self.random_flip.labels_format = self.labels_format + + # Choose sequence 1 with probability 0.5. + if np.random.choice(2): + + if not (labels is None): + for transform in self.sequence1: + image, labels = transform(image, labels) + return image, labels + else: + for transform in self.sequence1: + image = transform(image) + return image + # Choose sequence 2 with probability 0.5. 
+ else: + + if not (labels is None): + for transform in self.sequence2: + image, labels = transform(image, labels) + return image, labels + else: + for transform in self.sequence2: + image = transform(image) + return image diff --git a/keras_ssd/data_generator/data_augmentation_chain_original_ssd.py b/keras_ssd/data_generator/data_augmentation_chain_original_ssd.py new file mode 100644 index 0000000..af8d498 --- /dev/null +++ b/keras_ssd/data_generator/data_augmentation_chain_original_ssd.py @@ -0,0 +1,280 @@ +''' +The data augmentation operations of the original SSD implementation. + +Copyright (C) 2018 Pierluigi Ferrari + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +''' + +from __future__ import division +import numpy as np +import cv2 +import inspect + +from data_generator.object_detection_2d_photometric_ops import ConvertColor, ConvertDataType, ConvertTo3Channels, RandomBrightness, RandomContrast, RandomHue, RandomSaturation, RandomChannelSwap +from data_generator.object_detection_2d_patch_sampling_ops import PatchCoordinateGenerator, RandomPatch, RandomPatchInf +from data_generator.object_detection_2d_geometric_ops import ResizeRandomInterp, RandomFlip +from data_generator.object_detection_2d_image_boxes_validation_utils import BoundGenerator, BoxFilter, ImageValidator + +class SSDRandomCrop: + ''' + Performs the same random crops as defined by the `batch_sampler` instructions + of the original Caffe implementation of SSD. A description of this random cropping + strategy can also be found in the data augmentation section of the paper: + https://arxiv.org/abs/1512.02325 + ''' + + def __init__(self, labels_format={'class_id': 0, 'xmin': 1, 'ymin': 2, 'xmax': 3, 'ymax': 4}): + ''' + Arguments: + labels_format (dict, optional): A dictionary that defines which index in the last axis of the labels + of an image contains which bounding box coordinate. The dictionary maps at least the keywords + 'xmin', 'ymin', 'xmax', and 'ymax' to their respective indices within last axis of the labels array. + ''' + + self.labels_format = labels_format + + # This randomly samples one of the lower IoU bounds defined + # by the `sample_space` every time it is called. + self.bound_generator = BoundGenerator(sample_space=((None, None), + (0.1, None), + (0.3, None), + (0.5, None), + (0.7, None), + (0.9, None)), + weights=None) + + # Produces coordinates for candidate patches such that the height + # and width of the patches are between 0.3 and 1.0 of the height + # and width of the respective image and the aspect ratio of the + # patches is between 0.5 and 2.0. + self.patch_coord_generator = PatchCoordinateGenerator(must_match='h_w', + min_scale=0.3, + max_scale=1.0, + scale_uniformly=False, + min_aspect_ratio = 0.5, + max_aspect_ratio = 2.0) + + # Filters out boxes whose center point does not lie within the + # chosen patches. 
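+        # Boxes that pass this filter are subsequently clipped to the patch boundaries,
+        # since the random crop below is configured with `clip_boxes=True`.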
+ self.box_filter = BoxFilter(check_overlap=True, + check_min_area=False, + check_degenerate=False, + overlap_criterion='center_point', + labels_format=self.labels_format) + + # Determines whether a given patch is considered a valid patch. + # Defines a patch to be valid if at least one ground truth bounding box + # (n_boxes_min == 1) has an IoU overlap with the patch that + # meets the requirements defined by `bound_generator`. + self.image_validator = ImageValidator(overlap_criterion='iou', + n_boxes_min=1, + labels_format=self.labels_format, + border_pixels='half') + + # Performs crops according to the parameters set in the objects above. + # Runs until either a valid patch is found or the original input image + # is returned unaltered. Runs a maximum of 50 trials to find a valid + # patch for each new sampled IoU threshold. Every 50 trials, the original + # image is returned as is with probability (1 - prob) = 0.143. + self.random_crop = RandomPatchInf(patch_coord_generator=self.patch_coord_generator, + box_filter=self.box_filter, + image_validator=self.image_validator, + bound_generator=self.bound_generator, + n_trials_max=50, + clip_boxes=True, + prob=0.857, + labels_format=self.labels_format) + + def __call__(self, image, labels=None, return_inverter=False): + self.random_crop.labels_format = self.labels_format + return self.random_crop(image, labels, return_inverter) + +class SSDExpand: + ''' + Performs the random image expansion as defined by the `train_transform_param` instructions + of the original Caffe implementation of SSD. A description of this expansion strategy + can also be found in section 3.6 ("Data Augmentation for Small Object Accuracy") of the paper: + https://arxiv.org/abs/1512.02325 + ''' + + def __init__(self, background=(123, 117, 104), labels_format={'class_id': 0, 'xmin': 1, 'ymin': 2, 'xmax': 3, 'ymax': 4}): + ''' + Arguments: + background (list/tuple, optional): A 3-tuple specifying the RGB color value of the + background pixels of the translated images. + labels_format (dict, optional): A dictionary that defines which index in the last axis of the labels + of an image contains which bounding box coordinate. The dictionary maps at least the keywords + 'xmin', 'ymin', 'xmax', and 'ymax' to their respective indices within last axis of the labels array. + ''' + + self.labels_format = labels_format + + # Generate coordinates for patches that are between 1.0 and 4.0 times + # the size of the input image in both spatial dimensions. + self.patch_coord_generator = PatchCoordinateGenerator(must_match='h_w', + min_scale=1.0, + max_scale=4.0, + scale_uniformly=True) + + # With probability 0.5, place the input image randomly on a canvas filled with + # mean color values according to the parameters set above. With probability 0.5, + # return the input image unaltered. + self.expand = RandomPatch(patch_coord_generator=self.patch_coord_generator, + box_filter=None, + image_validator=None, + n_trials_max=1, + clip_boxes=False, + prob=0.5, + background=background, + labels_format=self.labels_format) + + def __call__(self, image, labels=None, return_inverter=False): + self.expand.labels_format = self.labels_format + return self.expand(image, labels, return_inverter) + +class SSDPhotometricDistortions: + ''' + Performs the photometric distortions defined by the `train_transform_param` instructions + of the original Caffe implementation of SSD. 
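+
+    A rough usage sketch (assuming `image` is an HxWx3 array and `labels` an array
+    of ground truth boxes in the expected labels format):
+
+        distortions = SSDPhotometricDistortions()
+        image, labels = distortions(image, labels)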
+ ''' + + def __init__(self): + + self.convert_RGB_to_HSV = ConvertColor(current='RGB', to='HSV') + self.convert_HSV_to_RGB = ConvertColor(current='HSV', to='RGB') + self.convert_to_float32 = ConvertDataType(to='float32') + self.convert_to_uint8 = ConvertDataType(to='uint8') + self.convert_to_3_channels = ConvertTo3Channels() + self.random_brightness = RandomBrightness(lower=-32, upper=32, prob=0.5) + self.random_contrast = RandomContrast(lower=0.5, upper=1.5, prob=0.5) + self.random_saturation = RandomSaturation(lower=0.5, upper=1.5, prob=0.5) + self.random_hue = RandomHue(max_delta=18, prob=0.5) + self.random_channel_swap = RandomChannelSwap(prob=0.0) + + self.sequence1 = [self.convert_to_3_channels, + self.convert_to_float32, + self.random_brightness, + self.random_contrast, + self.convert_to_uint8, + self.convert_RGB_to_HSV, + self.convert_to_float32, + self.random_saturation, + self.random_hue, + self.convert_to_uint8, + self.convert_HSV_to_RGB, + self.random_channel_swap] + + self.sequence2 = [self.convert_to_3_channels, + self.convert_to_float32, + self.random_brightness, + self.convert_to_uint8, + self.convert_RGB_to_HSV, + self.convert_to_float32, + self.random_saturation, + self.random_hue, + self.convert_to_uint8, + self.convert_HSV_to_RGB, + self.convert_to_float32, + self.random_contrast, + self.convert_to_uint8, + self.random_channel_swap] + + def __call__(self, image, labels): + + # Choose sequence 1 with probability 0.5. + if np.random.choice(2): + + for transform in self.sequence1: + image, labels = transform(image, labels) + return image, labels + # Choose sequence 2 with probability 0.5. + else: + + for transform in self.sequence2: + image, labels = transform(image, labels) + return image, labels + +class SSDDataAugmentation: + ''' + Reproduces the data augmentation pipeline used in the training of the original + Caffe implementation of SSD. + ''' + + def __init__(self, + img_height=300, + img_width=300, + background=(123, 117, 104), + labels_format={'class_id': 0, 'xmin': 1, 'ymin': 2, 'xmax': 3, 'ymax': 4}): + ''' + Arguments: + height (int): The desired height of the output images in pixels. + width (int): The desired width of the output images in pixels. + background (list/tuple, optional): A 3-tuple specifying the RGB color value of the + background pixels of the translated images. + labels_format (dict, optional): A dictionary that defines which index in the last axis of the labels + of an image contains which bounding box coordinate. The dictionary maps at least the keywords + 'xmin', 'ymin', 'xmax', and 'ymax' to their respective indices within last axis of the labels array. + ''' + + self.labels_format = labels_format + + self.photometric_distortions = SSDPhotometricDistortions() + self.expand = SSDExpand(background=background, labels_format=self.labels_format) + self.random_crop = SSDRandomCrop(labels_format=self.labels_format) + self.random_flip = RandomFlip(dim='horizontal', prob=0.5, labels_format=self.labels_format) + + # This box filter makes sure that the resized images don't contain any degenerate boxes. + # Resizing the images could lead the boxes to becomes smaller. For boxes that are already + # pretty small, that might result in boxes with height and/or width zero, which we obviously + # cannot allow. 
+ self.box_filter = BoxFilter(check_overlap=False, + check_min_area=False, + check_degenerate=True, + labels_format=self.labels_format) + + self.resize = ResizeRandomInterp(height=img_height, + width=img_width, + interpolation_modes=[cv2.INTER_NEAREST, + cv2.INTER_LINEAR, + cv2.INTER_CUBIC, + cv2.INTER_AREA, + cv2.INTER_LANCZOS4], + box_filter=self.box_filter, + labels_format=self.labels_format) + + self.sequence = [self.photometric_distortions, + self.expand, + self.random_crop, + self.random_flip, + self.resize] + + def __call__(self, image, labels, return_inverter=False): + self.expand.labels_format = self.labels_format + self.random_crop.labels_format = self.labels_format + self.random_flip.labels_format = self.labels_format + self.resize.labels_format = self.labels_format + + inverters = [] + + for transform in self.sequence: + if return_inverter and ('return_inverter' in inspect.signature(transform).parameters): + image, labels, inverter = transform(image, labels, return_inverter=True) + inverters.append(inverter) + else: + image, labels = transform(image, labels) + + if return_inverter: + return image, labels, inverters[::-1] + else: + return image, labels diff --git a/keras_ssd/data_generator/data_augmentation_chain_satellite.py b/keras_ssd/data_generator/data_augmentation_chain_satellite.py new file mode 100644 index 0000000..c2e2cb9 --- /dev/null +++ b/keras_ssd/data_generator/data_augmentation_chain_satellite.py @@ -0,0 +1,157 @@ +''' +A data augmentation pipeline for datasets in bird's eye view, i.e. where there is +no "up" or "down" in the images. + +Copyright (C) 2018 Pierluigi Ferrari + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +''' + +from __future__ import division +import numpy as np + +from data_generator.object_detection_2d_photometric_ops import ConvertColor, ConvertDataType, ConvertTo3Channels, RandomBrightness, RandomContrast, RandomHue, RandomSaturation +from data_generator.object_detection_2d_geometric_ops import Resize, RandomFlip, RandomRotate +from data_generator.object_detection_2d_patch_sampling_ops import PatchCoordinateGenerator, RandomPatch +from data_generator.object_detection_2d_image_boxes_validation_utils import BoxFilter, ImageValidator + +class DataAugmentationSatellite: + ''' + A data augmentation pipeline for datasets in bird's eye view, i.e. where there is + no "up" or "down" in the images. + + Applies a chain of photometric and geometric image transformations. For documentation, please refer + to the documentation of the individual transformations involved. 
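+
+    A minimal usage sketch (the resize values are illustrative; `image` and `labels`
+    are assumed to be a single image array and its ground truth boxes):
+
+        augmentation = DataAugmentationSatellite(resize_height=300, resize_width=300)
+        image, labels = augmentation(image, labels)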
+ ''' + + def __init__(self, + resize_height, + resize_width, + random_brightness=(-48, 48, 0.5), + random_contrast=(0.5, 1.8, 0.5), + random_saturation=(0.5, 1.8, 0.5), + random_hue=(18, 0.5), + random_flip=0.5, + random_rotate=([90, 180, 270], 0.5), + min_scale=0.3, + max_scale=2.0, + min_aspect_ratio = 0.8, + max_aspect_ratio = 1.25, + n_trials_max=3, + clip_boxes=True, + overlap_criterion='area', + bounds_box_filter=(0.3, 1.0), + bounds_validator=(0.5, 1.0), + n_boxes_min=1, + background=(0,0,0), + labels_format={'class_id': 0, 'xmin': 1, 'ymin': 2, 'xmax': 3, 'ymax': 4}): + + self.n_trials_max = n_trials_max + self.clip_boxes = clip_boxes + self.overlap_criterion = overlap_criterion + self.bounds_box_filter = bounds_box_filter + self.bounds_validator = bounds_validator + self.n_boxes_min = n_boxes_min + self.background = background + self.labels_format = labels_format + + # Determines which boxes are kept in an image after the transformations have been applied. + self.box_filter_patch = BoxFilter(check_overlap=True, + check_min_area=False, + check_degenerate=False, + overlap_criterion=self.overlap_criterion, + overlap_bounds=self.bounds_box_filter, + labels_format=self.labels_format) + + self.box_filter_resize = BoxFilter(check_overlap=False, + check_min_area=True, + check_degenerate=True, + min_area=16, + labels_format=self.labels_format) + + # Determines whether the result of the transformations is a valid training image. + self.image_validator = ImageValidator(overlap_criterion=self.overlap_criterion, + bounds=self.bounds_validator, + n_boxes_min=self.n_boxes_min, + labels_format=self.labels_format) + + # Utility transformations + self.convert_to_3_channels = ConvertTo3Channels() # Make sure all images end up having 3 channels. + self.convert_RGB_to_HSV = ConvertColor(current='RGB', to='HSV') + self.convert_HSV_to_RGB = ConvertColor(current='HSV', to='RGB') + self.convert_to_float32 = ConvertDataType(to='float32') + self.convert_to_uint8 = ConvertDataType(to='uint8') + self.resize = Resize(height=resize_height, + width=resize_width, + box_filter=self.box_filter_resize, + labels_format=self.labels_format) + + # Photometric transformations + self.random_brightness = RandomBrightness(lower=random_brightness[0], upper=random_brightness[1], prob=random_brightness[2]) + self.random_contrast = RandomContrast(lower=random_contrast[0], upper=random_contrast[1], prob=random_contrast[2]) + self.random_saturation = RandomSaturation(lower=random_saturation[0], upper=random_saturation[1], prob=random_saturation[2]) + self.random_hue = RandomHue(max_delta=random_hue[0], prob=random_hue[1]) + + # Geometric transformations + self.random_horizontal_flip = RandomFlip(dim='horizontal', prob=random_flip, labels_format=self.labels_format) + self.random_vertical_flip = RandomFlip(dim='vertical', prob=random_flip, labels_format=self.labels_format) + self.random_rotate = RandomRotate(angles=random_rotate[0], prob=random_rotate[1], labels_format=self.labels_format) + self.patch_coord_generator = PatchCoordinateGenerator(must_match='w_ar', + min_scale=min_scale, + max_scale=max_scale, + scale_uniformly=False, + min_aspect_ratio = min_aspect_ratio, + max_aspect_ratio = max_aspect_ratio) + self.random_patch = RandomPatch(patch_coord_generator=self.patch_coord_generator, + box_filter=self.box_filter_patch, + image_validator=self.image_validator, + n_trials_max=self.n_trials_max, + clip_boxes=self.clip_boxes, + prob=1.0, + can_fail=False, + labels_format=self.labels_format) + + # Define the processing chain. 
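+        # Ordering note: the photometric ops run on the full-size image, the random patch
+        # sampling filters and validates the boxes against the sampled patch, and the
+        # resize to the target size happens last.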
+ self.transformations = [self.convert_to_3_channels, + self.convert_to_float32, + self.random_brightness, + self.random_contrast, + self.convert_to_uint8, + self.convert_RGB_to_HSV, + self.convert_to_float32, + self.random_saturation, + self.random_hue, + self.convert_to_uint8, + self.convert_HSV_to_RGB, + self.random_horizontal_flip, + self.random_vertical_flip, + self.random_rotate, + self.random_patch, + self.resize] + + def __call__(self, image, labels=None): + + self.random_patch.labels_format = self.labels_format + self.random_horizontal_flip.labels_format = self.labels_format + self.random_vertical_flip.labels_format = self.labels_format + self.random_rotate.labels_format = self.labels_format + self.resize.labels_format = self.labels_format + + if not (labels is None): + for transform in self.transformations: + image, labels = transform(image, labels) + return image, labels + else: + for transform in self.sequence1: + image = transform(image) + return image diff --git a/keras_ssd/data_generator/data_augmentation_chain_variable_input_size.py b/keras_ssd/data_generator/data_augmentation_chain_variable_input_size.py new file mode 100644 index 0000000..7d9f2b4 --- /dev/null +++ b/keras_ssd/data_generator/data_augmentation_chain_variable_input_size.py @@ -0,0 +1,152 @@ +''' +A data augmentation pipeline suitable for variable-size images that produces effects +that are similar (but not identical) to those of the original SSD data augmentation +pipeline while being faster. + +Copyright (C) 2018 Pierluigi Ferrari + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +''' + +from __future__ import division +import numpy as np + +from data_generator.object_detection_2d_photometric_ops import ConvertColor, ConvertDataType, ConvertTo3Channels, RandomBrightness, RandomContrast, RandomHue, RandomSaturation +from data_generator.object_detection_2d_geometric_ops import Resize, RandomFlip +from data_generator.object_detection_2d_patch_sampling_ops import PatchCoordinateGenerator, RandomPatch +from data_generator.object_detection_2d_image_boxes_validation_utils import BoxFilter, ImageValidator + +class DataAugmentationVariableInputSize: + ''' + A data augmentation pipeline suitable for variable-size images that produces effects + that are similar (but not identical!) to those of the original SSD data augmentation + pipeline while being faster. + + Applies a chain of photometric and geometric image transformations. For documentation, please refer + to the documentation of the individual transformations involved. 
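+
+    A minimal usage sketch (the resize values are illustrative; `image` and `labels`
+    are assumed to be a single image array and its ground truth boxes):
+
+        augmentation = DataAugmentationVariableInputSize(resize_height=300, resize_width=300)
+        image, labels = augmentation(image, labels)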
+ ''' + + def __init__(self, + resize_height, + resize_width, + random_brightness=(-48, 48, 0.5), + random_contrast=(0.5, 1.8, 0.5), + random_saturation=(0.5, 1.8, 0.5), + random_hue=(18, 0.5), + random_flip=0.5, + min_scale=0.3, + max_scale=2.0, + min_aspect_ratio = 0.5, + max_aspect_ratio = 2.0, + n_trials_max=3, + clip_boxes=True, + overlap_criterion='area', + bounds_box_filter=(0.3, 1.0), + bounds_validator=(0.5, 1.0), + n_boxes_min=1, + background=(0,0,0), + labels_format={'class_id': 0, 'xmin': 1, 'ymin': 2, 'xmax': 3, 'ymax': 4}): + + self.n_trials_max = n_trials_max + self.clip_boxes = clip_boxes + self.overlap_criterion = overlap_criterion + self.bounds_box_filter = bounds_box_filter + self.bounds_validator = bounds_validator + self.n_boxes_min = n_boxes_min + self.background = background + self.labels_format = labels_format + + # Determines which boxes are kept in an image after the transformations have been applied. + self.box_filter_patch = BoxFilter(check_overlap=True, + check_min_area=False, + check_degenerate=False, + overlap_criterion=self.overlap_criterion, + overlap_bounds=self.bounds_box_filter, + labels_format=self.labels_format) + + self.box_filter_resize = BoxFilter(check_overlap=False, + check_min_area=True, + check_degenerate=True, + min_area=16, + labels_format=self.labels_format) + + # Determines whether the result of the transformations is a valid training image. + self.image_validator = ImageValidator(overlap_criterion=self.overlap_criterion, + bounds=self.bounds_validator, + n_boxes_min=self.n_boxes_min, + labels_format=self.labels_format) + + # Utility transformations + self.convert_to_3_channels = ConvertTo3Channels() # Make sure all images end up having 3 channels. + self.convert_RGB_to_HSV = ConvertColor(current='RGB', to='HSV') + self.convert_HSV_to_RGB = ConvertColor(current='HSV', to='RGB') + self.convert_to_float32 = ConvertDataType(to='float32') + self.convert_to_uint8 = ConvertDataType(to='uint8') + self.resize = Resize(height=resize_height, + width=resize_width, + box_filter=self.box_filter_resize, + labels_format=self.labels_format) + + # Photometric transformations + self.random_brightness = RandomBrightness(lower=random_brightness[0], upper=random_brightness[1], prob=random_brightness[2]) + self.random_contrast = RandomContrast(lower=random_contrast[0], upper=random_contrast[1], prob=random_contrast[2]) + self.random_saturation = RandomSaturation(lower=random_saturation[0], upper=random_saturation[1], prob=random_saturation[2]) + self.random_hue = RandomHue(max_delta=random_hue[0], prob=random_hue[1]) + + # Geometric transformations + self.random_flip = RandomFlip(dim='horizontal', prob=random_flip, labels_format=self.labels_format) + self.patch_coord_generator = PatchCoordinateGenerator(must_match='w_ar', + min_scale=min_scale, + max_scale=max_scale, + scale_uniformly=False, + min_aspect_ratio = min_aspect_ratio, + max_aspect_ratio = max_aspect_ratio) + self.random_patch = RandomPatch(patch_coord_generator=self.patch_coord_generator, + box_filter=self.box_filter_patch, + image_validator=self.image_validator, + n_trials_max=self.n_trials_max, + clip_boxes=self.clip_boxes, + prob=1.0, + can_fail=False, + labels_format=self.labels_format) + + # Define the processing chain + self.transformations = [self.convert_to_3_channels, + self.convert_to_float32, + self.random_brightness, + self.random_contrast, + self.convert_to_uint8, + self.convert_RGB_to_HSV, + self.convert_to_float32, + self.random_saturation, + self.random_hue, + self.convert_to_uint8, + 
self.convert_HSV_to_RGB, + self.random_patch, + self.random_flip, + self.resize] + + def __call__(self, image, labels=None): + + self.random_patch.labels_format = self.labels_format + self.random_flip.labels_format = self.labels_format + self.resize.labels_format = self.labels_format + + if not (labels is None): + for transform in self.transformations: + image, labels = transform(image, labels) + return image, labels + else: + for transform in self.sequence1: + image = transform(image) + return image diff --git a/keras_ssd/data_generator/object_detection_2d_data_generator.py b/keras_ssd/data_generator/object_detection_2d_data_generator.py new file mode 100644 index 0000000..e5e6526 --- /dev/null +++ b/keras_ssd/data_generator/object_detection_2d_data_generator.py @@ -0,0 +1,1220 @@ +''' +A data generator for 2D object detection. + +Copyright (C) 2018 Pierluigi Ferrari + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +''' + +from __future__ import division +import numpy as np +import inspect +from collections import defaultdict +import warnings +import sklearn.utils +from copy import deepcopy +from PIL import Image +import cv2 +import csv +import os +import sys +from tqdm import tqdm, trange +try: + import h5py +except ImportError: + warnings.warn("'h5py' module is missing. The fast HDF5 dataset option will be unavailable.") +try: + import json +except ImportError: + warnings.warn("'json' module is missing. The JSON-parser will be unavailable.") +try: + from bs4 import BeautifulSoup +except ImportError: + warnings.warn("'BeautifulSoup' module is missing. The XML-parser will be unavailable.") +try: + import pickle +except ImportError: + warnings.warn("'pickle' module is missing. You won't be able to save parsed file lists and annotations as pickled files.") + +from ssd_encoder_decoder.ssd_input_encoder import SSDInputEncoder +from data_generator.object_detection_2d_image_boxes_validation_utils import BoxFilter + +class DegenerateBatchError(Exception): + ''' + An exception class to be raised if a generated batch ends up being degenerate, + e.g. if a generated batch is empty. + ''' + pass + +class DatasetError(Exception): + ''' + An exception class to be raised if a anything is wrong with the dataset, + in particular if you try to generate batches when no dataset was loaded. + ''' + pass + +class DataGenerator: + ''' + A generator to generate batches of samples and corresponding labels indefinitely. + + Can shuffle the dataset consistently after each complete pass. + + Currently provides three methods to parse annotation data: A general-purpose CSV parser, + an XML parser for the Pascal VOC datasets, and a JSON parser for the MS COCO datasets. + If the annotations of your dataset are in a format that is not supported by these parsers, + you could just add another parser method and still use this generator. + + Can perform image transformations for data conversion and data augmentation, + for details please refer to the documentation of the `generate()` method. 
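+
+    A minimal usage sketch (the file path is hypothetical, and the `generate()` arguments
+    shown are assumptions documented in that method rather than in this constructor):
+
+        dataset = DataGenerator(hdf5_dataset_path='path/to/dataset.h5')
+        generator = dataset.generate(batch_size=32, shuffle=True,
+                                     returns=('processed_images', 'processed_labels'))
+        batch_images, batch_labels = next(generator)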
+ ''' + + def __init__(self, + load_images_into_memory=False, + hdf5_dataset_path=None, + filenames=None, + filenames_type='text', + images_dir=None, + labels=None, + image_ids=None, + eval_neutral=None, + labels_output_format=('class_id', 'xmin', 'ymin', 'xmax', 'ymax'), + verbose=True): + ''' + Initializes the data generator. You can either load a dataset directly here in the constructor, + e.g. an HDF5 dataset, or you can use one of the parser methods to read in a dataset. + + Arguments: + load_images_into_memory (bool, optional): If `True`, the entire dataset will be loaded into memory. + This enables noticeably faster data generation than loading batches of images into memory ad hoc. + Be sure that you have enough memory before you activate this option. + hdf5_dataset_path (str, optional): The full file path of an HDF5 file that contains a dataset in the + format that the `create_hdf5_dataset()` method produces. If you load such an HDF5 dataset, you + don't need to use any of the parser methods anymore, the HDF5 dataset already contains all relevant + data. + filenames (string or list, optional): `None` or either a Python list/tuple or a string representing + a filepath. If a list/tuple is passed, it must contain the file names (full paths) of the + images to be used. Note that the list/tuple must contain the paths to the images, + not the images themselves. If a filepath string is passed, it must point either to + (1) a pickled file containing a list/tuple as described above. In this case the `filenames_type` + argument must be set to `pickle`. + Or + (2) a text file. Each line of the text file contains the file name (basename of the file only, + not the full directory path) to one image and nothing else. In this case the `filenames_type` + argument must be set to `text` and you must pass the path to the directory that contains the + images in `images_dir`. + filenames_type (string, optional): In case a string is passed for `filenames`, this indicates what + type of file `filenames` is. It can be either 'pickle' for a pickled file or 'text' for a + plain text file. + images_dir (string, optional): In case a text file is passed for `filenames`, the full paths to + the images will be composed from `images_dir` and the names in the text file, i.e. this + should be the directory that contains the images to which the text file refers. + If `filenames_type` is not 'text', then this argument is irrelevant. + labels (string or list, optional): `None` or either a Python list/tuple or a string representing + the path to a pickled file containing a list/tuple. The list/tuple must contain Numpy arrays + that represent the labels of the dataset. + image_ids (string or list, optional): `None` or either a Python list/tuple or a string representing + the path to a pickled file containing a list/tuple. The list/tuple must contain the image + IDs of the images in the dataset. + eval_neutral (string or list, optional): `None` or either a Python list/tuple or a string representing + the path to a pickled file containing a list/tuple. The list/tuple must contain for each image + a list that indicates for each ground truth object in the image whether that object is supposed + to be treated as neutral during an evaluation. + labels_output_format (list, optional): A list of five strings representing the desired order of the five + items class ID, xmin, ymin, xmax, ymax in the generated ground truth data (if any). The expected + strings are 'xmin', 'ymin', 'xmax', 'ymax', 'class_id'. 
+ verbose (bool, optional): If `True`, prints out the progress for some constructor operations that may
+ take a bit longer.
+ '''
+ self.labels_output_format = labels_output_format
+ self.labels_format={'class_id': labels_output_format.index('class_id'),
+ 'xmin': labels_output_format.index('xmin'),
+ 'ymin': labels_output_format.index('ymin'),
+ 'xmax': labels_output_format.index('xmax'),
+ 'ymax': labels_output_format.index('ymax')} # This dictionary is for internal use.
+
+ self.dataset_size = 0 # As long as we haven't loaded anything yet, the dataset size is zero.
+ self.load_images_into_memory = load_images_into_memory
+ self.images = None # The only way that this list will not stay `None` is if `load_images_into_memory == True`.
+
+ # `self.filenames` is a list containing all file names of the image samples (full paths).
+ # Note that it does not contain the actual image files themselves. This list is one of the outputs of the parser methods.
+ # In case you are loading an HDF5 dataset, this list will be `None`.
+ if not filenames is None:
+ if isinstance(filenames, (list, tuple)):
+ self.filenames = filenames
+ elif isinstance(filenames, str):
+ with open(filenames, 'rb') as f:
+ if filenames_type == 'pickle':
+ self.filenames = pickle.load(f)
+ elif filenames_type == 'text':
+ self.filenames = [os.path.join(images_dir, line.strip()) for line in f]
+ else:
+ raise ValueError("`filenames_type` can be either 'text' or 'pickle'.")
+ else:
+ raise ValueError("`filenames` must be either a Python list/tuple or a string representing a filepath (to a pickled or text file). The value you passed is neither of the two.")
+ self.dataset_size = len(self.filenames)
+ self.dataset_indices = np.arange(self.dataset_size, dtype=np.int32)
+ if load_images_into_memory:
+ self.images = []
+ if verbose: it = tqdm(self.filenames, desc='Loading images into memory', file=sys.stdout)
+ else: it = self.filenames
+ for filename in it:
+ with Image.open(filename) as image:
+ self.images.append(np.array(image, dtype=np.uint8))
+ else:
+ self.filenames = None
+
+ # In case ground truth is available, `self.labels` is a list containing for each image a list (or NumPy array)
+ # of ground truth bounding boxes for that image.
+ if not labels is None:
+ if isinstance(labels, str):
+ with open(labels, 'rb') as f:
+ self.labels = pickle.load(f)
+ elif isinstance(labels, (list, tuple)):
+ self.labels = labels
+ else:
+ raise ValueError("`labels` must be either a Python list/tuple or a string representing the path to a pickled file containing a list/tuple. The value you passed is neither of the two.")
+ else:
+ self.labels = None
+
+ if not image_ids is None:
+ if isinstance(image_ids, str):
+ with open(image_ids, 'rb') as f:
+ self.image_ids = pickle.load(f)
+ elif isinstance(image_ids, (list, tuple)):
+ self.image_ids = image_ids
+ else:
+ raise ValueError("`image_ids` must be either a Python list/tuple or a string representing the path to a pickled file containing a list/tuple. The value you passed is neither of the two.")
+ else:
+ self.image_ids = None
+
+ if not eval_neutral is None:
+ if isinstance(eval_neutral, str):
+ with open(eval_neutral, 'rb') as f:
+ self.eval_neutral = pickle.load(f)
+ elif isinstance(eval_neutral, (list, tuple)):
+ self.eval_neutral = eval_neutral
+ else:
+ raise ValueError("`eval_neutral` must be either a Python list/tuple or a string representing the path to a pickled file containing a list/tuple.
The value you passed is neither of the two.") + else: + self.eval_neutral = None + + if not hdf5_dataset_path is None: + self.hdf5_dataset_path = hdf5_dataset_path + self.load_hdf5_dataset(verbose=verbose) + else: + self.hdf5_dataset = None + + def load_hdf5_dataset(self, verbose=True): + ''' + Loads an HDF5 dataset that is in the format that the `create_hdf5_dataset()` method + produces. + + Arguments: + verbose (bool, optional): If `True`, prints out the progress while loading + the dataset. + + Returns: + None. + ''' + + self.hdf5_dataset = h5py.File(self.hdf5_dataset_path, 'r') + self.dataset_size = len(self.hdf5_dataset['images']) + self.dataset_indices = np.arange(self.dataset_size, dtype=np.int32) # Instead of shuffling the HDF5 dataset or images in memory, we will shuffle this index list. + + if self.load_images_into_memory: + self.images = [] + if verbose: tr = trange(self.dataset_size, desc='Loading images into memory', file=sys.stdout) + else: tr = range(self.dataset_size) + for i in tr: + self.images.append(self.hdf5_dataset['images'][i].reshape(self.hdf5_dataset['image_shapes'][i])) + + if self.hdf5_dataset.attrs['has_labels']: + self.labels = [] + labels = self.hdf5_dataset['labels'] + label_shapes = self.hdf5_dataset['label_shapes'] + if verbose: tr = trange(self.dataset_size, desc='Loading labels', file=sys.stdout) + else: tr = range(self.dataset_size) + for i in tr: + self.labels.append(labels[i].reshape(label_shapes[i])) + + if self.hdf5_dataset.attrs['has_image_ids']: + self.image_ids = [] + image_ids = self.hdf5_dataset['image_ids'] + if verbose: tr = trange(self.dataset_size, desc='Loading image IDs', file=sys.stdout) + else: tr = range(self.dataset_size) + for i in tr: + self.image_ids.append(image_ids[i]) + + if self.hdf5_dataset.attrs['has_eval_neutral']: + self.eval_neutral = [] + eval_neutral = self.hdf5_dataset['eval_neutral'] + if verbose: tr = trange(self.dataset_size, desc='Loading evaluation-neutrality annotations', file=sys.stdout) + else: tr = range(self.dataset_size) + for i in tr: + self.eval_neutral.append(eval_neutral[i]) + + def parse_csv(self, + images_dir, + labels_filename, + input_format, + include_classes='all', + random_sample=False, + ret=False, + verbose=True): + ''' + Arguments: + images_dir (str): The path to the directory that contains the images. + labels_filename (str): The filepath to a CSV file that contains one ground truth bounding box per line + and each line contains the following six items: image file name, class ID, xmin, xmax, ymin, ymax. + The six items do not have to be in a specific order, but they must be the first six columns of + each line. The order of these items in the CSV file must be specified in `input_format`. + The class ID is an integer greater than zero. Class ID 0 is reserved for the background class. + `xmin` and `xmax` are the left-most and right-most absolute horizontal coordinates of the box, + `ymin` and `ymax` are the top-most and bottom-most absolute vertical coordinates of the box. + The image name is expected to be just the name of the image file without the directory path + at which the image is located. + input_format (list): A list of six strings representing the order of the six items + image file name, class ID, xmin, xmax, ymin, ymax in the input CSV file. The expected strings + are 'image_name', 'xmin', 'xmax', 'ymin', 'ymax', 'class_id'. + include_classes (list, optional): Either 'all' or a list of integers containing the class IDs that + are to be included in the dataset. 
If 'all', all ground truth boxes will be included in the dataset.
+ random_sample (float, optional): Either `False` or a float in `[0,1]`. If this is `False`, the
+ full dataset will be used by the generator. If this is a float in `[0,1]`, a randomly sampled
+ fraction of the dataset will be used, where `random_sample` is the fraction of the dataset
+ to be used. For example, if `random_sample = 0.2`, 20 percent of the dataset will be randomly selected,
+ the rest will be omitted. The fraction refers to the number of images, not to the number
+ of boxes, i.e. each image that will be added to the dataset will always be added with all
+ of its boxes.
+ ret (bool, optional): Whether or not to return the outputs of the parser.
+ verbose (bool, optional): If `True`, prints out the progress for operations that may take a bit longer.
+
+ Returns:
+ None by default, optionally lists for whichever are available of images, image filenames, labels, and image IDs.
+ '''
+
+ # Set class members.
+ self.images_dir = images_dir
+ self.labels_filename = labels_filename
+ self.input_format = input_format
+ self.include_classes = include_classes
+
+ # Before we begin, make sure that we have a labels_filename and an input_format
+ if self.labels_filename is None or self.input_format is None:
+ raise ValueError("`labels_filename` and/or `input_format` have not been set yet. You need to pass them as arguments.")
+
+ # Erase data that might have been parsed before
+ self.filenames = []
+ self.image_ids = []
+ self.labels = []
+
+ # First, just read in the CSV file lines and sort them.
+
+ data = []
+
+ with open(self.labels_filename, newline='') as csvfile:
+ csvread = csv.reader(csvfile, delimiter=',')
+ next(csvread) # Skip the header row.
+ for row in csvread: # For every line (i.e. for every bounding box) in the CSV file...
+ if self.include_classes == 'all' or int(row[self.input_format.index('class_id')].strip()) in self.include_classes: # If the class_id is among the classes that are to be included in the dataset...
+ box = [] # Store the box class and coordinates here
+ box.append(row[self.input_format.index('image_name')].strip()) # Select the image name column in the input format and append its content to `box`
+ for element in self.labels_output_format: # For each element in the output format (where the elements are the class ID and the four box coordinates)...
+ box.append(int(row[self.input_format.index(element)].strip())) # ...select the respective column in the input format and append it to `box`.
+ data.append(box)
+
+ data = sorted(data) # The data needs to be sorted, otherwise the next step won't give the correct result
+
+ # Now that we've made sure that the data is sorted by file names,
+ # we can compile the actual samples and labels lists
+
+ current_file = data[0][0] # The current image for which we're collecting the ground truth boxes
+ current_image_id = data[0][0].split('.')[0] # The image ID will be the portion of the image name before the first dot.
+ current_labels = [] # The list where we collect all ground truth boxes for a given image
+ add_to_dataset = False
+ for i, box in enumerate(data):
+
+ if box[0] == current_file: # If this box (i.e. this line of the CSV file) belongs to the current image file
+ current_labels.append(box[1:])
+ if i == len(data)-1: # If this is the last line of the CSV file
+ if random_sample: # In case we're not using the full dataset, but a random sample of it.
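+ # Draw a uniform random number and keep this image (together with all of its
+ # boxes) only if the draw falls within the sampled fraction of the dataset.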
+ p = np.random.uniform(0,1) + if p >= (1-random_sample): + self.labels.append(np.stack(current_labels, axis=0)) + self.filenames.append(os.path.join(self.images_dir, current_file)) + self.image_ids.append(current_image_id) + else: + self.labels.append(np.stack(current_labels, axis=0)) + self.filenames.append(os.path.join(self.images_dir, current_file)) + self.image_ids.append(current_image_id) + else: # If this box belongs to a new image file + if random_sample: # In case we're not using the full dataset, but a random sample of it. + p = np.random.uniform(0,1) + if p >= (1-random_sample): + self.labels.append(np.stack(current_labels, axis=0)) + self.filenames.append(os.path.join(self.images_dir, current_file)) + self.image_ids.append(current_image_id) + else: + self.labels.append(np.stack(current_labels, axis=0)) + self.filenames.append(os.path.join(self.images_dir, current_file)) + self.image_ids.append(current_image_id) + current_labels = [] # Reset the labels list because this is a new file. + current_file = box[0] + current_image_id = box[0].split('.')[0] + current_labels.append(box[1:]) + if i == len(data)-1: # If this is the last line of the CSV file + if random_sample: # In case we're not using the full dataset, but a random sample of it. + p = np.random.uniform(0,1) + if p >= (1-random_sample): + self.labels.append(np.stack(current_labels, axis=0)) + self.filenames.append(os.path.join(self.images_dir, current_file)) + self.image_ids.append(current_image_id) + else: + self.labels.append(np.stack(current_labels, axis=0)) + self.filenames.append(os.path.join(self.images_dir, current_file)) + self.image_ids.append(current_image_id) + + self.dataset_size = len(self.filenames) + self.dataset_indices = np.arange(self.dataset_size, dtype=np.int32) + if self.load_images_into_memory: + self.images = [] + if verbose: it = tqdm(self.filenames, desc='Loading images into memory', file=sys.stdout) + else: it = self.filenames + for filename in it: + with Image.open(filename) as image: + self.images.append(np.array(image, dtype=np.uint8)) + + if ret: # In case we want to return these + return self.images, self.filenames, self.labels, self.image_ids + + def parse_xml(self, + images_dirs, + image_set_filenames, + annotations_dirs=[], + classes=['background', + 'aeroplane', 'bicycle', 'bird', 'boat', + 'bottle', 'bus', 'car', 'cat', + 'chair', 'cow', 'diningtable', 'dog', + 'horse', 'motorbike', 'person', 'pottedplant', + 'sheep', 'sofa', 'train', 'tvmonitor'], + include_classes = 'all', + exclude_truncated=False, + exclude_difficult=False, + ret=False, + verbose=True): + ''' + This is an XML parser for the Pascal VOC datasets. It might be applicable to other datasets with minor changes to + the code, but in its current form it expects the data format and XML tags of the Pascal VOC datasets. + + Arguments: + images_dirs (list): A list of strings, where each string is the path of a directory that + contains images that are to be part of the dataset. This allows you to aggregate multiple datasets + into one (e.g. one directory that contains the images for Pascal VOC 2007, another that contains + the images for Pascal VOC 2012, etc.). + image_set_filenames (list): A list of strings, where each string is the path of the text file with the image + set to be loaded. Must be one file per image directory given. These text files define what images in the + respective image directories are to be part of the dataset and simply contains one image ID per line + and nothing else. 
+ annotations_dirs (list, optional): A list of strings, where each string is the path of a directory that + contains the annotations (XML files) that belong to the images in the respective image directories given. + The directories must contain one XML file per image and the name of an XML file must be the image ID + of the image it belongs to. The content of the XML files must be in the Pascal VOC format. + classes (list, optional): A list containing the names of the object classes as found in the + `name` XML tags. Must include the class `background` as the first list item. The order of this list + defines the class IDs. + include_classes (list, optional): Either 'all' or a list of integers containing the class IDs that + are to be included in the dataset. If 'all', all ground truth boxes will be included in the dataset. + exclude_truncated (bool, optional): If `True`, excludes boxes that are labeled as 'truncated'. + exclude_difficult (bool, optional): If `True`, excludes boxes that are labeled as 'difficult'. + ret (bool, optional): Whether or not to return the outputs of the parser. + verbose (bool, optional): If `True`, prints out the progress for operations that may take a bit longer. + + Returns: + None by default, optionally lists for whichever are available of images, image filenames, labels, image IDs, + and a list indicating which boxes are annotated with the label "difficult". + ''' + # Set class members. + self.images_dirs = images_dirs + self.annotations_dirs = annotations_dirs + self.image_set_filenames = image_set_filenames + self.classes = classes + self.include_classes = include_classes + + # Erase data that might have been parsed before. + self.filenames = [] + self.image_ids = [] + self.labels = [] + self.eval_neutral = [] + if not annotations_dirs: + self.labels = None + self.eval_neutral = None + annotations_dirs = [None] * len(images_dirs) + + for images_dir, image_set_filename, annotations_dir in zip(images_dirs, image_set_filenames, annotations_dirs): + # Read the image set file that so that we know all the IDs of all the images to be included in the dataset. + with open(image_set_filename) as f: + image_ids = [line.strip() for line in f] # Note: These are strings, not integers. + self.image_ids += image_ids + + if verbose: it = tqdm(image_ids, desc="Processing image set '{}'".format(os.path.basename(image_set_filename)), file=sys.stdout) + else: it = image_ids + + # Loop over all images in this dataset. + for image_id in it: + + filename = '{}'.format(image_id) + '.jpg' + self.filenames.append(os.path.join(images_dir, filename)) + + if not annotations_dir is None: + # Parse the XML file for this image. + with open(os.path.join(annotations_dir, image_id + '.xml')) as f: + soup = BeautifulSoup(f, 'xml') + + folder = soup.folder.text # In case we want to return the folder in addition to the image file name. Relevant for determining which dataset an image belongs to. + #filename = soup.filename.text + + boxes = [] # We'll store all boxes for this image here. + eval_neutr = [] # We'll store whether a box is annotated as "difficult" here. + objects = soup.find_all('object') # Get a list of all objects in this image. + + # Parse the data for each object. + for obj in objects: + class_name = obj.find('name', recursive=False).text + class_id = self.classes.index(class_name) + # Check whether this class is supposed to be included in the dataset. 
+ if (not self.include_classes == 'all') and (not class_id in self.include_classes): continue + pose = obj.find('pose', recursive=False).text + truncated = int(obj.find('truncated', recursive=False).text) + if exclude_truncated and (truncated == 1): continue + difficult = int(obj.find('difficult', recursive=False).text) + if exclude_difficult and (difficult == 1): continue + # Get the bounding box coordinates. + bndbox = obj.find('bndbox', recursive=False) + xmin = int(bndbox.xmin.text) + ymin = int(bndbox.ymin.text) + xmax = int(bndbox.xmax.text) + ymax = int(bndbox.ymax.text) + item_dict = {'folder': folder, + 'image_name': filename, + 'image_id': image_id, + 'class_name': class_name, + 'class_id': class_id, + 'pose': pose, + 'truncated': truncated, + 'difficult': difficult, + 'xmin': xmin, + 'ymin': ymin, + 'xmax': xmax, + 'ymax': ymax} + box = [] + for item in self.labels_output_format: + box.append(item_dict[item]) + boxes.append(box) + if difficult: eval_neutr.append(True) + else: eval_neutr.append(False) + + self.labels.append(boxes) + self.eval_neutral.append(eval_neutr) + + self.dataset_size = len(self.filenames) + self.dataset_indices = np.arange(self.dataset_size, dtype=np.int32) + if self.load_images_into_memory: + self.images = [] + if verbose: it = tqdm(self.filenames, desc='Loading images into memory', file=sys.stdout) + else: it = self.filenames + for filename in it: + with Image.open(filename) as image: + self.images.append(np.array(image, dtype=np.uint8)) + + if ret: + return self.images, self.filenames, self.labels, self.image_ids, self.eval_neutral + + def parse_json(self, + images_dirs, + annotations_filenames, + ground_truth_available=False, + include_classes='all', + ret=False, + verbose=True): + ''' + This is an JSON parser for the MS COCO datasets. It might be applicable to other datasets with minor changes to + the code, but in its current form it expects the JSON format of the MS COCO datasets. + + Arguments: + images_dirs (list, optional): A list of strings, where each string is the path of a directory that + contains images that are to be part of the dataset. This allows you to aggregate multiple datasets + into one (e.g. one directory that contains the images for MS COCO Train 2014, another one for MS COCO + Val 2014, another one for MS COCO Train 2017 etc.). + annotations_filenames (list): A list of strings, where each string is the path of the JSON file + that contains the annotations for the images in the respective image directories given, i.e. one + JSON file per image directory that contains the annotations for all images in that directory. + The content of the JSON files must be in MS COCO object detection format. Note that these annotations + files do not necessarily need to contain ground truth information. MS COCO also provides annotations + files without ground truth information for the test datasets, called `image_info_[...].json`. + ground_truth_available (bool, optional): Set `True` if the annotations files contain ground truth information. + include_classes (list, optional): Either 'all' or a list of integers containing the class IDs that + are to be included in the dataset. If 'all', all ground truth boxes will be included in the dataset. + ret (bool, optional): Whether or not to return the outputs of the parser. + verbose (bool, optional): If `True`, prints out the progress for operations that may take a bit longer. + + Returns: + None by default, optionally lists for whichever are available of images, image filenames, labels and image IDs. 
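+
+ Example (an illustrative sketch; the MS COCO directory layout and file names
+ below are hypothetical placeholders):
+
+     dataset = DataGenerator()
+     dataset.parse_json(images_dirs=['data/coco/train2017'],
+                        annotations_filenames=['data/coco/annotations/instances_train2017.json'],
+                        ground_truth_available=True,
+                        include_classes='all')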
+ ''' + self.images_dirs = images_dirs + self.annotations_filenames = annotations_filenames + self.include_classes = include_classes + # Erase data that might have been parsed before. + self.filenames = [] + self.image_ids = [] + self.labels = [] + if not ground_truth_available: + self.labels = None + + # Build the dictionaries that map between class names and class IDs. + with open(annotations_filenames[0], 'r') as f: + annotations = json.load(f) + # Unfortunately the 80 MS COCO class IDs are not all consecutive. They go + # from 1 to 90 and some numbers are skipped. Since the IDs that we feed + # into a neural network must be consecutive, we'll save both the original + # (non-consecutive) IDs as well as transformed maps. + # We'll save both the map between the original + self.cats_to_names = {} # The map between class names (values) and their original IDs (keys) + self.classes_to_names = [] # A list of the class names with their indices representing the transformed IDs + self.classes_to_names.append('background') # Need to add the background class first so that the indexing is right. + self.cats_to_classes = {} # A dictionary that maps between the original (keys) and the transformed IDs (values) + self.classes_to_cats = {} # A dictionary that maps between the transformed (keys) and the original IDs (values) + for i, cat in enumerate(annotations['categories']): + self.cats_to_names[cat['id']] = cat['name'] + self.classes_to_names.append(cat['name']) + self.cats_to_classes[cat['id']] = i + 1 + self.classes_to_cats[i + 1] = cat['id'] + + # Iterate over all datasets. + for images_dir, annotations_filename in zip(self.images_dirs, self.annotations_filenames): + # Load the JSON file. + with open(annotations_filename, 'r') as f: + annotations = json.load(f) + + if ground_truth_available: + # Create the annotations map, a dictionary whose keys are the image IDs + # and whose values are the annotations for the respective image ID. + image_ids_to_annotations = defaultdict(list) + for annotation in annotations['annotations']: + image_ids_to_annotations[annotation['image_id']].append(annotation) + + if verbose: it = tqdm(annotations['images'], desc="Processing '{}'".format(os.path.basename(annotations_filename)), file=sys.stdout) + else: it = annotations['images'] + + # Loop over all images in this dataset. + for img in it: + + self.filenames.append(os.path.join(images_dir, img['file_name'])) + self.image_ids.append(img['id']) + + if ground_truth_available: + # Get all annotations for this image. + annotations = image_ids_to_annotations[img['id']] + boxes = [] + for annotation in annotations: + cat_id = annotation['category_id'] + # Check if this class is supposed to be included in the dataset. + if (not self.include_classes == 'all') and (not cat_id in self.include_classes): continue + # Transform the original class ID to fit in the sequence of consecutive IDs. + class_id = self.cats_to_classes[cat_id] + xmin = annotation['bbox'][0] + ymin = annotation['bbox'][1] + width = annotation['bbox'][2] + height = annotation['bbox'][3] + # Compute `xmax` and `ymax`. 
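+ # MS COCO boxes are given as `[xmin, ymin, width, height]`, so the right and
+ # bottom box coordinates follow by adding the width and height to the top-left corner.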
+ xmax = xmin + width + ymax = ymin + height + item_dict = {'image_name': img['file_name'], + 'image_id': img['id'], + 'class_id': class_id, + 'xmin': xmin, + 'ymin': ymin, + 'xmax': xmax, + 'ymax': ymax} + box = [] + for item in self.labels_output_format: + box.append(item_dict[item]) + boxes.append(box) + self.labels.append(boxes) + + self.dataset_size = len(self.filenames) + self.dataset_indices = np.arange(self.dataset_size, dtype=np.int32) + if self.load_images_into_memory: + self.images = [] + if verbose: it = tqdm(self.filenames, desc='Loading images into memory', file=sys.stdout) + else: it = self.filenames + for filename in it: + with Image.open(filename) as image: + self.images.append(np.array(image, dtype=np.uint8)) + + if ret: + return self.images, self.filenames, self.labels, self.image_ids + + def create_hdf5_dataset(self, + file_path='dataset.h5', + resize=False, + variable_image_size=True, + verbose=True): + ''' + Converts the currently loaded dataset into a HDF5 file. This HDF5 file contains all + images as uncompressed arrays in a contiguous block of memory, which allows for them + to be loaded faster. Such an uncompressed dataset, however, may take up considerably + more space on your hard drive than the sum of the source images in a compressed format + such as JPG or PNG. + + It is recommended that you always convert the dataset into an HDF5 dataset if you + have enugh hard drive space since loading from an HDF5 dataset accelerates the data + generation noticeably. + + Note that you must load a dataset (e.g. via one of the parser methods) before creating + an HDF5 dataset from it. + + The created HDF5 dataset will remain open upon its creation so that it can be used right + away. + + Arguments: + file_path (str, optional): The full file path under which to store the HDF5 dataset. + You can load this output file via the `DataGenerator` constructor in the future. + resize (tuple, optional): `False` or a 2-tuple `(height, width)` that represents the + target size for the images. All images in the dataset will be resized to this + target size before they will be written to the HDF5 file. If `False`, no resizing + will be performed. + variable_image_size (bool, optional): The only purpose of this argument is that its + value will be stored in the HDF5 dataset in order to be able to quickly find out + whether the images in the dataset all have the same size or not. + verbose (bool, optional): Whether or not prit out the progress of the dataset creation. + + Returns: + None. + ''' + + self.hdf5_dataset_path = file_path + + dataset_size = len(self.filenames) + + # Create the HDF5 file. + hdf5_dataset = h5py.File(file_path, 'w') + + # Create a few attributes that tell us what this dataset contains. + # The dataset will obviously always contain images, but maybe it will + # also contain labels, image IDs, etc. + hdf5_dataset.attrs.create(name='has_labels', data=False, shape=None, dtype=np.bool_) + hdf5_dataset.attrs.create(name='has_image_ids', data=False, shape=None, dtype=np.bool_) + hdf5_dataset.attrs.create(name='has_eval_neutral', data=False, shape=None, dtype=np.bool_) + # It's useful to be able to quickly check whether the images in a dataset all + # have the same size or not, so add a boolean attribute for that. 
+ if variable_image_size and not resize: + hdf5_dataset.attrs.create(name='variable_image_size', data=True, shape=None, dtype=np.bool_) + else: + hdf5_dataset.attrs.create(name='variable_image_size', data=False, shape=None, dtype=np.bool_) + + # Create the dataset in which the images will be stored as flattened arrays. + # This allows us, among other things, to store images of variable size. + hdf5_images = hdf5_dataset.create_dataset(name='images', + shape=(dataset_size,), + maxshape=(None), + dtype=h5py.special_dtype(vlen=np.uint8)) + + # Create the dataset that will hold the image heights, widths and channels that + # we need in order to reconstruct the images from the flattened arrays later. + hdf5_image_shapes = hdf5_dataset.create_dataset(name='image_shapes', + shape=(dataset_size, 3), + maxshape=(None, 3), + dtype=np.int32) + + if not (self.labels is None): + + # Create the dataset in which the labels will be stored as flattened arrays. + hdf5_labels = hdf5_dataset.create_dataset(name='labels', + shape=(dataset_size,), + maxshape=(None), + dtype=h5py.special_dtype(vlen=np.int32)) + + # Create the dataset that will hold the dimensions of the labels arrays for + # each image so that we can restore the labels from the flattened arrays later. + hdf5_label_shapes = hdf5_dataset.create_dataset(name='label_shapes', + shape=(dataset_size, 2), + maxshape=(None, 2), + dtype=np.int32) + + hdf5_dataset.attrs.modify(name='has_labels', value=True) + + if not (self.image_ids is None): + + hdf5_image_ids = hdf5_dataset.create_dataset(name='image_ids', + shape=(dataset_size,), + maxshape=(None), + dtype=h5py.special_dtype(vlen=str)) + + hdf5_dataset.attrs.modify(name='has_image_ids', value=True) + + if not (self.eval_neutral is None): + + # Create the dataset in which the labels will be stored as flattened arrays. + hdf5_eval_neutral = hdf5_dataset.create_dataset(name='eval_neutral', + shape=(dataset_size,), + maxshape=(None), + dtype=h5py.special_dtype(vlen=np.bool_)) + + hdf5_dataset.attrs.modify(name='has_eval_neutral', value=True) + + if verbose: + tr = trange(dataset_size, desc='Creating HDF5 dataset', file=sys.stdout) + else: + tr = range(dataset_size) + + # Iterate over all images in the dataset. + for i in tr: + + # Store the image. + with Image.open(self.filenames[i]) as image: + + image = np.asarray(image, dtype=np.uint8) + + # Make sure all images end up having three channels. + if image.ndim == 2: + image = np.stack([image] * 3, axis=-1) + elif image.ndim == 3: + if image.shape[2] == 1: + image = np.concatenate([image] * 3, axis=-1) + elif image.shape[2] == 4: + image = image[:,:,:3] + + if resize: + image = cv2.resize(image, dsize=(resize[1], resize[0])) + + # Flatten the image array and write it to the images dataset. + hdf5_images[i] = image.reshape(-1) + # Write the image's shape to the image shapes dataset. + hdf5_image_shapes[i] = image.shape + + # Store the ground truth if we have any. + if not (self.labels is None): + + labels = np.asarray(self.labels[i]) + # Flatten the labels array and write it to the labels dataset. + hdf5_labels[i] = labels.reshape(-1) + # Write the labels' shape to the label shapes dataset. + hdf5_label_shapes[i] = labels.shape + + # Store the image ID if we have one. + if not (self.image_ids is None): + + hdf5_image_ids[i] = self.image_ids[i] + + # Store the evaluation-neutrality annotations if we have any. 
+ if not (self.eval_neutral is None): + + hdf5_eval_neutral[i] = self.eval_neutral[i] + + hdf5_dataset.close() + self.hdf5_dataset = h5py.File(file_path, 'r') + self.hdf5_dataset_path = file_path + self.dataset_size = len(self.hdf5_dataset['images']) + self.dataset_indices = np.arange(self.dataset_size, dtype=np.int32) # Instead of shuffling the HDF5 dataset, we will shuffle this index list. + + def generate(self, + batch_size=32, + shuffle=True, + transformations=[], + label_encoder=None, + returns={'processed_images', 'encoded_labels'}, + keep_images_without_gt=False, + degenerate_box_handling='remove'): + ''' + Generates batches of samples and (optionally) corresponding labels indefinitely. + + Can shuffle the samples consistently after each complete pass. + + Optionally takes a list of arbitrary image transformations to apply to the + samples ad hoc. + + Arguments: + batch_size (int, optional): The size of the batches to be generated. + shuffle (bool, optional): Whether or not to shuffle the dataset before each pass. + This option should always be `True` during training, but it can be useful to turn shuffling off + for debugging or if you're using the generator for prediction. + transformations (list, optional): A list of transformations that will be applied to the images and labels + in the given order. Each transformation is a callable that takes as input an image (as a Numpy array) + and optionally labels (also as a Numpy array) and returns an image and optionally labels in the same + format. + label_encoder (callable, optional): Only relevant if labels are given. A callable that takes as input the + labels of a batch (as a list of Numpy arrays) and returns some structure that represents those labels. + The general use case for this is to convert labels from their input format to a format that a given object + detection model needs as its training targets. + returns (set, optional): A set of strings that determines what outputs the generator yields. The generator's output + is always a tuple that contains the outputs specified in this set and only those. If an output is not available, + it will be `None`. The output tuple can contain the following outputs according to the specified keyword strings: + * 'processed_images': An array containing the processed images. Will always be in the outputs, so it doesn't + matter whether or not you include this keyword in the set. + * 'encoded_labels': The encoded labels tensor. Will always be in the outputs if a label encoder is given, + so it doesn't matter whether or not you include this keyword in the set if you pass a label encoder. + * 'matched_anchors': Only available if `labels_encoder` is an `SSDInputEncoder` object. The same as 'encoded_labels', + but containing anchor box coordinates for all matched anchor boxes instead of ground truth coordinates. + This can be useful to visualize what anchor boxes are being matched to each ground truth box. Only available + in training mode. + * 'processed_labels': The processed, but not yet encoded labels. This is a list that contains for each + batch image a Numpy array with all ground truth boxes for that image. Only available if ground truth is available. + * 'filenames': A list containing the file names (full paths) of the images in the batch. + * 'image_ids': A list containing the integer IDs of the images in the batch. Only available if there + are image IDs available. + * 'evaluation-neutral': A nested list of lists of booleans. 
Each list contains `True` or `False` for every ground truth + bounding box of the respective image depending on whether that bounding box is supposed to be evaluation-neutral (`True`) + or not (`False`). May return `None` if there exists no such concept for a given dataset. An example for + evaluation-neutrality are the ground truth boxes annotated as "difficult" in the Pascal VOC datasets, which are + usually treated to be neutral in a model evaluation. + * 'inverse_transform': A nested list that contains a list of "inverter" functions for each item in the batch. + These inverter functions take (predicted) labels for an image as input and apply the inverse of the transformations + that were applied to the original image to them. This makes it possible to let the model make predictions on a + transformed image and then convert these predictions back to the original image. This is mostly relevant for + evaluation: If you want to evaluate your model on a dataset with varying image sizes, then you are forced to + transform the images somehow (e.g. by resizing or cropping) to make them all the same size. Your model will then + predict boxes for those transformed images, but for the evaluation you will need predictions with respect to the + original images, not with respect to the transformed images. This means you will have to transform the predicted + box coordinates back to the original image sizes. Note that for each image, the inverter functions for that + image need to be applied in the order in which they are given in the respective list for that image. + * 'original_images': A list containing the original images in the batch before any processing. + * 'original_labels': A list containing the original ground truth boxes for the images in this batch before any + processing. Only available if ground truth is available. + The order of the outputs in the tuple is the order of the list above. If `returns` contains a keyword for an + output that is unavailable, that output omitted in the yielded tuples and a warning will be raised. + keep_images_without_gt (bool, optional): If `False`, images for which there aren't any ground truth boxes before + any transformations have been applied will be removed from the batch. If `True`, such images will be kept + in the batch. + degenerate_box_handling (str, optional): How to handle degenerate boxes, which are boxes that have `xmax <= xmin` and/or + `ymax <= ymin`. Degenerate boxes can sometimes be in the dataset, or non-degenerate boxes can become degenerate + after they were processed by transformations. Note that the generator checks for degenerate boxes after all + transformations have been applied (if any), but before the labels were passed to the `label_encoder` (if one was given). + Can be one of 'warn' or 'remove'. If 'warn', the generator will merely print a warning to let you know that there + are degenerate boxes in a batch. If 'remove', the generator will remove degenerate boxes from the batch silently. + + Yields: + The next batch as a tuple of items as defined by the `returns` argument. + ''' + + if self.dataset_size == 0: + raise DatasetError("Cannot generate batches because you did not load a dataset.") + + ############################################################################################# + # Warn if any of the set returns aren't possible. 
+ ############################################################################################# + + if self.labels is None: + if any([ret in returns for ret in ['original_labels', 'processed_labels', 'encoded_labels', 'matched_anchors', 'evaluation-neutral']]): + warnings.warn("Since no labels were given, none of 'original_labels', 'processed_labels', 'evaluation-neutral', 'encoded_labels', and 'matched_anchors' " + + "are possible returns, but you set `returns = {}`. The impossible returns will be `None`.".format(returns)) + elif label_encoder is None: + if any([ret in returns for ret in ['encoded_labels', 'matched_anchors']]): + warnings.warn("Since no label encoder was given, 'encoded_labels' and 'matched_anchors' aren't possible returns, " + + "but you set `returns = {}`. The impossible returns will be `None`.".format(returns)) + elif not isinstance(label_encoder, SSDInputEncoder): + if 'matched_anchors' in returns: + warnings.warn("`label_encoder` is not an `SSDInputEncoder` object, therefore 'matched_anchors' is not a possible return, " + + "but you set `returns = {}`. The impossible returns will be `None`.".format(returns)) + + ############################################################################################# + # Do a few preparatory things like maybe shuffling the dataset initially. + ############################################################################################# + + if shuffle: + objects_to_shuffle = [self.dataset_indices] + if not (self.filenames is None): + objects_to_shuffle.append(self.filenames) + if not (self.labels is None): + objects_to_shuffle.append(self.labels) + if not (self.image_ids is None): + objects_to_shuffle.append(self.image_ids) + if not (self.eval_neutral is None): + objects_to_shuffle.append(self.eval_neutral) + shuffled_objects = sklearn.utils.shuffle(*objects_to_shuffle) + for i in range(len(objects_to_shuffle)): + objects_to_shuffle[i][:] = shuffled_objects[i] + + if degenerate_box_handling == 'remove': + box_filter = BoxFilter(check_overlap=False, + check_min_area=False, + check_degenerate=True, + labels_format=self.labels_format) + + # Override the labels formats of all the transformations to make sure they are set correctly. + if not (self.labels is None): + for transform in transformations: + transform.labels_format = self.labels_format + + ############################################################################################# + # Generate mini batches. + ############################################################################################# + + current = 0 + + while True: + + batch_X, batch_y = [], [] + + if current >= self.dataset_size: + current = 0 + + ######################################################################################### + # Maybe shuffle the dataset if a full pass over the dataset has finished. 
+ ######################################################################################### + + if shuffle: + objects_to_shuffle = [self.dataset_indices] + if not (self.filenames is None): + objects_to_shuffle.append(self.filenames) + if not (self.labels is None): + objects_to_shuffle.append(self.labels) + if not (self.image_ids is None): + objects_to_shuffle.append(self.image_ids) + if not (self.eval_neutral is None): + objects_to_shuffle.append(self.eval_neutral) + shuffled_objects = sklearn.utils.shuffle(*objects_to_shuffle) + for i in range(len(objects_to_shuffle)): + objects_to_shuffle[i][:] = shuffled_objects[i] + + ######################################################################################### + # Get the images, (maybe) image IDs, (maybe) labels, etc. for this batch. + ######################################################################################### + + # We prioritize our options in the following order: + # 1) If we have the images already loaded in memory, get them from there. + # 2) Else, if we have an HDF5 dataset, get the images from there. + # 3) Else, if we have neither of the above, we'll have to load the individual image + # files from disk. + batch_indices = self.dataset_indices[current:current+batch_size] + if not (self.images is None): + for i in batch_indices: + batch_X.append(self.images[i]) + if not (self.filenames is None): + batch_filenames = self.filenames[current:current+batch_size] + else: + batch_filenames = None + elif not (self.hdf5_dataset is None): + for i in batch_indices: + batch_X.append(self.hdf5_dataset['images'][i].reshape(self.hdf5_dataset['image_shapes'][i])) + if not (self.filenames is None): + batch_filenames = self.filenames[current:current+batch_size] + else: + batch_filenames = None + else: + batch_filenames = self.filenames[current:current+batch_size] + for filename in batch_filenames: + with Image.open(filename) as image: + batch_X.append(np.array(image, dtype=np.uint8)) + + # Get the labels for this batch (if there are any). + if not (self.labels is None): + batch_y = deepcopy(self.labels[current:current+batch_size]) + else: + batch_y = None + + if not (self.eval_neutral is None): + batch_eval_neutral = self.eval_neutral[current:current+batch_size] + else: + batch_eval_neutral = None + + # Get the image IDs for this batch (if there are any). + if not (self.image_ids is None): + batch_image_ids = self.image_ids[current:current+batch_size] + else: + batch_image_ids = None + + if 'original_images' in returns: + batch_original_images = deepcopy(batch_X) # The original, unaltered images + if 'original_labels' in returns: + batch_original_labels = deepcopy(batch_y) # The original, unaltered labels + + current += batch_size + + ######################################################################################### + # Maybe perform image transformations. + ######################################################################################### + + batch_items_to_remove = [] # In case we need to remove any images from the batch, store their indices in this list. + batch_inverse_transforms = [] + + for i in range(len(batch_X)): + + if not (self.labels is None): + # Convert the labels for this image to an array (in case they aren't already). + batch_y[i] = np.array(batch_y[i]) + # If this image has no ground truth boxes, maybe we don't want to keep it in the batch. 
+ if (batch_y[i].size == 0) and not keep_images_without_gt: + batch_items_to_remove.append(i) + batch_inverse_transforms.append([]) + continue + + # Apply any image transformations we may have received. + if transformations: + + inverse_transforms = [] + + for transform in transformations: + + if not (self.labels is None): + + if ('inverse_transform' in returns) and ('return_inverter' in inspect.signature(transform).parameters): + batch_X[i], batch_y[i], inverse_transform = transform(batch_X[i], batch_y[i], return_inverter=True) + inverse_transforms.append(inverse_transform) + else: + batch_X[i], batch_y[i] = transform(batch_X[i], batch_y[i]) + + if batch_X[i] is None: # In case the transform failed to produce an output image, which is possible for some random transforms. + batch_items_to_remove.append(i) + batch_inverse_transforms.append([]) + continue + + else: + + if ('inverse_transform' in returns) and ('return_inverter' in inspect.signature(transform).parameters): + batch_X[i], inverse_transform = transform(batch_X[i], return_inverter=True) + inverse_transforms.append(inverse_transform) + else: + batch_X[i] = transform(batch_X[i]) + + batch_inverse_transforms.append(inverse_transforms[::-1]) + + ######################################################################################### + # Check for degenerate boxes in this batch item. + ######################################################################################### + + if not (self.labels is None): + + xmin = self.labels_format['xmin'] + ymin = self.labels_format['ymin'] + xmax = self.labels_format['xmax'] + ymax = self.labels_format['ymax'] + + if np.any(batch_y[i][:,xmax] - batch_y[i][:,xmin] <= 0) or np.any(batch_y[i][:,ymax] - batch_y[i][:,ymin] <= 0): + if degenerate_box_handling == 'warn': + warnings.warn("Detected degenerate ground truth bounding boxes for batch item {} with bounding boxes {}, ".format(i, batch_y[i]) + + "i.e. bounding boxes where xmax <= xmin and/or ymax <= ymin. " + + "This could mean that your dataset contains degenerate ground truth boxes, or that any image transformations you may apply might " + + "result in degenerate ground truth boxes, or that you are parsing the ground truth in the wrong coordinate format." + + "Degenerate ground truth bounding boxes may lead to NaN errors during the training.") + elif degenerate_box_handling == 'remove': + batch_y[i] = box_filter(batch_y[i]) + if (batch_y[i].size == 0) and not keep_images_without_gt: + batch_items_to_remove.append(i) + + ######################################################################################### + # Remove any items we might not want to keep from the batch. + ######################################################################################### + + if batch_items_to_remove: + for j in sorted(batch_items_to_remove, reverse=True): + # This isn't efficient, but it hopefully shouldn't need to be done often anyway. 
+ batch_X.pop(j) + batch_filenames.pop(j) + if batch_inverse_transforms: batch_inverse_transforms.pop(j) + if not (self.labels is None): batch_y.pop(j) + if not (self.image_ids is None): batch_image_ids.pop(j) + if not (self.eval_neutral is None): batch_eval_neutral.pop(j) + if 'original_images' in returns: batch_original_images.pop(j) + if 'original_labels' in returns and not (self.labels is None): batch_original_labels.pop(j) + + ######################################################################################### + + # CAUTION: Converting `batch_X` into an array will result in an empty batch if the images have varying sizes + # or varying numbers of channels. At this point, all images must have the same size and the same + # number of channels. + batch_X = np.array(batch_X) + if (batch_X.size == 0): + raise DegenerateBatchError("You produced an empty batch. This might be because the images in the batch vary " + + "in their size and/or number of channels. Note that after all transformations " + + "(if any were given) have been applied to all images in the batch, all images " + + "must be homogenous in size along all axes.") + + ######################################################################################### + # If we have a label encoder, encode our labels. + ######################################################################################### + + if not (label_encoder is None or self.labels is None): + + if ('matched_anchors' in returns) and isinstance(label_encoder, SSDInputEncoder): + batch_y_encoded, batch_matched_anchors = label_encoder(batch_y, diagnostics=True) + else: + batch_y_encoded = label_encoder(batch_y, diagnostics=False) + batch_matched_anchors = None + + else: + batch_y_encoded = None + batch_matched_anchors = None + + ######################################################################################### + # Compose the output. + ######################################################################################### + + ret = [] + if 'processed_images' in returns: ret.append(batch_X) + if 'encoded_labels' in returns: ret.append(batch_y_encoded) + if 'matched_anchors' in returns: ret.append(batch_matched_anchors) + if 'processed_labels' in returns: ret.append(batch_y) + if 'filenames' in returns: ret.append(batch_filenames) + if 'image_ids' in returns: ret.append(batch_image_ids) + if 'evaluation-neutral' in returns: ret.append(batch_eval_neutral) + if 'inverse_transform' in returns: ret.append(batch_inverse_transforms) + if 'original_images' in returns: ret.append(batch_original_images) + if 'original_labels' in returns: ret.append(batch_original_labels) + + yield ret + + def save_dataset(self, + filenames_path='filenames.pkl', + labels_path=None, + image_ids_path=None, + eval_neutral_path=None): + ''' + Writes the current `filenames`, `labels`, and `image_ids` lists to the specified files. + This is particularly useful for large datasets with annotations that are + parsed from XML files, which can take quite long. If you'll be using the + same dataset repeatedly, you don't want to have to parse the XML label + files every time. + + Arguments: + filenames_path (str): The path under which to save the filenames pickle. + labels_path (str): The path under which to save the labels pickle. + image_ids_path (str, optional): The path under which to save the image IDs pickle. + eval_neutral_path (str, optional): The path under which to save the pickle for + the evaluation-neutrality annotations. 
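+
+ Example (an illustrative sketch; the output file names are arbitrary placeholders):
+
+     dataset.save_dataset(filenames_path='voc_filenames.pkl',
+                          labels_path='voc_labels.pkl',
+                          image_ids_path='voc_image_ids.pkl',
+                          eval_neutral_path='voc_eval_neutral.pkl')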
+ ''' + with open(filenames_path, 'wb') as f: + pickle.dump(self.filenames, f) + if not labels_path is None: + with open(labels_path, 'wb') as f: + pickle.dump(self.labels, f) + if not image_ids_path is None: + with open(image_ids_path, 'wb') as f: + pickle.dump(self.image_ids, f) + if not eval_neutral_path is None: + with open(eval_neutral_path, 'wb') as f: + pickle.dump(self.eval_neutral, f) + + def get_dataset(self): + ''' + Returns: + 4-tuple containing lists and/or `None` for the filenames, labels, image IDs, + and evaluation-neutrality annotations. + ''' + return self.filenames, self.labels, self.image_ids, self.eval_neutral + + def get_dataset_size(self): + ''' + Returns: + The number of images in the dataset. + ''' + return self.dataset_size diff --git a/keras_ssd/data_generator/object_detection_2d_geometric_ops.py b/keras_ssd/data_generator/object_detection_2d_geometric_ops.py new file mode 100644 index 0000000..1b36815 --- /dev/null +++ b/keras_ssd/data_generator/object_detection_2d_geometric_ops.py @@ -0,0 +1,779 @@ +''' +Various geometric image transformations for 2D object detection, both deterministic +and probabilistic. + +Copyright (C) 2018 Pierluigi Ferrari + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +''' + +from __future__ import division +import numpy as np +import cv2 +import random + +from data_generator.object_detection_2d_image_boxes_validation_utils import BoxFilter, ImageValidator + +class Resize: + ''' + Resizes images to a specified height and width in pixels. + ''' + + def __init__(self, + height, + width, + interpolation_mode=cv2.INTER_LINEAR, + box_filter=None, + labels_format={'class_id': 0, 'xmin': 1, 'ymin': 2, 'xmax': 3, 'ymax': 4}): + ''' + Arguments: + height (int): The desired height of the output images in pixels. + width (int): The desired width of the output images in pixels. + interpolation_mode (int, optional): An integer that denotes a valid + OpenCV interpolation mode. For example, integers 0 through 5 are + valid interpolation modes. + box_filter (BoxFilter, optional): Only relevant if ground truth bounding boxes are given. + A `BoxFilter` object to filter out bounding boxes that don't meet the given criteria + after the transformation. Refer to the `BoxFilter` documentation for details. If `None`, + the validity of the bounding boxes is not checked. + labels_format (dict, optional): A dictionary that defines which index in the last axis of the labels + of an image contains which bounding box coordinate. The dictionary maps at least the keywords + 'xmin', 'ymin', 'xmax', and 'ymax' to their respective indices within last axis of the labels array. 
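+
+ Example (an illustrative sketch; assumes `image` is a Numpy array of shape
+ `(height, width, 3)` and `labels` is a Numpy array in the default
+ `(class_id, xmin, ymin, xmax, ymax)` format):
+
+     resize = Resize(height=300, width=300)
+     image, labels = resize(image, labels)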
+ ''' + if not (isinstance(box_filter, BoxFilter) or box_filter is None): + raise ValueError("`box_filter` must be either `None` or a `BoxFilter` object.") + self.out_height = height + self.out_width = width + self.interpolation_mode = interpolation_mode + self.box_filter = box_filter + self.labels_format = labels_format + + def __call__(self, image, labels=None, return_inverter=False): + + img_height, img_width = image.shape[:2] + + xmin = self.labels_format['xmin'] + ymin = self.labels_format['ymin'] + xmax = self.labels_format['xmax'] + ymax = self.labels_format['ymax'] + + image = cv2.resize(image, + dsize=(self.out_width, self.out_height), + interpolation=self.interpolation_mode) + + if return_inverter: + def inverter(labels): + labels = np.copy(labels) + labels[:, [ymin+1, ymax+1]] = np.round(labels[:, [ymin+1, ymax+1]] * (img_height / self.out_height), decimals=0) + labels[:, [xmin+1, xmax+1]] = np.round(labels[:, [xmin+1, xmax+1]] * (img_width / self.out_width), decimals=0) + return labels + + if labels is None: + if return_inverter: + return image, inverter + else: + return image + else: + labels = np.copy(labels) + labels[:, [ymin, ymax]] = np.round(labels[:, [ymin, ymax]] * (self.out_height / img_height), decimals=0) + labels[:, [xmin, xmax]] = np.round(labels[:, [xmin, xmax]] * (self.out_width / img_width), decimals=0) + + if not (self.box_filter is None): + self.box_filter.labels_format = self.labels_format + labels = self.box_filter(labels=labels, + image_height=self.out_height, + image_width=self.out_width) + + if return_inverter: + return image, labels, inverter + else: + return image, labels + +class ResizeRandomInterp: + ''' + Resizes images to a specified height and width in pixels using a radnomly + selected interpolation mode. + ''' + + def __init__(self, + height, + width, + interpolation_modes=[cv2.INTER_NEAREST, + cv2.INTER_LINEAR, + cv2.INTER_CUBIC, + cv2.INTER_AREA, + cv2.INTER_LANCZOS4], + box_filter=None, + labels_format={'class_id': 0, 'xmin': 1, 'ymin': 2, 'xmax': 3, 'ymax': 4}): + ''' + Arguments: + height (int): The desired height of the output image in pixels. + width (int): The desired width of the output image in pixels. + interpolation_modes (list/tuple, optional): A list/tuple of integers + that represent valid OpenCV interpolation modes. For example, + integers 0 through 5 are valid interpolation modes. + box_filter (BoxFilter, optional): Only relevant if ground truth bounding boxes are given. + A `BoxFilter` object to filter out bounding boxes that don't meet the given criteria + after the transformation. Refer to the `BoxFilter` documentation for details. If `None`, + the validity of the bounding boxes is not checked. + labels_format (dict, optional): A dictionary that defines which index in the last axis of the labels + of an image contains which bounding box coordinate. The dictionary maps at least the keywords + 'xmin', 'ymin', 'xmax', and 'ymax' to their respective indices within last axis of the labels array. 
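+
+ Example (an illustrative sketch): restrict the random choice to nearest-neighbor
+ and bilinear interpolation; a mode is drawn anew on every call:
+
+     resize = ResizeRandomInterp(height=300, width=300,
+                                 interpolation_modes=[cv2.INTER_NEAREST, cv2.INTER_LINEAR])
+     image, labels = resize(image, labels)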
+ '''
+ if not (isinstance(interpolation_modes, (list, tuple))):
+ raise ValueError("`interpolation_modes` must be a list or tuple.")
+ self.height = height
+ self.width = width
+ self.interpolation_modes = interpolation_modes
+ self.box_filter = box_filter
+ self.labels_format = labels_format
+ self.resize = Resize(height=self.height,
+ width=self.width,
+ box_filter=self.box_filter,
+ labels_format=self.labels_format)
+
+ def __call__(self, image, labels=None, return_inverter=False):
+ self.resize.interpolation_mode = np.random.choice(self.interpolation_modes)
+ self.resize.labels_format = self.labels_format
+ return self.resize(image, labels, return_inverter)
+
+class Flip:
+ '''
+ Flips images horizontally or vertically.
+ '''
+ def __init__(self,
+ dim='horizontal',
+ labels_format={'class_id': 0, 'xmin': 1, 'ymin': 2, 'xmax': 3, 'ymax': 4}):
+ '''
+ Arguments:
+ dim (str, optional): Can be either of 'horizontal' and 'vertical'.
+ If 'horizontal', images will be flipped horizontally, i.e. along
+ the vertical axis. If 'vertical', images will be flipped vertically,
+ i.e. along the horizontal axis.
+ labels_format (dict, optional): A dictionary that defines which index in the last axis of the labels
+ of an image contains which bounding box coordinate. The dictionary maps at least the keywords
+ 'xmin', 'ymin', 'xmax', and 'ymax' to their respective indices within last axis of the labels array.
+ '''
+ if not (dim in {'horizontal', 'vertical'}): raise ValueError("`dim` can be one of 'horizontal' and 'vertical'.")
+ self.dim = dim
+ self.labels_format = labels_format
+
+ def __call__(self, image, labels=None, return_inverter=False):
+
+ img_height, img_width = image.shape[:2]
+
+ xmin = self.labels_format['xmin']
+ ymin = self.labels_format['ymin']
+ xmax = self.labels_format['xmax']
+ ymax = self.labels_format['ymax']
+
+ if self.dim == 'horizontal':
+ image = image[:,::-1]
+ if labels is None:
+ return image
+ else:
+ labels = np.copy(labels)
+ labels[:, [xmin, xmax]] = img_width - labels[:, [xmax, xmin]]
+ return image, labels
+ else:
+ image = image[::-1]
+ if labels is None:
+ return image
+ else:
+ labels = np.copy(labels)
+ labels[:, [ymin, ymax]] = img_height - labels[:, [ymax, ymin]]
+ return image, labels
+
+class RandomFlip:
+ '''
+ Randomly flips images horizontally or vertically. The randomness only refers
+ to whether or not the image will be flipped.
+ '''
+ def __init__(self,
+ dim='horizontal',
+ prob=0.5,
+ labels_format={'class_id': 0, 'xmin': 1, 'ymin': 2, 'xmax': 3, 'ymax': 4}):
+ '''
+ Arguments:
+ dim (str, optional): Can be either of 'horizontal' and 'vertical'.
+ If 'horizontal', images will be flipped horizontally, i.e. along
+ the vertical axis. If 'vertical', images will be flipped vertically,
+ i.e. along the horizontal axis.
+ prob (float, optional): `(1 - prob)` determines the probability with which the original,
+ unaltered image is returned.
+ labels_format (dict, optional): A dictionary that defines which index in the last axis of the labels
+ of an image contains which bounding box coordinate. The dictionary maps at least the keywords
+ 'xmin', 'ymin', 'xmax', and 'ymax' to their respective indices within last axis of the labels array.
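+
+ Example (an illustrative sketch): flip an image and its boxes horizontally
+ with a probability of 0.5:
+
+     flip = RandomFlip(dim='horizontal', prob=0.5)
+     image, labels = flip(image, labels)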
+ ''' + self.dim = dim + self.prob = prob + self.labels_format = labels_format + self.flip = Flip(dim=self.dim, labels_format=self.labels_format) + + def __call__(self, image, labels=None): + p = np.random.uniform(0,1) + if p >= (1.0-self.prob): + self.flip.labels_format = self.labels_format + return self.flip(image, labels) + elif labels is None: + return image + else: + return image, labels + +class Translate: + ''' + Translates images horizontally and/or vertically. + ''' + + def __init__(self, + dy, + dx, + clip_boxes=True, + box_filter=None, + background=(0,0,0), + labels_format={'class_id': 0, 'xmin': 1, 'ymin': 2, 'xmax': 3, 'ymax': 4}): + ''' + Arguments: + dy (float): The fraction of the image height by which to translate images along the + vertical axis. Positive values translate images downwards, negative values + translate images upwards. + dx (float): The fraction of the image width by which to translate images along the + horizontal axis. Positive values translate images to the right, negative values + translate images to the left. + clip_boxes (bool, optional): Only relevant if ground truth bounding boxes are given. + If `True`, any ground truth bounding boxes will be clipped to lie entirely within the + image after the translation. + box_filter (BoxFilter, optional): Only relevant if ground truth bounding boxes are given. + A `BoxFilter` object to filter out bounding boxes that don't meet the given criteria + after the transformation. Refer to the `BoxFilter` documentation for details. If `None`, + the validity of the bounding boxes is not checked. + background (list/tuple, optional): A 3-tuple specifying the RGB color value of the + background pixels of the translated images. + labels_format (dict, optional): A dictionary that defines which index in the last axis of the labels + of an image contains which bounding box coordinate. The dictionary maps at least the keywords + 'xmin', 'ymin', 'xmax', and 'ymax' to their respective indices within last axis of the labels array. + ''' + + if not (isinstance(box_filter, BoxFilter) or box_filter is None): + raise ValueError("`box_filter` must be either `None` or a `BoxFilter` object.") + self.dy_rel = dy + self.dx_rel = dx + self.clip_boxes = clip_boxes + self.box_filter = box_filter + self.background = background + self.labels_format = labels_format + + def __call__(self, image, labels=None): + + img_height, img_width = image.shape[:2] + + # Compute the translation matrix. + dy_abs = int(round(img_height * self.dy_rel)) + dx_abs = int(round(img_width * self.dx_rel)) + M = np.float32([[1, 0, dx_abs], + [0, 1, dy_abs]]) + + # Translate the image. + image = cv2.warpAffine(image, + M=M, + dsize=(img_width, img_height), + borderMode=cv2.BORDER_CONSTANT, + borderValue=self.background) + + if labels is None: + return image + else: + xmin = self.labels_format['xmin'] + ymin = self.labels_format['ymin'] + xmax = self.labels_format['xmax'] + ymax = self.labels_format['ymax'] + + labels = np.copy(labels) + # Translate the box coordinates to the translated image's coordinate system. + labels[:,[xmin,xmax]] += dx_abs + labels[:,[ymin,ymax]] += dy_abs + + # Compute all valid boxes for this patch. 
+ if not (self.box_filter is None): + self.box_filter.labels_format = self.labels_format + labels = self.box_filter(labels=labels, + image_height=img_height, + image_width=img_width) + + if self.clip_boxes: + labels[:,[ymin,ymax]] = np.clip(labels[:,[ymin,ymax]], a_min=0, a_max=img_height-1) + labels[:,[xmin,xmax]] = np.clip(labels[:,[xmin,xmax]], a_min=0, a_max=img_width-1) + + return image, labels + +class RandomTranslate: + ''' + Randomly translates images horizontally and/or vertically. + ''' + + def __init__(self, + dy_minmax=(0.03,0.3), + dx_minmax=(0.03,0.3), + prob=0.5, + clip_boxes=True, + box_filter=None, + image_validator=None, + n_trials_max=3, + background=(0,0,0), + labels_format={'class_id': 0, 'xmin': 1, 'ymin': 2, 'xmax': 3, 'ymax': 4}): + ''' + Arguments: + dy_minmax (list/tuple, optional): A 2-tuple `(min, max)` of non-negative floats that + determines the minimum and maximum relative translation of images along the vertical + axis both upward and downward. That is, images will be randomly translated by at least + `min` and at most `max` either upward or downward. For example, if `dy_minmax == (0.05,0.3)`, + an image of size `(100,100)` will be translated by at least 5 and at most 30 pixels + either upward or downward. The translation direction is chosen randomly. + dx_minmax (list/tuple, optional): A 2-tuple `(min, max)` of non-negative floats that + determines the minimum and maximum relative translation of images along the horizontal + axis both to the left and right. That is, images will be randomly translated by at least + `min` and at most `max` either left or right. For example, if `dx_minmax == (0.05,0.3)`, + an image of size `(100,100)` will be translated by at least 5 and at most 30 pixels + either left or right. The translation direction is chosen randomly. + prob (float, optional): `(1 - prob)` determines the probability with which the original, + unaltered image is returned. + clip_boxes (bool, optional): Only relevant if ground truth bounding boxes are given. + If `True`, any ground truth bounding boxes will be clipped to lie entirely within the + image after the translation. + box_filter (BoxFilter, optional): Only relevant if ground truth bounding boxes are given. + A `BoxFilter` object to filter out bounding boxes that don't meet the given criteria + after the transformation. Refer to the `BoxFilter` documentation for details. If `None`, + the validity of the bounding boxes is not checked. + image_validator (ImageValidator, optional): Only relevant if ground truth bounding boxes are given. + An `ImageValidator` object to determine whether a translated image is valid. If `None`, + any outcome is valid. + n_trials_max (int, optional): Only relevant if ground truth bounding boxes are given. + Determines the maxmial number of trials to produce a valid image. If no valid image could + be produced in `n_trials_max` trials, returns the unaltered input image. + background (list/tuple, optional): A 3-tuple specifying the RGB color value of the + background pixels of the translated images. + labels_format (dict, optional): A dictionary that defines which index in the last axis of the labels + of an image contains which bounding box coordinate. The dictionary maps at least the keywords + 'xmin', 'ymin', 'xmax', and 'ymax' to their respective indices within last axis of the labels array. 
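A hedged usage sketch of the `Translate` transform above, assuming this file is importable as `data_generator.object_detection_2d_geometric_ops` (the module name is not visible in this hunk) and using invented image and box values:

import numpy as np
from data_generator.object_detection_2d_geometric_ops import Translate

image = np.zeros((480, 640, 3), dtype=np.uint8)  # dummy 640x480 image
labels = np.array([[1, 100, 50, 200, 150]])       # (class_id, xmin, ymin, xmax, ymax)

# Shift the image 10% of its height downwards and 20% of its width to the right.
translate = Translate(dy=0.1, dx=0.2)
translated_image, translated_labels = translate(image, labels)
print(translated_labels)  # [[  1 228  98 328 198]]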
+ ''' + if dy_minmax[0] > dy_minmax[1]: + raise ValueError("It must be `dy_minmax[0] <= dy_minmax[1]`.") + if dx_minmax[0] > dx_minmax[1]: + raise ValueError("It must be `dx_minmax[0] <= dx_minmax[1]`.") + if dy_minmax[0] < 0 or dx_minmax[0] < 0: + raise ValueError("It must be `dy_minmax[0] >= 0` and `dx_minmax[0] >= 0`.") + if not (isinstance(image_validator, ImageValidator) or image_validator is None): + raise ValueError("`image_validator` must be either `None` or an `ImageValidator` object.") + self.dy_minmax = dy_minmax + self.dx_minmax = dx_minmax + self.prob = prob + self.clip_boxes = clip_boxes + self.box_filter = box_filter + self.image_validator = image_validator + self.n_trials_max = n_trials_max + self.background = background + self.labels_format = labels_format + self.translate = Translate(dy=0, + dx=0, + clip_boxes=self.clip_boxes, + box_filter=self.box_filter, + background=self.background, + labels_format=self.labels_format) + + def __call__(self, image, labels=None): + + p = np.random.uniform(0,1) + if p >= (1.0-self.prob): + + img_height, img_width = image.shape[:2] + + xmin = self.labels_format['xmin'] + ymin = self.labels_format['ymin'] + xmax = self.labels_format['xmax'] + ymax = self.labels_format['ymax'] + + # Override the preset labels format. + if not self.image_validator is None: + self.image_validator.labels_format = self.labels_format + self.translate.labels_format = self.labels_format + + for _ in range(max(1, self.n_trials_max)): + + # Pick the relative amount by which to translate. + dy_abs = np.random.uniform(self.dy_minmax[0], self.dy_minmax[1]) + dx_abs = np.random.uniform(self.dx_minmax[0], self.dx_minmax[1]) + # Pick the direction in which to translate. + dy = np.random.choice([-dy_abs, dy_abs]) + dx = np.random.choice([-dx_abs, dx_abs]) + self.translate.dy_rel = dy + self.translate.dx_rel = dx + + if (labels is None) or (self.image_validator is None): + # We either don't have any boxes or if we do, we will accept any outcome as valid. + return self.translate(image, labels) + else: + # Translate the box coordinates to the translated image's coordinate system. + new_labels = np.copy(labels) + new_labels[:, [ymin, ymax]] += int(round(img_height * dy)) + new_labels[:, [xmin, xmax]] += int(round(img_width * dx)) + + # Check if the patch is valid. + if self.image_validator(labels=new_labels, + image_height=img_height, + image_width=img_width): + return self.translate(image, labels) + + # If all attempts failed, return the unaltered input image. + if labels is None: + return image + + else: + return image, labels + + elif labels is None: + return image + + else: + return image, labels + +class Scale: + ''' + Scales images, i.e. zooms in or out. + ''' + + def __init__(self, + factor, + clip_boxes=True, + box_filter=None, + background=(0,0,0), + labels_format={'class_id': 0, 'xmin': 1, 'ymin': 2, 'xmax': 3, 'ymax': 4}): + ''' + Arguments: + factor (float): The fraction of the image size by which to scale images. Must be positive. + clip_boxes (bool, optional): Only relevant if ground truth bounding boxes are given. + If `True`, any ground truth bounding boxes will be clipped to lie entirely within the + image after the translation. + box_filter (BoxFilter, optional): Only relevant if ground truth bounding boxes are given. + A `BoxFilter` object to filter out bounding boxes that don't meet the given criteria + after the transformation. Refer to the `BoxFilter` documentation for details. If `None`, + the validity of the bounding boxes is not checked. 
+ background (list/tuple, optional): A 3-tuple specifying the RGB color value of the potential + background pixels of the scaled images. + labels_format (dict, optional): A dictionary that defines which index in the last axis of the labels + of an image contains which bounding box coordinate. The dictionary maps at least the keywords + 'xmin', 'ymin', 'xmax', and 'ymax' to their respective indices within last axis of the labels array. + ''' + + if factor <= 0: + raise ValueError("It must be `factor > 0`.") + if not (isinstance(box_filter, BoxFilter) or box_filter is None): + raise ValueError("`box_filter` must be either `None` or a `BoxFilter` object.") + self.factor = factor + self.clip_boxes = clip_boxes + self.box_filter = box_filter + self.background = background + self.labels_format = labels_format + + def __call__(self, image, labels=None): + + img_height, img_width = image.shape[:2] + + # Compute the rotation matrix. + M = cv2.getRotationMatrix2D(center=(img_width / 2, img_height / 2), + angle=0, + scale=self.factor) + + # Scale the image. + image = cv2.warpAffine(image, + M=M, + dsize=(img_width, img_height), + borderMode=cv2.BORDER_CONSTANT, + borderValue=self.background) + + if labels is None: + return image + else: + xmin = self.labels_format['xmin'] + ymin = self.labels_format['ymin'] + xmax = self.labels_format['xmax'] + ymax = self.labels_format['ymax'] + + labels = np.copy(labels) + # Scale the bounding boxes accordingly. + # Transform two opposite corner points of the rectangular boxes using the rotation matrix `M`. + toplefts = np.array([labels[:,xmin], labels[:,ymin], np.ones(labels.shape[0])]) + bottomrights = np.array([labels[:,xmax], labels[:,ymax], np.ones(labels.shape[0])]) + new_toplefts = (np.dot(M, toplefts)).T + new_bottomrights = (np.dot(M, bottomrights)).T + labels[:,[xmin,ymin]] = np.round(new_toplefts, decimals=0).astype(np.int) + labels[:,[xmax,ymax]] = np.round(new_bottomrights, decimals=0).astype(np.int) + + # Compute all valid boxes for this patch. + if not (self.box_filter is None): + self.box_filter.labels_format = self.labels_format + labels = self.box_filter(labels=labels, + image_height=img_height, + image_width=img_width) + + if self.clip_boxes: + labels[:,[ymin,ymax]] = np.clip(labels[:,[ymin,ymax]], a_min=0, a_max=img_height-1) + labels[:,[xmin,xmax]] = np.clip(labels[:,[xmin,xmax]], a_min=0, a_max=img_width-1) + + return image, labels + +class RandomScale: + ''' + Randomly scales images. + ''' + + def __init__(self, + min_factor=0.5, + max_factor=1.5, + prob=0.5, + clip_boxes=True, + box_filter=None, + image_validator=None, + n_trials_max=3, + background=(0,0,0), + labels_format={'class_id': 0, 'xmin': 1, 'ymin': 2, 'xmax': 3, 'ymax': 4}): + ''' + Arguments: + min_factor (float, optional): The minimum fraction of the image size by which to scale images. + Must be positive. + max_factor (float, optional): The maximum fraction of the image size by which to scale images. + Must be positive. + prob (float, optional): `(1 - prob)` determines the probability with which the original, + unaltered image is returned. + clip_boxes (bool, optional): Only relevant if ground truth bounding boxes are given. + If `True`, any ground truth bounding boxes will be clipped to lie entirely within the + image after the translation. + box_filter (BoxFilter, optional): Only relevant if ground truth bounding boxes are given. + A `BoxFilter` object to filter out bounding boxes that don't meet the given criteria + after the transformation. 
Refer to the `BoxFilter` documentation for details. If `None`, + the validity of the bounding boxes is not checked. + image_validator (ImageValidator, optional): Only relevant if ground truth bounding boxes are given. + An `ImageValidator` object to determine whether a scaled image is valid. If `None`, + any outcome is valid. + n_trials_max (int, optional): Only relevant if ground truth bounding boxes are given. + Determines the maxmial number of trials to produce a valid image. If no valid image could + be produced in `n_trials_max` trials, returns the unaltered input image. + background (list/tuple, optional): A 3-tuple specifying the RGB color value of the potential + background pixels of the scaled images. + labels_format (dict, optional): A dictionary that defines which index in the last axis of the labels + of an image contains which bounding box coordinate. The dictionary maps at least the keywords + 'xmin', 'ymin', 'xmax', and 'ymax' to their respective indices within last axis of the labels array. + ''' + + if not (0 < min_factor <= max_factor): + raise ValueError("It must be `0 < min_factor <= max_factor`.") + if not (isinstance(image_validator, ImageValidator) or image_validator is None): + raise ValueError("`image_validator` must be either `None` or an `ImageValidator` object.") + self.min_factor = min_factor + self.max_factor = max_factor + self.prob = prob + self.clip_boxes = clip_boxes + self.box_filter = box_filter + self.image_validator = image_validator + self.n_trials_max = n_trials_max + self.background = background + self.labels_format = labels_format + self.scale = Scale(factor=1.0, + clip_boxes=self.clip_boxes, + box_filter=self.box_filter, + background=self.background, + labels_format=self.labels_format) + + def __call__(self, image, labels=None): + + p = np.random.uniform(0,1) + if p >= (1.0-self.prob): + + img_height, img_width = image.shape[:2] + + xmin = self.labels_format['xmin'] + ymin = self.labels_format['ymin'] + xmax = self.labels_format['xmax'] + ymax = self.labels_format['ymax'] + + # Override the preset labels format. + if not self.image_validator is None: + self.image_validator.labels_format = self.labels_format + self.scale.labels_format = self.labels_format + + for _ in range(max(1, self.n_trials_max)): + + # Pick a scaling factor. + factor = np.random.uniform(self.min_factor, self.max_factor) + self.scale.factor = factor + + if (labels is None) or (self.image_validator is None): + # We either don't have any boxes or if we do, we will accept any outcome as valid. + return self.scale(image, labels) + else: + # Scale the bounding boxes accordingly. + # Transform two opposite corner points of the rectangular boxes using the rotation matrix `M`. + toplefts = np.array([labels[:,xmin], labels[:,ymin], np.ones(labels.shape[0])]) + bottomrights = np.array([labels[:,xmax], labels[:,ymax], np.ones(labels.shape[0])]) + + # Compute the rotation matrix. + M = cv2.getRotationMatrix2D(center=(img_width / 2, img_height / 2), + angle=0, + scale=factor) + + new_toplefts = (np.dot(M, toplefts)).T + new_bottomrights = (np.dot(M, bottomrights)).T + + new_labels = np.copy(labels) + new_labels[:,[xmin,ymin]] = np.around(new_toplefts, decimals=0).astype(np.int) + new_labels[:,[xmax,ymax]] = np.around(new_bottomrights, decimals=0).astype(np.int) + + # Check if the patch is valid. + if self.image_validator(labels=new_labels, + image_height=img_height, + image_width=img_width): + return self.scale(image, labels) + + # If all attempts failed, return the unaltered input image. 
+ if labels is None: + return image + + else: + return image, labels + + elif labels is None: + return image + + else: + return image, labels + +class Rotate: + ''' + Rotates images counter-clockwise by 90, 180, or 270 degrees. + ''' + + def __init__(self, + angle, + labels_format={'class_id': 0, 'xmin': 1, 'ymin': 2, 'xmax': 3, 'ymax': 4}): + ''' + Arguments: + angle (int): The angle in degrees by which to rotate the images counter-clockwise. + Only 90, 180, and 270 are valid values. + labels_format (dict, optional): A dictionary that defines which index in the last axis of the labels + of an image contains which bounding box coordinate. The dictionary maps at least the keywords + 'xmin', 'ymin', 'xmax', and 'ymax' to their respective indices within last axis of the labels array. + ''' + + if not angle in {90, 180, 270}: + raise ValueError("`angle` must be in the set {90, 180, 270}.") + self.angle = angle + self.labels_format = labels_format + + def __call__(self, image, labels=None): + + img_height, img_width = image.shape[:2] + + # Compute the rotation matrix. + M = cv2.getRotationMatrix2D(center=(img_width / 2, img_height / 2), + angle=self.angle, + scale=1) + + # Get the sine and cosine from the rotation matrix. + cos_angle = np.abs(M[0, 0]) + sin_angle = np.abs(M[0, 1]) + + # Compute the new bounding dimensions of the image. + img_width_new = int(img_height * sin_angle + img_width * cos_angle) + img_height_new = int(img_height * cos_angle + img_width * sin_angle) + + # Adjust the rotation matrix to take into account the translation. + M[1, 2] += (img_height_new - img_height) / 2 + M[0, 2] += (img_width_new - img_width) / 2 + + # Rotate the image. + image = cv2.warpAffine(image, + M=M, + dsize=(img_width_new, img_height_new)) + + if labels is None: + return image + else: + xmin = self.labels_format['xmin'] + ymin = self.labels_format['ymin'] + xmax = self.labels_format['xmax'] + ymax = self.labels_format['ymax'] + + labels = np.copy(labels) + # Rotate the bounding boxes accordingly. + # Transform two opposite corner points of the rectangular boxes using the rotation matrix `M`. + toplefts = np.array([labels[:,xmin], labels[:,ymin], np.ones(labels.shape[0])]) + bottomrights = np.array([labels[:,xmax], labels[:,ymax], np.ones(labels.shape[0])]) + new_toplefts = (np.dot(M, toplefts)).T + new_bottomrights = (np.dot(M, bottomrights)).T + labels[:,[xmin,ymin]] = np.round(new_toplefts, decimals=0).astype(np.int) + labels[:,[xmax,ymax]] = np.round(new_bottomrights, decimals=0).astype(np.int) + + if self.angle == 90: + # ymin and ymax were switched by the rotation. + labels[:,[ymax,ymin]] = labels[:,[ymin,ymax]] + elif self.angle == 180: + # ymin and ymax were switched by the rotation, + # and also xmin and xmax were switched. + labels[:,[ymax,ymin]] = labels[:,[ymin,ymax]] + labels[:,[xmax,xmin]] = labels[:,[xmin,xmax]] + elif self.angle == 270: + # xmin and xmax were switched by the rotation. + labels[:,[xmax,xmin]] = labels[:,[xmin,xmax]] + + return image, labels + +class RandomRotate: + ''' + Randomly rotates images counter-clockwise. + ''' + + def __init__(self, + angles=[90, 180, 270], + prob=0.5, + labels_format={'class_id': 0, 'xmin': 1, 'ymin': 2, 'xmax': 3, 'ymax': 4}): + ''' + Arguments: + angle (list): The list of angles in degrees from which one is randomly selected to rotate + the images counter-clockwise. Only 90, 180, and 270 are valid values. + prob (float, optional): `(1 - prob)` determines the probability with which the original, + unaltered image is returned. 
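A hedged usage sketch of the `Rotate` transform above on a synthetic image (invented shapes and boxes; assuming the module is importable as `data_generator.object_detection_2d_geometric_ops`, and a NumPy version that still provides the legacy `np.int` alias used by the implementation):

import numpy as np
from data_generator.object_detection_2d_geometric_ops import Rotate

image = np.zeros((480, 640, 3), dtype=np.uint8)  # dummy 640x480 image
labels = np.array([[1, 100, 50, 200, 150]])       # (class_id, xmin, ymin, xmax, ymax)

rotate = Rotate(angle=90)
rotated_image, rotated_labels = rotate(image, labels)

print(rotated_image.shape)  # (640, 480, 3): height and width swap for a 90 degree turn
print(rotated_labels)       # the box corners re-expressed in the rotated image's coordinates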
+ labels_format (dict, optional): A dictionary that defines which index in the last axis of the labels + of an image contains which bounding box coordinate. The dictionary maps at least the keywords + 'xmin', 'ymin', 'xmax', and 'ymax' to their respective indices within last axis of the labels array. + ''' + for angle in angles: + if not angle in {90, 180, 270}: + raise ValueError("`angles` can only contain the values 90, 180, and 270.") + self.angles = angles + self.prob = prob + self.labels_format = labels_format + self.rotate = Rotate(angle=90, labels_format=self.labels_format) + + def __call__(self, image, labels=None): + + p = np.random.uniform(0,1) + if p >= (1.0-self.prob): + # Pick a rotation angle. + self.rotate.angle = random.choice(self.angles) + self.rotate.labels_format = self.labels_format + return self.rotate(image, labels) + + elif labels is None: + return image + + else: + return image, labels diff --git a/keras_ssd/data_generator/object_detection_2d_image_boxes_validation_utils.py b/keras_ssd/data_generator/object_detection_2d_image_boxes_validation_utils.py new file mode 100644 index 0000000..8338fd7 --- /dev/null +++ b/keras_ssd/data_generator/object_detection_2d_image_boxes_validation_utils.py @@ -0,0 +1,322 @@ +''' +Utilities for 2D object detection related to answering the following questions: +1. Given an image size and bounding boxes, which bounding boxes meet certain + requirements with respect to the image size? +2. Given an image size and bounding boxes, is an image of that size valid with + respect to the bounding boxes according to certain requirements? + +Copyright (C) 2018 Pierluigi Ferrari + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +''' + +from __future__ import division +import numpy as np + +from bounding_box_utils.bounding_box_utils import iou + +class BoundGenerator: + ''' + Generates pairs of floating point values that represent lower and upper bounds + from a given sample space. + ''' + def __init__(self, + sample_space=((0.1, None), + (0.3, None), + (0.5, None), + (0.7, None), + (0.9, None), + (None, None)), + weights=None): + ''' + Arguments: + sample_space (list or tuple): A list, tuple, or array-like object of shape + `(n, 2)` that contains `n` samples to choose from, where each sample + is a 2-tuple of scalars and/or `None` values. + weights (list or tuple, optional): A list or tuple representing the distribution + over the sample space. If `None`, a uniform distribution will be assumed. 
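A hedged usage sketch of `BoundGenerator` (its implementation follows below): it is typically used to draw a random minimum-overlap requirement per sampled patch, in the spirit of the SSD data augmentation. The custom weights shown are invented and are passed to `np.random.choice` as probabilities, so they must sum to one; the repository root is assumed to be on the Python path.

from data_generator.object_detection_2d_image_boxes_validation_utils import BoundGenerator

# Default sample space: lower IoU bounds of 0.1, 0.3, 0.5, 0.7, 0.9 plus "no requirement".
bound_generator = BoundGenerator()
lower, upper = bound_generator()
print(lower, upper)  # e.g. 0.3 1.0 -- `None` entries are replaced by 0.0 and 1.0

# A biased variant with invented weights that must sum to 1.0.
biased = BoundGenerator(sample_space=((0.5, None), (None, None)), weights=[0.8, 0.2])
print(biased())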
+ ''' + + if (not (weights is None)) and len(weights) != len(sample_space): + raise ValueError("`weights` must either be `None` for uniform distribution or have the same length as `sample_space`.") + + self.sample_space = [] + for bound_pair in sample_space: + if len(bound_pair) != 2: + raise ValueError("All elements of the sample space must be 2-tuples.") + bound_pair = list(bound_pair) + if bound_pair[0] is None: bound_pair[0] = 0.0 + if bound_pair[1] is None: bound_pair[1] = 1.0 + if bound_pair[0] > bound_pair[1]: + raise ValueError("For all sample space elements, the lower bound cannot be greater than the upper bound.") + self.sample_space.append(bound_pair) + + self.sample_space_size = len(self.sample_space) + + if weights is None: + self.weights = [1.0/self.sample_space_size] * self.sample_space_size + else: + self.weights = weights + + def __call__(self): + ''' + Returns: + An item of the sample space, i.e. a 2-tuple of scalars. + ''' + i = np.random.choice(self.sample_space_size, p=self.weights) + return self.sample_space[i] + +class BoxFilter: + ''' + Returns all bounding boxes that are valid with respect to a the defined criteria. + ''' + + def __init__(self, + check_overlap=True, + check_min_area=True, + check_degenerate=True, + overlap_criterion='center_point', + overlap_bounds=(0.3, 1.0), + min_area=16, + labels_format={'class_id': 0, 'xmin': 1, 'ymin': 2, 'xmax': 3, 'ymax': 4}, + border_pixels='half'): + ''' + Arguments: + check_overlap (bool, optional): Whether or not to enforce the overlap requirements defined by + `overlap_criterion` and `overlap_bounds`. Sometimes you might want to use the box filter only + to enforce a certain minimum area for all boxes (see next argument), in such cases you can + turn the overlap requirements off. + check_min_area (bool, optional): Whether or not to enforce the minimum area requirement defined + by `min_area`. If `True`, any boxes that have an area (in pixels) that is smaller than `min_area` + will be removed from the labels of an image. Bounding boxes below a certain area aren't useful + training examples. An object that takes up only, say, 5 pixels in an image is probably not + recognizable anymore, neither for a human, nor for an object detection model. It makes sense + to remove such boxes. + check_degenerate (bool, optional): Whether or not to check for and remove degenerate bounding boxes. + Degenerate bounding boxes are boxes that have `xmax <= xmin` and/or `ymax <= ymin`. In particular, + boxes with a width and/or height of zero are degenerate. It is obviously important to filter out + such boxes, so you should only set this option to `False` if you are certain that degenerate + boxes are not possible in your data and processing chain. + overlap_criterion (str, optional): Can be either of 'center_point', 'iou', or 'area'. Determines + which boxes are considered valid with respect to a given image. If set to 'center_point', + a given bounding box is considered valid if its center point lies within the image. + If set to 'area', a given bounding box is considered valid if the quotient of its intersection + area with the image and its own area is within the given `overlap_bounds`. If set to 'iou', a given + bounding box is considered valid if its IoU with the image is within the given `overlap_bounds`. + overlap_bounds (list or BoundGenerator, optional): Only relevant if `overlap_criterion` is 'area' or 'iou'. + Determines the lower and upper bounds for `overlap_criterion`. 
Can be either a 2-tuple of scalars
+                representing a lower bound and an upper bound, or a `BoundGenerator` object, which provides
+                the possibility to generate bounds randomly.
+            min_area (int, optional): Only relevant if `check_min_area` is `True`. Defines the minimum area in
+                pixels that a bounding box must have in order to be valid. Boxes with an area smaller than this
+                will be removed.
+            labels_format (dict, optional): A dictionary that defines which index in the last axis of the labels
+                of an image contains which bounding box coordinate. The dictionary maps at least the keywords
+                'xmin', 'ymin', 'xmax', and 'ymax' to their respective indices within last axis of the labels array.
+            border_pixels (str, optional): How to treat the border pixels of the bounding boxes.
+                Can be 'include', 'exclude', or 'half'. If 'include', the border pixels belong
+                to the boxes. If 'exclude', the border pixels do not belong to the boxes.
+                If 'half', then one of each of the two horizontal and vertical borders belong
+                to the boxes, but not the other.
+        '''
+        if not isinstance(overlap_bounds, (list, tuple, BoundGenerator)):
+            raise ValueError("`overlap_bounds` must be either a 2-tuple of scalars or a `BoundGenerator` object.")
+        if isinstance(overlap_bounds, (list, tuple)) and (overlap_bounds[0] > overlap_bounds[1]):
+            raise ValueError("The lower bound must not be greater than the upper bound.")
+        if not (overlap_criterion in {'iou', 'area', 'center_point'}):
+            raise ValueError("`overlap_criterion` must be one of 'iou', 'area', or 'center_point'.")
+        self.overlap_criterion = overlap_criterion
+        self.overlap_bounds = overlap_bounds
+        self.min_area = min_area
+        self.check_overlap = check_overlap
+        self.check_min_area = check_min_area
+        self.check_degenerate = check_degenerate
+        self.labels_format = labels_format
+        self.border_pixels = border_pixels
+
+    def __call__(self,
+                 labels,
+                 image_height=None,
+                 image_width=None):
+        '''
+        Arguments:
+            labels (array): The labels to be filtered. This is an array with shape `(m,n)`, where
+                `m` is the number of bounding boxes and `n` is the number of elements that defines
+                each bounding box (box coordinates, class ID, etc.). The box coordinates are expected
+                to be in the image's coordinate system.
+            image_height (int): Only relevant if `check_overlap == True`. The height of the image
+                (in pixels) to compare the box coordinates to.
+            image_width (int): Only relevant if `check_overlap == True`. The width of the image
+                (in pixels) to compare the box coordinates to.
+
+        Returns:
+            An array containing the labels of all boxes that are valid.
+        '''
+
+        labels = np.copy(labels)
+
+        xmin = self.labels_format['xmin']
+        ymin = self.labels_format['ymin']
+        xmax = self.labels_format['xmax']
+        ymax = self.labels_format['ymax']
+
+        # Record the boxes that pass all checks here.
+        requirements_met = np.ones(shape=labels.shape[0], dtype=bool)
+
+        if self.check_degenerate:
+
+            non_degenerate = (labels[:,xmax] > labels[:,xmin]) * (labels[:,ymax] > labels[:,ymin])
+            requirements_met *= non_degenerate
+
+        if self.check_min_area:
+
+            min_area_met = (labels[:,xmax] - labels[:,xmin]) * (labels[:,ymax] - labels[:,ymin]) >= self.min_area
+            requirements_met *= min_area_met
+
+        if self.check_overlap:
+
+            # Get the lower and upper bounds.
+            if isinstance(self.overlap_bounds, BoundGenerator):
+                lower, upper = self.overlap_bounds()
+            else:
+                lower, upper = self.overlap_bounds
+
+            # Compute which boxes are valid.
+
+            if self.overlap_criterion == 'iou':
+                # Compute the patch coordinates.
+ image_coords = np.array([0, 0, image_width, image_height]) + # Compute the IoU between the patch and all of the ground truth boxes. + image_boxes_iou = iou(image_coords, labels[:, [xmin, ymin, xmax, ymax]], coords='corners', mode='element-wise', border_pixels=self.border_pixels) + requirements_met *= (image_boxes_iou > lower) * (image_boxes_iou <= upper) + + elif self.overlap_criterion == 'area': + if self.border_pixels == 'half': + d = 0 + elif self.border_pixels == 'include': + d = 1 # If border pixels are supposed to belong to the bounding boxes, we have to add one pixel to any difference `xmax - xmin` or `ymax - ymin`. + elif self.border_pixels == 'exclude': + d = -1 # If border pixels are not supposed to belong to the bounding boxes, we have to subtract one pixel from any difference `xmax - xmin` or `ymax - ymin`. + # Compute the areas of the boxes. + box_areas = (labels[:,xmax] - labels[:,xmin] + d) * (labels[:,ymax] - labels[:,ymin] + d) + # Compute the intersection area between the patch and all of the ground truth boxes. + clipped_boxes = np.copy(labels) + clipped_boxes[:,[ymin,ymax]] = np.clip(labels[:,[ymin,ymax]], a_min=0, a_max=image_height-1) + clipped_boxes[:,[xmin,xmax]] = np.clip(labels[:,[xmin,xmax]], a_min=0, a_max=image_width-1) + intersection_areas = (clipped_boxes[:,xmax] - clipped_boxes[:,xmin] + d) * (clipped_boxes[:,ymax] - clipped_boxes[:,ymin] + d) # +1 because the border pixels belong to the box areas. + # Check which boxes meet the overlap requirements. + if lower == 0.0: + mask_lower = intersection_areas > lower * box_areas # If `self.lower == 0`, we want to make sure that boxes with area 0 don't count, hence the ">" sign instead of the ">=" sign. + else: + mask_lower = intersection_areas >= lower * box_areas # Especially for the case `self.lower == 1` we want the ">=" sign, otherwise no boxes would count at all. + mask_upper = intersection_areas <= upper * box_areas + requirements_met *= mask_lower * mask_upper + + elif self.overlap_criterion == 'center_point': + # Compute the center points of the boxes. + cy = (labels[:,ymin] + labels[:,ymax]) / 2 + cx = (labels[:,xmin] + labels[:,xmax]) / 2 + # Check which of the boxes have center points within the cropped patch remove those that don't. + requirements_met *= (cy >= 0.0) * (cy <= image_height-1) * (cx >= 0.0) * (cx <= image_width-1) + + return labels[requirements_met] + +class ImageValidator: + ''' + Returns `True` if a given minimum number of bounding boxes meets given overlap + requirements with an image of a given height and width. + ''' + + def __init__(self, + overlap_criterion='center_point', + bounds=(0.3, 1.0), + n_boxes_min=1, + labels_format={'class_id': 0, 'xmin': 1, 'ymin': 2, 'xmax': 3, 'ymax': 4}, + border_pixels='half'): + ''' + Arguments: + overlap_criterion (str, optional): Can be either of 'center_point', 'iou', or 'area'. Determines + which boxes are considered valid with respect to a given image. If set to 'center_point', + a given bounding box is considered valid if its center point lies within the image. + If set to 'area', a given bounding box is considered valid if the quotient of its intersection + area with the image and its own area is within `lower` and `upper`. If set to 'iou', a given + bounding box is considered valid if its IoU with the image is within `lower` and `upper`. + bounds (list or BoundGenerator, optional): Only relevant if `overlap_criterion` is 'area' or 'iou'. + Determines the lower and upper bounds for `overlap_criterion`. 
Can be either a 2-tuple of scalars
+                representing a lower bound and an upper bound, or a `BoundGenerator` object, which provides
+                the possibility to generate bounds randomly.
+            n_boxes_min (int or str, optional): Either a positive integer or the string 'all'.
+                Determines the minimum number of boxes that must meet the `overlap_criterion` with respect to
+                an image of the given height and width in order for the image to be a valid image.
+                If set to 'all', an image is considered valid if all given boxes meet the `overlap_criterion`.
+            labels_format (dict, optional): A dictionary that defines which index in the last axis of the labels
+                of an image contains which bounding box coordinate. The dictionary maps at least the keywords
+                'xmin', 'ymin', 'xmax', and 'ymax' to their respective indices within last axis of the labels array.
+            border_pixels (str, optional): How to treat the border pixels of the bounding boxes.
+                Can be 'include', 'exclude', or 'half'. If 'include', the border pixels belong
+                to the boxes. If 'exclude', the border pixels do not belong to the boxes.
+                If 'half', then one of each of the two horizontal and vertical borders belong
+                to the boxes, but not the other.
+        '''
+        if not ((isinstance(n_boxes_min, int) and n_boxes_min > 0) or n_boxes_min == 'all'):
+            raise ValueError("`n_boxes_min` must be a positive integer or 'all'.")
+        self.overlap_criterion = overlap_criterion
+        self.bounds = bounds
+        self.n_boxes_min = n_boxes_min
+        self.labels_format = labels_format
+        self.border_pixels = border_pixels
+        self.box_filter = BoxFilter(check_overlap=True,
+                                    check_min_area=False,
+                                    check_degenerate=False,
+                                    overlap_criterion=self.overlap_criterion,
+                                    overlap_bounds=self.bounds,
+                                    labels_format=self.labels_format,
+                                    border_pixels=self.border_pixels)
+
+    def __call__(self,
+                 labels,
+                 image_height,
+                 image_width):
+        '''
+        Arguments:
+            labels (array): The labels to be tested. The box coordinates are expected
+                to be in the image's coordinate system.
+            image_height (int): The height of the image to compare the box coordinates to.
+            image_width (int): The width of the image to compare the box coordinates to.
+
+        Returns:
+            A boolean indicating whether an image of the given height and width is
+            valid with respect to the given bounding boxes.
+        '''
+
+        self.box_filter.overlap_bounds = self.bounds
+        self.box_filter.labels_format = self.labels_format
+
+        # Get all boxes that meet the overlap requirements.
+        valid_labels = self.box_filter(labels=labels,
+                                       image_height=image_height,
+                                       image_width=image_width)
+
+        # Check whether enough boxes meet the requirements.
+        if isinstance(self.n_boxes_min, int):
+            # The image is valid if at least `self.n_boxes_min` ground truth boxes meet the requirements.
+            if len(valid_labels) >= self.n_boxes_min:
+                return True
+            else:
+                return False
+        elif self.n_boxes_min == 'all':
+            # The image is valid if all ground truth boxes meet the requirements.
+            if len(valid_labels) == len(labels):
+                return True
+            else:
+                return False
diff --git a/keras_ssd/data_generator/object_detection_2d_misc_utils.py b/keras_ssd/data_generator/object_detection_2d_misc_utils.py
new file mode 100644
index 0000000..1a4397f
--- /dev/null
+++ b/keras_ssd/data_generator/object_detection_2d_misc_utils.py
@@ -0,0 +1,73 @@
+'''
+Miscellaneous data generator utilities.
+
+Copyright (C) 2018 Pierluigi Ferrari
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
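Before moving on, a hedged usage sketch of `BoxFilter` and `ImageValidator` from the validation utilities above, using invented boxes and patch sizes (assumes the repository root is on the Python path):

import numpy as np
from data_generator.object_detection_2d_image_boxes_validation_utils import BoxFilter, ImageValidator

# (class_id, xmin, ymin, xmax, ymax): box 2's center lies outside a 300x300 patch,
# box 3 is degenerate (xmax == xmin).
labels = np.array([[1,  20,  30, 120, 130],
                   [2, 300, 100, 400, 200],
                   [3,  50,  50,  50,  80]])

box_filter = BoxFilter(overlap_criterion='center_point', min_area=16)
print(box_filter(labels, image_height=300, image_width=300))  # only box 1 survives

validator = ImageValidator(overlap_criterion='center_point', n_boxes_min=1)
print(validator(labels, image_height=300, image_width=300))   # True: one box remains valid
print(validator(labels, image_height=10, image_width=10))     # False: no box center fits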
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+'''
+
+from __future__ import division
+import numpy as np
+
+def apply_inverse_transforms(y_pred_decoded, inverse_transforms):
+    '''
+    Takes a list or Numpy array of decoded predictions and applies a given list of
+    inverse transforms to them. The list of inverse transforms would usually contain the
+    inverter functions that some of the image transformations that come with this
+    data generator return. This function would normally be used to transform predictions
+    that were made on a transformed image back to the original image.
+
+    Arguments:
+        y_pred_decoded (list or array): Either a list of length `batch_size` that
+            contains Numpy arrays that contain the predictions for each batch item
+            or a Numpy array. If this is a list of Numpy arrays, the arrays would
+            usually have the shape `(num_predictions, 6)`, where `num_predictions`
+            is different for each batch item. If this is a Numpy array, it would
+            usually have the shape `(batch_size, num_predictions, 6)`. The last axis
+            would usually contain the class ID, confidence score, and four bounding
+            box coordinates for each prediction.
+        inverse_transforms (list): A nested list of length `batch_size` that contains
+            for each batch item a list of functions that take one argument (one element
+            of `y_pred_decoded` if it is a list or one slice along the first axis of
+            `y_pred_decoded` if it is an array) and return an output of the same shape
+            and data type.
+
+    Returns:
+        The transformed predictions, which have the same structure as `y_pred_decoded`.
+    '''
+
+    if isinstance(y_pred_decoded, list):
+
+        y_pred_decoded_inv = []
+
+        for i in range(len(y_pred_decoded)):
+            y_pred_decoded_inv.append(np.copy(y_pred_decoded[i]))
+            if y_pred_decoded_inv[i].size > 0: # If there are any predictions for this batch item.
+                for inverter in inverse_transforms[i]:
+                    if not (inverter is None):
+                        y_pred_decoded_inv[i] = inverter(y_pred_decoded_inv[i])
+
+    elif isinstance(y_pred_decoded, np.ndarray):
+
+        y_pred_decoded_inv = np.copy(y_pred_decoded)
+
+        for i in range(len(y_pred_decoded)):
+            if y_pred_decoded_inv[i].size > 0: # If there are any predictions for this batch item.
+                for inverter in inverse_transforms[i]:
+                    if not (inverter is None):
+                        y_pred_decoded_inv[i] = inverter(y_pred_decoded_inv[i])
+
+    else:
+        raise ValueError("`y_pred_decoded` must be either a list or a Numpy array.")
+
+    return y_pred_decoded_inv
diff --git a/keras_ssd/data_generator/object_detection_2d_patch_sampling_ops.py b/keras_ssd/data_generator/object_detection_2d_patch_sampling_ops.py
new file mode 100644
index 0000000..bec7002
--- /dev/null
+++ b/keras_ssd/data_generator/object_detection_2d_patch_sampling_ops.py
@@ -0,0 +1,881 @@
+'''
+Various patch sampling operations for data augmentation in 2D object detection.
+
+Copyright (C) 2018 Pierluigi Ferrari
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
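A hedged end-to-end sketch of `apply_inverse_transforms` above combined with the inverter returned by the `Resize` transform from earlier in this diff. It assumes that transform is importable as `data_generator.object_detection_2d_geometric_ops` and that `Resize` can be constructed from just the target height and width (its remaining arguments appear to be optional); the prediction values are invented. Decoded predictions are taken to be rows of (class_id, confidence, xmin, ymin, xmax, ymax), which is why the inverter skips the two leading columns.

import numpy as np
from data_generator.object_detection_2d_geometric_ops import Resize
from data_generator.object_detection_2d_misc_utils import apply_inverse_transforms

resize = Resize(height=300, width=300)
original_image = np.zeros((480, 640, 3), dtype=np.uint8)   # dummy 640x480 image
resized_image, inverter = resize(original_image, return_inverter=True)

# One batch item with one decoded prediction, in the 300x300 coordinate system:
# (class_id, confidence, xmin, ymin, xmax, ymax)
y_pred_decoded = [np.array([[1, 0.9, 47, 125, 141, 250]])]

y_pred_original = apply_inverse_transforms(y_pred_decoded, [[inverter]])
print(y_pred_original[0])  # boxes mapped back to the 640x480 original image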
+You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +''' + +from __future__ import division +import numpy as np + +from data_generator.object_detection_2d_image_boxes_validation_utils import BoundGenerator, BoxFilter, ImageValidator + +class PatchCoordinateGenerator: + ''' + Generates random patch coordinates that meet specified requirements. + ''' + + def __init__(self, + img_height=None, + img_width=None, + must_match='h_w', + min_scale=0.3, + max_scale=1.0, + scale_uniformly=False, + min_aspect_ratio = 0.5, + max_aspect_ratio = 2.0, + patch_ymin=None, + patch_xmin=None, + patch_height=None, + patch_width=None, + patch_aspect_ratio=None): + ''' + Arguments: + img_height (int): The height of the image for which the patch coordinates + shall be generated. Doesn't have to be known upon construction. + img_width (int): The width of the image for which the patch coordinates + shall be generated. Doesn't have to be known upon construction. + must_match (str, optional): Can be either of 'h_w', 'h_ar', and 'w_ar'. + Specifies which two of the three quantities height, width, and aspect + ratio determine the shape of the generated patch. The respective third + quantity will be computed from the other two. For example, + if `must_match == 'h_w'`, then the patch's height and width will be + set to lie within [min_scale, max_scale] of the image size or to + `patch_height` and/or `patch_width`, if given. The patch's aspect ratio + is the dependent variable in this case, it will be computed from the + height and width. Any given values for `patch_aspect_ratio`, + `min_aspect_ratio`, or `max_aspect_ratio` will be ignored. + min_scale (float, optional): The minimum size of a dimension of the patch + as a fraction of the respective dimension of the image. Can be greater + than 1. For example, if the image width is 200 and `min_scale == 0.5`, + then the width of the generated patch will be at least 100. If `min_scale == 1.5`, + the width of the generated patch will be at least 300. + max_scale (float, optional): The maximum size of a dimension of the patch + as a fraction of the respective dimension of the image. Can be greater + than 1. For example, if the image width is 200 and `max_scale == 1.0`, + then the width of the generated patch will be at most 200. If `max_scale == 1.5`, + the width of the generated patch will be at most 300. Must be greater than + `min_scale`. + scale_uniformly (bool, optional): If `True` and if `must_match == 'h_w'`, + the patch height and width will be scaled uniformly, otherwise they will + be scaled independently. + min_aspect_ratio (float, optional): Determines the minimum aspect ratio + for the generated patches. + max_aspect_ratio (float, optional): Determines the maximum aspect ratio + for the generated patches. + patch_ymin (int, optional): `None` or the vertical coordinate of the top left + corner of the generated patches. If this is not `None`, the position of the + patches along the vertical axis is fixed. If this is `None`, then the + vertical position of generated patches will be chosen randomly such that + the overlap of a patch and the image along the vertical dimension is + always maximal. 
+ patch_xmin (int, optional): `None` or the horizontal coordinate of the top left + corner of the generated patches. If this is not `None`, the position of the + patches along the horizontal axis is fixed. If this is `None`, then the + horizontal position of generated patches will be chosen randomly such that + the overlap of a patch and the image along the horizontal dimension is + always maximal. + patch_height (int, optional): `None` or the fixed height of the generated patches. + patch_width (int, optional): `None` or the fixed width of the generated patches. + patch_aspect_ratio (float, optional): `None` or the fixed aspect ratio of the + generated patches. + ''' + + if not (must_match in {'h_w', 'h_ar', 'w_ar'}): + raise ValueError("`must_match` must be either of 'h_w', 'h_ar' and 'w_ar'.") + if min_scale >= max_scale: + raise ValueError("It must be `min_scale < max_scale`.") + if min_aspect_ratio >= max_aspect_ratio: + raise ValueError("It must be `min_aspect_ratio < max_aspect_ratio`.") + if scale_uniformly and not ((patch_height is None) and (patch_width is None)): + raise ValueError("If `scale_uniformly == True`, `patch_height` and `patch_width` must both be `None`.") + self.img_height = img_height + self.img_width = img_width + self.must_match = must_match + self.min_scale = min_scale + self.max_scale = max_scale + self.scale_uniformly = scale_uniformly + self.min_aspect_ratio = min_aspect_ratio + self.max_aspect_ratio = max_aspect_ratio + self.patch_ymin = patch_ymin + self.patch_xmin = patch_xmin + self.patch_height = patch_height + self.patch_width = patch_width + self.patch_aspect_ratio = patch_aspect_ratio + + def __call__(self): + ''' + Returns: + A 4-tuple `(ymin, xmin, height, width)` that represents the coordinates + of the generated patch. + ''' + + # Get the patch height and width. + + if self.must_match == 'h_w': # Aspect is the dependent variable. + if not self.scale_uniformly: + # Get the height. + if self.patch_height is None: + patch_height = int(np.random.uniform(self.min_scale, self.max_scale) * self.img_height) + else: + patch_height = self.patch_height + # Get the width. + if self.patch_width is None: + patch_width = int(np.random.uniform(self.min_scale, self.max_scale) * self.img_width) + else: + patch_width = self.patch_width + else: + scaling_factor = np.random.uniform(self.min_scale, self.max_scale) + patch_height = int(scaling_factor * self.img_height) + patch_width = int(scaling_factor * self.img_width) + + elif self.must_match == 'h_ar': # Width is the dependent variable. + # Get the height. + if self.patch_height is None: + patch_height = int(np.random.uniform(self.min_scale, self.max_scale) * self.img_height) + else: + patch_height = self.patch_height + # Get the aspect ratio. + if self.patch_aspect_ratio is None: + patch_aspect_ratio = np.random.uniform(self.min_aspect_ratio, self.max_aspect_ratio) + else: + patch_aspect_ratio = self.patch_aspect_ratio + # Get the width. + patch_width = int(patch_height * patch_aspect_ratio) + + elif self.must_match == 'w_ar': # Height is the dependent variable. + # Get the width. + if self.patch_width is None: + patch_width = int(np.random.uniform(self.min_scale, self.max_scale) * self.img_width) + else: + patch_width = self.patch_width + # Get the aspect ratio. + if self.patch_aspect_ratio is None: + patch_aspect_ratio = np.random.uniform(self.min_aspect_ratio, self.max_aspect_ratio) + else: + patch_aspect_ratio = self.patch_aspect_ratio + # Get the height. 
+                patch_height = int(patch_width / patch_aspect_ratio)
+
+        # Get the top left corner coordinates of the patch.
+
+        if self.patch_ymin is None:
+            # Compute how much room we have along the vertical axis to place the patch.
+            # A negative number here means that we want to sample a patch that is larger than the original image
+            # in the vertical dimension, in which case the patch will be placed such that it fully contains the
+            # image in the vertical dimension.
+            y_range = self.img_height - patch_height
+            # Select a random top left corner for the sample position from the possible positions.
+            if y_range >= 0: patch_ymin = np.random.randint(0, y_range + 1) # There are y_range + 1 possible positions for the crop in the vertical dimension.
+            else: patch_ymin = np.random.randint(y_range, 1) # The possible positions for the image on the background canvas in the vertical dimension.
+        else:
+            patch_ymin = self.patch_ymin
+
+        if self.patch_xmin is None:
+            # Compute how much room we have along the horizontal axis to place the patch.
+            # A negative number here means that we want to sample a patch that is larger than the original image
+            # in the horizontal dimension, in which case the patch will be placed such that it fully contains the
+            # image in the horizontal dimension.
+            x_range = self.img_width - patch_width
+            # Select a random top left corner for the sample position from the possible positions.
+            if x_range >= 0: patch_xmin = np.random.randint(0, x_range + 1) # There are x_range + 1 possible positions for the crop in the horizontal dimension.
+            else: patch_xmin = np.random.randint(x_range, 1) # The possible positions for the image on the background canvas in the horizontal dimension.
+        else:
+            patch_xmin = self.patch_xmin
+
+        return (patch_ymin, patch_xmin, patch_height, patch_width)
+
+class CropPad:
+    '''
+    Crops and/or pads an image deterministically.
+
+    Depending on the given output patch size and the position (top left corner) relative
+    to the input image, the image will be cropped and/or padded along one or both spatial
+    dimensions.
+
+    For example, if the output patch lies entirely within the input image, this will result
+    in a regular crop. If the input image lies entirely within the output patch, this will
+    result in the image being padded in every direction. All other cases are mixed cases
+    where the image might be cropped in some directions and padded in others.
+
+    The output patch can be arbitrary in both size and position as long as it overlaps
+    with the input image.
+    '''
+
+    def __init__(self,
+                 patch_ymin,
+                 patch_xmin,
+                 patch_height,
+                 patch_width,
+                 clip_boxes=True,
+                 box_filter=None,
+                 background=(0,0,0),
+                 labels_format={'class_id': 0, 'xmin': 1, 'ymin': 2, 'xmax': 3, 'ymax': 4}):
+        '''
+        Arguments:
+            patch_ymin (int, optional): The vertical coordinate of the top left corner of the output
+                patch relative to the image coordinate system. Can be negative (i.e. lie outside the image)
+                as long as the resulting patch still overlaps with the image.
+            patch_xmin (int, optional): The horizontal coordinate of the top left corner of the output
+                patch relative to the image coordinate system. Can be negative (i.e. lie outside the image)
+                as long as the resulting patch still overlaps with the image.
+            patch_height (int): The height of the patch to be sampled from the image. Can be greater
+                than the height of the input image.
+            patch_width (int): The width of the patch to be sampled from the image. Can be greater
+                than the width of the input image.
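A hedged usage sketch of `PatchCoordinateGenerator` as completed above, with invented image dimensions (assumes the repository root is on the Python path). With `max_scale <= 1.0` the generated patch always fits inside the image, so the returned corner coordinates are non-negative.

from data_generator.object_detection_2d_patch_sampling_ops import PatchCoordinateGenerator

# Patches whose height and width are each drawn as 30-100% of the image size ('h_w' mode).
patch_gen = PatchCoordinateGenerator(img_height=480,
                                     img_width=640,
                                     must_match='h_w',
                                     min_scale=0.3,
                                     max_scale=1.0)

patch_ymin, patch_xmin, patch_height, patch_width = patch_gen()
print(patch_ymin, patch_xmin, patch_height, patch_width)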
+ clip_boxes (bool, optional): Only relevant if ground truth bounding boxes are given. + If `True`, any ground truth bounding boxes will be clipped to lie entirely within the + sampled patch. + box_filter (BoxFilter, optional): Only relevant if ground truth bounding boxes are given. + A `BoxFilter` object to filter out bounding boxes that don't meet the given criteria + after the transformation. Refer to the `BoxFilter` documentation for details. If `None`, + the validity of the bounding boxes is not checked. + background (list/tuple, optional): A 3-tuple specifying the RGB color value of the potential + background pixels of the scaled images. In the case of single-channel images, + the first element of `background` will be used as the background pixel value. + labels_format (dict, optional): A dictionary that defines which index in the last axis of the labels + of an image contains which bounding box coordinate. The dictionary maps at least the keywords + 'xmin', 'ymin', 'xmax', and 'ymax' to their respective indices within last axis of the labels array. + ''' + #if (patch_height <= 0) or (patch_width <= 0): + # raise ValueError("Patch height and width must both be positive.") + #if (patch_ymin + patch_height < 0) or (patch_xmin + patch_width < 0): + # raise ValueError("A patch with the given coordinates cannot overlap with an input image.") + if not (isinstance(box_filter, BoxFilter) or box_filter is None): + raise ValueError("`box_filter` must be either `None` or a `BoxFilter` object.") + self.patch_height = patch_height + self.patch_width = patch_width + self.patch_ymin = patch_ymin + self.patch_xmin = patch_xmin + self.clip_boxes = clip_boxes + self.box_filter = box_filter + self.background = background + self.labels_format = labels_format + + def __call__(self, image, labels=None, return_inverter=False): + + img_height, img_width = image.shape[:2] + + if (self.patch_ymin > img_height) or (self.patch_xmin > img_width): + raise ValueError("The given patch doesn't overlap with the input image.") + + labels = np.copy(labels) + + xmin = self.labels_format['xmin'] + ymin = self.labels_format['ymin'] + xmax = self.labels_format['xmax'] + ymax = self.labels_format['ymax'] + + # Top left corner of the patch relative to the image coordinate system: + patch_ymin = self.patch_ymin + patch_xmin = self.patch_xmin + + # Create a canvas of the size of the patch we want to end up with. + if image.ndim == 3: + canvas = np.zeros(shape=(self.patch_height, self.patch_width, 3), dtype=np.uint8) + canvas[:, :] = self.background + elif image.ndim == 2: + canvas = np.zeros(shape=(self.patch_height, self.patch_width), dtype=np.uint8) + canvas[:, :] = self.background[0] + + # Perform the crop. + if patch_ymin < 0 and patch_xmin < 0: # Pad the image at the top and on the left. + image_crop_height = min(img_height, self.patch_height + patch_ymin) # The number of pixels of the image that will end up on the canvas in the vertical direction. + image_crop_width = min(img_width, self.patch_width + patch_xmin) # The number of pixels of the image that will end up on the canvas in the horizontal direction. + canvas[-patch_ymin:-patch_ymin + image_crop_height, -patch_xmin:-patch_xmin + image_crop_width] = image[:image_crop_height, :image_crop_width] + + elif patch_ymin < 0 and patch_xmin >= 0: # Pad the image at the top and crop it on the left. + image_crop_height = min(img_height, self.patch_height + patch_ymin) # The number of pixels of the image that will end up on the canvas in the vertical direction. 
+ image_crop_width = min(self.patch_width, img_width - patch_xmin) # The number of pixels of the image that will end up on the canvas in the horizontal direction. + canvas[-patch_ymin:-patch_ymin + image_crop_height, :image_crop_width] = image[:image_crop_height, patch_xmin:patch_xmin + image_crop_width] + + elif patch_ymin >= 0 and patch_xmin < 0: # Crop the image at the top and pad it on the left. + image_crop_height = min(self.patch_height, img_height - patch_ymin) # The number of pixels of the image that will end up on the canvas in the vertical direction. + image_crop_width = min(img_width, self.patch_width + patch_xmin) # The number of pixels of the image that will end up on the canvas in the horizontal direction. + canvas[:image_crop_height, -patch_xmin:-patch_xmin + image_crop_width] = image[patch_ymin:patch_ymin + image_crop_height, :image_crop_width] + + elif patch_ymin >= 0 and patch_xmin >= 0: # Crop the image at the top and on the left. + image_crop_height = min(self.patch_height, img_height - patch_ymin) # The number of pixels of the image that will end up on the canvas in the vertical direction. + image_crop_width = min(self.patch_width, img_width - patch_xmin) # The number of pixels of the image that will end up on the canvas in the horizontal direction. + canvas[:image_crop_height, :image_crop_width] = image[patch_ymin:patch_ymin + image_crop_height, patch_xmin:patch_xmin + image_crop_width] + + image = canvas + + if return_inverter: + def inverter(labels): + labels = np.copy(labels) + labels[:, [ymin+1, ymax+1]] += patch_ymin + labels[:, [xmin+1, xmax+1]] += patch_xmin + return labels + + if not (labels is None): + + # Translate the box coordinates to the patch's coordinate system. + labels[:, [ymin, ymax]] -= patch_ymin + labels[:, [xmin, xmax]] -= patch_xmin + + # Compute all valid boxes for this patch. + if not (self.box_filter is None): + self.box_filter.labels_format = self.labels_format + labels = self.box_filter(labels=labels, + image_height=self.patch_height, + image_width=self.patch_width) + + if self.clip_boxes: + labels[:,[ymin,ymax]] = np.clip(labels[:,[ymin,ymax]], a_min=0, a_max=self.patch_height-1) + labels[:,[xmin,xmax]] = np.clip(labels[:,[xmin,xmax]], a_min=0, a_max=self.patch_width-1) + + if return_inverter: + return image, labels, inverter + else: + return image, labels + + else: + if return_inverter: + return image, inverter + else: + return image + +class Crop: + ''' + Crops off the specified numbers of pixels from the borders of images. + + This is just a convenience interface for `CropPad`. 
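A hedged usage sketch of `CropPad` above with invented values: a patch anchored at (-50, 100) pads 50 rows of background at the top while cropping 100 columns off the left.

import numpy as np
from data_generator.object_detection_2d_patch_sampling_ops import CropPad

image = np.full((480, 640, 3), 255, dtype=np.uint8)  # dummy white 640x480 image
labels = np.array([[1, 150, 40, 300, 200]])           # (class_id, xmin, ymin, xmax, ymax)

crop_pad = CropPad(patch_ymin=-50,      # negative: 50 px of background padding at the top
                   patch_xmin=100,      # positive: crop 100 px off the left
                   patch_height=480,
                   patch_width=480,
                   background=(0, 0, 0))

patch, patch_labels = crop_pad(image, labels)
print(patch.shape)   # (480, 480, 3)
print(patch_labels)  # [[  1  50  90 200 250]]: the box shifted into patch coordinates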
+ ''' + + def __init__(self, + crop_top, + crop_bottom, + crop_left, + crop_right, + clip_boxes=True, + box_filter=None, + labels_format={'class_id': 0, 'xmin': 1, 'ymin': 2, 'xmax': 3, 'ymax': 4}): + self.crop_top = crop_top + self.crop_bottom = crop_bottom + self.crop_left = crop_left + self.crop_right = crop_right + self.clip_boxes = clip_boxes + self.box_filter = box_filter + self.labels_format = labels_format + self.crop = CropPad(patch_ymin=self.crop_top, + patch_xmin=self.crop_left, + patch_height=None, + patch_width=None, + clip_boxes=self.clip_boxes, + box_filter=self.box_filter, + labels_format=self.labels_format) + + def __call__(self, image, labels=None, return_inverter=False): + + img_height, img_width = image.shape[:2] + + self.crop.patch_height = img_height - self.crop_top - self.crop_bottom + self.crop.patch_width = img_width - self.crop_left - self.crop_right + self.crop.labels_format = self.labels_format + + return self.crop(image, labels, return_inverter) + +class Pad: + ''' + Pads images by the specified numbers of pixels on each side. + + This is just a convenience interface for `CropPad`. + ''' + + def __init__(self, + pad_top, + pad_bottom, + pad_left, + pad_right, + background=(0,0,0), + labels_format={'class_id': 0, 'xmin': 1, 'ymin': 2, 'xmax': 3, 'ymax': 4}): + self.pad_top = pad_top + self.pad_bottom = pad_bottom + self.pad_left = pad_left + self.pad_right = pad_right + self.background = background + self.labels_format = labels_format + self.pad = CropPad(patch_ymin=-self.pad_top, + patch_xmin=-self.pad_left, + patch_height=None, + patch_width=None, + clip_boxes=False, + box_filter=None, + background=self.background, + labels_format=self.labels_format) + + def __call__(self, image, labels=None, return_inverter=False): + + img_height, img_width = image.shape[:2] + + self.pad.patch_height = img_height + self.pad_top + self.pad_bottom + self.pad.patch_width = img_width + self.pad_left + self.pad_right + self.pad.labels_format = self.labels_format + + return self.pad(image, labels, return_inverter) + +class RandomPatch: + ''' + Randomly samples a patch from an image. The randomness refers to whatever + randomness may be introduced by the patch coordinate generator, the box filter, + and the patch validator. + + Input images may be cropped and/or padded along either or both of the two + spatial dimensions as necessary in order to obtain the required patch. + + As opposed to `RandomPatchInf`, it is possible for this transform to fail to produce + an output image at all, in which case it will return `None`. This is useful, because + if this transform is used to generate patches of a fixed size or aspect ratio, then + the caller needs to be able to rely on the output image satisfying the set size or + aspect ratio. It might therefore not be an option to return the unaltered input image + as other random transforms do when they fail to produce a valid transformed image. + ''' + + def __init__(self, + patch_coord_generator, + box_filter=None, + image_validator=None, + n_trials_max=3, + clip_boxes=True, + prob=1.0, + background=(0,0,0), + can_fail=False, + labels_format={'class_id': 0, 'xmin': 1, 'ymin': 2, 'xmax': 3, 'ymax': 4}): + ''' + Arguments: + patch_coord_generator (PatchCoordinateGenerator): A `PatchCoordinateGenerator` object + to generate the positions and sizes of the patches to be sampled from the input images. + box_filter (BoxFilter, optional): Only relevant if ground truth bounding boxes are given. 
+ A `BoxFilter` object to filter out bounding boxes that don't meet the given criteria + after the transformation. Refer to the `BoxFilter` documentation for details. If `None`, + the validity of the bounding boxes is not checked. + image_validator (ImageValidator, optional): Only relevant if ground truth bounding boxes are given. + An `ImageValidator` object to determine whether a sampled patch is valid. If `None`, + any outcome is valid. + n_trials_max (int, optional): Only relevant if ground truth bounding boxes are given. + Determines the maxmial number of trials to sample a valid patch. If no valid patch could + be sampled in `n_trials_max` trials, returns one `None` in place of each regular output. + clip_boxes (bool, optional): Only relevant if ground truth bounding boxes are given. + If `True`, any ground truth bounding boxes will be clipped to lie entirely within the + sampled patch. + prob (float, optional): `(1 - prob)` determines the probability with which the original, + unaltered image is returned. + background (list/tuple, optional): A 3-tuple specifying the RGB color value of the potential + background pixels of the scaled images. In the case of single-channel images, + the first element of `background` will be used as the background pixel value. + can_fail (bool, optional): If `True`, will return `None` if no valid patch could be found after + `n_trials_max` trials. If `False`, will return the unaltered input image in such a case. + labels_format (dict, optional): A dictionary that defines which index in the last axis of the labels + of an image contains which bounding box coordinate. The dictionary maps at least the keywords + 'xmin', 'ymin', 'xmax', and 'ymax' to their respective indices within last axis of the labels array. + ''' + if not isinstance(patch_coord_generator, PatchCoordinateGenerator): + raise ValueError("`patch_coord_generator` must be an instance of `PatchCoordinateGenerator`.") + if not (isinstance(image_validator, ImageValidator) or image_validator is None): + raise ValueError("`image_validator` must be either `None` or an `ImageValidator` object.") + self.patch_coord_generator = patch_coord_generator + self.box_filter = box_filter + self.image_validator = image_validator + self.n_trials_max = n_trials_max + self.clip_boxes = clip_boxes + self.prob = prob + self.background = background + self.can_fail = can_fail + self.labels_format = labels_format + self.sample_patch = CropPad(patch_ymin=None, + patch_xmin=None, + patch_height=None, + patch_width=None, + clip_boxes=self.clip_boxes, + box_filter=self.box_filter, + background=self.background, + labels_format=self.labels_format) + + def __call__(self, image, labels=None, return_inverter=False): + + p = np.random.uniform(0,1) + if p >= (1.0-self.prob): + + img_height, img_width = image.shape[:2] + self.patch_coord_generator.img_height = img_height + self.patch_coord_generator.img_width = img_width + + xmin = self.labels_format['xmin'] + ymin = self.labels_format['ymin'] + xmax = self.labels_format['xmax'] + ymax = self.labels_format['ymax'] + + # Override the preset labels format. + if not self.image_validator is None: + self.image_validator.labels_format = self.labels_format + self.sample_patch.labels_format = self.labels_format + + for _ in range(max(1, self.n_trials_max)): + + # Generate patch coordinates. 
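+                # The coordinate generator is expected to return the patch's top-left corner and size
+                # as a `(patch_ymin, patch_xmin, patch_height, patch_width)` tuple.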
+ patch_ymin, patch_xmin, patch_height, patch_width = self.patch_coord_generator() + + self.sample_patch.patch_ymin = patch_ymin + self.sample_patch.patch_xmin = patch_xmin + self.sample_patch.patch_height = patch_height + self.sample_patch.patch_width = patch_width + + if (labels is None) or (self.image_validator is None): + # We either don't have any boxes or if we do, we will accept any outcome as valid. + return self.sample_patch(image, labels, return_inverter) + else: + # Translate the box coordinates to the patch's coordinate system. + new_labels = np.copy(labels) + new_labels[:, [ymin, ymax]] -= patch_ymin + new_labels[:, [xmin, xmax]] -= patch_xmin + # Check if the patch is valid. + if self.image_validator(labels=new_labels, + image_height=patch_height, + image_width=patch_width): + return self.sample_patch(image, labels, return_inverter) + + # If we weren't able to sample a valid patch... + if self.can_fail: + # ...return `None`. + if labels is None: + if return_inverter: + return None, None + else: + return None + else: + if return_inverter: + return None, None, None + else: + return None, None + else: + # ...return the unaltered input image. + if labels is None: + if return_inverter: + return image, None + else: + return image + else: + if return_inverter: + return image, labels, None + else: + return image, labels + + else: + if return_inverter: + def inverter(labels): + return labels + + if labels is None: + if return_inverter: + return image, inverter + else: + return image + else: + if return_inverter: + return image, labels, inverter + else: + return image, labels + +class RandomPatchInf: + ''' + Randomly samples a patch from an image. The randomness refers to whatever + randomness may be introduced by the patch coordinate generator, the box filter, + and the patch validator. + + Input images may be cropped and/or padded along either or both of the two + spatial dimensions as necessary in order to obtain the required patch. + + This operation is very similar to `RandomPatch`, except that: + 1. This operation runs indefinitely until either a valid patch is found or + the input image is returned unaltered, i.e. it cannot fail. + 2. If a bound generator is given, a new pair of bounds will be generated + every `n_trials_max` iterations. + ''' + + def __init__(self, + patch_coord_generator, + box_filter=None, + image_validator=None, + bound_generator=None, + n_trials_max=50, + clip_boxes=True, + prob=0.857, + background=(0,0,0), + labels_format={'class_id': 0, 'xmin': 1, 'ymin': 2, 'xmax': 3, 'ymax': 4}): + ''' + Arguments: + patch_coord_generator (PatchCoordinateGenerator): A `PatchCoordinateGenerator` object + to generate the positions and sizes of the patches to be sampled from the input images. + box_filter (BoxFilter, optional): Only relevant if ground truth bounding boxes are given. + A `BoxFilter` object to filter out bounding boxes that don't meet the given criteria + after the transformation. Refer to the `BoxFilter` documentation for details. If `None`, + the validity of the bounding boxes is not checked. + image_validator (ImageValidator, optional): Only relevant if ground truth bounding boxes are given. + An `ImageValidator` object to determine whether a sampled patch is valid. If `None`, + any outcome is valid. + bound_generator (BoundGenerator, optional): A `BoundGenerator` object to generate upper and + lower bound values for the patch validator. 
Every `n_trials_max` trials, a new pair of + upper and lower bounds will be generated until a valid patch is found or the original image + is returned. This bound generator overrides the bound generator of the patch validator. + n_trials_max (int, optional): Only relevant if ground truth bounding boxes are given. + The sampler will run indefinitely until either a valid patch is found or the original image + is returned, but this determines the maxmial number of trials to sample a valid patch for each + selected pair of lower and upper bounds before a new pair is picked. + clip_boxes (bool, optional): Only relevant if ground truth bounding boxes are given. + If `True`, any ground truth bounding boxes will be clipped to lie entirely within the + sampled patch. + prob (float, optional): `(1 - prob)` determines the probability with which the original, + unaltered image is returned. + background (list/tuple, optional): A 3-tuple specifying the RGB color value of the potential + background pixels of the scaled images. In the case of single-channel images, + the first element of `background` will be used as the background pixel value. + labels_format (dict, optional): A dictionary that defines which index in the last axis of the labels + of an image contains which bounding box coordinate. The dictionary maps at least the keywords + 'xmin', 'ymin', 'xmax', and 'ymax' to their respective indices within last axis of the labels array. + ''' + + if not isinstance(patch_coord_generator, PatchCoordinateGenerator): + raise ValueError("`patch_coord_generator` must be an instance of `PatchCoordinateGenerator`.") + if not (isinstance(image_validator, ImageValidator) or image_validator is None): + raise ValueError("`image_validator` must be either `None` or an `ImageValidator` object.") + if not (isinstance(bound_generator, BoundGenerator) or bound_generator is None): + raise ValueError("`bound_generator` must be either `None` or a `BoundGenerator` object.") + self.patch_coord_generator = patch_coord_generator + self.box_filter = box_filter + self.image_validator = image_validator + self.bound_generator = bound_generator + self.n_trials_max = n_trials_max + self.clip_boxes = clip_boxes + self.prob = prob + self.background = background + self.labels_format = labels_format + self.sample_patch = CropPad(patch_ymin=None, + patch_xmin=None, + patch_height=None, + patch_width=None, + clip_boxes=self.clip_boxes, + box_filter=self.box_filter, + background=self.background, + labels_format=self.labels_format) + + def __call__(self, image, labels=None, return_inverter=False): + + img_height, img_width = image.shape[:2] + self.patch_coord_generator.img_height = img_height + self.patch_coord_generator.img_width = img_width + + xmin = self.labels_format['xmin'] + ymin = self.labels_format['ymin'] + xmax = self.labels_format['xmax'] + ymax = self.labels_format['ymax'] + + # Override the preset labels format. + if not self.image_validator is None: + self.image_validator.labels_format = self.labels_format + self.sample_patch.labels_format = self.labels_format + + while True: # Keep going until we either find a valid patch or return the original image. + + p = np.random.uniform(0,1) + if p >= (1.0-self.prob): + + # In case we have a bound generator, pick a lower and upper bound for the patch validator. + if not ((self.image_validator is None) or (self.bound_generator is None)): + self.image_validator.bounds = self.bound_generator() + + # Use at most `self.n_trials_max` attempts to find a crop + # that meets our requirements. 
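+                # If none of these trials yields a valid patch, the surrounding while-loop simply
+                # starts over, possibly with a newly drawn pair of bounds.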
+ for _ in range(max(1, self.n_trials_max)): + + # Generate patch coordinates. + patch_ymin, patch_xmin, patch_height, patch_width = self.patch_coord_generator() + + self.sample_patch.patch_ymin = patch_ymin + self.sample_patch.patch_xmin = patch_xmin + self.sample_patch.patch_height = patch_height + self.sample_patch.patch_width = patch_width + + # Check if the resulting patch meets the aspect ratio requirements. + aspect_ratio = patch_width / patch_height + if not (self.patch_coord_generator.min_aspect_ratio <= aspect_ratio <= self.patch_coord_generator.max_aspect_ratio): + continue + + if (labels is None) or (self.image_validator is None): + # We either don't have any boxes or if we do, we will accept any outcome as valid. + return self.sample_patch(image, labels, return_inverter) + else: + # Translate the box coordinates to the patch's coordinate system. + new_labels = np.copy(labels) + new_labels[:, [ymin, ymax]] -= patch_ymin + new_labels[:, [xmin, xmax]] -= patch_xmin + # Check if the patch contains the minimum number of boxes we require. + if self.image_validator(labels=new_labels, + image_height=patch_height, + image_width=patch_width): + return self.sample_patch(image, labels, return_inverter) + else: + if return_inverter: + def inverter(labels): + return labels + + if labels is None: + if return_inverter: + return image, inverter + else: + return image + else: + if return_inverter: + return image, labels, inverter + else: + return image, labels + +class RandomMaxCropFixedAR: + ''' + Crops the largest possible patch of a given fixed aspect ratio + from an image. + + Since the aspect ratio of the sampled patches is constant, they + can subsequently be resized to the same size without distortion. + ''' + + def __init__(self, + patch_aspect_ratio, + box_filter=None, + image_validator=None, + n_trials_max=3, + clip_boxes=True, + labels_format={'class_id': 0, 'xmin': 1, 'ymin': 2, 'xmax': 3, 'ymax': 4}): + ''' + Arguments: + patch_aspect_ratio (float): The fixed aspect ratio that all sampled patches will have. + box_filter (BoxFilter, optional): Only relevant if ground truth bounding boxes are given. + A `BoxFilter` object to filter out bounding boxes that don't meet the given criteria + after the transformation. Refer to the `BoxFilter` documentation for details. If `None`, + the validity of the bounding boxes is not checked. + image_validator (ImageValidator, optional): Only relevant if ground truth bounding boxes are given. + An `ImageValidator` object to determine whether a sampled patch is valid. If `None`, + any outcome is valid. + n_trials_max (int, optional): Only relevant if ground truth bounding boxes are given. + Determines the maxmial number of trials to sample a valid patch. If no valid patch could + be sampled in `n_trials_max` trials, returns `None`. + clip_boxes (bool, optional): Only relevant if ground truth bounding boxes are given. + If `True`, any ground truth bounding boxes will be clipped to lie entirely within the + sampled patch. + labels_format (dict, optional): A dictionary that defines which index in the last axis of the labels + of an image contains which bounding box coordinate. The dictionary maps at least the keywords + 'xmin', 'ymin', 'xmax', and 'ymax' to their respective indices within last axis of the labels array. 
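+
+        Example (a minimal usage sketch; `image` and `labels` are assumed to already exist in the
+        default `labels_format`):
+            crop = RandomMaxCropFixedAR(patch_aspect_ratio=16/9)
+            patch, patch_labels = crop(image, labels)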
+ ''' + + self.patch_aspect_ratio = patch_aspect_ratio + self.box_filter = box_filter + self.image_validator = image_validator + self.n_trials_max = n_trials_max + self.clip_boxes = clip_boxes + self.labels_format = labels_format + self.random_patch = RandomPatch(patch_coord_generator=PatchCoordinateGenerator(), # Just a dummy object + box_filter=self.box_filter, + image_validator=self.image_validator, + n_trials_max=self.n_trials_max, + clip_boxes=self.clip_boxes, + prob=1.0, + can_fail=False, + labels_format=self.labels_format) + + def __call__(self, image, labels=None, return_inverter=False): + + img_height, img_width = image.shape[:2] + + # The ratio of the input image aspect ratio and patch aspect ratio determines the maximal possible crop. + image_aspect_ratio = img_width / img_height + + if image_aspect_ratio < self.patch_aspect_ratio: + patch_width = img_width + patch_height = int(round(patch_width / self.patch_aspect_ratio)) + else: + patch_height = img_height + patch_width = int(round(patch_height * self.patch_aspect_ratio)) + + # Now that we know the desired height and width for the patch, + # instantiate an appropriate patch coordinate generator. + patch_coord_generator = PatchCoordinateGenerator(img_height=img_height, + img_width=img_width, + must_match='h_w', + patch_height=patch_height, + patch_width=patch_width) + + # The rest of the work is done by `RandomPatch`. + self.random_patch.patch_coord_generator = patch_coord_generator + self.random_patch.labels_format = self.labels_format + return self.random_patch(image, labels, return_inverter) + +class RandomPadFixedAR: + ''' + Adds the minimal possible padding to an image that results in a patch + of the given fixed aspect ratio that contains the entire image. + + Since the aspect ratio of the resulting images is constant, they + can subsequently be resized to the same size without distortion. + ''' + + def __init__(self, + patch_aspect_ratio, + background=(0,0,0), + labels_format={'class_id': 0, 'xmin': 1, 'ymin': 2, 'xmax': 3, 'ymax': 4}): + ''' + Arguments: + patch_aspect_ratio (float): The fixed aspect ratio that all sampled patches will have. + background (list/tuple, optional): A 3-tuple specifying the RGB color value of the potential + background pixels of the scaled images. In the case of single-channel images, + the first element of `background` will be used as the background pixel value. + labels_format (dict, optional): A dictionary that defines which index in the last axis of the labels + of an image contains which bounding box coordinate. The dictionary maps at least the keywords + 'xmin', 'ymin', 'xmax', and 'ymax' to their respective indices within last axis of the labels array. 
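+
+        Example (a minimal usage sketch; `image` and `labels` are assumed to already exist in the
+        default `labels_format`; only the placement of the image on the padded canvas is random):
+            pad = RandomPadFixedAR(patch_aspect_ratio=16/9)
+            padded_image, padded_labels = pad(image, labels)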
+ ''' + + self.patch_aspect_ratio = patch_aspect_ratio + self.background = background + self.labels_format = labels_format + self.random_patch = RandomPatch(patch_coord_generator=PatchCoordinateGenerator(), # Just a dummy object + box_filter=None, + image_validator=None, + n_trials_max=1, + clip_boxes=False, + background=self.background, + prob=1.0, + labels_format=self.labels_format) + + def __call__(self, image, labels=None, return_inverter=False): + + img_height, img_width = image.shape[:2] + + if img_width < img_height: + patch_height = img_height + patch_width = int(round(patch_height * self.patch_aspect_ratio)) + else: + patch_width = img_width + patch_height = int(round(patch_width / self.patch_aspect_ratio)) + + # Now that we know the desired height and width for the patch, + # instantiate an appropriate patch coordinate generator. + patch_coord_generator = PatchCoordinateGenerator(img_height=img_height, + img_width=img_width, + must_match='h_w', + patch_height=patch_height, + patch_width=patch_width) + + # The rest of the work is done by `RandomPatch`. + self.random_patch.patch_coord_generator = patch_coord_generator + self.random_patch.labels_format = self.labels_format + return self.random_patch(image, labels, return_inverter) diff --git a/keras_ssd/data_generator/object_detection_2d_photometric_ops.py b/keras_ssd/data_generator/object_detection_2d_photometric_ops.py new file mode 100644 index 0000000..375b7aa --- /dev/null +++ b/keras_ssd/data_generator/object_detection_2d_photometric_ops.py @@ -0,0 +1,485 @@ +''' +Various photometric image transformations, both deterministic and probabilistic. + +Copyright (C) 2018 Pierluigi Ferrari + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +''' + +from __future__ import division +import numpy as np +import cv2 + +class ConvertColor: + ''' + Converts images between RGB, HSV and grayscale color spaces. This is just a wrapper + around `cv2.cvtColor()`. + ''' + def __init__(self, current='RGB', to='HSV', keep_3ch=True): + ''' + Arguments: + current (str, optional): The current color space of the images. Can be + one of 'RGB' and 'HSV'. + to (str, optional): The target color space of the images. Can be one of + 'RGB', 'HSV', and 'GRAY'. + keep_3ch (bool, optional): Only relevant if `to == GRAY`. + If `True`, the resulting grayscale images will have three channels. 
+ ''' + if not ((current in {'RGB', 'HSV'}) and (to in {'RGB', 'HSV', 'GRAY'})): + raise NotImplementedError + self.current = current + self.to = to + self.keep_3ch = keep_3ch + + def __call__(self, image, labels=None): + if self.current == 'RGB' and self.to == 'HSV': + image = cv2.cvtColor(image, cv2.COLOR_RGB2HSV) + elif self.current == 'RGB' and self.to == 'GRAY': + image = cv2.cvtColor(image, cv2.COLOR_RGB2GRAY) + if self.keep_3ch: + image = np.stack([image] * 3, axis=-1) + elif self.current == 'HSV' and self.to == 'RGB': + image = cv2.cvtColor(image, cv2.COLOR_HSV2RGB) + elif self.current == 'HSV' and self.to == 'GRAY': + image = cv2.cvtColor(image, cv2.COLOR_HSV2GRAY) + if self.keep_3ch: + image = np.stack([image] * 3, axis=-1) + if labels is None: + return image + else: + return image, labels + +class ConvertDataType: + ''' + Converts images represented as Numpy arrays between `uint8` and `float32`. + Serves as a helper for certain photometric distortions. This is just a wrapper + around `np.ndarray.astype()`. + ''' + def __init__(self, to='uint8'): + ''' + Arguments: + to (string, optional): To which datatype to convert the input images. + Can be either of 'uint8' and 'float32'. + ''' + if not (to == 'uint8' or to == 'float32'): + raise ValueError("`to` can be either of 'uint8' or 'float32'.") + self.to = to + + def __call__(self, image, labels=None): + if self.to == 'uint8': + image = np.round(image, decimals=0).astype(np.uint8) + else: + image = image.astype(np.float32) + if labels is None: + return image + else: + return image, labels + +class ConvertTo3Channels: + ''' + Converts 1-channel and 4-channel images to 3-channel images. Does nothing to images that + already have 3 channels. In the case of 4-channel images, the fourth channel will be + discarded. + ''' + def __init__(self): + pass + + def __call__(self, image, labels=None): + if image.ndim == 2: + image = np.stack([image] * 3, axis=-1) + elif image.ndim == 3: + if image.shape[2] == 1: + image = np.concatenate([image] * 3, axis=-1) + elif image.shape[2] == 4: + image = image[:,:,:3] + if labels is None: + return image + else: + return image, labels + +class Hue: + ''' + Changes the hue of HSV images. + + Important: + - Expects HSV input. + - Expects input array to be of `dtype` `float`. + ''' + def __init__(self, delta): + ''' + Arguments: + delta (int): An integer in the closed interval `[-180, 180]` that determines the hue change, where + a change by integer `delta` means a change by `2 * delta` degrees. Read up on the HSV color format + if you need more information. + ''' + if not (-180 <= delta <= 180): raise ValueError("`delta` must be in the closed interval `[-180, 180]`.") + self.delta = delta + + def __call__(self, image, labels=None): + image[:, :, 0] = (image[:, :, 0] + self.delta) % 180.0 + if labels is None: + return image + else: + return image, labels + +class RandomHue: + ''' + Randomly changes the hue of HSV images. + + Important: + - Expects HSV input. + - Expects input array to be of `dtype` `float`. + ''' + def __init__(self, max_delta=18, prob=0.5): + ''' + Arguments: + max_delta (int): An integer in the closed interval `[0, 180]` that determines the maximal absolute + hue change. + prob (float, optional): `(1 - prob)` determines the probability with which the original, + unaltered image is returned. 
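+
+        Example (a minimal sketch of how this transform is typically chained, since it expects
+        HSV images of a float dtype; `image` is assumed to be an RGB uint8 array):
+            photometric_chain = [ConvertColor(current='RGB', to='HSV'),
+                                 ConvertDataType(to='float32'),
+                                 RandomHue(max_delta=18, prob=0.5),
+                                 ConvertDataType(to='uint8'),
+                                 ConvertColor(current='HSV', to='RGB')]
+            for transform in photometric_chain:
+                image = transform(image)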
+ ''' + if not (0 <= max_delta <= 180): raise ValueError("`max_delta` must be in the closed interval `[0, 180]`.") + self.max_delta = max_delta + self.prob = prob + self.change_hue = Hue(delta=0) + + def __call__(self, image, labels=None): + p = np.random.uniform(0,1) + if p >= (1.0-self.prob): + self.change_hue.delta = np.random.uniform(-self.max_delta, self.max_delta) + return self.change_hue(image, labels) + elif labels is None: + return image + else: + return image, labels + +class Saturation: + ''' + Changes the saturation of HSV images. + + Important: + - Expects HSV input. + - Expects input array to be of `dtype` `float`. + ''' + def __init__(self, factor): + ''' + Arguments: + factor (float): A float greater than zero that determines saturation change, where + values less than one result in less saturation and values greater than one result + in more saturation. + ''' + if factor <= 0.0: raise ValueError("It must be `factor > 0`.") + self.factor = factor + + def __call__(self, image, labels=None): + image[:,:,1] = np.clip(image[:,:,1] * self.factor, 0, 255) + if labels is None: + return image + else: + return image, labels + +class RandomSaturation: + ''' + Randomly changes the saturation of HSV images. + + Important: + - Expects HSV input. + - Expects input array to be of `dtype` `float`. + ''' + def __init__(self, lower=0.3, upper=2.0, prob=0.5): + ''' + Arguments: + lower (float, optional): A float greater than zero, the lower bound for the random + saturation change. + upper (float, optional): A float greater than zero, the upper bound for the random + saturation change. Must be greater than `lower`. + prob (float, optional): `(1 - prob)` determines the probability with which the original, + unaltered image is returned. + ''' + if lower >= upper: raise ValueError("`upper` must be greater than `lower`.") + self.lower = lower + self.upper = upper + self.prob = prob + self.change_saturation = Saturation(factor=1.0) + + def __call__(self, image, labels=None): + p = np.random.uniform(0,1) + if p >= (1.0-self.prob): + self.change_saturation.factor = np.random.uniform(self.lower, self.upper) + return self.change_saturation(image, labels) + elif labels is None: + return image + else: + return image, labels + +class Brightness: + ''' + Changes the brightness of RGB images. + + Important: + - Expects RGB input. + - Expects input array to be of `dtype` `float`. + ''' + def __init__(self, delta): + ''' + Arguments: + delta (int): An integer, the amount to add to or subtract from the intensity + of every pixel. + ''' + self.delta = delta + + def __call__(self, image, labels=None): + image = np.clip(image + self.delta, 0, 255) + if labels is None: + return image + else: + return image, labels + +class RandomBrightness: + ''' + Randomly changes the brightness of RGB images. + + Important: + - Expects RGB input. + - Expects input array to be of `dtype` `float`. + ''' + def __init__(self, lower=-84, upper=84, prob=0.5): + ''' + Arguments: + lower (int, optional): An integer, the lower bound for the random brightness change. + upper (int, optional): An integer, the upper bound for the random brightness change. + Must be greater than `lower`. + prob (float, optional): `(1 - prob)` determines the probability with which the original, + unaltered image is returned. 
+        '''
+        if lower >= upper: raise ValueError("`upper` must be greater than `lower`.")
+        self.lower = float(lower)
+        self.upper = float(upper)
+        self.prob = prob
+        self.change_brightness = Brightness(delta=0)
+
+    def __call__(self, image, labels=None):
+        p = np.random.uniform(0,1)
+        if p >= (1.0-self.prob):
+            self.change_brightness.delta = np.random.uniform(self.lower, self.upper)
+            return self.change_brightness(image, labels)
+        elif labels is None:
+            return image
+        else:
+            return image, labels
+
+class Contrast:
+    '''
+    Changes the contrast of RGB images.
+
+    Important:
+        - Expects RGB input.
+        - Expects input array to be of `dtype` `float`.
+    '''
+    def __init__(self, factor):
+        '''
+        Arguments:
+            factor (float): A float greater than zero that determines contrast change, where
+                values less than one result in less contrast and values greater than one result
+                in more contrast.
+        '''
+        if factor <= 0.0: raise ValueError("It must be `factor > 0`.")
+        self.factor = factor
+
+    def __call__(self, image, labels=None):
+        image = np.clip(127.5 + self.factor * (image - 127.5), 0, 255)
+        if labels is None:
+            return image
+        else:
+            return image, labels
+
+class RandomContrast:
+    '''
+    Randomly changes the contrast of RGB images.
+
+    Important:
+        - Expects RGB input.
+        - Expects input array to be of `dtype` `float`.
+    '''
+    def __init__(self, lower=0.5, upper=1.5, prob=0.5):
+        '''
+        Arguments:
+            lower (float, optional): A float greater than zero, the lower bound for the random
+                contrast change.
+            upper (float, optional): A float greater than zero, the upper bound for the random
+                contrast change. Must be greater than `lower`.
+            prob (float, optional): `(1 - prob)` determines the probability with which the original,
+                unaltered image is returned.
+        '''
+        if lower >= upper: raise ValueError("`upper` must be greater than `lower`.")
+        self.lower = lower
+        self.upper = upper
+        self.prob = prob
+        self.change_contrast = Contrast(factor=1.0)
+
+    def __call__(self, image, labels=None):
+        p = np.random.uniform(0,1)
+        if p >= (1.0-self.prob):
+            self.change_contrast.factor = np.random.uniform(self.lower, self.upper)
+            return self.change_contrast(image, labels)
+        elif labels is None:
+            return image
+        else:
+            return image, labels
+
+class Gamma:
+    '''
+    Changes the gamma value of RGB images.
+
+    Important: Expects RGB input.
+    '''
+    def __init__(self, gamma):
+        '''
+        Arguments:
+            gamma (float): A float greater than zero that determines gamma change.
+        '''
+        if gamma <= 0.0: raise ValueError("It must be `gamma > 0`.")
+        self.gamma = gamma
+        self.gamma_inv = 1.0 / gamma
+        # Build a lookup table mapping the pixel values [0, 255] to
+        # their adjusted gamma values.
+        self.table = np.array([((i / 255.0) ** self.gamma_inv) * 255 for i in np.arange(0, 256)]).astype("uint8")
+
+    def __call__(self, image, labels=None):
+        # Apply the precomputed lookup table stored on the instance.
+        image = cv2.LUT(image, self.table)
+        if labels is None:
+            return image
+        else:
+            return image, labels
+
+class RandomGamma:
+    '''
+    Randomly changes the gamma value of RGB images.
+
+    Important: Expects RGB input.
+    '''
+    def __init__(self, lower=0.25, upper=2.0, prob=0.5):
+        '''
+        Arguments:
+            lower (float, optional): A float greater than zero, the lower bound for the random
+                gamma change.
+            upper (float, optional): A float greater than zero, the upper bound for the random
+                gamma change. Must be greater than `lower`.
+            prob (float, optional): `(1 - prob)` determines the probability with which the original,
+                unaltered image is returned.
+ ''' + if lower >= upper: raise ValueError("`upper` must be greater than `lower`.") + self.lower = lower + self.upper = upper + self.prob = prob + + def __call__(self, image, labels=None): + p = np.random.uniform(0,1) + if p >= (1.0-self.prob): + gamma = np.random.uniform(self.lower, self.upper) + change_gamma = Gamma(gamma=gamma) + return change_gamma(image, labels) + elif labels is None: + return image + else: + return image, labels + +class HistogramEqualization: + ''' + Performs histogram equalization on HSV images. + + Importat: Expects HSV input. + ''' + def __init__(self): + pass + + def __call__(self, image, labels=None): + image[:,:,2] = cv2.equalizeHist(image[:,:,2]) + if labels is None: + return image + else: + return image, labels + +class RandomHistogramEqualization: + ''' + Randomly performs histogram equalization on HSV images. The randomness only refers + to whether or not the equalization is performed. + + Importat: Expects HSV input. + ''' + def __init__(self, prob=0.5): + ''' + Arguments: + prob (float, optional): `(1 - prob)` determines the probability with which the original, + unaltered image is returned. + ''' + self.prob = prob + self.equalize = HistogramEqualization() + + def __call__(self, image, labels=None): + p = np.random.uniform(0,1) + if p >= (1.0-self.prob): + return self.equalize(image, labels) + elif labels is None: + return image + else: + return image, labels + +class ChannelSwap: + ''' + Swaps the channels of images. + ''' + def __init__(self, order): + ''' + Arguments: + order (tuple): A tuple of integers that defines the desired channel order + of the input images after the channel swap. + ''' + self.order = order + + def __call__(self, image, labels=None): + image = image[:,:,self.order] + if labels is None: + return image + else: + return image, labels + +class RandomChannelSwap: + ''' + Randomly swaps the channels of RGB images. + + Important: Expects RGB input. + ''' + def __init__(self, prob=0.5): + ''' + Arguments: + prob (float, optional): `(1 - prob)` determines the probability with which the original, + unaltered image is returned. + ''' + self.prob = prob + # All possible permutations of the three image channels except the original order. + self.permutations = ((0, 2, 1), + (1, 0, 2), (1, 2, 0), + (2, 0, 1), (2, 1, 0)) + self.swap_channels = ChannelSwap(order=(0, 1, 2)) + + def __call__(self, image, labels=None): + p = np.random.uniform(0,1) + if p >= (1.0-self.prob): + i = np.random.randint(5) # There are 6 possible permutations. + self.swap_channels.order = self.permutations[i] + return self.swap_channels(image, labels) + elif labels is None: + return image + else: + return image, labels diff --git a/keras_ssd/eval_utils/__init__.py b/keras_ssd/eval_utils/__init__.py new file mode 100644 index 0000000..e69de29 diff --git a/keras_ssd/eval_utils/average_precision_evaluator.py b/keras_ssd/eval_utils/average_precision_evaluator.py new file mode 100644 index 0000000..e1c52f9 --- /dev/null +++ b/keras_ssd/eval_utils/average_precision_evaluator.py @@ -0,0 +1,906 @@ +''' +An evaluator to compute the Pascal VOC-style mean average precision (both the pre-2010 +and post-2010 algorithm versions) of a given Keras SSD model on a given dataset. + +Copyright (C) 2018 Pierluigi Ferrari + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. 
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+'''
+
+from __future__ import division
+import numpy as np
+from math import ceil
+from tqdm import trange
+import sys
+import warnings
+
+from data_generator.object_detection_2d_data_generator import DataGenerator
+from data_generator.object_detection_2d_geometric_ops import Resize
+from data_generator.object_detection_2d_patch_sampling_ops import RandomPadFixedAR
+from data_generator.object_detection_2d_photometric_ops import ConvertTo3Channels
+from ssd_encoder_decoder.ssd_output_decoder import decode_detections
+from data_generator.object_detection_2d_misc_utils import apply_inverse_transforms
+
+from bounding_box_utils.bounding_box_utils import iou
+
+class Evaluator:
+    '''
+    Computes the mean average precision of the given Keras SSD model on the given dataset.
+
+    Can compute the Pascal-VOC-style average precision in both the pre-2010 (k-point sampling)
+    and post-2010 (integration) algorithm versions.
+
+    Optionally also returns the average precisions, precisions, and recalls.
+
+    The algorithm is identical to the official Pascal VOC pre-2010 detection evaluation algorithm
+    in its default settings, but can be customized in a number of ways.
+    '''
+
+    def __init__(self,
+                 model,
+                 n_classes,
+                 data_generator,
+                 model_mode='inference',
+                 pred_format={'class_id': 0, 'conf': 1, 'xmin': 2, 'ymin': 3, 'xmax': 4, 'ymax': 5},
+                 gt_format={'class_id': 0, 'xmin': 1, 'ymin': 2, 'xmax': 3, 'ymax': 4}):
+        '''
+        Arguments:
+            model (Keras model): A Keras SSD model object.
+            n_classes (int): The number of positive classes, e.g. 20 for Pascal VOC, 80 for MS COCO.
+            data_generator (DataGenerator): A `DataGenerator` object with the evaluation dataset.
+            model_mode (str, optional): The mode in which the model was created, i.e. 'training', 'inference' or 'inference_fast'.
+                This is needed in order to know whether the model output is already decoded or still needs to be decoded. Refer to
+                the model documentation for the meaning of the individual modes.
+            pred_format (dict, optional): A dictionary that defines which index in the last axis of the model's decoded predictions
+                contains which bounding box coordinate. The dictionary must map the keywords 'class_id', 'conf' (for the confidence),
+                'xmin', 'ymin', 'xmax', and 'ymax' to their respective indices within the last axis.
+            gt_format (dict, optional): A dictionary that defines which index of a ground truth bounding box contains which of the five
+                items class ID, xmin, ymin, xmax, ymax. The expected keys are 'xmin', 'ymin', 'xmax', 'ymax', 'class_id'.
+        '''
+
+        if not isinstance(data_generator, DataGenerator):
+            warnings.warn("`data_generator` is not a `DataGenerator` object, which will cause undefined behavior.")
+
+        self.model = model
+        self.data_generator = data_generator
+        self.n_classes = n_classes
+        self.model_mode = model_mode
+        self.pred_format = pred_format
+        self.gt_format = gt_format
+
+        # The following lists all contain per-class data, i.e. all lists have the length `n_classes + 1`,
+        # where one element is for the background class, i.e. that element is just a dummy entry.
+        self.prediction_results = None
+        self.num_gt_per_class = None
+        self.true_positives = None
+        self.false_positives = None
+        self.cumulative_true_positives = None
+        self.cumulative_false_positives = None
+        self.cumulative_precisions = None # "Cumulative" means that the i-th element in each list represents the precision for the first i highest confidence predictions for that class.
+        self.cumulative_recalls = None # "Cumulative" means that the i-th element in each list represents the recall for the first i highest confidence predictions for that class.
+        self.average_precisions = None
+        self.mean_average_precision = None
+
+    def __call__(self,
+                 img_height,
+                 img_width,
+                 batch_size,
+                 data_generator_mode='resize',
+                 round_confidences=False,
+                 matching_iou_threshold=0.5,
+                 border_pixels='include',
+                 sorting_algorithm='quicksort',
+                 average_precision_mode='sample',
+                 num_recall_points=11,
+                 ignore_neutral_boxes=True,
+                 return_precisions=False,
+                 return_recalls=False,
+                 return_average_precisions=False,
+                 verbose=True,
+                 decoding_confidence_thresh=0.01,
+                 decoding_iou_threshold=0.45,
+                 decoding_top_k=200,
+                 decoding_pred_coords='centroids',
+                 decoding_normalize_coords=True):
+        '''
+        Computes the mean average precision of the given Keras SSD model on the given dataset.
+
+        Optionally also returns the average precisions, precisions, and recalls.
+
+        All the individual steps of the overall evaluation algorithm can also be called separately
+        (check out the other methods of this class), but this runs the overall algorithm all at once.
+
+        Arguments:
+            img_height (int): The input image height for the model.
+            img_width (int): The input image width for the model.
+            batch_size (int): The batch size for the evaluation.
+            data_generator_mode (str, optional): Either of 'resize' and 'pad'. If 'resize', the input images will
+                be resized (i.e. warped) to `(img_height, img_width)`. This mode does not preserve the aspect ratios of the images.
+                If 'pad', the input images will be first padded so that they have the aspect ratio defined by `img_height`
+                and `img_width` and then resized to `(img_height, img_width)`. This mode preserves the aspect ratios of the images.
+            round_confidences (int, optional): `False` or an integer that is the number of decimals that the prediction
+                confidences will be rounded to. If `False`, the confidences will not be rounded.
+            matching_iou_threshold (float, optional): A prediction will be considered a true positive if it has a Jaccard overlap
+                of at least `matching_iou_threshold` with any ground truth bounding box of the same class.
+            border_pixels (str, optional): How to treat the border pixels of the bounding boxes.
+                Can be 'include', 'exclude', or 'half'. If 'include', the border pixels belong
+                to the boxes. If 'exclude', the border pixels do not belong to the boxes.
+                If 'half', then one of each of the two horizontal and vertical borders belongs
+                to the boxes, but not the other.
+            sorting_algorithm (str, optional): Which sorting algorithm the matching algorithm should use. This argument accepts
+                any valid sorting algorithm for Numpy's `argsort()` function. You will usually want to choose between 'quicksort'
+                (fastest and most memory efficient, but not stable) and 'mergesort' (slightly slower and less memory efficient, but stable).
+                The official Matlab evaluation algorithm uses a stable sorting algorithm, so this algorithm is only guaranteed
+                to behave identically if you choose 'mergesort' as the sorting algorithm, but it will almost always behave identically
+                even if you choose 'quicksort' (but no guarantees).
+            average_precision_mode (str, optional): Can be either 'sample' or 'integrate'. In the case of 'sample', the average precision
+                will be computed according to the Pascal VOC formula that was used up until VOC 2009, where the precision will be sampled
+                for `num_recall_points` recall values. In the case of 'integrate', the average precision will be computed according to the
+                Pascal VOC formula that was used from VOC 2010 onward, where the average precision will be computed by numerically integrating
+                over the whole precision-recall curve instead of sampling individual points from it. 'integrate' mode is basically just
+                the limit case of 'sample' mode as the number of sample points increases.
+            num_recall_points (int, optional): The number of points to sample from the precision-recall curve to compute the average
+                precisions. In other words, this is the number of equidistant recall values for which the resulting precision will be
+                computed. 11 points is the value used in the official Pascal VOC 2007 detection evaluation algorithm.
+            ignore_neutral_boxes (bool, optional): In case the data generator provides annotations indicating whether a ground truth
+                bounding box is supposed to either count or be neutral for the evaluation, this argument decides what to do with these
+                annotations. If `False`, even boxes that are annotated as neutral will be counted into the evaluation. If `True`,
+                neutral boxes will be ignored for the evaluation. Examples of evaluation-neutrality are the ground truth boxes
+                annotated as "difficult" in the Pascal VOC datasets, which are usually treated as neutral for the evaluation.
+            return_precisions (bool, optional): If `True`, returns a nested list containing the cumulative precisions for each class.
+            return_recalls (bool, optional): If `True`, returns a nested list containing the cumulative recalls for each class.
+            return_average_precisions (bool, optional): If `True`, returns a list containing the average precision for each class.
+            verbose (bool, optional): If `True`, will print out the progress during runtime.
+            decoding_confidence_thresh (float, optional): Only relevant if the model is in 'training' mode.
+                A float in [0,1), the minimum classification confidence in a specific positive class in order to be considered
+                for the non-maximum suppression stage for the respective class. A lower value will result in a larger part of the
+                selection process being done by the non-maximum suppression stage, while a larger value will result in a larger
+                part of the selection process happening in the confidence thresholding stage.
+            decoding_iou_threshold (float, optional): Only relevant if the model is in 'training' mode. A float in [0,1].
+                All boxes with a Jaccard similarity of greater than `iou_threshold` with a locally maximal box will be removed
+                from the set of predictions for a given class, where 'maximal' refers to the box score.
+            decoding_top_k (int, optional): Only relevant if the model is in 'training' mode. The number of highest scoring
+                predictions to be kept for each batch item after the non-maximum suppression stage.
+            decoding_pred_coords (str, optional): Only relevant if the model is in 'training' mode. The box coordinate format
+                that the model outputs.
Can be either 'centroids' for the format `(cx, cy, w, h)` (box center coordinates, width, and height), + 'minmax' for the format `(xmin, xmax, ymin, ymax)`, or 'corners' for the format `(xmin, ymin, xmax, ymax)`. + decoding_normalize_coords (bool, optional): Only relevant if the model is in 'training' mode. Set to `True` if the model + outputs relative coordinates. Do not set this to `True` if the model already outputs absolute coordinates, + as that would result in incorrect coordinates. + + Returns: + A float, the mean average precision, plus any optional returns specified in the arguments. + ''' + + ############################################################################################# + # Predict on the entire dataset. + ############################################################################################# + + self.predict_on_dataset(img_height=img_height, + img_width=img_width, + batch_size=batch_size, + data_generator_mode=data_generator_mode, + decoding_confidence_thresh=decoding_confidence_thresh, + decoding_iou_threshold=decoding_iou_threshold, + decoding_top_k=decoding_top_k, + decoding_pred_coords=decoding_pred_coords, + decoding_normalize_coords=decoding_normalize_coords, + decoding_border_pixels=border_pixels, + round_confidences=round_confidences, + verbose=verbose, + ret=False) + + ############################################################################################# + # Get the total number of ground truth boxes for each class. + ############################################################################################# + + self.get_num_gt_per_class(ignore_neutral_boxes=ignore_neutral_boxes, + verbose=False, + ret=False) + + ############################################################################################# + # Match predictions to ground truth boxes for all classes. + ############################################################################################# + + self.match_predictions(ignore_neutral_boxes=ignore_neutral_boxes, + matching_iou_threshold=matching_iou_threshold, + border_pixels=border_pixels, + sorting_algorithm=sorting_algorithm, + verbose=verbose, + ret=False) + + ############################################################################################# + # Compute the cumulative precision and recall for all classes. + ############################################################################################# + + self.compute_precision_recall(verbose=verbose, ret=False) + + ############################################################################################# + # Compute the average precision for this class. + ############################################################################################# + + self.compute_average_precisions(mode=average_precision_mode, + num_recall_points=num_recall_points, + verbose=verbose, + ret=False) + + ############################################################################################# + # Compute the mean average precision. + ############################################################################################# + + mean_average_precision = self.compute_mean_average_precision(ret=True) + + ############################################################################################# + + # Compile the returns. 
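+        # The optional quantities below are appended after the mean average precision in this
+        # fixed order: average precisions, cumulative precisions, cumulative recalls.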
+        if return_precisions or return_recalls or return_average_precisions:
+            ret = [mean_average_precision]
+            if return_average_precisions:
+                ret.append(self.average_precisions)
+            if return_precisions:
+                ret.append(self.cumulative_precisions)
+            if return_recalls:
+                ret.append(self.cumulative_recalls)
+            return ret
+        else:
+            return mean_average_precision
+
+    def predict_on_dataset(self,
+                           img_height,
+                           img_width,
+                           batch_size,
+                           data_generator_mode='resize',
+                           decoding_confidence_thresh=0.01,
+                           decoding_iou_threshold=0.45,
+                           decoding_top_k=200,
+                           decoding_pred_coords='centroids',
+                           decoding_normalize_coords=True,
+                           decoding_border_pixels='include',
+                           round_confidences=False,
+                           verbose=True,
+                           ret=False):
+        '''
+        Runs predictions for the given model over the entire dataset given by `data_generator`.
+
+        Arguments:
+            img_height (int): The input image height for the model.
+            img_width (int): The input image width for the model.
+            batch_size (int): The batch size for the evaluation.
+            data_generator_mode (str, optional): Either of 'resize' and 'pad'. If 'resize', the input images will
+                be resized (i.e. warped) to `(img_height, img_width)`. This mode does not preserve the aspect ratios of the images.
+                If 'pad', the input images will be first padded so that they have the aspect ratio defined by `img_height`
+                and `img_width` and then resized to `(img_height, img_width)`. This mode preserves the aspect ratios of the images.
+            decoding_confidence_thresh (float, optional): Only relevant if the model is in 'training' mode.
+                A float in [0,1), the minimum classification confidence in a specific positive class in order to be considered
+                for the non-maximum suppression stage for the respective class. A lower value will result in a larger part of the
+                selection process being done by the non-maximum suppression stage, while a larger value will result in a larger
+                part of the selection process happening in the confidence thresholding stage.
+            decoding_iou_threshold (float, optional): Only relevant if the model is in 'training' mode. A float in [0,1].
+                All boxes with a Jaccard similarity of greater than `iou_threshold` with a locally maximal box will be removed
+                from the set of predictions for a given class, where 'maximal' refers to the box score.
+            decoding_top_k (int, optional): Only relevant if the model is in 'training' mode. The number of highest scoring
+                predictions to be kept for each batch item after the non-maximum suppression stage.
+            decoding_pred_coords (str, optional): Only relevant if the model is in 'training' mode. The box coordinate format
+                that the model outputs. Can be either 'centroids' for the format `(cx, cy, w, h)` (box center coordinates, width, and height),
+                'minmax' for the format `(xmin, xmax, ymin, ymax)`, or 'corners' for the format `(xmin, ymin, xmax, ymax)`.
+            decoding_normalize_coords (bool, optional): Only relevant if the model is in 'training' mode. Set to `True` if the model
+                outputs relative coordinates. Do not set this to `True` if the model already outputs absolute coordinates,
+                as that would result in incorrect coordinates.
+            decoding_border_pixels (str, optional): Only relevant if the model is in 'training' mode. How the decoder treats
+                the border pixels of the bounding boxes; one of 'include', 'exclude', or 'half'.
+            round_confidences (int, optional): `False` or an integer that is the number of decimals that the prediction
+                confidences will be rounded to. If `False`, the confidences will not be rounded.
+            verbose (bool, optional): If `True`, will print out the progress during runtime.
+            ret (bool, optional): If `True`, returns the predictions.
+
+        Returns:
+            None by default. Optionally, a nested list containing the predictions for each class.
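+            Each prediction is stored as an `(image_id, confidence, xmin, ymin, xmax, ymax)` tuple,
+            and the list at index `k` holds the predictions for class ID `k` (index 0, the background
+            class, remains an empty dummy entry).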
+ ''' + + class_id_pred = self.pred_format['class_id'] + conf_pred = self.pred_format['conf'] + xmin_pred = self.pred_format['xmin'] + ymin_pred = self.pred_format['ymin'] + xmax_pred = self.pred_format['xmax'] + ymax_pred = self.pred_format['ymax'] + + ############################################################################################# + # Configure the data generator for the evaluation. + ############################################################################################# + + convert_to_3_channels = ConvertTo3Channels() + resize = Resize(height=img_height,width=img_width, labels_format=self.gt_format) + if data_generator_mode == 'resize': + transformations = [convert_to_3_channels, + resize] + elif data_generator_mode == 'pad': + random_pad = RandomPadFixedAR(patch_aspect_ratio=img_width/img_height, labels_format=self.gt_format) + transformations = [convert_to_3_channels, + random_pad, + resize] + else: + raise ValueError("`data_generator_mode` can be either of 'resize' or 'pad', but received '{}'.".format(data_generator_mode)) + + # Set the generator parameters. + generator = self.data_generator.generate(batch_size=batch_size, + shuffle=False, + transformations=transformations, + label_encoder=None, + returns={'processed_images', + 'image_ids', + 'evaluation-neutral', + 'inverse_transform', + 'original_labels'}, + keep_images_without_gt=True, + degenerate_box_handling='remove') + + # If we don't have any real image IDs, generate pseudo-image IDs. + # This is just to make the evaluator compatible both with datasets that do and don't + # have image IDs. + if self.data_generator.image_ids is None: + self.data_generator.image_ids = list(range(self.data_generator.get_dataset_size())) + + ############################################################################################# + # Predict over all batches of the dataset and store the predictions. + ############################################################################################# + + # We have to generate a separate results list for each class. + results = [list() for _ in range(self.n_classes + 1)] + + # Create a dictionary that maps image IDs to ground truth annotations. + # We'll need it below. + image_ids_to_labels = {} + + # Compute the number of batches to iterate over the entire dataset. + n_images = self.data_generator.get_dataset_size() + n_batches = int(ceil(n_images / batch_size)) + if verbose: + print("Number of images in the evaluation dataset: {}".format(n_images)) + print() + tr = trange(n_batches, file=sys.stdout) + tr.set_description('Producing predictions batch-wise') + else: + tr = range(n_batches) + + # Loop over all batches. + for j in tr: + # Generate batch. + batch_X, batch_image_ids, batch_eval_neutral, batch_inverse_transforms, batch_orig_labels = next(generator) + # Predict. + y_pred = self.model.predict(batch_X) + # If the model was created in 'training' mode, the raw predictions need to + # be decoded and filtered, otherwise that's already taken care of. + if self.model_mode == 'training': + # Decode. + y_pred = decode_detections(y_pred, + confidence_thresh=decoding_confidence_thresh, + iou_threshold=decoding_iou_threshold, + top_k=decoding_top_k, + input_coords=decoding_pred_coords, + normalize_coords=decoding_normalize_coords, + img_height=img_height, + img_width=img_width, + border_pixels=decoding_border_pixels) + else: + # Filter out the all-zeros dummy elements of `y_pred`. 
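+                # The decoded output of an 'inference'/'inference_fast' model is padded with all-zero
+                # rows, so keeping only the rows with a non-zero class ID removes that padding.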
+ y_pred_filtered = [] + for i in range(len(y_pred)): + y_pred_filtered.append(y_pred[i][y_pred[i,:,0] != 0]) + y_pred = y_pred_filtered + # Convert the predicted box coordinates for the original images. + y_pred = apply_inverse_transforms(y_pred, batch_inverse_transforms) + + # Iterate over all batch items. + for k, batch_item in enumerate(y_pred): + + image_id = batch_image_ids[k] + + for box in batch_item: + class_id = int(box[class_id_pred]) + # Round the box coordinates to reduce the required memory. + if round_confidences: + confidence = round(box[conf_pred], round_confidences) + else: + confidence = box[conf_pred] + xmin = round(box[xmin_pred], 1) + ymin = round(box[ymin_pred], 1) + xmax = round(box[xmax_pred], 1) + ymax = round(box[ymax_pred], 1) + prediction = (image_id, confidence, xmin, ymin, xmax, ymax) + # Append the predicted box to the results list for its class. + results[class_id].append(prediction) + + self.prediction_results = results + + if ret: + return results + + def write_predictions_to_txt(self, + classes=None, + out_file_prefix='comp3_det_test_', + verbose=True): + ''' + Writes the predictions for all classes to separate text files according to the Pascal VOC results format. + + Arguments: + classes (list, optional): `None` or a list of strings containing the class names of all classes in the dataset, + including some arbitrary name for the background class. This list will be used to name the output text files. + The ordering of the names in the list represents the ordering of the classes as they are predicted by the model, + i.e. the element with index 3 in this list should correspond to the class with class ID 3 in the model's predictions. + If `None`, the output text files will be named by their class IDs. + out_file_prefix (str, optional): A prefix for the output text file names. The suffix to each output text file name will + be the respective class name followed by the `.txt` file extension. This string is also how you specify the directory + in which the results are to be saved. + verbose (bool, optional): If `True`, will print out the progress during runtime. + + Returns: + None. + ''' + + if self.prediction_results is None: + raise ValueError("There are no prediction results. You must run `predict_on_dataset()` before calling this method.") + + # We generate a separate results file for each class. + for class_id in range(1, self.n_classes + 1): + + if verbose: + print("Writing results file for class {}/{}.".format(class_id, self.n_classes)) + + if classes is None: + class_suffix = '{:04d}'.format(class_id) + else: + class_suffix = classes[class_id] + + results_file = open('{}{}.txt'.format(out_file_prefix, class_suffix), 'w') + + for prediction in self.prediction_results[class_id]: + + prediction_list = list(prediction) + prediction_list[0] = '{:06d}'.format(int(prediction_list[0])) + prediction_list[1] = round(prediction_list[1], 4) + prediction_txt = ' '.join(map(str, prediction_list)) + '\n' + results_file.write(prediction_txt) + + results_file.close() + + if verbose: + print("All results files saved.") + + def get_num_gt_per_class(self, + ignore_neutral_boxes=True, + verbose=True, + ret=False): + ''' + Counts the number of ground truth boxes for each class across the dataset. + + Arguments: + ignore_neutral_boxes (bool, optional): In case the data generator provides annotations indicating whether a ground truth + bounding box is supposed to either count or be neutral for the evaluation, this argument decides what to do with these + annotations. 
If `True`, only non-neutral ground truth boxes will be counted, otherwise all ground truth boxes will + be counted. + verbose (bool, optional): If `True`, will print out the progress during runtime. + ret (bool, optional): If `True`, returns the list of counts. + + Returns: + None by default. Optionally, a list containing a count of the number of ground truth boxes for each class across the + entire dataset. + ''' + + if self.data_generator.labels is None: + raise ValueError("Computing the number of ground truth boxes per class not possible, no ground truth given.") + + num_gt_per_class = np.zeros(shape=(self.n_classes+1), dtype=np.int) + + class_id_index = self.gt_format['class_id'] + + ground_truth = self.data_generator.labels + + if verbose: + print('Computing the number of positive ground truth boxes per class.') + tr = trange(len(ground_truth), file=sys.stdout) + else: + tr = range(len(ground_truth)) + + # Iterate over the ground truth for all images in the dataset. + for i in tr: + + boxes = np.asarray(ground_truth[i]) + + # Iterate over all ground truth boxes for the current image. + for j in range(boxes.shape[0]): + + if ignore_neutral_boxes and not (self.data_generator.eval_neutral is None): + if not self.data_generator.eval_neutral[i][j]: + # If this box is not supposed to be evaluation-neutral, + # increment the counter for the respective class ID. + class_id = boxes[j, class_id_index] + num_gt_per_class[class_id] += 1 + else: + # If there is no such thing as evaluation-neutral boxes for + # our dataset, always increment the counter for the respective + # class ID. + class_id = boxes[j, class_id_index] + num_gt_per_class[class_id] += 1 + + self.num_gt_per_class = num_gt_per_class + + if ret: + return num_gt_per_class + + def match_predictions(self, + ignore_neutral_boxes=True, + matching_iou_threshold=0.5, + border_pixels='include', + sorting_algorithm='quicksort', + verbose=True, + ret=False): + ''' + Matches predictions to ground truth boxes. + + Note that `predict_on_dataset()` must be called before calling this method. + + Arguments: + ignore_neutral_boxes (bool, optional): In case the data generator provides annotations indicating whether a ground truth + bounding box is supposed to either count or be neutral for the evaluation, this argument decides what to do with these + annotations. If `False`, even boxes that are annotated as neutral will be counted into the evaluation. If `True`, + neutral boxes will be ignored for the evaluation. An example for evaluation-neutrality are the ground truth boxes + annotated as "difficult" in the Pascal VOC datasets, which are usually treated as neutral for the evaluation. + matching_iou_threshold (float, optional): A prediction will be considered a true positive if it has a Jaccard overlap + of at least `matching_iou_threshold` with any ground truth bounding box of the same class. + border_pixels (str, optional): How to treat the border pixels of the bounding boxes. + Can be 'include', 'exclude', or 'half'. If 'include', the border pixels belong + to the boxes. If 'exclude', the border pixels do not belong to the boxes. + If 'half', then one of each of the two horizontal and vertical borders belong + to the boxex, but not the other. + sorting_algorithm (str, optional): Which sorting algorithm the matching algorithm should use. This argument accepts + any valid sorting algorithm for Numpy's `argsort()` function. 
You will usually want to choose between 'quicksort'
+ (fastest and most memory efficient, but not stable) and 'mergesort' (slightly slower and less memory efficient, but stable).
+ The official Matlab evaluation algorithm uses a stable sorting algorithm, so this algorithm is only guaranteed
+ to behave identically if you choose 'mergesort' as the sorting algorithm, but it will almost always behave identically
+ even if you choose 'quicksort' (but no guarantees).
+ verbose (bool, optional): If `True`, will print out the progress during runtime.
+ ret (bool, optional): If `True`, returns the true and false positives.
+
+ Returns:
+ None by default. Optionally, four nested lists containing the true positives, false positives, cumulative true positives,
+ and cumulative false positives for each class.
+ '''
+
+ if self.data_generator.labels is None:
+ raise ValueError("Matching predictions to ground truth boxes not possible, no ground truth given.")
+
+ if self.prediction_results is None:
+ raise ValueError("There are no prediction results. You must run `predict_on_dataset()` before calling this method.")
+
+ class_id_gt = self.gt_format['class_id']
+ xmin_gt = self.gt_format['xmin']
+ ymin_gt = self.gt_format['ymin']
+ xmax_gt = self.gt_format['xmax']
+ ymax_gt = self.gt_format['ymax']
+
+ # Convert the ground truth to a more efficient format for what we need
+ # to do, which is access ground truth by image ID repeatedly.
+ ground_truth = {}
+ eval_neutral_available = not (self.data_generator.eval_neutral is None) # Whether or not we have annotations to decide whether ground truth boxes should be neutral or not.
+ for i in range(len(self.data_generator.image_ids)):
+ image_id = str(self.data_generator.image_ids[i])
+ labels = self.data_generator.labels[i]
+ if ignore_neutral_boxes and eval_neutral_available:
+ ground_truth[image_id] = (np.asarray(labels), np.asarray(self.data_generator.eval_neutral[i]))
+ else:
+ ground_truth[image_id] = np.asarray(labels)
+
+ true_positives = [[]] # The true positives for each class, sorted by descending confidence.
+ false_positives = [[]] # The false positives for each class, sorted by descending confidence.
+ cumulative_true_positives = [[]]
+ cumulative_false_positives = [[]]
+
+ # Iterate over all classes.
+ for class_id in range(1, self.n_classes + 1):
+
+ predictions = self.prediction_results[class_id]
+
+ # Store the matching results in these lists:
+ true_pos = np.zeros(len(predictions), dtype=np.int) # 1 for every prediction that is a true positive, 0 otherwise
+ false_pos = np.zeros(len(predictions), dtype=np.int) # 1 for every prediction that is a false positive, 0 otherwise
+
+ # In case there are no predictions at all for this class, we're done here.
+ if len(predictions) == 0:
+ print("No predictions for class {}/{}".format(class_id, self.n_classes))
+ true_positives.append(true_pos)
+ false_positives.append(false_pos)
+ continue
+
+ # Convert the predictions list for this class into a structured array so that we can sort it by confidence.
+
+ # Get the number of characters needed to store the image ID strings in the structured array.
+ num_chars_per_image_id = len(str(predictions[0][0])) + 6 # Keep a few characters buffer in case some image IDs are longer than others.
+ # Create the data type for the structured array.
+ preds_data_type = np.dtype([('image_id', 'U{}'.format(num_chars_per_image_id)),
+ ('confidence', 'f4'),
+ ('xmin', 'f4'),
+ ('ymin', 'f4'),
+ ('xmax', 'f4'),
+ ('ymax', 'f4')])
+ # Create the structured array
+ predictions = np.array(predictions, dtype=preds_data_type)
+
+ # Sort the detections by decreasing confidence.
+ descending_indices = np.argsort(-predictions['confidence'], kind=sorting_algorithm)
+ predictions_sorted = predictions[descending_indices]
+
+ if verbose:
+ tr = trange(len(predictions), file=sys.stdout)
+ tr.set_description("Matching predictions to ground truth, class {}/{}.".format(class_id, self.n_classes))
+ else:
+ tr = range(len(predictions))
+
+ # Keep track of which ground truth boxes were already matched to a detection.
+ gt_matched = {}
+
+ # Iterate over all predictions.
+ for i in tr:
+
+ prediction = predictions_sorted[i]
+ image_id = prediction['image_id']
+ pred_box = np.asarray(list(prediction[['xmin', 'ymin', 'xmax', 'ymax']])) # Convert the structured array element to a regular array.
+
+ # Get the relevant ground truth boxes for this prediction,
+ # i.e. all ground truth boxes that match the prediction's
+ # image ID and class ID.
+
+ # The ground truth could either be a tuple with `(ground_truth_boxes, eval_neutral_boxes)`
+ # or only `ground_truth_boxes`.
+ if ignore_neutral_boxes and eval_neutral_available:
+ gt, eval_neutral = ground_truth[image_id]
+ else:
+ gt = ground_truth[image_id]
+ gt = np.asarray(gt)
+ class_mask = gt[:,class_id_gt] == class_id
+ gt = gt[class_mask]
+ if ignore_neutral_boxes and eval_neutral_available:
+ eval_neutral = eval_neutral[class_mask]
+
+ if gt.size == 0:
+ # If the image doesn't contain any objects of this class,
+ # the prediction becomes a false positive.
+ false_pos[i] = 1
+ continue
+
+ # Compute the IoU of this prediction with all ground truth boxes of the same class.
+ overlaps = iou(boxes1=gt[:,[xmin_gt, ymin_gt, xmax_gt, ymax_gt]],
+ boxes2=pred_box,
+ coords='corners',
+ mode='element-wise',
+ border_pixels=border_pixels)
+
+ # For each detection, match the ground truth box with the highest overlap.
+ # It's possible that the same ground truth box will be matched to multiple
+ # detections.
+ gt_match_index = np.argmax(overlaps)
+ gt_match_overlap = overlaps[gt_match_index]
+
+ if gt_match_overlap < matching_iou_threshold:
+ # False positive, IoU threshold violated:
+ # Those predictions whose matched overlap is below the threshold become
+ # false positives.
+ false_pos[i] = 1
+ else:
+ if not (ignore_neutral_boxes and eval_neutral_available) or (eval_neutral[gt_match_index] == False):
+ # If this is not a ground truth that is supposed to be evaluation-neutral
+ # (i.e. should be skipped for the evaluation) or if we don't even have the
+ # concept of neutral boxes.
+ if not (image_id in gt_matched):
+ # True positive:
+ # If the matched ground truth box for this prediction hasn't been matched to a
+ # different prediction already, we have a true positive.
+ true_pos[i] = 1
+ gt_matched[image_id] = np.zeros(shape=(gt.shape[0]), dtype=np.bool)
+ gt_matched[image_id][gt_match_index] = True
+ elif not gt_matched[image_id][gt_match_index]:
+ # True positive:
+ # If the matched ground truth box for this prediction hasn't been matched to a
+ # different prediction already, we have a true positive.
+ true_pos[i] = 1 + gt_matched[image_id][gt_match_index] = True + else: + # False positive, duplicate detection: + # If the matched ground truth box for this prediction has already been matched + # to a different prediction previously, it is a duplicate detection for an + # already detected object, which counts as a false positive. + false_pos[i] = 1 + + true_positives.append(true_pos) + false_positives.append(false_pos) + + cumulative_true_pos = np.cumsum(true_pos) # Cumulative sums of the true positives + cumulative_false_pos = np.cumsum(false_pos) # Cumulative sums of the false positives + + cumulative_true_positives.append(cumulative_true_pos) + cumulative_false_positives.append(cumulative_false_pos) + + self.true_positives = true_positives + self.false_positives = false_positives + self.cumulative_true_positives = cumulative_true_positives + self.cumulative_false_positives = cumulative_false_positives + + if ret: + return true_positives, false_positives, cumulative_true_positives, cumulative_false_positives + + def compute_precision_recall(self, verbose=True, ret=False): + ''' + Computes the precisions and recalls for all classes. + + Note that `match_predictions()` must be called before calling this method. + + Arguments: + verbose (bool, optional): If `True`, will print out the progress during runtime. + ret (bool, optional): If `True`, returns the precisions and recalls. + + Returns: + None by default. Optionally, two nested lists containing the cumulative precisions and recalls for each class. + ''' + + if (self.cumulative_true_positives is None) or (self.cumulative_false_positives is None): + raise ValueError("True and false positives not available. You must run `match_predictions()` before you call this method.") + + if (self.num_gt_per_class is None): + raise ValueError("Number of ground truth boxes per class not available. You must run `get_num_gt_per_class()` before you call this method.") + + cumulative_precisions = [[]] + cumulative_recalls = [[]] + + # Iterate over all classes. + for class_id in range(1, self.n_classes + 1): + + if verbose: + print("Computing precisions and recalls, class {}/{}".format(class_id, self.n_classes)) + + tp = self.cumulative_true_positives[class_id] + fp = self.cumulative_false_positives[class_id] + + + cumulative_precision = np.where(tp + fp > 0, tp / (tp + fp), 0) # 1D array with shape `(num_predictions,)` + cumulative_recall = tp / self.num_gt_per_class[class_id] # 1D array with shape `(num_predictions,)` + + cumulative_precisions.append(cumulative_precision) + cumulative_recalls.append(cumulative_recall) + + self.cumulative_precisions = cumulative_precisions + self.cumulative_recalls = cumulative_recalls + + if ret: + return cumulative_precisions, cumulative_recalls + + def compute_average_precisions(self, mode='sample', num_recall_points=11, verbose=True, ret=False): + ''' + Computes the average precision for each class. + + Can compute the Pascal-VOC-style average precision in both the pre-2010 (k-point sampling) + and post-2010 (integration) algorithm versions. + + Note that `compute_precision_recall()` must be called before calling this method. + + Arguments: + mode (str, optional): Can be either 'sample' or 'integrate'. In the case of 'sample', the average precision will be computed + according to the Pascal VOC formula that was used up until VOC 2009, where the precision will be sampled for `num_recall_points` + recall values. 
In the case of 'integrate', the average precision will be computed according to the Pascal VOC formula that
+ was used from VOC 2010 onward, where the average precision will be computed by numerically integrating over the whole
+ precision-recall curve instead of sampling individual points from it. 'integrate' mode is basically just the limit case
+ of 'sample' mode as the number of sample points increases. For details, see the references below.
+ num_recall_points (int, optional): Only relevant if mode is 'sample'. The number of points to sample from the precision-recall curve
+ to compute the average precisions. In other words, this is the number of equidistant recall values for which the resulting
+ precision will be computed. 11 points is the value used in the official Pascal VOC pre-2010 detection evaluation algorithm.
+ verbose (bool, optional): If `True`, will print out the progress during runtime.
+ ret (bool, optional): If `True`, returns the average precisions.
+
+ Returns:
+ None by default. Optionally, a list containing the average precision for each class.
+
+ References:
+ http://host.robots.ox.ac.uk/pascal/VOC/voc2012/htmldoc/devkit_doc.html#sec:ap
+ '''
+
+ if (self.cumulative_precisions is None) or (self.cumulative_recalls is None):
+ raise ValueError("Precisions and recalls not available. You must run `compute_precision_recall()` before you call this method.")
+
+ if not (mode in {'sample', 'integrate'}):
+ raise ValueError("`mode` can be either 'sample' or 'integrate', but received '{}'".format(mode))
+
+ average_precisions = [0.0]
+
+ # Iterate over all classes.
+ for class_id in range(1, self.n_classes + 1):
+
+ if verbose:
+ print("Computing average precision, class {}/{}".format(class_id, self.n_classes))
+
+ cumulative_precision = self.cumulative_precisions[class_id]
+ cumulative_recall = self.cumulative_recalls[class_id]
+ average_precision = 0.0
+
+ if mode == 'sample':
+
+ for t in np.linspace(start=0, stop=1, num=num_recall_points, endpoint=True):
+
+ cum_prec_recall_greater_t = cumulative_precision[cumulative_recall >= t]
+
+ if cum_prec_recall_greater_t.size == 0:
+ precision = 0.0
+ else:
+ precision = np.amax(cum_prec_recall_greater_t)
+
+ average_precision += precision
+
+ average_precision /= num_recall_points
+
+ elif mode == 'integrate':
+
+ # We will compute the precision at all unique recall values.
+ unique_recalls, unique_recall_indices, unique_recall_counts = np.unique(cumulative_recall, return_index=True, return_counts=True)
+
+ # Store the maximal precision for each recall value and the absolute difference
+ # between any two unique recall values in the lists below. The products of these
+ # two numbers constitute the rectangular areas whose sum will be our numerical
+ # integral.
+ maximal_precisions = np.zeros_like(unique_recalls)
+ recall_deltas = np.zeros_like(unique_recalls)
+
+ # Iterate over all unique recall values in reverse order. This saves a lot of computation:
+ # For each unique recall value `r`, we want to get the maximal precision value obtained
+ # for any recall value `r* >= r`. Once we know the maximal precision for the last `k` recall
+ # values after a given iteration, then in the next iteration, in order to compute the maximal
+ # precisions for the last `l > k` recall values, we only need to compute the maximal precision
+ # for `l - k` recall values and then take the maximum between that and the previously computed
+ # maximum instead of computing the maximum over all `l` values.
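+ # Illustrative example with made-up numbers: if the unique recall values were [0.2, 0.5, 1.0] and the
+ # maximal precisions computed below came out as [1.0, 0.8, 0.0], the integral would evaluate to
+ # (0.5 - 0.2) * 1.0 + (1.0 - 0.5) * 0.8 = 0.7.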
+ # We skip the very last recall value, since the precision after between the last recall value + # recall 1.0 is defined to be zero. + for i in range(len(unique_recalls)-2, -1, -1): + begin = unique_recall_indices[i] + end = unique_recall_indices[i + 1] + # When computing the maximal precisions, use the maximum of the previous iteration to + # avoid unnecessary repeated computation over the same precision values. + # The maximal precisions are the heights of the rectangle areas of our integral under + # the precision-recall curve. + maximal_precisions[i] = np.maximum(np.amax(cumulative_precision[begin:end]), maximal_precisions[i + 1]) + # The differences between two adjacent recall values are the widths of our rectangle areas. + recall_deltas[i] = unique_recalls[i + 1] - unique_recalls[i] + + average_precision = np.sum(maximal_precisions * recall_deltas) + + average_precisions.append(average_precision) + + self.average_precisions = average_precisions + + if ret: + return average_precisions + + def compute_mean_average_precision(self, ret=True): + ''' + Computes the mean average precision over all classes. + + Note that `compute_average_precisions()` must be called before calling this method. + + Arguments: + ret (bool, optional): If `True`, returns the mean average precision. + + Returns: + A float, the mean average precision, by default. Optionally, None. + ''' + + if self.average_precisions is None: + raise ValueError("Average precisions not available. You must run `compute_average_precisions()` before you call this method.") + + mean_average_precision = np.average(self.average_precisions[1:]) # The first element is for the background class, so skip it. + self.mean_average_precision = mean_average_precision + + if ret: + return mean_average_precision diff --git a/keras_ssd/eval_utils/coco_utils.py b/keras_ssd/eval_utils/coco_utils.py new file mode 100644 index 0000000..b0e88f8 --- /dev/null +++ b/keras_ssd/eval_utils/coco_utils.py @@ -0,0 +1,200 @@ +''' +A few utilities that are useful when working with the MS COCO datasets. + +Copyright (C) 2018 Pierluigi Ferrari + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +''' + +import json +from tqdm import trange +from math import ceil +import sys + +from data_generator.object_detection_2d_geometric_ops import Resize +from data_generator.object_detection_2d_patch_sampling_ops import RandomPadFixedAR +from data_generator.object_detection_2d_photometric_ops import ConvertTo3Channels +from ssd_encoder_decoder.ssd_output_decoder import decode_detections +from data_generator.object_detection_2d_misc_utils import apply_inverse_transforms + +def get_coco_category_maps(annotations_file): + ''' + Builds dictionaries that map between MS COCO category IDs, transformed category IDs, and category names. + The original MS COCO category IDs are not consecutive unfortunately: The 80 category IDs are spread + across the integers 1 through 90 with some integers skipped. 
Since we usually use a one-hot + class representation in neural networks, we need to map these non-consecutive original COCO category + IDs (let's call them 'cats') to consecutive category IDs (let's call them 'classes'). + + Arguments: + annotations_file (str): The filepath to any MS COCO annotations JSON file. + + Returns: + 1) cats_to_classes: A dictionary that maps between the original (keys) and the transformed category IDs (values). + 2) classes_to_cats: A dictionary that maps between the transformed (keys) and the original category IDs (values). + 3) cats_to_names: A dictionary that maps between original category IDs (keys) and the respective category names (values). + 4) classes_to_names: A list of the category names (values) with their indices representing the transformed IDs. + ''' + with open(annotations_file, 'r') as f: + annotations = json.load(f) + cats_to_classes = {} + classes_to_cats = {} + cats_to_names = {} + classes_to_names = [] + classes_to_names.append('background') # Need to add the background class first so that the indexing is right. + for i, cat in enumerate(annotations['categories']): + cats_to_classes[cat['id']] = i + 1 + classes_to_cats[i + 1] = cat['id'] + cats_to_names[cat['id']] = cat['name'] + classes_to_names.append(cat['name']) + + return cats_to_classes, classes_to_cats, cats_to_names, classes_to_names + +def predict_all_to_json(out_file, + model, + img_height, + img_width, + classes_to_cats, + data_generator, + batch_size, + data_generator_mode='resize', + model_mode='training', + confidence_thresh=0.01, + iou_threshold=0.45, + top_k=200, + pred_coords='centroids', + normalize_coords=True): + ''' + Runs detection predictions over the whole dataset given a model and saves them in a JSON file + in the MS COCO detection results format. + + Arguments: + out_file (str): The file name (full path) under which to save the results JSON file. + model (Keras model): A Keras SSD model object. + img_height (int): The input image height for the model. + img_width (int): The input image width for the model. + classes_to_cats (dict): A dictionary that maps the consecutive class IDs predicted by the model + to the non-consecutive original MS COCO category IDs. + data_generator (DataGenerator): A `DataGenerator` object with the evaluation dataset. + batch_size (int): The batch size for the evaluation. + data_generator_mode (str, optional): Either of 'resize' or 'pad'. If 'resize', the input images will + be resized (i.e. warped) to `(img_height, img_width)`. This mode does not preserve the aspect ratios of the images. + If 'pad', the input images will be first padded so that they have the aspect ratio defined by `img_height` + and `img_width` and then resized to `(img_height, img_width)`. This mode preserves the aspect ratios of the images. + model_mode (str, optional): The mode in which the model was created, i.e. 'training', 'inference' or 'inference_fast'. + This is needed in order to know whether the model output is already decoded or still needs to be decoded. Refer to + the model documentation for the meaning of the individual modes. + confidence_thresh (float, optional): A float in [0,1), the minimum classification confidence in a specific + positive class in order to be considered for the non-maximum suppression stage for the respective class. 
+ A lower value will result in a larger part of the selection process being done by the non-maximum suppression + stage, while a larger value will result in a larger part of the selection process happening in the confidence + thresholding stage. + iou_threshold (float, optional): A float in [0,1]. All boxes with a Jaccard similarity of greater than `iou_threshold` + with a locally maximal box will be removed from the set of predictions for a given class, where 'maximal' refers + to the box score. + top_k (int, optional): The number of highest scoring predictions to be kept for each batch item after the + non-maximum suppression stage. Defaults to 200, following the paper. + input_coords (str, optional): The box coordinate format that the model outputs. Can be either 'centroids' + for the format `(cx, cy, w, h)` (box center coordinates, width, and height), 'minmax' for the format + `(xmin, xmax, ymin, ymax)`, or 'corners' for the format `(xmin, ymin, xmax, ymax)`. + normalize_coords (bool, optional): Set to `True` if the model outputs relative coordinates (i.e. coordinates in [0,1]) + and you wish to transform these relative coordinates back to absolute coordinates. If the model outputs + relative coordinates, but you do not want to convert them back to absolute coordinates, set this to `False`. + Do not set this to `True` if the model already outputs absolute coordinates, as that would result in incorrect + coordinates. Requires `img_height` and `img_width` if set to `True`. + + Returns: + None. + ''' + + convert_to_3_channels = ConvertTo3Channels() + resize = Resize(height=img_height,width=img_width) + if data_generator_mode == 'resize': + transformations = [convert_to_3_channels, + resize] + elif data_generator_mode == 'pad': + random_pad = RandomPadFixedAR(patch_aspect_ratio=img_width/img_height, clip_boxes=False) + transformations = [convert_to_3_channels, + random_pad, + resize] + else: + raise ValueError("Unexpected argument value: `data_generator_mode` can be either of 'resize' or 'pad', but received '{}'.".format(data_generator_mode)) + + # Set the generator parameters. + generator = data_generator.generate(batch_size=batch_size, + shuffle=False, + transformations=transformations, + label_encoder=None, + returns={'processed_images', + 'image_ids', + 'inverse_transform'}, + keep_images_without_gt=True) + # Put the results in this list. + results = [] + # Compute the number of batches to iterate over the entire dataset. + n_images = data_generator.get_dataset_size() + print("Number of images in the evaluation dataset: {}".format(n_images)) + n_batches = int(ceil(n_images / batch_size)) + # Loop over all batches. + tr = trange(n_batches, file=sys.stdout) + tr.set_description('Producing results file') + for i in tr: + # Generate batch. + batch_X, batch_image_ids, batch_inverse_transforms = next(generator) + # Predict. + y_pred = model.predict(batch_X) + # If the model was created in 'training' mode, the raw predictions need to + # be decoded and filtered, otherwise that's already taken care of. + if model_mode == 'training': + # Decode. + y_pred = decode_detections(y_pred, + confidence_thresh=confidence_thresh, + iou_threshold=iou_threshold, + top_k=top_k, + input_coords=pred_coords, + normalize_coords=normalize_coords, + img_height=img_height, + img_width=img_width) + else: + # Filter out the all-zeros dummy elements of `y_pred`. 
+ y_pred_filtered = [] + for i in range(len(y_pred)): + y_pred_filtered.append(y_pred[i][y_pred[i,:,0] != 0]) + y_pred = y_pred_filtered + # Convert the predicted box coordinates for the original images. + y_pred = apply_inverse_transforms(y_pred, batch_inverse_transforms) + + # Convert each predicted box into the results format. + for k, batch_item in enumerate(y_pred): + for box in batch_item: + class_id = box[0] + # Transform the consecutive class IDs back to the original COCO category IDs. + cat_id = classes_to_cats[class_id] + # Round the box coordinates to reduce the JSON file size. + xmin = float(round(box[2], 1)) + ymin = float(round(box[3], 1)) + xmax = float(round(box[4], 1)) + ymax = float(round(box[5], 1)) + width = xmax - xmin + height = ymax - ymin + bbox = [xmin, ymin, width, height] + result = {} + result['image_id'] = batch_image_ids[k] + result['category_id'] = cat_id + result['score'] = float(round(box[1], 3)) + result['bbox'] = bbox + results.append(result) + + with open(out_file, 'w') as f: + json.dump(results, f) + + print("Prediction results saved in '{}'".format(out_file)) diff --git a/keras_ssd/keras_layers/__init__.py b/keras_ssd/keras_layers/__init__.py new file mode 100644 index 0000000..e69de29 diff --git a/keras_ssd/keras_layers/keras_layer_AnchorBoxes.py b/keras_ssd/keras_layers/keras_layer_AnchorBoxes.py new file mode 100644 index 0000000..83a7ab5 --- /dev/null +++ b/keras_ssd/keras_layers/keras_layer_AnchorBoxes.py @@ -0,0 +1,278 @@ +''' +A custom Keras layer to generate anchor boxes. + +Copyright (C) 2018 Pierluigi Ferrari + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +''' + +from __future__ import division +import numpy as np +import keras.backend as K +from keras.engine.topology import InputSpec +from keras.engine.topology import Layer + +from bounding_box_utils.bounding_box_utils import convert_coordinates + +class AnchorBoxes(Layer): + ''' + A Keras layer to create an output tensor containing anchor box coordinates + and variances based on the input tensor and the passed arguments. + + A set of 2D anchor boxes of different aspect ratios is created for each spatial unit of + the input tensor. The number of anchor boxes created per unit depends on the arguments + `aspect_ratios` and `two_boxes_for_ar1`, in the default case it is 4. The boxes + are parameterized by the coordinate tuple `(xmin, xmax, ymin, ymax)`. + + The logic implemented by this layer is identical to the logic in the module + `ssd_box_encode_decode_utils.py`. + + The purpose of having this layer in the network is to make the model self-sufficient + at inference time. Since the model is predicting offsets to the anchor boxes + (rather than predicting absolute box coordinates directly), one needs to know the anchor + box coordinates in order to construct the final prediction boxes from the predicted offsets. + If the model's output tensor did not contain the anchor box coordinates, the necessary + information to convert the predicted offsets back to absolute coordinates would be missing + in the model output. 
The reason why it is necessary to predict offsets to the anchor boxes + rather than to predict absolute box coordinates directly is explained in `README.md`. + + Input shape: + 4D tensor of shape `(batch, channels, height, width)` if `dim_ordering = 'th'` + or `(batch, height, width, channels)` if `dim_ordering = 'tf'`. + + Output shape: + 5D tensor of shape `(batch, height, width, n_boxes, 8)`. The last axis contains + the four anchor box coordinates and the four variance values for each box. + ''' + + def __init__(self, + img_height, + img_width, + this_scale, + next_scale, + aspect_ratios=[0.5, 1.0, 2.0], + two_boxes_for_ar1=True, + this_steps=None, + this_offsets=None, + clip_boxes=False, + variances=[0.1, 0.1, 0.2, 0.2], + coords='centroids', + normalize_coords=False, + **kwargs): + ''' + All arguments need to be set to the same values as in the box encoding process, otherwise the behavior is undefined. + Some of these arguments are explained in more detail in the documentation of the `SSDBoxEncoder` class. + + Arguments: + img_height (int): The height of the input images. + img_width (int): The width of the input images. + this_scale (float): A float in [0, 1], the scaling factor for the size of the generated anchor boxes + as a fraction of the shorter side of the input image. + next_scale (float): A float in [0, 1], the next larger scaling factor. Only relevant if + `self.two_boxes_for_ar1 == True`. + aspect_ratios (list, optional): The list of aspect ratios for which default boxes are to be + generated for this layer. + two_boxes_for_ar1 (bool, optional): Only relevant if `aspect_ratios` contains 1. + If `True`, two default boxes will be generated for aspect ratio 1. The first will be generated + using the scaling factor for the respective layer, the second one will be generated using + geometric mean of said scaling factor and next bigger scaling factor. + clip_boxes (bool, optional): If `True`, clips the anchor box coordinates to stay within image boundaries. + variances (list, optional): A list of 4 floats >0. The anchor box offset for each coordinate will be divided by + its respective variance value. + coords (str, optional): The box coordinate format to be used internally in the model (i.e. this is not the input format + of the ground truth labels). Can be either 'centroids' for the format `(cx, cy, w, h)` (box center coordinates, width, and height), + 'corners' for the format `(xmin, ymin, xmax, ymax)`, or 'minmax' for the format `(xmin, xmax, ymin, ymax)`. + normalize_coords (bool, optional): Set to `True` if the model uses relative instead of absolute coordinates, + i.e. if the model predicts box coordinates within [0,1] instead of absolute coordinates. 
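+
+ Illustrative example (default values): with `aspect_ratios=[0.5, 1.0, 2.0]` and `two_boxes_for_ar1=True`,
+ the layer generates `n_boxes = 4` anchor boxes per spatial cell of its input feature map: one box per
+ aspect ratio plus one extra box for aspect ratio 1.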
+ ''' + if K.backend() != 'tensorflow': + raise TypeError("This layer only supports TensorFlow at the moment, but you are using the {} backend.".format(K.backend())) + + if (this_scale < 0) or (next_scale < 0) or (this_scale > 1): + raise ValueError("`this_scale` must be in [0, 1] and `next_scale` must be >0, but `this_scale` == {}, `next_scale` == {}".format(this_scale, next_scale)) + + if len(variances) != 4: + raise ValueError("4 variance values must be pased, but {} values were received.".format(len(variances))) + variances = np.array(variances) + if np.any(variances <= 0): + raise ValueError("All variances must be >0, but the variances given are {}".format(variances)) + + self.img_height = img_height + self.img_width = img_width + self.this_scale = this_scale + self.next_scale = next_scale + self.aspect_ratios = aspect_ratios + self.two_boxes_for_ar1 = two_boxes_for_ar1 + self.this_steps = this_steps + self.this_offsets = this_offsets + self.clip_boxes = clip_boxes + self.variances = variances + self.coords = coords + self.normalize_coords = normalize_coords + # Compute the number of boxes per cell + if (1 in aspect_ratios) and two_boxes_for_ar1: + self.n_boxes = len(aspect_ratios) + 1 + else: + self.n_boxes = len(aspect_ratios) + super(AnchorBoxes, self).__init__(**kwargs) + + def build(self, input_shape): + self.input_spec = [InputSpec(shape=input_shape)] + super(AnchorBoxes, self).build(input_shape) + + def call(self, x, mask=None): + ''' + Return an anchor box tensor based on the shape of the input tensor. + + The logic implemented here is identical to the logic in the module `ssd_box_encode_decode_utils.py`. + + Note that this tensor does not participate in any graph computations at runtime. It is being created + as a constant once during graph creation and is just being output along with the rest of the model output + during runtime. Because of this, all logic is implemented as Numpy array operations and it is sufficient + to convert the resulting Numpy array into a Keras tensor at the very end before outputting it. + + Arguments: + x (tensor): 4D tensor of shape `(batch, channels, height, width)` if `dim_ordering = 'th'` + or `(batch, height, width, channels)` if `dim_ordering = 'tf'`. The input for this + layer must be the output of the localization predictor layer. + ''' + + # Compute box width and height for each aspect ratio + # The shorter side of the image will be used to compute `w` and `h` using `scale` and `aspect_ratios`. + size = min(self.img_height, self.img_width) + # Compute the box widths and and heights for all aspect ratios + wh_list = [] + for ar in self.aspect_ratios: + if (ar == 1): + # Compute the regular anchor box for aspect ratio 1. + box_height = box_width = self.this_scale * size + wh_list.append((box_width, box_height)) + if self.two_boxes_for_ar1: + # Compute one slightly larger version using the geometric mean of this scale value and the next. 
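+ # (Purely illustrative numbers: with this_scale=0.2, next_scale=0.37 and a 300 px shorter image side,
+ # this box would be sqrt(0.2 * 0.37) * 300 ≈ 81.6 px on a side.)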
+ box_height = box_width = np.sqrt(self.this_scale * self.next_scale) * size + wh_list.append((box_width, box_height)) + else: + box_height = self.this_scale * size / np.sqrt(ar) + box_width = self.this_scale * size * np.sqrt(ar) + wh_list.append((box_width, box_height)) + wh_list = np.array(wh_list) + + # We need the shape of the input tensor + if K.image_dim_ordering() == 'tf': + batch_size, feature_map_height, feature_map_width, feature_map_channels = x._keras_shape + else: # Not yet relevant since TensorFlow is the only supported backend right now, but it can't harm to have this in here for the future + batch_size, feature_map_channels, feature_map_height, feature_map_width = x._keras_shape + + # Compute the grid of box center points. They are identical for all aspect ratios. + + # Compute the step sizes, i.e. how far apart the anchor box center points will be vertically and horizontally. + if (self.this_steps is None): + step_height = self.img_height / feature_map_height + step_width = self.img_width / feature_map_width + else: + if isinstance(self.this_steps, (list, tuple)) and (len(self.this_steps) == 2): + step_height = self.this_steps[0] + step_width = self.this_steps[1] + elif isinstance(self.this_steps, (int, float)): + step_height = self.this_steps + step_width = self.this_steps + # Compute the offsets, i.e. at what pixel values the first anchor box center point will be from the top and from the left of the image. + if (self.this_offsets is None): + offset_height = 0.5 + offset_width = 0.5 + else: + if isinstance(self.this_offsets, (list, tuple)) and (len(self.this_offsets) == 2): + offset_height = self.this_offsets[0] + offset_width = self.this_offsets[1] + elif isinstance(self.this_offsets, (int, float)): + offset_height = self.this_offsets + offset_width = self.this_offsets + # Now that we have the offsets and step sizes, compute the grid of anchor box center points. 
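+ # (Illustrative numbers: for a 300x300 input and a 38x38 feature map with the default offset of 0.5,
+ # the step size is 300/38 ≈ 7.89 px, so the center points lie at roughly 3.95, 11.84, ..., 296.05 px.)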
+ cy = np.linspace(offset_height * step_height, (offset_height + feature_map_height - 1) * step_height, feature_map_height) + cx = np.linspace(offset_width * step_width, (offset_width + feature_map_width - 1) * step_width, feature_map_width) + cx_grid, cy_grid = np.meshgrid(cx, cy) + cx_grid = np.expand_dims(cx_grid, -1) # This is necessary for np.tile() to do what we want further down + cy_grid = np.expand_dims(cy_grid, -1) # This is necessary for np.tile() to do what we want further down + + # Create a 4D tensor template of shape `(feature_map_height, feature_map_width, n_boxes, 4)` + # where the last dimension will contain `(cx, cy, w, h)` + boxes_tensor = np.zeros((feature_map_height, feature_map_width, self.n_boxes, 4)) + + boxes_tensor[:, :, :, 0] = np.tile(cx_grid, (1, 1, self.n_boxes)) # Set cx + boxes_tensor[:, :, :, 1] = np.tile(cy_grid, (1, 1, self.n_boxes)) # Set cy + boxes_tensor[:, :, :, 2] = wh_list[:, 0] # Set w + boxes_tensor[:, :, :, 3] = wh_list[:, 1] # Set h + + # Convert `(cx, cy, w, h)` to `(xmin, xmax, ymin, ymax)` + boxes_tensor = convert_coordinates(boxes_tensor, start_index=0, conversion='centroids2corners') + + # If `clip_boxes` is enabled, clip the coordinates to lie within the image boundaries + if self.clip_boxes: + x_coords = boxes_tensor[:,:,:,[0, 2]] + x_coords[x_coords >= self.img_width] = self.img_width - 1 + x_coords[x_coords < 0] = 0 + boxes_tensor[:,:,:,[0, 2]] = x_coords + y_coords = boxes_tensor[:,:,:,[1, 3]] + y_coords[y_coords >= self.img_height] = self.img_height - 1 + y_coords[y_coords < 0] = 0 + boxes_tensor[:,:,:,[1, 3]] = y_coords + + # If `normalize_coords` is enabled, normalize the coordinates to be within [0,1] + if self.normalize_coords: + boxes_tensor[:, :, :, [0, 2]] /= self.img_width + boxes_tensor[:, :, :, [1, 3]] /= self.img_height + + # TODO: Implement box limiting directly for `(cx, cy, w, h)` so that we don't have to unnecessarily convert back and forth. + if self.coords == 'centroids': + # Convert `(xmin, ymin, xmax, ymax)` back to `(cx, cy, w, h)`. + boxes_tensor = convert_coordinates(boxes_tensor, start_index=0, conversion='corners2centroids', border_pixels='half') + elif self.coords == 'minmax': + # Convert `(xmin, ymin, xmax, ymax)` to `(xmin, xmax, ymin, ymax). + boxes_tensor = convert_coordinates(boxes_tensor, start_index=0, conversion='corners2minmax', border_pixels='half') + + # Create a tensor to contain the variances and append it to `boxes_tensor`. This tensor has the same shape + # as `boxes_tensor` and simply contains the same 4 variance values for every position in the last axis. 
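+ # (With the default `coords='centroids'` and `variances=[0.1, 0.1, 0.2, 0.2]`, the last axis of every
+ # anchor thus becomes `(cx, cy, w, h, 0.1, 0.1, 0.2, 0.2)` after the concatenation below.)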
+ variances_tensor = np.zeros_like(boxes_tensor) # Has shape `(feature_map_height, feature_map_width, n_boxes, 4)` + variances_tensor += self.variances # Long live broadcasting + # Now `boxes_tensor` becomes a tensor of shape `(feature_map_height, feature_map_width, n_boxes, 8)` + boxes_tensor = np.concatenate((boxes_tensor, variances_tensor), axis=-1) + + # Now prepend one dimension to `boxes_tensor` to account for the batch size and tile it along + # The result will be a 5D tensor of shape `(batch_size, feature_map_height, feature_map_width, n_boxes, 8)` + boxes_tensor = np.expand_dims(boxes_tensor, axis=0) + boxes_tensor = K.tile(K.constant(boxes_tensor, dtype='float32'), (K.shape(x)[0], 1, 1, 1, 1)) + + return boxes_tensor + + def compute_output_shape(self, input_shape): + if K.image_dim_ordering() == 'tf': + batch_size, feature_map_height, feature_map_width, feature_map_channels = input_shape + else: # Not yet relevant since TensorFlow is the only supported backend right now, but it can't harm to have this in here for the future + batch_size, feature_map_channels, feature_map_height, feature_map_width = input_shape + return (batch_size, feature_map_height, feature_map_width, self.n_boxes, 8) + + def get_config(self): + config = { + 'img_height': self.img_height, + 'img_width': self.img_width, + 'this_scale': self.this_scale, + 'next_scale': self.next_scale, + 'aspect_ratios': list(self.aspect_ratios), + 'two_boxes_for_ar1': self.two_boxes_for_ar1, + 'clip_boxes': self.clip_boxes, + 'variances': list(self.variances), + 'coords': self.coords, + 'normalize_coords': self.normalize_coords + } + base_config = super(AnchorBoxes, self).get_config() + return dict(list(base_config.items()) + list(config.items())) diff --git a/keras_ssd/keras_layers/keras_layer_DecodeDetections.py b/keras_ssd/keras_layers/keras_layer_DecodeDetections.py new file mode 100644 index 0000000..3fc4d57 --- /dev/null +++ b/keras_ssd/keras_layers/keras_layer_DecodeDetections.py @@ -0,0 +1,283 @@ +''' +A custom Keras layer to decode the raw SSD prediction output. Corresponds to the +`DetectionOutput` layer type in the original Caffe implementation of SSD. + +Copyright (C) 2018 Pierluigi Ferrari + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +''' + +from __future__ import division +import numpy as np +import tensorflow as tf +import keras.backend as K +from keras.engine.topology import InputSpec +from keras.engine.topology import Layer + +class DecodeDetections(Layer): + ''' + A Keras layer to decode the raw SSD prediction output. + + Input shape: + 3D tensor of shape `(batch_size, n_boxes, n_classes + 12)`. + + Output shape: + 3D tensor of shape `(batch_size, top_k, 6)`. + ''' + + def __init__(self, + confidence_thresh=0.01, + iou_threshold=0.45, + top_k=200, + nms_max_output_size=400, + coords='centroids', + normalize_coords=True, + img_height=None, + img_width=None, + **kwargs): + ''' + All default argument values follow the Caffe implementation. 
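+
+ In the 'centroids' format, the layer reverses the SSD offset encoding using the anchor box coordinates
+ and variances carried in the last eight elements of each prediction vector. A sketch of the decoding
+ formulas implemented in `call()` below:
+
+ cx = cx_pred * cx_variance * w_anchor + cx_anchor
+ cy = cy_pred * cy_variance * h_anchor + cy_anchor
+ w = exp(w_pred * w_variance) * w_anchor
+ h = exp(h_pred * h_variance) * h_anchor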
+ + Arguments: + confidence_thresh (float, optional): A float in [0,1), the minimum classification confidence in a specific + positive class in order to be considered for the non-maximum suppression stage for the respective class. + A lower value will result in a larger part of the selection process being done by the non-maximum suppression + stage, while a larger value will result in a larger part of the selection process happening in the confidence + thresholding stage. + iou_threshold (float, optional): A float in [0,1]. All boxes with a Jaccard similarity of greater than `iou_threshold` + with a locally maximal box will be removed from the set of predictions for a given class, where 'maximal' refers + to the box score. + top_k (int, optional): The number of highest scoring predictions to be kept for each batch item after the + non-maximum suppression stage. + nms_max_output_size (int, optional): The maximum number of predictions that will be left after performing non-maximum + suppression. + coords (str, optional): The box coordinate format that the model outputs. Must be 'centroids' + i.e. the format `(cx, cy, w, h)` (box center coordinates, width, and height). Other coordinate formats are + currently not supported. + normalize_coords (bool, optional): Set to `True` if the model outputs relative coordinates (i.e. coordinates in [0,1]) + and you wish to transform these relative coordinates back to absolute coordinates. If the model outputs + relative coordinates, but you do not want to convert them back to absolute coordinates, set this to `False`. + Do not set this to `True` if the model already outputs absolute coordinates, as that would result in incorrect + coordinates. Requires `img_height` and `img_width` if set to `True`. + img_height (int, optional): The height of the input images. Only needed if `normalize_coords` is `True`. + img_width (int, optional): The width of the input images. Only needed if `normalize_coords` is `True`. + ''' + if K.backend() != 'tensorflow': + raise TypeError("This layer only supports TensorFlow at the moment, but you are using the {} backend.".format(K.backend())) + + if normalize_coords and ((img_height is None) or (img_width is None)): + raise ValueError("If relative box coordinates are supposed to be converted to absolute coordinates, the decoder needs the image size in order to decode the predictions, but `img_height == {}` and `img_width == {}`".format(img_height, img_width)) + + if coords != 'centroids': + raise ValueError("The DetectionOutput layer currently only supports the 'centroids' coordinate format.") + + # We need these members for the config. + self.confidence_thresh = confidence_thresh + self.iou_threshold = iou_threshold + self.top_k = top_k + self.normalize_coords = normalize_coords + self.img_height = img_height + self.img_width = img_width + self.coords = coords + self.nms_max_output_size = nms_max_output_size + + # We need these members for TensorFlow. 
+ self.tf_confidence_thresh = tf.constant(self.confidence_thresh, name='confidence_thresh') + self.tf_iou_threshold = tf.constant(self.iou_threshold, name='iou_threshold') + self.tf_top_k = tf.constant(self.top_k, name='top_k') + self.tf_normalize_coords = tf.constant(self.normalize_coords, name='normalize_coords') + self.tf_img_height = tf.constant(self.img_height, dtype=tf.float32, name='img_height') + self.tf_img_width = tf.constant(self.img_width, dtype=tf.float32, name='img_width') + self.tf_nms_max_output_size = tf.constant(self.nms_max_output_size, name='nms_max_output_size') + + super(DecodeDetections, self).__init__(**kwargs) + + def build(self, input_shape): + self.input_spec = [InputSpec(shape=input_shape)] + super(DecodeDetections, self).build(input_shape) + + def call(self, y_pred, mask=None): + ''' + Returns: + 3D tensor of shape `(batch_size, top_k, 6)`. The second axis is zero-padded + to always yield `top_k` predictions per batch item. The last axis contains + the coordinates for each predicted box in the format + `[class_id, confidence, xmin, ymin, xmax, ymax]`. + ''' + + ##################################################################################### + # 1. Convert the box coordinates from predicted anchor box offsets to predicted + # absolute coordinates + ##################################################################################### + + # Convert anchor box offsets to image offsets. + cx = y_pred[...,-12] * y_pred[...,-4] * y_pred[...,-6] + y_pred[...,-8] # cx = cx_pred * cx_variance * w_anchor + cx_anchor + cy = y_pred[...,-11] * y_pred[...,-3] * y_pred[...,-5] + y_pred[...,-7] # cy = cy_pred * cy_variance * h_anchor + cy_anchor + w = tf.exp(y_pred[...,-10] * y_pred[...,-2]) * y_pred[...,-6] # w = exp(w_pred * variance_w) * w_anchor + h = tf.exp(y_pred[...,-9] * y_pred[...,-1]) * y_pred[...,-5] # h = exp(h_pred * variance_h) * h_anchor + + # Convert 'centroids' to 'corners'. + xmin = cx - 0.5 * w + ymin = cy - 0.5 * h + xmax = cx + 0.5 * w + ymax = cy + 0.5 * h + + # If the model predicts box coordinates relative to the image dimensions and they are supposed + # to be converted back to absolute coordinates, do that. + def normalized_coords(): + xmin1 = tf.expand_dims(xmin * self.tf_img_width, axis=-1) + ymin1 = tf.expand_dims(ymin * self.tf_img_height, axis=-1) + xmax1 = tf.expand_dims(xmax * self.tf_img_width, axis=-1) + ymax1 = tf.expand_dims(ymax * self.tf_img_height, axis=-1) + return xmin1, ymin1, xmax1, ymax1 + def non_normalized_coords(): + return tf.expand_dims(xmin, axis=-1), tf.expand_dims(ymin, axis=-1), tf.expand_dims(xmax, axis=-1), tf.expand_dims(ymax, axis=-1) + + xmin, ymin, xmax, ymax = tf.cond(self.tf_normalize_coords, normalized_coords, non_normalized_coords) + + # Concatenate the one-hot class confidences and the converted box coordinates to form the decoded predictions tensor. + y_pred = tf.concat(values=[y_pred[...,:-12], xmin, ymin, xmax, ymax], axis=-1) + + ##################################################################################### + # 2. Perform confidence thresholding, per-class non-maximum suppression, and + # top-k filtering. + ##################################################################################### + + batch_size = tf.shape(y_pred)[0] # Output dtype: tf.int32 + n_boxes = tf.shape(y_pred)[1] + n_classes = y_pred.shape[2] - 4 + class_indices = tf.range(1, n_classes) + + # Create a function that filters the predictions for the given batch item. 
Specifically, it performs: + # - confidence thresholding + # - non-maximum suppression (NMS) + # - top-k filtering + def filter_predictions(batch_item): + + # Create a function that filters the predictions for one single class. + def filter_single_class(index): + + # From a tensor of shape (n_boxes, n_classes + 4 coordinates) extract + # a tensor of shape (n_boxes, 1 + 4 coordinates) that contains the + # confidnece values for just one class, determined by `index`. + confidences = tf.expand_dims(batch_item[..., index], axis=-1) + class_id = tf.fill(dims=tf.shape(confidences), value=tf.to_float(index)) + box_coordinates = batch_item[...,-4:] + + single_class = tf.concat([class_id, confidences, box_coordinates], axis=-1) + + # Apply confidence thresholding with respect to the class defined by `index`. + threshold_met = single_class[:,1] > self.tf_confidence_thresh + single_class = tf.boolean_mask(tensor=single_class, + mask=threshold_met) + + # If any boxes made the threshold, perform NMS. + def perform_nms(): + scores = single_class[...,1] + + # `tf.image.non_max_suppression()` needs the box coordinates in the format `(ymin, xmin, ymax, xmax)`. + xmin = tf.expand_dims(single_class[...,-4], axis=-1) + ymin = tf.expand_dims(single_class[...,-3], axis=-1) + xmax = tf.expand_dims(single_class[...,-2], axis=-1) + ymax = tf.expand_dims(single_class[...,-1], axis=-1) + boxes = tf.concat(values=[ymin, xmin, ymax, xmax], axis=-1) + + maxima_indices = tf.image.non_max_suppression(boxes=boxes, + scores=scores, + max_output_size=self.tf_nms_max_output_size, + iou_threshold=self.iou_threshold, + name='non_maximum_suppresion') + maxima = tf.gather(params=single_class, + indices=maxima_indices, + axis=0) + return maxima + + def no_confident_predictions(): + return tf.constant(value=0.0, shape=(1,6)) + + single_class_nms = tf.cond(tf.equal(tf.size(single_class), 0), no_confident_predictions, perform_nms) + + # Make sure `single_class` is exactly `self.nms_max_output_size` elements long. + padded_single_class = tf.pad(tensor=single_class_nms, + paddings=[[0, self.tf_nms_max_output_size - tf.shape(single_class_nms)[0]], [0, 0]], + mode='CONSTANT', + constant_values=0.0) + + return padded_single_class + + # Iterate `filter_single_class()` over all class indices. + filtered_single_classes = tf.map_fn(fn=lambda i: filter_single_class(i), + elems=tf.range(1,n_classes), + dtype=tf.float32, + parallel_iterations=128, + back_prop=False, + swap_memory=False, + infer_shape=True, + name='loop_over_classes') + + # Concatenate the filtered results for all individual classes to one tensor. + filtered_predictions = tf.reshape(tensor=filtered_single_classes, shape=(-1,6)) + + # Perform top-k filtering for this batch item or pad it in case there are + # fewer than `self.top_k` boxes left at this point. Either way, produce a + # tensor of length `self.top_k`. By the time we return the final results tensor + # for the whole batch, all batch items must have the same number of predicted + # boxes so that the tensor dimensions are homogenous. If fewer than `self.top_k` + # predictions are left after the filtering process above, we pad the missing + # predictions with zeros as dummy entries. 
+ def top_k(): + return tf.gather(params=filtered_predictions, + indices=tf.nn.top_k(filtered_predictions[:, 1], k=self.tf_top_k, sorted=True).indices, + axis=0) + def pad_and_top_k(): + padded_predictions = tf.pad(tensor=filtered_predictions, + paddings=[[0, self.tf_top_k - tf.shape(filtered_predictions)[0]], [0, 0]], + mode='CONSTANT', + constant_values=0.0) + return tf.gather(params=padded_predictions, + indices=tf.nn.top_k(padded_predictions[:, 1], k=self.tf_top_k, sorted=True).indices, + axis=0) + + top_k_boxes = tf.cond(tf.greater_equal(tf.shape(filtered_predictions)[0], self.tf_top_k), top_k, pad_and_top_k) + + return top_k_boxes + + # Iterate `filter_predictions()` over all batch items. + output_tensor = tf.map_fn(fn=lambda x: filter_predictions(x), + elems=y_pred, + dtype=None, + parallel_iterations=128, + back_prop=False, + swap_memory=False, + infer_shape=True, + name='loop_over_batch') + + return output_tensor + + def compute_output_shape(self, input_shape): + batch_size, n_boxes, last_axis = input_shape + return (batch_size, self.tf_top_k, 6) # Last axis: (class_ID, confidence, 4 box coordinates) + + def get_config(self): + config = { + 'confidence_thresh': self.confidence_thresh, + 'iou_threshold': self.iou_threshold, + 'top_k': self.top_k, + 'nms_max_output_size': self.nms_max_output_size, + 'coords': self.coords, + 'normalize_coords': self.normalize_coords, + 'img_height': self.img_height, + 'img_width': self.img_width, + } + base_config = super(DecodeDetections, self).get_config() + return dict(list(base_config.items()) + list(config.items())) diff --git a/keras_ssd/keras_layers/keras_layer_DecodeDetectionsFast.py b/keras_ssd/keras_layers/keras_layer_DecodeDetectionsFast.py new file mode 100644 index 0000000..f8ab221 --- /dev/null +++ b/keras_ssd/keras_layers/keras_layer_DecodeDetectionsFast.py @@ -0,0 +1,266 @@ +''' +A custom Keras layer to decode the raw SSD prediction output. This is a modified +and more efficient version of the `DetectionOutput` layer type in the original Caffe +implementation of SSD. For a faithful replication of the original layer, please +refer to the `DecodeDetections` layer. + +Copyright (C) 2018 Pierluigi Ferrari + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +''' + +from __future__ import division +import numpy as np +import tensorflow as tf +import keras.backend as K +from keras.engine.topology import InputSpec +from keras.engine.topology import Layer + +class DecodeDetectionsFast(Layer): + ''' + A Keras layer to decode the raw SSD prediction output. + + Input shape: + 3D tensor of shape `(batch_size, n_boxes, n_classes + 12)`. + + Output shape: + 3D tensor of shape `(batch_size, top_k, 6)`. + ''' + + def __init__(self, + confidence_thresh=0.01, + iou_threshold=0.45, + top_k=200, + nms_max_output_size=400, + coords='centroids', + normalize_coords=True, + img_height=None, + img_width=None, + **kwargs): + ''' + All default argument values follow the Caffe implementation. 
+ + Arguments: + confidence_thresh (float, optional): A float in [0,1), the minimum classification confidence in a specific + positive class in order to be considered for the non-maximum suppression stage for the respective class. + A lower value will result in a larger part of the selection process being done by the non-maximum suppression + stage, while a larger value will result in a larger part of the selection process happening in the confidence + thresholding stage. + iou_threshold (float, optional): A float in [0,1]. All boxes with a Jaccard similarity of greater than `iou_threshold` + with a locally maximal box will be removed from the set of predictions for a given class, where 'maximal' refers + to the box score. + top_k (int, optional): The number of highest scoring predictions to be kept for each batch item after the + non-maximum suppression stage. + nms_max_output_size (int, optional): The maximum number of predictions that will be left after performing non-maximum + suppression. + coords (str, optional): The box coordinate format that the model outputs. Must be 'centroids' + i.e. the format `(cx, cy, w, h)` (box center coordinates, width, and height). Other coordinate formats are + currently not supported. + normalize_coords (bool, optional): Set to `True` if the model outputs relative coordinates (i.e. coordinates in [0,1]) + and you wish to transform these relative coordinates back to absolute coordinates. If the model outputs + relative coordinates, but you do not want to convert them back to absolute coordinates, set this to `False`. + Do not set this to `True` if the model already outputs absolute coordinates, as that would result in incorrect + coordinates. Requires `img_height` and `img_width` if set to `True`. + img_height (int, optional): The height of the input images. Only needed if `normalize_coords` is `True`. + img_width (int, optional): The width of the input images. Only needed if `normalize_coords` is `True`. + ''' + if K.backend() != 'tensorflow': + raise TypeError("This layer only supports TensorFlow at the moment, but you are using the {} backend.".format(K.backend())) + + if normalize_coords and ((img_height is None) or (img_width is None)): + raise ValueError("If relative box coordinates are supposed to be converted to absolute coordinates, the decoder needs the image size in order to decode the predictions, but `img_height == {}` and `img_width == {}`".format(img_height, img_width)) + + if coords != 'centroids': + raise ValueError("The DetectionOutput layer currently only supports the 'centroids' coordinate format.") + + # We need these members for the config. + self.confidence_thresh = confidence_thresh + self.iou_threshold = iou_threshold + self.top_k = top_k + self.normalize_coords = normalize_coords + self.img_height = img_height + self.img_width = img_width + self.coords = coords + self.nms_max_output_size = nms_max_output_size + + # We need these members for TensorFlow. 
+ self.tf_confidence_thresh = tf.constant(self.confidence_thresh, name='confidence_thresh') + self.tf_iou_threshold = tf.constant(self.iou_threshold, name='iou_threshold') + self.tf_top_k = tf.constant(self.top_k, name='top_k') + self.tf_normalize_coords = tf.constant(self.normalize_coords, name='normalize_coords') + self.tf_img_height = tf.constant(self.img_height, dtype=tf.float32, name='img_height') + self.tf_img_width = tf.constant(self.img_width, dtype=tf.float32, name='img_width') + self.tf_nms_max_output_size = tf.constant(self.nms_max_output_size, name='nms_max_output_size') + + super(DecodeDetectionsFast, self).__init__(**kwargs) + + def build(self, input_shape): + self.input_spec = [InputSpec(shape=input_shape)] + super(DecodeDetectionsFast, self).build(input_shape) + + def call(self, y_pred, mask=None): + ''' + Returns: + 3D tensor of shape `(batch_size, top_k, 6)`. The second axis is zero-padded + to always yield `top_k` predictions per batch item. The last axis contains + the coordinates for each predicted box in the format + `[class_id, confidence, xmin, ymin, xmax, ymax]`. + ''' + + ##################################################################################### + # 1. Convert the box coordinates from predicted anchor box offsets to predicted + # absolute coordinates + ##################################################################################### + + # Extract the predicted class IDs as the indices of the highest confidence values. + class_ids = tf.expand_dims(tf.to_float(tf.argmax(y_pred[...,:-12], axis=-1)), axis=-1) + # Extract the confidences of the maximal classes. + confidences = tf.reduce_max(y_pred[...,:-12], axis=-1, keep_dims=True) + + # Convert anchor box offsets to image offsets. + cx = y_pred[...,-12] * y_pred[...,-4] * y_pred[...,-6] + y_pred[...,-8] # cx = cx_pred * cx_variance * w_anchor + cx_anchor + cy = y_pred[...,-11] * y_pred[...,-3] * y_pred[...,-5] + y_pred[...,-7] # cy = cy_pred * cy_variance * h_anchor + cy_anchor + w = tf.exp(y_pred[...,-10] * y_pred[...,-2]) * y_pred[...,-6] # w = exp(w_pred * variance_w) * w_anchor + h = tf.exp(y_pred[...,-9] * y_pred[...,-1]) * y_pred[...,-5] # h = exp(h_pred * variance_h) * h_anchor + + # Convert 'centroids' to 'corners'. + xmin = cx - 0.5 * w + ymin = cy - 0.5 * h + xmax = cx + 0.5 * w + ymax = cy + 0.5 * h + + # If the model predicts box coordinates relative to the image dimensions and they are supposed + # to be converted back to absolute coordinates, do that. + def normalized_coords(): + xmin1 = tf.expand_dims(xmin * self.tf_img_width, axis=-1) + ymin1 = tf.expand_dims(ymin * self.tf_img_height, axis=-1) + xmax1 = tf.expand_dims(xmax * self.tf_img_width, axis=-1) + ymax1 = tf.expand_dims(ymax * self.tf_img_height, axis=-1) + return xmin1, ymin1, xmax1, ymax1 + def non_normalized_coords(): + return tf.expand_dims(xmin, axis=-1), tf.expand_dims(ymin, axis=-1), tf.expand_dims(xmax, axis=-1), tf.expand_dims(ymax, axis=-1) + + xmin, ymin, xmax, ymax = tf.cond(self.tf_normalize_coords, normalized_coords, non_normalized_coords) + + # Concatenate the one-hot class confidences and the converted box coordinates to form the decoded predictions tensor. + y_pred = tf.concat(values=[class_ids, confidences, xmin, ymin, xmax, ymax], axis=-1) + + ##################################################################################### + # 2. Perform confidence thresholding, non-maximum suppression, and top-k filtering. 
+ ##################################################################################### + + batch_size = tf.shape(y_pred)[0] # Output dtype: tf.int32 + n_boxes = tf.shape(y_pred)[1] + n_classes = y_pred.shape[2] - 4 + class_indices = tf.range(1, n_classes) + + # Create a function that filters the predictions for the given batch item. Specifically, it performs: + # - confidence thresholding + # - non-maximum suppression (NMS) + # - top-k filtering + def filter_predictions(batch_item): + + # Keep only the non-background boxes. + positive_boxes = tf.not_equal(batch_item[...,0], 0.0) + predictions = tf.boolean_mask(tensor=batch_item, + mask=positive_boxes) + + def perform_confidence_thresholding(): + # Apply confidence thresholding. + threshold_met = predictions[:,1] > self.tf_confidence_thresh + return tf.boolean_mask(tensor=predictions, + mask=threshold_met) + def no_positive_boxes(): + return tf.constant(value=0.0, shape=(1,6)) + + # If there are any positive predictions, perform confidence thresholding. + predictions_conf_thresh = tf.cond(tf.equal(tf.size(predictions), 0), no_positive_boxes, perform_confidence_thresholding) + + def perform_nms(): + scores = predictions_conf_thresh[...,1] + + # `tf.image.non_max_suppression()` needs the box coordinates in the format `(ymin, xmin, ymax, xmax)`. + xmin = tf.expand_dims(predictions_conf_thresh[...,-4], axis=-1) + ymin = tf.expand_dims(predictions_conf_thresh[...,-3], axis=-1) + xmax = tf.expand_dims(predictions_conf_thresh[...,-2], axis=-1) + ymax = tf.expand_dims(predictions_conf_thresh[...,-1], axis=-1) + boxes = tf.concat(values=[ymin, xmin, ymax, xmax], axis=-1) + + maxima_indices = tf.image.non_max_suppression(boxes=boxes, + scores=scores, + max_output_size=self.tf_nms_max_output_size, + iou_threshold=self.iou_threshold, + name='non_maximum_suppression') + maxima = tf.gather(params=predictions_conf_thresh, + indices=maxima_indices, + axis=0) + return maxima + def no_confident_predictions(): + return tf.constant(value=0.0, shape=(1,6)) + + # If any boxes made the threshold, perform NMS. + predictions_nms = tf.cond(tf.equal(tf.size(predictions_conf_thresh), 0), no_confident_predictions, perform_nms) + + # Perform top-k filtering for this batch item or pad it in case there are + # fewer than `self.top_k` boxes left at this point. Either way, produce a + # tensor of length `self.top_k`. By the time we return the final results tensor + # for the whole batch, all batch items must have the same number of predicted + # boxes so that the tensor dimensions are homogeneous. If fewer than `self.top_k` + # predictions are left after the filtering process above, we pad the missing + # predictions with zeros as dummy entries. + def top_k(): + return tf.gather(params=predictions_nms, + indices=tf.nn.top_k(predictions_nms[:, 1], k=self.tf_top_k, sorted=True).indices, + axis=0) + def pad_and_top_k(): + padded_predictions = tf.pad(tensor=predictions_nms, + paddings=[[0, self.tf_top_k - tf.shape(predictions_nms)[0]], [0, 0]], + mode='CONSTANT', + constant_values=0.0) + return tf.gather(params=padded_predictions, + indices=tf.nn.top_k(padded_predictions[:, 1], k=self.tf_top_k, sorted=True).indices, + axis=0) + + top_k_boxes = tf.cond(tf.greater_equal(tf.shape(predictions_nms)[0], self.tf_top_k), top_k, pad_and_top_k) + + return top_k_boxes + + # Iterate `filter_predictions()` over all batch items.
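The comments in step 1 above spell out how the predicted offsets, the anchor boxes, and the variances combine. The following stand-alone NumPy sketch (not part of the patch; all numbers are invented) traces that arithmetic for a single box in the 'centroids' format and then converts it to absolute corner coordinates, mirroring the `normalized_coords()` branch.

import numpy as np

offsets   = np.array([0.2, -0.1, 0.05, 0.3])   # predicted (cx, cy, w, h) offsets for one box
anchor    = np.array([0.5, 0.5, 0.2, 0.4])     # matching anchor box (cx, cy, w, h), relative coordinates
variances = np.array([0.1, 0.1, 0.2, 0.2])     # the variance values the offsets were divided by during encoding

cx = offsets[0] * variances[0] * anchor[2] + anchor[0]   # cx = cx_pred * cx_variance * w_anchor + cx_anchor
cy = offsets[1] * variances[1] * anchor[3] + anchor[1]   # cy = cy_pred * cy_variance * h_anchor + cy_anchor
w  = np.exp(offsets[2] * variances[2]) * anchor[2]       # w = exp(w_pred * variance_w) * w_anchor
h  = np.exp(offsets[3] * variances[3]) * anchor[3]       # h = exp(h_pred * variance_h) * h_anchor

# 'centroids' -> 'corners', then back to absolute pixel coordinates, which is
# what the `normalized_coords()` branch does when `normalize_coords=True`.
img_height, img_width = 300, 300
xmin, xmax = (cx - 0.5 * w) * img_width,  (cx + 0.5 * w) * img_width
ymin, ymax = (cy - 0.5 * h) * img_height, (cy + 0.5 * h) * img_height
print(xmin, ymin, xmax, ymax)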
+ output_tensor = tf.map_fn(fn=lambda x: filter_predictions(x), + elems=y_pred, + dtype=None, + parallel_iterations=128, + back_prop=False, + swap_memory=False, + infer_shape=True, + name='loop_over_batch') + + return output_tensor + + def compute_output_shape(self, input_shape): + batch_size, n_boxes, last_axis = input_shape + return (batch_size, self.tf_top_k, 6) # Last axis: (class_ID, confidence, 4 box coordinates) + + def get_config(self): + config = { + 'confidence_thresh': self.confidence_thresh, + 'iou_threshold': self.iou_threshold, + 'top_k': self.top_k, + 'nms_max_output_size': self.nms_max_output_size, + 'coords': self.coords, + 'normalize_coords': self.normalize_coords, + 'img_height': self.img_height, + 'img_width': self.img_width, + } + base_config = super(DecodeDetectionsFast, self).get_config() + return dict(list(base_config.items()) + list(config.items())) diff --git a/keras_ssd/keras_layers/keras_layer_L2Normalization.py b/keras_ssd/keras_layers/keras_layer_L2Normalization.py new file mode 100644 index 0000000..e2c71bf --- /dev/null +++ b/keras_ssd/keras_layers/keras_layer_L2Normalization.py @@ -0,0 +1,70 @@ +''' +A custom Keras layer to perform L2-normalization. + +Copyright (C) 2018 Pierluigi Ferrari + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +''' + +from __future__ import division +import numpy as np +import keras.backend as K +from keras.engine.topology import InputSpec +from keras.engine.topology import Layer + +class L2Normalization(Layer): + ''' + Performs L2 normalization on the input tensor with a learnable scaling parameter + as described in the paper "Parsenet: Looking Wider to See Better" (see references) + and as used in the original SSD model. + + Arguments: + gamma_init (int): The initial scaling parameter. Defaults to 20 following the + SSD paper. + + Input shape: + 4D tensor of shape `(batch, channels, height, width)` if `dim_ordering = 'th'` + or `(batch, height, width, channels)` if `dim_ordering = 'tf'`. + + Returns: + The scaled tensor. Same shape as the input tensor. 
+ + References: + http://cs.unc.edu/~wliu/papers/parsenet.pdf + ''' + + def __init__(self, gamma_init=20, **kwargs): + if K.image_dim_ordering() == 'tf': + self.axis = 3 + else: + self.axis = 1 + self.gamma_init = gamma_init + super(L2Normalization, self).__init__(**kwargs) + + def build(self, input_shape): + self.input_spec = [InputSpec(shape=input_shape)] + gamma = self.gamma_init * np.ones((input_shape[self.axis],)) + self.gamma = K.variable(gamma, name='{}_gamma'.format(self.name)) + self.trainable_weights = [self.gamma] + super(L2Normalization, self).build(input_shape) + + def call(self, x, mask=None): + output = K.l2_normalize(x, self.axis) + return output * self.gamma + + def get_config(self): + config = { + 'gamma_init': self.gamma_init + } + base_config = super(L2Normalization, self).get_config() + return dict(list(base_config.items()) + list(config.items())) diff --git a/keras_ssd/keras_loss_function/__init__.py b/keras_ssd/keras_loss_function/__init__.py new file mode 100644 index 0000000..e69de29 diff --git a/keras_ssd/keras_loss_function/keras_ssd_loss.py b/keras_ssd/keras_loss_function/keras_ssd_loss.py new file mode 100644 index 0000000..83567f5 --- /dev/null +++ b/keras_ssd/keras_loss_function/keras_ssd_loss.py @@ -0,0 +1,211 @@ +''' +The Keras-compatible loss function for the SSD model. Currently supports TensorFlow only. + +Copyright (C) 2018 Pierluigi Ferrari + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +''' + +from __future__ import division +import tensorflow as tf + +class SSDLoss: + ''' + The SSD loss, see https://arxiv.org/abs/1512.02325. + ''' + + def __init__(self, + neg_pos_ratio=3, + n_neg_min=0, + alpha=1.0): + ''' + Arguments: + neg_pos_ratio (int, optional): The maximum ratio of negative (i.e. background) + to positive ground truth boxes to include in the loss computation. + There are no actual background ground truth boxes of course, but `y_true` + contains anchor boxes labeled with the background class. Since + the number of background boxes in `y_true` will usually exceed + the number of positive boxes by far, it is necessary to balance + their influence on the loss. Defaults to 3 following the paper. + n_neg_min (int, optional): The minimum number of negative ground truth boxes to + enter the loss computation *per batch*. This argument can be used to make + sure that the model learns from a minimum number of negatives in batches + in which there are very few, or even none at all, positive ground truth + boxes. It defaults to 0 and if used, it should be set to a value that + stands in reasonable proportion to the batch size used for training. + alpha (float, optional): A factor to weight the localization loss in the + computation of the total loss. Defaults to 1.0 following the paper. + ''' + self.neg_pos_ratio = neg_pos_ratio + self.n_neg_min = n_neg_min + self.alpha = alpha + + def smooth_L1_loss(self, y_true, y_pred): + ''' + Compute smooth L1 loss, see references. + + Arguments: + y_true (nD tensor): A TensorFlow tensor of any shape containing the ground truth data. 
+ In this context, the expected tensor has shape `(batch_size, #boxes, 4)` and + contains the ground truth bounding box coordinates, where the last dimension + contains `(xmin, xmax, ymin, ymax)`. + y_pred (nD tensor): A TensorFlow tensor of identical structure to `y_true` containing + the predicted data, in this context the predicted bounding box coordinates. + + Returns: + The smooth L1 loss, a nD-1 Tensorflow tensor. In this context a 2D tensor + of shape (batch, n_boxes_total). + + References: + https://arxiv.org/abs/1504.08083 + ''' + absolute_loss = tf.abs(y_true - y_pred) + square_loss = 0.5 * (y_true - y_pred)**2 + l1_loss = tf.where(tf.less(absolute_loss, 1.0), square_loss, absolute_loss - 0.5) + return tf.reduce_sum(l1_loss, axis=-1) + + def log_loss(self, y_true, y_pred): + ''' + Compute the softmax log loss. + + Arguments: + y_true (nD tensor): A TensorFlow tensor of any shape containing the ground truth data. + In this context, the expected tensor has shape (batch_size, #boxes, #classes) + and contains the ground truth bounding box categories. + y_pred (nD tensor): A TensorFlow tensor of identical structure to `y_true` containing + the predicted data, in this context the predicted bounding box categories. + + Returns: + The softmax log loss, a nD-1 Tensorflow tensor. In this context a 2D tensor + of shape (batch, n_boxes_total). + ''' + # Make sure that `y_pred` doesn't contain any zeros (which would break the log function) + y_pred = tf.maximum(y_pred, 1e-15) + # Compute the log loss + log_loss = -tf.reduce_sum(y_true * tf.log(y_pred), axis=-1) + return log_loss + + def compute_loss(self, y_true, y_pred): + ''' + Compute the loss of the SSD model prediction against the ground truth. + + Arguments: + y_true (array): A Numpy array of shape `(batch_size, #boxes, #classes + 12)`, + where `#boxes` is the total number of boxes that the model predicts + per image. Be careful to make sure that the index of each given + box in `y_true` is the same as the index for the corresponding + box in `y_pred`. The last axis must have length `#classes + 12` and contain + `[classes one-hot encoded, 4 ground truth box coordinate offsets, 8 arbitrary entries]` + in this order, including the background class. The last eight entries of the + last axis are not used by this function and therefore their contents are + irrelevant, they only exist so that `y_true` has the same shape as `y_pred`, + where the last four entries of the last axis contain the anchor box + coordinates, which are needed during inference. Important: Boxes that + you want the cost function to ignore need to have a one-hot + class vector of all zeros. + y_pred (Keras tensor): The model prediction. The shape is identical + to that of `y_true`, i.e. `(batch_size, #boxes, #classes + 12)`. + The last axis must contain entries in the format + `[classes one-hot encoded, 4 predicted box coordinate offsets, 8 arbitrary entries]`. + + Returns: + A scalar, the total multitask loss for classification and localization. + ''' + self.neg_pos_ratio = tf.constant(self.neg_pos_ratio) + self.n_neg_min = tf.constant(self.n_neg_min) + self.alpha = tf.constant(self.alpha) + + batch_size = tf.shape(y_pred)[0] # Output dtype: tf.int32 + n_boxes = tf.shape(y_pred)[1] # Output dtype: tf.int32, note that `n_boxes` in this context denotes the total number of boxes per image, not the number of boxes per cell. + + # 1: Compute the losses for class and box predictions for every box. 
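To make the two per-box losses defined above concrete, here is a stand-alone NumPy evaluation on toy values (one box, three classes); the numbers are invented and the snippet is illustration only.

import numpy as np

# Smooth L1 on the four box offsets: 0.5*x^2 where |x| < 1, and |x| - 0.5 otherwise, summed over the coordinates.
y_true_loc = np.array([0.5, -0.2, 1.8, 0.0])
y_pred_loc = np.array([0.3,  0.1, 0.2, 0.1])
diff = np.abs(y_true_loc - y_pred_loc)
smooth_l1 = np.sum(np.where(diff < 1.0, 0.5 * diff ** 2, diff - 0.5))

# Softmax log loss on the class confidences: -sum(y_true * log(y_pred)), with y_pred clipped away from zero.
y_true_cls = np.array([0.0, 1.0, 0.0])           # one-hot ground truth
y_pred_cls = np.array([0.1, 0.7, 0.2])           # softmax output
log_loss = -np.sum(y_true_cls * np.log(np.maximum(y_pred_cls, 1e-15)))

print(smooth_l1, log_loss)                       # roughly 1.17 and 0.36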
+ + classification_loss = tf.to_float(self.log_loss(y_true[:,:,:-12], y_pred[:,:,:-12])) # Output shape: (batch_size, n_boxes) + localization_loss = tf.to_float(self.smooth_L1_loss(y_true[:,:,-12:-8], y_pred[:,:,-12:-8])) # Output shape: (batch_size, n_boxes) + + # 2: Compute the classification losses for the positive and negative targets. + + # Create masks for the positive and negative ground truth classes. + negatives = y_true[:,:,0] # Tensor of shape (batch_size, n_boxes) + positives = tf.to_float(tf.reduce_max(y_true[:,:,1:-12], axis=-1)) # Tensor of shape (batch_size, n_boxes) + + # Count the number of positive boxes (classes 1 to n) in y_true across the whole batch. + n_positive = tf.reduce_sum(positives) + + # Now mask all negative boxes and sum up the losses for the positive boxes PER batch item + # (Keras loss functions must output one scalar loss value PER batch item, rather than just + # one scalar for the entire batch, that's why we're not summing across all axes). + pos_class_loss = tf.reduce_sum(classification_loss * positives, axis=-1) # Tensor of shape (batch_size,) + + # Compute the classification loss for the negative default boxes (if there are any). + + # First, compute the classification loss for all negative boxes. + neg_class_loss_all = classification_loss * negatives # Tensor of shape (batch_size, n_boxes) + n_neg_losses = tf.count_nonzero(neg_class_loss_all, dtype=tf.int32) # The number of non-zero loss entries in `neg_class_loss_all` + # What's the point of `n_neg_losses`? For the next step, which will be to compute which negative boxes enter the classification + # loss, we don't just want to know how many negative ground truth boxes there are, but for how many of those there actually is + # a positive (i.e. non-zero) loss. This is necessary because `tf.nn.top_k()` in the function below will pick the top k boxes with + # the highest losses no matter what, even if it receives a vector where all losses are zero. In the unlikely event that all negative + # classification losses ARE actually zero though, this behavior might lead to `tf.nn.top_k()` returning the indices of positive + # boxes, leading to an incorrect negative classification loss computation, and hence an incorrect overall loss computation. + # We therefore need to make sure that `n_negative_keep`, which assumes the role of the `k` argument in `tf.nn.top_k()`, + # is at most the number of negative boxes for which there is a positive classification loss. + + # Compute the number of negative examples we want to account for in the loss. + # We'll keep at most `self.neg_pos_ratio` times the number of positives in `y_true`, but at least `self.n_neg_min` (unless `n_neg_losses` is smaller). + n_negative_keep = tf.minimum(tf.maximum(self.neg_pos_ratio * tf.to_int32(n_positive), self.n_neg_min), n_neg_losses) + + # In the unlikely case when either (1) there are no negative ground truth boxes at all + # or (2) the classification loss for all negative boxes is zero, return zero as the `neg_class_loss`. + def f1(): + return tf.zeros([batch_size]) + # Otherwise compute the negative loss. + def f2(): + # Now we'll identify the top-k (where k == `n_negative_keep`) boxes with the highest confidence loss that + # belong to the background class in the ground truth data. Note that this doesn't necessarily mean that the model + # predicted the wrong class for those boxes, it just means that the loss for those boxes is the highest. + + # To do this, we reshape `neg_class_loss_all` to 1D...
+ neg_class_loss_all_1D = tf.reshape(neg_class_loss_all, [-1]) # Tensor of shape (batch_size * n_boxes,) + # ...and then we get the indices for the `n_negative_keep` boxes with the highest loss out of those... + values, indices = tf.nn.top_k(neg_class_loss_all_1D, + k=n_negative_keep, + sorted=False) # We don't need them sorted. + # ...and with these indices we'll create a mask... + negatives_keep = tf.scatter_nd(indices=tf.expand_dims(indices, axis=1), + updates=tf.ones_like(indices, dtype=tf.int32), + shape=tf.shape(neg_class_loss_all_1D)) # Tensor of shape (batch_size * n_boxes,) + negatives_keep = tf.to_float(tf.reshape(negatives_keep, [batch_size, n_boxes])) # Tensor of shape (batch_size, n_boxes) + # ...and use it to keep only those boxes and mask all other classification losses + neg_class_loss = tf.reduce_sum(classification_loss * negatives_keep, axis=-1) # Tensor of shape (batch_size,) + return neg_class_loss + + neg_class_loss = tf.cond(tf.equal(n_neg_losses, tf.constant(0)), f1, f2) + + class_loss = pos_class_loss + neg_class_loss # Tensor of shape (batch_size,) + + # 3: Compute the localization loss for the positive targets. + # We don't compute a localization loss for negative predicted boxes (obviously: there are no ground truth boxes they would correspond to). + + loc_loss = tf.reduce_sum(localization_loss * positives, axis=-1) # Tensor of shape (batch_size,) + + # 4: Compute the total loss. + + total_loss = (class_loss + self.alpha * loc_loss) / tf.maximum(1.0, n_positive) # In case `n_positive == 0` + # Keras has the annoying habit of dividing the loss by the batch size, which sucks in our case + # because the relevant criterion to average our loss over is the number of positive boxes in the batch + # (by which we're dividing in the line above), not the batch size. So in order to revert Keras' averaging + # over the batch size, we'll have to multiply by it. + total_loss = total_loss * tf.to_float(batch_size) + + return total_loss diff --git a/keras_ssd/misc_utils/__init__.py b/keras_ssd/misc_utils/__init__.py new file mode 100644 index 0000000..e69de29 diff --git a/keras_ssd/misc_utils/tensor_sampling_utils.py b/keras_ssd/misc_utils/tensor_sampling_utils.py new file mode 100644 index 0000000..a27ce1d --- /dev/null +++ b/keras_ssd/misc_utils/tensor_sampling_utils.py @@ -0,0 +1,177 @@ +''' +Utilities that are useful to sub- or up-sample weights tensors. + +Copyright (C) 2018 Pierluigi Ferrari + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +''' + +import numpy as np + +def sample_tensors(weights_list, sampling_instructions, axes=None, init=None, mean=0.0, stddev=0.005): + ''' + Can sub-sample and/or up-sample individual dimensions of the tensors in the given list + of input tensors. + + It is possible to sub-sample some dimensions and up-sample other dimensions at the same time. + + The tensors in the list will be sampled consistently, i.e. for any given dimension that + corresponds among all tensors in the list, the same elements will be picked for every tensor + along that dimension. 
+ + For dimensions that are being sub-sampled, you can either provide a list of the indices + that should be picked, or you can provide the number of elements to be sub-sampled, in which + case the elements will be chosen at random. + + For dimensions that are being up-sampled, "filler" elements will be inserted at random + positions along the respective dimension. These filler elements will be initialized either + with zero or from a normal distribution with selectable mean and standard deviation. + + Arguments: + weights_list (list): A list of Numpy arrays. Each array represents one of the tensors + to be sampled. The tensor with the greatest number of dimensions must be the first + element in the list. For example, in the case of the weights of a 2D convolutional + layer, the kernel must be the first element in the list and the bias the second, + not the other way around. For all tensors in the list after the first tensor, the + lengths of each of their axes must be identical to the length of some axis of the + first tensor. + sampling_instructions (list): A list that contains the sampling instructions for each + dimension of the first tensor. If the first tensor has `n` dimensions, then this + must be a list of length `n`. That means sampling instructions for every dimension + of the first tensor must still be given even if not all dimensions should be changed. + The elements of this list can be either lists of integers or integers. If the sampling + instruction for a given dimension is a list of integers, then these integers represent + the indices of the elements of that dimension that will be sub-sampled. If the sampling + instruction for a given dimension is an integer, then that number of elements will be + sampled along said dimension. If the integer is greater than the number of elements + of the input tensors in that dimension, that dimension will be up-sampled. If the integer + is smaller than the number of elements of the input tensors in that dimension, that + dimension will be sub-sampled. If the integer is equal to the number of elements + of the input tensors in that dimension, that dimension will remain the same. + axes (list, optional): Only relevant if `weights_list` contains more than one tensor. + This list contains a list for each additional tensor in `weights_list` beyond the first. + Each of these lists contains integers that determine to which axes of the first tensor + the axes of the respective tensor correspond. For example, let the first tensor be a + 4D tensor and the second tensor in the list be a 2D tensor. If the first element of + `axes` is the list `[2,3]`, then that means that the two axes of the second tensor + correspond to the last two axes of the first tensor, in the same order. The point of + this list is for the program to know, if a given dimension of the first tensor is to + be sub- or up-sampled, which dimensions of the other tensors in the list must be + sub- or up-sampled accordingly. + init (list, optional): Only relevant for up-sampling. Must be `None` or a list of strings + that determines for each tensor in `weights_list` how the newly inserted values should + be initialized. The possible values are 'gaussian' for initialization from a normal + distribution with the selected mean and standard deviation (see the following two arguments), + or 'zeros' for zero-initialization. If `None`, all initializations default to + 'gaussian'. + mean (float, optional): Only relevant for up-sampling.
The mean of the values that will + be inserted into the tensors at random in the case of up-sampling. + stddev (float, optional): Only relevant for up-sampling. The standard deviation of the + values that will be inserted into the tensors at random in the case of up-sampling. + + Returns: + A list containing the sampled tensors in the same order in which they were given. + ''' + + first_tensor = weights_list[0] + + if (not isinstance(sampling_instructions, (list, tuple))) or (len(sampling_instructions) != first_tensor.ndim): + raise ValueError("The sampling instructions must be a list whose length is the number of dimensions of the first tensor in `weights_list`.") + + if (not init is None) and len(init) != len(weights_list): + raise ValueError("`init` must either be `None` or a list of strings that has the same length as `weights_list`.") + + up_sample = [] # Store the dimensions along which we need to up-sample. + out_shape = [] # Store the shape of the output tensor here. + # Store two stages of the new (sub-sampled and/or up-sampled) weights tensors in the following two lists. + subsampled_weights_list = [] # Tensors after sub-sampling, but before up-sampling (if any). + upsampled_weights_list = [] # Sub-sampled tensors after up-sampling (if any), i.e. final output tensors. + + # Create the slicing arrays from the sampling instructions. + sampling_slices = [] + for i, sampling_inst in enumerate(sampling_instructions): + if isinstance(sampling_inst, (list, tuple)): + amax = np.amax(np.array(sampling_inst)) + if amax >= first_tensor.shape[i]: + raise ValueError("The sample instructions for dimension {} contain index {}, which is greater than the length of that dimension.".format(i, amax)) + sampling_slices.append(np.array(sampling_inst)) + out_shape.append(len(sampling_inst)) + elif isinstance(sampling_inst, int): + out_shape.append(sampling_inst) + if sampling_inst == first_tensor.shape[i]: + # Nothing to sample here, we're keeping the original number of elements along this axis. + sampling_slice = np.arange(sampling_inst) + sampling_slices.append(sampling_slice) + elif sampling_inst < first_tensor.shape[i]: + # We want to SUB-sample this dimension. Randomly pick `sample_inst` many elements from it. + sampling_slice1 = np.array([0]) # We will always sample class 0, the background class. + # Sample the rest of the classes. + sampling_slice2 = np.sort(np.random.choice(np.arange(1, first_tensor.shape[i]), sampling_inst - 1, replace=False)) + sampling_slice = np.concatenate([sampling_slice1, sampling_slice2]) + sampling_slices.append(sampling_slice) + else: + # We want to UP-sample. Pick all elements from this dimension. + sampling_slice = np.arange(first_tensor.shape[i]) + sampling_slices.append(sampling_slice) + up_sample.append(i) + else: + raise ValueError("Each element of the sampling instructions must be either an integer or a list/tuple of integers, but received `{}`".format(type(sampling_inst))) + + # Process the first tensor. + subsampled_first_tensor = np.copy(first_tensor[np.ix_(*sampling_slices)]) + subsampled_weights_list.append(subsampled_first_tensor) + + # Process the other tensors. + if len(weights_list) > 1: + for j in range(1, len(weights_list)): + this_sampling_slices = [sampling_slices[i] for i in axes[j-1]] # Get the sampling slices for this tensor. + subsampled_weights_list.append(np.copy(weights_list[j][np.ix_(*this_sampling_slices)])) + + if up_sample: + # Take care of the dimensions that are to be up-sampled. 
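Since the sub-sampling path of the function is complete at this point, a typical call looks like the following stand-alone sketch. The shapes are invented, and the import path assumes the repository root (the `keras_ssd` directory) is on the Python path.

import numpy as np
from misc_utils.tensor_sampling_utils import sample_tensors

kernel = np.random.rand(3, 3, 64, 100)   # hypothetical conv kernel: (kh, kw, in_channels, out_channels)
bias = np.random.rand(100)               # one bias per output channel

# Keep the first three axes as they are and sub-sample the last axis down to 40 channels.
# `axes=[[3]]` tells the function that the bias' single axis corresponds to the kernel's axis 3,
# so the same 40 channel indices are picked for both tensors.
sampled_kernel, sampled_bias = sample_tensors(weights_list=[kernel, bias],
                                              sampling_instructions=[3, 3, 64, 40],
                                              axes=[[3]],
                                              init=None)

print(sampled_kernel.shape, sampled_bias.shape)   # (3, 3, 64, 40) (40,)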
+ + out_shape = np.array(out_shape) + + # Process the first tensor. + if init is None or init[0] == 'gaussian': + upsampled_first_tensor = np.random.normal(loc=mean, scale=stddev, size=out_shape) + elif init[0] == 'zeros': + upsampled_first_tensor = np.zeros(out_shape) + else: + raise ValueError("Valid initializations are 'gaussian' and 'zeros', but received '{}'.".format(init[0])) + # Pick the indices of the elements in `upsampled_first_tensor` that should be occupied by `subsampled_first_tensor`. + up_sample_slices = [np.arange(k) for k in subsampled_first_tensor.shape] + for i in up_sample: + # Randomly select across which indices of this dimension to scatter the elements of `new_weights_tensor` in this dimension. + up_sample_slice1 = np.array([0]) + up_sample_slice2 = np.sort(np.random.choice(np.arange(1, upsampled_first_tensor.shape[i]), subsampled_first_tensor.shape[i] - 1, replace=False)) + up_sample_slices[i] = np.concatenate([up_sample_slice1, up_sample_slice2]) + upsampled_first_tensor[np.ix_(*up_sample_slices)] = subsampled_first_tensor + upsampled_weights_list.append(upsampled_first_tensor) + + # Process the other tensors + if len(weights_list) > 1: + for j in range(1, len(weights_list)): + if init is None or init[j] == 'gaussian': + upsampled_tensor = np.random.normal(loc=mean, scale=stddev, size=out_shape[axes[j-1]]) + elif init[j] == 'zeros': + upsampled_tensor = np.zeros(out_shape[axes[j-1]]) + else: + raise ValueError("Valid initializations are 'gaussian' and 'zeros', but received '{}'.".format(init[j])) + this_up_sample_slices = [up_sample_slices[i] for i in axes[j-1]] # Get the up-sampling slices for this tensor. + upsampled_tensor[np.ix_(*this_up_sample_slices)] = subsampled_weights_list[j] + upsampled_weights_list.append(upsampled_tensor) + + return upsampled_weights_list + else: + return subsampled_weights_list diff --git a/keras_ssd/models/__init__.py b/keras_ssd/models/__init__.py new file mode 100644 index 0000000..e69de29 diff --git a/keras_ssd/models/keras_ssd300.py b/keras_ssd/models/keras_ssd300.py new file mode 100644 index 0000000..6aed701 --- /dev/null +++ b/keras_ssd/models/keras_ssd300.py @@ -0,0 +1,457 @@ +''' +A Keras port of the original Caffe SSD300 network. + +Copyright (C) 2018 Pierluigi Ferrari + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
+''' + +from __future__ import division +import numpy as np +from keras.models import Model +from keras.layers import Input, Lambda, Activation, Conv2D, MaxPooling2D, ZeroPadding2D, Reshape, Concatenate +from keras.regularizers import l2 +import keras.backend as K + +from keras_layers.keras_layer_AnchorBoxes import AnchorBoxes +from keras_layers.keras_layer_L2Normalization import L2Normalization +from keras_layers.keras_layer_DecodeDetections import DecodeDetections +from keras_layers.keras_layer_DecodeDetectionsFast import DecodeDetectionsFast + +def ssd_300(image_size, + n_classes, + mode='training', + l2_regularization=0.0005, + min_scale=None, + max_scale=None, + scales=None, + aspect_ratios_global=None, + aspect_ratios_per_layer=[[1.0, 2.0, 0.5], + [1.0, 2.0, 0.5, 3.0, 1.0/3.0], + [1.0, 2.0, 0.5, 3.0, 1.0/3.0], + [1.0, 2.0, 0.5, 3.0, 1.0/3.0], + [1.0, 2.0, 0.5], + [1.0, 2.0, 0.5]], + two_boxes_for_ar1=True, + steps=[8, 16, 32, 64, 100, 300], + offsets=None, + clip_boxes=False, + variances=[0.1, 0.1, 0.2, 0.2], + coords='centroids', + normalize_coords=True, + subtract_mean=[123, 117, 104], + divide_by_stddev=None, + swap_channels=[2, 1, 0], + confidence_thresh=0.01, + iou_threshold=0.45, + top_k=200, + nms_max_output_size=400, + return_predictor_sizes=False): + ''' + Build a Keras model with SSD300 architecture, see references. + + The base network is a reduced atrous VGG-16, extended by the SSD architecture, + as described in the paper. + + Most of the arguments that this function takes are only needed for the anchor + box layers. In case you're training the network, the parameters passed here must + be the same as the ones used to set up `SSDBoxEncoder`. In case you're loading + trained weights, the parameters passed here must be the same as the ones used + to produce the trained weights. + + Some of these arguments are explained in more detail in the documentation of the + `SSDBoxEncoder` class. + + Note: Requires Keras v2.0 or later. Currently works only with the + TensorFlow backend (v1.0 or later). + + Arguments: + image_size (tuple): The input image size in the format `(height, width, channels)`. + n_classes (int): The number of positive classes, e.g. 20 for Pascal VOC, 80 for MS COCO. + mode (str, optional): One of 'training', 'inference' and 'inference_fast'. In 'training' mode, + the model outputs the raw prediction tensor, while in 'inference' and 'inference_fast' modes, + the raw predictions are decoded into absolute coordinates and filtered via confidence thresholding, + non-maximum suppression, and top-k filtering. The difference between latter two modes is that + 'inference' follows the exact procedure of the original Caffe implementation, while + 'inference_fast' uses a faster prediction decoding procedure. + l2_regularization (float, optional): The L2-regularization rate. Applies to all convolutional layers. + Set to zero to deactivate L2-regularization. + min_scale (float, optional): The smallest scaling factor for the size of the anchor boxes as a fraction + of the shorter side of the input images. + max_scale (float, optional): The largest scaling factor for the size of the anchor boxes as a fraction + of the shorter side of the input images. All scaling factors between the smallest and the + largest will be linearly interpolated. 
Note that the second to last of the linearly interpolated + scaling factors will actually be the scaling factor for the last predictor layer, while the last + scaling factor is used for the second box for aspect ratio 1 in the last predictor layer + if `two_boxes_for_ar1` is `True`. + scales (list, optional): A list of floats containing scaling factors per convolutional predictor layer. + This list must be one element longer than the number of predictor layers. The first `k` elements are the + scaling factors for the `k` predictor layers, while the last element is used for the second box + for aspect ratio 1 in the last predictor layer if `two_boxes_for_ar1` is `True`. This additional + last scaling factor must be passed either way, even if it is not being used. If a list is passed, + this argument overrides `min_scale` and `max_scale`. All scaling factors must be greater than zero. + aspect_ratios_global (list, optional): The list of aspect ratios for which anchor boxes are to be + generated. This list is valid for all prediction layers. + aspect_ratios_per_layer (list, optional): A list containing one aspect ratio list for each prediction layer. + This allows you to set the aspect ratios for each predictor layer individually, which is the case for the + original SSD300 implementation. If a list is passed, it overrides `aspect_ratios_global`. + two_boxes_for_ar1 (bool, optional): Only relevant for aspect ratio lists that contain 1. Will be ignored otherwise. + If `True`, two anchor boxes will be generated for aspect ratio 1. The first will be generated + using the scaling factor for the respective layer, the second one will be generated using + the geometric mean of said scaling factor and the next bigger scaling factor. + steps (list, optional): `None` or a list with as many elements as there are predictor layers. The elements can be + either ints/floats or tuples of two ints/floats. These numbers represent for each predictor layer how many + pixels apart the anchor box center points should be vertically and horizontally along the spatial grid over + the image. If the list contains ints/floats, then that value will be used for both spatial dimensions. + If the list contains tuples of two ints/floats, then they represent `(step_height, step_width)`. + If no steps are provided, then they will be computed such that the anchor box center points will form an + equidistant grid within the image dimensions. + offsets (list, optional): `None` or a list with as many elements as there are predictor layers. The elements can be + either floats or tuples of two floats. These numbers represent for each predictor layer how many + pixels from the top and left borders of the image the top-most and left-most anchor box center points should be + as a fraction of `steps`. The last bit is important: The offsets are not absolute pixel values, but fractions + of the step size specified in the `steps` argument. If the list contains floats, then that value will + be used for both spatial dimensions. If the list contains tuples of two floats, then they represent + `(vertical_offset, horizontal_offset)`. If no offsets are provided, then they will default to 0.5 of the step size. + clip_boxes (bool, optional): If `True`, clips the anchor box coordinates to stay within image boundaries. + variances (list, optional): A list of 4 floats >0. The anchor box offset for each coordinate will be divided by + its respective variance value. + coords (str, optional): The box coordinate format to be used internally by the model (i.e.
this is not the input format + of the ground truth labels). Can be either 'centroids' for the format `(cx, cy, w, h)` (box center coordinates, width, + and height), 'minmax' for the format `(xmin, xmax, ymin, ymax)`, or 'corners' for the format `(xmin, ymin, xmax, ymax)`. + normalize_coords (bool, optional): Set to `True` if the model is supposed to use relative instead of absolute coordinates, + i.e. if the model predicts box coordinates within [0,1] instead of absolute coordinates. + subtract_mean (array-like, optional): `None` or an array-like object of integers or floating point values + of any shape that is broadcast-compatible with the image shape. The elements of this array will be + subtracted from the image pixel intensity values. For example, pass a list of three integers + to perform per-channel mean normalization for color images. + divide_by_stddev (array-like, optional): `None` or an array-like object of non-zero integers or + floating point values of any shape that is broadcast-compatible with the image shape. The image pixel + intensity values will be divided by the elements of this array. For example, pass a list + of three integers to perform per-channel standard deviation normalization for color images. + swap_channels (list, optional): Either `False` or a list of integers representing the desired order in which the input + image channels should be swapped. + confidence_thresh (float, optional): A float in [0,1), the minimum classification confidence in a specific + positive class in order to be considered for the non-maximum suppression stage for the respective class. + A lower value will result in a larger part of the selection process being done by the non-maximum suppression + stage, while a larger value will result in a larger part of the selection process happening in the confidence + thresholding stage. + iou_threshold (float, optional): A float in [0,1]. All boxes that have a Jaccard similarity of greater than `iou_threshold` + with a locally maximal box will be removed from the set of predictions for a given class, where 'maximal' refers + to the box's confidence score. + top_k (int, optional): The number of highest scoring predictions to be kept for each batch item after the + non-maximum suppression stage. + nms_max_output_size (int, optional): The maximal number of predictions that will be left over after the NMS stage. + return_predictor_sizes (bool, optional): If `True`, this function not only returns the model, but also + a list containing the spatial dimensions of the predictor layers. This isn't strictly necessary since + you can always get their sizes easily via the Keras API, but it's convenient and less error-prone + to get them this way. They are only relevant for training anyway (SSDBoxEncoder needs to know the + spatial dimensions of the predictor layers), for inference you don't need them. + + Returns: + model: The Keras SSD300 model. + predictor_sizes (optional): A Numpy array containing the `(height, width)` portion + of the output tensor shape for each convolutional predictor layer. During + training, the generator function needs this in order to transform + the ground truth labels into tensors of identical structure as the + output tensors of the model, which is in turn needed for the cost + function. + + References: + https://arxiv.org/abs/1512.02325v5 + ''' + + n_predictor_layers = 6 # The number of predictor conv layers in the network is 6 for the original SSD300. + n_classes += 1 # Account for the background class. 
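As a worked example of the anchor-box bookkeeping that follows (illustration only; the `min_scale`/`max_scale` values are made up, while the aspect ratio lists are the defaults from the signature above):

import numpy as np

n_predictor_layers = 6
min_scale, max_scale = 0.1, 0.9                  # hypothetical values; any valid pair works the same way
scales = np.linspace(min_scale, max_scale, n_predictor_layers + 1)
# -> [0.1, 0.233, 0.367, 0.5, 0.633, 0.767, 0.9]: one scale per predictor layer plus the extra
#    scale needed for the second aspect-ratio-1 box of the last layer.

aspect_ratios_per_layer = [[1.0, 2.0, 0.5],
                           [1.0, 2.0, 0.5, 3.0, 1.0/3.0],
                           [1.0, 2.0, 0.5, 3.0, 1.0/3.0],
                           [1.0, 2.0, 0.5, 3.0, 1.0/3.0],
                           [1.0, 2.0, 0.5],
                           [1.0, 2.0, 0.5]]
two_boxes_for_ar1 = True
n_boxes = [len(ar) + 1 if (1.0 in ar and two_boxes_for_ar1) else len(ar)
           for ar in aspect_ratios_per_layer]
print(n_boxes)                                   # [4, 6, 6, 6, 4, 4] boxes per cell, as in the original SSD300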
+ l2_reg = l2_regularization # Make the internal name shorter. + img_height, img_width, img_channels = image_size[0], image_size[1], image_size[2] + + ############################################################################ + # Get a few exceptions out of the way. + ############################################################################ + + if aspect_ratios_global is None and aspect_ratios_per_layer is None: + raise ValueError("`aspect_ratios_global` and `aspect_ratios_per_layer` cannot both be None. At least one needs to be specified.") + if aspect_ratios_per_layer: + if len(aspect_ratios_per_layer) != n_predictor_layers: + raise ValueError("It must be either aspect_ratios_per_layer is None or len(aspect_ratios_per_layer) == {}, but len(aspect_ratios_per_layer) == {}.".format(n_predictor_layers, len(aspect_ratios_per_layer))) + + if (min_scale is None or max_scale is None) and scales is None: + raise ValueError("Either `min_scale` and `max_scale` or `scales` need to be specified.") + if scales: + if len(scales) != n_predictor_layers+1: + raise ValueError("It must be either scales is None or len(scales) == {}, but len(scales) == {}.".format(n_predictor_layers+1, len(scales))) + else: # If no explicit list of scaling factors was passed, compute the list of scaling factors from `min_scale` and `max_scale` + scales = np.linspace(min_scale, max_scale, n_predictor_layers+1) + + if len(variances) != 4: + raise ValueError("4 variance values must be passed, but {} values were received.".format(len(variances))) + variances = np.array(variances) + if np.any(variances <= 0): + raise ValueError("All variances must be >0, but the variances given are {}".format(variances)) + + if (not (steps is None)) and (len(steps) != n_predictor_layers): + raise ValueError("You must provide exactly one step value per predictor layer.") + + if (not (offsets is None)) and (len(offsets) != n_predictor_layers): + raise ValueError("You must provide exactly one offset value per predictor layer.") + + ############################################################################ + # Compute the anchor box parameters. + ############################################################################ + + # Set the aspect ratios for each predictor layer. These are only needed for the anchor box layers. + if aspect_ratios_per_layer: + aspect_ratios = aspect_ratios_per_layer + else: + aspect_ratios = [aspect_ratios_global] * n_predictor_layers + + # Compute the number of boxes to be predicted per cell for each predictor layer. + # We need this so that we know how many channels the predictor layers need to have. + if aspect_ratios_per_layer: + n_boxes = [] + for ar in aspect_ratios_per_layer: + if (1 in ar) & two_boxes_for_ar1: + n_boxes.append(len(ar) + 1) # +1 for the second box for aspect ratio 1 + else: + n_boxes.append(len(ar)) + else: # If only a global aspect ratio list was passed, then the number of boxes is the same for each predictor layer + if (1 in aspect_ratios_global) & two_boxes_for_ar1: + n_boxes = len(aspect_ratios_global) + 1 + else: + n_boxes = len(aspect_ratios_global) + n_boxes = [n_boxes] * n_predictor_layers + + if steps is None: + steps = [None] * n_predictor_layers + if offsets is None: + offsets = [None] * n_predictor_layers + + ############################################################################ + # Define functions for the Lambda layers below.
+ ############################################################################ + + def identity_layer(tensor): + return tensor + + def input_mean_normalization(tensor): + return tensor - np.array(subtract_mean) + + def input_stddev_normalization(tensor): + return tensor / np.array(divide_by_stddev) + + def input_channel_swap(tensor): + if len(swap_channels) == 3: + return K.stack([tensor[...,swap_channels[0]], tensor[...,swap_channels[1]], tensor[...,swap_channels[2]]], axis=-1) + elif len(swap_channels) == 4: + return K.stack([tensor[...,swap_channels[0]], tensor[...,swap_channels[1]], tensor[...,swap_channels[2]], tensor[...,swap_channels[3]]], axis=-1) + + ############################################################################ + # Build the network. + ############################################################################ + + x = Input(shape=(img_height, img_width, img_channels)) + + # The following identity layer is only needed so that the subsequent lambda layers can be optional. + x1 = Lambda(identity_layer, output_shape=(img_height, img_width, img_channels), name='identity_layer')(x) + if not (subtract_mean is None): + x1 = Lambda(input_mean_normalization, output_shape=(img_height, img_width, img_channels), name='input_mean_normalization')(x1) + if not (divide_by_stddev is None): + x1 = Lambda(input_stddev_normalization, output_shape=(img_height, img_width, img_channels), name='input_stddev_normalization')(x1) + if swap_channels: + x1 = Lambda(input_channel_swap, output_shape=(img_height, img_width, img_channels), name='input_channel_swap')(x1) + + conv1_1 = Conv2D(64, (3, 3), activation='relu', padding='same', kernel_initializer='he_normal', kernel_regularizer=l2(l2_reg), name='conv1_1')(x1) + conv1_2 = Conv2D(64, (3, 3), activation='relu', padding='same', kernel_initializer='he_normal', kernel_regularizer=l2(l2_reg), name='conv1_2')(conv1_1) + pool1 = MaxPooling2D(pool_size=(2, 2), strides=(2, 2), padding='same', name='pool1')(conv1_2) + + conv2_1 = Conv2D(128, (3, 3), activation='relu', padding='same', kernel_initializer='he_normal', kernel_regularizer=l2(l2_reg), name='conv2_1')(pool1) + conv2_2 = Conv2D(128, (3, 3), activation='relu', padding='same', kernel_initializer='he_normal', kernel_regularizer=l2(l2_reg), name='conv2_2')(conv2_1) + pool2 = MaxPooling2D(pool_size=(2, 2), strides=(2, 2), padding='same', name='pool2')(conv2_2) + + conv3_1 = Conv2D(256, (3, 3), activation='relu', padding='same', kernel_initializer='he_normal', kernel_regularizer=l2(l2_reg), name='conv3_1')(pool2) + conv3_2 = Conv2D(256, (3, 3), activation='relu', padding='same', kernel_initializer='he_normal', kernel_regularizer=l2(l2_reg), name='conv3_2')(conv3_1) + conv3_3 = Conv2D(256, (3, 3), activation='relu', padding='same', kernel_initializer='he_normal', kernel_regularizer=l2(l2_reg), name='conv3_3')(conv3_2) + pool3 = MaxPooling2D(pool_size=(2, 2), strides=(2, 2), padding='same', name='pool3')(conv3_3) + + conv4_1 = Conv2D(512, (3, 3), activation='relu', padding='same', kernel_initializer='he_normal', kernel_regularizer=l2(l2_reg), name='conv4_1')(pool3) + conv4_2 = Conv2D(512, (3, 3), activation='relu', padding='same', kernel_initializer='he_normal', kernel_regularizer=l2(l2_reg), name='conv4_2')(conv4_1) + conv4_3 = Conv2D(512, (3, 3), activation='relu', padding='same', kernel_initializer='he_normal', kernel_regularizer=l2(l2_reg), name='conv4_3')(conv4_2) + pool4 = MaxPooling2D(pool_size=(2, 2), strides=(2, 2), padding='same', name='pool4')(conv4_3) + + conv5_1 = Conv2D(512, (3, 3), 
activation='relu', padding='same', kernel_initializer='he_normal', kernel_regularizer=l2(l2_reg), name='conv5_1')(pool4) + conv5_2 = Conv2D(512, (3, 3), activation='relu', padding='same', kernel_initializer='he_normal', kernel_regularizer=l2(l2_reg), name='conv5_2')(conv5_1) + conv5_3 = Conv2D(512, (3, 3), activation='relu', padding='same', kernel_initializer='he_normal', kernel_regularizer=l2(l2_reg), name='conv5_3')(conv5_2) + pool5 = MaxPooling2D(pool_size=(3, 3), strides=(1, 1), padding='same', name='pool5')(conv5_3) + + fc6 = Conv2D(1024, (3, 3), dilation_rate=(6, 6), activation='relu', padding='same', kernel_initializer='he_normal', kernel_regularizer=l2(l2_reg), name='fc6')(pool5) + + fc7 = Conv2D(1024, (1, 1), activation='relu', padding='same', kernel_initializer='he_normal', kernel_regularizer=l2(l2_reg), name='fc7')(fc6) + + conv6_1 = Conv2D(256, (1, 1), activation='relu', padding='same', kernel_initializer='he_normal', kernel_regularizer=l2(l2_reg), name='conv6_1')(fc7) + conv6_1 = ZeroPadding2D(padding=((1, 1), (1, 1)), name='conv6_padding')(conv6_1) + conv6_2 = Conv2D(512, (3, 3), strides=(2, 2), activation='relu', padding='valid', kernel_initializer='he_normal', kernel_regularizer=l2(l2_reg), name='conv6_2')(conv6_1) + + conv7_1 = Conv2D(128, (1, 1), activation='relu', padding='same', kernel_initializer='he_normal', kernel_regularizer=l2(l2_reg), name='conv7_1')(conv6_2) + conv7_1 = ZeroPadding2D(padding=((1, 1), (1, 1)), name='conv7_padding')(conv7_1) + conv7_2 = Conv2D(256, (3, 3), strides=(2, 2), activation='relu', padding='valid', kernel_initializer='he_normal', kernel_regularizer=l2(l2_reg), name='conv7_2')(conv7_1) + + conv8_1 = Conv2D(128, (1, 1), activation='relu', padding='same', kernel_initializer='he_normal', kernel_regularizer=l2(l2_reg), name='conv8_1')(conv7_2) + conv8_2 = Conv2D(256, (3, 3), strides=(1, 1), activation='relu', padding='valid', kernel_initializer='he_normal', kernel_regularizer=l2(l2_reg), name='conv8_2')(conv8_1) + + conv9_1 = Conv2D(128, (1, 1), activation='relu', padding='same', kernel_initializer='he_normal', kernel_regularizer=l2(l2_reg), name='conv9_1')(conv8_2) + conv9_2 = Conv2D(256, (3, 3), strides=(1, 1), activation='relu', padding='valid', kernel_initializer='he_normal', kernel_regularizer=l2(l2_reg), name='conv9_2')(conv9_1) + + # Feed conv4_3 into the L2 normalization layer + conv4_3_norm = L2Normalization(gamma_init=20, name='conv4_3_norm')(conv4_3) + + ### Build the convolutional predictor layers on top of the base network + + # We precidt `n_classes` confidence values for each box, hence the confidence predictors have depth `n_boxes * n_classes` + # Output shape of the confidence layers: `(batch, height, width, n_boxes * n_classes)` + conv4_3_norm_mbox_conf = Conv2D(n_boxes[0] * n_classes, (3, 3), padding='same', kernel_initializer='he_normal', kernel_regularizer=l2(l2_reg), name='conv4_3_norm_mbox_conf')(conv4_3_norm) + fc7_mbox_conf = Conv2D(n_boxes[1] * n_classes, (3, 3), padding='same', kernel_initializer='he_normal', kernel_regularizer=l2(l2_reg), name='fc7_mbox_conf')(fc7) + conv6_2_mbox_conf = Conv2D(n_boxes[2] * n_classes, (3, 3), padding='same', kernel_initializer='he_normal', kernel_regularizer=l2(l2_reg), name='conv6_2_mbox_conf')(conv6_2) + conv7_2_mbox_conf = Conv2D(n_boxes[3] * n_classes, (3, 3), padding='same', kernel_initializer='he_normal', kernel_regularizer=l2(l2_reg), name='conv7_2_mbox_conf')(conv7_2) + conv8_2_mbox_conf = Conv2D(n_boxes[4] * n_classes, (3, 3), padding='same', kernel_initializer='he_normal', 
kernel_regularizer=l2(l2_reg), name='conv8_2_mbox_conf')(conv8_2) + conv9_2_mbox_conf = Conv2D(n_boxes[5] * n_classes, (3, 3), padding='same', kernel_initializer='he_normal', kernel_regularizer=l2(l2_reg), name='conv9_2_mbox_conf')(conv9_2) + # We predict 4 box coordinates for each box, hence the localization predictors have depth `n_boxes * 4` + # Output shape of the localization layers: `(batch, height, width, n_boxes * 4)` + conv4_3_norm_mbox_loc = Conv2D(n_boxes[0] * 4, (3, 3), padding='same', kernel_initializer='he_normal', kernel_regularizer=l2(l2_reg), name='conv4_3_norm_mbox_loc')(conv4_3_norm) + fc7_mbox_loc = Conv2D(n_boxes[1] * 4, (3, 3), padding='same', kernel_initializer='he_normal', kernel_regularizer=l2(l2_reg), name='fc7_mbox_loc')(fc7) + conv6_2_mbox_loc = Conv2D(n_boxes[2] * 4, (3, 3), padding='same', kernel_initializer='he_normal', kernel_regularizer=l2(l2_reg), name='conv6_2_mbox_loc')(conv6_2) + conv7_2_mbox_loc = Conv2D(n_boxes[3] * 4, (3, 3), padding='same', kernel_initializer='he_normal', kernel_regularizer=l2(l2_reg), name='conv7_2_mbox_loc')(conv7_2) + conv8_2_mbox_loc = Conv2D(n_boxes[4] * 4, (3, 3), padding='same', kernel_initializer='he_normal', kernel_regularizer=l2(l2_reg), name='conv8_2_mbox_loc')(conv8_2) + conv9_2_mbox_loc = Conv2D(n_boxes[5] * 4, (3, 3), padding='same', kernel_initializer='he_normal', kernel_regularizer=l2(l2_reg), name='conv9_2_mbox_loc')(conv9_2) + + ### Generate the anchor boxes (called "priors" in the original Caffe/C++ implementation, so I'll keep their layer names) + + # Output shape of anchors: `(batch, height, width, n_boxes, 8)` + conv4_3_norm_mbox_priorbox = AnchorBoxes(img_height, img_width, this_scale=scales[0], next_scale=scales[1], aspect_ratios=aspect_ratios[0], + two_boxes_for_ar1=two_boxes_for_ar1, this_steps=steps[0], this_offsets=offsets[0], clip_boxes=clip_boxes, + variances=variances, coords=coords, normalize_coords=normalize_coords, name='conv4_3_norm_mbox_priorbox')(conv4_3_norm_mbox_loc) + fc7_mbox_priorbox = AnchorBoxes(img_height, img_width, this_scale=scales[1], next_scale=scales[2], aspect_ratios=aspect_ratios[1], + two_boxes_for_ar1=two_boxes_for_ar1, this_steps=steps[1], this_offsets=offsets[1], clip_boxes=clip_boxes, + variances=variances, coords=coords, normalize_coords=normalize_coords, name='fc7_mbox_priorbox')(fc7_mbox_loc) + conv6_2_mbox_priorbox = AnchorBoxes(img_height, img_width, this_scale=scales[2], next_scale=scales[3], aspect_ratios=aspect_ratios[2], + two_boxes_for_ar1=two_boxes_for_ar1, this_steps=steps[2], this_offsets=offsets[2], clip_boxes=clip_boxes, + variances=variances, coords=coords, normalize_coords=normalize_coords, name='conv6_2_mbox_priorbox')(conv6_2_mbox_loc) + conv7_2_mbox_priorbox = AnchorBoxes(img_height, img_width, this_scale=scales[3], next_scale=scales[4], aspect_ratios=aspect_ratios[3], + two_boxes_for_ar1=two_boxes_for_ar1, this_steps=steps[3], this_offsets=offsets[3], clip_boxes=clip_boxes, + variances=variances, coords=coords, normalize_coords=normalize_coords, name='conv7_2_mbox_priorbox')(conv7_2_mbox_loc) + conv8_2_mbox_priorbox = AnchorBoxes(img_height, img_width, this_scale=scales[4], next_scale=scales[5], aspect_ratios=aspect_ratios[4], + two_boxes_for_ar1=two_boxes_for_ar1, this_steps=steps[4], this_offsets=offsets[4], clip_boxes=clip_boxes, + variances=variances, coords=coords, normalize_coords=normalize_coords, name='conv8_2_mbox_priorbox')(conv8_2_mbox_loc) + conv9_2_mbox_priorbox = AnchorBoxes(img_height, img_width, this_scale=scales[5], next_scale=scales[6], 
aspect_ratios=aspect_ratios[5], + two_boxes_for_ar1=two_boxes_for_ar1, this_steps=steps[5], this_offsets=offsets[5], clip_boxes=clip_boxes, + variances=variances, coords=coords, normalize_coords=normalize_coords, name='conv9_2_mbox_priorbox')(conv9_2_mbox_loc) + + ### Reshape + + # Reshape the class predictions, yielding 3D tensors of shape `(batch, height * width * n_boxes, n_classes)` + # We want the classes isolated in the last axis to perform softmax on them + conv4_3_norm_mbox_conf_reshape = Reshape((-1, n_classes), name='conv4_3_norm_mbox_conf_reshape')(conv4_3_norm_mbox_conf) + fc7_mbox_conf_reshape = Reshape((-1, n_classes), name='fc7_mbox_conf_reshape')(fc7_mbox_conf) + conv6_2_mbox_conf_reshape = Reshape((-1, n_classes), name='conv6_2_mbox_conf_reshape')(conv6_2_mbox_conf) + conv7_2_mbox_conf_reshape = Reshape((-1, n_classes), name='conv7_2_mbox_conf_reshape')(conv7_2_mbox_conf) + conv8_2_mbox_conf_reshape = Reshape((-1, n_classes), name='conv8_2_mbox_conf_reshape')(conv8_2_mbox_conf) + conv9_2_mbox_conf_reshape = Reshape((-1, n_classes), name='conv9_2_mbox_conf_reshape')(conv9_2_mbox_conf) + # Reshape the box predictions, yielding 3D tensors of shape `(batch, height * width * n_boxes, 4)` + # We want the four box coordinates isolated in the last axis to compute the smooth L1 loss + conv4_3_norm_mbox_loc_reshape = Reshape((-1, 4), name='conv4_3_norm_mbox_loc_reshape')(conv4_3_norm_mbox_loc) + fc7_mbox_loc_reshape = Reshape((-1, 4), name='fc7_mbox_loc_reshape')(fc7_mbox_loc) + conv6_2_mbox_loc_reshape = Reshape((-1, 4), name='conv6_2_mbox_loc_reshape')(conv6_2_mbox_loc) + conv7_2_mbox_loc_reshape = Reshape((-1, 4), name='conv7_2_mbox_loc_reshape')(conv7_2_mbox_loc) + conv8_2_mbox_loc_reshape = Reshape((-1, 4), name='conv8_2_mbox_loc_reshape')(conv8_2_mbox_loc) + conv9_2_mbox_loc_reshape = Reshape((-1, 4), name='conv9_2_mbox_loc_reshape')(conv9_2_mbox_loc) + # Reshape the anchor box tensors, yielding 3D tensors of shape `(batch, height * width * n_boxes, 8)` + conv4_3_norm_mbox_priorbox_reshape = Reshape((-1, 8), name='conv4_3_norm_mbox_priorbox_reshape')(conv4_3_norm_mbox_priorbox) + fc7_mbox_priorbox_reshape = Reshape((-1, 8), name='fc7_mbox_priorbox_reshape')(fc7_mbox_priorbox) + conv6_2_mbox_priorbox_reshape = Reshape((-1, 8), name='conv6_2_mbox_priorbox_reshape')(conv6_2_mbox_priorbox) + conv7_2_mbox_priorbox_reshape = Reshape((-1, 8), name='conv7_2_mbox_priorbox_reshape')(conv7_2_mbox_priorbox) + conv8_2_mbox_priorbox_reshape = Reshape((-1, 8), name='conv8_2_mbox_priorbox_reshape')(conv8_2_mbox_priorbox) + conv9_2_mbox_priorbox_reshape = Reshape((-1, 8), name='conv9_2_mbox_priorbox_reshape')(conv9_2_mbox_priorbox) + + ### Concatenate the predictions from the different layers + + # Axis 0 (batch) and axis 2 (n_classes or 4, respectively) are identical for all layer predictions, + # so we want to concatenate along axis 1, the number of boxes per layer + # Output shape of `mbox_conf`: (batch, n_boxes_total, n_classes) + mbox_conf = Concatenate(axis=1, name='mbox_conf')([conv4_3_norm_mbox_conf_reshape, + fc7_mbox_conf_reshape, + conv6_2_mbox_conf_reshape, + conv7_2_mbox_conf_reshape, + conv8_2_mbox_conf_reshape, + conv9_2_mbox_conf_reshape]) + + # Output shape of `mbox_loc`: (batch, n_boxes_total, 4) + mbox_loc = Concatenate(axis=1, name='mbox_loc')([conv4_3_norm_mbox_loc_reshape, + fc7_mbox_loc_reshape, + conv6_2_mbox_loc_reshape, + conv7_2_mbox_loc_reshape, + conv8_2_mbox_loc_reshape, + conv9_2_mbox_loc_reshape]) + + # Output shape of `mbox_priorbox`: (batch, n_boxes_total, 8) 
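+ # Editor's note (illustrative, not part of the original code): for the canonical 300x300 input,
+ # the six predictor grids are 38x38, 19x19, 10x10, 5x5, 3x3 and 1x1, so the total number of
+ # anchor boxes per image works out to
+ # n_boxes_total = 38*38*4 + 19*19*6 + 10*10*6 + 5*5*6 + 3*3*4 + 1*1*4  # = 8732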
+ mbox_priorbox = Concatenate(axis=1, name='mbox_priorbox')([conv4_3_norm_mbox_priorbox_reshape, + fc7_mbox_priorbox_reshape, + conv6_2_mbox_priorbox_reshape, + conv7_2_mbox_priorbox_reshape, + conv8_2_mbox_priorbox_reshape, + conv9_2_mbox_priorbox_reshape]) + + # The box coordinate predictions will go into the loss function just the way they are, + # but for the class predictions, we'll apply a softmax activation layer first + mbox_conf_softmax = Activation('softmax', name='mbox_conf_softmax')(mbox_conf) + + # Concatenate the class and box predictions and the anchors to one large predictions vector + # Output shape of `predictions`: (batch, n_boxes_total, n_classes + 4 + 8) + predictions = Concatenate(axis=2, name='predictions')([mbox_conf_softmax, mbox_loc, mbox_priorbox]) + + if mode == 'training': + model = Model(inputs=x, outputs=predictions) + elif mode == 'inference': + decoded_predictions = DecodeDetections(confidence_thresh=confidence_thresh, + iou_threshold=iou_threshold, + top_k=top_k, + nms_max_output_size=nms_max_output_size, + coords=coords, + normalize_coords=normalize_coords, + img_height=img_height, + img_width=img_width, + name='decoded_predictions')(predictions) + model = Model(inputs=x, outputs=decoded_predictions) + elif mode == 'inference_fast': + decoded_predictions = DecodeDetectionsFast(confidence_thresh=confidence_thresh, + iou_threshold=iou_threshold, + top_k=top_k, + nms_max_output_size=nms_max_output_size, + coords=coords, + normalize_coords=normalize_coords, + img_height=img_height, + img_width=img_width, + name='decoded_predictions')(predictions) + model = Model(inputs=x, outputs=decoded_predictions) + else: + raise ValueError("`mode` must be one of 'training', 'inference' or 'inference_fast', but received '{}'.".format(mode)) + + if return_predictor_sizes: + predictor_sizes = np.array([conv4_3_norm_mbox_conf._keras_shape[1:3], + fc7_mbox_conf._keras_shape[1:3], + conv6_2_mbox_conf._keras_shape[1:3], + conv7_2_mbox_conf._keras_shape[1:3], + conv8_2_mbox_conf._keras_shape[1:3], + conv9_2_mbox_conf._keras_shape[1:3]]) + return model, predictor_sizes + else: + return model diff --git a/keras_ssd/models/keras_ssd512.py b/keras_ssd/models/keras_ssd512.py new file mode 100644 index 0000000..3f69ac6 --- /dev/null +++ b/keras_ssd/models/keras_ssd512.py @@ -0,0 +1,477 @@ +''' +A Keras port of the original Caffe SSD512 network. + +Copyright (C) 2018 Pierluigi Ferrari + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
+''' + +from __future__ import division +import numpy as np +from keras.models import Model +from keras.layers import Input, Lambda, Activation, Conv2D, MaxPooling2D, ZeroPadding2D, Reshape, Concatenate +from keras.regularizers import l2 +import keras.backend as K + +from keras_layers.keras_layer_AnchorBoxes import AnchorBoxes +from keras_layers.keras_layer_L2Normalization import L2Normalization +from keras_layers.keras_layer_DecodeDetections import DecodeDetections +from keras_layers.keras_layer_DecodeDetectionsFast import DecodeDetectionsFast + +def ssd_512(image_size, + n_classes, + mode='training', + l2_regularization=0.0005, + min_scale=None, + max_scale=None, + scales=None, + aspect_ratios_global=None, + aspect_ratios_per_layer=[[1.0, 2.0, 0.5], + [1.0, 2.0, 0.5, 3.0, 1.0/3.0], + [1.0, 2.0, 0.5, 3.0, 1.0/3.0], + [1.0, 2.0, 0.5, 3.0, 1.0/3.0], + [1.0, 2.0, 0.5, 3.0, 1.0/3.0], + [1.0, 2.0, 0.5], + [1.0, 2.0, 0.5]], + two_boxes_for_ar1=True, + steps=[8, 16, 32, 64, 128, 256, 512], + offsets=None, + clip_boxes=False, + variances=[0.1, 0.1, 0.2, 0.2], + coords='centroids', + normalize_coords=True, + subtract_mean=[123, 117, 104], + divide_by_stddev=None, + swap_channels=[2, 1, 0], + confidence_thresh=0.01, + iou_threshold=0.45, + top_k=200, + nms_max_output_size=400, + return_predictor_sizes=False): + ''' + Build a Keras model with SSD512 architecture, see references. + + The base network is a reduced atrous VGG-16, extended by the SSD architecture, + as described in the paper. + + Most of the arguments that this function takes are only needed for the anchor + box layers. In case you're training the network, the parameters passed here must + be the same as the ones used to set up `SSDBoxEncoder`. In case you're loading + trained weights, the parameters passed here must be the same as the ones used + to produce the trained weights. + + Some of these arguments are explained in more detail in the documentation of the + `SSDBoxEncoder` class. + + Note: Requires Keras v2.0 or later. Currently works only with the + TensorFlow backend (v1.0 or later). + + Arguments: + image_size (tuple): The input image size in the format `(height, width, channels)`. + n_classes (int): The number of positive classes, e.g. 20 for Pascal VOC, 80 for MS COCO. + mode (str, optional): One of 'training', 'inference' and 'inference_fast'. In 'training' mode, + the model outputs the raw prediction tensor, while in 'inference' and 'inference_fast' modes, + the raw predictions are decoded into absolute coordinates and filtered via confidence thresholding, + non-maximum suppression, and top-k filtering. The difference between latter two modes is that + 'inference' follows the exact procedure of the original Caffe implementation, while + 'inference_fast' uses a faster prediction decoding procedure. + l2_regularization (float, optional): The L2-regularization rate. Applies to all convolutional layers. + Set to zero to deactivate L2-regularization. + min_scale (float, optional): The smallest scaling factor for the size of the anchor boxes as a fraction + of the shorter side of the input images. + max_scale (float, optional): The largest scaling factor for the size of the anchor boxes as a fraction + of the shorter side of the input images. All scaling factors between the smallest and the + largest will be linearly interpolated. 
Note that the second to last of the linearly interpolated + scaling factors will actually be the scaling factor for the last predictor layer, while the last + scaling factor is used for the second box for aspect ratio 1 in the last predictor layer + if `two_boxes_for_ar1` is `True`. + scales (list, optional): A list of floats containing scaling factors per convolutional predictor layer. + This list must be one element longer than the number of predictor layers. The first `k` elements are the + scaling factors for the `k` predictor layers, while the last element is used for the second box + for aspect ratio 1 in the last predictor layer if `two_boxes_for_ar1` is `True`. This additional + last scaling factor must be passed either way, even if it is not being used. + If a list is passed, this argument overrides `min_scale` and `max_scale`. All scaling factors + must be greater than zero. + aspect_ratios_global (list, optional): The list of aspect ratios for which anchor boxes are to be + generated. This list is valid for all prediction layers. + aspect_ratios_per_layer (list, optional): A list containing one aspect ratio list for each prediction layer. + This allows you to set the aspect ratios for each predictor layer individually, which is the case for the + original SSD512 implementation. If a list is passed, it overrides `aspect_ratios_global`. + two_boxes_for_ar1 (bool, optional): Only relevant for aspect ratio lists that contain 1. Will be ignored otherwise. + If `True`, two anchor boxes will be generated for aspect ratio 1. The first will be generated + using the scaling factor for the respective layer, the second one will be generated using + geometric mean of said scaling factor and next bigger scaling factor. + steps (list, optional): `None` or a list with as many elements as there are predictor layers. The elements can be + either ints/floats or tuples of two ints/floats. These numbers represent for each predictor layer how many + pixels apart the anchor box center points should be vertically and horizontally along the spatial grid over + the image. If the list contains ints/floats, then that value will be used for both spatial dimensions. + If the list contains tuples of two ints/floats, then they represent `(step_height, step_width)`. + If no steps are provided, then they will be computed such that the anchor box center points will form an + equidistant grid within the image dimensions. + offsets (list, optional): `None` or a list with as many elements as there are predictor layers. The elements can be + either floats or tuples of two floats. These numbers represent for each predictor layer how many + pixels from the top and left boarders of the image the top-most and left-most anchor box center points should be + as a fraction of `steps`. The last bit is important: The offsets are not absolute pixel values, but fractions + of the step size specified in the `steps` argument. If the list contains floats, then that value will + be used for both spatial dimensions. If the list contains tuples of two floats, then they represent + `(vertical_offset, horizontal_offset)`. If no offsets are provided, then they will default to 0.5 of the step size. + clip_boxes (bool, optional): If `True`, clips the anchor box coordinates to stay within image boundaries. + variances (list, optional): A list of 4 floats >0. The anchor box offset for each coordinate will be divided by + its respective variance value. 
+ coords (str, optional): The box coordinate format to be used internally by the model (i.e. this is not the input format + of the ground truth labels). Can be either 'centroids' for the format `(cx, cy, w, h)` (box center coordinates, width, + and height), 'minmax' for the format `(xmin, xmax, ymin, ymax)`, or 'corners' for the format `(xmin, ymin, xmax, ymax)`. + normalize_coords (bool, optional): Set to `True` if the model is supposed to use relative instead of absolute coordinates, + i.e. if the model predicts box coordinates within [0,1] instead of absolute coordinates. + subtract_mean (array-like, optional): `None` or an array-like object of integers or floating point values + of any shape that is broadcast-compatible with the image shape. The elements of this array will be + subtracted from the image pixel intensity values. For example, pass a list of three integers + to perform per-channel mean normalization for color images. + divide_by_stddev (array-like, optional): `None` or an array-like object of non-zero integers or + floating point values of any shape that is broadcast-compatible with the image shape. The image pixel + intensity values will be divided by the elements of this array. For example, pass a list + of three integers to perform per-channel standard deviation normalization for color images. + swap_channels (list, optional): Either `False` or a list of integers representing the desired order in which the input + image channels should be swapped. + confidence_thresh (float, optional): A float in [0,1), the minimum classification confidence in a specific + positive class in order to be considered for the non-maximum suppression stage for the respective class. + A lower value will result in a larger part of the selection process being done by the non-maximum suppression + stage, while a larger value will result in a larger part of the selection process happening in the confidence + thresholding stage. + iou_threshold (float, optional): A float in [0,1]. All boxes that have a Jaccard similarity of greater than `iou_threshold` + with a locally maximal box will be removed from the set of predictions for a given class, where 'maximal' refers + to the box's confidence score. + top_k (int, optional): The number of highest scoring predictions to be kept for each batch item after the + non-maximum suppression stage. + nms_max_output_size (int, optional): The maximal number of predictions that will be left over after the NMS stage. + return_predictor_sizes (bool, optional): If `True`, this function not only returns the model, but also + a list containing the spatial dimensions of the predictor layers. This isn't strictly necessary since + you can always get their sizes easily via the Keras API, but it's convenient and less error-prone + to get them this way. They are only relevant for training anyway (SSDBoxEncoder needs to know the + spatial dimensions of the predictor layers), for inference you don't need them. + + Returns: + model: The Keras SSD512 model. + predictor_sizes (optional): A Numpy array containing the `(height, width)` portion + of the output tensor shape for each convolutional predictor layer. During + training, the generator function needs this in order to transform + the ground truth labels into tensors of identical structure as the + output tensors of the model, which is in turn needed for the cost + function. 
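+
+ Example (editor's illustrative sketch, not from the original documentation; the chosen
+ argument values are assumptions, with `n_classes=20` corresponding to Pascal VOC):
+     model = ssd_512(image_size=(512, 512, 3), n_classes=20, mode='training',
+                     min_scale=0.1, max_scale=0.9)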
+
+ References:
+ https://arxiv.org/abs/1512.02325v5
+ '''
+
+ n_predictor_layers = 7 # The number of predictor conv layers in the network is 7 for the original SSD512
+ n_classes += 1 # Account for the background class.
+ l2_reg = l2_regularization # Make the internal name shorter.
+ img_height, img_width, img_channels = image_size[0], image_size[1], image_size[2]
+
+ ############################################################################
+ # Get a few exceptions out of the way.
+ ############################################################################
+
+ if aspect_ratios_global is None and aspect_ratios_per_layer is None:
+ raise ValueError("`aspect_ratios_global` and `aspect_ratios_per_layer` cannot both be None. At least one needs to be specified.")
+ if aspect_ratios_per_layer:
+ if len(aspect_ratios_per_layer) != n_predictor_layers:
+ raise ValueError("Either `aspect_ratios_per_layer` must be None or it must contain exactly {} aspect ratio lists (one per predictor layer), but it contains {}.".format(n_predictor_layers, len(aspect_ratios_per_layer)))
+
+ if (min_scale is None or max_scale is None) and scales is None:
+ raise ValueError("Either `min_scale` and `max_scale` or `scales` need to be specified.")
+ if scales:
+ if len(scales) != n_predictor_layers+1:
+ raise ValueError("Either `scales` must be None or it must contain exactly {} elements (one per predictor layer plus one), but it contains {}.".format(n_predictor_layers+1, len(scales)))
+ else: # If no explicit list of scaling factors was passed, compute the list of scaling factors from `min_scale` and `max_scale`
+ scales = np.linspace(min_scale, max_scale, n_predictor_layers+1)
+
+ if len(variances) != 4:
+ raise ValueError("Exactly 4 variance values must be passed, but {} values were received.".format(len(variances)))
+ variances = np.array(variances)
+ if np.any(variances <= 0):
+ raise ValueError("All variances must be >0, but the variances given are {}".format(variances))
+
+ if (not (steps is None)) and (len(steps) != n_predictor_layers):
+ raise ValueError("You must provide exactly one step value per predictor layer.")
+
+ if (not (offsets is None)) and (len(offsets) != n_predictor_layers):
+ raise ValueError("You must provide exactly one offset value per predictor layer.")
+
+ ############################################################################
+ # Compute the anchor box parameters.
+ ############################################################################
+
+ # Set the aspect ratios for each predictor layer. These are only needed for the anchor box layers.
+ if aspect_ratios_per_layer:
+ aspect_ratios = aspect_ratios_per_layer
+ else:
+ aspect_ratios = [aspect_ratios_global] * n_predictor_layers
+
+ # Compute the number of boxes to be predicted per cell for each predictor layer.
+ # We need this so that we know how many channels the predictor layers need to have.
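+ # Editor's note (illustrative, not part of the original code): with the default
+ # `aspect_ratios_per_layer` above and `two_boxes_for_ar1=True`, the branch below yields
+ # n_boxes == [4, 6, 6, 6, 6, 4, 4], i.e.
+ # [len(ar) + 1 if (1 in ar) and two_boxes_for_ar1 else len(ar) for ar in aspect_ratios_per_layer]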
+ if aspect_ratios_per_layer: + n_boxes = [] + for ar in aspect_ratios_per_layer: + if (1 in ar) & two_boxes_for_ar1: + n_boxes.append(len(ar) + 1) # +1 for the second box for aspect ratio 1 + else: + n_boxes.append(len(ar)) + else: # If only a global aspect ratio list was passed, then the number of boxes is the same for each predictor layer + if (1 in aspect_ratios_global) & two_boxes_for_ar1: + n_boxes = len(aspect_ratios_global) + 1 + else: + n_boxes = len(aspect_ratios_global) + n_boxes = [n_boxes] * n_predictor_layers + + if steps is None: + steps = [None] * n_predictor_layers + if offsets is None: + offsets = [None] * n_predictor_layers + + ############################################################################ + # Define functions for the Lambda layers below. + ############################################################################ + + def identity_layer(tensor): + return tensor + + def input_mean_normalization(tensor): + return tensor - np.array(subtract_mean) + + def input_stddev_normalization(tensor): + return tensor / np.array(divide_by_stddev) + + def input_channel_swap(tensor): + if len(swap_channels) == 3: + return K.stack([tensor[...,swap_channels[0]], tensor[...,swap_channels[1]], tensor[...,swap_channels[2]]], axis=-1) + elif len(swap_channels) == 4: + return K.stack([tensor[...,swap_channels[0]], tensor[...,swap_channels[1]], tensor[...,swap_channels[2]], tensor[...,swap_channels[3]]], axis=-1) + + ############################################################################ + # Build the network. + ############################################################################ + + x = Input(shape=(img_height, img_width, img_channels)) + + # The following identity layer is only needed so that the subsequent lambda layers can be optional. 
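+ # Editor's note (illustrative sketch, not part of the original code): with the default
+ # arguments, the optional Lambda layers below amount to preprocessing each image as
+ #   image = image - np.array([123, 117, 104])   # per-channel mean subtraction
+ #   image = image[..., [2, 1, 0]]                # channel swap, e.g. RGB -> BGR
+ # inside the graph; `divide_by_stddev` is None by default, so no scaling layer is added.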
+ x1 = Lambda(identity_layer, output_shape=(img_height, img_width, img_channels), name='identity_layer')(x) + if not (subtract_mean is None): + x1 = Lambda(input_mean_normalization, output_shape=(img_height, img_width, img_channels), name='input_mean_normalization')(x1) + if not (divide_by_stddev is None): + x1 = Lambda(input_stddev_normalization, output_shape=(img_height, img_width, img_channels), name='input_stddev_normalization')(x1) + if swap_channels: + x1 = Lambda(input_channel_swap, output_shape=(img_height, img_width, img_channels), name='input_channel_swap')(x1) + + conv1_1 = Conv2D(64, (3, 3), activation='relu', padding='same', kernel_initializer='he_normal', kernel_regularizer=l2(l2_reg), name='conv1_1')(x1) + conv1_2 = Conv2D(64, (3, 3), activation='relu', padding='same', kernel_initializer='he_normal', kernel_regularizer=l2(l2_reg), name='conv1_2')(conv1_1) + pool1 = MaxPooling2D(pool_size=(2, 2), strides=(2, 2), padding='same', name='pool1')(conv1_2) + + conv2_1 = Conv2D(128, (3, 3), activation='relu', padding='same', kernel_initializer='he_normal', kernel_regularizer=l2(l2_reg), name='conv2_1')(pool1) + conv2_2 = Conv2D(128, (3, 3), activation='relu', padding='same', kernel_initializer='he_normal', kernel_regularizer=l2(l2_reg), name='conv2_2')(conv2_1) + pool2 = MaxPooling2D(pool_size=(2, 2), strides=(2, 2), padding='same', name='pool2')(conv2_2) + + conv3_1 = Conv2D(256, (3, 3), activation='relu', padding='same', kernel_initializer='he_normal', kernel_regularizer=l2(l2_reg), name='conv3_1')(pool2) + conv3_2 = Conv2D(256, (3, 3), activation='relu', padding='same', kernel_initializer='he_normal', kernel_regularizer=l2(l2_reg), name='conv3_2')(conv3_1) + conv3_3 = Conv2D(256, (3, 3), activation='relu', padding='same', kernel_initializer='he_normal', kernel_regularizer=l2(l2_reg), name='conv3_3')(conv3_2) + pool3 = MaxPooling2D(pool_size=(2, 2), strides=(2, 2), padding='same', name='pool3')(conv3_3) + + conv4_1 = Conv2D(512, (3, 3), activation='relu', padding='same', kernel_initializer='he_normal', kernel_regularizer=l2(l2_reg), name='conv4_1')(pool3) + conv4_2 = Conv2D(512, (3, 3), activation='relu', padding='same', kernel_initializer='he_normal', kernel_regularizer=l2(l2_reg), name='conv4_2')(conv4_1) + conv4_3 = Conv2D(512, (3, 3), activation='relu', padding='same', kernel_initializer='he_normal', kernel_regularizer=l2(l2_reg), name='conv4_3')(conv4_2) + pool4 = MaxPooling2D(pool_size=(2, 2), strides=(2, 2), padding='same', name='pool4')(conv4_3) + + conv5_1 = Conv2D(512, (3, 3), activation='relu', padding='same', kernel_initializer='he_normal', kernel_regularizer=l2(l2_reg), name='conv5_1')(pool4) + conv5_2 = Conv2D(512, (3, 3), activation='relu', padding='same', kernel_initializer='he_normal', kernel_regularizer=l2(l2_reg), name='conv5_2')(conv5_1) + conv5_3 = Conv2D(512, (3, 3), activation='relu', padding='same', kernel_initializer='he_normal', kernel_regularizer=l2(l2_reg), name='conv5_3')(conv5_2) + pool5 = MaxPooling2D(pool_size=(3, 3), strides=(1, 1), padding='same', name='pool5')(conv5_3) + + fc6 = Conv2D(1024, (3, 3), dilation_rate=(6, 6), activation='relu', padding='same', kernel_initializer='he_normal', kernel_regularizer=l2(l2_reg), name='fc6')(pool5) + + fc7 = Conv2D(1024, (1, 1), activation='relu', padding='same', kernel_initializer='he_normal', kernel_regularizer=l2(l2_reg), name='fc7')(fc6) + + conv6_1 = Conv2D(256, (1, 1), activation='relu', padding='same', kernel_initializer='he_normal', kernel_regularizer=l2(l2_reg), name='conv6_1')(fc7) + conv6_1 = 
ZeroPadding2D(padding=((1, 1), (1, 1)), name='conv6_padding')(conv6_1) + conv6_2 = Conv2D(512, (3, 3), strides=(2, 2), activation='relu', padding='valid', kernel_initializer='he_normal', kernel_regularizer=l2(l2_reg), name='conv6_2')(conv6_1) + + conv7_1 = Conv2D(128, (1, 1), activation='relu', padding='same', kernel_initializer='he_normal', kernel_regularizer=l2(l2_reg), name='conv7_1')(conv6_2) + conv7_1 = ZeroPadding2D(padding=((1, 1), (1, 1)), name='conv7_padding')(conv7_1) + conv7_2 = Conv2D(256, (3, 3), strides=(2, 2), activation='relu', padding='valid', kernel_initializer='he_normal', kernel_regularizer=l2(l2_reg), name='conv7_2')(conv7_1) + + conv8_1 = Conv2D(128, (1, 1), activation='relu', padding='same', kernel_initializer='he_normal', kernel_regularizer=l2(l2_reg), name='conv8_1')(conv7_2) + conv8_1 = ZeroPadding2D(padding=((1, 1), (1, 1)), name='conv8_padding')(conv8_1) + conv8_2 = Conv2D(256, (3, 3), strides=(2, 2), activation='relu', padding='valid', kernel_initializer='he_normal', kernel_regularizer=l2(l2_reg), name='conv8_2')(conv8_1) + + conv9_1 = Conv2D(128, (1, 1), activation='relu', padding='same', kernel_initializer='he_normal', kernel_regularizer=l2(l2_reg), name='conv9_1')(conv8_2) + conv9_1 = ZeroPadding2D(padding=((1, 1), (1, 1)), name='conv9_padding')(conv9_1) + conv9_2 = Conv2D(256, (3, 3), strides=(2, 2), activation='relu', padding='valid', kernel_initializer='he_normal', kernel_regularizer=l2(l2_reg), name='conv9_2')(conv9_1) + + conv10_1 = Conv2D(128, (1, 1), activation='relu', padding='same', kernel_initializer='he_normal', kernel_regularizer=l2(l2_reg), name='conv10_1')(conv9_2) + conv10_1 = ZeroPadding2D(padding=((1, 1), (1, 1)), name='conv10_padding')(conv10_1) + conv10_2 = Conv2D(256, (4, 4), strides=(1, 1), activation='relu', padding='valid', kernel_initializer='he_normal', kernel_regularizer=l2(l2_reg), name='conv10_2')(conv10_1) + + # Feed conv4_3 into the L2 normalization layer + conv4_3_norm = L2Normalization(gamma_init=20, name='conv4_3_norm')(conv4_3) + + ### Build the convolutional predictor layers on top of the base network + + # We precidt `n_classes` confidence values for each box, hence the confidence predictors have depth `n_boxes * n_classes` + # Output shape of the confidence layers: `(batch, height, width, n_boxes * n_classes)` + conv4_3_norm_mbox_conf = Conv2D(n_boxes[0] * n_classes, (3, 3), padding='same', kernel_initializer='he_normal', kernel_regularizer=l2(l2_reg), name='conv4_3_norm_mbox_conf')(conv4_3_norm) + fc7_mbox_conf = Conv2D(n_boxes[1] * n_classes, (3, 3), padding='same', kernel_initializer='he_normal', kernel_regularizer=l2(l2_reg), name='fc7_mbox_conf')(fc7) + conv6_2_mbox_conf = Conv2D(n_boxes[2] * n_classes, (3, 3), padding='same', kernel_initializer='he_normal', kernel_regularizer=l2(l2_reg), name='conv6_2_mbox_conf')(conv6_2) + conv7_2_mbox_conf = Conv2D(n_boxes[3] * n_classes, (3, 3), padding='same', kernel_initializer='he_normal', kernel_regularizer=l2(l2_reg), name='conv7_2_mbox_conf')(conv7_2) + conv8_2_mbox_conf = Conv2D(n_boxes[4] * n_classes, (3, 3), padding='same', kernel_initializer='he_normal', kernel_regularizer=l2(l2_reg), name='conv8_2_mbox_conf')(conv8_2) + conv9_2_mbox_conf = Conv2D(n_boxes[5] * n_classes, (3, 3), padding='same', kernel_initializer='he_normal', kernel_regularizer=l2(l2_reg), name='conv9_2_mbox_conf')(conv9_2) + conv10_2_mbox_conf = Conv2D(n_boxes[6] * n_classes, (3, 3), padding='same', kernel_initializer='he_normal', kernel_regularizer=l2(l2_reg), name='conv10_2_mbox_conf')(conv10_2) + # 
We predict 4 box coordinates for each box, hence the localization predictors have depth `n_boxes * 4` + # Output shape of the localization layers: `(batch, height, width, n_boxes * 4)` + conv4_3_norm_mbox_loc = Conv2D(n_boxes[0] * 4, (3, 3), padding='same', kernel_initializer='he_normal', kernel_regularizer=l2(l2_reg), name='conv4_3_norm_mbox_loc')(conv4_3_norm) + fc7_mbox_loc = Conv2D(n_boxes[1] * 4, (3, 3), padding='same', kernel_initializer='he_normal', kernel_regularizer=l2(l2_reg), name='fc7_mbox_loc')(fc7) + conv6_2_mbox_loc = Conv2D(n_boxes[2] * 4, (3, 3), padding='same', kernel_initializer='he_normal', kernel_regularizer=l2(l2_reg), name='conv6_2_mbox_loc')(conv6_2) + conv7_2_mbox_loc = Conv2D(n_boxes[3] * 4, (3, 3), padding='same', kernel_initializer='he_normal', kernel_regularizer=l2(l2_reg), name='conv7_2_mbox_loc')(conv7_2) + conv8_2_mbox_loc = Conv2D(n_boxes[4] * 4, (3, 3), padding='same', kernel_initializer='he_normal', kernel_regularizer=l2(l2_reg), name='conv8_2_mbox_loc')(conv8_2) + conv9_2_mbox_loc = Conv2D(n_boxes[5] * 4, (3, 3), padding='same', kernel_initializer='he_normal', kernel_regularizer=l2(l2_reg), name='conv9_2_mbox_loc')(conv9_2) + conv10_2_mbox_loc = Conv2D(n_boxes[6] * 4, (3, 3), padding='same', kernel_initializer='he_normal', kernel_regularizer=l2(l2_reg), name='conv10_2_mbox_loc')(conv10_2) + + ### Generate the anchor boxes (called "priors" in the original Caffe/C++ implementation, so I'll keep their layer names) + + # Output shape of anchors: `(batch, height, width, n_boxes, 8)` + conv4_3_norm_mbox_priorbox = AnchorBoxes(img_height, img_width, this_scale=scales[0], next_scale=scales[1], aspect_ratios=aspect_ratios[0], + two_boxes_for_ar1=two_boxes_for_ar1, this_steps=steps[0], this_offsets=offsets[0], clip_boxes=clip_boxes, + variances=variances, coords=coords, normalize_coords=normalize_coords, name='conv4_3_norm_mbox_priorbox')(conv4_3_norm_mbox_loc) + fc7_mbox_priorbox = AnchorBoxes(img_height, img_width, this_scale=scales[1], next_scale=scales[2], aspect_ratios=aspect_ratios[1], + two_boxes_for_ar1=two_boxes_for_ar1, this_steps=steps[1], this_offsets=offsets[1], clip_boxes=clip_boxes, + variances=variances, coords=coords, normalize_coords=normalize_coords, name='fc7_mbox_priorbox')(fc7_mbox_loc) + conv6_2_mbox_priorbox = AnchorBoxes(img_height, img_width, this_scale=scales[2], next_scale=scales[3], aspect_ratios=aspect_ratios[2], + two_boxes_for_ar1=two_boxes_for_ar1, this_steps=steps[2], this_offsets=offsets[2], clip_boxes=clip_boxes, + variances=variances, coords=coords, normalize_coords=normalize_coords, name='conv6_2_mbox_priorbox')(conv6_2_mbox_loc) + conv7_2_mbox_priorbox = AnchorBoxes(img_height, img_width, this_scale=scales[3], next_scale=scales[4], aspect_ratios=aspect_ratios[3], + two_boxes_for_ar1=two_boxes_for_ar1, this_steps=steps[3], this_offsets=offsets[3], clip_boxes=clip_boxes, + variances=variances, coords=coords, normalize_coords=normalize_coords, name='conv7_2_mbox_priorbox')(conv7_2_mbox_loc) + conv8_2_mbox_priorbox = AnchorBoxes(img_height, img_width, this_scale=scales[4], next_scale=scales[5], aspect_ratios=aspect_ratios[4], + two_boxes_for_ar1=two_boxes_for_ar1, this_steps=steps[4], this_offsets=offsets[4], clip_boxes=clip_boxes, + variances=variances, coords=coords, normalize_coords=normalize_coords, name='conv8_2_mbox_priorbox')(conv8_2_mbox_loc) + conv9_2_mbox_priorbox = AnchorBoxes(img_height, img_width, this_scale=scales[5], next_scale=scales[6], aspect_ratios=aspect_ratios[5], + two_boxes_for_ar1=two_boxes_for_ar1, 
this_steps=steps[5], this_offsets=offsets[5], clip_boxes=clip_boxes, + variances=variances, coords=coords, normalize_coords=normalize_coords, name='conv9_2_mbox_priorbox')(conv9_2_mbox_loc) + conv10_2_mbox_priorbox = AnchorBoxes(img_height, img_width, this_scale=scales[6], next_scale=scales[7], aspect_ratios=aspect_ratios[6], + two_boxes_for_ar1=two_boxes_for_ar1, this_steps=steps[6], this_offsets=offsets[6], clip_boxes=clip_boxes, + variances=variances, coords=coords, normalize_coords=normalize_coords, name='conv10_2_mbox_priorbox')(conv10_2_mbox_loc) + + ### Reshape + + # Reshape the class predictions, yielding 3D tensors of shape `(batch, height * width * n_boxes, n_classes)` + # We want the classes isolated in the last axis to perform softmax on them + conv4_3_norm_mbox_conf_reshape = Reshape((-1, n_classes), name='conv4_3_norm_mbox_conf_reshape')(conv4_3_norm_mbox_conf) + fc7_mbox_conf_reshape = Reshape((-1, n_classes), name='fc7_mbox_conf_reshape')(fc7_mbox_conf) + conv6_2_mbox_conf_reshape = Reshape((-1, n_classes), name='conv6_2_mbox_conf_reshape')(conv6_2_mbox_conf) + conv7_2_mbox_conf_reshape = Reshape((-1, n_classes), name='conv7_2_mbox_conf_reshape')(conv7_2_mbox_conf) + conv8_2_mbox_conf_reshape = Reshape((-1, n_classes), name='conv8_2_mbox_conf_reshape')(conv8_2_mbox_conf) + conv9_2_mbox_conf_reshape = Reshape((-1, n_classes), name='conv9_2_mbox_conf_reshape')(conv9_2_mbox_conf) + conv10_2_mbox_conf_reshape = Reshape((-1, n_classes), name='conv10_2_mbox_conf_reshape')(conv10_2_mbox_conf) + # Reshape the box predictions, yielding 3D tensors of shape `(batch, height * width * n_boxes, 4)` + # We want the four box coordinates isolated in the last axis to compute the smooth L1 loss + conv4_3_norm_mbox_loc_reshape = Reshape((-1, 4), name='conv4_3_norm_mbox_loc_reshape')(conv4_3_norm_mbox_loc) + fc7_mbox_loc_reshape = Reshape((-1, 4), name='fc7_mbox_loc_reshape')(fc7_mbox_loc) + conv6_2_mbox_loc_reshape = Reshape((-1, 4), name='conv6_2_mbox_loc_reshape')(conv6_2_mbox_loc) + conv7_2_mbox_loc_reshape = Reshape((-1, 4), name='conv7_2_mbox_loc_reshape')(conv7_2_mbox_loc) + conv8_2_mbox_loc_reshape = Reshape((-1, 4), name='conv8_2_mbox_loc_reshape')(conv8_2_mbox_loc) + conv9_2_mbox_loc_reshape = Reshape((-1, 4), name='conv9_2_mbox_loc_reshape')(conv9_2_mbox_loc) + conv10_2_mbox_loc_reshape = Reshape((-1, 4), name='conv10_2_mbox_loc_reshape')(conv10_2_mbox_loc) + # Reshape the anchor box tensors, yielding 3D tensors of shape `(batch, height * width * n_boxes, 8)` + conv4_3_norm_mbox_priorbox_reshape = Reshape((-1, 8), name='conv4_3_norm_mbox_priorbox_reshape')(conv4_3_norm_mbox_priorbox) + fc7_mbox_priorbox_reshape = Reshape((-1, 8), name='fc7_mbox_priorbox_reshape')(fc7_mbox_priorbox) + conv6_2_mbox_priorbox_reshape = Reshape((-1, 8), name='conv6_2_mbox_priorbox_reshape')(conv6_2_mbox_priorbox) + conv7_2_mbox_priorbox_reshape = Reshape((-1, 8), name='conv7_2_mbox_priorbox_reshape')(conv7_2_mbox_priorbox) + conv8_2_mbox_priorbox_reshape = Reshape((-1, 8), name='conv8_2_mbox_priorbox_reshape')(conv8_2_mbox_priorbox) + conv9_2_mbox_priorbox_reshape = Reshape((-1, 8), name='conv9_2_mbox_priorbox_reshape')(conv9_2_mbox_priorbox) + conv10_2_mbox_priorbox_reshape = Reshape((-1, 8), name='conv10_2_mbox_priorbox_reshape')(conv10_2_mbox_priorbox) + + ### Concatenate the predictions from the different layers + + # Axis 0 (batch) and axis 2 (n_classes or 4, respectively) are identical for all layer predictions, + # so we want to concatenate along axis 1, the number of boxes per layer + # Output 
shape of `mbox_conf`: (batch, n_boxes_total, n_classes) + mbox_conf = Concatenate(axis=1, name='mbox_conf')([conv4_3_norm_mbox_conf_reshape, + fc7_mbox_conf_reshape, + conv6_2_mbox_conf_reshape, + conv7_2_mbox_conf_reshape, + conv8_2_mbox_conf_reshape, + conv9_2_mbox_conf_reshape, + conv10_2_mbox_conf_reshape]) + + # Output shape of `mbox_loc`: (batch, n_boxes_total, 4) + mbox_loc = Concatenate(axis=1, name='mbox_loc')([conv4_3_norm_mbox_loc_reshape, + fc7_mbox_loc_reshape, + conv6_2_mbox_loc_reshape, + conv7_2_mbox_loc_reshape, + conv8_2_mbox_loc_reshape, + conv9_2_mbox_loc_reshape, + conv10_2_mbox_loc_reshape]) + + # Output shape of `mbox_priorbox`: (batch, n_boxes_total, 8) + mbox_priorbox = Concatenate(axis=1, name='mbox_priorbox')([conv4_3_norm_mbox_priorbox_reshape, + fc7_mbox_priorbox_reshape, + conv6_2_mbox_priorbox_reshape, + conv7_2_mbox_priorbox_reshape, + conv8_2_mbox_priorbox_reshape, + conv9_2_mbox_priorbox_reshape, + conv10_2_mbox_priorbox_reshape]) + + # The box coordinate predictions will go into the loss function just the way they are, + # but for the class predictions, we'll apply a softmax activation layer first + mbox_conf_softmax = Activation('softmax', name='mbox_conf_softmax')(mbox_conf) + + # Concatenate the class and box predictions and the anchors to one large predictions vector + # Output shape of `predictions`: (batch, n_boxes_total, n_classes + 4 + 8) + predictions = Concatenate(axis=2, name='predictions')([mbox_conf_softmax, mbox_loc, mbox_priorbox]) + + if mode == 'training': + model = Model(inputs=x, outputs=predictions) + elif mode == 'inference': + decoded_predictions = DecodeDetections(confidence_thresh=confidence_thresh, + iou_threshold=iou_threshold, + top_k=top_k, + nms_max_output_size=nms_max_output_size, + coords=coords, + normalize_coords=normalize_coords, + img_height=img_height, + img_width=img_width, + name='decoded_predictions')(predictions) + model = Model(inputs=x, outputs=decoded_predictions) + elif mode == 'inference_fast': + decoded_predictions = DecodeDetectionsFast(confidence_thresh=confidence_thresh, + iou_threshold=iou_threshold, + top_k=top_k, + nms_max_output_size=nms_max_output_size, + coords=coords, + normalize_coords=normalize_coords, + img_height=img_height, + img_width=img_width, + name='decoded_predictions')(predictions) + model = Model(inputs=x, outputs=decoded_predictions) + else: + raise ValueError("`mode` must be one of 'training', 'inference' or 'inference_fast', but received '{}'.".format(mode)) + + if return_predictor_sizes: + predictor_sizes = np.array([conv4_3_norm_mbox_conf._keras_shape[1:3], + fc7_mbox_conf._keras_shape[1:3], + conv6_2_mbox_conf._keras_shape[1:3], + conv7_2_mbox_conf._keras_shape[1:3], + conv8_2_mbox_conf._keras_shape[1:3], + conv9_2_mbox_conf._keras_shape[1:3], + conv10_2_mbox_conf._keras_shape[1:3]]) + return model, predictor_sizes + else: + return model diff --git a/keras_ssd/models/keras_ssd7.py b/keras_ssd/models/keras_ssd7.py new file mode 100644 index 0000000..5409599 --- /dev/null +++ b/keras_ssd/models/keras_ssd7.py @@ -0,0 +1,430 @@ +''' +A small 7-layer Keras model with SSD architecture. Also serves as a template to build arbitrary network architectures. + +Copyright (C) 2018 Pierluigi Ferrari + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. 
+You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +''' + +from __future__ import division +import numpy as np +from keras.models import Model +from keras.layers import Input, Lambda, Conv2D, MaxPooling2D, BatchNormalization, ELU, Reshape, Concatenate, Activation +from keras.regularizers import l2 +import keras.backend as K + +from keras_layers.keras_layer_AnchorBoxes import AnchorBoxes +from keras_layers.keras_layer_DecodeDetections import DecodeDetections +from keras_layers.keras_layer_DecodeDetectionsFast import DecodeDetectionsFast + +def build_model(image_size, + n_classes, + mode='training', + l2_regularization=0.0, + min_scale=0.1, + max_scale=0.9, + scales=None, + aspect_ratios_global=[0.5, 1.0, 2.0], + aspect_ratios_per_layer=None, + two_boxes_for_ar1=True, + steps=None, + offsets=None, + clip_boxes=False, + variances=[1.0, 1.0, 1.0, 1.0], + coords='centroids', + normalize_coords=False, + subtract_mean=None, + divide_by_stddev=None, + swap_channels=False, + confidence_thresh=0.01, + iou_threshold=0.45, + top_k=200, + nms_max_output_size=400, + return_predictor_sizes=False): + ''' + Build a Keras model with SSD architecture, see references. + + The model consists of convolutional feature layers and a number of convolutional + predictor layers that take their input from different feature layers. + The model is fully convolutional. + + The implementation found here is a smaller version of the original architecture + used in the paper (where the base network consists of a modified VGG-16 extended + by a few convolutional feature layers), but of course it could easily be changed to + an arbitrarily large SSD architecture by following the general design pattern used here. + This implementation has 7 convolutional layers and 4 convolutional predictor + layers that take their input from layers 4, 5, 6, and 7, respectively. + + Most of the arguments that this function takes are only needed for the anchor + box layers. In case you're training the network, the parameters passed here must + be the same as the ones used to set up `SSDBoxEncoder`. In case you're loading + trained weights, the parameters passed here must be the same as the ones used + to produce the trained weights. + + Some of these arguments are explained in more detail in the documentation of the + `SSDBoxEncoder` class. + + Note: Requires Keras v2.0 or later. Training currently works only with the + TensorFlow backend (v1.0 or later). + + Arguments: + image_size (tuple): The input image size in the format `(height, width, channels)`. + n_classes (int): The number of positive classes, e.g. 20 for Pascal VOC, 80 for MS COCO. + mode (str, optional): One of 'training', 'inference' and 'inference_fast'. In 'training' mode, + the model outputs the raw prediction tensor, while in 'inference' and 'inference_fast' modes, + the raw predictions are decoded into absolute coordinates and filtered via confidence thresholding, + non-maximum suppression, and top-k filtering. The difference between latter two modes is that + 'inference' follows the exact procedure of the original Caffe implementation, while + 'inference_fast' uses a faster prediction decoding procedure. 
+ l2_regularization (float, optional): The L2-regularization rate. Applies to all convolutional layers. + min_scale (float, optional): The smallest scaling factor for the size of the anchor boxes as a fraction + of the shorter side of the input images. + max_scale (float, optional): The largest scaling factor for the size of the anchor boxes as a fraction + of the shorter side of the input images. All scaling factors between the smallest and the + largest will be linearly interpolated. Note that the second to last of the linearly interpolated + scaling factors will actually be the scaling factor for the last predictor layer, while the last + scaling factor is used for the second box for aspect ratio 1 in the last predictor layer + if `two_boxes_for_ar1` is `True`. + scales (list, optional): A list of floats containing scaling factors per convolutional predictor layer. + This list must be one element longer than the number of predictor layers. The first `k` elements are the + scaling factors for the `k` predictor layers, while the last element is used for the second box + for aspect ratio 1 in the last predictor layer if `two_boxes_for_ar1` is `True`. This additional + last scaling factor must be passed either way, even if it is not being used. If a list is passed, + this argument overrides `min_scale` and `max_scale`. All scaling factors must be greater than zero. + aspect_ratios_global (list, optional): The list of aspect ratios for which anchor boxes are to be + generated. This list is valid for all predictor layers. The original implementation uses more aspect ratios + for some predictor layers and fewer for others. If you want to do that, too, then use the next argument instead. + aspect_ratios_per_layer (list, optional): A list containing one aspect ratio list for each predictor layer. + This allows you to set the aspect ratios for each predictor layer individually. If a list is passed, + it overrides `aspect_ratios_global`. + two_boxes_for_ar1 (bool, optional): Only relevant for aspect ratio lists that contain 1. Will be ignored otherwise. + If `True`, two anchor boxes will be generated for aspect ratio 1. The first will be generated + using the scaling factor for the respective layer, the second one will be generated using + geometric mean of said scaling factor and next bigger scaling factor. + steps (list, optional): `None` or a list with as many elements as there are predictor layers. The elements can be + either ints/floats or tuples of two ints/floats. These numbers represent for each predictor layer how many + pixels apart the anchor box center points should be vertically and horizontally along the spatial grid over + the image. If the list contains ints/floats, then that value will be used for both spatial dimensions. + If the list contains tuples of two ints/floats, then they represent `(step_height, step_width)`. + If no steps are provided, then they will be computed such that the anchor box center points will form an + equidistant grid within the image dimensions. + offsets (list, optional): `None` or a list with as many elements as there are predictor layers. The elements can be + either floats or tuples of two floats. These numbers represent for each predictor layer how many + pixels from the top and left boarders of the image the top-most and left-most anchor box center points should be + as a fraction of `steps`. The last bit is important: The offsets are not absolute pixel values, but fractions + of the step size specified in the `steps` argument. 
If the list contains floats, then that value will + be used for both spatial dimensions. If the list contains tuples of two floats, then they represent + `(vertical_offset, horizontal_offset)`. If no offsets are provided, then they will default to 0.5 of the step size, + which is also the recommended setting. + clip_boxes (bool, optional): If `True`, clips the anchor box coordinates to stay within image boundaries. + variances (list, optional): A list of 4 floats >0. The anchor box offset for each coordinate will be divided by + its respective variance value. + coords (str, optional): The box coordinate format to be used internally by the model (i.e. this is not the input format + of the ground truth labels). Can be either 'centroids' for the format `(cx, cy, w, h)` (box center coordinates, width, + and height), 'minmax' for the format `(xmin, xmax, ymin, ymax)`, or 'corners' for the format `(xmin, ymin, xmax, ymax)`. + normalize_coords (bool, optional): Set to `True` if the model is supposed to use relative instead of absolute coordinates, + i.e. if the model predicts box coordinates within [0,1] instead of absolute coordinates. + subtract_mean (array-like, optional): `None` or an array-like object of integers or floating point values + of any shape that is broadcast-compatible with the image shape. The elements of this array will be + subtracted from the image pixel intensity values. For example, pass a list of three integers + to perform per-channel mean normalization for color images. + divide_by_stddev (array-like, optional): `None` or an array-like object of non-zero integers or + floating point values of any shape that is broadcast-compatible with the image shape. The image pixel + intensity values will be divided by the elements of this array. For example, pass a list + of three integers to perform per-channel standard deviation normalization for color images. + swap_channels (list, optional): Either `False` or a list of integers representing the desired order in which the input + image channels should be swapped. + confidence_thresh (float, optional): A float in [0,1), the minimum classification confidence in a specific + positive class in order to be considered for the non-maximum suppression stage for the respective class. + A lower value will result in a larger part of the selection process being done by the non-maximum suppression + stage, while a larger value will result in a larger part of the selection process happening in the confidence + thresholding stage. + iou_threshold (float, optional): A float in [0,1]. All boxes that have a Jaccard similarity of greater than `iou_threshold` + with a locally maximal box will be removed from the set of predictions for a given class, where 'maximal' refers + to the box's confidence score. + top_k (int, optional): The number of highest scoring predictions to be kept for each batch item after the + non-maximum suppression stage. + nms_max_output_size (int, optional): The maximal number of predictions that will be left over after the NMS stage. + return_predictor_sizes (bool, optional): If `True`, this function not only returns the model, but also + a list containing the spatial dimensions of the predictor layers. This isn't strictly necessary since + you can always get their sizes easily via the Keras API, but it's convenient and less error-prone + to get them this way. They are only relevant for training anyway (SSDBoxEncoder needs to know the + spatial dimensions of the predictor layers), for inference you don't need them. 
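+
+ Example (editor's illustrative sketch, not from the original documentation; the image size
+ and class count are arbitrary assumptions):
+     model = build_model(image_size=(300, 480, 3), n_classes=5, mode='training')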
+
+ Returns:
+ model: The Keras SSD model.
+ predictor_sizes (optional): A Numpy array containing the `(height, width)` portion
+ of the output tensor shape for each convolutional predictor layer. During
+ training, the generator function needs this in order to transform
+ the ground truth labels into tensors of identical structure as the
+ output tensors of the model, which is in turn needed for the cost
+ function.
+
+ References:
+ https://arxiv.org/abs/1512.02325v5
+ '''
+
+ n_predictor_layers = 4 # The number of predictor conv layers in the network
+ n_classes += 1 # Account for the background class.
+ l2_reg = l2_regularization # Make the internal name shorter.
+ img_height, img_width, img_channels = image_size[0], image_size[1], image_size[2]
+
+ ############################################################################
+ # Get a few exceptions out of the way.
+ ############################################################################
+
+ if aspect_ratios_global is None and aspect_ratios_per_layer is None:
+ raise ValueError("`aspect_ratios_global` and `aspect_ratios_per_layer` cannot both be None. At least one needs to be specified.")
+ if aspect_ratios_per_layer:
+ if len(aspect_ratios_per_layer) != n_predictor_layers:
+ raise ValueError("Either `aspect_ratios_per_layer` must be None or it must contain exactly {} aspect ratio lists (one per predictor layer), but it contains {}.".format(n_predictor_layers, len(aspect_ratios_per_layer)))
+
+ if (min_scale is None or max_scale is None) and scales is None:
+ raise ValueError("Either `min_scale` and `max_scale` or `scales` need to be specified.")
+ if scales:
+ if len(scales) != n_predictor_layers+1:
+ raise ValueError("Either `scales` must be None or it must contain exactly {} elements (one per predictor layer plus one), but it contains {}.".format(n_predictor_layers+1, len(scales)))
+ else: # If no explicit list of scaling factors was passed, compute the list of scaling factors from `min_scale` and `max_scale`
+ scales = np.linspace(min_scale, max_scale, n_predictor_layers+1)
+
+ if len(variances) != 4: # We need one variance value for each of the four box coordinates
+ raise ValueError("Exactly 4 variance values must be passed, but {} values were received.".format(len(variances)))
+ variances = np.array(variances)
+ if np.any(variances <= 0):
+ raise ValueError("All variances must be >0, but the variances given are {}".format(variances))
+
+ if (not (steps is None)) and (len(steps) != n_predictor_layers):
+ raise ValueError("You must provide exactly one step value per predictor layer.")
+
+ if (not (offsets is None)) and (len(offsets) != n_predictor_layers):
+ raise ValueError("You must provide exactly one offset value per predictor layer.")
+
+ ############################################################################
+ # Compute the anchor box parameters.
+ ############################################################################
+
+ # Set the aspect ratios for each predictor layer. These are only needed for the anchor box layers.
+ if aspect_ratios_per_layer:
+ aspect_ratios = aspect_ratios_per_layer
+ else:
+ aspect_ratios = [aspect_ratios_global] * n_predictor_layers
+
+ # Compute the number of boxes to be predicted per cell for each predictor layer.
+ # We need this so that we know how many channels the predictor layers need to have.
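+ # Editor's note (illustrative, not part of the original code): with the defaults
+ # (min_scale=0.1, max_scale=0.9, aspect_ratios_global=[0.5, 1.0, 2.0], two_boxes_for_ar1=True),
+ # the `scales` computed above come out as [0.1, 0.3, 0.5, 0.7, 0.9] and the branch below
+ # yields n_boxes == [4, 4, 4, 4].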
+ if aspect_ratios_per_layer: + n_boxes = [] + for ar in aspect_ratios_per_layer: + if (1 in ar) & two_boxes_for_ar1: + n_boxes.append(len(ar) + 1) # +1 for the second box for aspect ratio 1 + else: + n_boxes.append(len(ar)) + else: # If only a global aspect ratio list was passed, then the number of boxes is the same for each predictor layer + if (1 in aspect_ratios_global) & two_boxes_for_ar1: + n_boxes = len(aspect_ratios_global) + 1 + else: + n_boxes = len(aspect_ratios_global) + n_boxes = [n_boxes] * n_predictor_layers + + if steps is None: + steps = [None] * n_predictor_layers + if offsets is None: + offsets = [None] * n_predictor_layers + + ############################################################################ + # Define functions for the Lambda layers below. + ############################################################################ + + def identity_layer(tensor): + return tensor + + def input_mean_normalization(tensor): + return tensor - np.array(subtract_mean) + + def input_stddev_normalization(tensor): + return tensor / np.array(divide_by_stddev) + + def input_channel_swap(tensor): + if len(swap_channels) == 3: + return K.stack([tensor[...,swap_channels[0]], tensor[...,swap_channels[1]], tensor[...,swap_channels[2]]], axis=-1) + elif len(swap_channels) == 4: + return K.stack([tensor[...,swap_channels[0]], tensor[...,swap_channels[1]], tensor[...,swap_channels[2]], tensor[...,swap_channels[3]]], axis=-1) + + ############################################################################ + # Build the network. + ############################################################################ + + x = Input(shape=(img_height, img_width, img_channels)) + + # The following identity layer is only needed so that the subsequent lambda layers can be optional. 
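To see what the `n_boxes` bookkeeping above produces, here is a self-contained sketch with made-up per-layer aspect ratio lists:

aspect_ratios_per_layer = [[1.0, 2.0, 0.5],
                           [1.0, 2.0, 0.5, 3.0, 1.0/3.0],
                           [1.0, 2.0, 0.5],
                           [1.0, 2.0, 0.5]]
two_boxes_for_ar1 = True

n_boxes = [len(ar) + 1 if (1 in ar and two_boxes_for_ar1) else len(ar)
           for ar in aspect_ratios_per_layer]
# n_boxes == [4, 6, 4, 4]: the first predictor layer then needs 4 * n_classes output
# channels for its `classes` head and 4 * 4 channels for its `boxes` head, and so on.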
+ x1 = Lambda(identity_layer, output_shape=(img_height, img_width, img_channels), name='identity_layer')(x) + if not (subtract_mean is None): + x1 = Lambda(input_mean_normalization, output_shape=(img_height, img_width, img_channels), name='input_mean_normalization')(x1) + if not (divide_by_stddev is None): + x1 = Lambda(input_stddev_normalization, output_shape=(img_height, img_width, img_channels), name='input_stddev_normalization')(x1) + if swap_channels: + x1 = Lambda(input_channel_swap, output_shape=(img_height, img_width, img_channels), name='input_channel_swap')(x1) + + conv1 = Conv2D(32, (5, 5), strides=(1, 1), padding="same", kernel_initializer='he_normal', kernel_regularizer=l2(l2_reg), name='conv1')(x1) + conv1 = BatchNormalization(axis=3, momentum=0.99, name='bn1')(conv1) # Tensorflow uses filter format [filter_height, filter_width, in_channels, out_channels], hence axis = 3 + conv1 = ELU(name='elu1')(conv1) + pool1 = MaxPooling2D(pool_size=(2, 2), name='pool1')(conv1) + + conv2 = Conv2D(48, (3, 3), strides=(1, 1), padding="same", kernel_initializer='he_normal', kernel_regularizer=l2(l2_reg), name='conv2')(pool1) + conv2 = BatchNormalization(axis=3, momentum=0.99, name='bn2')(conv2) + conv2 = ELU(name='elu2')(conv2) + pool2 = MaxPooling2D(pool_size=(2, 2), name='pool2')(conv2) + + conv3 = Conv2D(64, (3, 3), strides=(1, 1), padding="same", kernel_initializer='he_normal', kernel_regularizer=l2(l2_reg), name='conv3')(pool2) + conv3 = BatchNormalization(axis=3, momentum=0.99, name='bn3')(conv3) + conv3 = ELU(name='elu3')(conv3) + pool3 = MaxPooling2D(pool_size=(2, 2), name='pool3')(conv3) + + conv4 = Conv2D(64, (3, 3), strides=(1, 1), padding="same", kernel_initializer='he_normal', kernel_regularizer=l2(l2_reg), name='conv4')(pool3) + conv4 = BatchNormalization(axis=3, momentum=0.99, name='bn4')(conv4) + conv4 = ELU(name='elu4')(conv4) + pool4 = MaxPooling2D(pool_size=(2, 2), name='pool4')(conv4) + + conv5 = Conv2D(48, (3, 3), strides=(1, 1), padding="same", kernel_initializer='he_normal', kernel_regularizer=l2(l2_reg), name='conv5')(pool4) + conv5 = BatchNormalization(axis=3, momentum=0.99, name='bn5')(conv5) + conv5 = ELU(name='elu5')(conv5) + pool5 = MaxPooling2D(pool_size=(2, 2), name='pool5')(conv5) + + conv6 = Conv2D(48, (3, 3), strides=(1, 1), padding="same", kernel_initializer='he_normal', kernel_regularizer=l2(l2_reg), name='conv6')(pool5) + conv6 = BatchNormalization(axis=3, momentum=0.99, name='bn6')(conv6) + conv6 = ELU(name='elu6')(conv6) + pool6 = MaxPooling2D(pool_size=(2, 2), name='pool6')(conv6) + + conv7 = Conv2D(32, (3, 3), strides=(1, 1), padding="same", kernel_initializer='he_normal', kernel_regularizer=l2(l2_reg), name='conv7')(pool6) + conv7 = BatchNormalization(axis=3, momentum=0.99, name='bn7')(conv7) + conv7 = ELU(name='elu7')(conv7) + + # The next part is to add the convolutional predictor layers on top of the base network + # that we defined above. Note that I use the term "base network" differently than the paper does. + # To me, the base network is everything that is not convolutional predictor layers or anchor + # box layers. In this case we'll have four predictor layers, but of course you could + # easily rewrite this into an arbitrarily deep base network and add an arbitrary number of + # predictor layers on top of the base network by simply following the pattern shown here. + + # Build the convolutional predictor layers on top of conv layers 4, 5, 6, and 7. 
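Each of the six max-pooling layers above halves the spatial resolution, so the four predictor sources conv4 through conv7 see the input at 1/8, 1/16, 1/32, and 1/64 resolution respectively. A quick sketch for a hypothetical 480x640 input (the image size is chosen only for illustration):

img_height, img_width = 480, 640  # illustrative input size

def downsampled(size, n_poolings):
    # MaxPooling2D with pool size (2, 2) and the default 'valid' padding floors the division.
    for _ in range(n_poolings):
        size //= 2
    return size

predictor_grids = {name: (downsampled(img_height, n), downsampled(img_width, n))
                   for name, n in [('conv4', 3), ('conv5', 4), ('conv6', 5), ('conv7', 6)]}
# {'conv4': (60, 80), 'conv5': (30, 40), 'conv6': (15, 20), 'conv7': (7, 10)}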
+ # We build two predictor layers on top of each of these layers: One for class prediction (classification), one for box coordinate prediction (localization) + # We precidt `n_classes` confidence values for each box, hence the `classes` predictors have depth `n_boxes * n_classes` + # We predict 4 box coordinates for each box, hence the `boxes` predictors have depth `n_boxes * 4` + # Output shape of `classes`: `(batch, height, width, n_boxes * n_classes)` + classes4 = Conv2D(n_boxes[0] * n_classes, (3, 3), strides=(1, 1), padding="same", kernel_initializer='he_normal', kernel_regularizer=l2(l2_reg), name='classes4')(conv4) + classes5 = Conv2D(n_boxes[1] * n_classes, (3, 3), strides=(1, 1), padding="same", kernel_initializer='he_normal', kernel_regularizer=l2(l2_reg), name='classes5')(conv5) + classes6 = Conv2D(n_boxes[2] * n_classes, (3, 3), strides=(1, 1), padding="same", kernel_initializer='he_normal', kernel_regularizer=l2(l2_reg), name='classes6')(conv6) + classes7 = Conv2D(n_boxes[3] * n_classes, (3, 3), strides=(1, 1), padding="same", kernel_initializer='he_normal', kernel_regularizer=l2(l2_reg), name='classes7')(conv7) + # Output shape of `boxes`: `(batch, height, width, n_boxes * 4)` + boxes4 = Conv2D(n_boxes[0] * 4, (3, 3), strides=(1, 1), padding="same", kernel_initializer='he_normal', kernel_regularizer=l2(l2_reg), name='boxes4')(conv4) + boxes5 = Conv2D(n_boxes[1] * 4, (3, 3), strides=(1, 1), padding="same", kernel_initializer='he_normal', kernel_regularizer=l2(l2_reg), name='boxes5')(conv5) + boxes6 = Conv2D(n_boxes[2] * 4, (3, 3), strides=(1, 1), padding="same", kernel_initializer='he_normal', kernel_regularizer=l2(l2_reg), name='boxes6')(conv6) + boxes7 = Conv2D(n_boxes[3] * 4, (3, 3), strides=(1, 1), padding="same", kernel_initializer='he_normal', kernel_regularizer=l2(l2_reg), name='boxes7')(conv7) + + # Generate the anchor boxes + # Output shape of `anchors`: `(batch, height, width, n_boxes, 8)` + anchors4 = AnchorBoxes(img_height, img_width, this_scale=scales[0], next_scale=scales[1], aspect_ratios=aspect_ratios[0], + two_boxes_for_ar1=two_boxes_for_ar1, this_steps=steps[0], this_offsets=offsets[0], + clip_boxes=clip_boxes, variances=variances, coords=coords, normalize_coords=normalize_coords, name='anchors4')(boxes4) + anchors5 = AnchorBoxes(img_height, img_width, this_scale=scales[1], next_scale=scales[2], aspect_ratios=aspect_ratios[1], + two_boxes_for_ar1=two_boxes_for_ar1, this_steps=steps[1], this_offsets=offsets[1], + clip_boxes=clip_boxes, variances=variances, coords=coords, normalize_coords=normalize_coords, name='anchors5')(boxes5) + anchors6 = AnchorBoxes(img_height, img_width, this_scale=scales[2], next_scale=scales[3], aspect_ratios=aspect_ratios[2], + two_boxes_for_ar1=two_boxes_for_ar1, this_steps=steps[2], this_offsets=offsets[2], + clip_boxes=clip_boxes, variances=variances, coords=coords, normalize_coords=normalize_coords, name='anchors6')(boxes6) + anchors7 = AnchorBoxes(img_height, img_width, this_scale=scales[3], next_scale=scales[4], aspect_ratios=aspect_ratios[3], + two_boxes_for_ar1=two_boxes_for_ar1, this_steps=steps[3], this_offsets=offsets[3], + clip_boxes=clip_boxes, variances=variances, coords=coords, normalize_coords=normalize_coords, name='anchors7')(boxes7) + + # Reshape the class predictions, yielding 3D tensors of shape `(batch, height * width * n_boxes, n_classes)` + # We want the classes isolated in the last axis to perform softmax on them + classes4_reshaped = Reshape((-1, n_classes), name='classes4_reshape')(classes4) + 
classes5_reshaped = Reshape((-1, n_classes), name='classes5_reshape')(classes5) + classes6_reshaped = Reshape((-1, n_classes), name='classes6_reshape')(classes6) + classes7_reshaped = Reshape((-1, n_classes), name='classes7_reshape')(classes7) + # Reshape the box coordinate predictions, yielding 3D tensors of shape `(batch, height * width * n_boxes, 4)` + # We want the four box coordinates isolated in the last axis to compute the smooth L1 loss + boxes4_reshaped = Reshape((-1, 4), name='boxes4_reshape')(boxes4) + boxes5_reshaped = Reshape((-1, 4), name='boxes5_reshape')(boxes5) + boxes6_reshaped = Reshape((-1, 4), name='boxes6_reshape')(boxes6) + boxes7_reshaped = Reshape((-1, 4), name='boxes7_reshape')(boxes7) + # Reshape the anchor box tensors, yielding 3D tensors of shape `(batch, height * width * n_boxes, 8)` + anchors4_reshaped = Reshape((-1, 8), name='anchors4_reshape')(anchors4) + anchors5_reshaped = Reshape((-1, 8), name='anchors5_reshape')(anchors5) + anchors6_reshaped = Reshape((-1, 8), name='anchors6_reshape')(anchors6) + anchors7_reshaped = Reshape((-1, 8), name='anchors7_reshape')(anchors7) + + # Concatenate the predictions from the different layers and the assosciated anchor box tensors + # Axis 0 (batch) and axis 2 (n_classes or 4, respectively) are identical for all layer predictions, + # so we want to concatenate along axis 1 + # Output shape of `classes_concat`: (batch, n_boxes_total, n_classes) + classes_concat = Concatenate(axis=1, name='classes_concat')([classes4_reshaped, + classes5_reshaped, + classes6_reshaped, + classes7_reshaped]) + + # Output shape of `boxes_concat`: (batch, n_boxes_total, 4) + boxes_concat = Concatenate(axis=1, name='boxes_concat')([boxes4_reshaped, + boxes5_reshaped, + boxes6_reshaped, + boxes7_reshaped]) + + # Output shape of `anchors_concat`: (batch, n_boxes_total, 8) + anchors_concat = Concatenate(axis=1, name='anchors_concat')([anchors4_reshaped, + anchors5_reshaped, + anchors6_reshaped, + anchors7_reshaped]) + + # The box coordinate predictions will go into the loss function just the way they are, + # but for the class predictions, we'll apply a softmax activation layer first + classes_softmax = Activation('softmax', name='classes_softmax')(classes_concat) + + # Concatenate the class and box coordinate predictions and the anchors to one large predictions tensor + # Output shape of `predictions`: (batch, n_boxes_total, n_classes + 4 + 8) + predictions = Concatenate(axis=2, name='predictions')([classes_softmax, boxes_concat, anchors_concat]) + + if mode == 'training': + model = Model(inputs=x, outputs=predictions) + elif mode == 'inference': + decoded_predictions = DecodeDetections(confidence_thresh=confidence_thresh, + iou_threshold=iou_threshold, + top_k=top_k, + nms_max_output_size=nms_max_output_size, + coords=coords, + normalize_coords=normalize_coords, + img_height=img_height, + img_width=img_width, + name='decoded_predictions')(predictions) + model = Model(inputs=x, outputs=decoded_predictions) + elif mode == 'inference_fast': + decoded_predictions = DecodeDetectionsFast(confidence_thresh=confidence_thresh, + iou_threshold=iou_threshold, + top_k=top_k, + nms_max_output_size=nms_max_output_size, + coords=coords, + normalize_coords=normalize_coords, + img_height=img_height, + img_width=img_width, + name='decoded_predictions')(predictions) + model = Model(inputs=x, outputs=decoded_predictions) + else: + raise ValueError("`mode` must be one of 'training', 'inference' or 'inference_fast', but received '{}'.".format(mode)) + + if 
return_predictor_sizes: + # The spatial dimensions are the same for the `classes` and `boxes` predictor layers. + predictor_sizes = np.array([classes4._keras_shape[1:3], + classes5._keras_shape[1:3], + classes6._keras_shape[1:3], + classes7._keras_shape[1:3]]) + return model, predictor_sizes + else: + return model diff --git a/keras_ssd/ssd_encoder_decoder/__init__.py b/keras_ssd/ssd_encoder_decoder/__init__.py new file mode 100644 index 0000000..e69de29 diff --git a/keras_ssd/ssd_encoder_decoder/matching_utils.py b/keras_ssd/ssd_encoder_decoder/matching_utils.py new file mode 100644 index 0000000..f1fcc90 --- /dev/null +++ b/keras_ssd/ssd_encoder_decoder/matching_utils.py @@ -0,0 +1,116 @@ +''' +Utilities to match ground truth boxes to anchor boxes. + +Copyright (C) 2018 Pierluigi Ferrari + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +''' + +from __future__ import division +import numpy as np + +def match_bipartite_greedy(weight_matrix): + ''' + Returns a bipartite matching according to the given weight matrix. + + The algorithm works as follows: + + Let the first axis of `weight_matrix` represent ground truth boxes + and the second axis anchor boxes. + The ground truth box that has the greatest similarity with any + anchor box will be matched first, then out of the remaining ground + truth boxes, the ground truth box that has the greatest similarity + with any of the remaining anchor boxes will be matched second, and + so on. That is, the ground truth boxes will be matched in descending + order by maximum similarity with any of the respectively remaining + anchor boxes. + The runtime complexity is O(m^2 * n), where `m` is the number of + ground truth boxes and `n` is the number of anchor boxes. + + Arguments: + weight_matrix (array): A 2D Numpy array that represents the weight matrix + for the matching process. If `(m,n)` is the shape of the weight matrix, + it must be `m <= n`. The weights can be integers or floating point + numbers. The matching process will maximize, i.e. larger weights are + preferred over smaller weights. + + Returns: + A 1D Numpy array of length `weight_matrix.shape[0]` that represents + the matched index along the second axis of `weight_matrix` for each index + along the first axis. + ''' + + weight_matrix = np.copy(weight_matrix) # We'll modify this array. + num_ground_truth_boxes = weight_matrix.shape[0] + all_gt_indices = list(range(num_ground_truth_boxes)) # Only relevant for fancy-indexing below. + + # This 1D array will contain for each ground truth box the index of + # the matched anchor box. + matches = np.zeros(num_ground_truth_boxes, dtype=np.int) + + # In each iteration of the loop below, exactly one ground truth box + # will be matched to one anchor box. + for _ in range(num_ground_truth_boxes): + + # Find the maximal anchor-ground truth pair in two steps: First, reduce + # over the anchor boxes and then reduce over the ground truth boxes. + anchor_indices = np.argmax(weight_matrix, axis=1) # Reduce along the anchor box axis. 
+ overlaps = weight_matrix[all_gt_indices, anchor_indices] + ground_truth_index = np.argmax(overlaps) # Reduce along the ground truth box axis. + anchor_index = anchor_indices[ground_truth_index] + matches[ground_truth_index] = anchor_index # Set the match. + + # Set the row of the matched ground truth box and the column of the matched + # anchor box to all zeros. This ensures that those boxes will not be matched again, + # because they will never be the best matches for any other boxes. + weight_matrix[ground_truth_index] = 0 + weight_matrix[:,anchor_index] = 0 + + return matches + +def match_multi(weight_matrix, threshold): + ''' + Matches all elements along the second axis of `weight_matrix` to their best + matches along the first axis subject to the constraint that the weight of a match + must be greater than or equal to `threshold` in order to produce a match. + + If the weight matrix contains elements that should be ignored, the row or column + representing the respective elemet should be set to a value below `threshold`. + + Arguments: + weight_matrix (array): A 2D Numpy array that represents the weight matrix + for the matching process. If `(m,n)` is the shape of the weight matrix, + it must be `m <= n`. The weights can be integers or floating point + numbers. The matching process will maximize, i.e. larger weights are + preferred over smaller weights. + threshold (float): A float that represents the threshold (i.e. lower bound) + that must be met by a pair of elements to produce a match. + + Returns: + Two 1D Numpy arrays of equal length that represent the matched indices. The first + array contains the indices along the first axis of `weight_matrix`, the second array + contains the indices along the second axis. + ''' + + num_anchor_boxes = weight_matrix.shape[1] + all_anchor_indices = list(range(num_anchor_boxes)) # Only relevant for fancy-indexing below. + + # Find the best ground truth match for every anchor box. + ground_truth_indices = np.argmax(weight_matrix, axis=0) # Array of shape (weight_matrix.shape[1],) + overlaps = weight_matrix[ground_truth_indices, all_anchor_indices] # Array of shape (weight_matrix.shape[1],) + + # Filter out the matches with a weight below the threshold. + anchor_indices_thresh_met = np.nonzero(overlaps >= threshold)[0] + gt_indices_thresh_met = ground_truth_indices[anchor_indices_thresh_met] + + return gt_indices_thresh_met, anchor_indices_thresh_met diff --git a/keras_ssd/ssd_encoder_decoder/ssd_input_encoder.py b/keras_ssd/ssd_encoder_decoder/ssd_input_encoder.py new file mode 100644 index 0000000..15fbb53 --- /dev/null +++ b/keras_ssd/ssd_encoder_decoder/ssd_input_encoder.py @@ -0,0 +1,617 @@ +''' +An encoder that converts ground truth annotations to SSD-compatible training targets. + +Copyright (C) 2018 Pierluigi Ferrari + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
+''' + +from __future__ import division +import numpy as np + +from bounding_box_utils.bounding_box_utils import iou, convert_coordinates +from ssd_encoder_decoder.matching_utils import match_bipartite_greedy, match_multi + +class SSDInputEncoder: + ''' + Transforms ground truth labels for object detection in images + (2D bounding box coordinates and class labels) to the format required for + training an SSD model. + + In the process of encoding the ground truth labels, a template of anchor boxes + is being built, which are subsequently matched to the ground truth boxes + via an intersection-over-union threshold criterion. + ''' + + def __init__(self, + img_height, + img_width, + n_classes, + predictor_sizes, + min_scale=0.1, + max_scale=0.9, + scales=None, + aspect_ratios_global=[0.5, 1.0, 2.0], + aspect_ratios_per_layer=None, + two_boxes_for_ar1=True, + steps=None, + offsets=None, + clip_boxes=False, + variances=[0.1, 0.1, 0.2, 0.2], + matching_type='multi', + pos_iou_threshold=0.5, + neg_iou_limit=0.3, + border_pixels='half', + coords='centroids', + normalize_coords=True, + background_id=0): + ''' + Arguments: + img_height (int): The height of the input images. + img_width (int): The width of the input images. + n_classes (int): The number of positive classes, e.g. 20 for Pascal VOC, 80 for MS COCO. + predictor_sizes (list): A list of int-tuples of the format `(height, width)` + containing the output heights and widths of the convolutional predictor layers. + min_scale (float, optional): The smallest scaling factor for the size of the anchor boxes as a fraction + of the shorter side of the input images. Note that you should set the scaling factors + such that the resulting anchor box sizes correspond to the sizes of the objects you are trying + to detect. Must be >0. + max_scale (float, optional): The largest scaling factor for the size of the anchor boxes as a fraction + of the shorter side of the input images. All scaling factors between the smallest and the + largest will be linearly interpolated. Note that the second to last of the linearly interpolated + scaling factors will actually be the scaling factor for the last predictor layer, while the last + scaling factor is used for the second box for aspect ratio 1 in the last predictor layer + if `two_boxes_for_ar1` is `True`. Note that you should set the scaling factors + such that the resulting anchor box sizes correspond to the sizes of the objects you are trying + to detect. Must be greater than or equal to `min_scale`. + scales (list, optional): A list of floats >0 containing scaling factors per convolutional predictor layer. + This list must be one element longer than the number of predictor layers. The first `k` elements are the + scaling factors for the `k` predictor layers, while the last element is used for the second box + for aspect ratio 1 in the last predictor layer if `two_boxes_for_ar1` is `True`. This additional + last scaling factor must be passed either way, even if it is not being used. If a list is passed, + this argument overrides `min_scale` and `max_scale`. All scaling factors must be greater than zero. + Note that you should set the scaling factors such that the resulting anchor box sizes correspond to + the sizes of the objects you are trying to detect. + aspect_ratios_global (list, optional): The list of aspect ratios for which anchor boxes are to be + generated. This list is valid for all prediction layers. 
Note that you should set the aspect ratios such + that the resulting anchor box shapes roughly correspond to the shapes of the objects you are trying to detect. + aspect_ratios_per_layer (list, optional): A list containing one aspect ratio list for each prediction layer. + If a list is passed, it overrides `aspect_ratios_global`. Note that you should set the aspect ratios such + that the resulting anchor box shapes very roughly correspond to the shapes of the objects you are trying to detect. + two_boxes_for_ar1 (bool, optional): Only relevant for aspect ratios lists that contain 1. Will be ignored otherwise. + If `True`, two anchor boxes will be generated for aspect ratio 1. The first will be generated + using the scaling factor for the respective layer, the second one will be generated using + geometric mean of said scaling factor and next bigger scaling factor. + steps (list, optional): `None` or a list with as many elements as there are predictor layers. The elements can be + either ints/floats or tuples of two ints/floats. These numbers represent for each predictor layer how many + pixels apart the anchor box center points should be vertically and horizontally along the spatial grid over + the image. If the list contains ints/floats, then that value will be used for both spatial dimensions. + If the list contains tuples of two ints/floats, then they represent `(step_height, step_width)`. + If no steps are provided, then they will be computed such that the anchor box center points will form an + equidistant grid within the image dimensions. + offsets (list, optional): `None` or a list with as many elements as there are predictor layers. The elements can be + either floats or tuples of two floats. These numbers represent for each predictor layer how many + pixels from the top and left boarders of the image the top-most and left-most anchor box center points should be + as a fraction of `steps`. The last bit is important: The offsets are not absolute pixel values, but fractions + of the step size specified in the `steps` argument. If the list contains floats, then that value will + be used for both spatial dimensions. If the list contains tuples of two floats, then they represent + `(vertical_offset, horizontal_offset)`. If no offsets are provided, then they will default to 0.5 of the step size. + clip_boxes (bool, optional): If `True`, limits the anchor box coordinates to stay within image boundaries. + variances (list, optional): A list of 4 floats >0. The anchor box offset for each coordinate will be divided by + its respective variance value. + matching_type (str, optional): Can be either 'multi' or 'bipartite'. In 'bipartite' mode, each ground truth box will + be matched only to the one anchor box with the highest IoU overlap. In 'multi' mode, in addition to the aforementioned + bipartite matching, all anchor boxes with an IoU overlap greater than or equal to the `pos_iou_threshold` will be + matched to a given ground truth box. + pos_iou_threshold (float, optional): The intersection-over-union similarity threshold that must be + met in order to match a given ground truth box to a given anchor box. + neg_iou_limit (float, optional): The maximum allowed intersection-over-union similarity of an + anchor box with any ground truth box to be labeled a negative (i.e. background) box. If an + anchor box is neither a positive, nor a negative box, it will be ignored during training. + border_pixels (str, optional): How to treat the border pixels of the bounding boxes. 
+ Can be 'include', 'exclude', or 'half'. If 'include', the border pixels belong + to the boxes. If 'exclude', the border pixels do not belong to the boxes. + If 'half', then one of each of the two horizontal and vertical borders belong + to the boxex, but not the other. + coords (str, optional): The box coordinate format to be used internally by the model (i.e. this is not the input format + of the ground truth labels). Can be either 'centroids' for the format `(cx, cy, w, h)` (box center coordinates, width, + and height), 'minmax' for the format `(xmin, xmax, ymin, ymax)`, or 'corners' for the format `(xmin, ymin, xmax, ymax)`. + normalize_coords (bool, optional): If `True`, the encoder uses relative instead of absolute coordinates. + This means instead of using absolute tartget coordinates, the encoder will scale all coordinates to be within [0,1]. + This way learning becomes independent of the input image size. + background_id (int, optional): Determines which class ID is for the background class. + ''' + predictor_sizes = np.array(predictor_sizes) + if predictor_sizes.ndim == 1: + predictor_sizes = np.expand_dims(predictor_sizes, axis=0) + + ################################################################################## + # Handle exceptions. + ################################################################################## + + if (min_scale is None or max_scale is None) and scales is None: + raise ValueError("Either `min_scale` and `max_scale` or `scales` need to be specified.") + + if scales: + if (len(scales) != predictor_sizes.shape[0] + 1): # Must be two nested `if` statements since `list` and `bool` cannot be combined by `&` + raise ValueError("It must be either scales is None or len(scales) == len(predictor_sizes)+1, but len(scales) == {} and len(predictor_sizes)+1 == {}".format(len(scales), len(predictor_sizes)+1)) + scales = np.array(scales) + if np.any(scales <= 0): + raise ValueError("All values in `scales` must be greater than 0, but the passed list of scales is {}".format(scales)) + else: # If no list of scales was passed, we need to make sure that `min_scale` and `max_scale` are valid values. 
+ if not 0 < min_scale <= max_scale: + raise ValueError("It must be 0 < min_scale <= max_scale, but it is min_scale = {} and max_scale = {}".format(min_scale, max_scale)) + + if not (aspect_ratios_per_layer is None): + if (len(aspect_ratios_per_layer) != predictor_sizes.shape[0]): # Must be two nested `if` statements since `list` and `bool` cannot be combined by `&` + raise ValueError("It must be either aspect_ratios_per_layer is None or len(aspect_ratios_per_layer) == len(predictor_sizes), but len(aspect_ratios_per_layer) == {} and len(predictor_sizes) == {}".format(len(aspect_ratios_per_layer), len(predictor_sizes))) + for aspect_ratios in aspect_ratios_per_layer: + if np.any(np.array(aspect_ratios) <= 0): + raise ValueError("All aspect ratios must be greater than zero.") + else: + if (aspect_ratios_global is None): + raise ValueError("At least one of `aspect_ratios_global` and `aspect_ratios_per_layer` must not be `None`.") + if np.any(np.array(aspect_ratios_global) <= 0): + raise ValueError("All aspect ratios must be greater than zero.") + + if len(variances) != 4: + raise ValueError("4 variance values must be pased, but {} values were received.".format(len(variances))) + variances = np.array(variances) + if np.any(variances <= 0): + raise ValueError("All variances must be >0, but the variances given are {}".format(variances)) + + if not (coords == 'minmax' or coords == 'centroids' or coords == 'corners'): + raise ValueError("Unexpected value for `coords`. Supported values are 'minmax', 'corners' and 'centroids'.") + + if (not (steps is None)) and (len(steps) != predictor_sizes.shape[0]): + raise ValueError("You must provide at least one step value per predictor layer.") + + if (not (offsets is None)) and (len(offsets) != predictor_sizes.shape[0]): + raise ValueError("You must provide at least one offset value per predictor layer.") + + ################################################################################## + # Set or compute members. + ################################################################################## + + self.img_height = img_height + self.img_width = img_width + self.n_classes = n_classes + 1 # + 1 for the background class + self.predictor_sizes = predictor_sizes + self.min_scale = min_scale + self.max_scale = max_scale + # If `scales` is None, compute the scaling factors by linearly interpolating between + # `min_scale` and `max_scale`. If an explicit list of `scales` is given, however, + # then it takes precedent over `min_scale` and `max_scale`. + if (scales is None): + self.scales = np.linspace(self.min_scale, self.max_scale, len(self.predictor_sizes)+1) + else: + # If a list of scales is given explicitly, we'll use that instead of computing it from `min_scale` and `max_scale`. + self.scales = scales + # If `aspect_ratios_per_layer` is None, then we use the same list of aspect ratios + # `aspect_ratios_global` for all predictor layers. If `aspect_ratios_per_layer` is given, + # however, then it takes precedent over `aspect_ratios_global`. + if (aspect_ratios_per_layer is None): + self.aspect_ratios = [aspect_ratios_global] * predictor_sizes.shape[0] + else: + # If aspect ratios are given per layer, we'll use those. 
+ self.aspect_ratios = aspect_ratios_per_layer + self.two_boxes_for_ar1 = two_boxes_for_ar1 + if not (steps is None): + self.steps = steps + else: + self.steps = [None] * predictor_sizes.shape[0] + if not (offsets is None): + self.offsets = offsets + else: + self.offsets = [None] * predictor_sizes.shape[0] + self.clip_boxes = clip_boxes + self.variances = variances + self.matching_type = matching_type + self.pos_iou_threshold = pos_iou_threshold + self.neg_iou_limit = neg_iou_limit + self.border_pixels = border_pixels + self.coords = coords + self.normalize_coords = normalize_coords + self.background_id = background_id + + # Compute the number of boxes per spatial location for each predictor layer. + # For example, if a predictor layer has three different aspect ratios, [1.0, 0.5, 2.0], and is + # supposed to predict two boxes of slightly different size for aspect ratio 1.0, then that predictor + # layer predicts a total of four boxes at every spatial location across the feature map. + if not (aspect_ratios_per_layer is None): + self.n_boxes = [] + for aspect_ratios in aspect_ratios_per_layer: + if (1 in aspect_ratios) & two_boxes_for_ar1: + self.n_boxes.append(len(aspect_ratios) + 1) + else: + self.n_boxes.append(len(aspect_ratios)) + else: + if (1 in aspect_ratios_global) & two_boxes_for_ar1: + self.n_boxes = len(aspect_ratios_global) + 1 + else: + self.n_boxes = len(aspect_ratios_global) + + ################################################################################## + # Compute the anchor boxes for each predictor layer. + ################################################################################## + + # Compute the anchor boxes for each predictor layer. We only have to do this once + # since the anchor boxes depend only on the model configuration, not on the input data. + # For each predictor layer (i.e. for each scaling factor) the tensors for that layer's + # anchor boxes will have the shape `(feature_map_height, feature_map_width, n_boxes, 4)`. + + self.boxes_list = [] # This will store the anchor boxes for each predicotr layer. + + # The following lists just store diagnostic information. Sometimes it's handy to have the + # boxes' center points, heights, widths, etc. in a list. + self.wh_list_diag = [] # Box widths and heights for each predictor layer + self.steps_diag = [] # Horizontal and vertical distances between any two boxes for each predictor layer + self.offsets_diag = [] # Offsets for each predictor layer + self.centers_diag = [] # Anchor box center points as `(cy, cx)` for each predictor layer + + # Iterate over all predictor layers and compute the anchor boxes for each one. + for i in range(len(self.predictor_sizes)): + boxes, center, wh, step, offset = self.generate_anchor_boxes_for_layer(feature_map_size=self.predictor_sizes[i], + aspect_ratios=self.aspect_ratios[i], + this_scale=self.scales[i], + next_scale=self.scales[i+1], + this_steps=self.steps[i], + this_offsets=self.offsets[i], + diagnostics=True) + self.boxes_list.append(boxes) + self.wh_list_diag.append(wh) + self.steps_diag.append(step) + self.offsets_diag.append(offset) + self.centers_diag.append(center) + + def __call__(self, ground_truth_labels, diagnostics=False): + ''' + Converts ground truth bounding box data into a suitable format to train an SSD model. + + Arguments: + ground_truth_labels (list): A python list of length `batch_size` that contains one 2D Numpy array + for each batch image. 
Each such array has `k` rows for the `k` ground truth bounding boxes belonging + to the respective image, and the data for each ground truth bounding box has the format + `(class_id, xmin, ymin, xmax, ymax)` (i.e. the 'corners' coordinate format), and `class_id` must be + an integer greater than 0 for all boxes as class ID 0 is reserved for the background class. + diagnostics (bool, optional): If `True`, not only the encoded ground truth tensor will be returned, + but also a copy of it with anchor box coordinates in place of the ground truth coordinates. + This can be very useful if you want to visualize which anchor boxes got matched to which ground truth + boxes. + + Returns: + `y_encoded`, a 3D numpy array of shape `(batch_size, #boxes, #classes + 4 + 4 + 4)` that serves as the + ground truth label tensor for training, where `#boxes` is the total number of boxes predicted by the + model per image, and the classes are one-hot-encoded. The four elements after the class vecotrs in + the last axis are the box coordinates, the next four elements after that are just dummy elements, and + the last four elements are the variances. + ''' + + # Mapping to define which indices represent which coordinates in the ground truth. + class_id = 0 + xmin = 1 + ymin = 2 + xmax = 3 + ymax = 4 + + batch_size = len(ground_truth_labels) + + ################################################################################## + # Generate the template for y_encoded. + ################################################################################## + + y_encoded = self.generate_encoding_template(batch_size=batch_size, diagnostics=False) + + ################################################################################## + # Match ground truth boxes to anchor boxes. + ################################################################################## + + # Match the ground truth boxes to the anchor boxes. Every anchor box that does not have + # a ground truth match and for which the maximal IoU overlap with any ground truth box is less + # than or equal to `neg_iou_limit` will be a negative (background) box. + + y_encoded[:, :, self.background_id] = 1 # All boxes are background boxes by default. + n_boxes = y_encoded.shape[1] # The total number of boxes that the model predicts per batch item + class_vectors = np.eye(self.n_classes) # An identity matrix that we'll use as one-hot class vectors + + for i in range(batch_size): # For each batch item... + + if ground_truth_labels[i].size == 0: continue # If there is no ground truth for this batch item, there is nothing to match. + labels = ground_truth_labels[i].astype(np.float) # The labels for this batch item + + # Check for degenerate ground truth bounding boxes before attempting any computations. + if np.any(labels[:,[xmax]] - labels[:,[xmin]] <= 0) or np.any(labels[:,[ymax]] - labels[:,[ymin]] <= 0): + raise DegenerateBoxError("SSDInputEncoder detected degenerate ground truth bounding boxes for batch item {} with bounding boxes {}, ".format(i, labels) + + "i.e. bounding boxes where xmax <= xmin and/or ymax <= ymin. Degenerate ground truth " + + "bounding boxes will lead to NaN errors during the training.") + + # Maybe normalize the box coordinates. + if self.normalize_coords: + labels[:,[ymin,ymax]] /= self.img_height # Normalize ymin and ymax relative to the image height + labels[:,[xmin,xmax]] /= self.img_width # Normalize xmin and xmax relative to the image width + + # Maybe convert the box coordinate format. 
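A numeric illustration of the label preparation steps above, for a single made-up box on a made-up 300x300 image:

import numpy as np

img_height, img_width = 300, 300
label = np.array([[8., 30., 60., 150., 210.]])  # (class_id, xmin, ymin, xmax, ymax)

label[:, [2, 4]] /= img_height  # normalize ymin, ymax
label[:, [1, 3]] /= img_width   # normalize xmin, xmax
# label is now [[8, 0.1, 0.2, 0.5, 0.7]]; converted to centroids (cx, cy, w, h) this
# box becomes roughly (0.3, 0.45, 0.4, 0.5), which is what the
# `convert_coordinates(..., conversion='corners2centroids')` call below produces
# (up to the border-pixel convention).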
+ if self.coords == 'centroids': + labels = convert_coordinates(labels, start_index=xmin, conversion='corners2centroids', border_pixels=self.border_pixels) + elif self.coords == 'minmax': + labels = convert_coordinates(labels, start_index=xmin, conversion='corners2minmax') + + classes_one_hot = class_vectors[labels[:, class_id].astype(np.int)] # The one-hot class IDs for the ground truth boxes of this batch item + labels_one_hot = np.concatenate([classes_one_hot, labels[:, [xmin,ymin,xmax,ymax]]], axis=-1) # The one-hot version of the labels for this batch item + + # Compute the IoU similarities between all anchor boxes and all ground truth boxes for this batch item. + # This is a matrix of shape `(num_ground_truth_boxes, num_anchor_boxes)`. + similarities = iou(labels[:,[xmin,ymin,xmax,ymax]], y_encoded[i,:,-12:-8], coords=self.coords, mode='outer_product', border_pixels=self.border_pixels) + + # First: Do bipartite matching, i.e. match each ground truth box to the one anchor box with the highest IoU. + # This ensures that each ground truth box will have at least one good match. + + # For each ground truth box, get the anchor box to match with it. + bipartite_matches = match_bipartite_greedy(weight_matrix=similarities) + + # Write the ground truth data to the matched anchor boxes. + y_encoded[i, bipartite_matches, :-8] = labels_one_hot + + # Set the columns of the matched anchor boxes to zero to indicate that they were matched. + similarities[:, bipartite_matches] = 0 + + # Second: Maybe do 'multi' matching, where each remaining anchor box will be matched to its most similar + # ground truth box with an IoU of at least `pos_iou_threshold`, or not matched if there is no + # such ground truth box. + + if self.matching_type == 'multi': + + # Get all matches that satisfy the IoU threshold. + matches = match_multi(weight_matrix=similarities, threshold=self.pos_iou_threshold) + + # Write the ground truth data to the matched anchor boxes. + y_encoded[i, matches[1], :-8] = labels_one_hot[matches[0]] + + # Set the columns of the matched anchor boxes to zero to indicate that they were matched. + similarities[:, matches[1]] = 0 + + # Third: Now after the matching is done, all negative (background) anchor boxes that have + # an IoU of `neg_iou_limit` or more with any ground truth box will be set to netral, + # i.e. they will no longer be background boxes. These anchors are "too close" to a + # ground truth box to be valid background boxes. + + max_background_similarities = np.amax(similarities, axis=0) + neutral_boxes = np.nonzero(max_background_similarities >= self.neg_iou_limit)[0] + y_encoded[i, neutral_boxes, self.background_id] = 0 + + ################################################################################## + # Convert box coordinates to anchor box offsets. 
+ ################################################################################## + + if self.coords == 'centroids': + y_encoded[:,:,[-12,-11]] -= y_encoded[:,:,[-8,-7]] # cx(gt) - cx(anchor), cy(gt) - cy(anchor) + y_encoded[:,:,[-12,-11]] /= y_encoded[:,:,[-6,-5]] * y_encoded[:,:,[-4,-3]] # (cx(gt) - cx(anchor)) / w(anchor) / cx_variance, (cy(gt) - cy(anchor)) / h(anchor) / cy_variance + y_encoded[:,:,[-10,-9]] /= y_encoded[:,:,[-6,-5]] # w(gt) / w(anchor), h(gt) / h(anchor) + y_encoded[:,:,[-10,-9]] = np.log(y_encoded[:,:,[-10,-9]]) / y_encoded[:,:,[-2,-1]] # ln(w(gt) / w(anchor)) / w_variance, ln(h(gt) / h(anchor)) / h_variance (ln == natural logarithm) + elif self.coords == 'corners': + y_encoded[:,:,-12:-8] -= y_encoded[:,:,-8:-4] # (gt - anchor) for all four coordinates + y_encoded[:,:,[-12,-10]] /= np.expand_dims(y_encoded[:,:,-6] - y_encoded[:,:,-8], axis=-1) # (xmin(gt) - xmin(anchor)) / w(anchor), (xmax(gt) - xmax(anchor)) / w(anchor) + y_encoded[:,:,[-11,-9]] /= np.expand_dims(y_encoded[:,:,-5] - y_encoded[:,:,-7], axis=-1) # (ymin(gt) - ymin(anchor)) / h(anchor), (ymax(gt) - ymax(anchor)) / h(anchor) + y_encoded[:,:,-12:-8] /= y_encoded[:,:,-4:] # (gt - anchor) / size(anchor) / variance for all four coordinates, where 'size' refers to w and h respectively + elif self.coords == 'minmax': + y_encoded[:,:,-12:-8] -= y_encoded[:,:,-8:-4] # (gt - anchor) for all four coordinates + y_encoded[:,:,[-12,-11]] /= np.expand_dims(y_encoded[:,:,-7] - y_encoded[:,:,-8], axis=-1) # (xmin(gt) - xmin(anchor)) / w(anchor), (xmax(gt) - xmax(anchor)) / w(anchor) + y_encoded[:,:,[-10,-9]] /= np.expand_dims(y_encoded[:,:,-5] - y_encoded[:,:,-6], axis=-1) # (ymin(gt) - ymin(anchor)) / h(anchor), (ymax(gt) - ymax(anchor)) / h(anchor) + y_encoded[:,:,-12:-8] /= y_encoded[:,:,-4:] # (gt - anchor) / size(anchor) / variance for all four coordinates, where 'size' refers to w and h respectively + + if diagnostics: + # Here we'll save the matched anchor boxes (i.e. anchor boxes that were matched to a ground truth box, but keeping the anchor box coordinates). + y_matched_anchors = np.copy(y_encoded) + y_matched_anchors[:,:,-12:-8] = 0 # Keeping the anchor box coordinates means setting the offsets to zero. + return y_encoded, y_matched_anchors + else: + return y_encoded + + def generate_anchor_boxes_for_layer(self, + feature_map_size, + aspect_ratios, + this_scale, + next_scale, + this_steps=None, + this_offsets=None, + diagnostics=False): + ''' + Computes an array of the spatial positions and sizes of the anchor boxes for one predictor layer + of size `feature_map_size == [feature_map_height, feature_map_width]`. + + Arguments: + feature_map_size (tuple): A list or tuple `[feature_map_height, feature_map_width]` with the spatial + dimensions of the feature map for which to generate the anchor boxes. + aspect_ratios (list): A list of floats, the aspect ratios for which anchor boxes are to be generated. + All list elements must be unique. + this_scale (float): A float in [0, 1], the scaling factor for the size of the generate anchor boxes + as a fraction of the shorter side of the input image. + next_scale (float): A float in [0, 1], the next larger scaling factor. Only relevant if + `self.two_boxes_for_ar1 == True`. + diagnostics (bool, optional): If true, the following additional outputs will be returned: + 1) A list of the center point `x` and `y` coordinates for each spatial location. + 2) A list containing `(width, height)` for each box aspect ratio. 
+ 3) A tuple containing `(step_height, step_width)` + 4) A tuple containing `(offset_height, offset_width)` + This information can be useful to understand in just a few numbers what the generated grid of + anchor boxes actually looks like, i.e. how large the different boxes are and how dense + their spatial distribution is, in order to determine whether the box grid covers the input images + appropriately and whether the box sizes are appropriate to fit the sizes of the objects + to be detected. + + Returns: + A 4D Numpy tensor of shape `(feature_map_height, feature_map_width, n_boxes_per_cell, 4)` where the + last dimension contains `(xmin, xmax, ymin, ymax)` for each anchor box in each cell of the feature map. + ''' + # Compute box width and height for each aspect ratio. + + # The shorter side of the image will be used to compute `w` and `h` using `scale` and `aspect_ratios`. + size = min(self.img_height, self.img_width) + # Compute the box widths and and heights for all aspect ratios + wh_list = [] + for ar in aspect_ratios: + if (ar == 1): + # Compute the regular anchor box for aspect ratio 1. + box_height = box_width = this_scale * size + wh_list.append((box_width, box_height)) + if self.two_boxes_for_ar1: + # Compute one slightly larger version using the geometric mean of this scale value and the next. + box_height = box_width = np.sqrt(this_scale * next_scale) * size + wh_list.append((box_width, box_height)) + else: + box_width = this_scale * size * np.sqrt(ar) + box_height = this_scale * size / np.sqrt(ar) + wh_list.append((box_width, box_height)) + wh_list = np.array(wh_list) + n_boxes = len(wh_list) + + # Compute the grid of box center points. They are identical for all aspect ratios. + + # Compute the step sizes, i.e. how far apart the anchor box center points will be vertically and horizontally. + if (this_steps is None): + step_height = self.img_height / feature_map_size[0] + step_width = self.img_width / feature_map_size[1] + else: + if isinstance(this_steps, (list, tuple)) and (len(this_steps) == 2): + step_height = this_steps[0] + step_width = this_steps[1] + elif isinstance(this_steps, (int, float)): + step_height = this_steps + step_width = this_steps + # Compute the offsets, i.e. at what pixel values the first anchor box center point will be from the top and from the left of the image. + if (this_offsets is None): + offset_height = 0.5 + offset_width = 0.5 + else: + if isinstance(this_offsets, (list, tuple)) and (len(this_offsets) == 2): + offset_height = this_offsets[0] + offset_width = this_offsets[1] + elif isinstance(this_offsets, (int, float)): + offset_height = this_offsets + offset_width = this_offsets + # Now that we have the offsets and step sizes, compute the grid of anchor box center points. 
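Before moving on to the center-point grid, here is a worked example of the box width/height computation above; the scale values, image size, and aspect ratio list are illustrative only.

import numpy as np

size = 300                        # shorter image side, illustrative
this_scale, next_scale = 0.2, 0.37
aspect_ratios = [1.0, 2.0, 0.5]
two_boxes_for_ar1 = True

wh_list = []
for ar in aspect_ratios:
    if ar == 1:
        wh_list.append((this_scale * size, this_scale * size))    # (60.0, 60.0)
        if two_boxes_for_ar1:
            s = np.sqrt(this_scale * next_scale) * size           # geometric-mean scale
            wh_list.append((s, s))                                # (~81.6, ~81.6)
    else:
        wh_list.append((this_scale * size * np.sqrt(ar),
                        this_scale * size / np.sqrt(ar)))         # (~84.9, ~42.4) for ar=2, (~42.4, ~84.9) for ar=0.5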
+ cy = np.linspace(offset_height * step_height, (offset_height + feature_map_size[0] - 1) * step_height, feature_map_size[0]) + cx = np.linspace(offset_width * step_width, (offset_width + feature_map_size[1] - 1) * step_width, feature_map_size[1]) + cx_grid, cy_grid = np.meshgrid(cx, cy) + cx_grid = np.expand_dims(cx_grid, -1) # This is necessary for np.tile() to do what we want further down + cy_grid = np.expand_dims(cy_grid, -1) # This is necessary for np.tile() to do what we want further down + + # Create a 4D tensor template of shape `(feature_map_height, feature_map_width, n_boxes, 4)` + # where the last dimension will contain `(cx, cy, w, h)` + boxes_tensor = np.zeros((feature_map_size[0], feature_map_size[1], n_boxes, 4)) + + boxes_tensor[:, :, :, 0] = np.tile(cx_grid, (1, 1, n_boxes)) # Set cx + boxes_tensor[:, :, :, 1] = np.tile(cy_grid, (1, 1, n_boxes)) # Set cy + boxes_tensor[:, :, :, 2] = wh_list[:, 0] # Set w + boxes_tensor[:, :, :, 3] = wh_list[:, 1] # Set h + + # Convert `(cx, cy, w, h)` to `(xmin, ymin, xmax, ymax)` + boxes_tensor = convert_coordinates(boxes_tensor, start_index=0, conversion='centroids2corners') + + # If `clip_boxes` is enabled, clip the coordinates to lie within the image boundaries + if self.clip_boxes: + x_coords = boxes_tensor[:,:,:,[0, 2]] + x_coords[x_coords >= self.img_width] = self.img_width - 1 + x_coords[x_coords < 0] = 0 + boxes_tensor[:,:,:,[0, 2]] = x_coords + y_coords = boxes_tensor[:,:,:,[1, 3]] + y_coords[y_coords >= self.img_height] = self.img_height - 1 + y_coords[y_coords < 0] = 0 + boxes_tensor[:,:,:,[1, 3]] = y_coords + + # `normalize_coords` is enabled, normalize the coordinates to be within [0,1] + if self.normalize_coords: + boxes_tensor[:, :, :, [0, 2]] /= self.img_width + boxes_tensor[:, :, :, [1, 3]] /= self.img_height + + # TODO: Implement box limiting directly for `(cx, cy, w, h)` so that we don't have to unnecessarily convert back and forth. + if self.coords == 'centroids': + # Convert `(xmin, ymin, xmax, ymax)` back to `(cx, cy, w, h)`. + boxes_tensor = convert_coordinates(boxes_tensor, start_index=0, conversion='corners2centroids', border_pixels='half') + elif self.coords == 'minmax': + # Convert `(xmin, ymin, xmax, ymax)` to `(xmin, xmax, ymin, ymax). + boxes_tensor = convert_coordinates(boxes_tensor, start_index=0, conversion='corners2minmax', border_pixels='half') + + if diagnostics: + return boxes_tensor, (cy, cx), wh_list, (step_height, step_width), (offset_height, offset_width) + else: + return boxes_tensor + + def generate_encoding_template(self, batch_size, diagnostics=False): + ''' + Produces an encoding template for the ground truth label tensor for a given batch. + + Note that all tensor creation, reshaping and concatenation operations performed in this function + and the sub-functions it calls are identical to those performed inside the SSD model. This, of course, + must be the case in order to preserve the spatial meaning of each box prediction, but it's useful to make + yourself aware of this fact and why it is necessary. + + In other words, the boxes in `y_encoded` must have a specific order in order correspond to the right spatial + positions and scales of the boxes predicted by the model. The sequence of operations here ensures that `y_encoded` + has this specific form. + + Arguments: + batch_size (int): The batch size. + diagnostics (bool, optional): See the documnentation for `generate_anchor_boxes()`. The diagnostic output + here is similar, just for all predictor conv layers. 
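The negative-index slicing used throughout this class is easier to follow with the last-axis layout of the encoded tensor spelled out; a minimal sketch, with a made-up class count:

import numpy as np

n_classes = 6                       # 5 positive classes + background, illustrative
last_axis = n_classes + 4 + 4 + 4   # [one-hot classes | 4 box coordinates | 4 anchor coordinates | 4 variances]

template = np.zeros((1, 10, last_axis))  # (batch_size, #boxes, ...)
class_part    = template[..., :-12]      # one-hot class vector
coords_part   = template[..., -12:-8]    # ground truth coordinates / offsets after encoding
anchor_part   = template[..., -8:-4]     # anchor box coordinates
variance_part = template[..., -4:]       # the four variance values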
+ + Returns: + A Numpy array of shape `(batch_size, #boxes, #classes + 12)`, the template into which to encode + the ground truth labels for training. The last axis has length `#classes + 12` because the model + output contains not only the 4 predicted box coordinate offsets, but also the 4 coordinates for + the anchor boxes and the 4 variance values. + ''' + # Tile the anchor boxes for each predictor layer across all batch items. + boxes_batch = [] + for boxes in self.boxes_list: + # Prepend one dimension to `self.boxes_list` to account for the batch size and tile it along. + # The result will be a 5D tensor of shape `(batch_size, feature_map_height, feature_map_width, n_boxes, 4)` + boxes = np.expand_dims(boxes, axis=0) + boxes = np.tile(boxes, (batch_size, 1, 1, 1, 1)) + + # Now reshape the 5D tensor above into a 3D tensor of shape + # `(batch, feature_map_height * feature_map_width * n_boxes, 4)`. The resulting + # order of the tensor content will be identical to the order obtained from the reshaping operation + # in our Keras model (we're using the Tensorflow backend, and tf.reshape() and np.reshape() + # use the same default index order, which is C-like index ordering) + boxes = np.reshape(boxes, (batch_size, -1, 4)) + boxes_batch.append(boxes) + + # Concatenate the anchor tensors from the individual layers to one. + boxes_tensor = np.concatenate(boxes_batch, axis=1) + + # 3: Create a template tensor to hold the one-hot class encodings of shape `(batch, #boxes, #classes)` + # It will contain all zeros for now, the classes will be set in the matching process that follows + classes_tensor = np.zeros((batch_size, boxes_tensor.shape[1], self.n_classes)) + + # 4: Create a tensor to contain the variances. This tensor has the same shape as `boxes_tensor` and simply + # contains the same 4 variance values for every position in the last axis. + variances_tensor = np.zeros_like(boxes_tensor) + variances_tensor += self.variances # Long live broadcasting + + # 4: Concatenate the classes, boxes and variances tensors to get our final template for y_encoded. We also need + # another tensor of the shape of `boxes_tensor` as a space filler so that `y_encoding_template` has the same + # shape as the SSD model output tensor. The content of this tensor is irrelevant, we'll just use + # `boxes_tensor` a second time. + y_encoding_template = np.concatenate((classes_tensor, boxes_tensor, boxes_tensor, variances_tensor), axis=2) + + if diagnostics: + return y_encoding_template, self.centers_diag, self.wh_list_diag, self.steps_diag, self.offsets_diag + else: + return y_encoding_template + +class DegenerateBoxError(Exception): + ''' + An exception class to be raised if degenerate boxes are being detected. + ''' + pass diff --git a/keras_ssd/ssd_encoder_decoder/ssd_output_decoder.py b/keras_ssd/ssd_encoder_decoder/ssd_output_decoder.py new file mode 100644 index 0000000..e6dce6a --- /dev/null +++ b/keras_ssd/ssd_encoder_decoder/ssd_output_decoder.py @@ -0,0 +1,530 @@ +''' +Includes: +* Functions to decode and filter raw SSD model output. These are only needed if the + SSD model does not have a `DecodeDetections` layer. +* Functions to perform greedy non-maximum suppression + +Copyright (C) 2018 Pierluigi Ferrari + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. 
+You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +''' + +from __future__ import division +import numpy as np + +from bounding_box_utils.bounding_box_utils import iou, convert_coordinates + +def greedy_nms(y_pred_decoded, iou_threshold=0.45, coords='corners', border_pixels='half'): + ''' + Perform greedy non-maximum suppression on the input boxes. + + Greedy NMS works by selecting the box with the highest score and + removing all boxes around it that are too close to it measured by IoU-similarity. + Out of the boxes that are left over, once again the one with the highest + score is selected and so on, until no boxes with too much overlap are left. + + Arguments: + y_pred_decoded (list): A batch of decoded predictions. For a given batch size `n` this + is a list of length `n` where each list element is a 2D Numpy array. + For a batch item with `k` predicted boxes this 2D Numpy array has + shape `(k, 6)`, where each row contains the coordinates of the respective + box in the format `[class_id, score, xmin, xmax, ymin, ymax]`. + Technically, the number of columns doesn't have to be 6, it can be + arbitrary as long as the first four elements of each row are + `xmin`, `xmax`, `ymin`, `ymax` (in this order) and the last element + is the score assigned to the prediction. Note that this function is + agnostic to the scale of the score or what it represents. + iou_threshold (float, optional): All boxes with a Jaccard similarity of + greater than `iou_threshold` with a locally maximal box will be removed + from the set of predictions, where 'maximal' refers to the box score. + coords (str, optional): The coordinate format of `y_pred_decoded`. + Can be one of the formats supported by `iou()`. + border_pixels (str, optional): How to treat the border pixels of the bounding boxes. + Can be 'include', 'exclude', or 'half'. If 'include', the border pixels belong + to the boxes. If 'exclude', the border pixels do not belong to the boxes. + If 'half', then one of each of the two horizontal and vertical borders belong + to the boxex, but not the other. + + Returns: + The predictions after removing non-maxima. The format is the same as the input format. + ''' + y_pred_decoded_nms = [] + for batch_item in y_pred_decoded: # For the labels of each batch item... + boxes_left = np.copy(batch_item) + maxima = [] # This is where we store the boxes that make it through the non-maximum suppression + while boxes_left.shape[0] > 0: # While there are still boxes left to compare... + maximum_index = np.argmax(boxes_left[:,1]) # ...get the index of the next box with the highest confidence... + maximum_box = np.copy(boxes_left[maximum_index]) # ...copy that box and... + maxima.append(maximum_box) # ...append it to `maxima` because we'll definitely keep it + boxes_left = np.delete(boxes_left, maximum_index, axis=0) # Now remove the maximum box from `boxes_left` + if boxes_left.shape[0] == 0: break # If there are no boxes left after this step, break. Otherwise... + similarities = iou(boxes_left[:,2:], maximum_box[2:], coords=coords, mode='element-wise', border_pixels=border_pixels) # ...compare (IoU) the other left over boxes to the maximum box... 
+ boxes_left = boxes_left[similarities <= iou_threshold] # ...so that we can remove the ones that overlap too much with the maximum box + y_pred_decoded_nms.append(np.array(maxima)) + + return y_pred_decoded_nms + +def _greedy_nms(predictions, iou_threshold=0.45, coords='corners', border_pixels='half'): + ''' + The same greedy non-maximum suppression algorithm as above, but slightly modified for use as an internal + function for per-class NMS in `decode_detections()`. + ''' + boxes_left = np.copy(predictions) + maxima = [] # This is where we store the boxes that make it through the non-maximum suppression + while boxes_left.shape[0] > 0: # While there are still boxes left to compare... + maximum_index = np.argmax(boxes_left[:,0]) # ...get the index of the next box with the highest confidence... + maximum_box = np.copy(boxes_left[maximum_index]) # ...copy that box and... + maxima.append(maximum_box) # ...append it to `maxima` because we'll definitely keep it + boxes_left = np.delete(boxes_left, maximum_index, axis=0) # Now remove the maximum box from `boxes_left` + if boxes_left.shape[0] == 0: break # If there are no boxes left after this step, break. Otherwise... + similarities = iou(boxes_left[:,1:], maximum_box[1:], coords=coords, mode='element-wise', border_pixels=border_pixels) # ...compare (IoU) the other left over boxes to the maximum box... + boxes_left = boxes_left[similarities <= iou_threshold] # ...so that we can remove the ones that overlap too much with the maximum box + return np.array(maxima) + +def _greedy_nms2(predictions, iou_threshold=0.45, coords='corners', border_pixels='half'): + ''' + The same greedy non-maximum suppression algorithm as above, but slightly modified for use as an internal + function in `decode_detections_fast()`. + ''' + boxes_left = np.copy(predictions) + maxima = [] # This is where we store the boxes that make it through the non-maximum suppression + while boxes_left.shape[0] > 0: # While there are still boxes left to compare... + maximum_index = np.argmax(boxes_left[:,1]) # ...get the index of the next box with the highest confidence... + maximum_box = np.copy(boxes_left[maximum_index]) # ...copy that box and... + maxima.append(maximum_box) # ...append it to `maxima` because we'll definitely keep it + boxes_left = np.delete(boxes_left, maximum_index, axis=0) # Now remove the maximum box from `boxes_left` + if boxes_left.shape[0] == 0: break # If there are no boxes left after this step, break. Otherwise... + similarities = iou(boxes_left[:,2:], maximum_box[2:], coords=coords, mode='element-wise', border_pixels=border_pixels) # ...compare (IoU) the other left over boxes to the maximum box... + boxes_left = boxes_left[similarities <= iou_threshold] # ...so that we can remove the ones that overlap too much with the maximum box + return np.array(maxima) + +def decode_detections(y_pred, + confidence_thresh=0.01, + iou_threshold=0.45, + top_k=200, + input_coords='centroids', + normalize_coords=True, + img_height=None, + img_width=None, + border_pixels='half'): + ''' + Convert model prediction output back to a format that contains only the positive box predictions + (i.e. the same format that `SSDInputEncoder` takes as input). + + After the decoding, two stages of prediction filtering are performed for each class individually: + First confidence thresholding, then greedy non-maximum suppression. 
The filtering results for all
+ classes are concatenated and the `top_k` overall highest confidence results constitute the final
+ predictions for a given batch item. This procedure follows the original Caffe implementation.
+ For a slightly different and more efficient alternative to decode raw model output that performs
+ non-maximum suppression globally instead of per class, see `decode_detections_fast()` below.
+
+ Arguments:
+ y_pred (array): The prediction output of the SSD model, expected to be a Numpy array
+ of shape `(batch_size, #boxes, #classes + 4 + 4 + 4)`, where `#boxes` is the total number of
+ boxes predicted by the model per image and the last axis contains
+ `[one-hot vector for the classes, 4 predicted coordinate offsets, 4 anchor box coordinates, 4 variances]`.
+ confidence_thresh (float, optional): A float in [0,1), the minimum classification confidence in a specific
+ positive class required for a box to be considered for the non-maximum suppression stage for the respective class.
+ A lower value will result in a larger part of the selection process being done by the non-maximum suppression
+ stage, while a larger value will result in a larger part of the selection process happening in the confidence
+ thresholding stage.
+ iou_threshold (float, optional): A float in [0,1]. All boxes with a Jaccard similarity of greater than `iou_threshold`
+ with a locally maximal box will be removed from the set of predictions for a given class, where 'maximal' refers
+ to the box score.
+ top_k (int, optional): The number of highest scoring predictions to be kept for each batch item after the
+ non-maximum suppression stage.
+ input_coords (str, optional): The box coordinate format that the model outputs. Can be either 'centroids'
+ for the format `(cx, cy, w, h)` (box center coordinates, width, and height), 'minmax' for the format
+ `(xmin, xmax, ymin, ymax)`, or 'corners' for the format `(xmin, ymin, xmax, ymax)`.
+ normalize_coords (bool, optional): Set to `True` if the model outputs relative coordinates (i.e. coordinates in [0,1])
+ and you wish to transform these relative coordinates back to absolute coordinates. If the model outputs
+ relative coordinates, but you do not want to convert them back to absolute coordinates, set this to `False`.
+ Do not set this to `True` if the model already outputs absolute coordinates, as that would result in incorrect
+ coordinates. Requires `img_height` and `img_width` if set to `True`.
+ img_height (int, optional): The height of the input images. Only needed if `normalize_coords` is `True`.
+ img_width (int, optional): The width of the input images. Only needed if `normalize_coords` is `True`.
+ border_pixels (str, optional): How to treat the border pixels of the bounding boxes.
+ Can be 'include', 'exclude', or 'half'. If 'include', the border pixels belong
+ to the boxes. If 'exclude', the border pixels do not belong to the boxes.
+ If 'half', then one of each of the two horizontal and vertical borders belong
+ to the boxes, but not the other.
+
+ Returns:
+ A Python list of length `batch_size` where each list element represents the predicted boxes
+ for one image and contains a Numpy array of shape `(boxes, 6)` where each row is a box prediction for
+ a non-background class for the respective image in the format `[class_id, confidence, xmin, ymin, xmax, ymax]`.
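+
+ Example:
+ A minimal, illustrative call. The toy `y_pred` array below is random data used purely to
+ demonstrate the expected input and output structure (here: 3 classes including the background
+ class, so the last axis has length 3 + 4 + 4 + 4); it does not come from a real model, and the
+ threshold and image-size values are arbitrary placeholders.
+
+ import numpy as np
+ y_pred = np.random.rand(2, 8, 3 + 4 + 4 + 4) # (batch_size, #boxes, #classes + 12)
+ y_pred_decoded = decode_detections(y_pred,
+ confidence_thresh=0.5,
+ iou_threshold=0.45,
+ top_k=200,
+ input_coords='centroids',
+ normalize_coords=True,
+ img_height=300,
+ img_width=300)
+ # `y_pred_decoded` is a list of length 2 (one entry per batch item); each row of an entry
+ # is one kept box in the format `[class_id, confidence, xmin, ymin, xmax, ymax]`.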
+ ''' + if normalize_coords and ((img_height is None) or (img_width is None)): + raise ValueError("If relative box coordinates are supposed to be converted to absolute coordinates, the decoder needs the image size in order to decode the predictions, but `img_height == {}` and `img_width == {}`".format(img_height, img_width)) + + # 1: Convert the box coordinates from the predicted anchor box offsets to predicted absolute coordinates + + y_pred_decoded_raw = np.copy(y_pred[:,:,:-8]) # Slice out the classes and the four offsets, throw away the anchor coordinates and variances, resulting in a tensor of shape `[batch, n_boxes, n_classes + 4 coordinates]` + + if input_coords == 'centroids': + y_pred_decoded_raw[:,:,[-2,-1]] = np.exp(y_pred_decoded_raw[:,:,[-2,-1]] * y_pred[:,:,[-2,-1]]) # exp(ln(w(pred)/w(anchor)) / w_variance * w_variance) == w(pred) / w(anchor), exp(ln(h(pred)/h(anchor)) / h_variance * h_variance) == h(pred) / h(anchor) + y_pred_decoded_raw[:,:,[-2,-1]] *= y_pred[:,:,[-6,-5]] # (w(pred) / w(anchor)) * w(anchor) == w(pred), (h(pred) / h(anchor)) * h(anchor) == h(pred) + y_pred_decoded_raw[:,:,[-4,-3]] *= y_pred[:,:,[-4,-3]] * y_pred[:,:,[-6,-5]] # (delta_cx(pred) / w(anchor) / cx_variance) * cx_variance * w(anchor) == delta_cx(pred), (delta_cy(pred) / h(anchor) / cy_variance) * cy_variance * h(anchor) == delta_cy(pred) + y_pred_decoded_raw[:,:,[-4,-3]] += y_pred[:,:,[-8,-7]] # delta_cx(pred) + cx(anchor) == cx(pred), delta_cy(pred) + cy(anchor) == cy(pred) + y_pred_decoded_raw = convert_coordinates(y_pred_decoded_raw, start_index=-4, conversion='centroids2corners') + elif input_coords == 'minmax': + y_pred_decoded_raw[:,:,-4:] *= y_pred[:,:,-4:] # delta(pred) / size(anchor) / variance * variance == delta(pred) / size(anchor) for all four coordinates, where 'size' refers to w or h, respectively + y_pred_decoded_raw[:,:,[-4,-3]] *= np.expand_dims(y_pred[:,:,-7] - y_pred[:,:,-8], axis=-1) # delta_xmin(pred) / w(anchor) * w(anchor) == delta_xmin(pred), delta_xmax(pred) / w(anchor) * w(anchor) == delta_xmax(pred) + y_pred_decoded_raw[:,:,[-2,-1]] *= np.expand_dims(y_pred[:,:,-5] - y_pred[:,:,-6], axis=-1) # delta_ymin(pred) / h(anchor) * h(anchor) == delta_ymin(pred), delta_ymax(pred) / h(anchor) * h(anchor) == delta_ymax(pred) + y_pred_decoded_raw[:,:,-4:] += y_pred[:,:,-8:-4] # delta(pred) + anchor == pred for all four coordinates + y_pred_decoded_raw = convert_coordinates(y_pred_decoded_raw, start_index=-4, conversion='minmax2corners') + elif input_coords == 'corners': + y_pred_decoded_raw[:,:,-4:] *= y_pred[:,:,-4:] # delta(pred) / size(anchor) / variance * variance == delta(pred) / size(anchor) for all four coordinates, where 'size' refers to w or h, respectively + y_pred_decoded_raw[:,:,[-4,-2]] *= np.expand_dims(y_pred[:,:,-6] - y_pred[:,:,-8], axis=-1) # delta_xmin(pred) / w(anchor) * w(anchor) == delta_xmin(pred), delta_xmax(pred) / w(anchor) * w(anchor) == delta_xmax(pred) + y_pred_decoded_raw[:,:,[-3,-1]] *= np.expand_dims(y_pred[:,:,-5] - y_pred[:,:,-7], axis=-1) # delta_ymin(pred) / h(anchor) * h(anchor) == delta_ymin(pred), delta_ymax(pred) / h(anchor) * h(anchor) == delta_ymax(pred) + y_pred_decoded_raw[:,:,-4:] += y_pred[:,:,-8:-4] # delta(pred) + anchor == pred for all four coordinates + else: + raise ValueError("Unexpected value for `input_coords`. 
Supported input coordinate formats are 'minmax', 'corners' and 'centroids'.")
+
+ # 2: If the model predicts normalized box coordinates and they are supposed to be converted back to absolute coordinates, do that
+
+ if normalize_coords:
+ y_pred_decoded_raw[:,:,[-4,-2]] *= img_width # Convert xmin, xmax back to absolute coordinates
+ y_pred_decoded_raw[:,:,[-3,-1]] *= img_height # Convert ymin, ymax back to absolute coordinates
+
+ # 3: Apply confidence thresholding and non-maximum suppression per class
+
+ n_classes = y_pred_decoded_raw.shape[-1] - 4 # The number of classes is the length of the last axis minus the four box coordinates
+
+ y_pred_decoded = [] # Store the final predictions in this list
+ for batch_item in y_pred_decoded_raw: # `batch_item` has shape `[n_boxes, n_classes + 4 coords]`
+ pred = [] # Store the final predictions for this batch item here
+ for class_id in range(1, n_classes): # For each class except the background class (which has class ID 0)...
+ single_class = batch_item[:,[class_id, -4, -3, -2, -1]] # ...keep only the confidences for that class, making this an array of shape `[n_boxes, 5]` and...
+ threshold_met = single_class[single_class[:,0] > confidence_thresh] # ...keep only those boxes with a confidence above the set threshold.
+ if threshold_met.shape[0] > 0: # If any boxes made the threshold...
+ maxima = _greedy_nms(threshold_met, iou_threshold=iou_threshold, coords='corners', border_pixels=border_pixels) # ...perform NMS on them.
+ maxima_output = np.zeros((maxima.shape[0], maxima.shape[1] + 1)) # Expand the last dimension by one element to have room for the class ID. This is now an array of shape `[n_boxes, 6]`
+ maxima_output[:,0] = class_id # Write the class ID to the first column...
+ maxima_output[:,1:] = maxima # ...and write the maxima to the other columns...
+ pred.append(maxima_output) # ...and append the maxima for this class to the list of maxima for this batch item.
+ # Once we're through with all classes, keep only the `top_k` maxima with the highest scores
+ if pred: # If there are any predictions left after confidence-thresholding...
+ pred = np.concatenate(pred, axis=0)
+ if top_k != 'all' and pred.shape[0] > top_k: # If we have more than `top_k` results left at this point (otherwise there is nothing to filter)...
+ top_k_indices = np.argpartition(pred[:,1], kth=pred.shape[0]-top_k, axis=0)[pred.shape[0]-top_k:] # ...get the indices of the `top_k` highest-score maxima...
+ pred = pred[top_k_indices] # ...and keep only those entries of `pred`...
+ else:
+ pred = np.array(pred) # Even if empty, `pred` must become a Numpy array.
+ y_pred_decoded.append(pred) # ...and now that we're done, append the array of final predictions for this batch item to the output list
+
+ return y_pred_decoded
+
+def decode_detections_fast(y_pred,
+ confidence_thresh=0.5,
+ iou_threshold=0.45,
+ top_k='all',
+ input_coords='centroids',
+ normalize_coords=True,
+ img_height=None,
+ img_width=None,
+ border_pixels='half'):
+ '''
+ Convert model prediction output back to a format that contains only the positive box predictions
+ (i.e. the same format that `SSDInputEncoder` takes as input).
+
+ Optionally performs confidence thresholding and greedy non-maximum suppression after the decoding stage.
+
+ Note that the decoding procedure used here is not the same as the procedure used in the original Caffe implementation.
+ For each box, the procedure used here assigns the class with the highest confidence as the box's predicted class.
Then it removes
+ all boxes for which the highest confidence is the background class. This results in less work for the subsequent
+ non-maximum suppression, because the vast majority of the predictions will be filtered out just by the fact that
+ their highest confidence is for the background class. It is much more efficient than the procedure of the original
+ implementation, but the results may also differ.
+
+ Arguments:
+ y_pred (array): The prediction output of the SSD model, expected to be a Numpy array
+ of shape `(batch_size, #boxes, #classes + 4 + 4 + 4)`, where `#boxes` is the total number of
+ boxes predicted by the model per image and the last axis contains
+ `[one-hot vector for the classes, 4 predicted coordinate offsets, 4 anchor box coordinates, 4 variances]`.
+ confidence_thresh (float, optional): A float in [0,1), the minimum classification confidence in any positive
+ class required for a given box to be considered a positive prediction. A lower value will result
+ in better recall, while a higher value will result in better precision. Do not use this parameter
+ to combat the inevitably many duplicates that an SSD will produce; the subsequent non-maximum suppression
+ stage will take care of those.
+ iou_threshold (float, optional): `None` or a float in [0,1]. If `None`, no non-maximum suppression will be
+ performed. If not `None`, greedy NMS will be performed after the confidence thresholding stage, meaning
+ all boxes with a Jaccard similarity of greater than `iou_threshold` with a locally maximal box will be removed
+ from the set of predictions, where 'maximal' refers to the box score.
+ top_k (int, optional): 'all' or an integer giving the number of highest scoring predictions to be kept for each batch item
+ after the non-maximum suppression stage. If 'all', all predictions left after the NMS stage will be kept.
+ input_coords (str, optional): The box coordinate format that the model outputs. Can be either 'centroids'
+ for the format `(cx, cy, w, h)` (box center coordinates, width, and height), 'minmax' for the format
+ `(xmin, xmax, ymin, ymax)`, or 'corners' for the format `(xmin, ymin, xmax, ymax)`.
+ normalize_coords (bool, optional): Set to `True` if the model outputs relative coordinates (i.e. coordinates in [0,1])
+ and you wish to transform these relative coordinates back to absolute coordinates. If the model outputs
+ relative coordinates, but you do not want to convert them back to absolute coordinates, set this to `False`.
+ Do not set this to `True` if the model already outputs absolute coordinates, as that would result in incorrect
+ coordinates. Requires `img_height` and `img_width` if set to `True`.
+ img_height (int, optional): The height of the input images. Only needed if `normalize_coords` is `True`.
+ img_width (int, optional): The width of the input images. Only needed if `normalize_coords` is `True`.
+ border_pixels (str, optional): How to treat the border pixels of the bounding boxes.
+ Can be 'include', 'exclude', or 'half'. If 'include', the border pixels belong
+ to the boxes. If 'exclude', the border pixels do not belong to the boxes.
+ If 'half', then one of each of the two horizontal and vertical borders belong
+ to the boxes, but not the other.
+
+ Returns:
+ A Python list of length `batch_size` where each list element represents the predicted boxes
+ for one image and contains a Numpy array of shape `(boxes, 6)` where each row is a box prediction for
+ a non-background class for the respective image in the format `[class_id, confidence, xmin, ymin, xmax, ymax]`.
+ '''
+ if normalize_coords and ((img_height is None) or (img_width is None)):
+ raise ValueError("If relative box coordinates are supposed to be converted to absolute coordinates, the decoder needs the image size in order to decode the predictions, but `img_height == {}` and `img_width == {}`".format(img_height, img_width))
+
+ # 1: Convert the classes from one-hot encoding to their class ID
+ y_pred_converted = np.copy(y_pred[:,:,-14:-8]) # Slice out the four offset predictions plus two elements into which we'll write the class IDs and confidences in the next step
+ y_pred_converted[:,:,0] = np.argmax(y_pred[:,:,:-12], axis=-1) # The indices of the highest confidence values in the one-hot class vectors are the class ID
+ y_pred_converted[:,:,1] = np.amax(y_pred[:,:,:-12], axis=-1) # Store the confidence values themselves, too
+
+ # 2: Convert the box coordinates from the predicted anchor box offsets to predicted absolute coordinates
+ if input_coords == 'centroids':
+ y_pred_converted[:,:,[4,5]] = np.exp(y_pred_converted[:,:,[4,5]] * y_pred[:,:,[-2,-1]]) # exp(ln(w(pred)/w(anchor)) / w_variance * w_variance) == w(pred) / w(anchor), exp(ln(h(pred)/h(anchor)) / h_variance * h_variance) == h(pred) / h(anchor)
+ y_pred_converted[:,:,[4,5]] *= y_pred[:,:,[-6,-5]] # (w(pred) / w(anchor)) * w(anchor) == w(pred), (h(pred) / h(anchor)) * h(anchor) == h(pred)
+ y_pred_converted[:,:,[2,3]] *= y_pred[:,:,[-4,-3]] * y_pred[:,:,[-6,-5]] # (delta_cx(pred) / w(anchor) / cx_variance) * cx_variance * w(anchor) == delta_cx(pred), (delta_cy(pred) / h(anchor) / cy_variance) * cy_variance * h(anchor) == delta_cy(pred)
+ y_pred_converted[:,:,[2,3]] += y_pred[:,:,[-8,-7]] # delta_cx(pred) + cx(anchor) == cx(pred), delta_cy(pred) + cy(anchor) == cy(pred)
+ y_pred_converted = convert_coordinates(y_pred_converted, start_index=-4, conversion='centroids2corners')
+ elif input_coords == 'minmax':
+ y_pred_converted[:,:,2:] *= y_pred[:,:,-4:] # delta(pred) / size(anchor) / variance * variance == delta(pred) / size(anchor) for all four coordinates, where 'size' refers to w or h, respectively
+ y_pred_converted[:,:,[2,3]] *= np.expand_dims(y_pred[:,:,-7] - y_pred[:,:,-8], axis=-1) # delta_xmin(pred) / w(anchor) * w(anchor) == delta_xmin(pred), delta_xmax(pred) / w(anchor) * w(anchor) == delta_xmax(pred)
+ y_pred_converted[:,:,[4,5]] *= np.expand_dims(y_pred[:,:,-5] - y_pred[:,:,-6], axis=-1) # delta_ymin(pred) / h(anchor) * h(anchor) == delta_ymin(pred), delta_ymax(pred) / h(anchor) * h(anchor) == delta_ymax(pred)
+ y_pred_converted[:,:,2:] += y_pred[:,:,-8:-4] # delta(pred) + anchor == pred for all four coordinates
+ y_pred_converted = convert_coordinates(y_pred_converted, start_index=-4, conversion='minmax2corners')
+ elif input_coords == 'corners':
+ y_pred_converted[:,:,2:] *= y_pred[:,:,-4:] # delta(pred) / size(anchor) / variance * variance == delta(pred) / size(anchor) for all four coordinates, where 'size' refers to w or h, respectively
+ y_pred_converted[:,:,[2,4]] *= np.expand_dims(y_pred[:,:,-6] - y_pred[:,:,-8], axis=-1) # delta_xmin(pred) / w(anchor) * w(anchor) == delta_xmin(pred), delta_xmax(pred) / w(anchor) * w(anchor) == delta_xmax(pred)
+ y_pred_converted[:,:,[3,5]] *=
np.expand_dims(y_pred[:,:,-5] - y_pred[:,:,-7], axis=-1) # delta_ymin(pred) / h(anchor) * h(anchor) == delta_ymin(pred), delta_ymax(pred) / h(anchor) * h(anchor) == delta_ymax(pred) + y_pred_converted[:,:,2:] += y_pred[:,:,-8:-4] # delta(pred) + anchor == pred for all four coordinates + else: + raise ValueError("Unexpected value for `coords`. Supported values are 'minmax', 'corners' and 'centroids'.") + + # 3: If the model predicts normalized box coordinates and they are supposed to be converted back to absolute coordinates, do that + if normalize_coords: + y_pred_converted[:,:,[2,4]] *= img_width # Convert xmin, xmax back to absolute coordinates + y_pred_converted[:,:,[3,5]] *= img_height # Convert ymin, ymax back to absolute coordinates + + # 4: Decode our huge `(batch, #boxes, 6)` tensor into a list of length `batch` where each list entry is an array containing only the positive predictions + y_pred_decoded = [] + for batch_item in y_pred_converted: # For each image in the batch... + boxes = batch_item[np.nonzero(batch_item[:,0])] # ...get all boxes that don't belong to the background class,... + boxes = boxes[boxes[:,1] >= confidence_thresh] # ...then filter out those positive boxes for which the prediction confidence is too low and after that... + if iou_threshold: # ...if an IoU threshold is set... + boxes = _greedy_nms2(boxes, iou_threshold=iou_threshold, coords='corners', border_pixels=border_pixels) # ...perform NMS on the remaining boxes. + if top_k != 'all' and boxes.shape[0] > top_k: # If we have more than `top_k` results left at this point... + top_k_indices = np.argpartition(boxes[:,1], kth=boxes.shape[0]-top_k, axis=0)[boxes.shape[0]-top_k:] # ...get the indices of the `top_k` highest-scoring boxes... + boxes = boxes[top_k_indices] # ...and keep only those boxes... + y_pred_decoded.append(boxes) # ...and now that we're done, append the array of final predictions for this batch item to the output list + + return y_pred_decoded + +################################################################################################ +# Debugging tools, not relevant for normal use +################################################################################################ + +# The functions below are for debugging, so you won't normally need them. That is, +# unless you need to debug your model, of course. + +def decode_detections_debug(y_pred, + confidence_thresh=0.01, + iou_threshold=0.45, + top_k=200, + input_coords='centroids', + normalize_coords=True, + img_height=None, + img_width=None, + variance_encoded_in_target=False, + border_pixels='half'): + ''' + This decoder performs the same processing as `decode_detections()`, but the output format for each left-over + predicted box is `[box_id, class_id, confidence, xmin, ymin, xmax, ymax]`. + + That is, in addition to the usual data, each predicted box has the internal index of that box within + the model (`box_id`) prepended to it. This allows you to know exactly which part of the model made a given + box prediction; in particular, it allows you to know which predictor layer made a given prediction. + This can be useful for debugging. + + Arguments: + y_pred (array): The prediction output of the SSD model, expected to be a Numpy array + of shape `(batch_size, #boxes, #classes + 4 + 4 + 4)`, where `#boxes` is the total number of + boxes predicted by the model per image and the last axis contains + `[one-hot vector for the classes, 4 predicted coordinate offsets, 4 anchor box coordinates, 4 variances]`. 
+ confidence_thresh (float, optional): A float in [0,1), the minimum classification confidence in a specific
+ positive class required for a box to be considered for the non-maximum suppression stage for the respective class.
+ A lower value will result in a larger part of the selection process being done by the non-maximum suppression
+ stage, while a larger value will result in a larger part of the selection process happening in the confidence
+ thresholding stage.
+ iou_threshold (float, optional): A float in [0,1]. All boxes with a Jaccard similarity of greater than `iou_threshold`
+ with a locally maximal box will be removed from the set of predictions for a given class, where 'maximal' refers
+ to the box score.
+ top_k (int, optional): The number of highest scoring predictions to be kept for each batch item after the
+ non-maximum suppression stage.
+ input_coords (str, optional): The box coordinate format that the model outputs. Can be either 'centroids'
+ for the format `(cx, cy, w, h)` (box center coordinates, width, and height), 'minmax' for the format
+ `(xmin, xmax, ymin, ymax)`, or 'corners' for the format `(xmin, ymin, xmax, ymax)`.
+ normalize_coords (bool, optional): Set to `True` if the model outputs relative coordinates (i.e. coordinates in [0,1])
+ and you wish to transform these relative coordinates back to absolute coordinates. If the model outputs
+ relative coordinates, but you do not want to convert them back to absolute coordinates, set this to `False`.
+ Do not set this to `True` if the model already outputs absolute coordinates, as that would result in incorrect
+ coordinates. Requires `img_height` and `img_width` if set to `True`.
+ img_height (int, optional): The height of the input images. Only needed if `normalize_coords` is `True`.
+ img_width (int, optional): The width of the input images. Only needed if `normalize_coords` is `True`.
+ variance_encoded_in_target (bool, optional): Set to `True` if the variance values were encoded into the target
+ offsets, in which case the offsets are decoded without multiplying by the variance values.
+ border_pixels (str, optional): How to treat the border pixels of the bounding boxes.
+ Can be 'include', 'exclude', or 'half'. If 'include', the border pixels belong
+ to the boxes. If 'exclude', the border pixels do not belong to the boxes.
+ If 'half', then one of each of the two horizontal and vertical borders belong
+ to the boxes, but not the other.
+
+ Returns:
+ A Python list of length `batch_size` where each list element represents the predicted boxes
+ for one image and contains a Numpy array of shape `(boxes, 7)` where each row is a box prediction for
+ a non-background class for the respective image in the format `[box_id, class_id, confidence, xmin, ymin, xmax, ymax]`.
+ '''
+ if normalize_coords and ((img_height is None) or (img_width is None)):
+ raise ValueError("If relative box coordinates are supposed to be converted to absolute coordinates, the decoder needs the image size in order to decode the predictions, but `img_height == {}` and `img_width == {}`".format(img_height, img_width))
+
+ # 1: Convert the box coordinates from the predicted anchor box offsets to predicted absolute coordinates
+
+ y_pred_decoded_raw = np.copy(y_pred[:,:,:-8]) # Slice out the classes and the four offsets, throw away the anchor coordinates and variances, resulting in a tensor of shape `[batch, n_boxes, n_classes + 4 coordinates]`
+
+ if input_coords == 'centroids':
+ if variance_encoded_in_target:
+ # Decode the predicted box center x and y coordinates.
+ y_pred_decoded_raw[:,:,[-4,-3]] = y_pred_decoded_raw[:,:,[-4,-3]] * y_pred[:,:,[-6,-5]] + y_pred[:,:,[-8,-7]]
+ # Decode the predicted box width and height.
+ y_pred_decoded_raw[:,:,[-2,-1]] = np.exp(y_pred_decoded_raw[:,:,[-2,-1]]) * y_pred[:,:,[-6,-5]]
+ else:
+ # Decode the predicted box center x and y coordinates.
+ y_pred_decoded_raw[:,:,[-4,-3]] = y_pred_decoded_raw[:,:,[-4,-3]] * y_pred[:,:,[-6,-5]] * y_pred[:,:,[-4,-3]] + y_pred[:,:,[-8,-7]]
+ # Decode the predicted box width and height.
+ y_pred_decoded_raw[:,:,[-2,-1]] = np.exp(y_pred_decoded_raw[:,:,[-2,-1]] * y_pred[:,:,[-2,-1]]) * y_pred[:,:,[-6,-5]]
+ y_pred_decoded_raw = convert_coordinates(y_pred_decoded_raw, start_index=-4, conversion='centroids2corners')
+ elif input_coords == 'minmax':
+ y_pred_decoded_raw[:,:,-4:] *= y_pred[:,:,-4:] # delta(pred) / size(anchor) / variance * variance == delta(pred) / size(anchor) for all four coordinates, where 'size' refers to w or h, respectively
+ y_pred_decoded_raw[:,:,[-4,-3]] *= np.expand_dims(y_pred[:,:,-7] - y_pred[:,:,-8], axis=-1) # delta_xmin(pred) / w(anchor) * w(anchor) == delta_xmin(pred), delta_xmax(pred) / w(anchor) * w(anchor) == delta_xmax(pred)
+ y_pred_decoded_raw[:,:,[-2,-1]] *= np.expand_dims(y_pred[:,:,-5] - y_pred[:,:,-6], axis=-1) # delta_ymin(pred) / h(anchor) * h(anchor) == delta_ymin(pred), delta_ymax(pred) / h(anchor) * h(anchor) == delta_ymax(pred)
+ y_pred_decoded_raw[:,:,-4:] += y_pred[:,:,-8:-4] # delta(pred) + anchor == pred for all four coordinates
+ y_pred_decoded_raw = convert_coordinates(y_pred_decoded_raw, start_index=-4, conversion='minmax2corners')
+ elif input_coords == 'corners':
+ y_pred_decoded_raw[:,:,-4:] *= y_pred[:,:,-4:] # delta(pred) / size(anchor) / variance * variance == delta(pred) / size(anchor) for all four coordinates, where 'size' refers to w or h, respectively
+ y_pred_decoded_raw[:,:,[-4,-2]] *= np.expand_dims(y_pred[:,:,-6] - y_pred[:,:,-8], axis=-1) # delta_xmin(pred) / w(anchor) * w(anchor) == delta_xmin(pred), delta_xmax(pred) / w(anchor) * w(anchor) == delta_xmax(pred)
+ y_pred_decoded_raw[:,:,[-3,-1]] *= np.expand_dims(y_pred[:,:,-5] - y_pred[:,:,-7], axis=-1) # delta_ymin(pred) / h(anchor) * h(anchor) == delta_ymin(pred), delta_ymax(pred) / h(anchor) * h(anchor) == delta_ymax(pred)
+ y_pred_decoded_raw[:,:,-4:] += y_pred[:,:,-8:-4] # delta(pred) + anchor == pred for all four coordinates
+ else:
+ raise ValueError("Unexpected value for `input_coords`. Supported input coordinate formats are 'minmax', 'corners' and 'centroids'.")
+
+ # 2: If the model predicts normalized box coordinates and they are supposed to be converted back to absolute coordinates, do that
+
+ if normalize_coords:
+ y_pred_decoded_raw[:,:,[-4,-2]] *= img_width # Convert xmin, xmax back to absolute coordinates
+ y_pred_decoded_raw[:,:,[-3,-1]] *= img_height # Convert ymin, ymax back to absolute coordinates
+
+ # 3: For each batch item, prepend each box's internal index to its coordinates.
+
+ y_pred_decoded_raw2 = np.zeros((y_pred_decoded_raw.shape[0], y_pred_decoded_raw.shape[1], y_pred_decoded_raw.shape[2] + 1)) # Expand the last axis by one.
+ y_pred_decoded_raw2[:,:,1:] = y_pred_decoded_raw
+ y_pred_decoded_raw2[:,:,0] = np.arange(y_pred_decoded_raw.shape[1]) # Put the box indices as the first element for each box via broadcasting.
+ y_pred_decoded_raw = y_pred_decoded_raw2
+
+ # 4: Apply confidence thresholding and non-maximum suppression per class
+
+ n_classes = y_pred_decoded_raw.shape[-1] - 5 # The number of classes is the length of the last axis minus the four box coordinates and minus the index
+
+ y_pred_decoded = [] # Store the final predictions in this list
+ for batch_item in y_pred_decoded_raw: # `batch_item` has shape `[n_boxes, 1 + n_classes + 4 coords]`
+ pred = [] # Store the final predictions for this batch item here
+ for class_id in range(1, n_classes): # For each class except the background class (which has class ID 0)...
+ single_class = batch_item[:,[0, class_id + 1, -4, -3, -2, -1]] # ...keep only the confidences for that class, making this an array of shape `[n_boxes, 6]` and...
+ threshold_met = single_class[single_class[:,1] > confidence_thresh] # ...keep only those boxes with a confidence above the set threshold.
+ if threshold_met.shape[0] > 0: # If any boxes made the threshold...
+ maxima = _greedy_nms_debug(threshold_met, iou_threshold=iou_threshold, coords='corners', border_pixels=border_pixels) # ...perform NMS on them.
+ maxima_output = np.zeros((maxima.shape[0], maxima.shape[1] + 1)) # Expand the last dimension by one element to have room for the class ID. This is now an array of shape `[n_boxes, 7]`
+ maxima_output[:,0] = maxima[:,0] # Write the box index to the first column...
+ maxima_output[:,1] = class_id # ...and write the class ID to the second column...
+ maxima_output[:,2:] = maxima[:,1:] # ...and write the rest of the maxima data to the other columns...
+ pred.append(maxima_output) # ...and append the maxima for this class to the list of maxima for this batch item.
+ # Once we're through with all classes, keep only the `top_k` maxima with the highest scores
+ pred = np.concatenate(pred, axis=0)
+ if pred.shape[0] > top_k: # If we have more than `top_k` results left at this point (otherwise there is nothing to filter)...
+ top_k_indices = np.argpartition(pred[:,2], kth=pred.shape[0]-top_k, axis=0)[pred.shape[0]-top_k:] # ...get the indices of the `top_k` highest-score maxima...
+ pred = pred[top_k_indices] # ...and keep only those entries of `pred`...
+ y_pred_decoded.append(pred) # ...and now that we're done, append the array of final predictions for this batch item to the output list
+
+ return y_pred_decoded
+
+def _greedy_nms_debug(predictions, iou_threshold=0.45, coords='corners', border_pixels='half'):
+ '''
+ The same greedy non-maximum suppression algorithm as above, but slightly modified for use as an internal
+ function for per-class NMS in `decode_detections_debug()`. The difference is that it keeps the indices of all
+ left-over boxes for each batch item, which allows you to know which predictor layer predicted a given output
+ box and is thus useful for debugging.
+ '''
+ boxes_left = np.copy(predictions)
+ maxima = [] # This is where we store the boxes that make it through the non-maximum suppression
+ while boxes_left.shape[0] > 0: # While there are still boxes left to compare...
+ maximum_index = np.argmax(boxes_left[:,1]) # ...get the index of the next box with the highest confidence...
+ maximum_box = np.copy(boxes_left[maximum_index]) # ...copy that box and...
+ maxima.append(maximum_box) # ...append it to `maxima` because we'll definitely keep it
+ boxes_left = np.delete(boxes_left, maximum_index, axis=0) # Now remove the maximum box from `boxes_left`
+ if boxes_left.shape[0] == 0: break # If there are no boxes left after this step, break. Otherwise...
+ similarities = iou(boxes_left[:,2:], maximum_box[2:], coords=coords, mode='element-wise', border_pixels=border_pixels) # ...compare (IoU) the other left over boxes to the maximum box...
+ boxes_left = boxes_left[similarities <= iou_threshold] # ...so that we can remove the ones that overlap too much with the maximum box
+ return np.array(maxima)
+
+def get_num_boxes_per_pred_layer(predictor_sizes, aspect_ratios, two_boxes_for_ar1):
+ '''
+ Returns a list of the number of boxes that each predictor layer predicts.
+
+ `aspect_ratios` must be a nested list, containing a list of aspect ratios
+ for each predictor layer.
+ '''
+ num_boxes_per_pred_layer = []
+ for i in range(len(predictor_sizes)):
+ if two_boxes_for_ar1:
+ num_boxes_per_pred_layer.append(predictor_sizes[i][0] * predictor_sizes[i][1] * (len(aspect_ratios[i]) + 1))
+ else:
+ num_boxes_per_pred_layer.append(predictor_sizes[i][0] * predictor_sizes[i][1] * len(aspect_ratios[i]))
+ return num_boxes_per_pred_layer
+
+def get_pred_layers(y_pred_decoded, num_boxes_per_pred_layer):
+ '''
+ For a given prediction tensor decoded with `decode_detections_debug()`, returns a list
+ with the indices of the predictor layers that made each prediction.
+
+ That is, this function lets you know which predictor layer is responsible
+ for a given prediction.
+
+ Arguments:
+ y_pred_decoded (list): The decoded model output. Must have been
+ decoded with `decode_detections_debug()` so that it contains the internal box index
+ for each predicted box.
+ num_boxes_per_pred_layer (list): A list that contains the total number
+ of boxes that each predictor layer predicts.
+ '''
+ pred_layers_all = []
+ cum_boxes_per_pred_layer = np.cumsum(num_boxes_per_pred_layer)
+ for batch_item in y_pred_decoded:
+ pred_layers = []
+ for prediction in batch_item:
+ if (prediction[0] < 0) or (prediction[0] >= cum_boxes_per_pred_layer[-1]):
+ raise ValueError("Box index is out of bounds of the possible indices as given by the values in `num_boxes_per_pred_layer`.")
+ for i in range(len(cum_boxes_per_pred_layer)):
+ if prediction[0] < cum_boxes_per_pred_layer[i]:
+ pred_layers.append(i)
+ break
+ pred_layers_all.append(pred_layers)
+ return pred_layers_all
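+
+# Illustrative usage sketch for the debugging helpers above. The predictor layout, class count and
+# anchor values below are made-up placeholder assumptions, and the class scores are random numbers
+# rather than real model output; the block only demonstrates how the functions fit together.
+if __name__ == '__main__':
+    predictor_sizes = [(4, 4), (2, 2)]                  # hypothetical predictor layer sizes
+    aspect_ratios = [[1.0, 2.0, 0.5], [1.0, 2.0, 0.5]]  # hypothetical aspect ratios per layer
+    num_boxes = get_num_boxes_per_pred_layer(predictor_sizes, aspect_ratios, two_boxes_for_ar1=True)
+    n_boxes_total = sum(num_boxes)                      # 4*4*4 + 2*2*4 = 80 boxes
+
+    n_classes = 3                                       # including the background class
+    y_pred = np.zeros((1, n_boxes_total, n_classes + 12))
+    y_pred[:, :, :n_classes] = np.random.rand(1, n_boxes_total, n_classes)  # fake class confidences
+    y_pred[:, :, -8:-4] = [0.5, 0.5, 0.2, 0.2]          # fake anchor boxes (cx, cy, w, h)
+    y_pred[:, :, -4:] = [0.1, 0.1, 0.2, 0.2]            # variances
+    # The offset columns stay zero, so every box decodes exactly to its anchor box.
+
+    y_pred_decoded = decode_detections_debug(y_pred, confidence_thresh=0.5, iou_threshold=0.45,
+                                             top_k=200, input_coords='centroids',
+                                             normalize_coords=True, img_height=300, img_width=300)
+    # Each surviving row is `[box_id, class_id, confidence, xmin, ymin, xmax, ymax]`; `get_pred_layers()`
+    # maps each box_id to the predictor layer that produced it, e.g. `[[0, 0]]` here.
+    pred_layers = get_pred_layers(y_pred_decoded, num_boxes)
+    print(pred_layers)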