Skip to content

Commit ac8fcfb

Browse files
authored
Merge pull request #741 from mwcraig/refactor-average-final
Improve image combination performance
2 parents bb9c667 + 50abe7c commit ac8fcfb

11 files changed

+406
-75
lines changed

.github/workflows/ci_tests.yml

+5
Original file line numberDiff line numberDiff line change
@@ -21,6 +21,7 @@ jobs:
2121
name: [
2222
'ubuntu-py37-oldestdeps',
2323
'ubuntu-py38',
24+
'ubuntu-py38-bottleneck',
2425
# 'ubuntu-py39', # Skip until astroscrappy is working
2526
'macos-py38',
2627
'windows-py38',
@@ -44,6 +45,10 @@ jobs:
4445
os: ubuntu-latest
4546
python: '3.8'
4647
tox_env: 'py38-test-alldeps-numpy118-cov'
48+
- name: 'ubuntu-py38-bottleneck'
49+
os: ubuntu-latest
50+
python: '3.8'
51+
tox_env: 'py38-test-alldeps-numpy118-cov-bottleneck'
4752
# - name: 'ubuntu-py39'
4853
# os: ubuntu-latest
4954
# python: '3.9'

CHANGES.rst

+8
Original file line numberDiff line numberDiff line change
@@ -4,6 +4,11 @@
44
New Features
55
^^^^^^^^^^^^
66

7+
- Image combination is faster for average and sum combine, and improves
8+
for all operations if the ``bottleneck`` package is installed. [#741]
9+
10+
- Pixel-wise weighting can be done for sum and average combine. [#741]
11+
712
Other Changes and Additions
813
^^^^^^^^^^^^^^^^^^^^^^^^^^^
914

@@ -14,6 +19,9 @@ Bug Fixes
1419
explicitly using a regex search pattern (``regex_match=True``), escape all
1520
special characters in the keyword value for a successful search. [#770]
1621

22+
- Return mask and uncertainty from ``combine`` even if input images have no
23+
mask or uncertainty. [#775]
24+
1725
2.1.1 (2021-03-15)
1826
------------------
1927

ccdproc/combiner.py

+178-32
Original file line numberDiff line numberDiff line change
@@ -4,14 +4,51 @@
44

55
import numpy as np
66
from numpy import ma
7+
8+
try:
9+
import bottleneck as bn
10+
except ImportError:
11+
HAS_BOTTLENECK = False
12+
else:
13+
HAS_BOTTLENECK = True
14+
715
from .core import sigma_func
816

917
from astropy.nddata import CCDData, StdDevUncertainty
18+
from astropy.stats import sigma_clip
1019
from astropy import log
1120

1221
__all__ = ['Combiner', 'combine']
1322

1423

24+
def _default_median(): # pragma: no cover
25+
if HAS_BOTTLENECK:
26+
return bn.nanmedian
27+
else:
28+
return np.nanmedian
29+
30+
31+
def _default_average(): # pragma: no cover
32+
if HAS_BOTTLENECK:
33+
return bn.nanmean
34+
else:
35+
return np.nanmean
36+
37+
38+
def _default_sum(): # pragma: no cover
39+
if HAS_BOTTLENECK:
40+
return bn.nansum
41+
else:
42+
return np.nansum
43+
44+
45+
def _default_std(): # pragma: no cover
46+
if HAS_BOTTLENECK:
47+
return bn.nanstd
48+
else:
49+
return np.nanstd
50+
51+
1552
class Combiner:
1653
"""
1754
A class for combining CCDData objects.
@@ -258,7 +295,8 @@ def minmax_clipping(self, min_clip=None, max_clip=None):
258295

259296
# set up sigma clipping algorithms
260297
def sigma_clipping(self, low_thresh=3, high_thresh=3,
261-
func=ma.mean, dev_func=ma.std):
298+
func=ma.mean, dev_func=ma.std, use_astropy=False,
299+
**kwd):
262300
"""
263301
Pixels will be rejected if they have deviations greater than those
264302
set by the threshold values. The algorithm will first calculate
@@ -283,15 +321,38 @@ def sigma_clipping(self, low_thresh=3, high_thresh=3,
283321
func : function, optional
284322
Function for calculating the baseline values (i.e. `numpy.ma.mean`
285323
or `numpy.ma.median`). This should be a function that can handle
286-
`numpy.ma.MaskedArray` objects.
324+
`numpy.ma.MaskedArray` objects. **Set to ``'median'`` and
325+
set ``use_astropy=True`` for best performance if using a
326+
median.**
287327
Default is `numpy.ma.mean`.
288328
289329
dev_func : function, optional
290330
Function for calculating the deviation from the baseline value
291331
(i.e. `numpy.ma.std`). This should be a function that can handle
292332
`numpy.ma.MaskedArray` objects.
293333
Default is `numpy.ma.std`.
334+
335+
use_astropy : bool, optional
336+
If ``True``, use astropy's `~astropy.stats.sigma_clip`, which is faster
337+
and more flexible. The high/low sigma clip parameters are set
338+
from ``low_thresh`` and ``high_thresh``. Any remaining keywords are passed
339+
in to astropy's `~astropy.stats.sigma_clip`. By default, the
340+
number of iterations and other settings will be made to reproduce
341+
the behavior of ccdproc's ``sigma_clipping``.
294342
"""
343+
if use_astropy:
344+
copy = kwd.get('copy', False)
345+
axis = kwd.get('axis', 0)
346+
maxiters = kwd.get('maxiters', 1)
347+
self.data_arr.mask = \
348+
sigma_clip(self.data_arr.data, sigma_lower=low_thresh,
349+
sigma_upper=high_thresh, axis=axis, copy=copy,
350+
maxiters=maxiters,
351+
cenfunc=func, stdfunc=dev_func,
352+
masked=True,
353+
**kwd).mask
354+
return
355+
295356
# setup baseline values
296357
baseline = func(self.data_arr, axis=0)
297358
dev = dev_func(self.data_arr, axis=0)
@@ -313,8 +374,38 @@ def _get_scaled_data(self, scale_arg):
313374
return self.data_arr * self.scaling
314375
return self.data_arr
315376

377+
def _get_nan_substituted_data(self, data):
378+
# Get the data as an unmasked array with masked values filled as NaN
379+
if self.data_arr.mask.any():
380+
data = np.ma.filled(data, fill_value=np.nan)
381+
else:
382+
data = data.data
383+
return data
384+
385+
def _combination_setup(self,
386+
user_func,
387+
default_func,
388+
scale_to):
389+
"""
390+
Handle the common pieces of image combination data/mask setup.
391+
"""
392+
data = self._get_scaled_data(scale_to)
393+
394+
# Play it safe for now and only do the nan thing if the user is using
395+
# the default combination function.
396+
if user_func is None:
397+
combo_func = default_func
398+
# Substitute NaN for masked entries
399+
data = self._get_nan_substituted_data(data)
400+
masked_values = np.isnan(data).sum(axis=0)
401+
else:
402+
masked_values = self.data_arr.mask.sum(axis=0)
403+
combo_func = user_func
404+
405+
return data, masked_values, combo_func
406+
316407
# set up the combining algorithms
317-
def median_combine(self, median_func=ma.median, scale_to=None,
408+
def median_combine(self, median_func=None, scale_to=None,
318409
uncertainty_func=sigma_func):
319410
"""
320411
Median combine a set of arrays.
@@ -351,15 +442,28 @@ def median_combine(self, median_func=ma.median, scale_to=None,
351442
The uncertainty currently calculated using the median absolute
352443
deviation does not account for rejected pixels.
353444
"""
354-
# set the data
355-
data = median_func(self._get_scaled_data(scale_to), axis=0)
445+
446+
data, masked_values, median_func = \
447+
self._combination_setup(median_func,
448+
_default_median(),
449+
scale_to)
450+
451+
medianed = median_func(data, axis=0)
356452

357453
# set the mask
358-
masked_values = self.data_arr.mask.sum(axis=0)
359454
mask = (masked_values == len(self.data_arr))
360455

361456
# set the uncertainty
362-
uncertainty = uncertainty_func(self.data_arr, axis=0)
457+
458+
# This still uses numpy for the median because the astropy
459+
# code requires that the median function take the argument
460+
# overwrite_input and bottleneck doesn't allow that argument.
461+
# This is ugly, but setting ignore_nan to True should make sure
462+
# that either nans or masks are handled properly.
463+
if uncertainty_func is sigma_func:
464+
uncertainty = uncertainty_func(data, axis=0, ignore_nan=True)
465+
else:
466+
uncertainty = uncertainty_func(data, axis=0)
363467
# Divide uncertainty by the number of pixels (#309)
364468
uncertainty /= np.sqrt(len(self.data_arr) - masked_values)
365469
# Convert uncertainty to plain numpy array (#351)
@@ -370,7 +474,7 @@ def median_combine(self, median_func=ma.median, scale_to=None,
370474
uncertainty = np.asarray(uncertainty)
371475

372476
# create the combined image with a dtype matching the combiner
373-
combined_image = CCDData(np.asarray(data.data, dtype=self.dtype),
477+
combined_image = CCDData(np.asarray(medianed, dtype=self.dtype),
374478
mask=mask, unit=self.unit,
375479
uncertainty=StdDevUncertainty(uncertainty))
376480

@@ -380,8 +484,26 @@ def median_combine(self, median_func=ma.median, scale_to=None,
380484
# return the combined image
381485
return combined_image
382486

383-
def average_combine(self, scale_func=ma.average, scale_to=None,
384-
uncertainty_func=ma.std):
487+
def _weighted_sum(self, data, sum_func):
488+
"""
489+
Perform weighted sum, used by both ``sum_combine`` and in some cases
490+
by ``average_combine``.
491+
"""
492+
if self.weights.shape != data.shape:
493+
# Add extra axes to the weights for broadcasting
494+
weights = np.reshape(self.weights, [len(self.weights), 1, 1])
495+
else:
496+
weights = self.weights
497+
498+
# Turns out bn.nansum has an implementation that is not
499+
# precise enough for float32 sums. Doing this should
500+
# ensure the sums are carried out as float64
501+
weights = weights.astype('float64')
502+
weighted_sum = sum_func(data * weights, axis=0)
503+
return weighted_sum, weights
504+
505+
def average_combine(self, scale_func=None, scale_to=None,
506+
uncertainty_func=_default_std(), sum_func=_default_sum()):
385507
"""
386508
Average combine together a set of arrays.
387509
@@ -397,7 +519,7 @@ def average_combine(self, scale_func=ma.average, scale_to=None,
397519
----------
398520
scale_func : function, optional
399521
Function to calculate the average. Defaults to
400-
`numpy.ma.average`.
522+
`numpy.nanmean`.
401523
402524
scale_to : float or None, optional
403525
Scaling factor used in the average combined image. If given,
@@ -406,40 +528,57 @@ def average_combine(self, scale_func=ma.average, scale_to=None,
406528
uncertainty_func : function, optional
407529
Function to calculate uncertainty. Defaults to `numpy.ma.std`.
408530
531+
sum_func : function, optional
532+
Function used to calculate sums, including the one done to
533+
find the weighted average. Defaults to `numpy.nansum`.
534+
409535
Returns
410536
-------
411537
combined_image: `~astropy.nddata.CCDData`
412538
CCDData object based on the combined input of CCDData objects.
413539
"""
414-
# set up the data
415-
data, wei = scale_func(self._get_scaled_data(scale_to),
416-
axis=0, weights=self.weights,
417-
returned=True)
540+
data, masked_values, scale_func = \
541+
self._combination_setup(scale_func,
542+
_default_average(),
543+
scale_to)
544+
# # set up the data
545+
# data = self._get_scaled_data(scale_to)
546+
547+
# # Subtitute NaN for masked entries
548+
# data = self._get_nan_substituted_data(data)
549+
550+
# Do NOT modify data after this -- we need it to be intact when we
551+
# we get to the uncertainty calculation.
552+
if self.weights is not None:
553+
weighted_sum, weights = self._weighted_sum(data, sum_func)
554+
mean = weighted_sum / sum_func(weights, axis=0)
555+
else:
556+
mean = scale_func(data, axis=0)
557+
558+
# calculate the mask
418559

419-
# set up the mask
420-
masked_values = self.data_arr.mask.sum(axis=0)
421560
mask = (masked_values == len(self.data_arr))
422561

423562
# set up the deviation
424-
uncertainty = uncertainty_func(self.data_arr, axis=0)
563+
uncertainty = uncertainty_func(data, axis=0)
425564
# Divide uncertainty by the number of pixel (#309)
426-
uncertainty /= np.sqrt(len(self.data_arr) - masked_values)
565+
uncertainty /= np.sqrt(len(data) - masked_values)
427566
# Convert uncertainty to plain numpy array (#351)
428567
uncertainty = np.asarray(uncertainty)
429568

430569
# create the combined image with a dtype that matches the combiner
431-
combined_image = CCDData(np.asarray(data.data, dtype=self.dtype),
570+
combined_image = CCDData(np.asarray(mean, dtype=self.dtype),
432571
mask=mask, unit=self.unit,
433572
uncertainty=StdDevUncertainty(uncertainty))
434573

435574
# update the meta data
436-
combined_image.meta['NCOMBINE'] = len(self.data_arr)
575+
combined_image.meta['NCOMBINE'] = len(data)
437576

438577
# return the combined image
439578
return combined_image
440579

441-
def sum_combine(self, sum_func=ma.sum, scale_to=None,
442-
uncertainty_func=ma.std):
580+
def sum_combine(self, sum_func=None, scale_to=None,
581+
uncertainty_func=_default_std()):
443582
"""
444583
Sum combine together a set of arrays.
445584
@@ -458,7 +597,7 @@ def sum_combine(self, sum_func=ma.sum, scale_to=None,
458597
----------
459598
sum_func : function, optional
460599
Function to calculate the sum. Defaults to
461-
`numpy.ma.sum`.
600+
`numpy.nansum` or `bottleneck.nansum`.
462601
463602
scale_to : float or None, optional
464603
Scaling factor used in the sum combined image. If given,
@@ -472,24 +611,31 @@ def sum_combine(self, sum_func=ma.sum, scale_to=None,
472611
combined_image: `~astropy.nddata.CCDData`
473612
CCDData object based on the combined input of CCDData objects.
474613
"""
475-
# set up the data
476-
data = sum_func(self._get_scaled_data(scale_to), axis=0)
614+
615+
data, masked_values, sum_func = \
616+
self._combination_setup(sum_func,
617+
_default_sum(),
618+
scale_to)
619+
620+
if self.weights is not None:
621+
summed, weights = self._weighted_sum(data, sum_func)
622+
else:
623+
summed = sum_func(data, axis=0)
477624

478625
# set up the mask
479-
masked_values = self.data_arr.mask.sum(axis=0)
480626
mask = (masked_values == len(self.data_arr))
481627

482628
# set up the deviation
483-
uncertainty = uncertainty_func(self.data_arr, axis=0)
629+
uncertainty = uncertainty_func(data, axis=0)
484630
# Divide uncertainty by the number of pixels (#309)
485-
uncertainty /= np.sqrt(len(self.data_arr) - masked_values)
631+
uncertainty /= np.sqrt(len(data) - masked_values)
486632
# Convert uncertainty to plain numpy array (#351)
487633
uncertainty = np.asarray(uncertainty)
488634
# Multiply uncertainty by square root of the number of images
489-
uncertainty *= len(self.data_arr) - masked_values
635+
uncertainty *= len(data) - masked_values
490636

491637
# create the combined image with a dtype that matches the combiner
492-
combined_image = CCDData(np.asarray(data.data, dtype=self.dtype),
638+
combined_image = CCDData(np.asarray(summed, dtype=self.dtype),
493639
mask=mask, unit=self.unit,
494640
uncertainty=StdDevUncertainty(uncertainty))
495641

@@ -730,7 +876,7 @@ def combine(img_list, output_file=None,
730876
else:
731877
memory_factor = 2
732878

733-
memory_factor *= 1.5
879+
memory_factor *= 1.3
734880

735881
# determine the number of chunks to split the images into
736882
no_chunks = int((memory_factor * size_of_an_img * no_of_img) / mem_limit) + 1

0 commit comments

Comments
 (0)