@@ -289,13 +289,10 @@ class CBlitImageFilter : public CImageFilter<CBlitImageFilter<Normalize,Clamp,Sw
289289
290290 // filtering and alpha handling happens separately for every layer, so save on scratch memory size
291291 const auto inImageType = inParams.type ;
292- const auto window_last = [&kernelX,&kernelY,&kernelZ]() -> core::vectorSIMDi32
293- {
294- return core::vectorSIMDi32 (kernelX.getWindowSize ().x -1 ,kernelY.getWindowSize ().y -1 ,kernelZ.getWindowSize ().z -1 ,0 );
295- }();
292+ const auto window_end = getWindowEnd (inImageType,kernelX,kernelY,kernelZ);
296293 const core::vectorSIMDi32 intermediateExtent[3 ] = {
297- core::vectorSIMDi32 (outExtent.width ,inExtent.height +window_last [1 ],inExtent.depth +window_last [2 ]),
298- core::vectorSIMDi32 (outExtent.width ,outExtent.height ,inExtent.depth +window_last [2 ]),
294+ core::vectorSIMDi32 (outExtent.width ,inExtent.height +window_end [1 ],inExtent.depth +window_end [2 ]),
295+ core::vectorSIMDi32 (outExtent.width ,outExtent.height ,inExtent.depth +window_end [2 ]),
299296 core::vectorSIMDi32 (outExtent.width ,outExtent.height ,outExtent.depth )
300297 };
301298 const core::vectorSIMDi32 intermediateLastCoord[3 ] = {
@@ -465,7 +462,7 @@ class CBlitImageFilter : public CImageFilter<CBlitImageFilter<Normalize,Clamp,Sw
465462 lineBuffer = intermediateStorage[axis-1 ]+core::dot (static_cast <const core::vectorSIMDi32&>(intermediateStrides[axis-1 ]),localTexCoord)[0 ];
466463 else
467464 {
468- const auto windowEnd = inExtent.width +window_last .x ;
465+ const auto windowEnd = inExtent.width +window_end .x ;
469466 decode_offset = alloc_decode_scratch ();
470467 lineBuffer = intermediateStorage[1 ]+decode_offset*MaxChannels*windowEnd;
471468 for (auto & i=localTexCoord.x ; i<windowEnd; i++)
@@ -566,6 +563,21 @@ class CBlitImageFilter : public CImageFilter<CBlitImageFilter<Normalize,Clamp,Sw
566563
567564 private:
568565 static inline constexpr uint32_t VectorizationBoundSTL = /* AVX2*/ 16u ;
566+ // Per-axis window size that callers add to the input extent as padding; Y stays 0 below ET_2D and Z stays 0 below ET_3D.
567+ static inline core::vectorSIMDi32 getWindowEnd (const IImage::E_TYPE inImageType,
568+ const CScaledImageFilterKernel<KernelX>& kernelX,
569+ const CScaledImageFilterKernel<KernelY>& kernelY,
570+ const CScaledImageFilterKernel<KernelZ>& kernelZ
571+ )
572+ {
573+ // TODO: investigate properly if it's supposed to be `size` or `size-1` (polyphase kinda shows need for `size`)
574+ core::vectorSIMDi32 last (kernelX.getWindowSize ().x ,0 ,0 ,0 );
575+ if (inImageType>=IImage::ET_2D)
576+ last.y = kernelY.getWindowSize ().y ; // each kernel contributes the window size of its own axis
577+ if (inImageType>=IImage::ET_3D)
578+ last.z = kernelZ.getWindowSize ().z ; // each kernel contributes the window size of its own axis
579+ return last;
580+ }
569581 // the blit filter will filter one axis at a time, hence necessitating "ping ponging" between two scratch buffers
570582 static inline uint32_t getScratchOffset (const state_type* state, bool secondPong)
571583 {
@@ -574,17 +586,14 @@ class CBlitImageFilter : public CImageFilter<CBlitImageFilter<Normalize,Clamp,Sw
574586 const auto kernelY = state->contructScaledKernel (state->kernelY );
575587 const auto kernelZ = state->contructScaledKernel (state->kernelZ );
576588
577- const auto window_last = [&kernelX,&kernelY,&kernelZ]() -> core::vectorSIMDi32
578- {
579- return core::vectorSIMDi32 (kernelX.getWindowSize ().x -1 ,kernelY.getWindowSize ().y -1 ,kernelZ.getWindowSize ().z -1 ,0 );
580- }();
589+ const auto window_end = getWindowEnd (state->inImage ->getCreationParameters ().type ,kernelX,kernelY,kernelZ);
581590 // TODO: account for the size needed for coverage adjustment
582591 // the first pass will be along X, so new temporary image will have the width of the output extent, but the height and depth will need to be padded
583592 // but the last pass will be along Z and the new temporary image will have the exact dimensions of `outExtent` which is why there is a `core::max`
584- auto texelCount = state->outExtent .width *core::max<uint32_t >((state->inExtent .height +window_last [1 ])*(state->inExtent .depth +window_last [2 ]),state->outExtent .height *state->outExtent .depth );
593+ auto texelCount = state->outExtent .width *core::max<uint32_t >((state->inExtent .height +window_end [1 ])*(state->inExtent .depth +window_end [2 ]),state->outExtent .height *state->outExtent .depth );
585594 // the second pass will result in an image that has the width and height equal to `outExtent`
586595 if (secondPong)
587- texelCount += core::max<uint32_t >(state->outExtent .width *state->outExtent .height *(state->inExtent .depth +window_last [2 ]),(state->inExtent .width +window_last [0 ])*std::thread::hardware_concurrency ()*VectorizationBoundSTL);
596+ texelCount += core::max<uint32_t >(state->outExtent .width *state->outExtent .height *(state->inExtent .depth +window_end [2 ]),(state->inExtent .width +window_end [0 ])*std::thread::hardware_concurrency ()*VectorizationBoundSTL);
588597 // obviously we have multiple channels and each channel has a certain type for arithmetic
589598 return texelCount*MaxChannels*sizeof (value_type);
590599 }
0 commit comments