-
Notifications
You must be signed in to change notification settings - Fork 5.8k
Description
System information (version)
- OpenCV => 4.0
- Operating System / Platform => Ubuntu Mate 64 bits
- Compiler => GCC
Detailed description
Hello I recently has to work with both Vgg and BoostDesc image descriptors.
I had to make descriptors for a hugh number of keypoints (several hundreds of thousands) for each image of PASCAL 2012, after one week my algorithm has crash and I observed that only a one hundred images has been processed in seven days.
So I start looking if I could improve the code here are some observations:
In both file the keypoints computer make a copy of the keypoints.
If the keypoints were stored in a former cv::Vector data structe if would have been performance less but not with a std::vector data structure.
is it possible to change the arguments of the class ComputeVGGInvoker
from:
Mat image;
Mat *descriptors;
vector<KeyPoint> keypoints;
Mat Proj;
Mat PRFilters;
int anglebins;
float scale_factor;
bool img_normalize;
bool use_scale_orientation;
to:
const Mat& image;
const vector<KeyPoint>& keypoints;
const Mat& Proj;
const Mat& PRFilters;
const int& anglebins;
const float& scale_factor;
const bool& img_normalize;
const bool& use_scale_orientation;
Mat &descriptors;
The modifly constructor should be:
ComputeVGGInvoker( const Mat& _image,
Mat& _descriptors,
const vector<KeyPoint>& _keypoints,
const Mat& _PRFilters, const Mat& _Proj,
const int& _anglebins, const bool& _img_normalize,
const bool& _use_scale_orientation, const float& _scale_factor ):
image(_image),
keypoints(_keypoints),
Proj(_Proj),
PRFilters(_PRFilters),
anglebins(_anglebins),
scale_factor(_scale_factor),
img_normalize(_img_normalize),
use_scale_orientation(_use_scale_orientation),
descriptors(_descriptors)
{}
In the operator() override of the class ComputeVGGInvoker
the one before last operation consists to reshape the descriptor as a matrix with 4096 rows and 1 columns:
Desc = Desc.reshape( 1, (int)Desc.total() );
The operation is immediately followed by a matrix multiplication the transpose of the results with the transpose of the member variable Proj.
descriptors->row( k ) = Desc.t() * Proj.t();
The transposition of Desc in the last line can be prevented by reshaping the descriptor as a matrix of one row and 4096 columns.
Another observation is that the variable Proj is always transpose and never used as is.
So the two last line should become:
// reshape
Desc = Desc.reshape( 1, 1);
// project
descriptors.row( k ) = Desc * Proj;
I also observed that in the function get_desc
there is a lot of transposition that can be prevented by simply saving the data in the patch at the transposed place (x,y rather than y,x).
Also the function get_patch evaluate 4096 time a condition that can be determine before the loop even start... it is a lot useless.
I propose to rewrite get_patch function as:
// sample 64x64 patch from image given keypoint
static inline void get_patch( const KeyPoint kp, Mat& Patch, const Mat& image,
const bool use_scale_orientation, const float scale_factor )
{
// scale & radians
float scale = kp.size / 64.f * scale_factor;
const float angle = (kp.angle == -1)
? 0 : ( (kp.angle)*(float)CV_PI ) / 180.f;
// transforms
const float tsin = sin(angle) * scale;
const float tcos = cos(angle) * scale;
const float half_cols = (float)Patch.cols / 2.f;
const float half_rows = (float)Patch.rows / 2.f;
// sample form original image
if(use_scale_orientation)
{
for ( int x = 0; x < Patch.cols; x++ )
for ( int y = 0; y < Patch.rows; y++ )
{
const float xoff = x - half_cols;
const float yoff = y - half_rows;
// the rotation shifts & scale
int img_x = int( (kp.pt.x + 0.5f) + xoff*tcos - yoff*tsin );
int img_y = int( (kp.pt.y + 0.5f) + xoff*tsin + yoff*tcos );
// sample only within image
// to prevent to have to transpose later and due to the fact the patch is square it is easier to save x,y rather than y,x.
Patch.at<float>( x, y) = ( img_x < image.cols ) && ( img_x >= 0 )
&& ( img_y < image.rows ) && ( img_y >= 0 ) ? image.at<float>( img_y, img_x ) : 0.f;
}
}
else
{
for ( int x = 0; x < Patch.cols; x++ )
for ( int y = 0; y < Patch.rows; y++ )
{
const float xoff = x - half_cols;
const float yoff = y - half_rows;
// the samples from image
int img_x = int( kp.pt.x + 0.5f + xoff );
int img_y = int( kp.pt.y + 0.5f + yoff );
// sample only within image
Patch.at<float>( x, y) = ( img_x < image.cols ) && ( img_x >= 0 )
&& ( img_y < image.rows ) && ( img_y >= 0 ) ? image.at<float>( img_y, img_x ) : 0.f;
}
}
}
In the function get_desc the processing of the variable GAngleRatio
can be done in two steps.
The first step consists simply by processing the atan2
for every pixelwisely, the second steps consist to process GMag
, GAngleRatio
, w1
, w2
, Bin1
, and Bin2
in one look.
The advantage to do that like this is that, in the first step some some unrolling can be done in order to help the compiler to call the autovectorizer for the architecture that have the trigonometric vectorized instruction, it will help the compiler optimize the caches otherwise.
The second interest is that the second step is vectorizable manually and the everything is aligned.
The last loops order should be inversed.
make the anglebins as first loop and the length of the patch in the second loop mean that for every element of w1
, w2
, GMagT
, Bin1T
and Bin2T
will be evaluate anglebins
(8) times.
By inversing the two loops each element of each variable will only be evaluate one time only.
So I propose to modify the function get_desc
to the following:
static void get_desc( const Mat Patch, Mat& PatchTrans, int anglebins, bool img_normalize )
{
Mat Ix, Iy;
// % compute gradient
Mat1f Kernel = { -1.f, 0.f, 1.f };
filter2D( Patch, Ix, CV_32F, Kernel, Point( -1, -1 ), 0, BORDER_REPLICATE );
filter2D( Patch, Iy, CV_32F, Kernel.t(), Point( -1, -1 ), 0, BORDER_REPLICATE );
Mat GMag, GAngleRatio, w1, w2, Bin1T, Bin2T;
// % gradient magnitude
// % GMag = sqrt(Ix .^ 2 + Iy .^ 2);
// magnitude( Ix, Iy, GMag );
// hal::magnitude(Ix.ptr<float>(), Iy.ptr<float>(), GMag.ptr<float>(), GMag.total());
// % gradient orientation: [0; 2 * pi]
// % GAngle = atan2(Iy, Ix) + pi;
//phase( Ix, Iy, GAngle, false ); //<- opencv is buggy
float AngleStep = 2.0f * (float) CV_PI / (float) anglebins;
// GAngle = Mat( GMag.rows, GMag.cols, CV_32F );
GMag.create(Ix.size(), CV_32F);
GAngleRatio.create(GMag.size(), GMag.type());
w1.create(GMag.cols, GMag.rows, GMag.type());
w2.create(w1.size(), w1.type());
Bin1T.create(w1.size(), w1.type());
Bin2T.create(w1.size(), w1.type());
float pif = static_cast<float>(CV_PI);
const float* dx_ptr = Ix.ptr<float>();
const float* dy_ptr = Iy.ptr<float>();
float* angle_ptr = GAngleRatio.ptr<float>();
int i=0;
#if CV_ENABLE_UNROLLED
for(;i<static_cast<int>(GAngleRatio.total()-4); i+=4)
{
float vx0 = dx_ptr[i];
float vx1 = dx_ptr[i+1];
float vy0 = dy_ptr[i];
float vy1 = dy_ptr[i+1];
angle_ptr[i] = atan2(vy0, vx0);
angle_ptr[i+1] = atan2(vy1, vx1);
vx0 = dx_ptr[i+2];
vx1 = dx_ptr[i+3];
vy0 = dy_ptr[i+2];
vy1 = dy_ptr[i+3];
angle_ptr[i+2] = atan2(vy0, vx0);
angle_ptr[i+3] = atan2(vy1, vx1);
}
dx_ptr+=i;
dy_ptr+=i;
angle_ptr+=i;
#endif
for ( ; i < (int)GAngleRatio.total(); i++, dx_ptr++, dy_ptr++, angle_ptr++ )
*angle_ptr = atan2(*dy_ptr, *dx_ptr);
dx_ptr = Ix.ptr<float>();
dy_ptr = Iy.ptr<float>();
angle_ptr = GAngleRatio.ptr<float>();
float* mag_ptr = GMag.ptr<float>();
i=0;
v_float32 v_pif = vx_setall_f32(pif);
v_float32 v_half = vx_setall_f32(-0.5f);
v_float32 v_step = vx_setall_f32(1.f/AngleStep);
for(;i<(int)GAngleRatio.total(); i+=v_float32::nlanes, dx_ptr+=v_float32::nlanes, dy_ptr+=v_float32::nlanes, angle_ptr+=v_float32::nlanes, mag_ptr+=v_float32::nlanes)
{
v_float32 v_dx = vx_load(dx_ptr);
v_float32 v_dy = vx_load(dy_ptr);
v_float32 v_mag = v_magnitude(v_dx, v_dy);
vx_store_aligned(mag_ptr, v_mag);
v_float32 v_angle = vx_load(angle_ptr) + v_pif;
v_angle = v_fma(v_angle, v_step, v_half);
vx_store_aligned(angle_ptr, v_angle);
}
for(;i<(int)GAngleRatio.total(); i++, dx_ptr++, dy_ptr++, angle_ptr++, mag_ptr++)
{
*mag_ptr = hypotf(*dx_ptr, *dy_ptr);
float v = *angle_ptr + pif;
v = v / AngleStep - 0.5f;
*angle_ptr = v;
}
// GAngleRatio = GAngleRatio.t();
v_float32 v_ones = vx_setall_f32(1.f);
v_float32 v_anglebins = vx_setall_f32(anglebins - 1.f);
v_float32 v_minus_ones = vx_setall_f32(-1.f);
for(int r=0;r<GAngleRatio.rows;r++)
{
int c=0;
const float* angle_ptr = GAngleRatio.ptr<float>(r);
float* w1_ptr = w1.ptr<float>(r);
float* w2_ptr = w2.ptr<float>(r);
float* bin1_ptr = Bin1T.ptr<float>(r);
float* bin2_ptr = Bin2T.ptr<float>(r);
for(;c<GAngleRatio.cols; c+=v_float32::nlanes, angle_ptr+=v_float32::nlanes, w1_ptr+=v_float32::nlanes, w2_ptr+=v_float32::nlanes, bin1_ptr+=v_float32::nlanes, bin2_ptr+=v_float32::nlanes)
{
v_float32 va = v_load_aligned(angle_ptr);
v_float32 vw2 = va - v_cvt_f32(v_floor(va));
v_float32 vw1 = v_ones - vw2;
v_float32 vb1 = v_cvt_f32(v_ceil(va - v_ones));
v_float32 v_mask = vb1 == v_minus_ones;
vb1 = (v_mask & v_anglebins) | ((~v_mask) & vb1);
v_float32 vb2 = vb1 + v_ones;
v_mask = vb2 <= v_anglebins;
vb2 = v_mask & vb2;
vx_store_aligned(w1_ptr, vw1);
vx_store_aligned(w2_ptr, vw2);
vx_store_aligned(bin1_ptr, vb1);
vx_store_aligned(bin2_ptr, vb2);
}
for(;c<GAngleRatio.cols; c++, angle_ptr++, w1_ptr++, w2_ptr++, bin1_ptr++, bin2_ptr++)
{
float a = *angle_ptr;
a = a - static_cast<float>(cvFloor(a));
*w2_ptr = a;
*w1_ptr = 1.f - a;
float b = static_cast<float>(cvCeil(a-1.f));
float vb1 = b == -1.f ? static_cast<float>(anglebins - 1.f) : b;
b = vb1 + 1.f;
float vb2 = b <= static_cast<float>(anglebins - 1.f) ? b : 0.f;
*bin1_ptr = vb1;
*bin2_ptr = vb2;
}
}
// normalize
if ( img_normalize )
{
// % Quantile = 0.8;
float q = 0.8f;
// % T = quantile(GMag(:), Quantile);
Mat GMagSorted;
sort( GMag.reshape( 0, 1 ), GMagSorted, SORT_ASCENDING );
int n = GMagSorted.cols;
// scipy/stats/mstats_basic.py#L1718 mquantiles()
// m = alphap + p*(1.-alphap-betap)
// alphap = 0.5 betap = 0.5 => (m = 0.5)
// aleph = (n*p + m)
float aleph = ( n * q + 0.5f );
int k = max(min(cvFloor(aleph), n-1), 1);
// int k = cvFloor( aleph );
// if ( k >= n - 1 ) k = n - 1;
// if ( k <= 1 ) k = 1;
float gamma = max(min(aleph - k, 1.f),0.f);
// float gamma = aleph - k;
// if ( gamma >= 1.0f ) gamma = 1.0f;
// if ( gamma <= 0.0f ) gamma = 0.0f;
// quantile out from distribution
float T = ( 1.f - gamma ) * GMagSorted.at<float>( k - 1 )
+ gamma * GMagSorted.at<float>( k );
// avoid NaN
if ( T != 0.0f )
{
const float den = T / anglebins;
v_float32 v_den = vx_setall_f32(den);
float* mag_ptr = GMag.ptr<float>();
int i=0;
for(; i<GMag.total(); i+=v_float32::nlanes, mag_ptr+=v_float32::nlanes)
vx_store_aligned(mag_ptr, vx_load_aligned(mag_ptr) / v_den);
for(;i<GMag.total();i++, mag_ptr++)
*mag_ptr /= den;
}
// GMag /= ( T / anglebins );
}
Mat GMagT = GMag;
// % feature channels
PatchTrans = Mat( (int)Patch.total(), anglebins, CV_32F, Scalar::all(0) );
const float* w1_ptr = w1.ptr<float>();
const float* w2_ptr = w2.ptr<float>();
const float* bin1_ptr = Bin1T.ptr<float>();
const float* bin2_ptr = Bin2T.ptr<float>();
mag_ptr = GMagT.ptr<float>();
int p=0;
for ( ; p < PatchTrans.rows; p++, bin1_ptr++, bin2_ptr++, w1_ptr++, w2_ptr++, mag_ptr++ )
{
float vw1 = *w1_ptr;
float vw2 = *w2_ptr;
float vb1 = *bin1_ptr;
float vb2 = *bin2_ptr;
float vmag = *mag_ptr;
float* it_dst = PatchTrans.ptr<float>(p);
int i=0;
for ( ; i < anglebins; i++, it_dst++ )
*it_dst = vb1 == i ? vw1 * vmag : vb2 == i ? vw2 * vmag : 0.f;
}
}
[vgg2.cpp.txt](https://github.com/opencv/opencv_contrib/files/2974102/vgg2.cpp.txt)
Steps to reproduce
During the tests I conducted I without modification of vgg call from the library locally compiled it tooks 30 s to process the keypoints, from the code executed in debug mode before any modifications it tooks around 80 s to process the keypoints, after modifications it tooks between 39-40 seconds in debug and around 27 in release.
I did not provided ximgproc_improved.h due to some currently works on it. For the following example it does only contains the declaration of the class "fastVGG" that is identically the same as the one of VGG.
The image is from Pascal Voc the mask is an adaptation from the data from Pascal Part.
#include <iostream>
#include "ximproc_improved.h"
#include <opencv2/xfeatures2d.hpp>
#include <opencv2/highgui.hpp>
int main()
{
cv::Mat image = cv::imread("2008_003915.jpg");
cv::Mat masks = cv::imread("label_map.tiff", cv::IMREAD_GRAYSCALE);
std::vector<cv::Point> pts;
cv::findNonZero(masks, pts);
cv::Ptr<cv::xfeatures2d::VGG> vgg = cv::xfeatures2d::VGG::create();
cv::Ptr<cv::xfeatures2d::fastVGG> vgg_fast = cv::xfeatures2d::fastVGG::create();
std::vector<cv::KeyPoint> kps(pts.size());
cv::Mat descriptors_ref, descriptors_tgt;
cv::cuda::GpuMat tmp;
cv::parallel_for_(cv::Range(0, static_cast<int>(pts.size()) ), [&kps, &pts](const cv::Range& range)->void
{
for(int r=range.start; r<range.end; r++)
kps.at(r) = cv::KeyPoint(pts.at(r),1.f);
});
kps.insert(kps.begin(), cv::KeyPoint(0.f,0.f,0.f, -1.f));
cv::TickMeter tm;
std::cout<<"CHECK REF"<<std::endl;
tm.start();
vgg->compute(image, kps, descriptors_ref);
tm.stop();
std::cout<<"TIME OF EXECUTION: "<<tm.getTimeMilli()<<std::endl;
tm.reset();
std::cout<<"CHECK TGT"<<std::endl;
tm.start();
vgg_fast->compute(image, kps, descriptors_tgt);
tm.stop();
std::cout<<"TIME OF EXECUTION: "<<tm.getTimeMilli()<<std::endl;
tm.reset();
std::cout<<"GLOBAL TIME OF EXECUTION: "<<tm.getTimeMilli()<<std::endl;
descriptors_ref.convertTo(descriptors_ref, CV_32F);
descriptors_tgt.convertTo(descriptors_tgt, CV_32F);
cv::Mat delta = descriptors_ref-descriptors_tgt;
delta = delta.mul(delta);
double mn(0.), mx(0.);
cv::minMaxIdx(delta, &mn, &mx);
std::cout<<"MSE: "<<mn<<" "<<mx<<std::endl;
std::cout<<descriptors_ref.size()<<" "<<descriptors_tgt.size()<<std::endl;
std::cout<<std::endl<<descriptors_ref.rowRange(0,2)<<std::endl<<std::endl<<descriptors_tgt.rowRange(0,2)<<std::endl;
int cnt(0);
std::vector<float> dist(descriptors_ref.rows);
for(int i=0;i<descriptors_tgt.rows;i++)
{
dist.at(i) = cv::normL2Sqr(descriptors_ref.ptr<float>(i), descriptors_tgt.ptr<float>(i), descriptors_ref.cols);
if(cv::normL2Sqr(descriptors_ref.ptr<float>(i), descriptors_tgt.ptr<float>(i), descriptors_ref.cols) < 1.f)
cnt++;
}
cv::minMaxIdx(dist, &mn, &mx);
std::cout<<"THERE IS: "<<cnt<<"/"<<descriptors_ref.rows<<" MATCHES"<<std::endl;
std::cout << "Hello World!" << std::endl;
return 0;
}