CUDA Implementation

Include libraries

#include <cuda_runtime.h>
#include <opencv2/opencv.hpp>
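
The rest of the listing also uses std::cout, std::ofstream, atoi(), clock_gettime() and unqualified OpenCV types, so a few more standard headers and a using-directive are needed for the excerpt to compile on its own:

#include <iostream>   // std::cout
#include <fstream>    // std::ofstream for the results file
#include <string>     // std::string used by checkError()
#include <cstdlib>    // atoi()
#include <ctime>      // clock_gettime() and struct timespec (POSIX)

using namespace cv;   // Mat, imread, Vec3b, IMREAD_COLOR are used unqualified below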

Declare the CUDA kernel. The first three arguments are the vectors holding the original image, allocated on the device as:

  • d_R
  • d_G
  • d_B

The next three arguments are the vectors where the blurred image is written, passed as:

  • d_RED
  • d_GREEN
  • d_BLUE

The remaining arguments are:

  • The size argument (rows × cols), used to bound the per-pixel index
  • The rows and cols of the image, used to detect the borders
  • The kernel radius used to apply the filter
  • The number of threads (NTHREADS)

__global__ void blurEffect(int *R, int *G, int *B, int *RED, int *GREEN, int *BLUE, int size, int rows, int cols, int kernel, int NTHREADS)
{
  // Each thread handles the single pixel whose linear (row-major) index is id.
  int id = blockDim.x * blockIdx.x + threadIdx.x;

  // start/end were only needed by an earlier for-loop version of the kernel:
  //int start = id*size/NTHREADS;
  //int end = (id+1)*size/NTHREADS;

  int tmp_red = 0;
  int tmp_green = 0;
  int tmp_blue = 0;

  if (id < size)
  {
    // Neighbour `kernel` rows above, if it exists (>= 0 so index 0 is not skipped).
    if ( (id-cols*kernel) >= 0 ) {
      tmp_red += R[id-cols*kernel];
      tmp_green += G[id-cols*kernel];
      tmp_blue += B[id-cols*kernel];
    }

    // Neighbour `kernel` rows below, if it exists.
    if ( (id+cols*kernel) < size ) {
      tmp_red += R[id+cols*kernel];
      tmp_green += G[id+cols*kernel];
      tmp_blue += B[id+cols*kernel];
    }

    // Horizontal neighbours `kernel` columns away; at the left and right
    // borders only the neighbour that exists is taken.
    if ( id%cols == 0 ) {
      tmp_red += R[id+kernel];
      tmp_green += G[id+kernel];
      tmp_blue += B[id+kernel];
    }
    else if ( id%cols == cols-1 ) {
      tmp_red += R[id-kernel];
      tmp_green += G[id-kernel];
      tmp_blue += B[id-kernel];
    }
    else {
      tmp_red += R[id+kernel] + R[id-kernel];
      tmp_green += G[id+kernel] + G[id-kernel];
      tmp_blue += B[id+kernel] + B[id-kernel];
    }

    // Average over the (up to) four sampled neighbours.
    RED[id] = tmp_red/4;
    GREEN[id] = tmp_green/4;
    BLUE[id] = tmp_blue/4;
  }
}
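
For reference, the same neighbour-averaging scheme can be written as a plain CPU loop. The sketch below is not part of the original program (blurChannelCPU is a hypothetical helper); it processes a single channel and mirrors the kernel's boundary handling, which can be handy for checking the GPU output.

// Hypothetical CPU reference for one channel, mirroring the kernel's logic.
void blurChannelCPU(const int *in, int *out, int rows, int cols, int kernel)
{
  int size = rows * cols;
  for (int id = 0; id < size; ++id) {
    int tmp = 0;
    if (id - cols*kernel >= 0)   tmp += in[id - cols*kernel];  // neighbour `kernel` rows above
    if (id + cols*kernel < size) tmp += in[id + cols*kernel];  // neighbour `kernel` rows below
    if (id % cols == 0)              tmp += in[id + kernel];   // left border: right neighbour only
    else if (id % cols == cols - 1)  tmp += in[id - kernel];   // right border: left neighbour only
    else tmp += in[id + kernel] + in[id - kernel];             // both horizontal neighbours
    out[id] = tmp / 4;                                         // same divisor as the kernel
  }
}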

Function to populate a vector (despite its name, randomFill just fills it with zeros)

void randomFill(int *V, int row, int col){
  for(int i=0 ; i<row ; i++){
    for(int j=0 ; j<col ; j++){
      V[i*col+j] = 0;
    }
  }
}

Function to check whether a CUDA call returned an error and, if so, print it and abort

void checkError(std::string s, cudaError_t err){
	if(err != cudaSuccess){
		std::cout<<s<<" "<<cudaGetErrorString(err)<<std::endl;
		exit(EXIT_FAILURE);
	}
}

Read the image and clone it to hold the blurred result

char* imageName = argv[1];  // argv[1] is expected to be the path of the input image
Mat image;
image = imread( imageName, IMREAD_COLOR );
if( !image.data )
{
  std::cout<<"No image data\n";
  return -1;
}

Mat blur = image.clone();

Declare the vector sizes and the default thread configuration

int radius;
size_t the_size = image.rows * image.cols * sizeof(int);  // bytes per channel
int total_size = image.rows * image.cols;                  // number of pixels

int NTHREADS = 192;          // default number of threads
int threadsPerBlock = 256;   // default CUDA block size

Get the kernel radius and the number of threads from the command-line arguments, if present

if(argc < 3 || argv[2] == NULL) {
  radius = 3;
  NTHREADS = 192;
  threadsPerBlock = 256;
  std::cout<<"Radius and CUDA NUM_THREADS are null"<<std::endl;
}
else {
  radius = atoi(argv[2]);
}

if (argc < 4 || argv[3] == NULL) {
  NTHREADS = 192;
  threadsPerBlock = 256;
  std::cout<<"CUDA NUM_THREADS is null"<<std::endl;
}
else {
  NTHREADS = atoi(argv[3]);
  threadsPerBlock = atoi(argv[3]);
}

Declare the host and device pointers and allocate the host buffers

cudaError_t err = cudaSuccess;

int *h_R, *h_G, *h_B, *h_RED, *h_GREEN, *h_BLUE;
int *d_R, *d_G, *d_B, *d_RED, *d_GREEN, *d_BLUE;

//Memory in Host: total_size elements per channel (the_size is that amount in bytes)
h_R = new int[total_size];
h_G = new int[total_size];
h_B = new int[total_size];
h_RED = new int[total_size];
h_GREEN = new int[total_size];
h_BLUE = new int[total_size];

Fill the output arrays RED, GREEN and BLUE with zeros, then copy the image data into the input arrays

randomFill(h_RED, image.rows, image.cols);
randomFill(h_GREEN, image.rows, image.cols);
randomFill(h_BLUE, image.rows, image.cols);

//Fill arrays RGB with the image data.
//Note: OpenCV stores pixels as BGR, so channel 0 is actually blue; the naming
//here is swapped, but the write-back below uses the same mapping, so the
//output image keeps its colours.
for (int i = 0; i < image.rows; ++i)
{
  for (int j = 0; j < image.cols; ++j)
  {
    h_R[i*image.cols+j] = image.at<Vec3b>(i,j)[0];
    h_G[i*image.cols+j] = image.at<Vec3b>(i,j)[1];
    h_B[i*image.cols+j] = image.at<Vec3b>(i,j)[2];
  }
}

Allocate memory on the GPU

err = cudaMalloc((void**)&d_R, the_size);
checkError("Error allocating device memory for R",err);

err = cudaMalloc((void**)&d_G, the_size);
checkError("Error allocating device memory for G",err);

err = cudaMalloc((void**)&d_B, the_size);
checkError("Error allocating device memory for B",err);

err = cudaMalloc((void**)&d_RED, the_size);
checkError("Error allocating device memory for RED",err);

err = cudaMalloc((void**)&d_GREEN, the_size);
checkError("Error allocating device memory for GREEN",err);

err = cudaMalloc((void**)&d_BLUE, the_size);
checkError("Error allocating device memory for BLUE",err);

Using cudaMemcpy(), we copy the input data to the device with the parameter cudaMemcpyHostToDevice; after the kernel runs, the results are copied back to the host with cudaMemcpyDeviceToHost

err = cudaMemcpy(d_R, h_R, the_size, cudaMemcpyHostToDevice);
checkError("Error copying R data to the device", err);

err = cudaMemcpy(d_G, h_G, the_size, cudaMemcpyHostToDevice);
checkError("Error copying G data to the device", err);

err = cudaMemcpy(d_B, h_B, the_size, cudaMemcpyHostToDevice);
checkError("Error copying B data to the device", err);

Launch the CUDA kernel with the device vectors

blurEffect<<<blocksPerGrid, threadsPerBlock>>>(d_R,d_G,d_B,d_RED,d_GREEN,d_BLUE,total_size,image.rows,image.cols, radius, NTHREADS);
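
The launch uses blocksPerGrid, which is not declared anywhere on this page. A plausible definition (an assumption here, to be placed before the launch) rounds the pixel count up to a whole number of blocks:

// Assumed definition: enough blocks of threadsPerBlock threads so that
// every pixel index 0..total_size-1 gets its own thread.
int blocksPerGrid = (total_size + threadsPerBlock - 1) / threadsPerBlock;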

Check for launch errors and copy the results from device to host

err = cudaGetLastError();
checkError("Error executing the kernel",err);

err = cudaMemcpy(h_RED, d_RED, the_size, cudaMemcpyDeviceToHost);
checkError("Error copying RED back to the host", err);

err = cudaMemcpy(h_GREEN, d_GREEN, the_size, cudaMemcpyDeviceToHost);
checkError("Error copying GREEN back to the host", err);

err = cudaMemcpy(h_BLUE, d_BLUE, the_size, cudaMemcpyDeviceToHost);
checkError("Error copying BLUE back to the host", err);

Copy the results into the output image

for (int i = 0; i < image.rows; ++i)
{
	for (int j = 0; j < image.cols; ++j)
	{
		blur.at<Vec3b>(i,j)[0] = h_RED[i*image.cols+j];
		blur.at<Vec3b>(i,j)[1] = h_GREEN[i*image.cols+j];
		blur.at<Vec3b>(i,j)[2] = h_BLUE[i*image.cols+j];
	}
}

Take the time and write the result to a file for later analysis

//take the time
clock_gettime(CLOCK_MONOTONIC, &finish);
elapsed = (finish.tv_sec - start.tv_sec);
elapsed += (finish.tv_nsec - start.tv_nsec) / 1000000000.0;
std::cout<<"Time: "<< elapsed << std::endl;

//write in file
std::ofstream output;
output.open("maincudaResult.txt", std::ofstream::app | std::ofstream::out );
output<< imageName <<", "<< radius << ", " << threadsPerBlock <<", "<< elapsed << std::endl;
output.close();
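
The variables start, finish and elapsed are not declared in this excerpt; a typical setup (assumed here) declares them near the top of main() and captures the start time before the CUDA work being measured:

// Assumed declarations, placed before the code being timed:
struct timespec start, finish;
double elapsed;
clock_gettime(CLOCK_MONOTONIC, &start);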

Because we are environmentally conscientious coders, we clean up after ourselves with cudaFree()

//free Memory in device
err=cudaFree(d_R);
checkError("Error freeing device memory for R",err);
err=cudaFree(d_G);
checkError("Error freeing device memory for G",err);
err=cudaFree(d_B);
checkError("Error freeing device memory for B",err);
err=cudaFree(d_RED);
checkError("Error freeing device memory for RED",err);
err=cudaFree(d_GREEN);
checkError("Error freeing device memory for GREEN",err);
err=cudaFree(d_BLUE);
checkError("Error freeing device memory for BLUE",err);

err = cudaDeviceReset();
checkError("Error resetting the device",err);

//free memory in host (allocated with new[], so delete[] is required)
delete[] h_R;
delete[] h_G;
delete[] h_B;
delete[] h_RED;
delete[] h_GREEN;
delete[] h_BLUE;
