CUDA Implementation

Include libraries

#include <cuda_runtime.h>
#include <opencv2/opencv.hpp>
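
The rest of the listing also uses std::cout, std::ofstream, atoi(), clock_gettime() and unqualified OpenCV types, so a few more standard headers and a using-directive are needed for the excerpt to compile on its own:

#include <iostream>   // std::cout
#include <fstream>    // std::ofstream for the results file
#include <string>     // std::string used by checkError()
#include <cstdlib>    // atoi()
#include <ctime>      // clock_gettime() and struct timespec (POSIX)

using namespace cv;   // Mat, imread, Vec3b, IMREAD_COLOR are used unqualified below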

Declare the CUDA kernel. The first three arguments are the vectors holding the original image, allocated on the device as:

  • d_R
  • d_G
  • d_B

The next three arguments are the vectors where the blurred image is written, passed as:

  • d_RED
  • d_GREEN
  • d_BLUE

The remaining arguments are:

  • The size argument (rows × cols), used to bound the per-pixel index
  • The rows and cols of the image, used to detect the borders
  • The kernel radius used to apply the filter
  • The number of threads (NTHREADS)

__global__ void blurEffect(int *R, int *G, int *B, int *RED, int *GREEN, int *BLUE, int size, int rows, int cols, int kernel, int NTHREADS)
{
  // Each thread handles the single pixel whose linear (row-major) index is id.
  int id = blockDim.x * blockIdx.x + threadIdx.x;

  // start/end were only needed by an earlier for-loop version of the kernel:
  //int start = id*size/NTHREADS;
  //int end = (id+1)*size/NTHREADS;

  int tmp_red = 0;
  int tmp_green = 0;
  int tmp_blue = 0;

  if (id < size)
  {
    // Neighbour `kernel` rows above, if it exists (>= 0 so index 0 is not skipped).
    if ( (id-cols*kernel) >= 0 ) {
      tmp_red += R[id-cols*kernel];
      tmp_green += G[id-cols*kernel];
      tmp_blue += B[id-cols*kernel];
    }

    // Neighbour `kernel` rows below, if it exists.
    if ( (id+cols*kernel) < size ) {
      tmp_red += R[id+cols*kernel];
      tmp_green += G[id+cols*kernel];
      tmp_blue += B[id+cols*kernel];
    }

    // Horizontal neighbours `kernel` columns away; at the left and right
    // borders only the neighbour that exists is taken.
    if ( id%cols == 0 ) {
      tmp_red += R[id+kernel];
      tmp_green += G[id+kernel];
      tmp_blue += B[id+kernel];
    }
    else if ( id%cols == cols-1 ) {
      tmp_red += R[id-kernel];
      tmp_green += G[id-kernel];
      tmp_blue += B[id-kernel];
    }
    else {
      tmp_red += R[id+kernel] + R[id-kernel];
      tmp_green += G[id+kernel] + G[id-kernel];
      tmp_blue += B[id+kernel] + B[id-kernel];
    }

    // Average over the (up to) four sampled neighbours.
    RED[id] = tmp_red/4;
    GREEN[id] = tmp_green/4;
    BLUE[id] = tmp_blue/4;
  }
}
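
For reference, the same neighbour-averaging scheme can be written as a plain CPU loop. The sketch below is not part of the original program (blurChannelCPU is a hypothetical helper); it processes a single channel and mirrors the kernel's boundary handling, which can be handy for checking the GPU output.

// Hypothetical CPU reference for one channel, mirroring the kernel's logic.
void blurChannelCPU(const int *in, int *out, int rows, int cols, int kernel)
{
  int size = rows * cols;
  for (int id = 0; id < size; ++id) {
    int tmp = 0;
    if (id - cols*kernel >= 0)   tmp += in[id - cols*kernel];  // neighbour `kernel` rows above
    if (id + cols*kernel < size) tmp += in[id + cols*kernel];  // neighbour `kernel` rows below
    if (id % cols == 0)              tmp += in[id + kernel];   // left border: right neighbour only
    else if (id % cols == cols - 1)  tmp += in[id - kernel];   // right border: left neighbour only
    else tmp += in[id + kernel] + in[id - kernel];             // both horizontal neighbours
    out[id] = tmp / 4;                                         // same divisor as the kernel
  }
}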

Function to populate a vector (despite its name, randomFill just fills it with zeros)

void randomFill(int *V, int row, int col){
  for(int i=0 ; i<row ; i++){
    for(int j=0 ; j<col ; j++){
      V[i*col+j] = 0;
    }
  }
}

Function to check whether a CUDA call returned an error and, if so, print it and abort

void checkError(std::string s, cudaError_t err){
	if(err != cudaSuccess){
		std::cout<<s<<" "<<cudaGetErrorString(err)<<std::endl;
		exit(EXIT_FAILURE);
	}
}

Read the image and clone it to hold the blurred result

char* imageName = argv[1];  // argv[1] is expected to be the path of the input image
Mat image;
image = imread( imageName, IMREAD_COLOR );
if( !image.data )
{
  std::cout<<"No image data\n";
  return -1;
}

Mat blur = image.clone();

Declare the vector sizes and the default thread configuration

int radius;
size_t the_size = image.rows * image.cols * sizeof(int);  // bytes per channel
int total_size = image.rows * image.cols;                  // number of pixels

int NTHREADS = 192;          // default number of threads
int threadsPerBlock = 256;   // default CUDA block size

Get the kernel radius and the number of threads from the command-line arguments, if present

if(argc < 3 || argv[2] == NULL) {
  radius = 3;
  NTHREADS = 192;
  threadsPerBlock = 256;
  std::cout<<"Radius and CUDA NUM_THREADS are null"<<std::endl;
}
else {
  radius = atoi(argv[2]);
}

if (argc < 4 || argv[3] == NULL) {
  NTHREADS = 192;
  threadsPerBlock = 256;
  std::cout<<"CUDA NUM_THREADS is null"<<std::endl;
}
else {
  NTHREADS = atoi(argv[3]);
  threadsPerBlock = atoi(argv[3]);
}

Declare the host and device pointers and allocate the host buffers

cudaError_t err = cudaSuccess;

int *h_R, *h_G, *h_B, *h_RED, *h_GREEN, *h_BLUE;
int *d_R, *d_G, *d_B, *d_RED, *d_GREEN, *d_BLUE;

//Memory in Host: total_size elements per channel (the_size is that amount in bytes)
h_R = new int[total_size];
h_G = new int[total_size];
h_B = new int[total_size];
h_RED = new int[total_size];
h_GREEN = new int[total_size];
h_BLUE = new int[total_size];

Fill the output arrays RED, GREEN and BLUE with zeros, then copy the image data into the input arrays

randomFill(h_RED, image.rows, image.cols);
randomFill(h_GREEN, image.rows, image.cols);
randomFill(h_BLUE, image.rows, image.cols);

//Fill arrays RGB with the image data.
//Note: OpenCV stores pixels as BGR, so channel 0 is actually blue; the naming
//here is swapped, but the write-back below uses the same mapping, so the
//output image keeps its colours.
for (int i = 0; i < image.rows; ++i)
{
  for (int j = 0; j < image.cols; ++j)
  {
    h_R[i*image.cols+j] = image.at<Vec3b>(i,j)[0];
    h_G[i*image.cols+j] = image.at<Vec3b>(i,j)[1];
    h_B[i*image.cols+j] = image.at<Vec3b>(i,j)[2];
  }
}

Allocate memory on the GPU

err = cudaMalloc((void**)&d_R, the_size);
checkError("Error allocating device memory for R",err);

err = cudaMalloc((void**)&d_G, the_size);
checkError("Error allocating device memory for G",err);

err = cudaMalloc((void**)&d_B, the_size);
checkError("Error allocating device memory for B",err);

err = cudaMalloc((void**)&d_RED, the_size);
checkError("Error allocating device memory for RED",err);

err = cudaMalloc((void**)&d_GREEN, the_size);
checkError("Error allocating device memory for GREEN",err);

err = cudaMalloc((void**)&d_BLUE, the_size);
checkError("Error allocating device memory for BLUE",err);

Using cudaMemcpy(), we copy the input data to the device with the parameter cudaMemcpyHostToDevice; after the kernel runs, the results are copied back to the host with cudaMemcpyDeviceToHost

err = cudaMemcpy(d_R, h_R, the_size, cudaMemcpyHostToDevice);
checkError("Error copying R data to the device", err);

err = cudaMemcpy(d_G, h_G, the_size, cudaMemcpyHostToDevice);
checkError("Error copying G data to the device", err);

err = cudaMemcpy(d_B, h_B, the_size, cudaMemcpyHostToDevice);
checkError("Error copying B data to the device", err);

Launch the CUDA kernel with the device vectors

blurEffect<<<blocksPerGrid, threadsPerBlock>>>(d_R,d_G,d_B,d_RED,d_GREEN,d_BLUE,total_size,image.rows,image.cols, radius, NTHREADS);
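
The launch uses blocksPerGrid, which is not declared anywhere on this page. A plausible definition (an assumption here, to be placed before the launch) rounds the pixel count up to a whole number of blocks:

// Assumed definition: enough blocks of threadsPerBlock threads so that
// every pixel index 0..total_size-1 gets its own thread.
int blocksPerGrid = (total_size + threadsPerBlock - 1) / threadsPerBlock;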

Check for launch errors and copy the results from device to host

err = cudaGetLastError();
checkError("Error executing the kernel",err);

err = cudaMemcpy(h_RED, d_RED, the_size, cudaMemcpyDeviceToHost);
checkError("Error copying RED back to the host", err);

err = cudaMemcpy(h_GREEN, d_GREEN, the_size, cudaMemcpyDeviceToHost);
checkError("Error copying GREEN back to the host", err);

err = cudaMemcpy(h_BLUE, d_BLUE, the_size, cudaMemcpyDeviceToHost);
checkError("Error copying BLUE back to the host", err);

Copy the results into the output image

for (int i = 0; i < image.rows; ++i)
{
	for (int j = 0; j < image.cols; ++j)
	{
		blur.at<Vec3b>(i,j)[0] = h_RED[i*image.cols+j];
		blur.at<Vec3b>(i,j)[1] = h_GREEN[i*image.cols+j];
		blur.at<Vec3b>(i,j)[2] = h_BLUE[i*image.cols+j];
	}
}

Take the time and write the result to a file for later analysis

//take the time
clock_gettime(CLOCK_MONOTONIC, &finish);
elapsed = (finish.tv_sec - start.tv_sec);
elapsed += (finish.tv_nsec - start.tv_nsec) / 1000000000.0;
std::cout<<"Time: "<< elapsed << std::endl;

//write in file
std::ofstream output;
output.open("maincudaResult.txt", std::ofstream::app | std::ofstream::out );
output<< imageName <<", "<< radius << ", " << threadsPerBlock <<", "<< elapsed << std::endl;
output.close();
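
The variables start, finish and elapsed are not declared in this excerpt; a typical setup (assumed here) declares them near the top of main() and captures the start time before the CUDA work being measured:

// Assumed declarations, placed before the code being timed:
struct timespec start, finish;
double elapsed;
clock_gettime(CLOCK_MONOTONIC, &start);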

Because we are environmentally conscientious coders, we clean up after ourselves with cudaFree()

//free Memory in device
err=cudaFree(d_R);
checkError("Error freeing device memory for R",err);
err=cudaFree(d_G);
checkError("Error freeing device memory for G",err);
err=cudaFree(d_B);
checkError("Error freeing device memory for B",err);
err=cudaFree(d_RED);
checkError("Error freeing device memory for RED",err);
err=cudaFree(d_GREEN);
checkError("Error freeing device memory for GREEN",err);
err=cudaFree(d_BLUE);
checkError("Error freeing device memory for BLUE",err);

err = cudaDeviceReset();
checkError("Error resetting the device",err);

//free memory in host (allocated with new[], so delete[] is required)
delete[] h_R;
delete[] h_G;
delete[] h_B;
delete[] h_RED;
delete[] h_GREEN;
delete[] h_BLUE;
