Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Project 2: Nick Moon #19

Open
wants to merge 11 commits into
base: main
Choose a base branch
from
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
392 changes: 386 additions & 6 deletions README.md

Large diffs are not rendered by default.

Binary file added images/figures/graph_blocksize.png
Loading
Sorry, something went wrong. Reload?
Sorry, we cannot display this file.
Sorry, this file is invalid so it cannot be displayed.
Binary file added images/figures/graph_compact.png
Loading
Sorry, something went wrong. Reload?
Sorry, we cannot display this file.
Sorry, this file is invalid so it cannot be displayed.
Binary file added images/figures/graph_pow2.png
Loading
Sorry, something went wrong. Reload?
Sorry, we cannot display this file.
Sorry, this file is invalid so it cannot be displayed.
Binary file added images/figures/graph_scan.png
Loading
Sorry, something went wrong. Reload?
Sorry, we cannot display this file.
Sorry, this file is invalid so it cannot be displayed.
Binary file added images/figures/scan_downsweep.PNG
Loading
Sorry, something went wrong. Reload?
Sorry, we cannot display this file.
Sorry, this file is invalid so it cannot be displayed.
Binary file added images/figures/scan_upsweep.PNG
Loading
Sorry, something went wrong. Reload?
Sorry, we cannot display this file.
Sorry, this file is invalid so it cannot be displayed.
Binary file added images/results/compact_15.PNG
Loading
Sorry, something went wrong. Reload?
Sorry, we cannot display this file.
Sorry, this file is invalid so it cannot be displayed.
Binary file added images/results/compact_20.PNG
Loading
Sorry, something went wrong. Reload?
Sorry, we cannot display this file.
Sorry, this file is invalid so it cannot be displayed.
Binary file added images/results/compact_25.PNG
Loading
Sorry, something went wrong. Reload?
Sorry, we cannot display this file.
Sorry, this file is invalid so it cannot be displayed.
Binary file added images/results/scan_15.PNG
Loading
Sorry, something went wrong. Reload?
Sorry, we cannot display this file.
Sorry, this file is invalid so it cannot be displayed.
Binary file added images/results/scan_20.PNG
Loading
Sorry, something went wrong. Reload?
Sorry, we cannot display this file.
Sorry, this file is invalid so it cannot be displayed.
Binary file added images/results/scan_25.PNG
Loading
Sorry, something went wrong. Reload?
Sorry, we cannot display this file.
Sorry, this file is invalid so it cannot be displayed.
Binary file added images/results/thrust_nsight_timeline_memcpy.PNG
Loading
Sorry, something went wrong. Reload?
Sorry, we cannot display this file.
Sorry, this file is invalid so it cannot be displayed.
Loading
Sorry, something went wrong. Reload?
Sorry, we cannot display this file.
Sorry, this file is invalid so it cannot be displayed.
56 changes: 44 additions & 12 deletions src/main.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -13,11 +13,11 @@
#include <stream_compaction/thrust.h>
#include "testing_helpers.hpp"

const int SIZE = 1 << 8; // feel free to change the size of array
const int SIZE = 1 << 20; // feel free to change the size of array
const int NPOT = SIZE - 3; // Non-Power-Of-Two
int *a = new int[SIZE];
int *b = new int[SIZE];
int *c = new int[SIZE];
int* a = new int[SIZE];
int* b = new int[SIZE];
int* c = new int[SIZE];

int main(int argc, char* argv[]) {
// Scan tests
Expand Down Expand Up @@ -51,7 +51,7 @@ int main(int argc, char* argv[]) {
printDesc("naive scan, power-of-two");
StreamCompaction::Naive::scan(SIZE, c, a);
printElapsedTime(StreamCompaction::Naive::timer().getGpuElapsedTimeForPreviousOperation(), "(CUDA Measured)");
//printArray(SIZE, c, true);
printArray(SIZE, c, true);
printCmpResult(SIZE, b, c);

/* For bug-finding only: Array of 1s to help find bugs in stream compaction or scan
Expand All @@ -64,35 +64,35 @@ int main(int argc, char* argv[]) {
printDesc("naive scan, non-power-of-two");
StreamCompaction::Naive::scan(NPOT, c, a);
printElapsedTime(StreamCompaction::Naive::timer().getGpuElapsedTimeForPreviousOperation(), "(CUDA Measured)");
//printArray(SIZE, c, true);
printArray(NPOT, c, true);
printCmpResult(NPOT, b, c);

zeroArray(SIZE, c);
printDesc("work-efficient scan, power-of-two");
StreamCompaction::Efficient::scan(SIZE, c, a);
printElapsedTime(StreamCompaction::Efficient::timer().getGpuElapsedTimeForPreviousOperation(), "(CUDA Measured)");
//printArray(SIZE, c, true);
printArray(SIZE, c, true);
printCmpResult(SIZE, b, c);

zeroArray(SIZE, c);
printDesc("work-efficient scan, non-power-of-two");
StreamCompaction::Efficient::scan(NPOT, c, a);
printElapsedTime(StreamCompaction::Efficient::timer().getGpuElapsedTimeForPreviousOperation(), "(CUDA Measured)");
//printArray(NPOT, c, true);
printArray(NPOT, c, true);
printCmpResult(NPOT, b, c);

zeroArray(SIZE, c);
printDesc("thrust scan, power-of-two");
StreamCompaction::Thrust::scan(SIZE, c, a);
printElapsedTime(StreamCompaction::Thrust::timer().getGpuElapsedTimeForPreviousOperation(), "(CUDA Measured)");
//printArray(SIZE, c, true);
printArray(SIZE, c, true);
printCmpResult(SIZE, b, c);

zeroArray(SIZE, c);
printDesc("thrust scan, non-power-of-two");
StreamCompaction::Thrust::scan(NPOT, c, a);
printElapsedTime(StreamCompaction::Thrust::timer().getGpuElapsedTimeForPreviousOperation(), "(CUDA Measured)");
//printArray(NPOT, c, true);
printArray(NPOT, c, true);
printCmpResult(NPOT, b, c);

printf("\n");
Expand Down Expand Up @@ -137,16 +137,48 @@ int main(int argc, char* argv[]) {
printDesc("work-efficient compact, power-of-two");
count = StreamCompaction::Efficient::compact(SIZE, c, a);
printElapsedTime(StreamCompaction::Efficient::timer().getGpuElapsedTimeForPreviousOperation(), "(CUDA Measured)");
//printArray(count, c, true);
printArray(count, c, true);
printCmpLenResult(count, expectedCount, b, c);

zeroArray(SIZE, c);
printDesc("work-efficient compact, non-power-of-two");
count = StreamCompaction::Efficient::compact(NPOT, c, a);
printElapsedTime(StreamCompaction::Efficient::timer().getGpuElapsedTimeForPreviousOperation(), "(CUDA Measured)");
//printArray(count, c, true);
printArray(count, c, true);
printCmpLenResult(count, expectedNPOT, b, c);


// RADIX SORT ON GPU NOT IMPLEMENTED
/*
printf("\n");
printf("*****************************\n");
printf("** RADIX SORT TESTS **\n");
printf("*****************************\n");

genArray(SIZE - 1, a, 100); // Leave a 0 at the end to test that edge case
a[SIZE - 1] = 0;
printArray(SIZE, a, true);

zeroArray(SIZE, b);
printDesc("cpu std::stable_sort, power-of-two");
StreamCompaction::CPU::stdSort(SIZE, b, a);
printElapsedTime(StreamCompaction::CPU::timer().getCpuElapsedTimeForPreviousOperation(), "(std::chrono Measured)");
printArray(SIZE, b, true);

zeroArray(SIZE, c);
printDesc("cpu radix sort, power-of-two");
StreamCompaction::CPU::radixSort(SIZE, c, a);
printElapsedTime(StreamCompaction::CPU::timer().getCpuElapsedTimeForPreviousOperation(), "(std::chrono Measured)");
//printArray(SIZE, c, true);
printCmpResult(SIZE, b, c);

zeroArray(SIZE, c);
printDesc("gpu radix sort, power-of-two");
StreamCompaction::Efficient::radixSort(SIZE, c, a);
printElapsedTime(StreamCompaction::Efficient::timer().getGpuElapsedTimeForPreviousOperation(), "(CUDA Measured)");
//printArray(SIZE, c, true);
printCmpResult(SIZE, b, c);*/

system("pause"); // stop Win32 console from closing on exit
delete[] a;
delete[] b;
Expand Down
2 changes: 1 addition & 1 deletion src/testing_helpers.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -61,7 +61,7 @@ void printArray(int n, int *a, bool abridged = false) {
printf(" [ ");
for (int i = 0; i < n; i++) {
if (abridged && i + 2 == 15 && n > 16) {
i = n - 2;
i = n - 3;
printf("... ");
}
printf("%3d ", a[i]);
Expand Down
51 changes: 49 additions & 2 deletions stream_compaction/common.cu
Original file line number Diff line number Diff line change
Expand Up @@ -23,7 +23,18 @@ namespace StreamCompaction {
* which map to 0 will be removed, and elements which map to 1 will be kept.
*/
__global__ void kernMapToBoolean(int n, int *bools, const int *idata) {
    // Each thread handles the single element at its global index.
    const int idx = (blockIdx.x * blockDim.x) + threadIdx.x;

    // Threads whose index falls beyond the n elements do nothing.
    if (idx < n) {
        // Flag is 1 for any non-zero input value and 0 for a zero value.
        bools[idx] = (idata[idx] != 0) ? 1 : 0;
    }
}

/**
Expand All @@ -32,8 +43,44 @@ namespace StreamCompaction {
*/
__global__ void kernScatter(int n, int *odata,
        const int *idata, const int *bools, const int *indices) {
    // Each thread handles the single element at its global index.
    const int idx = (blockIdx.x * blockDim.x) + threadIdx.x;

    // Out-of-range threads have no element to process.
    if (idx >= n) {
        return;
    }

    // Only elements flagged with exactly 1 are written out.
    if (bools[idx] != 1) {
        return;
    }

    // indices[idx] supplies the destination slot for this kept element.
    odata[indices[idx]] = idata[idx];
}


/**
* Maps an array to an array of 0s and 1s for radix sort. Elements
* that have any bit of the mask c set are mapped to 0, otherwise 1
* (i.e. the flag is the negation of the tested bit).
*/
__global__ void kernMapToBooleanBitwiseCheck(int n, int c, int* bools, const int* idata) {
    // Each thread handles the single element at its global index.
    const int idx = (blockIdx.x * blockDim.x) + threadIdx.x;

    // Threads whose index falls beyond the n elements do nothing.
    if (idx < n) {
        // c is used as a bit mask: the flag is 0 when the element shares
        // any set bit with c, and 1 when it shares none.
        const bool masked_bits_set = (idata[idx] & c) != 0;
        bools[idx] = masked_bits_set ? 0 : 1;
    }
}


/**
 * Writes the n elements of odata into odata_reversed in reverse order,
 * i.e. odata_reversed[i] = odata[n - 1 - i].
 */
__global__ void kernReverseArray(int n, int* odata_reversed, const int* odata) {
    // Each thread handles the single output slot at its global index.
    const int dst = (blockIdx.x * blockDim.x) + threadIdx.x;

    // Threads whose index falls beyond the n elements do nothing.
    if (dst < n) {
        // Mirror position of dst within the n-element array.
        const int src = n - dst - 1;
        odata_reversed[dst] = odata[src];
    }
}
}
}
10 changes: 10 additions & 0 deletions stream_compaction/common.h
Original file line number Diff line number Diff line change
Expand Up @@ -9,10 +9,16 @@
#include <algorithm>
#include <chrono>
#include <stdexcept>
#include <iostream>
#include <vector>

#define FILENAME (strrchr(__FILE__, '/') ? strrchr(__FILE__, '/') + 1 : __FILE__)
#define checkCUDAError(msg) checkCUDAErrorFn(msg, FILENAME, __LINE__)

#define blockSize 128

//#define inclusiveToExclusive

/**
* Check for CUDA errors; print and exit if there was a problem.
*/
Expand All @@ -37,6 +43,10 @@ namespace StreamCompaction {
__global__ void kernScatter(int n, int *odata,
const int *idata, const int *bools, const int *indices);

__global__ void kernMapToBooleanBitwiseCheck(int n, int c, int* bools, const int* idata);

__global__ void kernReverseArray(int n, int* odata_reversed, const int* odata);

/**
* This class is used for timing the performance
* Uncopyable and unmovable
Expand Down
148 changes: 143 additions & 5 deletions stream_compaction/cpu.cu
Original file line number Diff line number Diff line change
Expand Up @@ -19,7 +19,10 @@ namespace StreamCompaction {
*/
void scan(int n, int *odata, const int *idata) {
    timer().startCpuTimer();

    // Exclusive prefix sum: odata[0] = 0 and
    // odata[i] = idata[0] + ... + idata[i - 1] for i >= 1.
    // The guard keeps n <= 0 from writing odata[0] into an empty buffer;
    // the timer calls stay paired on every path.
    if (n > 0) {
        odata[0] = 0;
        for (int i = 1; i < n; ++i) {
            odata[i] = odata[i - 1] + idata[i - 1];
        }
    }

    timer().endCpuTimer();
}

Expand All @@ -30,9 +33,15 @@ namespace StreamCompaction {
*/
int compactWithoutScan(int n, int *odata, const int *idata) {
    timer().startCpuTimer();

    // Copy every non-zero element of idata to the front of odata,
    // preserving the input order. kept is both the next write position
    // and the running count of elements copied so far.
    int kept = 0;
    for (int i = 0; i < n; ++i) {
        if (idata[i] != 0) {
            odata[kept] = idata[i];
            ++kept;
        }
    }

    timer().endCpuTimer();

    // Return the actual number of kept elements (the placeholder
    // `return -1;` that preceded this made the count unreachable).
    return kept;
}

/**
Expand All @@ -41,10 +50,139 @@ namespace StreamCompaction {
* @returns the number of elements remaining after compaction.
*/
int compactWithScan(int n, int *odata, const int *idata) {
    // Scratch buffer for the exclusive scan of the keep-flags. It is created
    // before the timer starts, and std::vector releases it on every exit
    // path (the previous raw new[] was leaked by an early return).
    std::vector<int> scan_result(n > 0 ? n : 0);

    timer().startCpuTimer();

    int num_elements = 0;

    // Guarded so that n <= 0 never touches element 0 of an empty buffer.
    if (n > 0) {
        // MAP: odata temporarily holds the keep-flag (1 = non-zero, 0 = zero).
        for (int i = 0; i < n; ++i) {
            odata[i] = (idata[i] != 0) ? 1 : 0;
        }

        // SCAN: exclusive prefix sum of the flags gives each kept element
        // its destination index.
        scan_result[0] = 0;
        for (int i = 1; i < n; ++i) {
            scan_result[i] = scan_result[i - 1] + odata[i - 1];
        }

        // SCATTER: done in place. The destination index never exceeds i,
        // so a write can only overwrite a flag that has already been read.
        for (int i = 0; i < n; ++i) {
            if (odata[i] == 1) {
                odata[scan_result[i]] = idata[i];
                ++num_elements;
            }
        }
    }

    timer().endCpuTimer();

    return num_elements;
}

/**
* CPU radix sort implementation
*
* @param n The number of elements in idata.
* @param odata The array into which to store elements.
* @param idata The array of elements to sort.
*/
void radixSort(int n, int* odata, const int* idata) {
    // Least-significant-bit-first binary radix sort over two ping-pong
    // buffers of n elements each. The buffers are created (and idata copied)
    // before the timer starts, and are released automatically on exit.
    const std::size_t count = (n > 0) ? static_cast<std::size_t>(n) : 0;
    std::vector<int> current(idata, idata + count);
    std::vector<int> next(count);

    const int num_bits = static_cast<int>(sizeof(int) * 8);

    timer().startCpuTimer();

    for (int bit = 0; bit < num_bits; ++bit) {
        // Unsigned mask: 1 << 31 on a signed int would overflow.
        const unsigned int mask = 1u << bit;

        // On the final (sign-bit) pass the split is inverted so that values
        // with the sign bit set land in the lower partition, i.e. negatives
        // end up before non-negatives. Assumes two's-complement int.
        const bool sign_pass = (bit == num_bits - 1);

        // First sweep: size of the lower partition for this bit.
        std::size_t low_count = 0;
        for (std::size_t i = 0; i < count; ++i) {
            const bool bit_set = (static_cast<unsigned int>(current[i]) & mask) != 0;
            if (bit_set == sign_pass) {
                ++low_count;
            }
        }

        // Second sweep: stable split. Lower-partition elements fill
        // next[0 .. low_count), the rest fill next[low_count .. count),
        // each keeping its relative order from the previous pass.
        std::size_t low_pos = 0;
        std::size_t high_pos = low_count;
        for (std::size_t i = 0; i < count; ++i) {
            const int value = current[i];
            const bool bit_set = (static_cast<unsigned int>(value) & mask) != 0;
            if (bit_set == sign_pass) {
                next[low_pos++] = value;
            }
            else {
                next[high_pos++] = value;
            }
        }

        current.swap(next);
    }

    timer().endCpuTimer();

    // Copy-out happens after the timer has been stopped.
    for (std::size_t i = 0; i < count; ++i) {
        odata[i] = current[i];
    }
}

/**
* CPU reference sort implemented with std::stable_sort (not a radix sort)
*
* @param n The number of elements in idata.
* @param odata The array into which to store elements.
* @param idata The array of elements to sort.
*/
void stdSort(int n, int* odata, const int* idata) {
    // Sort a private copy so idata is left untouched; the copy is made
    // before the timer starts.
    std::vector<int> sorted(idata, idata + n);

    timer().startCpuTimer();
    std::stable_sort(sorted.begin(), sorted.end());
    timer().endCpuTimer();

    // Write the sorted values to odata after the timer has been stopped.
    int* out = odata;
    for (const int value : sorted) {
        *out = value;
        ++out;
    }
}
}
}
Loading