simpleMPI.cu
//
// Created by kwoodle on 5/10/20.
//
/*
* Copyright 1993-2015 NVIDIA Corporation. All rights reserved.
*
* Please refer to the NVIDIA end user license agreement (EULA) associated
* with this source code for terms and conditions that govern your use of
* this software. Any use, reproduction, disclosure, or distribution of
* this software and related documentation outside the terms of the EULA
* is strictly prohibited.
*
*/
/* Simple example demonstrating how to use MPI with CUDA
*
* Generate some random numbers on one node.
* Dispatch them to all nodes.
* Compute their square root on each node's GPU.
* Compute the average of the results using MPI.
*
* simpleMPI.cu: GPU part, compiled with nvcc
*/
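/*
 * For orientation, the outline below is a minimal, illustrative sketch of the
 * host-side MPI driver that this file pairs with (in the original sample the
 * MPI part lives in a separate .cpp file; this sketch is not that file). It
 * assumes the declarations in simpleMPI.h: initData(), computeGPU(), sum(),
 * and my_abort(). The block/grid sizes are placeholder values.
 *
 *     #include <mpi.h>
 *     #include <iostream>
 *     #include "simpleMPI.h"
 *
 *     int main(int argc, char *argv[]) {
 *         MPI_Init(&argc, &argv);
 *
 *         int rank, commSize;
 *         MPI_Comm_rank(MPI_COMM_WORLD, &rank);
 *         MPI_Comm_size(MPI_COMM_WORLD, &commSize);
 *
 *         int blockSize = 256;              // threads per block (placeholder)
 *         int gridSize = 1024;              // blocks per grid (placeholder)
 *         int dataSizePerNode = blockSize * gridSize;
 *         int dataSizeTotal = dataSizePerNode * commSize;
 *
 *         // Generate some random numbers on one node (rank 0)
 *         float *dataRoot = NULL;
 *         if (rank == 0) {
 *             dataRoot = new float[dataSizeTotal];
 *             initData(dataRoot, dataSizeTotal);
 *         }
 *
 *         // Dispatch them to all nodes
 *         float *dataNode = new float[dataSizePerNode];
 *         MPI_Scatter(dataRoot, dataSizePerNode, MPI_FLOAT,
 *                     dataNode, dataSizePerNode, MPI_FLOAT,
 *                     0, MPI_COMM_WORLD);
 *         if (rank == 0) {
 *             delete[] dataRoot;
 *         }
 *
 *         // Compute their square root on each node's GPU (defined below)
 *         computeGPU(dataNode, blockSize, gridSize);
 *
 *         // Compute the average of the results using MPI
 *         float sumNode = sum(dataNode, dataSizePerNode);
 *         float sumRoot = 0.f;
 *         MPI_Reduce(&sumNode, &sumRoot, 1, MPI_FLOAT, MPI_SUM, 0, MPI_COMM_WORLD);
 *         if (rank == 0) {
 *             std::cout << "Average of square roots is: "
 *                       << sumRoot / dataSizeTotal << std::endl;
 *         }
 *
 *         delete[] dataNode;
 *         MPI_Finalize();
 *         return 0;
 *     }
 */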
#include <iostream>
#include <cstdlib>  // rand(), RAND_MAX

using std::cerr;
using std::endl;

#include "simpleMPI.h"
// Error handling macro
#define CUDA_CHECK(call)                                                      \
    if ((call) != cudaSuccess) {                                              \
        cudaError_t err = cudaGetLastError();                                 \
        cerr << "CUDA error calling \"" #call "\", code is " << err << endl;  \
        my_abort(err);                                                        \
    }
// Device code
// Very simple GPU Kernel that computes square roots of input numbers
__global__ void simpleMPIKernel(float *input, float *output) {
    int tid = blockIdx.x * blockDim.x + threadIdx.x;
    output[tid] = sqrt(input[tid]);
}
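/*
 * For reference, the kernel above is the parallel form of this sequential
 * loop, with one GPU thread handling one element (the launch in computeGPU()
 * covers exactly dataSize = gridSize * blockSize elements, so no bounds
 * check is needed):
 *
 *     for (int i = 0; i < dataSize; i++) {
 *         output[i] = sqrt(input[i]);
 *     }
 */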
// Initialize an array with random data (between 0 and 1)
void initData(float *data, int dataSize) {
    for (int i = 0; i < dataSize; i++) {
        data[i] = (float) rand() / RAND_MAX;
    }
}
// CUDA computation on each node
// No MPI here, only CUDA
void computeGPU(float *hostData, int blockSize, int gridSize) {
    int dataSize = blockSize * gridSize;

    // Allocate data on GPU memory
    float *deviceInputData = NULL;
    CUDA_CHECK(cudaMalloc((void **) &deviceInputData, dataSize * sizeof(float)));
    float *deviceOutputData = NULL;
    CUDA_CHECK(cudaMalloc((void **) &deviceOutputData, dataSize * sizeof(float)));

    // Copy to GPU memory
    CUDA_CHECK(cudaMemcpy(deviceInputData, hostData, dataSize * sizeof(float), cudaMemcpyHostToDevice));

    // Run kernel
    simpleMPIKernel<<<gridSize, blockSize>>>(deviceInputData, deviceOutputData);

    // Copy data back to CPU memory
    CUDA_CHECK(cudaMemcpy(hostData, deviceOutputData, dataSize * sizeof(float), cudaMemcpyDeviceToHost));

    // Free GPU memory
    CUDA_CHECK(cudaFree(deviceInputData));
    CUDA_CHECK(cudaFree(deviceOutputData));
}
// Sum the data values; used on the host when computing the average with MPI
float sum(float *data, int size) {
    float accum = 0.f;
    for (int i = 0; i < size; i++) {
        accum += data[i];
    }
    return accum;
}