Skip to content

Commit 719b1fa

Browse files
authored
Initial commit
0 parents  commit 719b1fa

15 files changed

+289
-0
lines changed

.figures/mnist.png

26.1 KB
Loading

.github/setup.sh

+8
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,8 @@
1+
#!/bin/bash
# Local equivalent of the CI pipeline: installs the build dependencies,
# configures and builds the CPU-only project in build_cpu/, then runs the
# test executable. The `cd build_cpu` below is relative, so run this script
# from the repository root.

# Stop at the first failing step instead of carrying on with a broken setup.
set -euo pipefail

# -y keeps the installation non-interactive.
sudo apt-get update
sudo apt-get install -y cmake
pip install torch --index-url https://download.pytorch.org/whl/cpu
pip install numpy

cd build_cpu
# Quote the substitution so a prefix path containing spaces stays one argument.
cmake -DCMAKE_PREFIX_PATH="$(python -c 'import torch;print(torch.utils.cmake_prefix_path)')" .
cmake --build . --config Release
./test-net
+34
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,34 @@
1+
# Builds the CPU-only project in build_cpu/ and runs the test executable on every push.
name: build-and-run-tests

on: [push]

jobs:
  build-and-run-tests:
    runs-on: ubuntu-latest

    steps:
      - name: get_packages
        # -y keeps apt non-interactive; the runner has no terminal to answer a prompt.
        run: sudo apt-get update && sudo apt-get install -y cmake

      - name: install_pytorch
        run: pip3 install torch --index-url https://download.pytorch.org/whl/cpu

      - name: install_numpy
        run: pip install numpy

      - name: get_repo
        uses: actions/checkout@v3
        with:
          path: main

      - name: configure_and_build
        shell: bash
        working-directory: ${{github.workspace}}/main/build_cpu
        run: |
          # Quote the substitution so the prefix path is passed as a single argument.
          cmake -DCMAKE_PREFIX_PATH="$(python -c 'import torch;print(torch.utils.cmake_prefix_path)')" .
          cmake --build . --config Release

      - name: runtest
        shell: bash
        working-directory: ${{github.workspace}}/main/build_cpu
        run: ./test-net

.gitignore

+3
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,3 @@
1+
.vscode/
2+
dependencies/
3+
build/

CMakeLists.txt

+18
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,18 @@
1+
# CUDA-enabled build of the training (train-net) and test (test-net) executables.
cmake_minimum_required(VERSION 3.18 FATAL_ERROR)

project(train-net LANGUAGES CXX CUDA)
# CMP0004 governs leading/trailing whitespace in linked library names.
# NOTE(review): kept from the original configuration; presumably needed for
# the link items exported by Torch - confirm before removing.
cmake_policy(SET CMP0004 OLD)
find_package(Torch REQUIRED)

# Locate the CUDA toolkit; it provides the imported CUDA::* targets linked below.
find_package(CUDAToolkit REQUIRED)
set(CUDA_SEPARABLE_COMPILATION ON)
set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} ${TORCH_CXX_FLAGS} -pthread")

# FindCUDAToolkit exposes libraries as imported targets, not as a
# *_libraries variable, so link the CUDA runtime through CUDA::cudart.
add_executable(train-net source/train_net.cpp)
target_link_libraries(train-net "${TORCH_LIBRARIES}" CUDA::cudart)
set_property(TARGET train-net PROPERTY CXX_STANDARD 17)

add_executable(test-net tests/test_net.cpp)
target_link_libraries(test-net "${TORCH_LIBRARIES}" CUDA::cudart)
set_property(TARGET test-net PROPERTY CXX_STANDARD 17)

README.md

+50
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,50 @@
1+
## HPC exercise training a neural network on the MNIST data-set.
2+
- The exercise explores training a neural network using [the torch c++ api](https://pytorch.org/cppdocs/).
3+
4+
![Example MNIST digits](.figures/mnist.png)
5+
6+
You will learn how to train a network to recognize handwritten digits. To do so, we will use the MNIST data-set.
7+
The image above shows example images. The exercise assumes you are working on the systems at the Juelich Supercomputing Centre.
8+
To solve this exercise look through the files in the `source` folder. `TODO`s mark parts of the code that require your attention.
9+
Come back to this readme for additional hints.
10+
11+
- To get started on the JUWELS Booster load the modules
12+
``` bash
13+
module load Stages/2023 GCC/11.3.0 OpenMPI/4.1.4 CUDA/11.7 CMake PyTorch
14+
```
15+
16+
- Use `mkdir build` to create your build directory. Change directory into your build folder and compile by running:
17+
```bash
18+
cmake -DCUDA_CUDA_LIB=/usr/lib64/libcuda.so -DCMAKE_PREFIX_PATH=`python -c 'import torch;print(torch.utils.cmake_prefix_path)'` ..
19+
cmake --build . --config Release
20+
```
21+
22+
- Navigate to `source/net.h` and implement the constructor for the `Net` struct.
23+
The `Net` should implement a fully connected network
24+
25+
$$
26+
y = \ln(\sigma (W_3f_r(W_2 f_r(W_1 x + b_1) + b_2) + b_3))
27+
$$
28+
29+
with $W_1 \in \mathbb{R}^{h_1, n}, W_2 \in \mathbb{R}^{h_2, h_1}, W_3 \in \mathbb{R}^{m, h_2}$
30+
and $b_1 \in \mathbb{R}^{h_1}, b_2 \in \mathbb{R}^{h_2}, b_3 \in \mathbb{R}^{m}$, where
31+
$n$ denotes the input dimension, $h_1$ the number of hidden neurons in the first layer, $h_2$ the number of neurons in the second layer, and $m$ the number of output neurons.
32+
Finally, $f_r$ denotes the ReLU activation function, $\sigma$ the [softmax function](https://en.wikipedia.org/wiki/Softmax_function), and $\ln$ the natural logarithm.
33+
Use `register_module` to add `Linear` layers to the network. Linear layers that implement $Wx + b$ are provided by `torch::nn::Linear`.
34+
Move on to implement the forward pass. Follow the equation above, use `torch::relu` and
35+
`torch::log_softmax`. What happens if you choose `torch::sigmoid` instead of the ReLU?
36+
37+
- Before training your network, implement the `acc` function in `source/train_net.cpp`. It should find the ratio of
38+
correctly identified digits, by comparing the `argmax` of the network output and the annotations.
39+
40+
- Torch devices are defined e.g. by `torch::Device device = torch::kCPU;`. Move to GPUs by choosing `torch::kCUDA` if CUDA GPUs are available.
41+
42+
- Finally iterate over the test data set and compute the test accuracy.
43+
44+
- Train and test your network by executing:
45+
```bash
46+
./train-net
47+
```
48+
49+
- When your network has converged, you should measure more than 90% accuracy.
50+

build_cpu/CMakeLists.txt

+13
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,13 @@
1+
# CPU-only build (no CUDA language enabled) of the training and test executables.
# Sources live one directory up, in ../source and ../tests.
cmake_minimum_required(VERSION 3.18 FATAL_ERROR)

project(train-net LANGUAGES CXX)
cmake_policy(SET CMP0004 OLD)
find_package(Torch REQUIRED)

# Training executable.
add_executable(train-net ../source/train_net.cpp)
target_link_libraries(train-net "${TORCH_LIBRARIES}")
set_target_properties(train-net PROPERTIES CXX_STANDARD 17)

# Test executable.
add_executable(test-net ../tests/test_net.cpp)
target_link_libraries(test-net "${TORCH_LIBRARIES}")
set_target_properties(test-net PROPERTIES CXX_STANDARD 17)

data/t10k-images-idx3-ubyte

7.48 MB
Binary file not shown.

data/t10k-labels-idx1-ubyte

9.77 KB
Binary file not shown.

data/train-images-idx3-ubyte

44.9 MB
Binary file not shown.

data/train-labels-idx1-ubyte

58.6 KB
Binary file not shown.

source/net.h

+31
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,31 @@
1+
#include <torch/torch.h>
2+
3+
#ifndef NET_H
4+
#define NET_H
5+
6+
// Define a new Module.
7+
struct Net : torch::nn::Module {
8+
Net() {
9+
// Construct and register three Linear submodules.
10+
// TODO!
11+
// call them i.e. fc1, fc2 or fc3.
12+
// Your last Linear layer should have ten output neurons.
13+
}
14+
15+
// Implement the Net's algorithm.
16+
torch::Tensor forward(torch::Tensor x) {
17+
// Use one of many tensor manipulation functions.
18+
torch::Tensor y = torch::zeros_like(x);
19+
20+
// run the forward pass by accessing your modules
21+
// and adding activations functions.
22+
// the last function should be a log_softmax.
23+
// TODO!
24+
return y;
25+
}
26+
27+
// Use one of many "standard library" modules.
28+
torch::nn::Linear fc1{nullptr}, fc2{nullptr}, fc3{nullptr};
29+
};
30+
31+
#endif

source/train_net.cpp

+91
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,91 @@
1+
#include <torch/torch.h>
2+
#include <iostream>
3+
#include "net.h"
4+
5+
// Batch accuracy of the network's classifications `preds` against `labels`.
// Exercise stub: the computation is still to be written, so this currently
// always returns 0.
double acc(torch::Tensor& preds, torch::Tensor& labels){
  // TODO: compute the batch accuracy from preds and labels.
  return 0.0;
}
10+
11+
int main() {
12+
torch::manual_seed(1);
13+
// Create a new Net.
14+
auto net = std::make_shared<Net>();
15+
16+
std::cout << "CUDA is available: " << torch::cuda::is_available() << std::endl;
17+
// TODO: Move your data to the GPU.
18+
19+
// Create a multi-threaded data loader for the MNIST dataset.
20+
auto data_set = torch::data::datasets::MNIST("../data/")
21+
.map(torch::data::transforms::Normalize<>(0.5, 0.5))
22+
.map(torch::data::transforms::Stack<>());
23+
24+
auto data_loader = torch::data::make_data_loader<torch::data::samplers::SequentialSampler>(
25+
std::move(data_set),
26+
torch::data::DataLoaderOptions().batch_size(64).workers(2));
27+
28+
//TODO: move your network to the GPU
29+
30+
// Instantiate an SGD optimization algorithm to update our Net's parameters.
31+
torch::optim::SGD optimizer(net->parameters(), /*lr=*/0.01);
32+
33+
for (size_t epoch = 1; epoch <= 20; ++epoch) {
34+
size_t batch_index = 0;
35+
// Iterate the data loader to yield batches from the dataset.
36+
for (auto& batch : *data_loader) {
37+
batch.target = batch.target.to(device, false);
38+
batch.data = batch.data.to(device);
39+
40+
// Reset gradients.
41+
optimizer.zero_grad();
42+
// Execute the model on the input data.
43+
44+
torch::Tensor prediction = net -> forward(batch.data);
45+
// Compute a loss value to judge the prediction of our model.
46+
torch::Tensor loss = torch::nll_loss(prediction, batch.target);
47+
// Compute gradients of the loss w.r.t. the parameters of our model.
48+
loss.backward();
49+
// Update the parameters based on the calculated gradients.
50+
optimizer.step();
51+
// Output the loss and checkpoint every 100 batches.
52+
53+
torch::Tensor net_choice = prediction.argmax(-1);
54+
double accuracy = acc(net_choice, batch.target);
55+
56+
if (++batch_index % 100 == 0) {
57+
std::cout << "Epoch: " << epoch << " | Batch: " << batch_index
58+
<< " | Loss: " << loss.item<float>()
59+
<< " | Accuracy:" << accuracy << std::endl;
60+
61+
}
62+
}
63+
}
64+
65+
(*net).to(torch::kCPU, false);
66+
// Serialize your model periodically as a checkpoint.
67+
torch::save(net, "../trained_net/net.pt");
68+
std::cout << "model saved.";
69+
std::cout << "started tests.";
70+
71+
auto test_set = torch::data::datasets::MNIST("../data/",
72+
torch::data::datasets::MNIST::Mode::kTest)
73+
.map(torch::data::transforms::Normalize<>(0.5, 0.5))
74+
.map(torch::data::transforms::Stack<>());
75+
// test our network
76+
auto test_loader = torch::data::make_data_loader(
77+
test_set,
78+
torch::data::DataLoaderOptions().batch_size(64).workers(2));
79+
80+
double correct = 0;
81+
double total = 0;
82+
for (auto& test_batch: *test_loader) {
83+
// TODO: Loop over the test data and find the test accuracy
84+
// The test accuracy is the ratio of currectly identified
85+
// digits over the total number of digits.
86+
}
87+
double test_acc = 0.;
88+
std::cout << "Total: " << total << " correct: " << correct << std::endl;
89+
std::cout << "Test accuracy:" << test_acc << std::endl;
90+
91+
}

tests/test_net.cpp

+40
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,40 @@
1+
//#include <gtest/gtest.h>
2+
#include <torch/torch.h>
3+
#include <iostream>
4+
#include "../source/net.h"
5+
6+
int main() {
7+
auto net = std::make_shared<Net>();
8+
torch::load(net, "../trained_net/net.pt");
9+
std::cout << "model saved.";
10+
std::cout << "started tests.";
11+
12+
auto test_set = torch::data::datasets::MNIST("../data/",
13+
torch::data::datasets::MNIST::Mode::kTest)
14+
.map(torch::data::transforms::Normalize<>(0.5, 0.5))
15+
.map(torch::data::transforms::Stack<>());
16+
// test our network
17+
auto test_loader = torch::data::make_data_loader(
18+
test_set,
19+
torch::data::DataLoaderOptions().batch_size(64).workers(2));
20+
21+
double correct = 0;
22+
double total = 0;
23+
for (auto& test_batch: *test_loader) {
24+
torch::Tensor test_batch_data = test_batch.data;
25+
torch::Tensor test_batch_labels = test_batch.target;
26+
torch::Tensor test_out = net->forward(test_batch_data);
27+
torch::Tensor test_out_max = test_out.argmax(-1);
28+
for (int i = 0; i < test_out_max.size(0); i++){
29+
total += 1;
30+
if (test_out_max[i].item<int>() == test_batch_labels[i].item<int>()){
31+
correct += 1;
32+
}
33+
}
34+
}
35+
double test_acc = correct / total;
36+
std::cout << "Total: " << total << " correct: " << correct << std::endl;
37+
std::cout << "Test accuracy:" << test_acc << std::endl;
38+
assert(test_acc > 0.9);
39+
return 0;
40+
}

trained_net/README.md

+1
Original file line numberDiff line numberDiff line change
@@ -0,0 +1 @@
1+
Your trained network should appear here.

0 commit comments

Comments
 (0)