
Commit 382d067

[NeoML] DnnDistributed -- remove code copy-paste
Signed-off-by: Kirill Golikov <[email protected]>
1 parent b07e513 commit 382d067

File tree: 5 files changed, +202 −212 lines


NeoML/Python/src/PyDnnDistributed.cpp

9 additions & 8 deletions

@@ -1,4 +1,5 @@
 /* Copyright © 2017-2023 ABBYY
+
 Licensed under the Apache License, Version 2.0 (the "License");
 you may not use this file except in compliance with the License.
 You may obtain a copy of the License at
@@ -28,7 +29,7 @@ int CPyDistributedDataset::SetInputBatch( CDnn& dnn, int thread )
 	const int batchSize = py::int_( input_data[0] );
 	py::dict inputs = py::dict( input_data[1] );
 
-	for ( std::pair<py::handle, py::handle> item : inputs ){
+	for( std::pair<py::handle, py::handle> item : inputs ) {
 		auto layerName = item.first.cast<std::string>();
 		auto input = item.second.attr( "_internal" ).cast<CPyBlob>();
 		CPtr<CSourceLayer> layer = dynamic_cast<CSourceLayer*>( dnn.GetLayer( layerName.c_str() ).Ptr() );
@@ -81,7 +82,7 @@ py::list CPyDistributedTraining::GetOutput( const std::string& layer )
 	py::list output( blobs.Size() );
 
 	CPtr<CPyMathEngineOwner> owner = new CPyMathEngineOwner( &GetDefaultCpuMathEngine(), false );
-	for( int i = 0; i < blobs.Size(); i++ ){
+	for( int i = 0; i < blobs.Size(); i++ ) {
 		CPtr<CDnnBlob> blob = CDnnBlob::CreateBlob( GetDefaultCpuMathEngine(), CT_Float, blobs[i]->GetDesc() );
 		blob->CopyFrom( blobs[i] );
 		output[i] = CPyBlob( *owner, blob );
@@ -123,35 +124,35 @@ void InitializeDistributedTraining(py::module& m)
 			CArchiveFile file( path.c_str(), CArchive::load );
 			CArchive archive( &file, CArchive::load );
 			return new CPyDistributedTraining( archive, count, getInitializer( initializerName ), seed );
-		})
+		} )
 	)
 
 	.def( py::init(
 		[]( CPyDnn& dnn, int count, const std::string& initializerName, int seed ) {
 			return new CPyDistributedTraining( dnn.Dnn(), count, getInitializer( initializerName ), seed );
-		})
+		} )
 	)
 
 	.def( py::init(
 		[]( const std::string& path, py::list cudaDevs, const std::string& initializerName, int seed ) {
 			CArchiveFile file( path.c_str(), CArchive::load );
			CArchive archive( &file, CArchive::load );
 			CArray<int> devs;
-			for( int i = 0; i < cudaDevs.size(); i++ ){
+			for( int i = 0; i < cudaDevs.size(); i++ ) {
 				devs.Add( cudaDevs[i].cast<int>() );
 			}
 			return new CPyDistributedTraining( archive, devs, getInitializer( initializerName ), seed );
-		})
+		} )
 	)
 
 	.def( py::init(
 		[]( CPyDnn& dnn, py::list cudaDevs, const std::string& initializerName, int seed ) {
 			CArray<int> devs;
-			for( int i = 0; i < cudaDevs.size(); i++ ){
+			for( int i = 0; i < cudaDevs.size(); i++ ) {
 				devs.Add( cudaDevs[i].cast<int>() );
 			}
 			return new CPyDistributedTraining( dnn.Dnn(), devs, getInitializer( initializerName ), seed );
-		})
+		} )
 	)
 
 	.def( "_run", &CPyDistributedTraining::Run )
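
As the SetInputBatch hunk above shows, a dataset implementation looks up each source layer by name, sets its blob, and returns the batch size. A minimal plain-C++ sketch of the same contract, for readers who don't go through the Python wrapper (the class name, the layer name "in", and the blob array are illustrative, not part of this commit):

// Hypothetical in-memory dataset; only IDistributedDataset and the
// SetInputBatch contract come from the diff above.
class CInMemoryDataset : public IDistributedDataset {
public:
	explicit CInMemoryDataset( const CObjectArray<CDnnBlob>& blobs )
	{
		for( int i = 0; i < blobs.Size(); ++i ) {
			batches.Add( blobs[i] ); // one batch per distributed model/thread
		}
	}

	int SetInputBatch( CDnn& dnn, int thread ) override
	{
		if( thread >= batches.Size() ) {
			return 0; // no data for this thread on this run
		}
		// Same lookup pattern as CPyDistributedDataset::SetInputBatch above
		CPtr<CSourceLayer> source = dynamic_cast<CSourceLayer*>( dnn.GetLayer( "in" ).Ptr() );
		source->SetBlob( batches[thread] );
		return batches[thread]->GetBatchWidth();
	}

private:
	CObjectArray<CDnnBlob> batches;
};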

NeoML/Python/src/PyDnnDistributed.h

8 additions & 6 deletions

@@ -1,4 +1,5 @@
 /* Copyright © 2017-2023 ABBYY
+
 Licensed under the Apache License, Version 2.0 (the "License");
 you may not use this file except in compliance with the License.
 You may obtain a copy of the License at
@@ -20,7 +21,7 @@ limitations under the License.
 
 class CPyDistributedDataset : public IDistributedDataset {
 public:
-	CPyDistributedDataset( const py::object& data ) : getData( data ) {};
+	CPyDistributedDataset( const py::object& data ) : getData( data ) {}
 	int SetInputBatch( CDnn& dnn, int thread ) override;
 private:
 	py::object getData;
@@ -29,13 +30,14 @@ class CPyDistributedDataset : public IDistributedDataset {
 class CPyDistributedTraining : public CDistributedTraining {
 public:
 	CPyDistributedTraining( CDnn& dnn, int count, TDistributedInitializer initializer, int seed )
-		: CDistributedTraining( dnn, count, initializer, seed ) {};
+		: CDistributedTraining( dnn, count, initializer, seed ) {}
 	CPyDistributedTraining( CArchive& archive, int count, TDistributedInitializer initializer, int seed )
-		: CDistributedTraining( archive, count, initializer, seed ) {};
+		: CDistributedTraining( archive, count, initializer, seed ) {}
 	CPyDistributedTraining( CDnn& dnn, const CArray<int>& cudaDevs, TDistributedInitializer initializer, int seed )
-		: CDistributedTraining( dnn, cudaDevs, initializer, seed ) {};
+		: CDistributedTraining( dnn, cudaDevs, initializer, seed ) {}
 	CPyDistributedTraining( CArchive& archive, const CArray<int>& cudaDevs, TDistributedInitializer initializer, int seed )
-		: CDistributedTraining( archive, cudaDevs, initializer, seed ) {};
+		: CDistributedTraining( archive, cudaDevs, initializer, seed ) {}
+
 	void Run( const py::object& data );
 	void RunAndBackward( const py::object& data );
 	void Learn( const py::object& data );
@@ -46,4 +48,4 @@ class CPyDistributedTraining : public CDistributedTraining {
 	void Save( const std::string& path );
 };
 
-void InitializeDistributedTraining(py::module& m);
+void InitializeDistributedTraining( py::module& m );
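
These wrappers forward directly to the C++ CDistributedTraining API changed in the next file. For orientation, a minimal C++-side training loop over that API might look like this sketch (the network `dnn`, the blob array `batches`, the iteration count, and the loss layer name "loss" are all hypothetical):

// Sketch only: assumes a built CDnn `dnn` with a loss layer named "loss"
// and an IDistributedDataset implementation such as CInMemoryDataset above.
CDistributedTraining distributed( dnn, /*count=*/4 ); // 4 CPU models
CInMemoryDataset dataset( batches );
for( int iteration = 0; iteration < 1000; ++iteration ) {
	distributed.RunAndBackwardOnce( dataset ); // forward + backward on every model
	distributed.Train(); // update the trainable weights of all models
	CArray<float> losses;
	distributed.GetLastLoss( "loss", losses ); // one loss value per model
}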

NeoML/include/NeoML/Dnn/DnnDistributed.h

25 additions & 19 deletions

@@ -1,4 +1,5 @@
 /* Copyright © 2017-2023 ABBYY
+
 Licensed under the Apache License, Version 2.0 (the "License");
 you may not use this file except in compliance with the License.
 You may obtain a copy of the License at
@@ -23,6 +24,7 @@ namespace NeoML {
 // Interface for setting input to a neural network
 class IDistributedDataset {
 public:
+	virtual ~IDistributedDataset() {}
 	// This method must set batches for all of the source layers in CDnn
 	// Returns the current batch size (or 0, if there is no data for this thread on this run)
 	// This batch size affects weights balance between different threads
@@ -54,47 +56,51 @@ class NEOML_API CDistributedTraining {
 	CDistributedTraining( CArchive& archive, const CArray<int>& cudaDevs,
 		TDistributedInitializer initializer = TDistributedInitializer::Xavier, int seed = 42 );
 
-	~CDistributedTraining();
+	virtual ~CDistributedTraining();
 
 	// Gets the number of models in disitrbuted traning
 	int GetModelCount() const { return cnns.Size(); }
 	// Sets the solver for all of the models
-	void SetSolver( CArchive& archive );
+	void SetSolver( CArchive& );
 	// Sets the learning rate for all of the models
 	void SetLearningRate( float rate );
 	// Returns the current learning rate
 	float GetLearningRate() const;
+
 	// Runs the networks without backward and training
-	void RunOnce( IDistributedDataset& data );
+	void RunOnce( IDistributedDataset& );
 	// Runs the networks and performs a backward pass
-	void RunAndBackwardOnce( IDistributedDataset& data );
+	void RunAndBackwardOnce( IDistributedDataset& );
 	// Runs the networks, performs a backward pass and updates the trainable weights of all models
-	void RunAndLearnOnce( IDistributedDataset& data );
+	void RunAndLearnOnce( IDistributedDataset& );
 	// Updates the trainable weights of all models (after RunAndBackwardOnce)
 	void Train();
+
 	// Returns last loss of `layerName` for all models
 	// `layerName` should correspond to CLossLayer, CCtcLossLayer or CCrfLossLayer
-	void GetLastLoss( const CString& layerName, CArray<float>& losses );
+	void GetLastLoss( const CString& layerName, CArray<float>& losses ) const;
 	// Returns last blobs of `layerName` for all models
 	// `layerName` should correspond to CSinkLayer
-	void GetLastBlob( const CString& layerName, CObjectArray<CDnnBlob>& blobs );
+	void GetLastBlob( const CString& layerName, CObjectArray<CDnnBlob>& blobs ) const;
+
 	// Save trained net
-	void Serialize( CArchive& archive );
+	void Serialize( CArchive& );
 	// Save the trained net with the given `index` with its solver state (optional)
 	// An archive with solver state can later be passed to CDnn::SerializeCheckpoint to resume training
-	void StoreDnn( CArchive& archive, int index, bool storeSolver );
+	void StoreDnn( CArchive&, int index, bool storeSolver );
 
 private:
-	const bool isCpu;
-	IThreadPool* threadPool;
-	CArray<IMathEngine*> mathEngines;
-	CArray<CRandom*> rands;
-	CArray<CDnn*> cnns;
-	CArray<int> batchSize;
-	bool isFirstRun = true;
-	CString errorMessage;
-
-	void initialize( CArchive& archive, int count, TDistributedInitializer initializer, int seed );
+	enum class TRunType { Invalid, RunOnce, RunBackwardOnce, Train };
+	class CParams;
+	CParams* const params = nullptr;
+	IThreadPool* const threadPool = nullptr;
+	CArray<IMathEngine*> mathEngines{};
+	CArray<CRandom*> rands{};
+	CArray<CDnn*> cnns{};
+	CArray<int> batchSize{};
+
+	void runOnce( IDistributedDataset*, TRunType );
+	void initialize( CArchive&, int count, TDistributedInitializer initializer, int seed );
 };
 
 } // namespace NeoML
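
The private section is where the commit earns its title: the per-method state (isFirstRun, errorMessage) moves behind an opaque CParams, and a single dispatcher, runOnce( IDistributedDataset*, TRunType ), replaces the duplicated run loops. The public entry points presumably reduce to thin forwarders along these lines (a sketch, not the actual DnnDistributed.cpp; the exact TRunType mapping is an assumption):

// Hypothetical forwarders; only the signatures and the TRunType values
// come from the header diff above.
void CDistributedTraining::RunOnce( IDistributedDataset& data )
{
	runOnce( &data, TRunType::RunOnce );
}

void CDistributedTraining::RunAndBackwardOnce( IDistributedDataset& data )
{
	runOnce( &data, TRunType::RunBackwardOnce );
}

void CDistributedTraining::RunAndLearnOnce( IDistributedDataset& data )
{
	runOnce( &data, TRunType::Train ); // run + backward + weight update in one pass
}

void CDistributedTraining::Train()
{
	runOnce( nullptr, TRunType::Train ); // no new data: only update the weights
}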
