Skip to content

Commit f3a6357

Browse files
committed
[NeoML] DnnDistributed -- remove code copy-paste
Signed-off-by: Kirill Golikov <[email protected]>
1 parent 5f6d2c9 commit f3a6357

File tree

4 files changed

+184
-194
lines changed

4 files changed

+184
-194
lines changed

NeoML/Python/src/PyDnnDistributed.cpp

Lines changed: 2 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1,4 +1,5 @@
1-
/* Copyright © 2017-2023 ABBYY
1+
/* Copyright © 2017-2024 ABBYY
2+
23
Licensed under the Apache License, Version 2.0 (the "License");
34
you may not use this file except in compliance with the License.
45
You may obtain a copy of the License at

NeoML/Python/src/PyDnnDistributed.h

Lines changed: 9 additions & 7 deletions
Original file line numberDiff line numberDiff line change
@@ -1,4 +1,5 @@
1-
/* Copyright © 2017-2023 ABBYY
1+
/* Copyright © 2017-2024 ABBYY
2+
23
Licensed under the Apache License, Version 2.0 (the "License");
34
you may not use this file except in compliance with the License.
45
You may obtain a copy of the License at
@@ -20,7 +21,7 @@ limitations under the License.
2021

2122
class CPyDistributedDataset : public IDistributedDataset {
2223
public:
23-
CPyDistributedDataset( const py::object& data ) : getData( data ) {};
24+
CPyDistributedDataset( const py::object& data ) : getData( data ) {}
2425
int SetInputBatch( CDnn& dnn, int thread ) override;
2526
private:
2627
py::object getData;
@@ -29,13 +30,14 @@ class CPyDistributedDataset : public IDistributedDataset {
2930
class CPyDistributedTraining : public CDistributedTraining {
3031
public:
3132
CPyDistributedTraining( CDnn& dnn, int count, TDistributedInitializer initializer, int seed )
32-
: CDistributedTraining( dnn, count, initializer, seed ) {};
33+
: CDistributedTraining( dnn, count, initializer, seed ) {}
3334
CPyDistributedTraining( CArchive& archive, int count, TDistributedInitializer initializer, int seed )
34-
: CDistributedTraining( archive, count, initializer, seed ) {};
35+
: CDistributedTraining( archive, count, initializer, seed ) {}
3536
CPyDistributedTraining( CDnn& dnn, const CArray<int>& cudaDevs, TDistributedInitializer initializer, int seed )
36-
: CDistributedTraining( dnn, cudaDevs, initializer, seed ) {};
37+
: CDistributedTraining( dnn, cudaDevs, initializer, seed ) {}
3738
CPyDistributedTraining( CArchive& archive, const CArray<int>& cudaDevs, TDistributedInitializer initializer, int seed )
38-
: CDistributedTraining( archive, cudaDevs, initializer, seed ) {};
39+
: CDistributedTraining( archive, cudaDevs, initializer, seed ) {}
40+
3941
void Run( const py::object& data );
4042
void RunAndBackward( const py::object& data );
4143
void Learn( const py::object& data );
@@ -46,4 +48,4 @@ class CPyDistributedTraining : public CDistributedTraining {
4648
void Save( const std::string& path );
4749
};
4850

49-
void InitializeDistributedTraining(py::module& m);
51+
void InitializeDistributedTraining( py::module& m );

NeoML/include/NeoML/Dnn/DnnDistributed.h

Lines changed: 14 additions & 8 deletions
Original file line numberDiff line numberDiff line change
@@ -1,4 +1,5 @@
1-
/* Copyright © 2017-2023 ABBYY
1+
/* Copyright © 2017-2024 ABBYY
2+
23
Licensed under the Apache License, Version 2.0 (the "License");
34
you may not use this file except in compliance with the License.
45
You may obtain a copy of the License at
@@ -26,6 +27,7 @@ class CLoraSerializer;
2627
// Interface for setting input to a neural network
2728
class IDistributedDataset {
2829
public:
30+
virtual ~IDistributedDataset() {}
2931
// This method must set batches for all of the source layers in CDnn
3032
// Returns the current batch size (or 0, if there is no data for this thread on this run)
3133
// This batch size affects weights balance between different threads
@@ -57,7 +59,7 @@ class NEOML_API CDistributedTraining {
5759
CDistributedTraining( CArchive& archive, const CArray<int>& cudaDevs,
5860
TDistributedInitializer initializer = TDistributedInitializer::Xavier, int seed = 42 );
5961

60-
~CDistributedTraining();
62+
virtual ~CDistributedTraining();
6163

6264
// Gets the number of models in distributed training
6365
int GetModelCount() const { return cnns.Size(); }
@@ -67,6 +69,7 @@ class NEOML_API CDistributedTraining {
6769
void SetLearningRate( float rate );
6870
// Returns the current learning rate
6971
float GetLearningRate() const;
72+
7073
// Runs the networks without backward and training
7174
void RunOnce( IDistributedDataset& data );
7275
// Runs the networks and performs a backward pass
@@ -75,28 +78,31 @@ class NEOML_API CDistributedTraining {
7578
void RunAndLearnOnce( IDistributedDataset& data );
7679
// Updates the trainable weights of all models (after RunAndBackwardOnce)
7780
void Train();
81+
7882
// Returns last loss of `layerName` for all models
7983
// `layerName` should correspond to CLossLayer, CCtcLossLayer or CCrfLossLayer
80-
void GetLastLoss( const CString& layerName, CArray<float>& losses );
84+
void GetLastLoss( const CString& layerName, CArray<float>& losses ) const;
8185
// Returns last blobs of `layerName` for all models
8286
// `layerName` should correspond to CSinkLayer
83-
void GetLastBlob( const CString& layerName, CObjectArray<CDnnBlob>& blobs );
87+
void GetLastBlob( const CString& layerName, CObjectArray<CDnnBlob>& blobs ) const;
88+
8489
// Save trained net
8590
void Serialize( CArchive& archive );
8691
// Save the trained net with the given `index` with its solver state (optional)
8792
// An archive with solver state can later be passed to CDnn::SerializeCheckpoint to resume training
8893
void StoreDnn( CArchive& archive, int index, bool storeSolver );
8994

9095
private:
91-
const bool isCpu;
92-
IThreadPool* threadPool;
96+
enum class TRunType { Invalid, RunOnce, RunBackwardOnce, Train };
97+
class CParams;
98+
CParams* const params = nullptr;
99+
IThreadPool* const threadPool = nullptr;
93100
CArray<IMathEngine*> mathEngines;
94101
CArray<CRandom*> rands;
95102
CArray<CDnn*> cnns;
96103
CArray<int> batchSize;
97-
bool isFirstRun = true;
98-
CString errorMessage;
99104

105+
void runOnce( IDistributedDataset*, TRunType );
100106
void initialize( CArchive& archive, int count, TDistributedInitializer initializer, int seed );
101107

102108
friend class CLoraSerializer;

0 commit comments

Comments
 (0)