
Commit 88af214

janeyx99 authored and facebook-github-bot committed
Add build option to split torch_cuda library into torch_cuda_cu and torch_cuda_cpp (pytorch#49050)
Summary: Because of the size of our `libtorch_cuda.so`, linking it into other hefty binaries presents a problem where 32-bit relocation markers are too small and end up overflowing. This PR attempts to break up `torch_cuda` into `torch_cuda_cu` and `torch_cuda_cpp`.

`torch_cuda_cu`: all the files previously in `Caffe2_GPU_SRCS` that are
* pure `.cu` files in `aten`
* all the BLAS files
* all the THC files, except for THCAllocator.cpp, THCCachingHostAllocator.cpp and THCGeneral.cpp
* all files in `detail`
* LegacyDefinitions.cpp and LegacyTHFunctionsCUDA.cpp
* Register*CUDA.cpp
* CUDAHooks.cpp
* CUDASolver.cpp
* TensorShapeCUDA.cpp

`torch_cuda_cpp`: all other files in `Caffe2_GPU_SRCS`

Accordingly, TORCH_CUDA_API and TORCH_CUDA_BUILD_MAIN_LIB usages are split as well, into TORCH_CUDA_CU_API and TORCH_CUDA_CPP_API (and the corresponding `*_BUILD_MAIN_LIB` defines).

To test this locally, run `export BUILD_SPLIT_CUDA=ON && python setup.py develop`. In your `build/lib` folder, you should find binaries for both `torch_cuda_cpp` and `torch_cuda_cu`. To confirm that the split option was toggled, grep the summary output of running cmake and make sure `Split CUDA` is ON.

This build option is tested on CI for CUDA 11.1 builds (Linux for now, Windows soon).

Pull Request resolved: pytorch#49050
Reviewed By: walterddr
Differential Revision: D26114310
Pulled By: janeyx99
fbshipit-source-id: 0180f2519abb5a9cdde16a6fb7dd3171cff687a6
1 parent 87ad77e commit 88af214

19 files changed: +239, -74 lines
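As the summary above notes, TORCH_CUDA_API usages are split into TORCH_CUDA_CU_API and TORCH_CUDA_CPP_API. As a minimal sketch of what that means for a header author (the function names below are hypothetical and not part of this commit), a declaration picks its macro based on which of the two libraries its definition is compiled into:

```cpp
#include <c10/macros/Export.h>

// Hypothetical declarations, for illustration only.

// Defined in a translation unit that lands in libtorch_cuda_cu
// (e.g. a pure .cu file or a Register*CUDA.cpp file):
TORCH_CUDA_CU_API void example_cu_side_function();

// Defined in a translation unit that lands in libtorch_cuda_cpp
// (e.g. THC or cuDNN C++ code):
TORCH_CUDA_CPP_API void example_cpp_side_function();
```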

.jenkins/pytorch/build.sh (+5)

@@ -53,6 +53,11 @@ if [[ "$BUILD_ENVIRONMENT" == *coverage* ]]; then
 export USE_CPP_CODE_COVERAGE=ON
 fi

+if [[ "$BUILD_ENVIRONMENT" == *cuda11* ]]; then
+# enable split torch_cuda build option in CMake
+export BUILD_SPLIT_CUDA=ON
+fi
+
 # TODO: Don't run this...
 pip_install -r requirements.txt || true

.jenkins/pytorch/win-test-helpers/build_pytorch.bat (+4, -1)

@@ -110,6 +110,10 @@ if "%REBUILD%" == "" (
 aws s3 cp "s3://ossci-windows/Restore PyTorch Environment.lnk" "C:\Users\circleci\Desktop\Restore PyTorch Environment.lnk"
 )
 )
+:: tests if BUILD_ENVIRONMENT contains cuda11 as a substring
+if not x%BUILD_ENVIRONMENT:cuda11=%==x%BUILD_ENVIRONMENT% (
+set BUILD_SPLIT_CUDA=ON
+)

 python setup.py install --cmake && sccache --show-stats && (
 if "%BUILD_ENVIRONMENT%"=="" (
@@ -118,4 +122,3 @@ python setup.py install --cmake && sccache --show-stats && (
 7z a %TMP_DIR_WIN%\%IMAGE_COMMIT_TAG%.7z %CONDA_PARENT_DIR%\Miniconda3\Lib\site-packages\torch %CONDA_PARENT_DIR%\Miniconda3\Lib\site-packages\caffe2 && copy /Y "%TMP_DIR_WIN%\%IMAGE_COMMIT_TAG%.7z" "%PYTORCH_FINAL_PACKAGE_DIR%\"
 )
 )
-

CMakeLists.txt (+7)

@@ -161,6 +161,13 @@ option(COLORIZE_OUTPUT "Colorize output during compilation" ON)
 option(USE_ASAN "Use Address Sanitizer" OFF)
 option(USE_TSAN "Use Thread Sanitizer" OFF)
 option(USE_CUDA "Use CUDA" ON)
+# BUILD_SPLIT_CUDA must also be exported as an environment variable before building, with
+# `export BUILD_SPLIT_CUDA=1` because cpp_extension.py can only work properly if this variable
+# also exists in the environment.
+# This option is incompatible with CUDA_SEPARABLE_COMPILATION.
+cmake_dependent_option(
+  BUILD_SPLIT_CUDA "Split torch_cuda library into torch_cuda_cu and torch_cuda_cpp" OFF
+  "USE_CUDA AND NOT CUDA_SEPARABLE_COMPILATION" OFF)
 option(USE_FAST_NVCC "Use parallel NVCC build" OFF)
 option(USE_ROCM "Use ROCm" ON)
 option(CAFFE2_STATIC_LINK_CUDA "Statically link CUDA libraries" OFF)

aten/src/ATen/cuda/CUDAContext.cpp (+3, -3)

@@ -29,9 +29,9 @@ void initDeviceProperty(DeviceIndex device_index) {

 } // anonymous namespace

-// We need this function to force the linking against torch_cuda on Windows.
-// If you need to modify this function, please specify a new function and apply the changes
-// according to https://github.com/pytorch/pytorch/pull/34288.
+// We need this function to force the linking against torch_cuda(_cpp) on Windows.
+// If you need to modify this function, please specify a new function and apply
+// the changes according to https://github.com/pytorch/pytorch/pull/34288.
 // Related issue: https://github.com/pytorch/pytorch/issues/31611.
 /* Device info */
 int warp_size() {
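The comment in this hunk refers to the pattern of keeping an exported symbol that downstream code can call so the Windows linker does not drop the dependency on torch_cuda(_cpp). A minimal consumer-side sketch (illustrative only; the wrapper function name is hypothetical, while at::cuda::warp_size() is the exported function the comment is attached to):

```cpp
#include <ATen/cuda/CUDAContext.h>

// Hypothetical consumer code: calling an exported symbol such as
// at::cuda::warp_size() creates a link-time dependency on the library
// that defines it, so the linker cannot discard that import.
int force_torch_cuda_cpp_link() {
  return at::cuda::warp_size();
}
```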

aten/src/ATen/cudnn/Descriptors.h (+8, -8)

@@ -80,7 +80,7 @@ struct DescriptorDeleter {
 // initialized the first time you call set() or any other initializing
 // function.
 template <typename T, cudnnStatus_t (*ctor)(T**), cudnnStatus_t (*dtor)(T*)>
-class TORCH_CUDA_CU_API Descriptor {
+class TORCH_CUDA_CPP_API Descriptor {
 public:
 // TODO: Figure out why const-correctness doesn't work here

@@ -108,7 +108,7 @@ class TORCH_CUDA_CU_API Descriptor {
 std::unique_ptr<T, DescriptorDeleter<T, dtor>> desc_;
 };

-class TORCH_CUDA_CU_API TensorDescriptor : public Descriptor<
+class TORCH_CUDA_CPP_API TensorDescriptor : public Descriptor<
 cudnnTensorStruct,
 &cudnnCreateTensorDescriptor,
 &cudnnDestroyTensorDescriptor> {

@@ -147,7 +147,7 @@ class TORCH_CUDA_CU_API TensorDescriptor : public Descriptor<

 std::ostream& operator<<(std::ostream & out, const TensorDescriptor& d);

-class TORCH_CUDA_CU_API FilterDescriptor : public Descriptor<
+class TORCH_CUDA_CPP_API FilterDescriptor : public Descriptor<
 cudnnFilterStruct,
 &cudnnCreateFilterDescriptor,
 &cudnnDestroyFilterDescriptor> {

@@ -163,7 +163,7 @@ class TORCH_CUDA_CU_API FilterDescriptor : public Descriptor<

 std::ostream& operator<<(std::ostream & out, const FilterDescriptor& d);

-struct TORCH_CUDA_CU_API ConvolutionDescriptor
+struct TORCH_CUDA_CPP_API ConvolutionDescriptor
 : public Descriptor<
 cudnnConvolutionStruct,
 &cudnnCreateConvolutionDescriptor,

@@ -186,7 +186,7 @@ struct TORCH_CUDA_CU_API ConvolutionDescriptor
 }
 };

-struct TORCH_CUDA_CU_API SpatialTransformerDescriptor
+struct TORCH_CUDA_CPP_API SpatialTransformerDescriptor
 : public Descriptor<
 cudnnSpatialTransformerStruct,
 &cudnnCreateSpatialTransformerDescriptor,

@@ -196,7 +196,7 @@ struct TORCH_CUDA_CU_API SpatialTransformerDescriptor
 }
 };

-struct TORCH_CUDA_CU_API DropoutDescriptor
+struct TORCH_CUDA_CPP_API DropoutDescriptor
 : public Descriptor<
 cudnnDropoutStruct,
 &cudnnCreateDropoutDescriptor,

@@ -235,7 +235,7 @@ struct TORCH_CUDA_CU_API DropoutDescriptor
 }
 };

-struct TORCH_CUDA_CU_API RNNDescriptor : public Descriptor<
+struct TORCH_CUDA_CPP_API RNNDescriptor : public Descriptor<
 cudnnRNNStruct,
 &cudnnCreateRNNDescriptor,
 &cudnnDestroyRNNDescriptor> {

@@ -282,7 +282,7 @@ struct TORCH_CUDA_CU_API RNNDescriptor : public Descriptor<
 }
 };

-struct TORCH_CUDA_CU_API CTCLossDescriptor
+struct TORCH_CUDA_CPP_API CTCLossDescriptor
 : public Descriptor<
 cudnnCTCLossStruct,
 &cudnnCreateCTCLossDescriptor,

aten/src/ATen/cudnn/Handle.h (+1, -1)

@@ -5,5 +5,5 @@

 namespace at { namespace native {

-TORCH_CUDA_CU_API cudnnHandle_t getCudnnHandle();
+TORCH_CUDA_CPP_API cudnnHandle_t getCudnnHandle();
 }} // namespace at::native

aten/src/ATen/cudnn/Types.h (+1, -1)

@@ -5,7 +5,7 @@

 namespace at { namespace native {

-TORCH_CUDA_CU_API cudnnDataType_t
+TORCH_CUDA_CPP_API cudnnDataType_t
 getCudnnDataTypeFromScalarType(const at::ScalarType dtype);
 cudnnDataType_t getCudnnDataType(const at::Tensor& tensor);

aten/src/ATen/native/cuda/Bucketization.cu (+1)

@@ -126,6 +126,7 @@ Tensor& searchsorted_out_cuda(Tensor& result, const Tensor& sorted_sequence, con
 return result;
 }

+// We need this function to force the linking against torch_cuda_cu on Windows.
 Tensor searchsorted_cuda(const Tensor& sorted_sequence, const Tensor& self, bool out_int32, bool right) {
 ScalarType scalar_type = out_int32 ? ScalarType::Int : ScalarType::Long;
 c10::TensorOptions options = TensorOptions().device(self.options().device()).dtype(scalar_type);

aten/src/ATen/native/cudnn/RNN.cpp (+1, -1)

@@ -774,7 +774,7 @@ namespace {
 // Utilities exposed in RNNUtils.h
 namespace cudnn_rnn {

-TORCH_CUDA_CU_API std::tuple<Tensor, std::vector<Tensor>>
+TORCH_CUDA_CPP_API std::tuple<Tensor, std::vector<Tensor>>
 copy_weights_to_flat_buf_views(
 TensorList weight_arr,
 int64_t weight_stride0,

aten/src/ATen/native/cudnn/RNNUtils.h (+1, -1)

@@ -8,7 +8,7 @@ namespace at {
 namespace native {
 namespace cudnn_rnn {

-TORCH_CUDA_CU_API std::tuple<Tensor, std::vector<Tensor>>
+TORCH_CUDA_CPP_API std::tuple<Tensor, std::vector<Tensor>>
 copy_weights_to_flat_buf_views(
 TensorList weight_arr,
 int64_t weight_stride0,

aten/src/ATen/test/cuda_distributions_test.cu (+9, -8)

@@ -79,7 +79,7 @@ TEST(DistributionsTest, TestPhiloxIncrementSmallUniformTensor) {

 // get 4 randoms from uniform_(), philox offset is now incremented to 4 by this call
 at::empty({4}, at::TensorOptions(at::kCUDA)).uniform_();
-
+
 // expected uniforms will start from counter offset of 4
 assert_with_expected_uniforms(4);
 }

@@ -97,12 +97,13 @@ TEST(DistributionsTest, TestPhiloxIncrementBigUniformTensor) {
 // greater the number of threads launched), it hits the unroll loop in
 // the uniform_ kernel.
 // - Hence, we set the size of the tensor in this test to be 8 times the
-// maximum number of threads we can launch. This means that, each thread will
-// be yielding 8 elements, and as a result, curand_uniform4 will be called twice
-// and all the 8 elements in a thread will consume all the float4 from the
-// two calls of curand_unfiorm4 as a result of the unroll loop. Therefore,
-// after this call to the unform_, counter_offset for the next call to uniform_
-// will start from 8. This is what we test next.
+// maximum number of threads we can launch. This means that, each thread
+// will be yielding 8 elements, and as a result, curand_uniform4 will be
+// called twice and all the 8 elements in a thread will consume all the
+// float4 from the two calls of curand_uniform4 as a result of the unroll
+// loop. Therefore, after this call to the uniform_, counter_offset for
+// the next call to uniform_ will start from 8. This is what we test
+// next.
 // - assert that call to uniform_ will start from counter_offset of 8

 // if cuda not available, return

@@ -121,7 +122,7 @@ TEST(DistributionsTest, TestPhiloxIncrementBigUniformTensor) {

 // get numel randoms from uniform_(), philox offset is now incremented to 8 by this call
 at::empty({numel}, at::TensorOptions(at::kCUDA)).uniform_();
-
+
 // expected uniforms will start from counter offset of 8
 assert_with_expected_uniforms(8);
 }

aten/src/THC/THCAllocator.h (+1, -1)

@@ -5,7 +5,7 @@

 // IPC doesn't support (re)allocation

-class TORCH_CUDA_CU_API THCIpcDeleter {
+class TORCH_CUDA_CPP_API THCIpcDeleter {
 public:
 THCIpcDeleter(std::shared_ptr<void> basePtr);
 ~THCIpcDeleter();

aten/src/THC/THCCachingHostAllocator.h (+3, -3)

@@ -21,14 +21,14 @@
 // Note that this allocator does not split larger allocations into smaller
 // blocks, unlike the caching device allocator.
 //
-TORCH_CUDA_CU_API c10::Allocator* getTHCCachingHostAllocator(void);
+TORCH_CUDA_CPP_API c10::Allocator* getTHCCachingHostAllocator(void);

 // Records an event in the specified stream. The allocation 'ptr' will not be
 // re-used until the event has occurred.
-TORCH_CUDA_CU_API cudaError_t
+TORCH_CUDA_CPP_API cudaError_t
 THCCachingHostAllocator_recordEvent(void* ptr, at::cuda::CUDAStream stream);

 // Releases cached pinned memory allocations via cudaHostFree
-TORCH_CUDA_CU_API void THCCachingHostAllocator_emptyCache(void);
+TORCH_CUDA_CPP_API void THCCachingHostAllocator_emptyCache(void);

 #endif

aten/src/THC/THCGeneral.h.in (+15, -15)

@@ -31,39 +31,39 @@ typedef struct _THCCudaResourcesPerDevice {
 size_t scratchSpacePerStream;
 } THCCudaResourcesPerDevice;

-TORCH_CUDA_CU_API THCState* THCState_alloc(void);
-TORCH_CUDA_CU_API void THCState_free(THCState* state);
+TORCH_CUDA_CPP_API THCState* THCState_alloc(void);
+TORCH_CUDA_CPP_API void THCState_free(THCState* state);

-TORCH_CUDA_CU_API void THCudaInit(THCState* state);
-TORCH_CUDA_CU_API void THCudaShutdown(THCState* state);
+TORCH_CUDA_CPP_API void THCudaInit(THCState* state);
+TORCH_CUDA_CPP_API void THCudaShutdown(THCState* state);

 /* If device `dev` can access allocations on device `devToAccess`, this will return */
 /* 1; otherwise, 0. */
-TORCH_CUDA_CU_API int THCState_getPeerToPeerAccess(THCState* state, int dev, int devToAccess);
+TORCH_CUDA_CPP_API int THCState_getPeerToPeerAccess(THCState* state, int dev, int devToAccess);

-TORCH_CUDA_CU_API c10::Allocator* THCState_getCudaHostAllocator(THCState* state);
+TORCH_CUDA_CPP_API c10::Allocator* THCState_getCudaHostAllocator(THCState* state);

-TORCH_CUDA_CU_API void THCMagma_init(THCState *state);
+TORCH_CUDA_CPP_API void THCMagma_init(THCState *state);

 /* For the current device and stream, returns the allocated scratch space */
-TORCH_CUDA_CU_API size_t THCState_getCurrentDeviceScratchSpaceSize(THCState* state);
+TORCH_CUDA_CPP_API size_t THCState_getCurrentDeviceScratchSpaceSize(THCState* state);

 #define THCAssertSameGPU(expr) if (!expr) THError("arguments are located on different GPUs")
 #define THCudaCheck(err) __THCudaCheck(err, __FILE__, __LINE__)
 #define THCudaCheckWarn(err) __THCudaCheckWarn(err, __FILE__, __LINE__)
 #define THCublasCheck(err) __THCublasCheck(err, __FILE__, __LINE__)
 #define THCusparseCheck(err) __THCusparseCheck(err, __FILE__, __LINE__)

-TORCH_CUDA_CU_API void __THCudaCheck(cudaError_t err, const char *file, const int line);
-TORCH_CUDA_CU_API void __THCudaCheckWarn(cudaError_t err, const char *file, const int line);
-TORCH_CUDA_CU_API void __THCublasCheck(cublasStatus_t status, const char *file, const int line);
-TORCH_CUDA_CU_API void __THCusparseCheck(cusparseStatus_t status, const char *file, const int line);
+TORCH_CUDA_CPP_API void __THCudaCheck(cudaError_t err, const char *file, const int line);
+TORCH_CUDA_CPP_API void __THCudaCheckWarn(cudaError_t err, const char *file, const int line);
+TORCH_CUDA_CPP_API void __THCublasCheck(cublasStatus_t status, const char *file, const int line);
+TORCH_CUDA_CPP_API void __THCusparseCheck(cusparseStatus_t status, const char *file, const int line);

-TORCH_CUDA_CU_API void* THCudaMalloc(THCState *state, size_t size);
-TORCH_CUDA_CU_API void THCudaFree(THCState *state, void* ptr);
+TORCH_CUDA_CPP_API void* THCudaMalloc(THCState *state, size_t size);
+TORCH_CUDA_CPP_API void THCudaFree(THCState *state, void* ptr);

 at::DataPtr THCudaHostAlloc(THCState *state, size_t size);

-TORCH_CUDA_CU_API void THCudaHostRecord(THCState *state, void *ptr);
+TORCH_CUDA_CPP_API void THCudaHostRecord(THCState *state, void *ptr);

 #endif

c10/macros/Export.h (+22, -7)

@@ -100,15 +100,30 @@

 // NB: For now, HIP is overloaded to use the same macro, but ideally
 // HIPify should translate TORCH_CUDA_API to TORCH_HIP_API
-#if defined(TORCH_CUDA_BUILD_MAIN_LIB) || defined(TORCH_HIP_BUILD_MAIN_LIB)
-#define TORCH_CUDA_API C10_EXPORT
-#else
-#define TORCH_CUDA_API C10_IMPORT
+// JX: I removed the || defined(TORCH_HIP_BUILD_MAIN_LIB) check for TORCH_CUDA_*_API
+// since TORCH_HIP_API seems properly initialized below
+// libtorch_cuda_cu.so
+#ifdef TORCH_CUDA_CU_BUILD_MAIN_LIB
+#define TORCH_CUDA_CU_API C10_EXPORT
+#elif defined(BUILD_SPLIT_CUDA)
+#define TORCH_CUDA_CU_API C10_IMPORT
+#endif
+
+// libtorch_cuda_cpp.so
+#ifdef TORCH_CUDA_CPP_BUILD_MAIN_LIB
+#define TORCH_CUDA_CPP_API C10_EXPORT
+#elif defined(BUILD_SPLIT_CUDA)
+#define TORCH_CUDA_CPP_API C10_IMPORT
 #endif

-// This is in preparation for the imminent torch_cuda split
-#define TORCH_CUDA_CU_API TORCH_CUDA_API
-#define TORCH_CUDA_CPP_API TORCH_CUDA_API
+// libtorch_cuda.so (where torch_cuda_cu and torch_cuda_cpp are a part of the same api)
+#ifdef TORCH_CUDA_BUILD_MAIN_LIB
+#define TORCH_CUDA_CPP_API C10_EXPORT
+#define TORCH_CUDA_CU_API C10_EXPORT
+#elif !defined(BUILD_SPLIT_CUDA)
+#define TORCH_CUDA_CPP_API C10_IMPORT
+#define TORCH_CUDA_CU_API C10_IMPORT
+#endif

 #if defined(TORCH_HIP_BUILD_MAIN_LIB)
 #define TORCH_HIP_API C10_EXPORT
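To make the three configurations above easier to follow, here is a hedged sketch of how one of the new macros resolves in each build mode (the C10_EXPORT/C10_IMPORT definitions below are simplified stand-ins for the real ones earlier in this header, and the declared function is hypothetical):

```cpp
// Simplified stand-ins for the real C10 visibility macros.
#if defined(_WIN32)
#define C10_EXPORT __declspec(dllexport)
#define C10_IMPORT __declspec(dllimport)
#else
#define C10_EXPORT __attribute__((__visibility__("default")))
#define C10_IMPORT
#endif

// Split build, compiling libtorch_cuda_cpp itself:
//   TORCH_CUDA_CPP_BUILD_MAIN_LIB is defined  -> TORCH_CUDA_CPP_API expands to C10_EXPORT.
// Split build, compiling a consumer:
//   only BUILD_SPLIT_CUDA is defined          -> TORCH_CUDA_CPP_API expands to C10_IMPORT.
// Unsplit build:
//   TORCH_CUDA_BUILD_MAIN_LIB (or neither define) drives both macros the same way.
#if defined(TORCH_CUDA_CPP_BUILD_MAIN_LIB)
#define TORCH_CUDA_CPP_API C10_EXPORT
#else
#define TORCH_CUDA_CPP_API C10_IMPORT
#endif

// Hypothetical declaration that would live in the torch_cuda_cpp half of the split:
TORCH_CUDA_CPP_API int example_cpp_symbol();
```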
