########################################################################################################################
# Repo tool base settings
########################################################################################################################
[repo_docs]
enabled = true
name = "CUDA C++ Core Libraries"
project = "cccl"
logo = "img/logo.png"
repo_url = "https://github.com/NVIDIA/cccl"
social_media_set = ""
social_media = [
[ "github", "https://github.com/NVIDIA/cccl" ],
[ "discord", "https://discord.com/channels/1019361803752456192/1161051667945508884" ],
]
sphinx_version = "4.5.0.2-py3.10-${platform}"
enhanced_search_enabled = true
api_output_directory = "api"
use_fast_doxygen_conversion = true
sphinx_generate_doxygen_groups = true
sphinx_generate_doxygen_pages = true
sphinx_exclude_patterns = [
"tools",
"VERSION.md",
]
project_build_order = [ "libcudacxx", "cudax", "cub", "thrust", "cccl", "cuda_cooperative", "cuda_parallel" ]
# deps can be used to link to other projects' documentation
deps = [
[ "libcudacxx", "_build/docs/libcudacxx/latest" ],
[ "cudax", "_build/docs/cudax/latest" ],
[ "cub", "_build/docs/cub/latest" ],
[ "thrust", "_build/docs/thrust/latest" ],
[ "cuda_cooperative", "_build/docs/cuda_cooperative/latest" ],
[ "cuda_parallel", "_build/docs/cuda_parallel/latest" ],
]
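# Each `deps` entry pairs a project name with the path (relative to the repo root) of that
# project's built documentation; cross-project links are resolved against these paths.
# A hypothetical additional entry would follow the same shape:
# deps = [
#   [ "my_project", "_build/docs/my_project/latest" ],
# ]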
[repo_docs.projects.libcudacxx]
name = "libcu++"
docs_root = "libcudacxx"
logo = "../img/logo.png"
repo_url = "https://github.com/NVIDIA/cccl/libcudacxx"
social_media_set = ""
social_media = [
[ "github", "https://github.com/NVIDIA/cccl" ],
]
enhanced_search_enabled = true
api_output_directory = "api"
use_fast_doxygen_conversion = true
sphinx_generate_doxygen_groups = true
sphinx_generate_doxygen_pages = true
sphinx_exclude_patterns = []
[repo_docs.projects.cub]
name = "CUB"
docs_root = "cub"
logo = "../img/logo.png"
repo_url = "https://github.com/NVIDIA/cccl/cub"
social_media_set = ""
social_media = [
[ "github", "https://github.com/NVIDIA/cccl" ],
]
enhanced_search_enabled = true
api_output_directory = "api"
use_fast_doxygen_conversion = true
sphinx_generate_doxygen_groups = true
sphinx_generate_doxygen_pages = true
sphinx_exclude_patterns = []
# deps can be used to link to other projects' documentation
deps = [
[ "libcudacxx", "_build/docs/libcudacxx/latest" ],
]
# list of files from which to extract documentation. if a directory is specified,
# it will be recursively searched.
#
# paths are relative to ${docs_root}.
#
# defaults to an empty list.
doxygen_input = [
"../../cub/cub/*.cuh",
"../../cub/cub/thread/*.cuh",
"../../cub/cub/warp/*.cuh",
"../../cub/cub/block/*.cuh",
"../../cub/cub/device/*.cuh",
"../../cub/cub/grid/*.cuh",
"../../cub/cub/iterator/*.cuh"
]
# Wildcards, as used above, are supported in `doxygen_input`. Individual files can also be
# listed explicitly; a generic example (paths are illustrative only):
# doxygen_input = [
#   "include/carb/*.h",
#   "source/examples/example.doxygen/ExampleDoxygen.h",
# ]
# doxygen allows the creation of custom commands (aliases) to ease the documentation process.
# for example, an alias could add a @carb_framework_overview command that creates a link
# to a reStructuredText document. defaults to an empty list.
#
# more information on the format can be found at:
# https://www.doxygen.nl/manual/config.html#cfg_aliases
doxygen_aliases = [
"smemwarpreuse=A subsequent ``__syncwarp()`` warp-wide barrier should be invoked after calling this method if the collective's temporary storage (e.g., ``temp_storage``) is to be reused or repurposed.",
"smemreuse=A subsequent ``__syncthreads()`` threadblock barrier should be invoked after calling this method if the collective's temporary storage (e.g., ``temp_storage``) is to be reused or repurposed.",
"smemreuse{1}=After any operation, a subsequent ``__syncthreads()`` barrier is required if the collective's \\1 is to be reused or repurposed",
"smemstorage{1}=The operations exposed by \\1 require a temporary memory allocation of this nested type for thread communication. This opaque storage can be allocated directly using the ``__shared__`` keyword. Alternatively, it can be aliased to externally allocated memory (shared or global) or ``union``'d with other storage allocation types to facilitate memory reuse.",
"granularity=Efficiency is increased with increased granularity ``ITEMS_PER_THREAD``. Performance is also typically increased until the additional register pressure or shared memory allocation size causes SM occupancy to fall too low. Consider variants of ``cub::BlockLoad`` for efficiently gathering a :ref:`blocked arrangement <flexible-data-arrangement>` of elements across threads.",
"blocksize=The number of threads in the block is a multiple of the architecture's warp size",
"ptxversion=The PTX compute capability for which to to specialize this collective, formatted as per the ``__CUDA_ARCH__`` macro (e.g., 750 for sm_75). Useful for determining the collective's storage requirements for a given device from the host. (Default: the value of ``__CUDA_ARCH__`` during the current compiler pass)",
"blockcollective{1}=Every thread in the block uses the \\1 class by first specializing the \\1 type, then instantiating an instance with parameters for communication, and finally invoking one or more collective member functions.",
"warpcollective{1}=Every thread in the warp uses the \\1 class by first specializing the \\1 type, then instantiating an instance with parameters for communication, and finally invoking or more collective member functions.",
"devicestorage=When ``d_temp_storage`` is ``nullptr``, no work is done and the required allocation size is returned in ``temp_storage_bytes``.",
"devicestorageP=This operation requires a relatively small allocation of temporary device storage that is ``O(P)``, where ``P`` is the number of streaming multiprocessors on the device (and is typically a small constant relative to the input size ``N``).",
"devicestorageNP=This operation requires an allocation of temporary device storage that is ``O(N+P)``, where ``N`` is the length of the input and ``P`` is the number of streaming multiprocessors on the device.",
"devicestorageNCP=This operation requires a relatively small allocation of temporary device storage that is ``O(N/C + P)``, where ``N`` is the length of the input, ``C`` is the number of concurrent threads that can be actively scheduled on each streaming multiprocessor (typically several thousand), and ``P`` is the number of streaming multiprocessors on the device.",
"cdp_class{1}= - Dynamic parallelism. \\1 methods can be called within kernel code on devices in which CUDA dynamic parallelism is supported.",
"iterator=(may be a simple pointer type)",
"offset_size1=(Consider using 32-bit values as offsets/lengths/etc. For example, ``int`` will typically yield better performance than ``size_t`` in 64-bit memory mode.)",
"offset_size2=Careful consideration should be given to the size of integer types used for offsets and lengths. Many (if not most) scenarios will only require 32-bit offsets (e.g., ``int``). 64-bit offset types (e.g., ``size_t`` on 64-bit memory mode) can consume a significant amount of thread storage resources, adversely affecting processor occupancy and performance.",
"rowmajor=For multi-dimensional blocks, threads are linearly ranked in row-major order.",
"blocked=Assumes a :ref:`blocked arrangement <flexible-data-arrangement>` of (*block-threads* * *items-per-thread*) items across the thread block, where *thread*\\ :sub:`i` owns the *i*\\ :sup:`th` range of *items-per-thread* contiguous items. For multi-dimensional thread blocks, a row-major thread ordering is assumed.",
"striped=Assumes a :ref:`striped arrangement <flexible-data-arrangement>` of (*block-threads* * *items-per-thread*) items across the thread block, where *thread*\\ :sub:`i` owns items (*i*), (*i* + *block-threads*), ..., (*i* + (*block-threads* * (*items-per-thread* - 1))). For multi-dimensional thread blocks, a row-major thread ordering is assumed.",
"warpstriped=Assumes a *warp-striped arrangement* of elements across threads, where warp\\ :sub:`i` owns the *i*\\ :sup:`th` range of (*warp-threads* * *items-per-thread*) contiguous items, and each thread owns items (*i*), (*i* + *warp-threads*), ..., (*i* + (*warp-threads* * (*items-per-thread* - 1))).",
"linear_performance{1}=The work-complexity of \\1 as a function of input size is linear, resulting in performance throughput that plateaus with problem sizes large enough to saturate the GPU." ,
"plots_below=Performance plots for other scenarios can be found in the detailed method descriptions below.",
"identityzero=This operation assumes the value of obtained by the ``T``'s default constructor (or by zero-initialization if no user-defined default constructor exists) is suitable as the identity value \"zero\" for addition.",
"lookback=`decoupled look-back <https://research.nvidia.com/publication/single-pass-parallel-prefix-scan-decoupled-look-back>`_"
]
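# The aliases above are invoked from Doxygen comments in the headers listed in `doxygen_input`.
# A minimal sketch of how a plain alias and an alias taking an argument would appear in a
# header comment (illustrative only, not taken verbatim from the sources):
#
#   //! @devicestorage
#   //! @smemstorage{BlockScan}
#
# Doxygen replaces each command with the full sentence defined above before the API
# reference is generated.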
# doxygen sometimes gets confused by macros. the array below allows the user to
# tell doxygen how to expand a macro. defaults to an empty list.
#
# more information on the format can be found at:
# https://www.doxygen.nl/manual/config.html#cfg_predefined
doxygen_predefined = [
"_CCCL_AND=&&",
"_CCCL_ALIAS_ATTRIBUTE(x)=",
"_CCCL_HOST",
"_CCCL_DEVICE",
"_CCCL_EAT_REST(x)=",
"_CCCL_HOST_DEVICE",
"_CCCL_FORCEINLINE",
"_CCCL_STD_VER",
"_CCCL_NODISCARD",
"_CCCL_VISIBILITY_HIDDEN",
"_CCCL_SUPPRESS_DEPRECATED_PUSH",
"_CCCL_SUPPRESS_DEPRECATED_POP",
"_CCCL_DIAG_PUSH=",
"_CCCL_DIAG_POP=",
"_CCCL_DIAG_SUPPRESS_CLANG(x)=",
"_CCCL_DIAG_SUPPRESS_GCC(x)=",
"_CCCL_DIAG_SUPPRESS_MSVC(x)=",
"_CCCL_DIAG_SUPPRESS_NVHPC(x)=",
"_CCCL_DOXYGEN_INVOKED",
"_CCCL_REQUIRES(x)= ::cuda::std::enable_if_t<x, int> = 0>",
"_CCCL_TEMPLATE(x)=template<x, ",
"_CCCL_TRAILING_REQUIRES(x)=-> x _CCCL_EAT_REST",
"__device__",
"__host__",
"__forceinline__",
"__declspec(x)=",
"__align__(x)=",
"__cccl_lib_mdspan",
"CCCL_IGNORE_DEPRECATED_CPP_DIALECT",
"CUB_DISABLE_NAMESPACE_MAGIC",
"CUB_IGNORE_NAMESPACE_MAGIC_ERROR",
"CUB_NAMESPACE_BEGIN=namespace cub {",
"CUB_NAMESPACE_END=}",
"CCCL_DEPRECATED_BECAUSE(x)=",
"CCCL_DEPRECATED=",
"CUB_STATIC_ASSERT(cond,msg)=",
"CUB_RUNTIME_FUNCTION",
"CUDASTF_HOST",
"CUDASTF_DEVICE",
"CUDASTF_HOST_DEVICE"
]
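# The expansions above rewrite CCCL's helper macros into plain C++ so that Doxygen can parse
# the declarations. A rough sketch of the effect on a constrained function template
# (hypothetical declaration, for illustration only):
#
#   // as written in a header:
#   _CCCL_TEMPLATE(typename T) _CCCL_REQUIRES(IS_VALID(T))
#   void foo(T value);
#
#   // as seen by Doxygen after applying the predefined macros:
#   template<typename T, ::cuda::std::enable_if_t<IS_VALID(T), int> = 0>
#   void foo(T value);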
# make sure to use ./fetch_imgs.sh
doxygen_conf_extra = """
IMAGE_PATH = ${config_root}/img
DOXYFILE_ENCODING = UTF-8
INPUT_ENCODING = UTF-8
EXTENSION_MAPPING = cuh=c++ cu=c++
EXAMPLE_PATH = ../../cub/examples/device
EXAMPLE_RECURSIVE = NO
EXAMPLE_PATTERNS = *.cu
EXCLUDE_SYMBOLS = "*detail*" "CUB_DETAIL*"
AUTOLINK_SUPPORT = YES
FULL_PATH_NAMES = YES
STRIP_FROM_PATH = ../../cub
"""
[repo_docs.projects.thrust]
name = "Thrust: The C++ Parallel Algorithms Library"
docs_root = "thrust"
logo = "../img/logo.png"
repo_url = "https://github.com/NVIDIA/cccl/thrust"
social_media_set = ""
social_media = [
[ "github", "https://github.com/NVIDIA/cccl" ],
]
enhanced_search_enabled = true
api_output_directory = "api"
use_fast_doxygen_conversion = true
sphinx_generate_doxygen_groups = true
sphinx_generate_doxygen_pages = true
sphinx_exclude_patterns = []
# deps can be used to link to other projects' documentation
deps = [
[ "libcudacxx", "_build/docs/libcudacxx/latest" ],
[ "cub", "_build/docs/cub/latest" ],
]
# list of files from which to extract documentation. if a directory is specified,
# it will be recursively searched.
#
# paths are relative to ${docs_root}.
#
# defaults to an empty list.
doxygen_input = [
"../../thrust/thrust/*.h",
"../../thrust/thrust/iterator/*.h",
"../../thrust/thrust/mr/*.h",
"../../thrust/thrust/random/*.h",
"../../thrust/thrust/system/*.h",
"../../thrust/thrust/system/cpp/pointer.h",
"../../thrust/thrust/system/omp/pointer.h",
"../../thrust/thrust/system/tbb/pointer.h",
"../../thrust/thrust/type_traits/*.h",
]
# doxygen allows the creation of custom commands (aliases) to ease the documentation process.
# for example, an alias could add a @carb_framework_overview command that creates a link
# to a reStructuredText document. defaults to an empty list.
#
# more information on the format can be found at:
# https://www.doxygen.nl/manual/config.html#cfg_aliases
doxygen_aliases = []
# doxygen sometimes gets confused by macros. the array below allows the user to
# tell doxygen how to expand a macro. defaults to an empty list.
#
# more information on the format can be found at:
# https://www.doxygen.nl/manual/config.html#cfg_predefined
doxygen_predefined = [
"_CCCL_ALIAS_ATTRIBUTE(x)=",
"_CCCL_DEVICE=",
"_CCCL_EXEC_CHECK_DISABLE=",
"_CCCL_FORCEINLINE=",
"_CCCL_HOST=",
"_CCCL_HOST_DEVICE=",
"_CCCL_NODISCARD=[[nodiscard]]",
"_CCCL_STD_VER",
"_CCCL_SUPPRESS_DEPRECATED_PUSH",
"_CCCL_SUPPRESS_DEPRECATED_POP",
"_CCCL_DIAG_PUSH=",
"_CCCL_DIAG_POP=",
"_CCCL_DIAG_SUPPRESS_CLANG(x)=",
"_CCCL_DIAG_SUPPRESS_GCC(x)=",
"_CCCL_DIAG_SUPPRESS_MSVC(x)=",
"_CCCL_DIAG_SUPPRESS_NVHPC(x)=",
"_CCCL_GLOBAL_CONSTANT=constexpr",
"CUDASTF_HOST=",
"CUDASTF_DEVICE=",
"CUDASTF_HOST_DEVICE=",
"_CCCL_DOXYGEN_INVOKED",
"_LIBCUDACXX_DEPRECATED_IN_CXX11",
"THRUST_NAMESPACE_BEGIN=namespace thrust {",
"THRUST_NAMESPACE_END=}",
"THRUST_PREVENT_MACRO_SUBSTITUTION",
"THRUST_FWD(x)=x",
"CCCL_DEPRECATED_BECAUSE(x)=",
"CCCL_DEPRECATED="
]
# make sure to use ./fetch_imgs.sh
doxygen_conf_extra = """
IMAGE_PATH = ${config_root}/img
DISTRIBUTE_GROUP_DOC = Yes
DOXYFILE_ENCODING = UTF-8
INPUT_ENCODING = UTF-8
EXTENSION_MAPPING = cuh=c++ cu=c++
EXAMPLE_PATH = ../../thrust/examples
EXAMPLE_RECURSIVE = NO
EXAMPLE_PATTERNS = *.cu
EXCLUDE_SYMBOLS = "*detail*" "__*" "THRUST_NS_QUALIFIER" "optional"
AUTOLINK_SUPPORT = YES
FULL_PATH_NAMES = YES
STRIP_FROM_PATH = ../../thrust
"""
[repo_docs.projects.cuda_cooperative]
name = "cuda.cooperative"
docs_root = "cuda_cooperative"
logo = "../img/logo.png"
repo_url = "https://github.com/NVIDIA/cccl/python/cuda"
social_media_set = ""
social_media = [
[ "github", "https://github.com/NVIDIA/cccl" ],
]
autodoc.mock_imports = [
"numba",
"pynvjitlink",
"cuda.bindings",
"llvmlite",
"numpy",
"cupy",
]
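# The modules above are runtime dependencies of cuda.cooperative that are typically not
# installed in the documentation build environment; listing them here (presumably mapped to
# Sphinx autodoc's autodoc_mock_imports) lets autodoc import the package and extract
# docstrings without needing GPU-specific packages.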
enhanced_search_enabled = true
python_paths = [
"${root}/../python/cuda_cooperative"
]
[repo_docs.projects.cuda_parallel]
name = "cuda.parallel"
docs_root = "cuda_parallel"
logo = "../img/logo.png"
repo_url = "https://github.com/NVIDIA/cccl/python/cuda"
social_media_set = ""
social_media = [
[ "github", "https://github.com/NVIDIA/cccl" ],
]
autodoc.mock_imports = [
"numba",
"pynvjitlink",
"cuda.bindings",
"cuda.cccl",
"llvmlite",
"numpy",
"cupy",
]
enhanced_search_enabled = true
python_paths = [
"${root}/../python/cuda_parallel"
]
[repo_docs.projects.cudax]
name = "Cudax: Experimental new features"
docs_root = "cudax"
logo = "../img/logo.png"
repo_url = "https://github.com/NVIDIA/cccl/cudax"
social_media_set = ""
social_media = [
[ "github", "https://github.com/NVIDIA/cccl" ],
]
enhanced_search_enabled = true
api_output_directory = "api"
use_fast_doxygen_conversion = true
sphinx_generate_doxygen_groups = true
sphinx_generate_doxygen_pages = true
sphinx_exclude_patterns = []
# deps can be used to link to other projects' documentation
deps = [
[ "libcudacxx", "_build/docs/libcudacxx/latest" ],
]
doxygen_input = [
"../../cudax/include/cuda/experimental/__container/*.cuh",
"../../cudax/include/cuda/experimental/__device/*.cuh",
"../../cudax/include/cuda/experimental/__event/*.cuh",
"../../cudax/include/cuda/experimental/__hierarchy/*.cuh",
"../../cudax/include/cuda/experimental/__launch/*.cuh",
"../../cudax/include/cuda/experimental/__memory_resource/*.cuh",
"../../cudax/include/cuda/experimental/__stream/*.cuh",
"../../cudax/include/cuda/experimental/stf.cuh",
"../../cudax/include/cuda/experimental/__stf/*.cuh",
"../../cudax/include/cuda/experimental/__stf/internal/*.cuh",
"../../cudax/include/cuda/experimental/__stf/utility/*.cuh",
"../../cudax/include/cuda/experimental/__stf/localization/*.cuh",
"../../cudax/include/cuda/experimental/__stf/allocators/*.cuh",
"../../cudax/include/cuda/experimental/__stf/graph/*.cuh",
"../../cudax/include/cuda/experimental/__stf/graph/internal/*.cuh",
"../../cudax/include/cuda/experimental/__stf/graph/interfaces/*.cuh",
"../../cudax/include/cuda/experimental/__stf/places/*.cuh",
"../../cudax/include/cuda/experimental/__stf/places/exec/*.cuh",
"../../cudax/include/cuda/experimental/__stf/places/exec/host/*.cuh",
"../../cudax/include/cuda/experimental/__stf/stream/*.cuh",
"../../cudax/include/cuda/experimental/__stf/stream/internal/*.cuh",
"../../cudax/include/cuda/experimental/__stf/stream/interfaces/*.cuh",
]
doxygen_predefined = [
"__device__",
"__host__",
"__global__",
"_CCCL_AND=&&",
"_CCCL_CUDACC_AT_LEAST(x, y)=1",
"_CCCL_CUDACC_BELOW(x, y)=0",
"_CCCL_DEVICE=",
"_CCCL_DOXYGEN_INVOKED",
"_CCCL_EAT_REST(x)=",
"_CCCL_EXEC_CHECK_DISABLE=",
"_CCCL_FORCEINLINE=",
"_CCCL_GLOBAL_CONSTANT=constexpr",
"_CCCL_HIDE_FROM_ABI=",
"_CCCL_HOST=",
"_CCCL_HOST_DEVICE=",
"_CCCL_NODISCARD=[[nodiscard]]",
"_CCCL_NODISCARD_FRIEND=",
"_CCCL_STD_VER=2020",
"_CCCL_TRAIT(x, y)=x<y>::value",
"_CUDA_VMR=cuda::mr",
"_CUDA_VRANGES=cuda::std::ranges",
"_CUDA_VSTD=cuda::std",
"_CCCL_SUPPRESS_DEPRECATED_PUSH",
"_CCCL_SUPPRESS_DEPRECATED_POP",
"_CCCL_DIAG_PUSH=",
"_CCCL_DIAG_POP=",
"_CCCL_DIAG_SUPPRESS_CLANG(x)=",
"_CCCL_DIAG_SUPPRESS_GCC(x)=",
"_CCCL_DIAG_SUPPRESS_MSVC(x)=",
"_CCCL_DIAG_SUPPRESS_NVHPC(x)=",
"_CCCL_REQUIRES(x)= ::cuda::std::enable_if_t<x, int> = 0>",
"_CCCL_TEMPLATE(x)=template<x, ",
"_CCCL_TRAILING_REQUIRES(x)=-> x _CCCL_EAT_REST",
"_CUDAX_API",
"_CUDAX_HOST_API",
"_CUDAX_DEVICE_API",
"_CUDAX_TRIVIAL_API",
"_CUDAX_TRIVIAL_HOST_API",
"_CUDAX_TRIVIAL_DEVICE_API",
"_CUDAX_PUBLIC_API",
"_LIBCUDACXX_HAS_SPACESHIP_OPERATOR()",
"_LIBCUDACXX_HIDE_FROM_ABI=inline",
"LIBCUDACXX_ENABLE_EXPERIMENTAL_MEMORY_RESOURCE=",
]
# make sure to use ./fetch_imgs.sh
doxygen_conf_extra = """
IMAGE_PATH = ${config_root}/img
DISTRIBUTE_GROUP_DOC = Yes
DOXYFILE_ENCODING = UTF-8
INPUT_ENCODING = UTF-8
EXTENSION_MAPPING = cuh=c++ cu=c++
EXAMPLE_RECURSIVE = NO
EXAMPLE_PATTERNS = *.cu
EXCLUDE_SYMBOLS = "*detail*" "*RESERVED*" "*reserved*" "*__*" "_A*" "_B*" "_C*" "_D*" "_E*" "_F*" "_G*" "_H*" "_I*" "_J*" "_K*" "_L*" "_M*" "_N*" "_O*" "_P*" "_Q*" "_R*" "_S*" "_T*" "_U*" "_V*" "_W*" "_X*" "_Y*" "_Z*" "UNITTEST"
AUTOLINK_SUPPORT = YES
FULL_PATH_NAMES = YES
STRIP_FROM_PATH = ../../cudax
"""