
Commit dc14f98

rjzamora and trivialfis authored
Avoid default tokenization in Dask (dmlc#10398)
Co-authored-by: Jiaming Yuan <[email protected]>
1 parent 01ff2b2 commit dc14f98

File tree: 5 files changed (+16 −9 lines)


python-package/xgboost/core.py

Lines changed: 7 additions & 0 deletions
@@ -7,6 +7,7 @@
 import os
 import re
 import sys
+import uuid
 import warnings
 import weakref
 from abc import ABC, abstractmethod
@@ -3143,3 +3144,9 @@ def get_split_value_histogram(
                UserWarning,
            )
        return nph_stacked
+
+    def __dask_tokenize__(self) -> uuid.UUID:
+        # TODO: Implement proper tokenization to avoid unnecessary re-computation in
+        # Dask. However, default tokenization causes problems after
+        # https://github.com/dask/dask/pull/10883
+        return uuid.uuid4()
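
The new method plugs into Dask's tokenization protocol: when an object defines __dask_tokenize__, dask.base.tokenize() uses its return value instead of the default normalization. Returning a fresh uuid.uuid4() makes every token unique, which sidesteps the mis-identification problems introduced after dask/dask#10883 at the cost of task de-duplication and caching. Below is a minimal sketch of that protocol, not part of this commit; the OpaqueHandle class is a hypothetical stand-in for an object wrapping native memory.

    # Minimal sketch (not xgboost code) of Dask's __dask_tokenize__ hook.
    import uuid

    from dask.base import tokenize


    class OpaqueHandle:
        """Hypothetical wrapper around native memory, in the spirit of the patched class."""

        def __dask_tokenize__(self) -> uuid.UUID:
            # A fresh UUID per call means every tokenization is unique,
            # so Dask never reuses a cached result keyed on this object.
            return uuid.uuid4()


    handle = OpaqueHandle()
    print(tokenize(handle))  # some hash string
    print(tokenize(handle))  # a different hash on every call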

tests/ci_build/Dockerfile.gpu

Lines changed: 1 addition & 1 deletion
@@ -25,7 +25,7 @@ RUN \
     mamba create -y -n gpu_test -c rapidsai -c conda-forge -c nvidia \
         python=3.10 cudf=$RAPIDS_VERSION_ARG* rmm=$RAPIDS_VERSION_ARG* cudatoolkit=$CUDA_VERSION_ARG \
         "nccl>=${NCCL_SHORT_VER}" \
-        dask=2024.1.1 \
+        dask \
         dask-cuda=$RAPIDS_VERSION_ARG* dask-cudf=$RAPIDS_VERSION_ARG* cupy \
         numpy pytest pytest-timeout scipy scikit-learn pandas matplotlib wheel python-kubernetes urllib3 graphviz hypothesis \
         "pyspark>=3.4.0" cloudpickle cuda-python && \

tests/ci_build/Dockerfile.gpu_dev_ver

Lines changed: 1 addition & 1 deletion
@@ -28,7 +28,7 @@ RUN \
     mamba create -y -n gpu_test -c rapidsai-nightly -c conda-forge -c nvidia \
         python=3.10 "cudf=$RAPIDS_VERSION_ARG.*" "rmm=$RAPIDS_VERSION_ARG.*" cudatoolkit=$CUDA_VERSION_ARG \
         "nccl>=${NCCL_SHORT_VER}" \
-        dask=2024.1.1 \
+        dask \
         "dask-cuda=$RAPIDS_VERSION_ARG.*" "dask-cudf=$RAPIDS_VERSION_ARG.*" cupy \
         numpy pytest pytest-timeout scipy scikit-learn pandas matplotlib wheel python-kubernetes urllib3 graphviz hypothesis \
         "pyspark>=3.4.0" cloudpickle cuda-python && \

tests/ci_build/conda_env/linux_cpu_test.yml

Lines changed: 2 additions & 2 deletions
@@ -17,8 +17,8 @@ dependencies:
 - scikit-learn
 - pandas
 - matplotlib
-- dask>=2022.6
-- distributed>=2022.6
+- dask
+- distributed
 - python-graphviz
 - hypothesis>=6.46
 - astroid

tests/test_distributed/test_gpu_with_dask/test_gpu_with_dask.py

Lines changed: 5 additions & 5 deletions
@@ -248,10 +248,10 @@ def test_categorical(self, local_cuda_client: Client) -> None:
         import dask_cudf
 
         X, y = make_categorical(local_cuda_client, 10000, 30, 13)
-        X = dask_cudf.from_dask_dataframe(X)
+        X = X.to_backend("cudf")
 
         X_onehot, _ = make_categorical(local_cuda_client, 10000, 30, 13, True)
-        X_onehot = dask_cudf.from_dask_dataframe(X_onehot)
+        X_onehot = X_onehot.to_backend("cudf")
         run_categorical(local_cuda_client, "hist", "cuda", X, X_onehot, y)
 
     @given(
@@ -383,9 +383,9 @@ def test_dask_classifier(self, model: str, local_cuda_client: Client) -> None:
 
         X_, y_, w_ = generate_array(with_weights=True)
         y_ = (y_ * 10).astype(np.int32)
-        X = dask_cudf.from_dask_dataframe(dd.from_dask_array(X_))
-        y = dask_cudf.from_dask_dataframe(dd.from_dask_array(y_))
-        w = dask_cudf.from_dask_dataframe(dd.from_dask_array(w_))
+        X = dd.from_dask_array(X_).to_backend("cudf")
+        y = dd.from_dask_array(y_).to_backend("cudf")
+        w = dd.from_dask_array(w_).to_backend("cudf")
         run_dask_classifier(X, y, w, model, "hist", "cuda", local_cuda_client, 10)
 
     def test_empty_dmatrix(self, local_cuda_client: Client) -> None:
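
The test changes above follow the current dask/dask-cudf migration path: the deprecated dask_cudf.from_dask_dataframe() is replaced by the collection-level DataFrame.to_backend("cudf") API. A hedged sketch of the same pattern on a toy frame follows; the data and column names are illustrative only, and running it requires a CUDA-capable environment with dask-cudf installed.

    # Sketch of the to_backend migration used in the tests; toy data only.
    import dask.dataframe as dd
    import pandas as pd

    pdf = pd.DataFrame({"feature": range(8), "label": [0, 1] * 4})
    ddf = dd.from_pandas(pdf, npartitions=2)  # pandas-backed Dask DataFrame

    # Previously: gdf = dask_cudf.from_dask_dataframe(ddf)
    gdf = ddf.to_backend("cudf")              # cuDF-backed Dask DataFrame
    print(type(gdf))                          # still a lazy Dask collection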
