
Commit dc14f98

rjzamora and trivialfis authored
Avoid default tokenization in Dask (dmlc#10398)
Co-authored-by: Jiaming Yuan <[email protected]>
1 parent 01ff2b2 commit dc14f98

File tree: 5 files changed (+16 −9 lines)


python-package/xgboost/core.py

Lines changed: 7 additions & 0 deletions
@@ -7,6 +7,7 @@
 import os
 import re
 import sys
+import uuid
 import warnings
 import weakref
 from abc import ABC, abstractmethod
@@ -3143,3 +3144,9 @@ def get_split_value_histogram(
                UserWarning,
            )
        return nph_stacked
+
+    def __dask_tokenize__(self) -> uuid.UUID:
+        # TODO: Implement proper tokenization to avoid unnecessary re-computation in
+        # Dask. However, default tokenization causes problems after
+        # https://github.com/dask/dask/pull/10883
+        return uuid.uuid4()
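
The new method plugs into Dask's tokenization protocol: when an object defines __dask_tokenize__, dask.base.tokenize() uses its return value instead of the default normalization. Returning a fresh uuid.uuid4() makes every token unique, which sidesteps the mis-identification problems introduced after dask/dask#10883 at the cost of task de-duplication and caching. Below is a minimal sketch of that protocol, not part of this commit; the OpaqueHandle class is a hypothetical stand-in for an object wrapping native memory.

    # Minimal sketch (not xgboost code) of Dask's __dask_tokenize__ hook.
    import uuid

    from dask.base import tokenize


    class OpaqueHandle:
        """Hypothetical wrapper around native memory, in the spirit of the patched class."""

        def __dask_tokenize__(self) -> uuid.UUID:
            # A fresh UUID per call means every tokenization is unique,
            # so Dask never reuses a cached result keyed on this object.
            return uuid.uuid4()


    handle = OpaqueHandle()
    print(tokenize(handle))  # some hash string
    print(tokenize(handle))  # a different hash on every call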

tests/ci_build/Dockerfile.gpu

Lines changed: 1 addition & 1 deletion
@@ -25,7 +25,7 @@ RUN \
     mamba create -y -n gpu_test -c rapidsai -c conda-forge -c nvidia \
         python=3.10 cudf=$RAPIDS_VERSION_ARG* rmm=$RAPIDS_VERSION_ARG* cudatoolkit=$CUDA_VERSION_ARG \
         "nccl>=${NCCL_SHORT_VER}" \
-        dask=2024.1.1 \
+        dask \
         dask-cuda=$RAPIDS_VERSION_ARG* dask-cudf=$RAPIDS_VERSION_ARG* cupy \
         numpy pytest pytest-timeout scipy scikit-learn pandas matplotlib wheel python-kubernetes urllib3 graphviz hypothesis \
         "pyspark>=3.4.0" cloudpickle cuda-python && \

tests/ci_build/Dockerfile.gpu_dev_ver

Lines changed: 1 addition & 1 deletion
@@ -28,7 +28,7 @@ RUN \
     mamba create -y -n gpu_test -c rapidsai-nightly -c conda-forge -c nvidia \
         python=3.10 "cudf=$RAPIDS_VERSION_ARG.*" "rmm=$RAPIDS_VERSION_ARG.*" cudatoolkit=$CUDA_VERSION_ARG \
         "nccl>=${NCCL_SHORT_VER}" \
-        dask=2024.1.1 \
+        dask \
         "dask-cuda=$RAPIDS_VERSION_ARG.*" "dask-cudf=$RAPIDS_VERSION_ARG.*" cupy \
         numpy pytest pytest-timeout scipy scikit-learn pandas matplotlib wheel python-kubernetes urllib3 graphviz hypothesis \
         "pyspark>=3.4.0" cloudpickle cuda-python && \

tests/ci_build/conda_env/linux_cpu_test.yml

Lines changed: 2 additions & 2 deletions
@@ -17,8 +17,8 @@ dependencies:
 - scikit-learn
 - pandas
 - matplotlib
-- dask>=2022.6
-- distributed>=2022.6
+- dask
+- distributed
 - python-graphviz
 - hypothesis>=6.46
 - astroid

tests/test_distributed/test_gpu_with_dask/test_gpu_with_dask.py

Lines changed: 5 additions & 5 deletions
@@ -248,10 +248,10 @@ def test_categorical(self, local_cuda_client: Client) -> None:
         import dask_cudf
 
         X, y = make_categorical(local_cuda_client, 10000, 30, 13)
-        X = dask_cudf.from_dask_dataframe(X)
+        X = X.to_backend("cudf")
 
         X_onehot, _ = make_categorical(local_cuda_client, 10000, 30, 13, True)
-        X_onehot = dask_cudf.from_dask_dataframe(X_onehot)
+        X_onehot = X_onehot.to_backend("cudf")
         run_categorical(local_cuda_client, "hist", "cuda", X, X_onehot, y)
 
     @given(
@@ -383,9 +383,9 @@ def test_dask_classifier(self, model: str, local_cuda_client: Client) -> None:
 
         X_, y_, w_ = generate_array(with_weights=True)
         y_ = (y_ * 10).astype(np.int32)
-        X = dask_cudf.from_dask_dataframe(dd.from_dask_array(X_))
-        y = dask_cudf.from_dask_dataframe(dd.from_dask_array(y_))
-        w = dask_cudf.from_dask_dataframe(dd.from_dask_array(w_))
+        X = dd.from_dask_array(X_).to_backend("cudf")
+        y = dd.from_dask_array(y_).to_backend("cudf")
+        w = dd.from_dask_array(w_).to_backend("cudf")
         run_dask_classifier(X, y, w, model, "hist", "cuda", local_cuda_client, 10)
 
     def test_empty_dmatrix(self, local_cuda_client: Client) -> None:
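
The test changes above follow the current dask/dask-cudf migration path: the deprecated dask_cudf.from_dask_dataframe() is replaced by the collection-level DataFrame.to_backend("cudf") API. A hedged sketch of the same pattern on a toy frame follows; the data and column names are illustrative only, and running it requires a CUDA-capable environment with dask-cudf installed.

    # Sketch of the to_backend migration used in the tests; toy data only.
    import dask.dataframe as dd
    import pandas as pd

    pdf = pd.DataFrame({"feature": range(8), "label": [0, 1] * 4})
    ddf = dd.from_pandas(pdf, npartitions=2)  # pandas-backed Dask DataFrame

    # Previously: gdf = dask_cudf.from_dask_dataframe(ddf)
    gdf = ddf.to_backend("cudf")              # cuDF-backed Dask DataFrame
    print(type(gdf))                          # still a lazy Dask collection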
