[backport] Fix using categorical data with the ranker. (#9753) (#9778)

dmlc · Nov 8, 2023 · 0ffc52e · 0ffc52e
1 parent a408254
commit 0ffc52e
Show file tree

Hide file tree

Showing 4 changed files with 48 additions and 3 deletions.
diff --git a/python-package/xgboost/sklearn.py b/python-package/xgboost/sklearn.py
@@ -2093,7 +2093,17 @@ def score(self, X: ArrayLike, y: ArrayLike) -> float:
 
         """
         X, qid = _get_qid(X, None)
-        Xyq = DMatrix(X, y, qid=qid)
+        # fixme(jiamingy): base margin and group weight are not yet supported. We
+        # might need to make extra special fields in the dataframe.
+        Xyq = DMatrix(
+            X,
+            y,
+            qid=qid,
+            missing=self.missing,
+            enable_categorical=self.enable_categorical,
+            nthread=self.n_jobs,
+            feature_types=self.feature_types,
+        )
         if callable(self.eval_metric):
             metric = ltr_metric_decorator(self.eval_metric, self.n_jobs)
             result_str = self.get_booster().eval_set([(Xyq, "eval")], feval=metric)

diff --git a/python-package/xgboost/testing/ranking.py b/python-package/xgboost/testing/ranking.py
@@ -75,3 +75,28 @@ def neg_mse(*args: Any, **kwargs: Any) -> float:
 
     with pytest.raises(ValueError, match="Either `group` or `qid`."):
         ranker.fit(df, y, eval_set=[(X, y)])
+
+
+def run_ranking_categorical(device: str) -> None:
+    """Test LTR with categorical features."""
+    from sklearn.model_selection import cross_val_score
+
+    X, y = tm.make_categorical(
+        n_samples=512, n_features=10, n_categories=3, onehot=False
+    )
+    rng = np.random.default_rng(1994)
+    qid = rng.choice(3, size=y.shape[0])
+    qid = np.sort(qid)
+    X["qid"] = qid
+
+    ltr = xgb.XGBRanker(enable_categorical=True, device=device)
+    ltr.fit(X, y)
+    score = ltr.score(X, y)
+    assert score > 0.9
+
+    ltr = xgb.XGBRanker(enable_categorical=True, device=device)
+
+    # test using the score function inside sklearn.
+    scores = cross_val_score(ltr, X, y)
+    for s in scores:
+        assert s > 0.7
diff --git a/tests/python-gpu/test_gpu_with_sklearn.py b/tests/python-gpu/test_gpu_with_sklearn.py
@@ -9,7 +9,7 @@
 
 import xgboost as xgb
 from xgboost import testing as tm
-from xgboost.testing.ranking import run_ranking_qid_df
+from xgboost.testing.ranking import run_ranking_categorical, run_ranking_qid_df
 
 sys.path.append("tests/python")
 import test_with_sklearn as twskl  # noqa
@@ -165,6 +165,11 @@ def test_ranking_qid_df():
     run_ranking_qid_df(cudf, "gpu_hist")
 
 
+@pytest.mark.skipif(**tm.no_pandas())
+def test_ranking_categorical() -> None:
+    run_ranking_categorical(device="cuda")
+
+
 @pytest.mark.skipif(**tm.no_cupy())
 @pytest.mark.mgpu
 def test_device_ordinal() -> None:

diff --git a/tests/python/test_with_sklearn.py b/tests/python/test_with_sklearn.py
@@ -12,7 +12,7 @@
 
 import xgboost as xgb
 from xgboost import testing as tm
-from xgboost.testing.ranking import run_ranking_qid_df
+from xgboost.testing.ranking import run_ranking_categorical, run_ranking_qid_df
 from xgboost.testing.shared import get_feature_weights, validate_data_initialization
 from xgboost.testing.updater import get_basescore
 
@@ -173,6 +173,11 @@ def test_ranking():
     np.testing.assert_almost_equal(pred, pred_orig)
 
 
+@pytest.mark.skipif(**tm.no_pandas())
+def test_ranking_categorical() -> None:
+    run_ranking_categorical(device="cpu")
+
+
 def test_ranking_metric() -> None:
     from sklearn.metrics import roc_auc_score