Skip to content
Merged
Show file tree
Hide file tree
Changes from 13 commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
2 changes: 1 addition & 1 deletion .pre-commit-config.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -67,7 +67,7 @@ repos:
language: python
types: [file]
files: \.py$
exclude: (dmlc-core|gputreeshap|^ops/)
exclude: (dmlc-core|gputreeshap|^ops/|^tests/)
args:
- --rcfile=python-package/pyproject.toml
additional_dependencies:
Expand Down
16 changes: 13 additions & 3 deletions python-package/xgboost/_data_utils.py
Original file line number Diff line number Diff line change
Expand Up @@ -442,13 +442,23 @@ def is_prim() -> bool:
jarr_codes = array_interface_dict(code_values)
return jarr_values, jarr_codes, (name_values_num, code_values)

def npstr_to_arrow_strarr(strarr: Any) -> Tuple[np.ndarray, str]:
    """Convert a string-like array to an arrow string array.

    Parameters
    ----------
    strarr :
        A numpy array of strings/bytes, or any array-like exposing
        ``to_numpy`` (e.g. a pandas/arrow-backed extension array when
        pandas>=3.0 with pyarrow is installed).

    Returns
    -------
    A tuple of (offsets, values): ``offsets`` is an int32 array with
    ``len(strarr) + 1`` entries marking the start of each string inside
    ``values``, and ``values`` is all strings joined into one ``str``.
    """
    if not isinstance(strarr, np.ndarray):
        # pandas 3.0 with pyarrow can hand us an arrow-backed container
        # instead of an ndarray; normalize to an object ndarray first.
        if hasattr(strarr, "to_numpy"):
            strarr = strarr.to_numpy(dtype=object)
        else:
            strarr = np.asarray(strarr, dtype=object)

    if strarr.dtype.kind == "S":
        # Raw bytes: decode so `values` is always a Python str.
        str_list = [s.decode("utf-8") for s in strarr.tolist()]
    else:
        str_list = [str(s) for s in strarr.tolist()]

    # Compute offsets from the *converted* strings so they always agree
    # with the joined `values`.  Measuring the raw elements instead (as a
    # byte length for "S" arrays, or len() of a non-str object) can
    # disagree with the decoded/stringified lengths.  Building lengths via
    # fromiter also handles empty input, where np.vectorize(len) raises.
    lengths = np.fromiter(
        (len(s) for s in str_list), dtype=np.int64, count=len(str_list)
    )
    offsets = np.cumsum(np.concatenate([np.array([0], dtype=np.int64), lengths]))
    values = "".join(str_list)
    assert "\0" not in values  # arrow string array doesn't need null terminal
    return offsets.astype(np.int32), values

Expand Down
2 changes: 1 addition & 1 deletion python-package/xgboost/testing/updater.py
Original file line number Diff line number Diff line change
Expand Up @@ -577,7 +577,7 @@ def run(max_cat_to_onehot: int) -> None:
booster = train(
parameters,
Xy,
num_boost_round=16,
num_boost_round=8,
evals=[(Xy, "Train")],
evals_result=evals_result,
)
Expand Down
177 changes: 114 additions & 63 deletions tests/python/test_ranking.py
Original file line number Diff line number Diff line change
Expand Up @@ -92,62 +92,80 @@ def fit(ltr: xgboost.XGBRanker):


def test_ranking_with_unweighted_data():
    """Train rank:pairwise on 4 unweighted query groups of 5 documents and
    check that auc/aucpr on the training set improve monotonically."""
    # fmt: off
    Xrow = np.array([1, 2, 6, 8, 11, 14, 16, 17])
    Xcol = np.array([0, 0, 1, 1, 2, 2, 3, 3])
    y = np.array([
        0.0, 1.0, 1.0, 0.0, 0.0,
        0.0, 1.0, 0.0, 1.0, 0.0,
        0.0, 1.0, 0.0, 0.0, 1.0,
        0.0, 1.0, 1.0, 0.0, 0.0
    ])
    # fmt: on
    X = csr_matrix((np.ones(shape=8), (Xrow, Xcol)), shape=(20, 4))

    # Four query groups of five documents each.
    group = np.array([5, 5, 5, 5], dtype=np.uint)
    dtrain = xgboost.DMatrix(X, label=y)
    dtrain.set_group(group)

    params = {
        "eta": 1,
        "tree_method": "exact",
        "objective": "rank:pairwise",
        "eval_metric": ["auc", "aucpr"],
        "max_depth": 1,
    }
    evals_result = {}
    xgboost.train(
        params, dtrain, 10, evals=[(dtrain, "train")], evals_result=evals_result
    )
    # Both metrics should be non-decreasing across boosting rounds.
    auc_rec = evals_result["train"]["auc"]
    assert all(p <= q for p, q in zip(auc_rec, auc_rec[1:]))
    auc_rec = evals_result["train"]["aucpr"]
    assert all(p <= q for p, q in zip(auc_rec, auc_rec[1:]))


def test_ranking_with_weighted_data():
# fmt: off
Xrow = np.array([1, 2, 6, 8, 11, 14, 16, 17])
Xcol = np.array([0, 0, 1, 1, 2, 2, 3, 3])
Xcol = np.array([0, 0, 1, 1, 2, 2, 3, 3])
y = np.array([
0.0, 1.0, 1.0, 0.0, 0.0,
0.0, 1.0, 0.0, 1.0, 0.0,
0.0, 1.0, 0.0, 0.0, 1.0,
0.0, 1.0, 1.0, 0.0, 0.0
])
# fmt: on
X = csr_matrix((np.ones(shape=8), (Xrow, Xcol)), shape=(20, 4))
y = np.array([0.0, 1.0, 1.0, 0.0, 0.0,
0.0, 1.0, 0.0, 1.0, 0.0,
0.0, 1.0, 0.0, 0.0, 1.0,
0.0, 1.0, 1.0, 0.0, 0.0])
weights = np.array([1.0, 2.0, 3.0, 4.0])

group = np.array([5, 5, 5, 5], dtype=np.uint)
dtrain = xgboost.DMatrix(X, label=y, weight=weights)
dtrain.set_group(group)

params = {'eta': 1, 'tree_method': 'exact',
'objective': 'rank:pairwise', 'eval_metric': ['auc', 'aucpr'],
'max_depth': 1}
params = {
"eta": 1,
"tree_method": "exact",
"objective": "rank:pairwise",
"eval_metric": ["auc", "aucpr"],
"max_depth": 1,
}
evals_result = {}
bst = xgboost.train(params, dtrain, 10, evals=[(dtrain, 'train')],
evals_result=evals_result)
auc_rec = evals_result['train']['auc']
bst = xgboost.train(
params, dtrain, 10, evals=[(dtrain, "train")], evals_result=evals_result
)
auc_rec = evals_result["train"]["auc"]
assert all(p <= q for p, q in zip(auc_rec, auc_rec[1:]))
auc_rec = evals_result['train']['aucpr']
auc_rec = evals_result["train"]["aucpr"]
assert all(p <= q for p, q in zip(auc_rec, auc_rec[1:]))

for i in range(1, 11):
pred = bst.predict(dtrain, iteration_range=(0, i))
# is_sorted[i]: is i-th group correctly sorted by the ranking predictor?
is_sorted = []
for k in range(0, 20, 5):
ind = np.argsort(-pred[k:k+5])
z = y[ind+k]
ind = np.argsort(-pred[k : k + 5])
z = y[ind + k]
is_sorted.append(all(i >= j for i, j in zip(z, z[1:])))
# Since we give weights 1, 2, 3, 4 to the four query groups,
# the ranking predictor will first try to correctly sort the last query group
Expand All @@ -163,7 +181,7 @@ def test_error_msg() -> None:


@given(lambdarank_parameter_strategy)
@settings(deadline=None, print_blob=True)
@settings(deadline=None, print_blob=True, max_examples=10)
def test_lambdarank_parameters(params):
if params["objective"] == "rank:map":
rel = 1
Expand Down Expand Up @@ -191,9 +209,7 @@ def test_unbiased() -> None:
data = RelDataCV((X, y, q), (Xe, ye, qe), max_rel=4)

train, _ = simulate_clicks(data)
x, c, y, q = sort_ltr_samples(
train.X, train.y, train.qid, train.click, train.pos
)
x, c, y, q = sort_ltr_samples(train.X, train.y, train.qid, train.click, train.pos)
df: Optional[pd.DataFrame] = None

class Position(xgboost.callback.TrainingCallback):
Expand Down Expand Up @@ -247,30 +263,43 @@ def setup_class(cls):
Download and setup the test fixtures
"""
cls.dpath = "demo/"
(x_train, y_train, qid_train, x_test, y_test, qid_test,
x_valid, y_valid, qid_valid) = tm.data.get_mq2008(cls.dpath)
(
x_train,
y_train,
qid_train,
x_test,
y_test,
qid_test,
x_valid,
y_valid,
qid_valid,
) = tm.data.get_mq2008(cls.dpath)

# instantiate the matrices
cls.dtrain = xgboost.DMatrix(x_train, y_train)
cls.dvalid = xgboost.DMatrix(x_valid, y_valid)
cls.dtest = xgboost.DMatrix(x_test, y_test)
# set the group counts from the query IDs
cls.dtrain.set_group([len(list(items))
for _key, items in itertools.groupby(qid_train)])
cls.dtest.set_group([len(list(items))
for _key, items in itertools.groupby(qid_test)])
cls.dvalid.set_group([len(list(items))
for _key, items in itertools.groupby(qid_valid)])
cls.dtrain.set_group(
[len(list(items)) for _key, items in itertools.groupby(qid_train)]
)
cls.dtest.set_group(
[len(list(items)) for _key, items in itertools.groupby(qid_test)]
)
cls.dvalid.set_group(
[len(list(items)) for _key, items in itertools.groupby(qid_valid)]
)
# save the query IDs for testing
cls.qid_train = qid_train
cls.qid_test = qid_test
cls.qid_valid = qid_valid

# model training parameters
cls.params = {'objective': 'rank:pairwise',
'booster': 'gbtree',
'eval_metric': ['ndcg']
}
cls.params = {
"objective": "rank:pairwise",
"booster": "gbtree",
"eval_metric": ["ndcg"],
}

@classmethod
def teardown_class(cls):
Expand All @@ -290,29 +319,49 @@ def test_training(self):
Train an XGBoost ranking model
"""
# specify validations set to watch performance
watchlist = [(self.dtest, 'eval'), (self.dtrain, 'train')]
bst = xgboost.train(self.params, self.dtrain, num_boost_round=2500,
early_stopping_rounds=10, evals=watchlist)
watchlist = [(self.dtest, "eval"), (self.dtrain, "train")]
bst = xgboost.train(
self.params,
self.dtrain,
num_boost_round=2500,
early_stopping_rounds=10,
evals=watchlist,
)
assert bst.best_score > 0.98

def test_cv(self):
    """
    Test cross-validation with a group specified
    """
    cv = xgboost.cv(
        self.params,
        self.dtrain,
        num_boost_round=2500,
        early_stopping_rounds=10,
        nfold=10,
        as_pandas=False,
    )
    # as_pandas=False returns a plain dict keyed by metric statistics.
    assert isinstance(cv, dict)
    assert set(cv.keys()) == {
        "test-ndcg-mean",
        "train-ndcg-mean",
        "test-ndcg-std",
        "train-ndcg-std",
    }, "CV results dict key mismatch."

def test_cv_no_shuffle(self):
    """
    Test cross-validation with a group specified, without shuffling rows
    """
    cv = xgboost.cv(
        self.params,
        self.dtrain,
        num_boost_round=2500,
        early_stopping_rounds=10,
        shuffle=False,
        nfold=10,
        as_pandas=False,
    )
    # as_pandas=False returns a dict with mean/std entries for each of
    # the train and test ndcg metrics -> 4 keys.
    assert isinstance(cv, dict)
    assert len(cv) == 4

Expand All @@ -321,19 +370,21 @@ def test_get_group(self):
Retrieve the group number from the dmatrix
"""
# test the new getter
self.dtrain.get_uint_info('group_ptr')
self.dtrain.get_uint_info("group_ptr")

for d, qid in [(self.dtrain, self.qid_train),
(self.dvalid, self.qid_valid),
(self.dtest, self.qid_test)]:
for d, qid in [
(self.dtrain, self.qid_train),
(self.dvalid, self.qid_valid),
(self.dtest, self.qid_test),
]:
# size of each group
group_sizes = np.array([len(list(items))
for _key, items in itertools.groupby(qid)])
group_sizes = np.array(
[len(list(items)) for _key, items in itertools.groupby(qid)]
)
# indexes of group boundaries
group_limits = d.get_uint_info('group_ptr')
assert len(group_limits) == len(group_sizes)+1
group_limits = d.get_uint_info("group_ptr")
assert len(group_limits) == len(group_sizes) + 1
assert np.array_equal(np.diff(group_limits), group_sizes)
assert np.array_equal(
group_sizes, np.diff(d.get_uint_info('group_ptr')))
assert np.array_equal(group_sizes, np.diff(d.get_uint_info('group_ptr')))
assert np.array_equal(group_limits, d.get_uint_info('group_ptr'))
assert np.array_equal(group_sizes, np.diff(d.get_uint_info("group_ptr")))
assert np.array_equal(group_sizes, np.diff(d.get_uint_info("group_ptr")))
assert np.array_equal(group_limits, d.get_uint_info("group_ptr"))
6 changes: 3 additions & 3 deletions tests/python/test_updaters.py
Original file line number Diff line number Diff line change
Expand Up @@ -233,7 +233,7 @@ def test_max_cat(self, tree_method: str) -> None:
strategies.integers(1, 2),
strategies.integers(4, 7),
)
@settings(deadline=None, print_blob=True)
@settings(deadline=None, print_blob=True, max_examples=10)
@pytest.mark.skipif(**tm.no_pandas())
def test_categorical_ohe(
self, rows: int, cols: int, rounds: int, cats: int
Expand Down Expand Up @@ -288,7 +288,7 @@ def test_categorical(
cat_parameter_strategy,
strategies.sampled_from(["hist", "approx"]),
)
@settings(deadline=None, print_blob=True)
@settings(deadline=None, print_blob=True, max_examples=10)
def test_categorical_ames_housing(
self,
hist_parameters: Dict[str, Any],
Expand All @@ -308,7 +308,7 @@ def test_categorical_ames_housing(
strategies.integers(3, 8),
strategies.integers(4, 7),
)
@settings(deadline=None, print_blob=True)
@settings(deadline=None, print_blob=True, max_examples=10)
@pytest.mark.skipif(**tm.no_pandas())
def test_categorical_missing(self, rows: int, cols: int, cats: int) -> None:
check_categorical_missing(
Expand Down
Loading
Loading