Skip to content
Merged
Show file tree
Hide file tree
Changes from 13 commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
2 changes: 1 addition & 1 deletion .pre-commit-config.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -67,7 +67,7 @@ repos:
language: python
types: [file]
files: \.py$
exclude: (dmlc-core|gputreeshap|^ops/)
exclude: (dmlc-core|gputreeshap|^ops/|^tests/)
args:
- --rcfile=python-package/pyproject.toml
additional_dependencies:
Expand Down
16 changes: 13 additions & 3 deletions python-package/xgboost/_data_utils.py
Original file line number Diff line number Diff line change
Expand Up @@ -442,13 +442,23 @@ def is_prim() -> bool:
jarr_codes = array_interface_dict(code_values)
return jarr_values, jarr_codes, (name_values_num, code_values)

def npstr_to_arrow_strarr(strarr: Any) -> Tuple[np.ndarray, str]:
    """Convert a string-like array to an arrow string array.

    Parameters
    ----------
    strarr :
        A numpy array of strings/bytes, or any array-like exposing
        ``to_numpy`` (e.g. a pandas/arrow-backed extension array when
        pandas>=3.0 with pyarrow is installed).

    Returns
    -------
    A tuple of (offsets, values): ``offsets`` is an int32 array with
    ``len(strarr) + 1`` entries marking the start of each string inside
    ``values``, and ``values`` is all strings joined into one ``str``.
    """
    if not isinstance(strarr, np.ndarray):
        # pandas 3.0 with pyarrow can hand us an arrow-backed container
        # instead of an ndarray; normalize to an object ndarray first.
        if hasattr(strarr, "to_numpy"):
            strarr = strarr.to_numpy(dtype=object)
        else:
            strarr = np.asarray(strarr, dtype=object)

    if strarr.dtype.kind == "S":
        # Raw bytes: decode so `values` is always a Python str.
        str_list = [s.decode("utf-8") for s in strarr.tolist()]
    else:
        str_list = [str(s) for s in strarr.tolist()]

    # Compute offsets from the *converted* strings so they always agree
    # with the joined `values`.  Measuring the raw elements instead (as a
    # byte length for "S" arrays, or len() of a non-str object) can
    # disagree with the decoded/stringified lengths.  Building lengths via
    # fromiter also handles empty input, where np.vectorize(len) raises.
    lengths = np.fromiter(
        (len(s) for s in str_list), dtype=np.int64, count=len(str_list)
    )
    offsets = np.cumsum(np.concatenate([np.array([0], dtype=np.int64), lengths]))
    values = "".join(str_list)
    assert "\0" not in values  # arrow string array doesn't need null terminal
    return offsets.astype(np.int32), values

Expand Down
2 changes: 1 addition & 1 deletion python-package/xgboost/testing/updater.py
Original file line number Diff line number Diff line change
Expand Up @@ -577,7 +577,7 @@ def run(max_cat_to_onehot: int) -> None:
booster = train(
parameters,
Xy,
num_boost_round=16,
num_boost_round=8,
evals=[(Xy, "Train")],
evals_result=evals_result,
)
Expand Down
177 changes: 114 additions & 63 deletions tests/python/test_ranking.py
Original file line number Diff line number Diff line change
Expand Up @@ -92,62 +92,80 @@ def fit(ltr: xgboost.XGBRanker):


def test_ranking_with_unweighted_data():
    """Train rank:pairwise on 4 unweighted query groups of 5 documents and
    check that auc/aucpr on the training set improve monotonically."""
    # fmt: off
    Xrow = np.array([1, 2, 6, 8, 11, 14, 16, 17])
    Xcol = np.array([0, 0, 1, 1, 2, 2, 3, 3])
    y = np.array([
        0.0, 1.0, 1.0, 0.0, 0.0,
        0.0, 1.0, 0.0, 1.0, 0.0,
        0.0, 1.0, 0.0, 0.0, 1.0,
        0.0, 1.0, 1.0, 0.0, 0.0
    ])
    # fmt: on
    X = csr_matrix((np.ones(shape=8), (Xrow, Xcol)), shape=(20, 4))

    # Four query groups of five documents each.
    group = np.array([5, 5, 5, 5], dtype=np.uint)
    dtrain = xgboost.DMatrix(X, label=y)
    dtrain.set_group(group)

    params = {
        "eta": 1,
        "tree_method": "exact",
        "objective": "rank:pairwise",
        "eval_metric": ["auc", "aucpr"],
        "max_depth": 1,
    }
    evals_result = {}
    xgboost.train(
        params, dtrain, 10, evals=[(dtrain, "train")], evals_result=evals_result
    )
    # Both metrics should be non-decreasing across boosting rounds.
    auc_rec = evals_result["train"]["auc"]
    assert all(p <= q for p, q in zip(auc_rec, auc_rec[1:]))
    auc_rec = evals_result["train"]["aucpr"]
    assert all(p <= q for p, q in zip(auc_rec, auc_rec[1:]))


def test_ranking_with_weighted_data():
# fmt: off
Xrow = np.array([1, 2, 6, 8, 11, 14, 16, 17])
Xcol = np.array([0, 0, 1, 1, 2, 2, 3, 3])
Xcol = np.array([0, 0, 1, 1, 2, 2, 3, 3])
y = np.array([
0.0, 1.0, 1.0, 0.0, 0.0,
0.0, 1.0, 0.0, 1.0, 0.0,
0.0, 1.0, 0.0, 0.0, 1.0,
0.0, 1.0, 1.0, 0.0, 0.0
])
# fmt: on
X = csr_matrix((np.ones(shape=8), (Xrow, Xcol)), shape=(20, 4))
y = np.array([0.0, 1.0, 1.0, 0.0, 0.0,
0.0, 1.0, 0.0, 1.0, 0.0,
0.0, 1.0, 0.0, 0.0, 1.0,
0.0, 1.0, 1.0, 0.0, 0.0])
weights = np.array([1.0, 2.0, 3.0, 4.0])

group = np.array([5, 5, 5, 5], dtype=np.uint)
dtrain = xgboost.DMatrix(X, label=y, weight=weights)
dtrain.set_group(group)

params = {'eta': 1, 'tree_method': 'exact',
'objective': 'rank:pairwise', 'eval_metric': ['auc', 'aucpr'],
'max_depth': 1}
params = {
"eta": 1,
"tree_method": "exact",
"objective": "rank:pairwise",
"eval_metric": ["auc", "aucpr"],
"max_depth": 1,
}
evals_result = {}
bst = xgboost.train(params, dtrain, 10, evals=[(dtrain, 'train')],
evals_result=evals_result)
auc_rec = evals_result['train']['auc']
bst = xgboost.train(
params, dtrain, 10, evals=[(dtrain, "train")], evals_result=evals_result
)
auc_rec = evals_result["train"]["auc"]
assert all(p <= q for p, q in zip(auc_rec, auc_rec[1:]))
auc_rec = evals_result['train']['aucpr']
auc_rec = evals_result["train"]["aucpr"]
assert all(p <= q for p, q in zip(auc_rec, auc_rec[1:]))

for i in range(1, 11):
pred = bst.predict(dtrain, iteration_range=(0, i))
# is_sorted[i]: is i-th group correctly sorted by the ranking predictor?
is_sorted = []
for k in range(0, 20, 5):
ind = np.argsort(-pred[k:k+5])
z = y[ind+k]
ind = np.argsort(-pred[k : k + 5])
z = y[ind + k]
is_sorted.append(all(i >= j for i, j in zip(z, z[1:])))
# Since we give weights 1, 2, 3, 4 to the four query groups,
# the ranking predictor will first try to correctly sort the last query group
Expand All @@ -163,7 +181,7 @@ def test_error_msg() -> None:


@given(lambdarank_parameter_strategy)
@settings(deadline=None, print_blob=True)
@settings(deadline=None, print_blob=True, max_examples=10)
def test_lambdarank_parameters(params):
if params["objective"] == "rank:map":
rel = 1
Expand Down Expand Up @@ -191,9 +209,7 @@ def test_unbiased() -> None:
data = RelDataCV((X, y, q), (Xe, ye, qe), max_rel=4)

train, _ = simulate_clicks(data)
x, c, y, q = sort_ltr_samples(
train.X, train.y, train.qid, train.click, train.pos
)
x, c, y, q = sort_ltr_samples(train.X, train.y, train.qid, train.click, train.pos)
df: Optional[pd.DataFrame] = None

class Position(xgboost.callback.TrainingCallback):
Expand Down Expand Up @@ -247,30 +263,43 @@ def setup_class(cls):
Download and setup the test fixtures
"""
cls.dpath = "demo/"
(x_train, y_train, qid_train, x_test, y_test, qid_test,
x_valid, y_valid, qid_valid) = tm.data.get_mq2008(cls.dpath)
(
x_train,
y_train,
qid_train,
x_test,
y_test,
qid_test,
x_valid,
y_valid,
qid_valid,
) = tm.data.get_mq2008(cls.dpath)

# instantiate the matrices
cls.dtrain = xgboost.DMatrix(x_train, y_train)
cls.dvalid = xgboost.DMatrix(x_valid, y_valid)
cls.dtest = xgboost.DMatrix(x_test, y_test)
# set the group counts from the query IDs
cls.dtrain.set_group([len(list(items))
for _key, items in itertools.groupby(qid_train)])
cls.dtest.set_group([len(list(items))
for _key, items in itertools.groupby(qid_test)])
cls.dvalid.set_group([len(list(items))
for _key, items in itertools.groupby(qid_valid)])
cls.dtrain.set_group(
[len(list(items)) for _key, items in itertools.groupby(qid_train)]
)
cls.dtest.set_group(
[len(list(items)) for _key, items in itertools.groupby(qid_test)]
)
cls.dvalid.set_group(
[len(list(items)) for _key, items in itertools.groupby(qid_valid)]
)
# save the query IDs for testing
cls.qid_train = qid_train
cls.qid_test = qid_test
cls.qid_valid = qid_valid

# model training parameters
cls.params = {'objective': 'rank:pairwise',
'booster': 'gbtree',
'eval_metric': ['ndcg']
}
cls.params = {
"objective": "rank:pairwise",
"booster": "gbtree",
"eval_metric": ["ndcg"],
}

@classmethod
def teardown_class(cls):
Expand All @@ -290,29 +319,49 @@ def test_training(self):
Train an XGBoost ranking model
"""
# specify validations set to watch performance
watchlist = [(self.dtest, 'eval'), (self.dtrain, 'train')]
bst = xgboost.train(self.params, self.dtrain, num_boost_round=2500,
early_stopping_rounds=10, evals=watchlist)
watchlist = [(self.dtest, "eval"), (self.dtrain, "train")]
bst = xgboost.train(
self.params,
self.dtrain,
num_boost_round=2500,
early_stopping_rounds=10,
evals=watchlist,
)
assert bst.best_score > 0.98

def test_cv(self):
    """
    Test cross-validation with a group specified
    """
    cv = xgboost.cv(
        self.params,
        self.dtrain,
        num_boost_round=2500,
        early_stopping_rounds=10,
        nfold=10,
        as_pandas=False,
    )
    # as_pandas=False returns a plain dict keyed by metric statistics.
    assert isinstance(cv, dict)
    assert set(cv.keys()) == {
        "test-ndcg-mean",
        "train-ndcg-mean",
        "test-ndcg-std",
        "train-ndcg-std",
    }, "CV results dict key mismatch."

def test_cv_no_shuffle(self):
    """
    Test cross-validation with a group specified, without shuffling rows
    """
    cv = xgboost.cv(
        self.params,
        self.dtrain,
        num_boost_round=2500,
        early_stopping_rounds=10,
        shuffle=False,
        nfold=10,
        as_pandas=False,
    )
    # as_pandas=False returns a dict with mean/std entries for each of
    # the train and test ndcg metrics -> 4 keys.
    assert isinstance(cv, dict)
    assert len(cv) == 4

Expand All @@ -321,19 +370,21 @@ def test_get_group(self):
Retrieve the group number from the dmatrix
"""
# test the new getter
self.dtrain.get_uint_info('group_ptr')
self.dtrain.get_uint_info("group_ptr")

for d, qid in [(self.dtrain, self.qid_train),
(self.dvalid, self.qid_valid),
(self.dtest, self.qid_test)]:
for d, qid in [
(self.dtrain, self.qid_train),
(self.dvalid, self.qid_valid),
(self.dtest, self.qid_test),
]:
# size of each group
group_sizes = np.array([len(list(items))
for _key, items in itertools.groupby(qid)])
group_sizes = np.array(
[len(list(items)) for _key, items in itertools.groupby(qid)]
)
# indexes of group boundaries
group_limits = d.get_uint_info('group_ptr')
assert len(group_limits) == len(group_sizes)+1
group_limits = d.get_uint_info("group_ptr")
assert len(group_limits) == len(group_sizes) + 1
assert np.array_equal(np.diff(group_limits), group_sizes)
assert np.array_equal(
group_sizes, np.diff(d.get_uint_info('group_ptr')))
assert np.array_equal(group_sizes, np.diff(d.get_uint_info('group_ptr')))
assert np.array_equal(group_limits, d.get_uint_info('group_ptr'))
assert np.array_equal(group_sizes, np.diff(d.get_uint_info("group_ptr")))
assert np.array_equal(group_sizes, np.diff(d.get_uint_info("group_ptr")))
assert np.array_equal(group_limits, d.get_uint_info("group_ptr"))
6 changes: 3 additions & 3 deletions tests/python/test_updaters.py
Original file line number Diff line number Diff line change
Expand Up @@ -233,7 +233,7 @@ def test_max_cat(self, tree_method: str) -> None:
strategies.integers(1, 2),
strategies.integers(4, 7),
)
@settings(deadline=None, print_blob=True)
@settings(deadline=None, print_blob=True, max_examples=10)
@pytest.mark.skipif(**tm.no_pandas())
def test_categorical_ohe(
self, rows: int, cols: int, rounds: int, cats: int
Expand Down Expand Up @@ -288,7 +288,7 @@ def test_categorical(
cat_parameter_strategy,
strategies.sampled_from(["hist", "approx"]),
)
@settings(deadline=None, print_blob=True)
@settings(deadline=None, print_blob=True, max_examples=10)
def test_categorical_ames_housing(
self,
hist_parameters: Dict[str, Any],
Expand All @@ -308,7 +308,7 @@ def test_categorical_ames_housing(
strategies.integers(3, 8),
strategies.integers(4, 7),
)
@settings(deadline=None, print_blob=True)
@settings(deadline=None, print_blob=True, max_examples=10)
@pytest.mark.skipif(**tm.no_pandas())
def test_categorical_missing(self, rows: int, cols: int, cats: int) -> None:
check_categorical_missing(
Expand Down
Loading
Loading