备注
前往结尾 以下载完整示例代码。
对外部内存的实验性支持
这与 quantile_data_iterator.py 中的类似,但针对的是外部内存而非 Quantile DMatrix。该功能尚未准备好用于生产环境。
Added in version 1.5.0.
更多详情请参阅 教程。
import os
import tempfile
from typing import Callable, List, Tuple
import numpy as np
from sklearn.datasets import make_regression
import xgboost
def make_batches(
n_samples_per_batch: int,
n_features: int,
n_batches: int,
tmpdir: str,
) -> List[Tuple[str, str]]:
files: List[Tuple[str, str]] = []
rng = np.random.RandomState(1994)
for i in range(n_batches):
X, y = make_regression(n_samples_per_batch, n_features, random_state=rng)
X_path = os.path.join(tmpdir, "X-" + str(i) + ".npy")
y_path = os.path.join(tmpdir, "y-" + str(i) + ".npy")
np.save(X_path, X)
np.save(y_path, y)
files.append((X_path, y_path))
return files
class Iterator(xgboost.DataIter):
"""A custom iterator for loading files in batches."""
def __init__(self, file_paths: List[Tuple[str, str]]) -> None:
self._file_paths = file_paths
self._it = 0
# XGBoost will generate some cache files under current directory with the prefix
# "cache"
super().__init__(cache_prefix=os.path.join(".", "cache"))
def load_file(self) -> Tuple[np.ndarray, np.ndarray]:
X_path, y_path = self._file_paths[self._it]
X = np.load(X_path)
y = np.load(y_path)
assert X.shape[0] == y.shape[0]
return X, y
def next(self, input_data: Callable) -> int:
"""Advance the iterator by 1 step and pass the data to XGBoost. This function is
called by XGBoost during the construction of ``DMatrix``
"""
if self._it == len(self._file_paths):
# return 0 to let XGBoost know this is the end of iteration
return 0
# input_data is a function passed in by XGBoost who has the similar signature to
# the ``DMatrix`` constructor.
X, y = self.load_file()
input_data(data=X, label=y)
self._it += 1
return 1
def reset(self) -> None:
"""Reset the iterator to its beginning"""
self._it = 0
def main(tmpdir: str) -> xgboost.Booster:
# generate some random data for demo
files = make_batches(1024, 17, 31, tmpdir)
it = Iterator(files)
# For non-data arguments, specify it here once instead of passing them by the `next`
# method.
missing = np.nan
Xy = xgboost.DMatrix(it, missing=missing, enable_categorical=False)
# ``approx`` is also supported, but less efficient due to sketching. GPU behaves
# differently than CPU tree methods as it uses a hybrid approach. See tutorial in
# doc for details.
booster = xgboost.train(
{"tree_method": "hist", "max_depth": 4},
Xy,
evals=[(Xy, "Train")],
num_boost_round=10,
)
return booster
if __name__ == "__main__":
with tempfile.TemporaryDirectory() as tmpdir:
main(tmpdir)