.. DO NOT EDIT.
.. THIS FILE WAS AUTOMATICALLY GENERATED BY SPHINX-GALLERY.
.. TO MAKE CHANGES, EDIT THE SOURCE PYTHON FILE:
.. "python/examples/spark_estimator_examples.py"
.. LINE NUMBERS ARE GIVEN BELOW.

.. only:: html

    .. note::
        :class: sphx-glr-download-link-note

        :ref:`Go to the end <sphx_glr_download_python_examples_spark_estimator_examples.py>`
        to download the full example code.

.. rst-class:: sphx-glr-example-title

.. _sphx_glr_python_examples_spark_estimator_examples.py:


Collection of examples for using xgboost.spark estimator interface
==================================================================

@author: Weichen Xu

.. GENERATED FROM PYTHON SOURCE LINES 7-99

.. code-block:: Python


    import sklearn.datasets
    from pyspark.ml.evaluation import MulticlassClassificationEvaluator, RegressionEvaluator
    from pyspark.ml.linalg import Vectors
    from pyspark.sql import SparkSession
    from pyspark.sql.functions import rand
    from sklearn.model_selection import train_test_split

    from xgboost.spark import SparkXGBClassifier, SparkXGBRegressor

    # One local Spark session is shared by every example below and stopped at the end.
    spark = SparkSession.builder.master("local[*]").getOrCreate()


    def create_spark_df(X, y):
        """Build a Spark DataFrame with ``features`` and ``label`` columns.

        Each row pairs a dense vector built from one entry of ``X`` with the
        matching entry of ``y`` converted to ``float``.  ``X`` and ``y`` are
        zipped together, so surplus entries of the longer input are dropped.
        Uses the module-level ``spark`` session.
        """
        return spark.createDataFrame(
            spark.sparkContext.parallelize(
                [(Vectors.dense(features), float(label)) for features, label in zip(X, y)]
            ),
            ["features", "label"],
        )


    # load diabetes dataset (regression dataset)
    diabetes_X, diabetes_y = sklearn.datasets.load_diabetes(return_X_y=True)
    # NOTE: no random_state is passed, so the split (and the printed metrics)
    # differ from run to run.
    diabetes_X_train, diabetes_X_test, diabetes_y_train, diabetes_y_test = train_test_split(
        diabetes_X, diabetes_y, test_size=0.3, shuffle=True
    )

    diabetes_train_spark_df = create_spark_df(diabetes_X_train, diabetes_y_train)
    diabetes_test_spark_df = create_spark_df(diabetes_X_test, diabetes_y_test)

    # train xgboost regressor model
    xgb_regressor = SparkXGBRegressor(max_depth=5)
    xgb_regressor_model = xgb_regressor.fit(diabetes_train_spark_df)

    transformed_diabetes_test_spark_df = xgb_regressor_model.transform(
        diabetes_test_spark_df
    )
    regressor_evaluator = RegressionEvaluator(metricName="rmse")
    print(
        f"regressor rmse={regressor_evaluator.evaluate(transformed_diabetes_test_spark_df)}"
    )

    # Add a boolean column that is True where a seeded random value exceeds 0.7.
    # The estimator below is pointed at it through ``validation_indicator_col``
    # (presumably True marks the rows held out for validation -- see the
    # xgboost.spark documentation).
    diabetes_train_spark_df2 = diabetes_train_spark_df.withColumn(
        "validationIndicatorCol", rand(1) > 0.7
    )

    # train xgboost regressor model with validation dataset
    xgb_regressor2 = SparkXGBRegressor(
        max_depth=5, validation_indicator_col="validationIndicatorCol"
    )
    xgb_regressor_model2 = xgb_regressor2.fit(diabetes_train_spark_df2)
    transformed_diabetes_test_spark_df2 = xgb_regressor_model2.transform(
        diabetes_test_spark_df
    )
    print(
        f"regressor2 rmse={regressor_evaluator.evaluate(transformed_diabetes_test_spark_df2)}"
    )


    # load iris dataset (classification dataset)
    iris_X, iris_y = sklearn.datasets.load_iris(return_X_y=True)
    iris_X_train, iris_X_test, iris_y_train, iris_y_test = train_test_split(
        iris_X, iris_y, test_size=0.3, shuffle=True
    )

    iris_train_spark_df = create_spark_df(iris_X_train, iris_y_train)
    iris_test_spark_df = create_spark_df(iris_X_test, iris_y_test)

    # train xgboost classifier model
    xgb_classifier = SparkXGBClassifier(max_depth=5)
    xgb_classifier_model = xgb_classifier.fit(iris_train_spark_df)

    transformed_iris_test_spark_df = xgb_classifier_model.transform(iris_test_spark_df)
    classifier_evaluator = MulticlassClassificationEvaluator(metricName="f1")
    print(f"classifier f1={classifier_evaluator.evaluate(transformed_iris_test_spark_df)}")

    # Same seeded boolean indicator column as in the regression example above.
    iris_train_spark_df2 = iris_train_spark_df.withColumn(
        "validationIndicatorCol", rand(1) > 0.7
    )

    # train xgboost classifier model with validation dataset
    xgb_classifier2 = SparkXGBClassifier(
        max_depth=5, validation_indicator_col="validationIndicatorCol"
    )
    xgb_classifier_model2 = xgb_classifier2.fit(iris_train_spark_df2)
    transformed_iris_test_spark_df2 = xgb_classifier_model2.transform(iris_test_spark_df)
    print(
        f"classifier2 f1={classifier_evaluator.evaluate(transformed_iris_test_spark_df2)}"
    )

    spark.stop()


.. _sphx_glr_download_python_examples_spark_estimator_examples.py:

.. only:: html

  .. container:: sphx-glr-footer sphx-glr-footer-example

    .. container:: sphx-glr-download sphx-glr-download-jupyter

      :download:`Download Jupyter notebook: spark_estimator_examples.ipynb <spark_estimator_examples.ipynb>`

    .. container:: sphx-glr-download sphx-glr-download-python

      :download:`Download Python source code: spark_estimator_examples.py <spark_estimator_examples.py>`

    .. container:: sphx-glr-download sphx-glr-download-zip

      :download:`Download zipped: spark_estimator_examples.zip <spark_estimator_examples.zip>`


.. only:: html

 .. rst-class:: sphx-glr-signature

    `Gallery generated by Sphinx-Gallery <https://sphinx-gallery.github.io>`_