Symbolic Regression and Classification
Symbolic regression and classification aim to optimize an interpretable algebraic equation. TPOT lets you combine this approach with classical machine learning operators.
We can use either the TreePipeline or the GraphSearchPipeline to build the search space for symbolic equations, since neither has a fixed pipeline structure; both optimize their own sequence and structure.
The strategy is to set the leaf nodes to each select a single feature (using FSSNode), all inner nodes to arithmetic operators, and the root node to a classifier or regressor.
Note: this is still experimental. There is plenty of room to improve the optimization process. In the future, symbolic regression/classification may get its own dedicated search space class.
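To make the node roles concrete before looking at the TPOT code, here is a minimal hand-built sketch of the kind of pipeline this strategy can discover, written in plain scikit-learn rather than with TPOT's FSSNode or arithmetic operator nodes. The feature indices and the multiplication are arbitrary choices for illustration only:

import numpy as np
from sklearn.datasets import make_classification
from sklearn.linear_model import LogisticRegression
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import FunctionTransformer

X, y = make_classification(n_samples=200, n_features=10, random_state=0)

# "leaf" nodes: pick features 2 and 7; "inner" node: multiply them;
# the single derived feature is all the "root" classifier sees
multiply_two_features = FunctionTransformer(lambda X: X[:, [2]] * X[:, [7]])

clf = make_pipeline(multiply_two_features, LogisticRegression())
clf.fit(X, y)
print(clf.score(X, y))

TPOT searches over structures of exactly this shape, but chooses which features, which arithmetic operators, and which root model to use.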
In [8]:
import tpot2
from tpot2.search_spaces.pipelines import GraphSearchPipeline
from tpot2.search_spaces.nodes import FSSNode
from tpot2.config import get_search_space
import sklearn.datasets
import sklearn.metrics
import sklearn.model_selection
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split
import numpy as np
Symbolic Classification
In [9]:
X, y = sklearn.datasets.make_classification(
    n_samples=1000, n_features=100, n_informative=6, n_redundant=0,
    n_repeated=0, n_classes=2, n_clusters_per_class=2, weights=None,
    flip_y=0.01, class_sep=1.0, hypercube=True, shift=0.0, scale=1.0,
    shuffle=True, random_state=None,
)
X_train, X_test, y_train, y_test = sklearn.model_selection.train_test_split(X, y, train_size=0.75, test_size=0.25)
In [36]:
symbolic_classification_search_space = GraphSearchPipeline(
    root_search_space=get_search_space("LogisticRegression"),   # root node: the classifier
    leaf_search_space=FSSNode(subsets=X_train.shape[1]),        # leaf nodes: select individual features
    inner_search_space=get_search_space(["arithmatic"]),        # inner nodes: arithmetic operators
    max_size=20,
)

# example pipelines randomly sampled from the search space
ind = symbolic_classification_search_space.generate(rng=5)
for i in range(3):
    ind.mutate(rng=1)
est_example = ind.export_pipeline()
est_example.plot()
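The object returned by export_pipeline() is a scikit-learn style estimator, so a sampled pipeline can also be fit and scored on its own before any evolutionary search is run. A minimal sketch, assuming the exported pipeline exposes the usual fit/predict_proba interface (its root here is a LogisticRegression):

est_example.fit(X_train, y_train)
example_scorer = sklearn.metrics.get_scorer('roc_auc_ovr')
print(example_scorer(est_example, X_test, y_test))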
In [37]:
est = tpot2.TPOTEstimator(
    generations=20,
    max_time_mins=None,
    scorers=['roc_auc_ovr'],
    scorers_weights=[1],
    # second objective: penalize pipeline size (number of nodes)
    other_objective_functions=[tpot2.objectives.number_of_nodes_objective],
    other_objective_functions_weights=[-1],
    n_jobs=32,
    classification=True,
    search_space=symbolic_classification_search_space,
    verbose=1,
)

scorer = sklearn.metrics.get_scorer('roc_auc_ovo')

est.fit(X_train, y_train)

print(scorer(est, X_test, y_test))
est.fitted_pipeline_.plot()
Generation: 100%|██████████| 20/20 [00:40<00:00, 2.00s/it]
/home/perib/miniconda3/envs/myenv/lib/python3.10/site-packages/sklearn/linear_model/_sag.py:349: ConvergenceWarning: The max_iter was reached which means the coef_ did not converge
  warnings.warn(
0.7341062801932366
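For a quick sanity check, other scikit-learn scorers can be applied to the fitted estimator in the same way as the ROC AUC scorer above. A small sketch using standard scorer names accepted by sklearn.metrics.get_scorer:

for name in ['accuracy', 'balanced_accuracy', 'roc_auc_ovr']:
    s = sklearn.metrics.get_scorer(name)
    print(name, s(est, X_test, y_test))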
In [43]:
import seaborn as sns
import matplotlib.pyplot as plt

df = est.evaluated_individuals

# all evaluated pipelines, with the Pareto front highlighted
fig, ax = plt.subplots(figsize=(5, 5))
sns.scatterplot(df[df['Pareto_Front']!=1], y='roc_auc_score', x='number_of_nodes_objective', label='other', ax=ax)
sns.scatterplot(df[df['Pareto_Front']==1], y='roc_auc_score', x='number_of_nodes_objective', label='Pareto Front', ax=ax)
ax.title.set_text('Performance of all pipelines')
plt.show()

# Pareto front only
fig, ax = plt.subplots(figsize=(10, 5))
sns.scatterplot(df[df['Pareto_Front']==1], y='roc_auc_score', x='number_of_nodes_objective', label='Pareto Front', ax=ax)
ax.title.set_text('Performance of only the Pareto Front')
plt.show()
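Beyond the scatter plots, the accuracy/complexity trade-off can also be read off as a table. A small sketch, assuming evaluated_individuals is the same pandas DataFrame used above with the 'Pareto_Front', 'number_of_nodes_objective', and 'roc_auc_score' columns:

pareto = df[df['Pareto_Front'] == 1]
print(pareto[['number_of_nodes_objective', 'roc_auc_score']]
      .sort_values('number_of_nodes_objective'))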
Symbolic Regression
In [44]:
import tpot2
import sklearn.datasets
import sklearn.metrics
import sklearn.model_selection

scorer = sklearn.metrics.get_scorer('neg_mean_squared_error')

X, y = sklearn.datasets.load_diabetes(return_X_y=True)
X_train, X_test, y_train, y_test = sklearn.model_selection.train_test_split(X, y, train_size=0.75, test_size=0.25)

graph_search_space = tpot2.search_spaces.pipelines.GraphSearchPipeline(
    root_search_space=tpot2.config.get_search_space("SGDRegressor"),                # root node: the regressor
    leaf_search_space=tpot2.search_spaces.nodes.FSSNode(subsets=X_train.shape[1]),  # leaf nodes: single features
    inner_search_space=tpot2.config.get_search_space(["arithmatic"]),               # inner nodes: arithmetic operators
    max_size=10,
)

est = tpot2.TPOTEstimator(
    generations=20,
    max_time_mins=None,
    scorers=['neg_mean_squared_error'],
    scorers_weights=[1],
    other_objective_functions=[tpot2.objectives.number_of_nodes_objective],
    other_objective_functions_weights=[-1],
    n_jobs=32,
    classification=False,
    search_space=graph_search_space,
    verbose=2,
)

est.fit(X_train, y_train)

print(scorer(est, X_test, y_test))
est.fitted_pipeline_.plot()
Generation: 100%|██████████| 20/20 [00:32<00:00, 1.63s/it]
-3452.5150085210244
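The value printed above is sklearn's neg_mean_squared_error, so scores closer to zero are better. To report the error in the target's own units, the fitted estimator can be evaluated directly; a minimal sketch using sklearn.metrics.mean_squared_error, assuming est exposes the usual predict interface:

import numpy as np
import sklearn.metrics

y_pred = est.predict(X_test)
mse = sklearn.metrics.mean_squared_error(y_test, y_pred)
print(mse, np.sqrt(mse))  # MSE and RMSE on the held-out diabetes data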
In [48]:
import seaborn as sns
import matplotlib.pyplot as plt

df = est.evaluated_individuals
# flip the sign: the column stores the neg_mean_squared_error score, but we plot the MSE itself
df['mean_squared_error'] = -df['mean_squared_error']

# all evaluated pipelines, with the Pareto front highlighted (log-scaled y axis)
fig, ax = plt.subplots(figsize=(5, 5))
sns.scatterplot(df[df['Pareto_Front']!=1], y='mean_squared_error', x='number_of_nodes_objective', label='other', ax=ax)
sns.scatterplot(df[df['Pareto_Front']==1], y='mean_squared_error', x='number_of_nodes_objective', label='Pareto Front', ax=ax)
ax.title.set_text('Performance of all pipelines')
ax.set_yscale('log')
plt.show()

# Pareto front only
fig, ax = plt.subplots(figsize=(10, 5))
sns.scatterplot(df[df['Pareto_Front']==1], y='mean_squared_error', x='number_of_nodes_objective', label='Pareto Front', ax=ax)
ax.title.set_text('Performance of only the Pareto Front')
plt.show()