GLM: 分层回归模型上的小批量ADVI#

与高斯混合模型不同,(分层)回归模型具有自变量。这些变量影响似然函数,但不是随机变量。在使用小批量时,我们应该注意这一点。

%env PYTENSOR_FLAGS=device=cpu, floatX=float32, warn_float64=ignore

import os

import arviz as az
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import pymc as pm
import pytensor
import pytensor.tensor as pt
import seaborn as sns

from scipy import stats

print(f"Running on PyMC v{pm.__version__}")
env: PYTENSOR_FLAGS=device=cpu, floatX=float32, warn_float64=ignore
Running on PyMC v5.0.1
%config InlineBackend.figure_format = 'retina'
RANDOM_SEED = 8927
rng = np.random.default_rng(RANDOM_SEED)
az.style.use("arviz-darkgrid")
try:
    data = pd.read_csv(os.path.join("..", "data", "radon.csv"))
except FileNotFoundError:
    data = pd.read_csv(pm.get_data("radon.csv"))

data
Unnamed: 0 idnum state state2 stfips zip region typebldg floor room ... pcterr adjwt dupflag zipflag cntyfips county fips Uppm county_code log_radon
0 0 5081.0 MN MN 27.0 55735 5.0 1.0 1.0 3.0 ... 9.7 1146.499190 1.0 0.0 1.0 AITKIN 27001.0 0.502054 0 0.832909
1 1 5082.0 MN MN 27.0 55748 5.0 1.0 0.0 4.0 ... 14.5 471.366223 0.0 0.0 1.0 AITKIN 27001.0 0.502054 0 0.832909
2 2 5083.0 MN MN 27.0 55748 5.0 1.0 0.0 4.0 ... 9.6 433.316718 0.0 0.0 1.0 AITKIN 27001.0 0.502054 0 1.098612
3 3 5084.0 MN MN 27.0 56469 5.0 1.0 0.0 4.0 ... 24.3 461.623670 0.0 0.0 1.0 AITKIN 27001.0 0.502054 0 0.095310
4 4 5085.0 MN MN 27.0 55011 3.0 1.0 0.0 4.0 ... 13.8 433.316718 0.0 0.0 3.0 ANOKA 27003.0 0.428565 1 1.163151
... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ...
914 914 5995.0 MN MN 27.0 55363 5.0 1.0 0.0 4.0 ... 4.5 1146.499190 0.0 0.0 171.0 WRIGHT 27171.0 0.913909 83 1.871802
915 915 5996.0 MN MN 27.0 55376 5.0 1.0 0.0 7.0 ... 8.3 1105.956867 0.0 0.0 171.0 WRIGHT 27171.0 0.913909 83 1.526056
916 916 5997.0 MN MN 27.0 55376 5.0 1.0 0.0 4.0 ... 5.2 1214.922779 0.0 0.0 171.0 WRIGHT 27171.0 0.913909 83 1.629241
917 917 5998.0 MN MN 27.0 56297 5.0 1.0 0.0 4.0 ... 9.6 1177.377355 0.0 0.0 173.0 YELLOW MEDICINE 27173.0 1.426590 84 1.335001
918 918 5999.0 MN MN 27.0 56297 5.0 1.0 0.0 4.0 ... 8.0 1214.922779 0.0 0.0 173.0 YELLOW MEDICINE 27173.0 1.426590 84 1.098612

919行 × 30列

county_idx = data["county_code"].values
floor_idx = data["floor"].values
log_radon_idx = data["log_radon"].values

coords = {"counties": data.county.unique()}

在这里,log_radon_idx_t 是一个因变量,而 floor_idx_tcounty_idx_t 决定了自变量。

log_radon_idx_t = pm.Minibatch(log_radon_idx, batch_size=100)
floor_idx_t = pm.Minibatch(floor_idx, batch_size=100)
county_idx_t = pm.Minibatch(county_idx, batch_size=100)
with pm.Model(coords=coords) as hierarchical_model:
    # Hyperpriors for group nodes
    mu_a = pm.Normal("mu_alpha", mu=0.0, sigma=100**2)
    sigma_a = pm.Uniform("sigma_alpha", lower=0, upper=100)
    mu_b = pm.Normal("mu_beta", mu=0.0, sigma=100**2)
    sigma_b = pm.Uniform("sigma_beta", lower=0, upper=100)

每个县的截距,分布在组均值 mu_a 周围。在上面的例子中,我们只是将 musd 设置为固定值,而在这里我们为所有的 ab(它们是与示例中唯一县的数量相同长度的向量)插入了一个共同的组分布。

with hierarchical_model:
    a = pm.Normal("alpha", mu=mu_a, sigma=sigma_a, dims="counties")
    # Intercept for each county, distributed around group mean mu_a
    b = pm.Normal("beta", mu=mu_b, sigma=sigma_b, dims="counties")

氡水平的模型预测 a[county_idx] 转换为 a[0, 0, 0, 1, 1, ...],因此我们将一个县的多个家庭测量值与其系数相关联。

with hierarchical_model:
    radon_est = a[county_idx_t] + b[county_idx_t] * floor_idx_t

最后,我们指定似然函数:

with hierarchical_model:
    # Model error
    eps = pm.Uniform("eps", lower=0, upper=100)

    # Data likelihood
    radon_like = pm.Normal(
        "radon_like", mu=radon_est, sigma=eps, observed=log_radon_idx_t, total_size=len(data)
    )

随机变量 radon_like,与 log_radon_idx_t 相关联,应传递给函数以表示在似然项中的观测值。

另一方面,minibatches 应该包括上述三个变量。

然后,使用小批量运行ADVI。

with hierarchical_model:
    approx = pm.fit(100_000, callbacks=[pm.callbacks.CheckParametersConvergence(tolerance=1e-4)])

idata_advi = approx.sample(500)
/home/fonnesbeck/mambaforge/envs/pie/lib/python3.9/site-packages/pymc/logprob/joint_logprob.py:167: UserWarning: Found a random variable that was neither among the observations nor the conditioned variables: [integers_rv{0, (0, 0), int64, False}.0, integers_rv{0, (0, 0), int64, False}.out]
  warnings.warn(
/home/fonnesbeck/mambaforge/envs/pie/lib/python3.9/site-packages/pymc/logprob/joint_logprob.py:167: UserWarning: Found a random variable that was neither among the observations nor the conditioned variables: [integers_rv{0, (0, 0), int64, False}.0, integers_rv{0, (0, 0), int64, False}.out]
  warnings.warn(
/home/fonnesbeck/mambaforge/envs/pie/lib/python3.9/site-packages/pymc/logprob/joint_logprob.py:167: UserWarning: Found a random variable that was neither among the observations nor the conditioned variables: [integers_rv{0, (0, 0), int64, False}.0, integers_rv{0, (0, 0), int64, False}.out]
  warnings.warn(
/home/fonnesbeck/mambaforge/envs/pie/lib/python3.9/site-packages/pymc/logprob/joint_logprob.py:167: UserWarning: Found a random variable that was neither among the observations nor the conditioned variables: [integers_rv{0, (0, 0), int64, False}.0, integers_rv{0, (0, 0), int64, False}.out]
  warnings.warn(
11.50% [11499/100000 00:03<00:27 Average Loss = 263.87]

检查ELBO的轨迹并与MCMC的结果进行比较。

plt.plot(approx.hist);
../../../_images/af9b90617fc072dcb41643f8d1228130d436ae7589248a7adcc8582793cf8ce3.png

我们可以从均值场近似中提取协方差矩阵,并将其用作NUTS算法的缩放矩阵。

scaling = approx.cov.eval()

此外,我们可以生成样本(每个链一个),用作采样器的起始点。

n_chains = 4
sample = approx.sample(return_inferencedata=False, size=n_chains)
start_dict = list(sample[i] for i in range(n_chains))
# Inference button (TM)!
with pm.Model(coords=coords):
    mu_a = pm.Normal("mu_alpha", mu=0.0, sigma=100**2)
    sigma_a = pm.Uniform("sigma_alpha", lower=0, upper=100)
    mu_b = pm.Normal("mu_beta", mu=0.0, sigma=100**2)
    sigma_b = pm.Uniform("sigma_beta", lower=0, upper=100)

    a = pm.Normal("alpha", mu=mu_a, sigma=sigma_a, dims="counties")
    b = pm.Normal("beta", mu=mu_b, sigma=sigma_b, dims="counties")

    # Model error
    eps = pm.Uniform("eps", lower=0, upper=100)

    radon_est = a[county_idx] + b[county_idx] * floor_idx

    radon_like = pm.Normal("radon_like", mu=radon_est, sigma=eps, observed=log_radon_idx)

    # essentially, this is what init='advi' does
    step = pm.NUTS(scaling=scaling, is_cov=True)
    hierarchical_trace = pm.sample(draws=2000, step=step, chains=n_chains, initvals=start_dict)
Multiprocess sampling (4 chains in 4 jobs)
NUTS: [mu_alpha, sigma_alpha, mu_beta, sigma_beta, alpha, beta, eps]
100.00% [8000/8000 08:57<00:00 Sampling 4 chains, 0 divergences]
Sampling 4 chains for 1_000 tune and 1_000 draw iterations (4_000 + 4_000 draws total) took 538 seconds.
az.plot_density(
    data=[idata_advi, hierarchical_trace],
    var_names=["~alpha", "~beta"],
    data_labels=["ADVI", "NUTS"],
);
../../../_images/18875cd5cae6a7bbdb3a6f263a7ced96e137b1c84fa7648cda85b425255a7d6c.png

水印#

%load_ext watermark
%watermark -n -u -v -iv -w -p xarray
Last updated: Thu Sep 23 2021

Python implementation: CPython
Python version       : 3.8.10
IPython version      : 7.25.0

xarray: 0.17.0

numpy     : 1.21.0
seaborn   : 0.11.1
pandas    : 1.2.1
matplotlib: 3.3.4
theano    : 1.1.2
pymc3     : 3.11.2
scipy     : 1.7.1
arviz     : 0.11.2

Watermark: 2.2.0