滚动回归#

  • 配对交易是一种在算法交易中著名的技术,它将两只股票相互对冲。

  • 为了实现这一点,股票必须具有相关性(协整)。

  • 一个常见的例子是黄金(GLD)的价格和黄金开采业务(GFI)的价格。

import os
import warnings

import arviz as az
import matplotlib.pyplot as plt
import matplotlib.ticker as mticker
import numpy as np
import pandas as pd
import pymc as pm
import xarray as xr

from matplotlib import MatplotlibDeprecationWarning

warnings.filterwarnings(action="ignore", category=MatplotlibDeprecationWarning)
RANDOM_SEED = 8927
rng = np.random.default_rng(RANDOM_SEED)
%config InlineBackend.figure_format = 'retina'
az.style.use("arviz-darkgrid")

让我们加载GFI和GLD的价格。

# from pandas_datareader import data
# prices = data.GoogleDailyReader(symbols=['GLD', 'GFI'], end='2014-8-1').read().loc['Open', :, :]
try:
    prices = pd.read_csv(os.path.join("..", "data", "stock_prices.csv")).dropna()
except FileNotFoundError:
    prices = pd.read_csv(pm.get_data("stock_prices.csv")).dropna()

prices["Date"] = pd.DatetimeIndex(prices["Date"])
prices = prices.set_index("Date")
prices_zscored = (prices - prices.mean()) / prices.std()
prices.head()
GFI GLD
Date
2010-01-04 13.55 109.82
2010-01-05 13.51 109.88
2010-01-06 13.70 110.71
2010-01-07 13.63 111.07
2010-01-08 13.72 111.52

绘制价格随时间变化的趋势表明存在很强的相关性。然而,这种相关性似乎会随时间变化。

fig = plt.figure(figsize=(9, 6))
ax = fig.add_subplot(111, xlabel=r"Price GFI in \$", ylabel=r"Price GLD in \$")
colors = np.linspace(0, 1, len(prices))
mymap = plt.get_cmap("viridis")
sc = ax.scatter(prices.GFI, prices.GLD, c=colors, cmap=mymap, lw=0)
ticks = colors[:: len(prices) // 10]
ticklabels = [str(p.date()) for p in prices[:: len(prices) // 10].index]
cb = plt.colorbar(sc, ticks=ticks)
cb.ax.set_yticklabels(ticklabels);
../_images/6527831dd88c242e288d444e294f895c0a50eb792a59369a79e32f9290e5c959.png

一个简单的方法是估计一个线性模型并忽略时间域。

with pm.Model() as model:  # model specifications in PyMC are wrapped in a with-statement
    # Define priors
    sigma = pm.HalfCauchy("sigma", beta=10)
    alpha = pm.Normal("alpha", mu=0, sigma=20)
    beta = pm.Normal("beta", mu=0, sigma=20)

    mu = pm.Deterministic("mu", alpha + beta * prices_zscored.GFI.to_numpy())

    # Define likelihood
    likelihood = pm.Normal("y", mu=mu, sigma=sigma, observed=prices_zscored.GLD.to_numpy())

    # Inference
    trace_reg = pm.sample(tune=2000)
100.00% [12000/12000 00:00<00:00 Sampling 4 chains, 0 divergences]
Sampling 4 chains for 2_000 tune and 1_000 draw iterations (8_000 + 4_000 draws total) took 1 seconds.

后验预测图显示了拟合的糟糕程度。

fig = plt.figure(figsize=(9, 6))
ax = fig.add_subplot(
    111,
    xlabel=r"Price GFI in \$",
    ylabel=r"Price GLD in \$",
    title="Posterior predictive regression lines",
)
sc = ax.scatter(prices_zscored.GFI, prices_zscored.GLD, c=colors, cmap=mymap, lw=0)

xi = xr.DataArray(prices_zscored.GFI.values)
az.plot_hdi(
    xi,
    trace_reg.posterior.mu,
    color="k",
    hdi_prob=0.95,
    ax=ax,
    fill_kwargs={"alpha": 0.25},
    smooth=False,
)
az.plot_hdi(
    xi,
    trace_reg.posterior.mu,
    color="k",
    hdi_prob=0.5,
    ax=ax,
    fill_kwargs={"alpha": 0.25},
    smooth=False,
)

cb = plt.colorbar(sc, ticks=ticks)
cb.ax.set_yticklabels(ticklabels);
../_images/82bdfeee2a1d52c979e8e9af902948f0e8ab01977be65b2fcde1e301990e6149.png

滚动回归#

接下来,我们将构建一个改进的模型,该模型将允许回归系数随时间变化。具体来说,我们将假设截距和斜率随时间进行随机游走。这一想法类似于case_studies/stochastic_volatility

\[ \alpha_t \sim \mathcal{N}(\alpha_{t-1}, \sigma_\alpha^2) \]
\[ \beta_t \sim \mathcal{N}(\beta_{t-1}, \sigma_\beta^2) \]

首先,让我们为\(\sigma_\alpha^2\)\(\sigma_\beta^2\)定义超先验。这个参数可以解释为回归系数中的波动性。

with pm.Model(coords={"time": prices.index.values}) as model_randomwalk:
    # std of random walk
    sigma_alpha = pm.Exponential("sigma_alpha", 50.0)
    sigma_beta = pm.Exponential("sigma_beta", 50.0)

    alpha = pm.GaussianRandomWalk(
        "alpha", sigma=sigma_alpha, init_dist=pm.Normal.dist(0, 10), dims="time"
    )
    beta = pm.GaussianRandomWalk(
        "beta", sigma=sigma_beta, init_dist=pm.Normal.dist(0, 10), dims="time"
    )

执行给定系数和数据的回归,并通过似然法与数据建立链接。

with model_randomwalk:
    # Define regression
    regression = alpha + beta * prices_zscored.GFI.values

    # Assume prices are Normally distributed, the mean comes from the regression.
    sd = pm.HalfNormal("sd", sigma=0.1)
    likelihood = pm.Normal("y", mu=regression, sigma=sd, observed=prices_zscored.GLD.to_numpy())

推理。尽管这是一个相当复杂的模型,NUTS 处理得很好。

with model_randomwalk:
    trace_rw = pm.sample(tune=2000, target_accept=0.9)
100.00% [12000/12000 06:22<00:00 Sampling 4 chains, 0 divergences]
Sampling 4 chains for 2_000 tune and 1_000 draw iterations (8_000 + 4_000 draws total) took 383 seconds.

增加树的深度确实有帮助,但这会使采样变得非常慢。不过,这次运行的结果看起来是相同的。

结果分析#

如下所示,\(\alpha\),即截距,随时间变化。

fig = plt.figure(figsize=(8, 6), constrained_layout=False)
ax = plt.subplot(111, xlabel="time", ylabel="alpha", title="Change of alpha over time.")
ax.plot(az.extract(trace_rw, var_names="alpha"), "r", alpha=0.05)

ticks_changes = mticker.FixedLocator(ax.get_xticks().tolist())
ticklabels_changes = [str(p.date()) for p in prices[:: len(prices) // 7].index]
ax.xaxis.set_major_locator(ticks_changes)
ax.set_xticklabels(ticklabels_changes)

fig.autofmt_xdate()
../_images/22211de6fc61aceeee2dfcaaae455e927576ee9726a8533f6905d51aed63337c.png

斜率也是如此。

fig = plt.figure(figsize=(8, 6), constrained_layout=False)
ax = fig.add_subplot(111, xlabel="time", ylabel="beta", title="Change of beta over time")
ax.plot(az.extract(trace_rw, var_names="beta"), "b", alpha=0.05)

ax.xaxis.set_major_locator(ticks_changes)
ax.set_xticklabels(ticklabels_changes)

fig.autofmt_xdate()
../_images/c1696a2ef6f96e214d44365667cf2835b3c7367f6a6b7ed7a752757308315645.png

后验预测图显示,我们更好地捕捉了回归随时间的变化。请注意,我们应该使用收益率而不是价格。模型的工作原理仍然相同,但可视化效果不会那么清晰。

fig = plt.figure(figsize=(8, 6))
ax = fig.add_subplot(
    111,
    xlabel=r"Price GFI in \$",
    ylabel=r"Price GLD in \$",
    title="Posterior predictive regression lines",
)

colors = np.linspace(0, 1, len(prices))
colors_sc = np.linspace(0, 1, len(prices.index.values[::50]))

xi = xr.DataArray(np.linspace(prices_zscored.GFI.min(), prices_zscored.GFI.max(), 50))

for i, time in enumerate(prices.index.values[::50]):
    sel_trace = az.extract(trace_rw).sel(time=time)
    regression_line = sel_trace["alpha"] + sel_trace["beta"] * xi
    ax.plot(
        xi,
        regression_line.T[:, 0::200],
        color=mymap(colors_sc[i]),
        alpha=0.1,
        zorder=10,
        linewidth=3,
    )

sc = ax.scatter(
    prices_zscored.GFI, prices_zscored.GLD, label="data", cmap=mymap, c=colors, zorder=11
)

cb = plt.colorbar(sc, ticks=ticks)
cb.ax.set_yticklabels(ticklabels);
../_images/0805ba0651a73e698974d6549408ed4a641c0ebff920d670f142fd31d6590678.png

作者#

水印#

%load_ext watermark
%watermark -n -u -v -iv -w -p pytensor,aeppl,xarray
Last updated: Sun Feb 05 2023

Python implementation: CPython
Python version       : 3.10.9
IPython version      : 8.9.0

pytensor: 2.8.11
aeppl   : not installed
xarray  : 2023.1.0

arviz     : 0.14.0
xarray    : 2023.1.0
numpy     : 1.24.1
matplotlib: 3.6.3
pymc      : 5.0.1
pandas    : 1.5.3

Watermark: 2.3.1

许可证声明#

本示例库中的所有笔记本均在MIT许可证下提供,该许可证允许修改和重新分发,前提是保留版权和许可证声明。

引用 PyMC 示例#

要引用此笔记本,请使用Zenodo为pymc-examples仓库提供的DOI。

重要

许多笔记本是从其他来源改编的:博客、书籍……在这种情况下,您应该引用原始来源。

同时记得引用代码中使用的相关库。

这是一个BibTeX的引用模板:

@incollection{citekey,
  author    = "<notebook authors, see above>",
  title     = "<notebook title>",
  editor    = "PyMC Team",
  booktitle = "PyMC examples",
  doi       = "10.5281/zenodo.5654871"
}

渲染后可能看起来像: