示例数据

%load_ext autoreload
%autoreload 2

core.NeuralForecast 类允许您有效地拟合多个 NeuralForecast 模型以处理大型时间序列集。它与 pandas DataFrame df 一起操作，该数据框通过 unique_id 和 ds 列识别单独的序列和日期戳，y 列表示目标时间序列变量。为了帮助开发，我们声明了一些在所有 NeuralForecast 的单元测试中使用的有用数据集。

import random
from itertools import chain
from typing import List

import numpy as np
import pandas as pd

import matplotlib.pyplot as plt

from nbdev.showdoc import add_docs, show_doc

1. 合成面板数据

def generate_series(n_series: int,
                    freq: str = 'D',
                    min_length: int = 50,
                    max_length: int = 500,
                    n_temporal_features: int = 0,
                    n_static_features: int = 0,
                    equal_ends: bool = False,
                    seed: int = 0) -> pd.DataFrame:
    """Generate Synthetic Panel Series.

    Generates `n_series` of frequency `freq` of different lengths in the interval [`min_length`, `max_length`].
    If `n_temporal_features > 0`, then each serie gets temporal features with random values.
    If `n_static_features > 0`, then a static dataframe is returned along the temporal dataframe.
    If `equal_ends == True` then all series end at the same date.

    **Parameters:**<br>
    `n_series`: int, number of series for synthetic panel.<br>
    `min_length`: int, minimal length of synthetic panel's series.<br>
    `max_length`: int, minimal length of synthetic panel's series.<br>
    `n_temporal_features`: int, default=0, number of temporal exogenous variables for synthetic panel's series.<br>
    `n_static_features`: int, default=0, number of static exogenous variables for synthetic panel's series.<br>
    `equal_ends`: bool, if True, series finish in the same date stamp `ds`.<br>
    `freq`: str, frequency of the data, [panda's available frequencies](https://pandas.pydata.org/pandas-docs/stable/user_guide/timeseries.html#offset-aliases).<br>

    **Returns:**<br>
    `freq`: pandas.DataFrame, synthetic panel with columns [`unique_id`, `ds`, `y`] and exogenous.
    """
    seasonalities = {'D': 7, 'M': 12}
    season = seasonalities[freq]

    rng = np.random.RandomState(seed)
    series_lengths = rng.randint(min_length, max_length + 1, n_series)
    total_length = series_lengths.sum()

    dates = pd.date_range('2000-01-01', periods=max_length, freq=freq).values
    uids = [
        np.repeat(i, serie_length) for i, serie_length in enumerate(series_lengths)
    ]
    if equal_ends:
        ds = [dates[-serie_length:] for serie_length in series_lengths]
    else:
        ds = [dates[:serie_length] for serie_length in series_lengths]

    y = np.arange(total_length) % season + rng.rand(total_length) * 0.5
    temporal_df = pd.DataFrame(dict(unique_id=chain.from_iterable(uids),
                                    ds=chain.from_iterable(ds),
                                    y=y))

    random.seed(seed)
    for i in range(n_temporal_features):
        random.seed(seed)
        temporal_values = [
            [random.randint(0, 100)] * serie_length for serie_length in series_lengths
        ]
        temporal_df[f'temporal_{i}'] = np.hstack(temporal_values)
        temporal_df[f'temporal_{i}'] = temporal_df[f'temporal_{i}'].astype('category')
        if i == 0:
            temporal_df['y'] = temporal_df['y'] * \
                                  (1 + temporal_df[f'temporal_{i}'].cat.codes)

    temporal_df['unique_id'] = temporal_df['unique_id'].astype('category')
    temporal_df['unique_id'] = temporal_df['unique_id'].cat.as_ordered()

    if n_static_features > 0:
        static_features = np.random.uniform(low=0.0, high=1.0, 
                        size=(n_series, n_static_features))
        static_df = pd.DataFrame.from_records(static_features, 
                           columns = [f'static_{i}'for i in  range(n_static_features)])
        
        static_df['unique_id'] = np.arange(n_series)
        static_df['unique_id'] = static_df['unique_id'].astype('category')
        static_df['unique_id'] = static_df['unique_id'].cat.as_ordered()

        return temporal_df, static_df

    return temporal_df

show_doc(generate_series, title_level=3)

synthetic_panel = generate_series(n_series=2)
synthetic_panel.groupby('unique_id').head(4)

temporal_df, static_df = generate_series(n_series=1000, n_static_features=2,
                                         n_temporal_features=4, equal_ends=False)
static_df.head(2)

2. 空中乘客数据

经典的Box & Jenkins航空数据。1949年至1960年国际航空乘客的月度总数。

由于这是一个展示明显趋势和季节性的序列，因此它已被用作多个预测库的参考，它为快速展示模型的预测性能提供了一个良好的机会。

AirPassengers = np.array([112., 118., 132., 129., 121., 135., 148., 148., 136., 119., 104.,
                          118., 115., 126., 141., 135., 125., 149., 170., 170., 158., 133.,
                          114., 140., 145., 150., 178., 163., 172., 178., 199., 199., 184.,
                          162., 146., 166., 171., 180., 193., 181., 183., 218., 230., 242.,
                          209., 191., 172., 194., 196., 196., 236., 235., 229., 243., 264.,
                          272., 237., 211., 180., 201., 204., 188., 235., 227., 234., 264.,
                          302., 293., 259., 229., 203., 229., 242., 233., 267., 269., 270.,
                          315., 364., 347., 312., 274., 237., 278., 284., 277., 317., 313.,
                          318., 374., 413., 405., 355., 306., 271., 306., 315., 301., 356.,
                          348., 355., 422., 465., 467., 404., 347., 305., 336., 340., 318.,
                          362., 348., 363., 435., 491., 505., 404., 359., 310., 337., 360.,
                          342., 406., 396., 420., 472., 548., 559., 463., 407., 362., 405.,
                          417., 391., 419., 461., 472., 535., 622., 606., 508., 461., 390.,
                          432.], dtype=np.float32)

AirPassengersDF = pd.DataFrame({'unique_id': np.ones(len(AirPassengers)),
                                'ds': pd.date_range(start='1949-01-01',
                                                    periods=len(AirPassengers), freq=pd.offsets.MonthEnd()),
                                'y': AirPassengers})

AirPassengersDF.head(12)

#我们将绘制ARIMA预测结果及其预测区间。
fig, ax = plt.subplots(1, 1, figsize = (20, 7))
plot_df = AirPassengersDF.set_index('ds')

plot_df[['y']].plot(ax=ax, linewidth=2)
ax.set_title('AirPassengers Forecast', fontsize=22)
ax.set_ylabel('Monthly Passengers', fontsize=20)
ax.set_xlabel('Timestamp [t]', fontsize=20)
ax.legend(prop={'size': 15})
ax.grid()

import numpy as np
import pandas as pd

n_static_features = 3
n_series = 5

static_features = np.random.uniform(low=0.0, high=1.0, 
                        size=(n_series, n_static_features))
static_df = pd.DataFrame.from_records(static_features, 
                   columns = [f'static_{i}'for i in  range(n_static_features)])
static_df['unique_id'] = np.arange(n_series)

static_df

3. 面板航空乘客数据

经典的 Box & Jenkins 航空数据的扩展。1949年至1960年国际航空乘客的每月总数。

它包括两个系列，具有静态、时间和未来的外生变量，有助于探索模型如 NBEATSx 和 TFT 的性能。


# 声明面板数据
unique_id = np.concatenate([['Airline1']*len(AirPassengers), ['Airline2']*len(AirPassengers)])
ds = np.tile(
    pd.date_range(
        start='1949-01-01', periods=len(AirPassengers), freq=pd.offsets.MonthEnd()
    ).to_numpy(), 
    2,
)
y = np.concatenate([AirPassengers, AirPassengers+300])

AirPassengersPanel = pd.DataFrame({'unique_id': unique_id, 'ds': ds, 'y': y})

# 对于未来的外生变量
# 声明 SeasonalNaive12 并填充前 12 个值为 y
snaive = AirPassengersPanel.groupby('unique_id')['y'].shift(periods=12).reset_index(drop=True)
AirPassengersPanel['trend'] = range(len(AirPassengersPanel))
AirPassengersPanel['y_[lag12]'] = snaive.fillna(AirPassengersPanel['y'])

# 声明静态数据
unique_id = np.array(['Airline1', 'Airline2'])
airline1_dummy = [0, 1]
airline2_dummy = [1, 0]
AirPassengersStatic = pd.DataFrame({'unique_id': unique_id,
                                    'airline1': airline1_dummy,
                                    'airline2': airline2_dummy})

AirPassengersPanel.groupby('unique_id').tail(4)

fig, ax = plt.subplots(1, 1, figsize = (20, 7))
plot_df = AirPassengersPanel.set_index('ds')

plot_df.groupby('unique_id')['y'].plot(legend=True)
ax.set_title('AirPassengers Panel Data', fontsize=22)
ax.set_ylabel('Monthly Passengers', fontsize=20)
ax.set_xlabel('Timestamp [t]', fontsize=20)
ax.legend(title='unique_id', prop={'size': 15})
ax.grid()

fig, ax = plt.subplots(1, 1, figsize = (20, 7))
plot_df = AirPassengersPanel[AirPassengersPanel.unique_id=='Airline1'].set_index('ds')

plot_df[['y', 'trend', 'y_[lag12]']].plot(ax=ax, linewidth=2)
ax.set_title('Box-Cox AirPassengers Data', fontsize=22)
ax.set_ylabel('Monthly Passengers', fontsize=20)
ax.set_xlabel('Timestamp [t]', fontsize=20)
ax.legend(prop={'size': 15})
ax.grid()

4. 时间特征

我们开发了一种工具，用于生成规范化的日历特征，作为基于Transformer模型的绝对位置嵌入。这些嵌入捕捉时间序列数据中的季节性模式，并可以轻松地纳入模型架构中。此外，这些特征还可以作为外生变量用于其他模型，以便让它们了解数据中的日历模式。

参考文献
- Haoyi Zhou, Shanghang Zhang, Jieqi Peng, Shuai Zhang, Jianxin Li, Hui Xiong, Wancai Zhang. “Informer: 超越高效变换器进行长序列时间序列预测”

class TimeFeature:
    def __init__(self):
        pass

    def __call__(self, index: pd.DatetimeIndex):
        return print('Overwrite with corresponding feature')

    def __repr__(self):
        return self.__class__.__name__ + "()"

class SecondOfMinute(TimeFeature):
    """小时中的分钟数，编码为介于 [-0.5, 0.5] 之间的数值"""
    def __call__(self, index: pd.DatetimeIndex) -> np.ndarray:
        return index.second / 59.0 - 0.5

class MinuteOfHour(TimeFeature):
    """小时中的分钟数，编码为介于 [-0.5, 0.5] 之间的数值"""
    def __call__(self, index: pd.DatetimeIndex) -> np.ndarray:
        return index.minute / 59.0 - 0.5

class HourOfDay(TimeFeature):
    """一天中的时刻，编码为介于[-0.5, 0.5]之间的数值"""
    def __call__(self, index: pd.DatetimeIndex) -> np.ndarray:
        return index.hour / 23.0 - 0.5

class DayOfWeek(TimeFeature):
    """一天中的时刻，编码为介于[-0.5, 0.5]之间的数值"""
    def __call__(self, index: pd.DatetimeIndex) -> np.ndarray:
        return index.dayofweek / 6.0 - 0.5

class DayOfMonth(TimeFeature):
    """月份中的日期，编码为介于[-0.5, 0.5]之间的值"""
    def __call__(self, index: pd.DatetimeIndex) -> np.ndarray:
        return (index.day - 1) / 30.0 - 0.5

class DayOfYear(TimeFeature):
    """一年中的某一天，编码为介于[-0.5, 0.5]之间的数值"""
    def __call__(self, index: pd.DatetimeIndex) -> np.ndarray:
        return (index.dayofyear - 1) / 365.0 - 0.5

class MonthOfYear(TimeFeature):
    """月份编码为介于[-0.5, 0.5]之间的数值"""
    def __call__(self, index: pd.DatetimeIndex) -> np.ndarray:
        return (index.month - 1) / 11.0 - 0.5

class WeekOfYear(TimeFeature):
    """一年中的周数，编码为介于[-0.5, 0.5]之间的值"""
    def __call__(self, index: pd.DatetimeIndex) -> np.ndarray:
        return (index.week - 1) / 52.0 - 0.5

def time_features_from_frequency_str(freq_str: str) -> List[TimeFeature]:
    """
    Returns a list of time features that will be appropriate for the given frequency string.
    Parameters
    ----------
    freq_str
        Frequency string of the form [multiple][granularity] such as "12H", "5min", "1D" etc.
    """

    if freq_str not in ['Q', 'M', 'MS', 'W', 'D', 'B', 'H', 'T', 'S']:
        raise Exception('Frequency not supported')
    
    if freq_str in ['Q','M', 'MS']:
        return [cls() for cls in [MonthOfYear]]
    elif freq_str == 'W':
        return [cls() for cls in [DayOfMonth, WeekOfYear]]
    elif freq_str in ['D','B']:
        return [cls() for cls in [DayOfWeek, DayOfMonth, DayOfYear]]
    elif freq_str == 'H':
        return [cls() for cls in [HourOfDay, DayOfWeek, DayOfMonth, DayOfYear]]
    elif freq_str == 'T':
        return [cls() for cls in [MinuteOfHour, HourOfDay, DayOfWeek, DayOfMonth, DayOfYear]]
    else:
        return [cls() for cls in [SecondOfMinute, MinuteOfHour, HourOfDay, DayOfWeek, DayOfMonth, DayOfYear]]

def augment_calendar_df(df, freq='H'):
    """
    > * Q - [month]
    > * M - [month]
    > * W - [Day of month, week of year]
    > * D - [Day of week, day of month, day of year]
    > * B - [Day of week, day of month, day of year]
    > * H - [Hour of day, day of week, day of month, day of year]
    > * T - [Minute of hour*, hour of day, day of week, day of month, day of year]
    > * S - [Second of minute, minute of hour, hour of day, day of week, day of month, day of year]
    *minute returns a number from 0-3 corresponding to the 15 minute period it falls into.
    """
    df = df.copy()

    freq_map = {
        'Q':['month'],
        'M':['month'],
        'MS':['month'],
        'W':['monthday', 'yearweek'],
        'D':['weekday','monthday','yearday'],
        'B':['weekday','monthday','yearday'],
        'H':['dayhour','weekday','monthday','yearday'],
        'T':['hourminute','dayhour','weekday','monthday','yearday'],
        'S':['minutesecond','hourminute','dayhour','weekday','monthday','yearday']
    }

    ds_col = pd.to_datetime(df.ds.values)
    ds_data = np.vstack([feat(ds_col) for feat in time_features_from_frequency_str(freq)]).transpose(1,0)
    ds_data = pd.DataFrame(ds_data, columns=freq_map[freq])
    
    return pd.concat([df, ds_data], axis=1), freq_map[freq]

AirPassengerPanelCalendar, calendar_cols = augment_calendar_df(df=AirPassengersPanel, freq='M')
AirPassengerPanelCalendar.head()

plot_df = AirPassengerPanelCalendar[AirPassengerPanelCalendar.unique_id=='Airline1'].set_index('ds')
plt.plot(plot_df['month'])
plt.grid()
plt.xlabel('Datestamp')
plt.ylabel('Normalized Month')
plt.show()

def get_indexer_raise_missing(idx: pd.Index, vals: List[str]) -> List[int]:
    idxs = idx.get_indexer(vals)
    missing = [v for i, v in zip(idxs, vals) if i == -1]
    if missing:
        raise ValueError(f'The following values are missing from the index: {missing}')
    return idxs

Give us a ⭐ on Github