示例数据

%load_ext autoreload
%autoreload 2

`core.NeuralForecast` 类允许您高效地为大型时间序列集拟合多个 `NeuralForecast` 模型。它以 pandas DataFrame `df` 作为输入,该数据框通过 `unique_id` 列标识各个序列,通过 `ds` 列标识日期戳,`y` 列表示目标时间序列变量。为了便于开发,我们声明了一些在 `NeuralForecast` 所有单元测试中都会用到的实用数据集。

import random
from itertools import chain
from typing import List

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

from nbdev.showdoc import add_docs, show_doc

1. 合成面板数据

def generate_series(n_series: int,
                    freq: str = 'D',
                    min_length: int = 50,
                    max_length: int = 500,
                    n_temporal_features: int = 0,
                    n_static_features: int = 0,
                    equal_ends: bool = False,
                    seed: int = 0) -> pd.DataFrame:
    """Generate Synthetic Panel Series.

    Generates `n_series` of frequency `freq` of different lengths in the interval [`min_length`, `max_length`].
    If `n_temporal_features > 0`, then each serie gets categorical temporal features holding one random
    integer per serie (constant within the serie); `y` is scaled by the category code of `temporal_0`.
    If `n_static_features > 0`, then a static dataframe is returned along the temporal dataframe.
    If `equal_ends == True` then all series end at the same date.

    **Parameters:**<br>
    `n_series`: int, number of series for synthetic panel.<br>
    `freq`: str, frequency of the data, only `'D'` and `'M'` are supported (any other value raises `KeyError`),
    see [panda's available frequencies](https://pandas.pydata.org/pandas-docs/stable/user_guide/timeseries.html#offset-aliases).<br>
    `min_length`: int, minimal length of synthetic panel's series.<br>
    `max_length`: int, maximal length of synthetic panel's series.<br>
    `n_temporal_features`: int, default=0, number of temporal exogenous variables for synthetic panel's series.<br>
    `n_static_features`: int, default=0, number of static exogenous variables for synthetic panel's series.<br>
    `equal_ends`: bool, if True, series finish in the same date stamp `ds`.<br>
    `seed`: int, default=0, seed of the random generators; the same seed yields the same panel and static data.<br>

    **Returns:**<br>
    `temporal_df`: pandas.DataFrame, synthetic panel with columns [`unique_id`, `ds`, `y`] and exogenous.<br>
    `static_df`: pandas.DataFrame, only returned (as second element of a tuple) when `n_static_features > 0`,
    with columns [`static_0`, ..., `unique_id`].
    """
    seasonalities = {'D': 7, 'M': 12}
    season = seasonalities[freq]

    # One seeded generator drives the lengths, the target noise and the static features.
    rng = np.random.RandomState(seed)
    series_lengths = rng.randint(min_length, max_length + 1, n_series)
    total_length = series_lengths.sum()

    dates = pd.date_range('2000-01-01', periods=max_length, freq=freq).values
    uids = [
        np.repeat(i, serie_length) for i, serie_length in enumerate(series_lengths)
    ]
    if equal_ends:
        # Slice from an explicit start so a zero-length serie yields no dates
        # (`dates[-0:]` would return every date).
        ds = [dates[len(dates) - serie_length:] for serie_length in series_lengths]
    else:
        ds = [dates[:serie_length] for serie_length in series_lengths]

    # Seasonal ramp (period `season`) plus uniform noise in [0, 0.5).
    y = np.arange(total_length) % season + rng.rand(total_length) * 0.5
    temporal_df = pd.DataFrame(dict(unique_id=chain.from_iterable(uids),
                                    ds=chain.from_iterable(ds),
                                    y=y))

    random.seed(seed)
    for i in range(n_temporal_features):
        # NOTE(review): re-seeding on every iteration makes all `temporal_{i}` columns
        # identical; kept as-is so previously generated data does not change.
        random.seed(seed)
        temporal_values = [
            [random.randint(0, 100)] * serie_length for serie_length in series_lengths
        ]
        temporal_df[f'temporal_{i}'] = np.hstack(temporal_values)
        temporal_df[f'temporal_{i}'] = temporal_df[f'temporal_{i}'].astype('category')
        if i == 0:
            # Scale the target by the first feature so it carries signal about `y`.
            temporal_df['y'] = temporal_df['y'] * \
                                  (1 + temporal_df[f'temporal_{i}'].cat.codes)

    temporal_df['unique_id'] = temporal_df['unique_id'].astype('category')
    temporal_df['unique_id'] = temporal_df['unique_id'].cat.as_ordered()

    if n_static_features > 0:
        # Draw from the seeded `rng` (not the global `np.random`) so that the
        # static features are reproducible for a given `seed`.
        static_features = rng.uniform(low=0.0, high=1.0,
                                      size=(n_series, n_static_features))
        static_df = pd.DataFrame.from_records(
            static_features,
            columns=[f'static_{i}' for i in range(n_static_features)],
        )

        static_df['unique_id'] = np.arange(n_series)
        static_df['unique_id'] = static_df['unique_id'].astype('category')
        static_df['unique_id'] = static_df['unique_id'].cat.as_ordered()

        return temporal_df, static_df

    return temporal_df
# nbdev `show_doc` call for `generate_series`.
show_doc(generate_series, title_level=3)
# Two-series panel with default settings; show the first four rows of each series.
synthetic_panel = generate_series(n_series=2)
synthetic_panel.groupby('unique_id').head(4)
# With `n_static_features > 0` the function returns a (temporal, static) pair.
temporal_df, static_df = generate_series(n_series=1000, n_static_features=2,
                                         n_temporal_features=4, equal_ends=False)
static_df.head(2)

2. 航空乘客数据

经典的Box & Jenkins航空数据。1949年至1960年国际航空乘客的月度总数。

由于这是一个展示明显趋势和季节性的序列,因此它已被用作多个预测库的参考,它为快速展示模型的预测性能提供了一个良好的机会。

# Raw AirPassengers target values as a float32 array (144 monthly observations).
AirPassengers = np.array([112., 118., 132., 129., 121., 135., 148., 148., 136., 119., 104.,
                          118., 115., 126., 141., 135., 125., 149., 170., 170., 158., 133.,
                          114., 140., 145., 150., 178., 163., 172., 178., 199., 199., 184.,
                          162., 146., 166., 171., 180., 193., 181., 183., 218., 230., 242.,
                          209., 191., 172., 194., 196., 196., 236., 235., 229., 243., 264.,
                          272., 237., 211., 180., 201., 204., 188., 235., 227., 234., 264.,
                          302., 293., 259., 229., 203., 229., 242., 233., 267., 269., 270.,
                          315., 364., 347., 312., 274., 237., 278., 284., 277., 317., 313.,
                          318., 374., 413., 405., 355., 306., 271., 306., 315., 301., 356.,
                          348., 355., 422., 465., 467., 404., 347., 305., 336., 340., 318.,
                          362., 348., 363., 435., 491., 505., 404., 359., 310., 337., 360.,
                          342., 406., 396., 420., 472., 548., 559., 463., 407., 362., 405.,
                          417., 391., 419., 461., 472., 535., 622., 606., 508., 461., 390.,
                          432.], dtype=np.float32)
# Long-format frame: a single series (`unique_id` is the float 1.0 from `np.ones`),
# month-end date stamps starting in January 1949, and the target `y`.
AirPassengersDF = pd.DataFrame({'unique_id': np.ones(len(AirPassengers)),
                                'ds': pd.date_range(start='1949-01-01',
                                                    periods=len(AirPassengers), freq=pd.offsets.MonthEnd()),
                                'y': AirPassengers})
# Preview the first year of data.
AirPassengersDF.head(12)
# Plot the AirPassengers target series `y` over time (only the observed values are drawn here).
fig, ax = plt.subplots(1, 1, figsize = (20, 7))
plot_df = AirPassengersDF.set_index('ds')

plot_df[['y']].plot(ax=ax, linewidth=2)
ax.set_title('AirPassengers Forecast', fontsize=22)
ax.set_ylabel('Monthly Passengers', fontsize=20)
ax.set_xlabel('Timestamp [t]', fontsize=20)
ax.legend(prop={'size': 15})
ax.grid()
# Stand-alone example of building a static-features frame:
# one row per series, uniform random values in [0, 1) per static feature.
import numpy as np
import pandas as pd
n_static_features = 3
n_series = 5

# Uses the global NumPy random state, so values differ between runs.
static_features = np.random.uniform(low=0.0, high=1.0, 
                        size=(n_series, n_static_features))
static_df = pd.DataFrame.from_records(static_features, 
                   columns = [f'static_{i}'for i in  range(n_static_features)])
# Integer series identifiers 0..n_series-1.
static_df['unique_id'] = np.arange(n_series)
static_df

3. 面板航空乘客数据

经典的 Box & Jenkins 航空数据的扩展。1949年至1960年国际航空乘客的每月总数。

它包括两个序列,并带有静态、时间和未来的外生变量,有助于探索 NBEATSx 或 TFT 等模型的性能。


# Declare the panel data: two series sharing the same month-end date stamps,
# where 'Airline2' is the original series shifted up by 300.
unique_id = np.concatenate([['Airline1']*len(AirPassengers), ['Airline2']*len(AirPassengers)])
ds = np.tile(
    pd.date_range(
        start='1949-01-01', periods=len(AirPassengers), freq=pd.offsets.MonthEnd()
    ).to_numpy(), 
    2,
)
y = np.concatenate([AirPassengers, AirPassengers+300])

AirPassengersPanel = pd.DataFrame({'unique_id': unique_id, 'ds': ds, 'y': y})

# For the future exogenous variables:
# declare SeasonalNaive12 (per-series lag 12 of `y`) and fill its first 12 values with `y`.
snaive = AirPassengersPanel.groupby('unique_id')['y'].shift(periods=12).reset_index(drop=True)
# `trend` counts rows over the whole panel, so it keeps increasing across both series.
AirPassengersPanel['trend'] = range(len(AirPassengersPanel))
AirPassengersPanel['y_[lag12]'] = snaive.fillna(AirPassengersPanel['y'])

# Declare the static data: one indicator column per airline.
unique_id = np.array(['Airline1', 'Airline2'])
airline1_dummy = [0, 1]
airline2_dummy = [1, 0]
AirPassengersStatic = pd.DataFrame({'unique_id': unique_id,
                                    'airline1': airline1_dummy,
                                    'airline2': airline2_dummy})

# Show the last four rows of each series.
AirPassengersPanel.groupby('unique_id').tail(4)
# Plot the target `y` of both airlines.
fig, ax = plt.subplots(1, 1, figsize = (20, 7))
plot_df = AirPassengersPanel.set_index('ds')

# No `ax` is passed here, so the lines are drawn on matplotlib's current axes.
plot_df.groupby('unique_id')['y'].plot(legend=True)
ax.set_title('AirPassengers Panel Data', fontsize=22)
ax.set_ylabel('Monthly Passengers', fontsize=20)
ax.set_xlabel('Timestamp [t]', fontsize=20)
ax.legend(title='unique_id', prop={'size': 15})
ax.grid()
# Plot the target together with its exogenous variables for 'Airline1' only.
fig, ax = plt.subplots(1, 1, figsize = (20, 7))
plot_df = AirPassengersPanel[AirPassengersPanel.unique_id=='Airline1'].set_index('ds')

plot_df[['y', 'trend', 'y_[lag12]']].plot(ax=ax, linewidth=2)
# NOTE(review): the title mentions Box-Cox, but no transform is applied in this cell.
ax.set_title('Box-Cox AirPassengers Data', fontsize=22)
ax.set_ylabel('Monthly Passengers', fontsize=20)
ax.set_xlabel('Timestamp [t]', fontsize=20)
ax.legend(prop={'size': 15})
ax.grid()

4. 时间特征

我们开发了一种工具,用于生成规范化的日历特征,作为基于Transformer模型的绝对位置嵌入。这些嵌入捕捉时间序列数据中的季节性模式,并可以轻松地纳入模型架构中。此外,这些特征还可以作为外生变量用于其他模型,以便让它们了解数据中的日历模式。

参考文献
- Haoyi Zhou, Shanghang Zhang, Jieqi Peng, Shuai Zhang, Jianxin Li, Hui Xiong, Wancai Zhang. "Informer: Beyond Efficient Transformer for Long Sequence Time-Series Forecasting"

class TimeFeature:
    """Base class for calendar features computed from a `pd.DatetimeIndex`.

    Subclasses override `__call__`; the base implementation only prints a
    reminder and returns `None`.
    """
    def __init__(self):
        pass

    def __call__(self, index: pd.DatetimeIndex):
        reminder = 'Overwrite with corresponding feature'
        # `print` returns None, which is what the caller receives.
        return print(reminder)

    def __repr__(self):
        # Render as "<ClassName>()", using the concrete subclass name.
        return f"{self.__class__.__name__}()"

class SecondOfMinute(TimeFeature):
    """Second of minute encoded as value between [-0.5, 0.5]"""
    def __call__(self, index: pd.DatetimeIndex) -> np.ndarray:
        seconds = index.second
        # 0..59 is scaled to 0..1 and then centred around zero.
        return seconds / 59.0 - 0.5

class MinuteOfHour(TimeFeature):
    """Minute of hour encoded as value between [-0.5, 0.5]"""
    def __call__(self, index: pd.DatetimeIndex) -> np.ndarray:
        minutes = index.minute
        # 0..59 is scaled to 0..1 and then centred around zero.
        return minutes / 59.0 - 0.5

class HourOfDay(TimeFeature):
    """Hour of day encoded as value between [-0.5, 0.5]"""
    def __call__(self, index: pd.DatetimeIndex) -> np.ndarray:
        hours = index.hour
        # 0..23 is scaled to 0..1 and then centred around zero.
        return hours / 23.0 - 0.5

class DayOfWeek(TimeFeature):
    """Day of week encoded as value between [-0.5, 0.5]"""
    def __call__(self, index: pd.DatetimeIndex) -> np.ndarray:
        weekdays = index.dayofweek
        # 0..6 is scaled to 0..1 and then centred around zero.
        return weekdays / 6.0 - 0.5

class DayOfMonth(TimeFeature):
    """Day of month encoded as value between [-0.5, 0.5]"""
    def __call__(self, index: pd.DatetimeIndex) -> np.ndarray:
        # Shift the 1-based day to start at zero before scaling by 30.
        zero_based_day = index.day - 1
        return zero_based_day / 30.0 - 0.5

class DayOfYear(TimeFeature):
    """Day of year encoded as value between [-0.5, 0.5]"""
    def __call__(self, index: pd.DatetimeIndex) -> np.ndarray:
        # Shift the 1-based day to start at zero before scaling by 365.
        zero_based_day = index.dayofyear - 1
        return zero_based_day / 365.0 - 0.5

class MonthOfYear(TimeFeature):
    """Month of year encoded as value between [-0.5, 0.5]"""
    def __call__(self, index: pd.DatetimeIndex) -> np.ndarray:
        # Shift the 1-based month to start at zero before scaling by 11.
        zero_based_month = index.month - 1
        return zero_based_month / 11.0 - 0.5

class WeekOfYear(TimeFeature):
    """Week of year encoded as value between [-0.5, 0.5]"""
    def __call__(self, index: pd.DatetimeIndex) -> np.ndarray:
        # `DatetimeIndex.week` was deprecated in pandas 1.1 and removed in 2.0;
        # `isocalendar().week` is its documented replacement.
        iso_week = index.isocalendar().week
        # Convert the nullable integer column to a float index (NaT becomes NaN)
        # so the result has the same kind as the other time features.
        week = pd.Index(iso_week.to_numpy(dtype='float64', na_value=np.nan))
        return (week - 1) / 52.0 - 0.5

def time_features_from_frequency_str(freq_str: str) -> List[TimeFeature]:
    """
    Returns a list of time features that will be appropriate for the given frequency string.
    Parameters
    ----------
    freq_str
        Frequency alias, one of "Q", "M", "MS", "W", "D", "B", "H", "T" or "S".
        The alias must match exactly; any other value raises an `Exception`.
    """
    # Each entry maps a group of aliases to the feature classes used for it,
    # checked in order from coarsest to finest granularity.
    features_by_alias = [
        (('Q', 'M', 'MS'), [MonthOfYear]),
        (('W',), [DayOfMonth, WeekOfYear]),
        (('D', 'B'), [DayOfWeek, DayOfMonth, DayOfYear]),
        (('H',), [HourOfDay, DayOfWeek, DayOfMonth, DayOfYear]),
        (('T',), [MinuteOfHour, HourOfDay, DayOfWeek, DayOfMonth, DayOfYear]),
        (('S',), [SecondOfMinute, MinuteOfHour, HourOfDay, DayOfWeek, DayOfMonth, DayOfYear]),
    ]

    for aliases, feature_classes in features_by_alias:
        if freq_str in aliases:
            return [feature_cls() for feature_cls in feature_classes]

    raise Exception('Frequency not supported')

def augment_calendar_df(df, freq='H'):
    """Append normalized calendar features to `df`.

    The features are computed from the `ds` column with the classes returned by
    `time_features_from_frequency_str(freq)` and added as new columns:

    > * Q - [month]
    > * M - [month]
    > * MS - [month]
    > * W - [Day of month, week of year]
    > * D - [Day of week, day of month, day of year]
    > * B - [Day of week, day of month, day of year]
    > * H - [Hour of day, day of week, day of month, day of year]
    > * T - [Minute of hour, hour of day, day of week, day of month, day of year]
    > * S - [Second of minute, minute of hour, hour of day, day of week, day of month, day of year]

    **Parameters:**<br>
    `df`: pandas.DataFrame with a `ds` column convertible by `pd.to_datetime`.<br>
    `freq`: str, default='H', one of the aliases listed above; other values make
    `time_features_from_frequency_str` raise an `Exception`.<br>

    **Returns:**<br>
    `(augmented_df, calendar_cols)`: a new dataframe with the calendar columns appended
    (the input `df` is not modified) and the list of the added column names.
    """
    freq_map = {
        'Q':['month'],
        'M':['month'],
        'MS':['month'],
        'W':['monthday', 'yearweek'],
        'D':['weekday','monthday','yearday'],
        'B':['weekday','monthday','yearday'],
        'H':['dayhour','weekday','monthday','yearday'],
        'T':['hourminute','dayhour','weekday','monthday','yearday'],
        'S':['minutesecond','hourminute','dayhour','weekday','monthday','yearday']
    }

    ds_col = pd.to_datetime(df.ds.values)
    # One row per feature from vstack, transposed to one row per timestamp.
    ds_data = np.vstack([feat(ds_col) for feat in time_features_from_frequency_str(freq)]).transpose(1,0)
    # Reuse `df`'s index: `pd.concat(axis=1)` aligns on index labels, so a default
    # RangeIndex here would misalign the features whenever `df` is filtered or re-indexed.
    ds_data = pd.DataFrame(ds_data, columns=freq_map[freq], index=df.index)

    # `pd.concat` builds a new frame, so no explicit copy of `df` is needed.
    return pd.concat([df, ds_data], axis=1), freq_map[freq]
# Add the monthly calendar feature ('month') to the panel and preview the result.
AirPassengerPanelCalendar, calendar_cols = augment_calendar_df(df=AirPassengersPanel, freq='M')
AirPassengerPanelCalendar.head()
# Plot the normalized month feature over time for 'Airline1'.
plot_df = AirPassengerPanelCalendar[AirPassengerPanelCalendar.unique_id=='Airline1'].set_index('ds')
plt.plot(plot_df['month'])
plt.grid()
plt.xlabel('Datestamp')
plt.ylabel('Normalized Month')
plt.show()
def get_indexer_raise_missing(idx: pd.Index, vals: List[str]) -> List[int]:
    """Return the positions of `vals` in `idx`, failing if any value is absent.

    The positions come straight from `idx.get_indexer(vals)`, which marks values
    it cannot find with -1. If any such value exists, a `ValueError` listing
    all of them is raised instead of returning.
    """
    positions = idx.get_indexer(vals)
    absent = [label for label, position in zip(vals, positions) if position == -1]
    if len(absent) > 0:
        raise ValueError(f'The following values are missing from the index: {absent}')
    return positions

Give us a ⭐ on Github