%load_ext autoreload
%autoreload 2
示例数据
`core.NeuralForecast` 类允许您高效地拟合多个 NeuralForecast 模型,以处理大型时间序列集。它基于 pandas DataFrame `df` 进行操作,该数据框通过 `unique_id` 和 `ds` 列标识各个序列及其日期戳,`y` 列表示目标时间序列变量。为了便于开发,我们声明了一些在 NeuralForecast 所有单元测试中使用的实用数据集。
import random
from itertools import chain
from typing import List, Tuple, Union

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
from nbdev.showdoc import add_docs, show_doc

# 1. Synthetic panel data
def generate_series(n_series: int,
                    freq: str = 'D',
                    min_length: int = 50,
                    max_length: int = 500,
                    n_temporal_features: int = 0,
                    n_static_features: int = 0,
                    equal_ends: bool = False,
                    seed: int = 0
                    ) -> Union[pd.DataFrame, Tuple[pd.DataFrame, pd.DataFrame]]:
    """Generate Synthetic Panel Series.

    Generates `n_series` of frequency `freq` of different lengths in the interval [`min_length`, `max_length`].
    If `n_temporal_features > 0`, then each serie gets temporal features with random values.
    If `n_static_features > 0`, then a static dataframe is returned along the temporal dataframe.
    If `equal_ends == True` then all series end at the same date.

    **Parameters:**<br>
    `n_series`: int, number of series for synthetic panel.<br>
    `freq`: str, frequency of the data, [panda's available frequencies](https://pandas.pydata.org/pandas-docs/stable/user_guide/timeseries.html#offset-aliases).
    Only `'D'` and `'M'` have a seasonality defined here.<br>
    `min_length`: int, minimal length of synthetic panel's series.<br>
    `max_length`: int, maximal length of synthetic panel's series.<br>
    `n_temporal_features`: int, default=0, number of temporal exogenous variables for synthetic panel's series.<br>
    `n_static_features`: int, default=0, number of static exogenous variables for synthetic panel's series.<br>
    `equal_ends`: bool, if True, series finish in the same date stamp `ds`.<br>
    `seed`: int, default=0, seed for every random draw made by this function.<br>

    **Returns:**<br>
    `temporal_df`: pandas.DataFrame, synthetic panel with columns [`unique_id`, `ds`, `y`] and exogenous.<br>
    `static_df`: pandas.DataFrame, only returned (as second element of a tuple) when
    `n_static_features > 0`, with columns `static_{i}` and `unique_id`.

    **Raises:**<br>
    `KeyError`: if `freq` has no seasonality defined.
    """
    seasonalities = {'D': 7, 'M': 12}
    if freq not in seasonalities:
        raise KeyError(
            f'Unsupported freq {freq!r}; expected one of {sorted(seasonalities)}'
        )
    season = seasonalities[freq]

    rng = np.random.RandomState(seed)
    series_lengths = rng.randint(min_length, max_length + 1, n_series)
    total_length = series_lengths.sum()

    # One shared calendar of `max_length` stamps; each series takes a slice of it.
    dates = pd.date_range('2000-01-01', periods=max_length, freq=freq).values
    uids = [
        np.repeat(i, serie_length) for i, serie_length in enumerate(series_lengths)
    ]
    if equal_ends:
        ds = [dates[-serie_length:] for serie_length in series_lengths]
    else:
        ds = [dates[:serie_length] for serie_length in series_lengths]

    # Seasonal saw-tooth plus uniform noise in [0, 0.5).
    y = np.arange(total_length) % season + rng.rand(total_length) * 0.5
    temporal_df = pd.DataFrame(dict(unique_id=chain.from_iterable(uids),
                                    ds=chain.from_iterable(ds),
                                    y=y))

    random.seed(seed)
    for i in range(n_temporal_features):
        # NOTE(review): re-seeding on every iteration makes each `temporal_{i}`
        # column repeat the same draws; kept as-is to preserve generated data.
        random.seed(seed)
        # One random integer per series, repeated over that series' length.
        temporal_values = [
            [random.randint(0, 100)] * serie_length for serie_length in series_lengths
        ]
        temporal_df[f'temporal_{i}'] = np.hstack(temporal_values)
        temporal_df[f'temporal_{i}'] = temporal_df[f'temporal_{i}'].astype('category')
        if i == 0:
            # Scale the target by the first feature's category code.
            temporal_df['y'] = temporal_df['y'] * \
                (1 + temporal_df[f'temporal_{i}'].cat.codes)

    temporal_df['unique_id'] = temporal_df['unique_id'].astype('category')
    temporal_df['unique_id'] = temporal_df['unique_id'].cat.as_ordered()

    if n_static_features > 0:
        # Draw from the seeded generator (not the global numpy state) so that
        # `seed` also makes the static frame reproducible.
        static_features = rng.uniform(low=0.0, high=1.0,
                                      size=(n_series, n_static_features))
        static_df = pd.DataFrame.from_records(
            static_features,
            columns=[f'static_{i}' for i in range(n_static_features)],
        )

        static_df['unique_id'] = np.arange(n_series)
        static_df['unique_id'] = static_df['unique_id'].astype('category')
        static_df['unique_id'] = static_df['unique_id'].cat.as_ordered()

        return temporal_df, static_df

    return temporal_df


show_doc(generate_series, title_level=3)
synthetic_panel = generate_series(n_series=2)
# `unique_id` is categorical; observed=True avoids pandas' deprecation warning
# and does not change which rows `head` returns.
synthetic_panel.groupby('unique_id', observed=True).head(4)

temporal_df, static_df = generate_series(n_series=1000, n_static_features=2,
                                         n_temporal_features=4, equal_ends=False)
static_df.head(2)

# 2. AirPassengers data
经典的Box & Jenkins航空数据。1949年至1960年国际航空乘客的月度总数。
由于这是一个展示明显趋势和季节性的序列,因此它已被用作多个预测库的参考,它为快速展示模型的预测性能提供了一个良好的机会。
# Monthly totals of international airline passengers, 144 observations.
AirPassengers = np.array([112., 118., 132., 129., 121., 135., 148., 148., 136., 119., 104.,
                          118., 115., 126., 141., 135., 125., 149., 170., 170., 158., 133.,
                          114., 140., 145., 150., 178., 163., 172., 178., 199., 199., 184.,
                          162., 146., 166., 171., 180., 193., 181., 183., 218., 230., 242.,
                          209., 191., 172., 194., 196., 196., 236., 235., 229., 243., 264.,
                          272., 237., 211., 180., 201., 204., 188., 235., 227., 234., 264.,
                          302., 293., 259., 229., 203., 229., 242., 233., 267., 269., 270.,
                          315., 364., 347., 312., 274., 237., 278., 284., 277., 317., 313.,
                          318., 374., 413., 405., 355., 306., 271., 306., 315., 301., 356.,
                          348., 355., 422., 465., 467., 404., 347., 305., 336., 340., 318.,
                          362., 348., 363., 435., 491., 505., 404., 359., 310., 337., 360.,
                          342., 406., 396., 420., 472., 548., 559., 463., 407., 362., 405.,
                          417., 391., 419., 461., 472., 535., 622., 606., 508., 461., 390.,
                          432.], dtype=np.float32)

# Single-series frame: constant `unique_id` of 1.0 and month-end date stamps
# starting in January 1949.
AirPassengersDF = pd.DataFrame({'unique_id': np.ones(len(AirPassengers)),
                                'ds': pd.date_range(start='1949-01-01',
                                                    periods=len(AirPassengers),
                                                    freq=pd.offsets.MonthEnd()),
                                'y': AirPassengers})

AirPassengersDF.head(12)

# Plot the monthly AirPassengers series.
fig, ax = plt.subplots(1, 1, figsize=(20, 7))
plot_df = AirPassengersDF.set_index('ds')

# Draw the target column against its date-stamp index.
plot_df[['y']].plot(ax=ax, linewidth=2)

ax.set_title('AirPassengers Forecast', fontsize=22)
ax.set_ylabel('Monthly Passengers', fontsize=20)
ax.set_xlabel('Timestamp [t]', fontsize=20)
ax.legend(prop={'size': 15})
ax.grid()

import numpy as np
import pandas as pd

# Small example of a static-features frame: one row per series,
# uniform random values in [0, 1) for each static column.
n_static_features = 3
n_series = 5

static_features = np.random.uniform(low=0.0, high=1.0,
                                    size=(n_series, n_static_features))
static_df = pd.DataFrame.from_records(
    static_features,
    columns=[f'static_{i}' for i in range(n_static_features)],
)
static_df['unique_id'] = np.arange(n_series)

static_df

# 3. Panel AirPassengers data
经典的 Box & Jenkins 航空数据的扩展。1949年至1960年国际航空乘客的每月总数。
它包括两个系列,具有静态、时间和未来的外生变量,有助于探索模型如 NBEATSx 和 TFT 的性能。
# Declare the panel data: two airlines sharing the same month-end date stamps;
# the second series is the first one shifted up by 300.
unique_id = np.concatenate([['Airline1'] * len(AirPassengers), ['Airline2'] * len(AirPassengers)])
ds = np.tile(
    pd.date_range(
        start='1949-01-01', periods=len(AirPassengers), freq=pd.offsets.MonthEnd()
    ).to_numpy(),
    2,
)
y = np.concatenate([AirPassengers, AirPassengers + 300])
AirPassengersPanel = pd.DataFrame({'unique_id': unique_id, 'ds': ds, 'y': y})

# Future exogenous variables.
# Seasonal-naive lag of 12 steps per series; the first 12 values of each
# series have no lag available and are filled with y itself.
snaive = AirPassengersPanel.groupby('unique_id')['y'].shift(periods=12).reset_index(drop=True)
# NOTE(review): `trend` counts rows across the whole frame, so it does not
# restart at 0 for 'Airline2' — confirm this is intended.
AirPassengersPanel['trend'] = range(len(AirPassengersPanel))
AirPassengersPanel['y_[lag12]'] = snaive.fillna(AirPassengersPanel['y'])

# Declare the static data.
unique_id = np.array(['Airline1', 'Airline2'])
# NOTE(review): as written, column 'airline1' is 0 for 'Airline1' and 1 for
# 'Airline2' (and vice versa for 'airline2') — looks inverted; confirm before
# relying on the column names.
airline1_dummy = [0, 1]
airline2_dummy = [1, 0]
AirPassengersStatic = pd.DataFrame({'unique_id': unique_id,
                                    'airline1': airline1_dummy,
                                    'airline2': airline2_dummy})

AirPassengersPanel.groupby('unique_id').tail(4)

fig, ax = plt.subplots(1, 1, figsize=(20, 7))
# One line per airline on the axes created just above.
plot_df = AirPassengersPanel.set_index('ds')
# Pass the axes explicitly instead of relying on matplotlib's current axes.
plot_df.groupby('unique_id')['y'].plot(ax=ax, legend=True)
ax.set_title('AirPassengers Panel Data', fontsize=22)
ax.set_ylabel('Monthly Passengers', fontsize=20)
ax.set_xlabel('Timestamp [t]', fontsize=20)
ax.legend(title='unique_id', prop={'size': 15})
ax.grid()

# Target and exogenous columns for the first airline only.
fig, ax = plt.subplots(1, 1, figsize=(20, 7))
plot_df = AirPassengersPanel[AirPassengersPanel.unique_id == 'Airline1'].set_index('ds')
plot_df[['y', 'trend', 'y_[lag12]']].plot(ax=ax, linewidth=2)
ax.set_title('Box-Cox AirPassengers Data', fontsize=22)
ax.set_ylabel('Monthly Passengers', fontsize=20)
ax.set_xlabel('Timestamp [t]', fontsize=20)
ax.legend(prop={'size': 15})
ax.grid()

# 4. Time features
我们开发了一种工具,用于生成规范化的日历特征,作为基于Transformer模型的绝对位置嵌入。这些嵌入捕捉时间序列数据中的季节性模式,并可以轻松地纳入模型架构中。此外,这些特征还可以作为外生变量用于其他模型,以便让它们了解数据中的日历模式。
class TimeFeature:
    """Base class for calendar features computed from a `pd.DatetimeIndex`.

    Subclasses override `__call__`. The base implementation only prints a
    reminder and returns `None`.
    """

    def __init__(self):
        pass

    def __call__(self, index: pd.DatetimeIndex):
        return print('Overwrite with corresponding feature')

    def __repr__(self):
        return self.__class__.__name__ + "()"


class SecondOfMinute(TimeFeature):
    """Second of minute encoded as value between [-0.5, 0.5]"""

    def __call__(self, index: pd.DatetimeIndex) -> np.ndarray:
        return index.second / 59.0 - 0.5


class MinuteOfHour(TimeFeature):
    """Minute of hour encoded as value between [-0.5, 0.5]"""

    def __call__(self, index: pd.DatetimeIndex) -> np.ndarray:
        return index.minute / 59.0 - 0.5


class HourOfDay(TimeFeature):
    """Hour of day encoded as value between [-0.5, 0.5]"""

    def __call__(self, index: pd.DatetimeIndex) -> np.ndarray:
        return index.hour / 23.0 - 0.5


class DayOfWeek(TimeFeature):
    """Day of week encoded as value between [-0.5, 0.5]"""

    def __call__(self, index: pd.DatetimeIndex) -> np.ndarray:
        return index.dayofweek / 6.0 - 0.5


class DayOfMonth(TimeFeature):
    """Day of month encoded as value between [-0.5, 0.5]"""

    def __call__(self, index: pd.DatetimeIndex) -> np.ndarray:
        return (index.day - 1) / 30.0 - 0.5


class DayOfYear(TimeFeature):
    """Day of year encoded as value between [-0.5, 0.5]"""

    def __call__(self, index: pd.DatetimeIndex) -> np.ndarray:
        return (index.dayofyear - 1) / 365.0 - 0.5


class MonthOfYear(TimeFeature):
    """Month of year encoded as value between [-0.5, 0.5]"""

    def __call__(self, index: pd.DatetimeIndex) -> np.ndarray:
        return (index.month - 1) / 11.0 - 0.5


class WeekOfYear(TimeFeature):
    """Week of year encoded as value between [-0.5, 0.5]"""

    def __call__(self, index: pd.DatetimeIndex) -> np.ndarray:
        # `DatetimeIndex.week` was removed in pandas 2.0; `isocalendar().week`
        # is its replacement. Fall back to `.week` where `isocalendar` is missing.
        if hasattr(index, 'isocalendar'):
            iso_week = index.isocalendar().week
            # Convert the nullable integer column to floats (NaN for NaT) and
            # wrap it in an Index so the result matches the sibling features.
            week = pd.Index(
                iso_week.to_numpy(dtype='float64', na_value=np.nan),
                name=index.name,
            )
        else:
            week = index.week
        return (week - 1) / 52.0 - 0.5
def time_features_from_frequency_str(freq_str: str) -> List[TimeFeature]:
    """
    Returns a list of time features that will be appropriate for the given frequency string.

    Parameters
    ----------
    freq_str
        Frequency string; must be exactly one of
        "Q", "M", "MS", "W", "D", "B", "H", "T" or "S".
        Multiples such as "12H" are not accepted.

    Returns
    -------
    One freshly constructed feature instance per calendar component, ordered
    from the finest to the coarsest component.

    Raises
    ------
    ValueError
        If `freq_str` is not one of the supported frequencies.
    """
    monthly = [MonthOfYear]
    daily = [DayOfWeek, DayOfMonth, DayOfYear]
    features_by_freq = {
        'Q': monthly,
        'M': monthly,
        'MS': monthly,
        'W': [DayOfMonth, WeekOfYear],
        'D': daily,
        'B': daily,
        'H': [HourOfDay] + daily,
        'T': [MinuteOfHour, HourOfDay] + daily,
        'S': [SecondOfMinute, MinuteOfHour, HourOfDay] + daily,
    }
    if freq_str not in features_by_freq:
        # ValueError is a subclass of Exception, so callers that caught the
        # previous generic Exception keep working.
        raise ValueError(
            f'Frequency not supported: {freq_str!r}; '
            f'expected one of {list(features_by_freq)}'
        )
    return [cls() for cls in features_by_freq[freq_str]]
def augment_calendar_df(df, freq='H'):
    """Append normalized calendar features to `df`, based on its `ds` column.

    Columns added for each supported `freq`:

    > * Q - [month]
    > * M - [month]
    > * MS - [month]
    > * W - [Day of month, week of year]
    > * D - [Day of week, day of month, day of year]
    > * B - [Day of week, day of month, day of year]
    > * H - [Hour of day, day of week, day of month, day of year]
    > * T - [Minute of hour, hour of day, day of week, day of month, day of year]
    > * S - [Second of minute, minute of hour, hour of day, day of week, day of month, day of year]

    The values come from the features returned by
    `time_features_from_frequency_str(freq)`.

    **Parameters:**<br>
    `df`: pandas.DataFrame with a `ds` column convertible by `pd.to_datetime`.<br>
    `freq`: str, one of the frequencies listed above.<br>

    **Returns:**<br>
    `(augmented_df, calendar_cols)`: a new dataframe with the calendar columns
    appended, and the list of the added column names. `df` is not modified.
    """
    freq_map = {
        'Q': ['month'],
        'M': ['month'],
        'MS': ['month'],
        'W': ['monthday', 'yearweek'],
        'D': ['weekday', 'monthday', 'yearday'],
        'B': ['weekday', 'monthday', 'yearday'],
        'H': ['dayhour', 'weekday', 'monthday', 'yearday'],
        'T': ['hourminute', 'dayhour', 'weekday', 'monthday', 'yearday'],
        'S': ['minutesecond', 'hourminute', 'dayhour', 'weekday', 'monthday', 'yearday'],
    }

    ds_col = pd.to_datetime(df.ds.values)
    features = time_features_from_frequency_str(freq)
    # Stack one row per feature, then transpose to one row per timestamp.
    ds_data = np.vstack([feat(ds_col) for feat in features]).transpose(1, 0)
    # Reuse df's index: concat(axis=1) aligns on index labels, so a fresh
    # RangeIndex would misalign rows whenever df is not indexed 0..n-1.
    ds_data = pd.DataFrame(ds_data, columns=freq_map[freq], index=df.index)

    return pd.concat([df, ds_data], axis=1), freq_map[freq]


AirPassengerPanelCalendar, calendar_cols = augment_calendar_df(df=AirPassengersPanel, freq='M')
AirPassengerPanelCalendar.head()

# Plot the normalized month feature for the first airline.
plot_df = AirPassengerPanelCalendar[AirPassengerPanelCalendar.unique_id == 'Airline1'].set_index('ds')
plt.plot(plot_df['month'])
plt.grid()
plt.xlabel('Datestamp')
plt.ylabel('Normalized Month')
plt.show()


def get_indexer_raise_missing(idx: pd.Index, vals: List[str]) -> np.ndarray:
    """Return the positions of `vals` in `idx`, failing if any value is absent.

    **Parameters:**<br>
    `idx`: pandas.Index to search in.<br>
    `vals`: list of labels to locate.<br>

    **Returns:**<br>
    Array of integer positions, one per entry of `vals`, as produced by
    `idx.get_indexer(vals)`.

    **Raises:**<br>
    `ValueError`: if one or more of `vals` is not present in `idx`; the
    message lists the missing values.
    """
    idxs = idx.get_indexer(vals)
    # `get_indexer` marks labels it cannot find with -1.
    missing = [v for i, v in zip(idxs, vals) if i == -1]
    if missing:
        raise ValueError(f'The following values are missing from the index: {missing}')
    return idxs


# Give us a ⭐ on Github