Temperature Forecasting (Supplementary Notebook)¶
Similar to what was done in previous sections, this notebook applies the methodology used in the M5 Forecasting notebook to the Jena Climate dataset. Specifically, we will be forecasting the temperature variable.
1. Jena Climate Dataset¶
Let’s load the dataset and examine it.
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
### CREATE X,Y ####
def create_xy(series, window_size, prediction_horizon, shuffle = False):
x = []
y = []
for i in range(0, len(series)):
if len(series[(i + window_size):(i + window_size + prediction_horizon)]) < prediction_horizon:
break
x.append(series[i:(i + window_size)])
y.append(series[(i + window_size):(i + window_size + prediction_horizon)])
x = np.array(x)
y = np.array(y)
return x,y
plt.rcParams['figure.figsize'] = [15, 5]
np.set_printoptions(precision = 6, suppress = True)
df = pd.read_csv('../data/jena_climate_2009_2016.csv')
df.shape
(420551, 15)
df.head(10)
Date Time | p (mbar) | T (degC) | Tpot (K) | Tdew (degC) | rh (%) | VPmax (mbar) | VPact (mbar) | VPdef (mbar) | sh (g/kg) | H2OC (mmol/mol) | rho (g/m**3) | wv (m/s) | max. wv (m/s) | wd (deg) | |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
0 | 01.01.2009 00:10:00 | 996.52 | -8.02 | 265.40 | -8.90 | 93.3 | 3.33 | 3.11 | 0.22 | 1.94 | 3.12 | 1307.75 | 1.03 | 1.75 | 152.3 |
1 | 01.01.2009 00:20:00 | 996.57 | -8.41 | 265.01 | -9.28 | 93.4 | 3.23 | 3.02 | 0.21 | 1.89 | 3.03 | 1309.80 | 0.72 | 1.50 | 136.1 |
2 | 01.01.2009 00:30:00 | 996.53 | -8.51 | 264.91 | -9.31 | 93.9 | 3.21 | 3.01 | 0.20 | 1.88 | 3.02 | 1310.24 | 0.19 | 0.63 | 171.6 |
3 | 01.01.2009 00:40:00 | 996.51 | -8.31 | 265.12 | -9.07 | 94.2 | 3.26 | 3.07 | 0.19 | 1.92 | 3.08 | 1309.19 | 0.34 | 0.50 | 198.0 |
4 | 01.01.2009 00:50:00 | 996.51 | -8.27 | 265.15 | -9.04 | 94.1 | 3.27 | 3.08 | 0.19 | 1.92 | 3.09 | 1309.00 | 0.32 | 0.63 | 214.3 |
5 | 01.01.2009 01:00:00 | 996.50 | -8.05 | 265.38 | -8.78 | 94.4 | 3.33 | 3.14 | 0.19 | 1.96 | 3.15 | 1307.86 | 0.21 | 0.63 | 192.7 |
6 | 01.01.2009 01:10:00 | 996.50 | -7.62 | 265.81 | -8.30 | 94.8 | 3.44 | 3.26 | 0.18 | 2.04 | 3.27 | 1305.68 | 0.18 | 0.63 | 166.5 |
7 | 01.01.2009 01:20:00 | 996.50 | -7.62 | 265.81 | -8.36 | 94.4 | 3.44 | 3.25 | 0.19 | 2.03 | 3.26 | 1305.69 | 0.19 | 0.50 | 118.6 |
8 | 01.01.2009 01:30:00 | 996.50 | -7.91 | 265.52 | -8.73 | 93.8 | 3.36 | 3.15 | 0.21 | 1.97 | 3.16 | 1307.17 | 0.28 | 0.75 | 188.5 |
9 | 01.01.2009 01:40:00 | 996.53 | -8.43 | 264.99 | -9.34 | 93.1 | 3.23 | 3.00 | 0.22 | 1.88 | 3.02 | 1309.85 | 0.59 | 0.88 | 185.0 |
series = df['T (degC)'].iloc[5:]
series.plot()
plt.show()
print(series.shape)
(420546,)
series = series.iloc[::6]
series.plot()
plt.show()
print(series.shape)
(70091,)
series = df.iloc[::6, 1:]
series
p (mbar) | T (degC) | Tpot (K) | Tdew (degC) | rh (%) | VPmax (mbar) | VPact (mbar) | VPdef (mbar) | sh (g/kg) | H2OC (mmol/mol) | rho (g/m**3) | wv (m/s) | max. wv (m/s) | wd (deg) | |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
0 | 996.52 | -8.02 | 265.40 | -8.90 | 93.30 | 3.33 | 3.11 | 0.22 | 1.94 | 3.12 | 1307.75 | 1.03 | 1.75 | 152.3 |
6 | 996.50 | -7.62 | 265.81 | -8.30 | 94.80 | 3.44 | 3.26 | 0.18 | 2.04 | 3.27 | 1305.68 | 0.18 | 0.63 | 166.5 |
12 | 996.63 | -8.85 | 264.57 | -9.70 | 93.50 | 3.12 | 2.92 | 0.20 | 1.82 | 2.93 | 1312.11 | 0.16 | 0.50 | 158.3 |
18 | 996.87 | -8.84 | 264.56 | -9.69 | 93.50 | 3.13 | 2.92 | 0.20 | 1.83 | 2.93 | 1312.37 | 0.07 | 0.25 | 129.3 |
24 | 997.05 | -9.23 | 264.15 | -10.25 | 92.20 | 3.03 | 2.79 | 0.24 | 1.74 | 2.80 | 1314.62 | 0.10 | 0.38 | 203.9 |
... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
420522 | 1002.08 | -1.40 | 271.59 | -6.10 | 70.20 | 5.51 | 3.87 | 1.64 | 2.40 | 3.86 | 1282.68 | 1.08 | 1.68 | 207.5 |
420528 | 1001.42 | -2.15 | 270.90 | -7.08 | 68.77 | 5.21 | 3.59 | 1.63 | 2.23 | 3.58 | 1285.50 | 0.79 | 1.24 | 184.3 |
420534 | 1001.05 | -2.61 | 270.47 | -6.97 | 71.80 | 5.04 | 3.62 | 1.42 | 2.25 | 3.61 | 1287.20 | 0.77 | 1.64 | 129.1 |
420540 | 1000.51 | -3.22 | 269.90 | -7.63 | 71.40 | 4.81 | 3.44 | 1.38 | 2.14 | 3.44 | 1289.50 | 0.85 | 1.54 | 207.8 |
420546 | 1000.07 | -4.05 | 269.10 | -8.13 | 73.10 | 4.52 | 3.30 | 1.22 | 2.06 | 3.30 | 1292.98 | 0.67 | 1.52 | 240.0 |
70092 rows × 14 columns
### HYPERPARAMETERS ###
window_size = 240
prediction_horizon = 1
### TRAIN TEST VAL SPLIT ###
train_series = series.iloc[:35045, 1]
val_series = series.iloc[35045:52569, 1]
test_series = series.iloc[52569:, 1]
train_x, train_y = create_xy(train_series.to_numpy(), window_size, prediction_horizon)
val_x, val_y = create_xy(val_series.to_numpy(), window_size, prediction_horizon)
test_x, test_y = create_xy(test_series.to_numpy(), window_size, prediction_horizon)
train_y = train_y.flatten()
val_y = val_y.flatten()
test_y = test_y.flatten()
print(train_x.shape)
print(train_y.shape)
print(val_x.shape)
print(val_y.shape)
print(test_x.shape)
print(test_y.shape)
(34805, 240)
(34805,)
(17284, 240)
(17284,)
(17283, 240)
(17283,)
test_series.plot()
plt.show()
LightGBM¶
import lightgbm as lgb
model = lgb.LGBMRegressor()
model.fit(train_x, train_y,
eval_metric = 'l1',
eval_set = [(val_x, val_y)],
early_stopping_rounds = 100,
verbose = 0)
LGBMRegressor()
forecast = model.predict(test_x)
print(' LightGBM MAE: %.4f' % (np.mean(np.abs(forecast - test_y))))
#series[-test_size:].plot(marker = 'o', linestyle = '--')
#plt.plot(forecast, marker = 'o', linestyle = '--')
#plt.show()
LightGBM MAE: 0.5319
### HYPERPARAMETERS ###
window_size = 240
prediction_horizon = 24
### TRAIN TEST VAL SPLIT ###
train_series = series.iloc[:35045, 1]
val_series = series.iloc[35045:52569, 1]
test_series = series.iloc[52569:, 1]
train_x, train_y = create_xy(train_series.to_numpy(), window_size, prediction_horizon)
val_x, val_y = create_xy(val_series.to_numpy(), window_size, prediction_horizon)
test_x, test_y = create_xy(test_series.to_numpy(), window_size, prediction_horizon)
print(train_x.shape)
print(train_y.shape)
print(val_x.shape)
print(val_y.shape)
print(test_x.shape)
print(test_y.shape)
(34782, 240)
(34782, 24)
(17261, 240)
(17261, 24)
(17260, 240)
(17260, 24)
### LightGBM
recursive_x = test_x
forecast_ms = []
for j in range(prediction_horizon):
pred = model.predict(recursive_x)
recursive_x = np.hstack((recursive_x[:, 1:], pred[:, np.newaxis]))
forecast_ms.append(pred)
forecast_ms = np.asarray(forecast_ms).T
print('LightGBM')
print('MAE:', np.mean(np.abs(test_y - forecast_ms)))
print('RMSE:', np.sqrt(np.mean((test_y - forecast_ms)**2)))
print('')
### Naive
tmp = test_y[:-1, -1]
n_forecast = []
for val in tmp:
n_forecast.append(np.repeat(val, 24))
n_forecast = np.asarray(n_forecast)
print('Naive')
print('MAE:', np.mean(np.abs(test_y[1:, :] - n_forecast)))
print('RMSE:', np.sqrt(np.mean((test_y[1:, :] - n_forecast)**2)))
print('')
### S.Naive
new_test = np.vstack(np.array_split(test_series.to_numpy()[3:], 730))
print('Seasonal Naive')
print('MAE:', np.mean(np.abs(new_test[1:, :] - new_test[:-1, :])))
print('RMSE:', np.sqrt(np.mean((new_test[1:, :] - new_test[:-1, :])**2)))
LightGBM
MAE: 2.083323119819872
RMSE: 2.7826767013777443
Naive
MAE: 3.184767754022055
RMSE: 4.457663461439199
Seasonal Naive
MAE: 2.613232738911751
RMSE: 3.4171095400347133
Summary¶
Using LightGBM with a recursive forecasting strategy demonstrates that we can achieve an MAE of about 2.08°C without hyperparameter tuning.