VI. White Noise Test of Residuals
Cumulative Periodogram of SARIMA
SARIMA(1,0,0)(1,0,0,52) Cumulative Periodogram
import numpy as np
import matplotlib.pyplot as plt
from scipy.signal import periodogram
from statsmodels.tsa.statespace.sarimax import SARIMAX
# --- Fit SARIMA(1,0,0)(1,0,0,52) model ---
# seasonal_order must be passed explicitly: with only `order`, SARIMAX fits a
# plain AR(1), which is not the seasonal model this section and the plot title
# describe. Re-run the cell after this change to refresh the recorded output.
model = SARIMAX(
    energy_consumption_data,
    order=(1, 0, 0),
    seasonal_order=(1, 0, 0, 52),
)
results = model.fit(disp=False)

# --- Get residuals ---
residuals = results.resid

# --- Periodogram with Hann window ---
freqs, power = periodogram(residuals, window='hann')

# --- Remove near-zero frequencies ---
# Keep only the bins strictly above 1/N, where N is the number of residuals.
min_freq = 1 / len(residuals)
mask = freqs > min_freq
freqs = freqs[mask]
power = power[mask]

# --- Cumulative periodogram ---
# Running sum of the retained power, normalised so the last value is 1.
cumulative_power = np.cumsum(power) / np.sum(power)
# Reference curve: evenly spaced values from 0 to 1, one per retained bin.
expected = np.linspace(0, 1, len(cumulative_power))

# --- KS statistic (max deviation from ideal)
ks_stat = np.max(np.abs(cumulative_power - expected))
print(f"Max deviation (KS statistic): {ks_stat*100:.2f}%")

# --- Plot cumulative periodogram ---
plt.figure(figsize=(10, 5))
plt.step(freqs, cumulative_power, where='post', label="Cumulative Periodogram", color='blue', lw=2)
# The reference runs from (freqs[0], 0) to (freqs[-1], 1); it is a straight
# line across the retained frequency range, not the identity y = x.
plt.plot([freqs[0], freqs[-1]], [0, 1], 'r--', label="Ideal White Noise (straight line)")
plt.xlabel("Frequency (cycles/sample)")
plt.ylabel("Cumulative Proportion of Power")
plt.title("Cumulative Periodogram of SARIMA Residuals")
plt.grid(True, linestyle='--', alpha=0.6)
plt.ylim(0, 1.05)
plt.legend()
plt.tight_layout()
plt.show()
Max deviation (KS statistic): 12.89%
KS results on training data
import numpy as np
import matplotlib.pyplot as plt
from scipy.signal import periodogram
from statsmodels.tsa.statespace.sarimax import SARIMAX
# --- Candidate models: label -> (order, seasonal_order) ---
NO_SEASONAL = (0, 0, 0, 0)
SEASONAL_AR_52 = (1, 0, 0, 52)
model_configs = {
    "ARIMA(1,0,0)": ((1, 0, 0), NO_SEASONAL),
    "ARIMA(1,1,0)": ((1, 1, 0), NO_SEASONAL),
    "ARIMA(1,0,1)": ((1, 0, 1), NO_SEASONAL),
    "SARIMA(1,0,0)(1,0,0,52)": ((1, 0, 0), SEASONAL_AR_52),
    "SARIMAX(1,0,0)(1,0,0,52) + Temp": ((1, 0, 0), SEASONAL_AR_52),
}

# KS statistic (in percent) for every model that was scored, keyed by label.
ks_results = {}

# --- Fit each candidate and score its residuals ---
for label, (order, seasonal_order) in model_configs.items():
    print(f"Fitting model: {label}")

    # Only labels containing "Temp" receive the temperature regressor;
    # exog=None is the same as not passing exog at all.
    exog = weekly_temp if "Temp" in label else None
    model = SARIMAX(
        energy_consumption_data,
        exog=exog,
        order=order,
        seasonal_order=seasonal_order,
    )
    results = model.fit(disp=False)
    residuals = results.resid

    # Models with fewer than 60 residuals are reported and left unscored.
    if len(residuals) < 60:
        print(f"Skipping {label}: too few residuals.")
        continue

    # Hann-windowed periodogram, keeping only the bins strictly above 1/N.
    freqs, power = periodogram(residuals, window='hann')
    min_freq = 1 / len(residuals)
    mask = freqs > min_freq
    freqs, power = freqs[mask], power[mask]

    # Normalised cumulative power against an evenly spaced 0..1 reference.
    cumulative_power = power.cumsum() / power.sum()
    expected = np.linspace(0, 1, cumulative_power.size)
    ks_stat = np.abs(cumulative_power - expected).max()

    ks_results[label] = ks_stat * 100
    print(f"{label}: KS = {ks_stat*100:.2f}%")

# --- Bar chart of KS stats, smallest first ---
labels = sorted(ks_results, key=ks_results.get)
ks_values = [ks_results[name] for name in labels]

plt.figure(figsize=(10, 6))
bars = plt.barh(labels, ks_values, color='darkcyan')

# Write each bar's value just to the right of its end, vertically centred.
for bar in bars:
    width = bar.get_width()
    y_center = bar.get_y() + bar.get_height() / 2
    plt.text(width + 0.5, y_center, f"{width:.2f}%", va='center', fontsize=10)

# Fixed reference line at 15%, captioned as the white-noise threshold.
threshold_pct = 15
plt.axvline(threshold_pct, color='black', linestyle='--')
plt.text(threshold_pct + 0.5, -0.5, 'White Noise Threshold (~15%)', ha='left', va='bottom', fontsize=9)

plt.xlabel("KS Statistic (%)")
plt.title("KS Statistic Comparison: ARIMA vs SARIMA vs SARIMAX")
plt.grid(axis='x', linestyle='--', alpha=0.6)
plt.tight_layout()
plt.show()
Fitting model: ARIMA(1,0,0) ARIMA(1,0,0): KS = 12.89% Fitting model: ARIMA(1,1,0) ARIMA(1,1,0): KS = 10.87% Fitting model: ARIMA(1,0,1) ARIMA(1,0,1): KS = 15.91% Fitting model: SARIMA(1,0,0)(1,0,0,52) SARIMA(1,0,0)(1,0,0,52): KS = 12.11% Fitting model: SARIMAX(1,0,0)(1,0,0,52) + Temp SARIMAX(1,0,0)(1,0,0,52) + Temp: KS = 10.47%
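We conclude that AR(1), ARIMA(1,1,0), SARIMA(1,0,0)(1,0,0,52), and SARIMAX(1,0,0)(1,0,0,52) with temperature as a future covariate produce approximately white-noise residuals when tested on the full training data: their KS statistics (12.89%, 10.87%, 12.11%, and 10.47%, respectively) all fall below the ~15% threshold, whereas ARIMA(1,0,1) exceeds it at 15.91%. These four might therefore be suitable models.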
We can hypothesize that the SARIMAX model might be the best-fitting model, because its KS statistic (10.47%) is the lowest of the candidates, i.e. its residuals are the closest to white noise.