III. Data Correlation
Energy Prices Data Correlation
# Restore the working frame from the backup. Take a copy rather than a second
# reference: later cells assign to `df.index` in place, and with a plain alias
# that assignment would also rewrite the index of `df_backup`.
df = df_backup.copy()
import matplotlib.pyplot as plt
import seaborn as sns
import pandas as pd
# Columns compared in this cell: consumption on the x-axis, price on the y-axis.
x_col = "Total Energy Consumption (kWh)"
y_col = "Secondary Control Energy Prices (€/MWh)"

# Keep only rows where both values are present. No resampling happens here;
# the rows are presumably already weekly observations -- TODO confirm upstream.
df_weekly = df.loc[:, [x_col, y_col]].dropna()

# Scatter plot with a fitted regression line (drawn in green).
fig, ax = plt.subplots(figsize=(7, 5))
sns.regplot(
    data=df_weekly,
    x=x_col,
    y=y_col,
    line_kws={"color": "green"},
    ax=ax,
)
ax.set_title(f"Weekly Correlation: {x_col} vs {y_col}(Price of Energy for the week)")
ax.set_xlabel(x_col)
ax.set_ylabel(y_col)
ax.grid(True)
fig.tight_layout()
plt.show()

# Pearson correlation (the pandas default) between the two columns.
correlation = df_weekly.corr()[x_col][y_col]
print(f"Pearson correlation: {correlation:.4f}")
Pearson correlation: -0.5427
import matplotlib.pyplot as plt
import seaborn as sns
import pandas as pd
# Columns compared in this cell: lagged consumption (x) against current price (y).
x_col = "Total Energy Consumption (kWh)"
y_col = "Secondary Control Energy Prices (€/MWh)"

# Number of rows to lag consumption by. One constant drives the shift, the
# title, the axis label and the printed message so they cannot disagree.
# Assumes consecutive rows are consecutive weeks in chronological order --
# TODO confirm against how `df` is built.
lag_weeks = 2

# Shift consumption down by `lag_weeks` rows so each price is paired with the
# consumption observed `lag_weeks` rows earlier, then drop the rows that have
# no lagged partner (or any other missing value).
df_shifted = df[[x_col, y_col]].copy()
df_shifted[x_col] = df_shifted[x_col].shift(lag_weeks)
df_shifted = df_shifted.dropna()

# Scatter plot with a fitted regression line (drawn in green).
plt.figure(figsize=(7, 5))
sns.regplot(data=df_shifted, x=x_col, y=y_col, line_kws={"color": "green"})
plt.title(
    f"Lag-{lag_weeks} Weekly Correlation: "
    f"{lag_weeks} Weeks ago's {x_col} vs This Week's {y_col}"
)
plt.xlabel(f"{x_col} (Lagged by {lag_weeks} Weeks)")
plt.ylabel(y_col)
plt.grid(True)
plt.tight_layout()
plt.show()

# Pearson correlation (the pandas default) between lagged consumption and price.
correlation = df_shifted.corr().loc[y_col, x_col]
print(f"Lag-{lag_weeks} Pearson correlation: {correlation:.4f}")
Lag-2 Pearson correlation: -0.5794
Weather Data Correlation
# Step 1: Wrap both signals in Series indexed by timestamp
temp_series = pd.Series(weekly_temp.values, index=pd.DatetimeIndex(weekly_temp.index))
energy_series = pd.Series(energy_consumption_data, index=pd.DatetimeIndex(time_index))

# Step 2: Align to common date range
common_start = max(temp_series.index.min(), energy_series.index.min())
common_end = min(temp_series.index.max(), energy_series.index.max())
temp_series = temp_series.loc[common_start:common_end]
energy_series = energy_series.loc[common_start:common_end]

# Step 3: Align on timestamps so both series hold exactly the same dates.
# Truncating by position (iloc[:n]) would let the scatter pair values that
# belong to different dates, while Series.corr matches on index labels, so
# the plot and the coefficient could describe different pairs.
temp_series, energy_series = temp_series.align(energy_series, join="inner")
min_len = len(temp_series)  # both series have this length after alignment

# Step 4: Plot correlation scatter
plt.figure(figsize=(8, 5))
plt.scatter(temp_series, energy_series, alpha=0.7)
plt.xlabel("Monthly Average Temperature (Kelvin) (Assigned Weekly)")
plt.ylabel("Weekly Energy Consumption (kWh)")
plt.title("Correlation: Temperature Average (Kelvin) vs Energy Consumption (KWh)")
plt.grid(True)
plt.tight_layout()
plt.show()

# Step 5: Compute and display correlation coefficient (Pearson by default),
# using the same date-matched pairs that were plotted above
corr_coef = temp_series.corr(energy_series)
print(f"Correlation coefficient: {corr_coef:.3f}")
Correlation coefficient: -0.777
Other Data Correlation
# Step 1: Index the frame by timestamp. `time_index` is cut to len(df) so the
# lengths match; this still raises if `time_index` is shorter than `df`.
df.index = pd.DatetimeIndex(time_index[:len(df)])

# Step 2: Select the first 5 numeric variables and Total Energy Consumption
selected_cols = df.select_dtypes(include='number').columns[:5].tolist()

# Ensure 'Total Energy Consumption (kWh)' is included
reference_col = "Total Energy Consumption (kWh)"
if reference_col not in selected_cols:
    selected_cols.append(reference_col)

# Step 3: Filter and drop NaNs
df_subset = df[selected_cols].dropna()

# Step 4: Compute correlation with the reference column. The reference's own
# entry is dropped because its correlation with itself is not informative.
correlation_vector = df_subset.corr()[reference_col].drop(reference_col)
correlation_df = correlation_vector.to_frame(name=f"Correlation with '{reference_col}'")

# Step 5: Plot heatmap without white gridlines. Figure height grows with the
# number of variables shown (one heatmap row per variable).
plt.figure(figsize=(6, len(correlation_df) * 0.5 + 1))
sns.heatmap(
    correlation_df,
    annot=True,
    cmap="coolwarm",
    center=0,
    cbar=True,
    linewidths=0  # No white lines between cells
)
plt.title(f"Correlation of Other Variables with '{reference_col}'")
plt.tight_layout()
plt.show()