In [65]:
import pandas as pd
In [66]:
df = pd.read_csv("dataset.csv", parse_dates=['Service End Date'], low_memory=False)
In [67]:
df.rename(columns={'Service End Date': 'Date'}, inplace=True)
Monthly Analysis¶
In [68]:
df['Month'] = df['Date'].dt.to_period('M')
monthly_avg = df.groupby('Month')['Consumption (KWH)'].mean().reset_index()
monthly_avg['Month'] = monthly_avg['Month'].dt.to_timestamp()
monthly_df = monthly_avg[['Month', 'Consumption (KWH)']]
monthly_df.columns = ['Month', 'Average_Consumption']
In [69]:
monthly_df.head()
Out[69]:
| Month | Average_Consumption | |
|---|---|---|
| 0 | 2010-01-01 | 45446.030782 |
| 1 | 2010-02-01 | 38824.279283 |
| 2 | 2010-03-01 | 36652.063380 |
| 3 | 2010-04-01 | 36823.726747 |
| 4 | 2010-05-01 | 36918.425453 |
In [70]:
monthly_df.info()
<class 'pandas.core.frame.DataFrame'> RangeIndex: 159 entries, 0 to 158 Data columns (total 2 columns): # Column Non-Null Count Dtype --- ------ -------------- ----- 0 Month 159 non-null datetime64[ns] 1 Average_Consumption 159 non-null float64 dtypes: datetime64[ns](1), float64(1) memory usage: 2.6 KB
In [71]:
import seaborn as sns
import matplotlib.pyplot as plt
sns.set_style("darkgrid")
plt.figure(figsize=(14, 5))
plt.plot(monthly_df['Month'], monthly_df['Average_Consumption'], linewidth=2, color='blue')
plt.title("Average Monthly Electricity Consumption (KWH)", fontsize=14)
plt.ylabel('Consumption (KWH)', fontsize=12)
plt.xticks(rotation=45)
plt.grid(True)
plt.tight_layout()
plt.show()
In [72]:
monthly_df.sort_values(by='Average_Consumption', ascending=False).head(10)
Out[72]:
| Month | Average_Consumption | |
|---|---|---|
| 12 | 2011-01-01 | 150980.000000 |
| 6 | 2010-07-01 | 63708.211823 |
| 7 | 2010-08-01 | 61722.964228 |
| 19 | 2012-07-01 | 53037.934614 |
| 31 | 2013-07-01 | 48941.412091 |
| 5 | 2010-06-01 | 48028.773140 |
| 20 | 2012-08-01 | 48004.884395 |
| 8 | 2010-09-01 | 47695.361916 |
| 0 | 2010-01-01 | 45446.030782 |
| 56 | 2015-08-01 | 43139.023571 |
In [73]:
df['Year'] = df['Date'].dt.year
df.groupby('Year').size()
Out[73]:
Year 2010.0 29267 2011.0 1 2012.0 35207 2013.0 38534 2014.0 39838 2015.0 40499 2016.0 40792 2017.0 38482 2018.0 7 2019.0 38799 2020.0 54382 2021.0 41366 2022.0 41481 2023.0 41105 2024.0 41264 2025.0 3 dtype: int64
In [74]:
monthly_df = monthly_df[~monthly_df['Month'].dt.year.isin([2011, 2018, 2025])]
In [75]:
sns.set_style("darkgrid")
plt.figure(figsize=(14, 5))
plt.plot(monthly_df['Month'], monthly_df['Average_Consumption'], linewidth=2, color='blue')
plt.title("Average Monthly Electricity Consumption (KWH)", fontsize=14)
plt.ylabel('Consumption (KWH)', fontsize=12)
plt.xticks(rotation=45)
plt.grid(True)
plt.tight_layout()
plt.show()
Monthly Predictions¶
In [76]:
from prophet import Prophet
monthly_df.rename(columns={'Month': 'ds', 'Average_Consumption': 'y'}, inplace=True)
In [77]:
model = Prophet()
model.fit(monthly_df)
12:17:25 - cmdstanpy - INFO - Chain [1] start processing 12:17:25 - cmdstanpy - INFO - Chain [1] done processing
Out[77]:
<prophet.forecaster.Prophet at 0x18d35fffbe0>
EC 1 month in the future -¶
In [78]:
future_months = model.make_future_dataframe(periods=1, freq='M')
In [79]:
forecast = model.predict(future_months)
In [80]:
model.plot(forecast)
plt.title("Forecasted Electricity Consumption")
plt.xlabel("Date")
plt.ylabel("Consumption (KWH)")
plt.show()
In [81]:
future_predictions = forecast[forecast['ds'] > monthly_df['ds'].max()]
future_predictions = future_predictions[['ds', 'yhat', 'yhat_lower', 'yhat_upper']]
print(future_predictions)
ds yhat yhat_lower yhat_upper 156 2024-12-31 24853.644731 22396.671723 27159.548383
EC 6 months in the future -¶
In [82]:
future_months = model.make_future_dataframe(periods=6, freq='M')
In [83]:
forecast = model.predict(future_months)
In [84]:
model.plot(forecast)
plt.title("Forecasted Electricity Consumption")
plt.xlabel("Date")
plt.ylabel("Consumption (KWH)")
plt.show()
In [85]:
future_predictions = forecast[forecast['ds'] > monthly_df['ds'].max()]
future_predictions = future_predictions[['ds', 'yhat', 'yhat_lower', 'yhat_upper']]
print(future_predictions)
ds yhat yhat_lower yhat_upper 156 2024-12-31 24853.644731 22358.960918 27292.681836 157 2025-01-31 22563.340186 20259.413055 24861.266598 158 2025-02-28 21609.572339 19167.911512 24139.242252 159 2025-03-31 21144.182921 18727.399242 23629.570751 160 2025-04-30 23317.980708 20749.479946 25851.228456 161 2025-05-31 25861.853085 23266.155612 28303.532404
EC 9 months in the future -¶
In [86]:
future_months = model.make_future_dataframe(periods=9, freq='M')
In [87]:
forecast = model.predict(future_months)
In [88]:
model.plot(forecast)
plt.title("Forecasted Electricity Consumption")
plt.xlabel("Date")
plt.ylabel("Consumption (KWH)")
plt.show()
In [89]:
future_predictions = forecast[forecast['ds'] > monthly_df['ds'].max()]
future_predictions = future_predictions[['ds', 'yhat', 'yhat_lower', 'yhat_upper']]
print(future_predictions)
ds yhat yhat_lower yhat_upper 156 2024-12-31 24853.644731 22519.403477 27284.775834 157 2025-01-31 22563.340186 20033.318795 24976.815448 158 2025-02-28 21609.572339 19086.890279 23989.229733 159 2025-03-31 21144.182921 18698.058881 23582.192642 160 2025-04-30 23317.980708 20810.945199 25721.861811 161 2025-05-31 25861.853085 23290.034812 28333.393047 162 2025-06-30 35691.667091 33244.552557 38169.453350 163 2025-07-31 36500.110218 34248.294788 38947.663284 164 2025-08-31 32938.516848 30681.172682 35229.188894
In [ ]:
Yearly Analysis¶
In [90]:
df['Year'] = df['Date'].dt.to_period('Y')
yearly_total = df.groupby('Year')['Consumption (KWH)'].sum().reset_index()
yearly_total['Year'] = yearly_total['Year'].dt.to_timestamp()
yearly_df = yearly_total[['Year', 'Consumption (KWH)']]
yearly_df.columns = ['Year', 'Total_Consumption']
In [91]:
yearly_df.head()
Out[91]:
| Year | Total_Consumption | |
|---|---|---|
| 0 | 2010-01-01 | 1.305557e+09 |
| 1 | 2011-01-01 | 1.509800e+05 |
| 2 | 2012-01-01 | 1.279641e+09 |
| 3 | 2013-01-01 | 1.260168e+09 |
| 4 | 2014-01-01 | 1.222798e+09 |
In [92]:
yearly_df.info()
<class 'pandas.core.frame.DataFrame'> RangeIndex: 16 entries, 0 to 15 Data columns (total 2 columns): # Column Non-Null Count Dtype --- ------ -------------- ----- 0 Year 16 non-null datetime64[ns] 1 Total_Consumption 16 non-null float64 dtypes: datetime64[ns](1), float64(1) memory usage: 384.0 bytes
In [93]:
plt.figure(figsize=(14, 5))
plt.plot(yearly_df['Year'], yearly_df['Total_Consumption'], linewidth=2, color='blue')
plt.title("Yearly Electricity Consumption (KWH)", fontsize=14)
plt.ylabel('Consumption (KWH)', fontsize=12)
plt.xlabel('Year', fontsize=12)
plt.xticks(rotation=45)
plt.grid(True)
plt.tight_layout()
plt.show()
In [94]:
yearly_df = yearly_df[~yearly_df['Year'].dt.year.isin([2011, 2018, 2025])]
In [95]:
sns.set_style("darkgrid")
plt.figure(figsize=(14, 5))
plt.plot(yearly_df['Year'], yearly_df['Total_Consumption'], linewidth=2, color='blue')
plt.title("Yearly Electricity Consumption (KWH)", fontsize=14)
plt.ylabel('Consumption (KWH)', fontsize=12)
plt.xlabel('Year', fontsize=12)
plt.xticks(rotation=45)
plt.grid(True)
plt.tight_layout()
plt.show()
Yearly Predictions¶
In [96]:
yearly_df.rename(columns={'Year': 'ds', 'Total_Consumption': 'y'}, inplace=True)
In [97]:
model = Prophet()
model.fit(yearly_df)
12:18:00 - cmdstanpy - INFO - Chain [1] start processing 12:18:00 - cmdstanpy - INFO - Chain [1] done processing
Out[97]:
<prophet.forecaster.Prophet at 0x18d3646ee80>
EC 1 year in the future -¶
In [98]:
future_years = model.make_future_dataframe(periods=1, freq='Y')
In [99]:
forecast = model.predict(future_years)
In [100]:
model.plot(forecast)
plt.title("Forecasted Electricity Consumption")
plt.xlabel("Date")
plt.ylabel("Consumption (KWH)")
plt.show()
In [101]:
future_predictions = forecast[forecast['ds'] > yearly_df['ds'].max()]
In [102]:
future_predictions = future_predictions[['ds', 'yhat', 'yhat_lower', 'yhat_upper']]
print(future_predictions)
ds yhat yhat_lower yhat_upper 13 2024-12-31 1.247297e+09 1.128906e+09 1.374092e+09
EC 10 years in the future -¶
In [103]:
future_years = model.make_future_dataframe(periods=10, freq='Y')
In [104]:
forecast = model.predict(future_years)
In [105]:
model.plot(forecast)
plt.title("Forecasted Electricity Consumption")
plt.xlabel("Date")
plt.ylabel("Consumption (KWH)")
plt.show()
In [106]:
future_predictions = forecast[forecast['ds'] > yearly_df['ds'].max()]
In [107]:
future_predictions = future_predictions[['ds', 'yhat', 'yhat_lower', 'yhat_upper']]
print(future_predictions)
ds yhat yhat_lower yhat_upper 13 2024-12-31 1.247297e+09 1.115981e+09 1.369104e+09 14 2025-12-31 1.313726e+09 1.188932e+09 1.431386e+09 15 2026-12-31 1.394505e+09 1.270925e+09 1.530247e+09 16 2027-12-31 1.489491e+09 1.361138e+09 1.614006e+09 17 2028-12-31 1.198532e+09 1.074770e+09 1.323877e+09 18 2029-12-31 1.264961e+09 1.139831e+09 1.392290e+09 19 2030-12-31 1.345741e+09 1.221208e+09 1.475364e+09 20 2031-12-31 1.440726e+09 1.318782e+09 1.565513e+09 21 2032-12-31 1.149767e+09 1.015143e+09 1.273040e+09 22 2033-12-31 1.216197e+09 1.095937e+09 1.337940e+09
EC 20 years in the future -¶
In [108]:
future_years = model.make_future_dataframe(periods=20, freq='Y')
In [109]:
forecast = model.predict(future_years)
In [110]:
model.plot(forecast)
plt.title("Forecasted Electricity Consumption")
plt.xlabel("Date")
plt.ylabel("Consumption (KWH)")
plt.show()
In [111]:
future_predictions = forecast[forecast['ds'] > yearly_df['ds'].max()]
In [112]:
future_predictions = future_predictions[['ds', 'yhat', 'yhat_lower', 'yhat_upper']]
print(future_predictions)
ds yhat yhat_lower yhat_upper 13 2024-12-31 1.247297e+09 1.126593e+09 1.376678e+09 14 2025-12-31 1.313726e+09 1.191309e+09 1.437901e+09 15 2026-12-31 1.394505e+09 1.270946e+09 1.517047e+09 16 2027-12-31 1.489491e+09 1.371474e+09 1.616276e+09 17 2028-12-31 1.198532e+09 1.074151e+09 1.325576e+09 18 2029-12-31 1.264961e+09 1.136338e+09 1.380003e+09 19 2030-12-31 1.345741e+09 1.220270e+09 1.480249e+09 20 2031-12-31 1.440726e+09 1.316787e+09 1.566012e+09 21 2032-12-31 1.149767e+09 1.021266e+09 1.274194e+09 22 2033-12-31 1.216197e+09 1.096445e+09 1.333709e+09 23 2034-12-31 1.296976e+09 1.175606e+09 1.414110e+09 24 2035-12-31 1.391961e+09 1.270857e+09 1.520130e+09 25 2036-12-31 1.101002e+09 9.747827e+08 1.227193e+09 26 2037-12-31 1.167432e+09 1.031024e+09 1.292726e+09 27 2038-12-31 1.248211e+09 1.123664e+09 1.378456e+09 28 2039-12-31 1.343196e+09 1.220753e+09 1.464241e+09 29 2040-12-31 1.052238e+09 9.222902e+08 1.173694e+09 30 2041-12-31 1.118667e+09 9.902822e+08 1.239756e+09 31 2042-12-31 1.199446e+09 1.065370e+09 1.324158e+09 32 2043-12-31 1.294432e+09 1.173018e+09 1.416951e+09
In [ ]:
Tuning the Prophet Model¶
In [113]:
models = {}
Default -¶
In [50]:
m_default = Prophet()
m_default.fit(monthly_df)
future = m_default.make_future_dataframe(periods=9, freq='M')
forecast_default = m_default.predict(future)
models['Default'] = forecast_default
12:10:05 - cmdstanpy - INFO - Chain [1] start processing 12:10:05 - cmdstanpy - INFO - Chain [1] done processing
Tuning Growth -¶
In [51]:
monthly_df['cap'] = monthly_df['y'].max() * 1.2
future_growth = future.copy()
future_growth['cap'] = monthly_df['cap'].iloc[0]
m_growth = Prophet(growth='logistic')
m_growth.fit(monthly_df)
forecast_growth = m_growth.predict(future_growth)
models['Growth (logistic)'] = forecast_growth
12:10:06 - cmdstanpy - INFO - Chain [1] start processing 12:10:06 - cmdstanpy - INFO - Chain [1] done processing
Tuning Seasonality -¶
In [52]:
m_seasonal = Prophet()
m_seasonal.add_seasonality(name='monthly_custom', period=30.5, fourier_order=5)
m_seasonal.fit(monthly_df)
forecast_seasonal = m_seasonal.predict(future)
models['Seasonality'] = forecast_seasonal
12:10:07 - cmdstanpy - INFO - Chain [1] start processing 12:10:07 - cmdstanpy - INFO - Chain [1] done processing
Tuning Changepoints -¶
In [53]:
m_changepoints = Prophet(n_changepoints=30, changepoint_prior_scale=0.1)
m_changepoints.fit(monthly_df)
forecast_cp = m_changepoints.predict(future)
models['Changepoints'] = forecast_cp
12:10:08 - cmdstanpy - INFO - Chain [1] start processing 12:10:08 - cmdstanpy - INFO - Chain [1] done processing
Results -¶
In [54]:
last_date = monthly_df['ds'].max()
future_predictions = models['Default'][models['Default']['ds'] > last_date][['ds', 'yhat']].copy()
future_predictions.rename(columns={'yhat': 'Default'}, inplace=True)
for model_name, forecast in models.items():
if model_name != 'Default':
temp = forecast[forecast['ds'] > last_date][['ds', 'yhat']].copy()
temp.rename(columns={'yhat': model_name}, inplace=True)
future_predictions = future_predictions.merge(temp, on='ds', how='left')
future_predictions = future_predictions.round(2)
print(future_predictions)
ds Default Growth (logistic) Seasonality Changepoints 0 2024-12-31 24853.64 24728.90 24658.18 24864.13 1 2025-01-31 22563.34 22418.57 21359.47 22501.32 2 2025-02-28 21609.57 21315.89 33368.67 21434.45 3 2025-03-31 21144.18 20903.97 28485.90 20938.85 4 2025-04-30 23317.98 22906.50 35150.36 22859.16 5 2025-05-31 25861.85 25825.74 33510.52 25997.74 6 2025-06-30 35691.67 35771.13 48040.21 35967.51 7 2025-07-31 36500.11 36155.29 43759.04 36259.88 8 2025-08-31 32938.52 32451.21 36202.31 32575.29
Evaluating the models -¶
In [114]:
from sklearn.metrics import mean_absolute_error, mean_absolute_percentage_error, r2_score
# Default model
forecast_default = m_default.predict(monthly_df[['ds']])
y_true = monthly_df['y'].values
y_pred = forecast_default['yhat'].values
mae_default = mean_absolute_error(y_true, y_pred)
mape_default = mean_absolute_percentage_error(y_true, y_pred)
r2_default = r2_score(y_true, y_pred)
# Logistic model
monthly_df['cap'] = monthly_df['y'].max() * 1.2
forecast_logistic = m_growth.predict(monthly_df[['ds', 'cap']])
y_pred_logistic = forecast_logistic['yhat'].values
mae_logistic = mean_absolute_error(y_true, y_pred_logistic)
mape_logistic = mean_absolute_percentage_error(y_true, y_pred_logistic)
r2_logistic = r2_score(y_true, y_pred_logistic)
# Seasonal model
forecast_seasonal = m_seasonal.predict(monthly_df[['ds']])
y_pred_seasonal = forecast_seasonal['yhat'].values
mae_seasonal = mean_absolute_error(y_true, y_pred_seasonal)
mape_seasonal = mean_absolute_percentage_error(y_true, y_pred_seasonal)
r2_seasonal = r2_score(y_true, y_pred_seasonal)
# Changepoint model
forecast_cp = m_changepoints.predict(monthly_df[['ds']])
y_pred_cp = forecast_cp['yhat'].values
mae_cp = mean_absolute_error(y_true, y_pred_cp)
mape_cp = mean_absolute_percentage_error(y_true, y_pred_cp)
r2_cp = r2_score(y_true, y_pred_cp)
results_df = pd.DataFrame([
{'Model': 'Default', 'MAE': round(mae_default, 2), 'MAPE': round(mape_default, 4), 'R2': round(r2_default, 4)},
{'Model': 'Growth (logistic)', 'MAE': round(mae_logistic, 2), 'MAPE': round(mape_logistic, 4), 'R2': round(r2_logistic, 4)},
{'Model': 'Seasonality', 'MAE': round(mae_seasonal, 2), 'MAPE': round(mape_seasonal, 4), 'R2': round(r2_seasonal, 4)},
{'Model': 'Changepoints', 'MAE': round(mae_cp, 2), 'MAPE': round(mape_cp, 4), 'R2': round(r2_cp, 4)}
])
print(results_df)
Model MAE MAPE R2 0 Default 1357.00 0.0419 0.9369 1 Growth (logistic) 1457.61 0.0451 0.9197 2 Seasonality 1301.49 0.0399 0.9397 3 Changepoints 1352.28 0.0416 0.9378
Visualizing the results -¶
In [57]:
models = {
'Default': forecast_default,
'Growth (logistic)': forecast_growth,
'Seasonality': forecast_seasonal,
'Changepoints': forecast_cp
}
In [58]:
import matplotlib.pyplot as plt
plt.figure(figsize=(14, 6))
for model_name, forecast in models.items():
future_pred = forecast[forecast['ds'] > monthly_df['ds'].max()]
plt.plot(future_pred['ds'], future_pred['yhat'], label=model_name)
plt.title("Forecasted Monthly Electricity Consumption (All Models)")
plt.xlabel("Date")
plt.ylabel("Predicted Consumption (KWH)")
plt.xticks(rotation=45)
plt.legend()
plt.grid(True)
plt.tight_layout()
plt.show()
Monthly consumption, grouped by Borough¶
In [59]:
df['Date'] = pd.to_datetime(df['Date'])
monthly_borough = (
df.groupby([df['Borough'], df['Date'].dt.to_period('M')])['Consumption (KWH)']
.sum()
.reset_index()
)
monthly_borough['Date'] = monthly_borough['Date'].dt.to_timestamp()
In [60]:
from prophet import Prophet
borough_forecasts = {}
exclude = ['NON DEVELOPMENT FACILITY', 'FHA']
boroughs = monthly_borough[~monthly_borough['Borough'].isin(exclude)]['Borough'].unique()
for b in boroughs:
temp_df = monthly_borough[monthly_borough['Borough'] == b][['Date', 'Consumption (KWH)']].copy()
temp_df.rename(columns={'Date': 'ds', 'Consumption (KWH)': 'y'}, inplace=True)
m = Prophet()
m.fit(temp_df)
future = m.make_future_dataframe(periods=9, freq='M')
forecast = m.predict(future)
borough_forecasts[b] = forecast
fig = m.plot(forecast)
plt.title(f"Forecast for {b}")
plt.xlabel("Date")
plt.ylabel("Consumption (KWH)")
plt.tight_layout()
plt.show()
12:11:06 - cmdstanpy - INFO - Chain [1] start processing 12:11:06 - cmdstanpy - INFO - Chain [1] done processing
12:11:07 - cmdstanpy - INFO - Chain [1] start processing 12:11:07 - cmdstanpy - INFO - Chain [1] done processing
12:11:07 - cmdstanpy - INFO - Chain [1] start processing 12:11:07 - cmdstanpy - INFO - Chain [1] done processing
12:11:08 - cmdstanpy - INFO - Chain [1] start processing 12:11:08 - cmdstanpy - INFO - Chain [1] done processing
12:11:08 - cmdstanpy - INFO - Chain [1] start processing 12:11:08 - cmdstanpy - INFO - Chain [1] done processing
In [61]:
plt.figure(figsize=(14, 6))
last_date = monthly_borough['Date'].max()
for b, forecast in borough_forecasts.items():
future_rows = forecast[forecast['ds'] > last_date]
plt.plot(future_rows['ds'], future_rows['yhat'], label=b)
plt.title("9-Month Forecasted Electricity Consumption by Borough")
plt.xlabel("Date")
plt.ylabel("Predicted Consumption (KWH)")
plt.xticks(rotation=45)
plt.legend(title="Borough")
plt
Out[61]:
<module 'matplotlib.pyplot' from 'D:\\Prog_Softwares\\Anaconda\\lib\\site-packages\\matplotlib\\pyplot.py'>
Borough-wise Consumption Analysis¶
In [62]:
borough_total = df.groupby('Borough')['Consumption (KWH)'].sum().reset_index()
borough_total = borough_total[~borough_total['Borough'].isin(['NON DEVELOPMENT FACILITY', 'FHA'])]
plt.figure(figsize=(10, 6))
sns.barplot(data=borough_total, x='Borough', y='Consumption (KWH)', palette='viridis')
plt.title("Total Electricity Consumption by Borough")
plt.xticks(rotation=45)
plt.ylabel("Total Consumption (KWH)")
plt.tight_layout()
plt.show()
In [63]:
plt.figure(figsize=(8, 8))
plt.pie(borough_total['Consumption (KWH)'], labels=borough_total['Borough'], autopct='%1.1f%%', startangle=140)
plt.title("Electricity Consumption Share by Borough")
plt.axis('equal')
plt.show()
In [64]:
import seaborn as sns
import matplotlib.pyplot as plt
monthly_filtered = monthly_borough[~monthly_borough['Borough'].isin(['NON DEVELOPMENT FACILITY', 'FHA'])]
plt.figure(figsize=(14, 6))
sns.lineplot(data=monthly_filtered, x='Date', y='Consumption (KWH)', hue='Borough')
plt.title("Monthly Electricity Consumption Trends by Borough")
plt.xlabel("Date")
plt.ylabel("Consumption (KWH)")
plt.xticks(rotation=45)
plt.legend(title="Borough")
plt.tight_layout()
plt.show()
In [ ]: