Imports & Settings¶
In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import geopandas as gpd
from IPython.display import display
In [2]:
sns.set_style("whitegrid")
plt.rcParams['figure.figsize'] = (12, 6)
In [3]:
DATA_PATH = "NYPD_Hate_Crimes_20251202.csv"
PRECINCTS_GEOJSON_URL = "https://data.cityofnewyork.us/resource/y76i-bdw7.geojson"
In [4]:
TARGET_MOTIVE = "ANTI-ASIAN"
Load data & basic information¶
In [5]:
df = pd.read_csv(DATA_PATH)
In [6]:
print("Shape of raw dataset:", df.shape)
print("\nColumn names:")
print(df.columns.tolist())
Shape of raw dataset: (3872, 14)

Column names:
['Full Complaint ID', 'Complaint Year Number', 'Month Number', 'Record Create Date', 'Complaint Precinct Code', 'Patrol Borough Name', 'County', 'Law Code Category Description', 'Offense Description', 'PD Code Description', 'Bias Motive Description', 'Offense Category', 'Arrest Date', 'Arrest Id']
In [7]:
print("First 5 rows:")
display(df.head())
First 5 rows:
| | Full Complaint ID | Complaint Year Number | Month Number | Record Create Date | Complaint Precinct Code | Patrol Borough Name | County | Law Code Category Description | Offense Description | PD Code Description | Bias Motive Description | Offense Category | Arrest Date | Arrest Id |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | 201906012119317 | 2019 | 1 | 01/23/2019 | 60 | PATROL BORO BKLYN SOUTH | KINGS | FELONY | MISCELLANEOUS PENAL LAW | AGGRAVATED HARASSMENT 1 | ANTI-JEWISH | Religion/Religious Practice | NaN | NaN |
| 1 | 201906012175717 | 2019 | 2 | 02/25/2019 | 60 | PATROL BORO BKLYN SOUTH | KINGS | FELONY | MISCELLANEOUS PENAL LAW | AGGRAVATED HARASSMENT 1 | ANTI-JEWISH | Religion/Religious Practice | NaN | NaN |
| 2 | 201906012180117 | 2019 | 2 | 02/27/2019 | 60 | PATROL BORO BKLYN SOUTH | KINGS | FELONY | MISCELLANEOUS PENAL LAW | AGGRAVATED HARASSMENT 1 | ANTI-JEWISH | Religion/Religious Practice | NaN | NaN |
| 3 | 201906012273417 | 2019 | 4 | 04/16/2019 | 60 | PATROL BORO BKLYN SOUTH | KINGS | FELONY | MISCELLANEOUS PENAL LAW | AGGRAVATED HARASSMENT 1 | ANTI-JEWISH | Religion/Religious Practice | NaN | NaN |
| 4 | 201906012413717 | 2019 | 6 | 06/20/2019 | 60 | PATROL BORO BKLYN SOUTH | KINGS | FELONY | MISCELLANEOUS PENAL LAW | AGGRAVATED HARASSMENT 1 | ANTI-JEWISH | Religion/Religious Practice | NaN | NaN |
In [8]:
print("\nDataFrame info:")
df.info()
DataFrame info:
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 3872 entries, 0 to 3871
Data columns (total 14 columns):
 #   Column                         Non-Null Count  Dtype
---  ------                         --------------  -----
 0   Full Complaint ID              3872 non-null   int64
 1   Complaint Year Number          3872 non-null   int64
 2   Month Number                   3872 non-null   int64
 3   Record Create Date             3872 non-null   object
 4   Complaint Precinct Code        3872 non-null   int64
 5   Patrol Borough Name            3872 non-null   object
 6   County                         3872 non-null   object
 7   Law Code Category Description  3872 non-null   object
 8   Offense Description            3872 non-null   object
 9   PD Code Description            3872 non-null   object
 10  Bias Motive Description        3872 non-null   object
 11  Offense Category               3872 non-null   object
 12  Arrest Date                    0 non-null      float64
 13  Arrest Id                      1694 non-null   object
dtypes: float64(1), int64(4), object(9)
memory usage: 423.6+ KB
Date Handling & Cleaning¶
In [9]:
# Convert record creation date to datetime; unparseable values become NaT (dropped below)
df["Record Create Date"] = pd.to_datetime(df["Record Create Date"], errors="coerce")
In [10]:
# Dropping rows with invalid dates
df = df.dropna(subset=["Record Create Date"]).copy()
In [11]:
# Sorting and setting index
df = df.sort_values("Record Create Date")
df.set_index("Record Create Date", inplace=True)
In [12]:
print("Data after cleaning:", df.shape)
print("Date range:", df.index.min(), "→", df.index.max())
Data after cleaning: (3872, 13)
Date range: 2019-01-01 00:00:00 → 2025-09-30 00:00:00
Filter Exact Anti-Asian Incidents¶
In [13]:
df["Bias Motive Description"] = df["Bias Motive Description"].astype(str).str.strip()
In [14]:
filtered_df = df[df["Bias Motive Description"].str.upper() == TARGET_MOTIVE].copy()
In [15]:
print(f"Total ANTI-ASIAN incidents found: {len(filtered_df)}")
display(filtered_df.head())
Total ANTI-ASIAN incidents found: 394
| | Full Complaint ID | Complaint Year Number | Month Number | Complaint Precinct Code | Patrol Borough Name | County | Law Code Category Description | Offense Description | PD Code Description | Bias Motive Description | Offense Category | Arrest Date | Arrest Id |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| Record Create Date | |||||||||||||
| 2019-01-15 | 201906112101017 | 2019 | 1 | 61 | PATROL BORO BKLYN SOUTH | KINGS | FELONY | MURDER & NON-NEGL. MANSLAUGHTE | MURDER,UNCLASSIFIED | ANTI-ASIAN | Race/Color | NaN | K31675023 |
| 2020-01-29 | 202007912147017 | 2020 | 1 | 79 | PATROL BORO BKLYN NORTH | KINGS | FELONY | FELONY ASSAULT | ASSAULT 2,1,UNCLASSIFIED | ANTI-ASIAN | Race/Color | NaN | NaN |
| 2020-01-29 | 202007912147017 | 2020 | 1 | 79 | PATROL BORO BKLYN NORTH | KINGS | FELONY | FELONY ASSAULT | ASSAULT 2,1,UNCLASSIFIED | ANTI-ASIAN | Race/Color | NaN | K32676408 |
| 2020-03-10 | 202002312217317 | 2020 | 3 | 23 | PATROL BORO MAN NORTH | NEW YORK | MISDEMEANOR | ASSAULT 3 & RELATED OFFENSES | ASSAULT 3 | ANTI-ASIAN | Race/Color | NaN | M32682527 |
| 2020-03-10 | 202002312217217 | 2020 | 3 | 23 | PATROL BORO MAN NORTH | NEW YORK | FELONY | MISCELLANEOUS PENAL LAW | AGGRAVATED HARASSMENT 1 | ANTI-ASIAN | Race/Color | NaN | M32682532 |
In [16]:
print("Incident Date Range:",
filtered_df.index.min().strftime("%Y-%m-%d"),
"→",
filtered_df.index.max().strftime("%Y-%m-%d"))
Incident Date Range: 2019-01-15 → 2025-08-19
Adding Year/Month Columns¶
In [17]:
filtered_df["year"] = filtered_df.index.year
filtered_df["month"] = filtered_df.index.month
filtered_df["month_name"] = filtered_df.index.month_name()
display(filtered_df[["year","month","month_name"]].head())
| | year | month | month_name |
|---|---|---|---|
| Record Create Date | |||
| 2019-01-15 | 2019 | 1 | January |
| 2020-01-29 | 2020 | 1 | January |
| 2020-01-29 | 2020 | 1 | January |
| 2020-03-10 | 2020 | 3 | March |
| 2020-03-10 | 2020 | 3 | March |
Top Boroughs & Offense Categories¶
In [18]:
print("Top Patrol Boroughs:")
display(filtered_df["Patrol Borough Name"].value_counts())
print("\nTop Offense Categories:")
display(filtered_df["Offense Category"].value_counts())
Top Patrol Boroughs:
Patrol Borough Name
PATROL BORO MAN SOUTH        181
PATROL BORO QUEENS NORTH      64
PATROL BORO MAN NORTH         44
PATROL BORO BKLYN NORTH       40
PATROL BORO BKLYN SOUTH       40
PATROL BORO BRONX             10
PATROL BORO QUEENS SOUTH      10
PATROL BORO STATEN ISLAND      5
Name: count, dtype: int64
Top Offense Categories:
Offense Category
Race/Color    394
Name: count, dtype: int64
Monthly Time Series¶
In [19]:
monthly_counts = filtered_df.resample("ME").size()
plt.figure(figsize=(12,6))
monthly_counts.plot(marker="o")
plt.title("Monthly Anti-Asian Hate Crime Incidents (NYPD)")
plt.xlabel("Month")
plt.ylabel("Incidents")
plt.grid(True)
plt.tight_layout()
plt.show()
yearly_counts = filtered_df.resample("YE").size()
plt.figure(figsize=(12,6))
yearly_counts.plot(kind="bar")
plt.title("Yearly Anti-Asian Hate Crime Incidents")
plt.xlabel("Year")
plt.ylabel("Incidents")
plt.tight_layout()
plt.show()
Seasonality Heatmap (Year × Month)¶
In [20]:
heatmap_data = filtered_df.pivot_table(
index="year",
columns="month",
values="Full Complaint ID",
aggfunc="count"
).fillna(0)
heatmap_data = heatmap_data.reindex(sorted(heatmap_data.index), axis=0)
plt.figure(figsize=(12,6))
sns.heatmap(heatmap_data, annot=True, fmt=".0f", cmap="Reds")
plt.title("Anti-Asian Crime Seasonality (Year × Month)")
plt.xlabel("Month Number")
plt.ylabel("Year")
plt.tight_layout()
plt.show()
Borough-Level Bar Plot¶
In [21]:
borough_counts = filtered_df["Patrol Borough Name"].value_counts()
plt.figure(figsize=(10,6))
sns.barplot(x=borough_counts.index, y=borough_counts.values)
plt.title("Anti-Asian Hate Crimes by Borough")
plt.xlabel("Borough")
plt.ylabel("Incidents")
plt.xticks(rotation=45, ha="right")
plt.tight_layout()
plt.show()
Arrest Analysis¶
In [22]:
filtered_df["Arrest Date"] = pd.to_datetime(filtered_df["Arrest Date"], errors="coerce")
filtered_df["arrest_made"] = filtered_df["Arrest Date"].notna()
no_arrest_count = (~filtered_df["arrest_made"]).sum()
arrest_count = filtered_df["arrest_made"].sum()
labels = ["No Arrest", "Arrest"]
values = [no_arrest_count, arrest_count]
plt.figure(figsize=(6,5))
sns.barplot(x=labels, y=values)
plt.title("Arrest vs No-Arrest in Anti-Asian Incidents")
plt.ylabel("Number of Incidents")
plt.tight_layout()
plt.show()
In this extract of the NYPD Hate Crimes dataset, the Arrest Date column is entirely null (0 of 3,872 rows), so no incident classified as ANTI-ASIAN has a recorded arrest date, and the analysis therefore treats every Anti-Asian incident as 'no arrest recorded'.¶
This may reflect a true absence of arrests or incomplete recording of arrest dates; notably, Arrest Id is populated for 1,694 rows in the full dataset, suggesting arrests do occur but their dates were not exported.¶
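A minimal sanity check of this claim, reusing the df and filtered_df built above:¶
In [ ]:
# Verify: Arrest Date is never populated, while Arrest Id often is
print("Non-null Arrest Dates (all rows):  ", df["Arrest Date"].notna().sum())
print("Non-null Arrest Dates (Anti-Asian):", filtered_df["Arrest Date"].notna().sum())
print("Non-null Arrest Ids (Anti-Asian):  ", filtered_df["Arrest Id"].notna().sum())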
Precinct Hotspots¶
In [23]:
filtered_df_precinct = filtered_df.dropna(subset=["Complaint Precinct Code"]).copy()
filtered_df_precinct["Complaint Precinct Code"] = filtered_df_precinct["Complaint Precinct Code"].astype(int)
precinct_counts = (
filtered_df_precinct
.groupby("Complaint Precinct Code")
.size()
.sort_values(ascending=False)
)
print("Top 15 precincts by Anti-Asian incident count:")
display(precinct_counts.head(15))
Top 15 precincts by Anti-Asian incident count:
Complaint Precinct Code
14     52
13     25
5      21
7      16
84     15
62     15
109    14
9      13
6      13
18     12
108    12
17     12
24     12
115     9
1       9
dtype: int64
In [24]:
top_n = 15
plt.figure(figsize=(12,6))
sns.barplot(
x=precinct_counts.head(top_n).index.astype(str),
y=precinct_counts.head(top_n).values
)
plt.title(f"Top {top_n} Precincts by Anti-Asian Hate Crime Incidents")
plt.xlabel("Complaint Precinct Code")
plt.ylabel("Number of Incidents")
plt.xticks(rotation=45, ha="right")
plt.tight_layout()
plt.show()
Loading GeoJSON¶
In [25]:
try:
gdf_precincts = gpd.read_file(PRECINCTS_GEOJSON_URL)
gdf_precincts.columns = gdf_precincts.columns.str.strip()
precinct_candidates = [c for c in gdf_precincts.columns if "precinct" in c.lower()]
if not precinct_candidates:
raise ValueError(f"No precinct-like column found. Columns are: {gdf_precincts.columns.tolist()}")
precinct_col = precinct_candidates[0]
print("Using GeoJSON column as precinct code:", precinct_col)
gdf_precincts = gdf_precincts.rename(columns={precinct_col: "Complaint Precinct Code"})
gdf_precincts["Complaint Precinct Code"] = gdf_precincts["Complaint Precinct Code"].astype(int)
precinct_counts_df = precinct_counts.rename("Total_Incidents").reset_index()
gdf = gdf_precincts.merge(precinct_counts_df,
on="Complaint Precinct Code",
how="left")
gdf["Total_Incidents"] = gdf["Total_Incidents"].fillna(0)
print("GeoDataFrame shape after merge:", gdf.shape)
except Exception as e:
print("Error loading or merging GeoJSON:", e)
gdf = None
Using GeoJSON column as precinct code: precinct
GeoDataFrame shape after merge: (78, 5)
Single Choropleth Map Of Hotspots¶
In [26]:
if gdf is not None:
fig, ax = plt.subplots(1, 1, figsize=(12, 10))
gdf.plot(
column="Total_Incidents",
cmap="Reds",
legend=True,
edgecolor="black",
linewidth=0.3,
ax=ax
)
ax.set_title("Anti-Asian Hate Crime Incidents by Precinct (Total)", fontsize=16)
ax.axis("off")
plt.tight_layout()
plt.show()
else:
print("GeoDataFrame `gdf` is None; cannot plot map.")
Which Months Are Worst Overall?¶
In [27]:
month_order = [
"January","February","March","April","May","June",
"July","August","September","October","November","December"
]
month_counts = (
filtered_df["month_name"]
.value_counts()
.reindex(month_order)
.fillna(0)
)
plt.figure(figsize=(12,6))
sns.barplot(x=month_counts.index, y=month_counts.values)
plt.title("Anti-Asian Hate Crime Incidents by Month (All Years Combined)")
plt.xlabel("Month")
plt.ylabel("Number of Incidents")
plt.xticks(rotation=45, ha="right")
plt.tight_layout()
plt.show()
Day-Of-Week Pattern¶
In [28]:
filtered_df["day_of_week"] = filtered_df.index.day_name()
dow_order = ["Monday","Tuesday","Wednesday","Thursday","Friday","Saturday","Sunday"]
dow_counts = (
filtered_df["day_of_week"]
.value_counts()
.reindex(dow_order)
.fillna(0)
)
plt.figure(figsize=(10,5))
sns.barplot(x=dow_counts.index, y=dow_counts.values)
plt.title("Anti-Asian Hate Crime Incidents by Day of Week")
plt.xlabel("Day of Week")
plt.ylabel("Number of Incidents")
plt.tight_layout()
plt.show()
Time Series Libraries¶
In [29]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import statsmodels.api as sm
from sklearn.metrics import mean_absolute_error, mean_squared_error
from prophet import Prophet
Monthly Time Series (NYC Total)¶
In [30]:
filtered_df.index = pd.to_datetime(filtered_df.index)
monthly_counts = (
filtered_df
.resample("ME")
.size()
.asfreq("ME", fill_value=0)
)
print("Monthly series length:", len(monthly_counts))
print("From:", monthly_counts.index.min(), "to:", monthly_counts.index.max())
plt.figure(figsize=(12,6))
monthly_counts.plot(marker="o")
plt.title("Monthly Anti-Asian Hate Crime Incidents (NYC total)")
plt.xlabel("Month")
plt.ylabel("Number of Incidents")
plt.grid(True)
plt.tight_layout()
plt.show()
Monthly series length: 80
From: 2019-01-31 00:00:00 to: 2025-08-31 00:00:00
Train / Test Split (Last 12 Months As Test)¶
In [31]:
test_months = 12
train = monthly_counts.iloc[:-test_months]
test = monthly_counts.iloc[-test_months:]
print("Train range:", train.index.min(), "→", train.index.max(), "(", len(train), "months )")
print("Test range:", test.index.min(), "→", test.index.max(), "(", len(test), "months )")
plt.figure(figsize=(12,6))
train.plot(label="Train")
test.plot(label="Test", marker="o")
plt.title("Train/Test Split – Monthly Anti-Asian Incidents")
plt.xlabel("Month")
plt.ylabel("Incidents")
plt.legend()
plt.grid(True)
plt.tight_layout()
plt.show()
Train range: 2019-01-31 00:00:00 → 2024-08-31 00:00:00 ( 68 months )
Test range: 2024-09-30 00:00:00 → 2025-08-31 00:00:00 ( 12 months )
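Before fitting any model, a seasonal-naive baseline (each test month forecast by its value 12 months earlier) gives a floor for the error metrics below; this baseline is an illustrative sketch, not part of the original pipeline:¶
In [ ]:
# Hypothetical seasonal-naive baseline: repeat the value from 12 months earlier
naive_forecast = monthly_counts.shift(12).iloc[-test_months:]
naive_mae = mean_absolute_error(test, naive_forecast)
naive_rmse = np.sqrt(mean_squared_error(test, naive_forecast))
print(f"Seasonal-naive – MAE: {naive_mae:.3f}, RMSE: {naive_rmse:.3f}")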
ARIMA(1,1,1) On Monthly Data¶
In [32]:
arima_order = (1, 1, 1)
arima_model = sm.tsa.ARIMA(train, order=arima_order)
arima_result = arima_model.fit()
print(arima_result.summary())
SARIMAX Results
==============================================================================
Dep. Variable: y No. Observations: 68
Model: ARIMA(1, 1, 1) Log Likelihood -209.681
Date: Wed, 03 Dec 2025 AIC 425.362
Time: 16:29:59 BIC 431.976
Sample: 01-31-2019 HQIC 427.979
- 08-31-2024
Covariance Type: opg
==============================================================================
coef std err z P>|z| [0.025 0.975]
------------------------------------------------------------------------------
ar.L1 0.6503 0.140 4.631 0.000 0.375 0.926
ma.L1 -0.9992 2.546 -0.392 0.695 -5.990 3.991
sigma2 29.4097 73.606 0.400 0.689 -114.855 173.674
===================================================================================
Ljung-Box (L1) (Q): 0.09 Jarque-Bera (JB): 802.16
Prob(Q): 0.76 Prob(JB): 0.00
Heteroskedasticity (H): 0.75 Skew: 3.30
Prob(H) (two-sided): 0.50 Kurtosis: 18.61
===================================================================================
Warnings:
[1] Covariance matrix calculated using the outer product of gradients (complex-step).
ARIMA Forecast + Metrics¶
In [33]:
arima_forecast = arima_result.forecast(steps=len(test))
arima_forecast = pd.Series(arima_forecast, index=test.index)
plt.figure(figsize=(12,6))
train.plot(label="Train")
test.plot(label="Actual Test", marker="o")
arima_forecast.plot(label="ARIMA Forecast", marker="o")
plt.title("ARIMA(1,1,1) – Monthly Anti-Asian Incidents")
plt.xlabel("Month")
plt.ylabel("Incidents")
plt.legend()
plt.grid(True)
plt.tight_layout()
plt.show()
arima_mae = mean_absolute_error(test, arima_forecast)
arima_mse = mean_squared_error(test, arima_forecast)
arima_rmse = np.sqrt(arima_mse)
print(f"ARIMA(1,1,1) – MAE: {arima_mae:.3f}")
print(f"ARIMA(1,1,1) – RMSE: {arima_rmse:.3f}")
ARIMA(1,1,1) – MAE: 2.600
ARIMA(1,1,1) – RMSE: 2.869
SARIMA With Yearly Seasonality (12-Month Period)¶
In [34]:
sarima_order = (1, 0, 1)
sarima_seasonal_order = (1, 1, 0, 12)
sarima_model = sm.tsa.statespace.SARIMAX(
train,
order=sarima_order,
seasonal_order=sarima_seasonal_order,
enforce_stationarity=False,
enforce_invertibility=False
)
sarima_result = sarima_model.fit(disp=False)
print(sarima_result.summary())
SARIMAX Results
===========================================================================================
Dep. Variable: y No. Observations: 68
Model: SARIMAX(1, 0, 1)x(1, 1, [], 12) Log Likelihood -142.987
Date: Wed, 03 Dec 2025 AIC 293.975
Time: 16:29:59 BIC 301.020
Sample: 01-31-2019 HQIC 296.573
- 08-31-2024
Covariance Type: opg
==============================================================================
coef std err z P>|z| [0.025 0.975]
------------------------------------------------------------------------------
ar.L1 0.5700 0.335 1.702 0.089 -0.086 1.226
ma.L1 0.2532 0.428 0.591 0.554 -0.586 1.093
ar.S.L12 -0.3783 0.141 -2.675 0.007 -0.655 -0.101
sigma2 45.2191 6.303 7.174 0.000 32.865 57.573
===================================================================================
Ljung-Box (L1) (Q): 0.01 Jarque-Bera (JB): 24.77
Prob(Q): 0.93 Prob(JB): 0.00
Heteroskedasticity (H): 0.09 Skew: -0.01
Prob(H) (two-sided): 0.00 Kurtosis: 6.72
===================================================================================
Warnings:
[1] Covariance matrix calculated using the outer product of gradients (complex-step).
SARIMA Forecast + Metrics¶
In [35]:
sarima_forecast = sarima_result.forecast(steps=len(test))
sarima_forecast = pd.Series(sarima_forecast, index=test.index)
plt.figure(figsize=(12,6))
train.plot(label="Train")
test.plot(label="Actual Test", marker="o")
sarima_forecast.plot(label="SARIMA Forecast", marker="o")
plt.title("SARIMA – Monthly Anti-Asian Incidents")
plt.xlabel("Month")
plt.ylabel("Incidents")
plt.legend()
plt.grid(True)
plt.tight_layout()
plt.show()
sarima_mae = mean_absolute_error(test, sarima_forecast)
sarima_mse = mean_squared_error(test, sarima_forecast)
sarima_rmse = np.sqrt(sarima_mse)
print(f"SARIMA – MAE: {sarima_mae:.3f}")
print(f"SARIMA – RMSE: {sarima_rmse:.3f}")
SARIMA – MAE: 2.075
SARIMA – RMSE: 2.521
Prophet On Monthly Data¶
In [36]:
prophet_df = monthly_counts.reset_index()
prophet_df.columns = ["ds", "y"]
prophet_train = prophet_df.iloc[:-test_months]
prophet_test = prophet_df.iloc[-test_months:]
In [37]:
m = Prophet(
weekly_seasonality=False,
yearly_seasonality=True,
daily_seasonality=False
)
m.fit(prophet_train)
16:30:00 - cmdstanpy - INFO - Chain [1] start processing
16:30:00 - cmdstanpy - INFO - Chain [1] done processing
Out[37]:
<prophet.forecaster.Prophet at 0x185189882f0>
In [38]:
future = m.make_future_dataframe(periods=test_months, freq="ME")
forecast = m.predict(future)
prophet_forecast = forecast.set_index("ds")["yhat"].iloc[-test_months:]
prophet_forecast.index = test.index # align exactly
plt.figure(figsize=(12,6))
train.plot(label="Train")
test.plot(label="Actual Test", marker="o")
prophet_forecast.plot(label="Prophet Forecast", marker="o")
plt.title("Prophet – Monthly Anti-Asian Incidents")
plt.xlabel("Month")
plt.ylabel("Incidents")
plt.legend()
plt.grid(True)
plt.tight_layout()
plt.show()
# Metrics
prophet_mae = mean_absolute_error(test, prophet_forecast)
prophet_mse = mean_squared_error(test, prophet_forecast)
prophet_rmse = np.sqrt(prophet_mse)
print(f"Prophet – MAE: {prophet_mae:.3f}")
print(f"Prophet – RMSE: {prophet_rmse:.3f}")
Prophet – MAE: 4.798
Prophet – RMSE: 5.761
Comparing All Three Models¶
In [39]:
results = pd.DataFrame({
"Model": ["ARIMA(1,1,1)", "SARIMA(1,0,1)(1,1,0,12)", "Prophet"],
"MAE": [arima_mae, sarima_mae, prophet_mae],
"RMSE": [arima_rmse, sarima_rmse, prophet_rmse]
})
display(results)
| | Model | MAE | RMSE |
|---|---|---|---|
| 0 | ARIMA(1,1,1) | 2.600102 | 2.868501 |
| 1 | SARIMA(1,0,1)(1,1,0,12) | 2.074892 | 2.521463 |
| 2 | Prophet | 4.798075 | 5.760596 |
Hyperparameter Tuning¶
In [40]:
import numpy as np
import itertools
from sklearn.metrics import mean_absolute_error, mean_squared_error
from pmdarima import auto_arima
from prophet import Prophet
Auto-ARIMA (Non-Seasonal ARIMA Tuning)¶
In [41]:
auto_arima_model = auto_arima(
train,
seasonal=False,
stepwise=True,
trace=True,
suppress_warnings=True,
max_p=5,
max_q=5,
max_d=2,
error_action="ignore"
)
print(auto_arima_model.summary())
best_arima_order = auto_arima_model.order
print("\nBest ARIMA order found:", best_arima_order)
Performing stepwise search to minimize aic
ARIMA(2,0,2)(0,0,0)[0] : AIC=434.308, Time=0.04 sec
ARIMA(0,0,0)(0,0,0)[0] : AIC=489.924, Time=0.01 sec
ARIMA(1,0,0)(0,0,0)[0] : AIC=431.960, Time=0.01 sec
ARIMA(0,0,1)(0,0,0)[0] : AIC=453.197, Time=0.02 sec
ARIMA(2,0,0)(0,0,0)[0] : AIC=433.945, Time=0.02 sec
ARIMA(1,0,1)(0,0,0)[0] : AIC=433.938, Time=0.02 sec
ARIMA(2,0,1)(0,0,0)[0] : AIC=435.775, Time=0.03 sec
ARIMA(1,0,0)(0,0,0)[0] intercept : AIC=428.336, Time=0.02 sec
ARIMA(0,0,0)(0,0,0)[0] intercept : AIC=460.685, Time=0.01 sec
ARIMA(2,0,0)(0,0,0)[0] intercept : AIC=429.692, Time=0.03 sec
ARIMA(1,0,1)(0,0,0)[0] intercept : AIC=429.606, Time=0.03 sec
ARIMA(0,0,1)(0,0,0)[0] intercept : AIC=435.047, Time=0.02 sec
ARIMA(2,0,1)(0,0,0)[0] intercept : AIC=431.577, Time=0.08 sec
Best model: ARIMA(1,0,0)(0,0,0)[0] intercept
Total fit time: 0.352 seconds
SARIMAX Results
==============================================================================
Dep. Variable: y No. Observations: 68
Model: SARIMAX(1, 0, 0) Log Likelihood -211.168
Date: Wed, 03 Dec 2025 AIC 428.336
Time: 16:30:01 BIC 434.994
Sample: 01-31-2019 HQIC 430.974
- 08-31-2024
Covariance Type: opg
==============================================================================
coef std err z P>|z| [0.025 0.975]
------------------------------------------------------------------------------
intercept 1.9246 1.434 1.342 0.180 -0.886 4.736
ar.L1 0.6265 0.128 4.908 0.000 0.376 0.877
sigma2 28.9507 2.824 10.251 0.000 23.415 34.486
===================================================================================
Ljung-Box (L1) (Q): 0.33 Jarque-Bera (JB): 783.14
Prob(Q): 0.57 Prob(JB): 0.00
Heteroskedasticity (H): 0.63 Skew: 3.31
Prob(H) (two-sided): 0.27 Kurtosis: 18.25
===================================================================================
Warnings:
[1] Covariance matrix calculated using the outer product of gradients (complex-step).
Best ARIMA order found: (1, 0, 0)
Fitting Tuned ARIMA And Evaluating¶
In [42]:
tuned_arima_model = sm.tsa.ARIMA(train, order=best_arima_order)
tuned_arima_result = tuned_arima_model.fit()
tuned_arima_forecast = tuned_arima_result.forecast(steps=len(test))
tuned_arima_forecast = pd.Series(tuned_arima_forecast, index=test.index)
plt.figure(figsize=(12,6))
train.plot(label="Train")
test.plot(label="Actual Test", marker="o")
tuned_arima_forecast.plot(label=f"ARIMA{best_arima_order} Forecast", marker="o")
plt.title(f"Tuned ARIMA{best_arima_order} – Monthly Anti-Asian Incidents")
plt.xlabel("Month")
plt.ylabel("Incidents")
plt.legend()
plt.grid(True)
plt.tight_layout()
plt.show()
tuned_arima_mae = mean_absolute_error(test, tuned_arima_forecast)
tuned_arima_mse = mean_squared_error(test, tuned_arima_forecast)
tuned_arima_rmse = np.sqrt(tuned_arima_mse)
print(f"ARIMA(1,1,1) – MAE: {tuned_arima_mae:.3f}")
print(f"ARIMA(1,1,1) – RMSE: {tuned_arima_rmse:.3f}")
ARIMA(1,1,1) – MAE: 2.636 ARIMA(1,1,1) – RMSE: 2.896
Auto-SARIMA (Seasonal Tuning)¶
In [43]:
auto_sarima_model = auto_arima(
train,
seasonal=True,
m=12,
stepwise=True,
trace=True,
suppress_warnings=True,
max_p=3, max_q=3, max_d=2,
max_P=2, max_Q=2, max_D=1,
error_action="ignore"
)
print(auto_sarima_model.summary())
best_sarima_order = auto_sarima_model.order
best_sarima_seasonal = auto_sarima_model.seasonal_order
print("\nBest SARIMA order:", best_sarima_order)
print("Best SARIMA seasonal order:", best_sarima_seasonal)
Performing stepwise search to minimize aic
ARIMA(2,0,2)(1,0,1)[12] intercept : AIC=432.103, Time=0.39 sec
ARIMA(0,0,0)(0,0,0)[12] intercept : AIC=460.685, Time=0.01 sec
ARIMA(1,0,0)(1,0,0)[12] intercept : AIC=425.064, Time=0.06 sec
ARIMA(0,0,1)(0,0,1)[12] intercept : AIC=431.125, Time=0.05 sec
ARIMA(0,0,0)(0,0,0)[12] : AIC=489.924, Time=0.01 sec
ARIMA(1,0,0)(0,0,0)[12] intercept : AIC=428.336, Time=0.02 sec
ARIMA(1,0,0)(2,0,0)[12] intercept : AIC=426.987, Time=0.14 sec
ARIMA(1,0,0)(1,0,1)[12] intercept : AIC=inf, Time=0.23 sec
ARIMA(1,0,0)(0,0,1)[12] intercept : AIC=425.542, Time=0.06 sec
ARIMA(1,0,0)(2,0,1)[12] intercept : AIC=inf, Time=0.59 sec
ARIMA(0,0,0)(1,0,0)[12] intercept : AIC=457.510, Time=0.04 sec
ARIMA(2,0,0)(1,0,0)[12] intercept : AIC=426.422, Time=0.07 sec
ARIMA(1,0,1)(1,0,0)[12] intercept : AIC=426.330, Time=0.07 sec
ARIMA(0,0,1)(1,0,0)[12] intercept : AIC=431.626, Time=0.06 sec
ARIMA(2,0,1)(1,0,0)[12] intercept : AIC=428.290, Time=0.14 sec
ARIMA(1,0,0)(1,0,0)[12] : AIC=426.741, Time=0.03 sec
Best model: ARIMA(1,0,0)(1,0,0)[12] intercept
Total fit time: 1.971 seconds
SARIMAX Results
==========================================================================================
Dep. Variable: y No. Observations: 68
Model: SARIMAX(1, 0, 0)x(1, 0, 0, 12) Log Likelihood -208.532
Date: Wed, 03 Dec 2025 AIC 425.064
Time: 16:30:03 BIC 433.942
Sample: 01-31-2019 HQIC 428.582
- 08-31-2024
Covariance Type: opg
==============================================================================
coef std err z P>|z| [0.025 0.975]
------------------------------------------------------------------------------
intercept 1.3198 1.171 1.127 0.260 -0.975 3.614
ar.L1 0.6262 0.106 5.921 0.000 0.419 0.834
ar.S.L12 0.2605 0.129 2.021 0.043 0.008 0.513
sigma2 26.4612 4.075 6.493 0.000 18.474 34.448
===================================================================================
Ljung-Box (L1) (Q): 0.32 Jarque-Bera (JB): 462.90
Prob(Q): 0.57 Prob(JB): 0.00
Heteroskedasticity (H): 0.74 Skew: 2.80
Prob(H) (two-sided): 0.48 Kurtosis: 14.49
===================================================================================
Warnings:
[1] Covariance matrix calculated using the outer product of gradients (complex-step).
Best SARIMA order: (1, 0, 0)
Best SARIMA seasonal order: (1, 0, 0, 12)
Fitting Tuned SARIMA And Evaluating¶
In [44]:
tuned_sarima_model = sm.tsa.statespace.SARIMAX(
train,
order=best_sarima_order,
seasonal_order=best_sarima_seasonal,
enforce_stationarity=False,
enforce_invertibility=False
)
tuned_sarima_result = tuned_sarima_model.fit(disp=False)
tuned_sarima_forecast = tuned_sarima_result.forecast(steps=len(test))
tuned_sarima_forecast = pd.Series(tuned_sarima_forecast, index=test.index)
plt.figure(figsize=(12,6))
train.plot(label="Train")
test.plot(label="Actual Test", marker="o")
tuned_sarima_forecast.plot(label=f"SARIMA{best_sarima_order}{best_sarima_seasonal} Forecast", marker="o")
plt.title("Tuned SARIMA – Monthly Anti-Asian Incidents")
plt.xlabel("Month")
plt.ylabel("Incidents")
plt.legend()
plt.grid(True)
plt.tight_layout()
plt.show()
tuned_sarima_mae = mean_absolute_error(test, tuned_sarima_forecast)
tuned_sarima_mse = mean_squared_error(test, tuned_sarima_forecast)
tuned_sarima_rmse = np.sqrt(tuned_sarima_mse)
print(f"SARIMA(1,1,1) – MAE: {tuned_sarima_mae:.3f}")
print(f"SARIMA(1,1,1) – RMSE: {tuned_sarima_rmse:.3f}")
SARIMA(1,1,1) – MAE: 1.464 SARIMA(1,1,1) – RMSE: 1.800
Prophet Tuning Via Small Grid Search¶
In [45]:
prophet_df = monthly_counts.reset_index()
prophet_df.columns = ["ds", "y"]
prophet_train = prophet_df.iloc[:-test_months]
prophet_test = prophet_df.iloc[-test_months:]
param_grid = {
"changepoint_prior_scale": [0.01, 0.05, 0.1, 0.5],
"seasonality_prior_scale": [1.0, 5.0, 10.0],
"seasonality_mode": ["additive", "multiplicative"]
}
best_params = None
best_prophet_rmse = np.inf
for cp in param_grid["changepoint_prior_scale"]:
for sp in param_grid["seasonality_prior_scale"]:
for smode in param_grid["seasonality_mode"]:
m = Prophet(
changepoint_prior_scale=cp,
seasonality_prior_scale=sp,
seasonality_mode=smode,
weekly_seasonality=False,
yearly_seasonality=True,
daily_seasonality=False
)
m.fit(prophet_train)
future = m.make_future_dataframe(periods=test_months, freq="ME")
forecast = m.predict(future)
pred = forecast.set_index("ds")["yhat"].iloc[-test_months:]
pred.index = test.index
mse = mean_squared_error(test, pred)
rmse = np.sqrt(mse)
if rmse < best_prophet_rmse:
best_prophet_rmse = rmse
best_params = (cp, sp, smode)
print("Best Prophet parameters (cp, sp, mode):", best_params)
print("Best Prophet RMSE:", best_prophet_rmse)
Best Prophet parameters (cp, sp, mode): (0.5, 1.0, 'multiplicative')
Best Prophet RMSE: 2.7421939919440614
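One caveat: this grid search selects hyperparameters by RMSE on the same 12 test months used for the final comparison, which flatters the tuned models. A minimal leakage-free sketch instead validates on the last 12 months of the training window (the split size is an illustrative assumption) and leaves the test set untouched until the final comparison:¶
In [ ]:
# Hypothetical leakage-free tuning: validate on the tail of the training data only
val_months = 12
inner_train = prophet_train.iloc[:-val_months]
inner_val = prophet_train.iloc[-val_months:]
best_val_rmse, best_val_params = np.inf, None
for cp in param_grid["changepoint_prior_scale"]:
    for sp in param_grid["seasonality_prior_scale"]:
        for smode in param_grid["seasonality_mode"]:
            m_val = Prophet(
                changepoint_prior_scale=cp,
                seasonality_prior_scale=sp,
                seasonality_mode=smode,
                weekly_seasonality=False,
                yearly_seasonality=True,
                daily_seasonality=False
            )
            m_val.fit(inner_train)
            val_future = m_val.make_future_dataframe(periods=val_months, freq="ME")
            val_pred = m_val.predict(val_future)["yhat"].iloc[-val_months:]
            rmse = np.sqrt(mean_squared_error(inner_val["y"], val_pred))
            if rmse < best_val_rmse:
                best_val_rmse, best_val_params = rmse, (cp, sp, smode)
print("Validation-selected params (cp, sp, mode):", best_val_params)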
Fitting Prophet With Best Params And Evaluating¶
In [46]:
best_cp, best_sp, best_mode = best_params
best_m = Prophet(
changepoint_prior_scale=best_cp,
seasonality_prior_scale=best_sp,
seasonality_mode=best_mode,
weekly_seasonality=False,
yearly_seasonality=True,
daily_seasonality=False
)
best_m.fit(prophet_train)
future = best_m.make_future_dataframe(periods=test_months, freq="ME")  # "ME" avoids the deprecated "M" alias
forecast = best_m.predict(future)
best_prophet_forecast = forecast.set_index("ds")["yhat"].iloc[-test_months:]
best_prophet_forecast.index = test.index
plt.figure(figsize=(12,6))
train.plot(label="Train")
test.plot(label="Actual Test", marker="o")
best_prophet_forecast.plot(label="Tuned Prophet Forecast", marker="o")
plt.title(f"Tuned Prophet – Monthly Anti-Asian Incidents\n(cp={best_cp}, sp={best_sp}, mode={best_mode})")
plt.xlabel("Month")
plt.ylabel("Incidents")
plt.legend()
plt.grid(True)
plt.tight_layout()
plt.show()
tuned_prophet_mae = mean_absolute_error(test, best_prophet_forecast)
tuned_prophet_mse = mean_squared_error(test, best_prophet_forecast)
tuned_prophet_rmse = np.sqrt(tuned_prophet_mse)
print(f"Tuned Prophet – MAE: {tuned_prophet_mae:.3f}")
print(f"Tuned Prophet – RMSE: {tuned_prophet_rmse:.3f}")
16:30:13 - cmdstanpy - INFO - Chain [1] start processing
16:30:14 - cmdstanpy - INFO - Chain [1] done processing
Tuned Prophet – MAE: 2.315
Tuned Prophet – RMSE: 2.742
Comparing Tuned Models¶
In [47]:
tuned_results = pd.DataFrame({
"Model": [
f"ARIMA{best_arima_order}",
f"SARIMA{best_sarima_order}{best_sarima_seasonal}",
f"Prophet(cp={best_cp}, sp={best_sp}, mode={best_mode})"
],
"MAE": [tuned_arima_mae, tuned_sarima_mae, tuned_prophet_mae],
"RMSE": [tuned_arima_rmse, tuned_sarima_rmse, tuned_prophet_rmse]
})
display(tuned_results)
| | Model | MAE | RMSE |
|---|---|---|---|
| 0 | ARIMA(1, 0, 0) | 2.635961 | 2.895933 |
| 1 | SARIMA(1, 0, 0)(1, 0, 0, 12) | 1.464307 | 1.800098 |
| 2 | Prophet(cp=0.5, sp=1.0, mode=multiplicative) | 2.315017 | 2.742194 |
Combining All Model Forecasts¶
In [48]:
comparison_df = pd.DataFrame({
"Actual": test,
"ARIMA Untuned": arima_forecast,
"ARIMA Tuned": tuned_arima_forecast,
"SARIMA Untuned": sarima_forecast,
"SARIMA Tuned": tuned_sarima_forecast,
"Prophet Untuned": prophet_forecast,
"Prophet Tuned": best_prophet_forecast
})
display(comparison_df.head())
| | Actual | ARIMA Untuned | ARIMA Tuned | SARIMA Untuned | SARIMA Tuned | Prophet Untuned | Prophet Tuned |
|---|---|---|---|---|---|---|---|
| Record Create Date | |||||||
| 2024-09-30 | 7 | 3.747455 | 3.804200 | 2.874223 | 3.741317 | 4.514449 | 0.734242 |
| 2024-10-31 | 3 | 4.233510 | 4.308042 | 0.615527 | 1.679759 | 4.521154 | 0.475653 |
| 2024-11-30 | 3 | 4.549583 | 4.623706 | 1.450476 | 1.143071 | 5.320337 | 0.535817 |
| 2024-12-31 | 3 | 4.755118 | 4.821475 | 2.092783 | 1.383703 | 5.497119 | 0.435934 |
| 2025-01-31 | 1 | 4.888774 | 4.945379 | 2.076868 | 0.934048 | 2.735826 | 0.219521 |
Plotting All Models Versus Actual (Tuned + Untuned)¶
In [49]:
plt.figure(figsize=(15,8))
plt.plot(test.index, test.values, label="Actual", color="black", linewidth=3)
plt.plot(test.index, arima_forecast, label="ARIMA Untuned", linestyle="--")
plt.plot(test.index, sarima_forecast, label="SARIMA Untuned", linestyle="--")
plt.plot(test.index, prophet_forecast, label="Prophet Untuned", linestyle="--")
plt.plot(test.index, tuned_arima_forecast, label="ARIMA Tuned", marker="o")
plt.plot(test.index, tuned_sarima_forecast, label="SARIMA Tuned", marker="o")
plt.plot(test.index, best_prophet_forecast, label="Prophet Tuned", marker="o")
plt.title("Comparison of Tuned and Untuned Forecasting Models\nMonthly Anti-Asian Hate Crime Incidents", fontsize=16)
plt.xlabel("Month")
plt.ylabel("Incident Count")
plt.grid(True)
plt.legend()
plt.tight_layout()
plt.show()
Performance Comparison Table (Tuned Versus Untuned)¶
In [50]:
comparison_metrics = pd.DataFrame({
"Model": [
"ARIMA Untuned", "ARIMA Tuned",
"SARIMA Untuned", "SARIMA Tuned",
"Prophet Untuned", "Prophet Tuned"
],
"MAE": [
arima_mae, tuned_arima_mae,
sarima_mae, tuned_sarima_mae,
prophet_mae, tuned_prophet_mae
],
"RMSE": [
arima_rmse, tuned_arima_rmse,
sarima_rmse, tuned_sarima_rmse,
prophet_rmse, tuned_prophet_rmse
]
})
display(comparison_metrics)
| | Model | MAE | RMSE |
|---|---|---|---|
| 0 | ARIMA Untuned | 2.600102 | 2.868501 |
| 1 | ARIMA Tuned | 2.635961 | 2.895933 |
| 2 | SARIMA Untuned | 2.074892 | 2.521463 |
| 3 | SARIMA Tuned | 1.464307 | 1.800098 |
| 4 | Prophet Untuned | 4.798075 | 5.760596 |
| 5 | Prophet Tuned | 2.315017 | 2.742194 |
Bar Chart Comparing RMSE (Tuned Versus Untuned)¶
In [51]:
plt.figure(figsize=(12,6))
plt.bar(comparison_metrics["Model"], comparison_metrics["RMSE"], color=[
"#ff9999", "#ff4d4d",
"#99ccff", "#1a75ff",
"#99ff99", "#33cc33"
])
plt.title("RMSE Comparison: Tuned vs Untuned Forecasting Models", fontsize=15)
plt.ylabel("RMSE (Lower is better)")
plt.xticks(rotation=45, ha="right")
plt.grid(axis="y", linestyle="--", alpha=0.5)
plt.tight_layout()
plt.show()
Relative Model Accuracy¶
In [52]:
baseline_name = "SARIMA Tuned"
baseline_rmse = comparison_metrics.loc[
comparison_metrics["Model"] == baseline_name, "RMSE"
].iloc[0]
comparison_metrics["Relative_Accuracy_Value"] = (
baseline_rmse / comparison_metrics["RMSE"] * 100
)
def format_acc(x):
x_round = round(x)
if x_round == 100:
return "100%"
else:
return f"≈ {x_round}%"
comparison_metrics["Relative Accuracy"] = comparison_metrics[
"Relative_Accuracy_Value"
].apply(format_acc)
comparison_metrics_sorted = comparison_metrics.sort_values("RMSE").reset_index(drop=True)
display(comparison_metrics_sorted[["Model", "MAE", "RMSE", "Relative Accuracy"]])
| | Model | MAE | RMSE | Relative Accuracy |
|---|---|---|---|---|
| 0 | SARIMA Tuned | 1.464307 | 1.800098 | 100% |
| 1 | SARIMA Untuned | 2.074892 | 2.521463 | ≈ 71% |
| 2 | Prophet Tuned | 2.315017 | 2.742194 | ≈ 66% |
| 3 | ARIMA Untuned | 2.600102 | 2.868501 | ≈ 63% |
| 4 | ARIMA Tuned | 2.635961 | 2.895933 | ≈ 62% |
| 5 | Prophet Untuned | 4.798075 | 5.760596 | ≈ 31% |
Bar Chart Of Relative Accuracy¶
In [53]:
plt.figure(figsize=(12, 6))
plt.bar(
comparison_metrics_sorted["Model"],
comparison_metrics_sorted["Relative_Accuracy_Value"],
)
plt.title(
"Relative Accuracy (Higher = Better Performance)\nBaseline: SARIMA Tuned = 100% Accuracy",
fontsize=14,
)
plt.ylabel("Relative Accuracy (%)")
plt.xticks(rotation=45, ha="right")
plt.grid(axis="y", linestyle="--", alpha=0.5)
plt.tight_layout()
plt.show()
Based on the evaluation metrics, the tuned SARIMA model achieved the lowest RMSE (1.80), making it the most accurate of the six tuned and untuned variants. While the Prophet and ARIMA models performed reasonably well, SARIMA showed better stability and predictive accuracy on this low-count monthly crime series, so the final short-term forecasts are generated with the tuned SARIMA configuration.¶
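A one-liner makes that selection explicit from the comparison_metrics table above, rather than reading it off by eye:¶
In [ ]:
# Programmatically pick the lowest-RMSE model from the comparison table
best_row = comparison_metrics.loc[comparison_metrics["RMSE"].idxmin()]
print(f"Selected model: {best_row['Model']} (RMSE = {best_row['RMSE']:.3f})")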
Refitting Tuned SARIMA On Full Data And Forecasting 6 Months¶
In [54]:
full_sarima_model = sm.tsa.statespace.SARIMAX(
monthly_counts,
order=best_sarima_order,
seasonal_order=best_sarima_seasonal,
enforce_stationarity=False,
enforce_invertibility=False
)
full_sarima_result = full_sarima_model.fit(disp=False)
n_steps = 6
sarima_forecast_obj = full_sarima_result.get_forecast(steps=n_steps)
sarima_future_mean = sarima_forecast_obj.predicted_mean
sarima_future_ci = sarima_forecast_obj.conf_int()
future_index = pd.date_range(
start=monthly_counts.index[-1] + pd.offsets.MonthEnd(1),
periods=n_steps,
freq="M"
)
sarima_future_mean.index = future_index
sarima_future_ci.index = future_index
print("SARIMA forecast for the next 6 months:")
display(
pd.DataFrame({
"Forecast": sarima_future_mean,
"Lower CI": sarima_future_ci.iloc[:, 0],
"Upper CI": sarima_future_ci.iloc[:, 1],
})
)
SARIMA forecast for the next 6 months:
| | Forecast | Lower CI | Upper CI |
|---|---|---|---|
| 2025-09-30 | 2.488614 | -7.948081 | 12.925308 |
| 2025-10-31 | 1.047929 | -11.723726 | 13.819584 |
| 2025-11-30 | 1.061911 | -12.724868 | 14.848690 |
| 2025-12-31 | 1.071773 | -13.193152 | 15.336698 |
| 2026-01-31 | 0.348475 | -14.148456 | 14.845407 |
| 2026-02-28 | 0.353382 | -14.257602 | 14.964366 |
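The normal-approximation intervals extend below zero, which is impossible for incident counts. A minimal cosmetic fix is to floor the lower bound at zero for display; this changes the presentation only, not the fitted model:¶
In [ ]:
# Counts cannot be negative: floor the lower confidence bound at 0 for display
sarima_future_ci_clipped = sarima_future_ci.copy()
sarima_future_ci_clipped.iloc[:, 0] = sarima_future_ci_clipped.iloc[:, 0].clip(lower=0)
display(sarima_future_ci_clipped)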
Plotting History + 6-Month SARIMA Forecast¶
In [55]:
history_window = 24
plt.figure(figsize=(12,6))
monthly_counts.tail(history_window).plot(label="History", marker="o")
sarima_future_mean.plot(label="SARIMA Forecast", marker="o")
plt.fill_between(
sarima_future_mean.index,
sarima_future_ci.iloc[:, 0],
sarima_future_ci.iloc[:, 1],
alpha=0.2,
label="95% confidence interval"
)
plt.title(f"Tuned SARIMA{best_sarima_order}{best_sarima_seasonal} – 6-Month Forecast\nMonthly Anti-Asian Hate Crime Incidents")
plt.xlabel("Month")
plt.ylabel("Incident Count")
plt.grid(True)
plt.legend()
plt.tight_layout()
plt.show()
Computing Historical Precinct Totals And Shares¶
In [56]:
filtered_df_precinct = filtered_df.dropna(subset=["Complaint Precinct Code"]).copy()
filtered_df_precinct["Complaint Precinct Code"] = filtered_df_precinct["Complaint Precinct Code"].astype(int)
precinct_counts = (
filtered_df_precinct
.groupby("Complaint Precinct Code")
.size()
.rename("historical_total")
.sort_values(ascending=False)
)
print("Top 10 precincts by historical Anti-Asian incidents:")
display(precinct_counts.head(10))
total_incidents_citywide = precinct_counts.sum()
precinct_shares = (precinct_counts / total_incidents_citywide).rename("share")
print("\nCheck: sum of shares =", precinct_shares.sum())
Top 10 precincts by historical Anti-Asian incidents:
Complaint Precinct Code
14     52
13     25
5      21
7      16
84     15
62     15
109    14
9      13
6      13
18     12
Name: historical_total, dtype: int64
Check: sum of shares = 0.9999999999999998
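The shares sum to 1 up to floating-point rounding; a quick assertion can guard the allocation step below:¶
In [ ]:
# Guard: the proportional allocation below assumes the shares form a valid distribution
assert np.isclose(precinct_shares.sum(), 1.0)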
Using SARIMA Forecast To Get Expected 6-Month Total¶
In [57]:
forecast_6m_total = sarima_future_mean.sum()
print("Total forecasted Anti-Asian incidents (next 6 months, citywide):", forecast_6m_total)
Total forecasted Anti-Asian incidents (next 6 months, citywide): 6.37208442247705
Allocating Forecast To Precincts Using Historical Shares¶
In [58]:
precinct_forecast_6m = (precinct_shares * forecast_6m_total).rename("forecast_6m")
precinct_stats = pd.concat([precinct_counts, precinct_shares, precinct_forecast_6m], axis=1)
display(precinct_stats.head(10))
| | historical_total | share | forecast_6m |
|---|---|---|---|
| Complaint Precinct Code | |||
| 14 | 52 | 0.131980 | 0.840986 |
| 13 | 25 | 0.063452 | 0.404320 |
| 5 | 21 | 0.053299 | 0.339629 |
| 7 | 16 | 0.040609 | 0.258765 |
| 84 | 15 | 0.038071 | 0.242592 |
| 62 | 15 | 0.038071 | 0.242592 |
| 109 | 14 | 0.035533 | 0.226419 |
| 9 | 13 | 0.032995 | 0.210246 |
| 6 | 13 | 0.032995 | 0.210246 |
| 18 | 12 | 0.030457 | 0.194074 |
Loading Precinct GeoJSON And Merging Stats¶
In [59]:
gdf_precincts = gpd.read_file(PRECINCTS_GEOJSON_URL)
gdf_precincts.columns = gdf_precincts.columns.str.strip()
precinct_candidates = [c for c in gdf_precincts.columns if "precinct" in c.lower()]
print("Precinct columns in GeoJSON:", precinct_candidates)
precinct_col = precinct_candidates[0]
gdf_precincts = gdf_precincts.rename(columns={precinct_col: "Complaint Precinct Code"})
gdf_precincts["Complaint Precinct Code"] = gdf_precincts["Complaint Precinct Code"].astype(int)
gdf_stats = gdf_precincts.merge(
precinct_stats.reset_index(),
on="Complaint Precinct Code",
how="left"
)
for col in ["historical_total", "share", "forecast_6m"]:
gdf_stats[col] = gdf_stats[col].fillna(0)
print("GeoDataFrame with stats:", gdf_stats.shape)
Precinct columns in GeoJSON: ['precinct'] GeoDataFrame with stats: (78, 7)
Choropleth: Historical Hotspots¶
In [60]:
fig, ax = plt.subplots(1, 1, figsize=(10, 10))
gdf_stats.plot(
column="historical_total",
cmap="Reds",
legend=True,
edgecolor="black",
linewidth=0.3,
ax=ax
)
ax.set_title("Historical Anti-Asian Hate Crime Incidents by Precinct (Total)", fontsize=15)
ax.axis("off")
plt.tight_layout()
plt.show()
Choropleth: Forecasted 6-Month Expected Incidents¶
In [61]:
fig, ax = plt.subplots(1, 1, figsize=(10, 10))
gdf_stats.plot(
column="forecast_6m",
cmap="Blues",
legend=True,
edgecolor="black",
linewidth=0.3,
ax=ax
)
ax.set_title("Expected Anti-Asian Incidents in Next 6 Months (by Precinct)\nAllocated Using Tuned SARIMA Forecast + Historical Shares", fontsize=14)
ax.axis("off")
plt.tight_layout()
plt.show()