Imports & Settings¶

In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import geopandas as gpd
from IPython.display import display
In [2]:
# Global plotting defaults: seaborn whitegrid theme and a wide 12x6 figure size.
sns.set_style("whitegrid")
plt.rcParams['figure.figsize'] = (12, 6)
In [3]:
# Local CSV export of the NYPD Hate Crimes dataset (NYC Open Data).
DATA_PATH = "NYPD_Hate_Crimes_20251202.csv"
# NYC Open Data GeoJSON of police precinct boundaries, used for the choropleth map.
PRECINCTS_GEOJSON_URL = "https://data.cityofnewyork.us/resource/y76i-bdw7.geojson"
In [4]:
# Bias motive of interest; compared case-insensitively against "Bias Motive Description".
TARGET_MOTIVE = "ANTI-ASIAN"

Load data & basic information¶

In [5]:
# Load the raw hate-crime complaints (one row per complaint record).
df = pd.read_csv(DATA_PATH)
In [6]:
# Basic shape and schema of the raw extract.
print("Shape of raw dataset:", df.shape)
print("\nColumn names:")
print(df.columns.tolist())
Shape of raw dataset: (3872, 14)

Column names:
['Full Complaint ID', 'Complaint Year Number', 'Month Number', 'Record Create Date', 'Complaint Precinct Code', 'Patrol Borough Name', 'County', 'Law Code Category Description', 'Offense Description', 'PD Code Description', 'Bias Motive Description', 'Offense Category', 'Arrest Date', 'Arrest Id']
In [7]:
# Preview the first records (rich HTML rendering via IPython display).
print("First 5 rows:")
display(df.head())
First 5 rows:
Full Complaint ID Complaint Year Number Month Number Record Create Date Complaint Precinct Code Patrol Borough Name County Law Code Category Description Offense Description PD Code Description Bias Motive Description Offense Category Arrest Date Arrest Id
0 201906012119317 2019 1 01/23/2019 60 PATROL BORO BKLYN SOUTH KINGS FELONY MISCELLANEOUS PENAL LAW AGGRAVATED HARASSMENT 1 ANTI-JEWISH Religion/Religious Practice NaN NaN
1 201906012175717 2019 2 02/25/2019 60 PATROL BORO BKLYN SOUTH KINGS FELONY MISCELLANEOUS PENAL LAW AGGRAVATED HARASSMENT 1 ANTI-JEWISH Religion/Religious Practice NaN NaN
2 201906012180117 2019 2 02/27/2019 60 PATROL BORO BKLYN SOUTH KINGS FELONY MISCELLANEOUS PENAL LAW AGGRAVATED HARASSMENT 1 ANTI-JEWISH Religion/Religious Practice NaN NaN
3 201906012273417 2019 4 04/16/2019 60 PATROL BORO BKLYN SOUTH KINGS FELONY MISCELLANEOUS PENAL LAW AGGRAVATED HARASSMENT 1 ANTI-JEWISH Religion/Religious Practice NaN NaN
4 201906012413717 2019 6 06/20/2019 60 PATROL BORO BKLYN SOUTH KINGS FELONY MISCELLANEOUS PENAL LAW AGGRAVATED HARASSMENT 1 ANTI-JEWISH Religion/Religious Practice NaN NaN
In [8]:
# Dtypes and non-null counts; the rendered output below shows "Arrest Date"
# has 0 non-null values in this export (arrest info lives in "Arrest Id").
print("\nDataFrame info:")
df.info()
DataFrame info:
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 3872 entries, 0 to 3871
Data columns (total 14 columns):
 #   Column                         Non-Null Count  Dtype  
---  ------                         --------------  -----  
 0   Full Complaint ID              3872 non-null   int64  
 1   Complaint Year Number          3872 non-null   int64  
 2   Month Number                   3872 non-null   int64  
 3   Record Create Date             3872 non-null   object 
 4   Complaint Precinct Code        3872 non-null   int64  
 5   Patrol Borough Name            3872 non-null   object 
 6   County                         3872 non-null   object 
 7   Law Code Category Description  3872 non-null   object 
 8   Offense Description            3872 non-null   object 
 9   PD Code Description            3872 non-null   object 
 10  Bias Motive Description        3872 non-null   object 
 11  Offense Category               3872 non-null   object 
 12  Arrest Date                    0 non-null      float64
 13  Arrest Id                      1694 non-null   object 
dtypes: float64(1), int64(4), object(9)
memory usage: 423.6+ KB

Date Handling & Cleaning¶

In [9]:
# Converting record creation date
df["Record Create Date"] = pd.to_datetime(df["Record Create Date"], errors="coerce")
In [10]:
# Dropping rows with invalid dates
df = df.dropna(subset=["Record Create Date"]).copy()
In [11]:
# Sorting and setting index
df = df.sort_values("Record Create Date")
df.set_index("Record Create Date", inplace=True)
In [12]:
# Sanity check after cleaning: row count and the full covered date span.
print("Data after cleaning:", df.shape)
print("Date range:", df.index.min(), "→", df.index.max())
Data after cleaning: (3872, 13)
Date range: 2019-01-01 00:00:00 → 2025-09-30 00:00:00

Filter Exact Anti-Asian Incidents¶

In [13]:
# Normalize the motive column: force string dtype and trim surrounding whitespace.
df["Bias Motive Description"] = df["Bias Motive Description"].astype(str).str.strip()
In [14]:
# Keep only rows whose motive matches TARGET_MOTIVE (case-insensitive exact match).
filtered_df = df[df["Bias Motive Description"].str.upper() == TARGET_MOTIVE].copy()
In [15]:
# Size of the Anti-Asian subset plus a preview of the first records.
print(f"Total ANTI-ASIAN incidents found: {len(filtered_df)}")
display(filtered_df.head())
Total ANTI-ASIAN incidents found: 394
Full Complaint ID Complaint Year Number Month Number Complaint Precinct Code Patrol Borough Name County Law Code Category Description Offense Description PD Code Description Bias Motive Description Offense Category Arrest Date Arrest Id
Record Create Date
2019-01-15 201906112101017 2019 1 61 PATROL BORO BKLYN SOUTH KINGS FELONY MURDER & NON-NEGL. MANSLAUGHTE MURDER,UNCLASSIFIED ANTI-ASIAN Race/Color NaN K31675023
2020-01-29 202007912147017 2020 1 79 PATROL BORO BKLYN NORTH KINGS FELONY FELONY ASSAULT ASSAULT 2,1,UNCLASSIFIED ANTI-ASIAN Race/Color NaN NaN
2020-01-29 202007912147017 2020 1 79 PATROL BORO BKLYN NORTH KINGS FELONY FELONY ASSAULT ASSAULT 2,1,UNCLASSIFIED ANTI-ASIAN Race/Color NaN K32676408
2020-03-10 202002312217317 2020 3 23 PATROL BORO MAN NORTH NEW YORK MISDEMEANOR ASSAULT 3 & RELATED OFFENSES ASSAULT 3 ANTI-ASIAN Race/Color NaN M32682527
2020-03-10 202002312217217 2020 3 23 PATROL BORO MAN NORTH NEW YORK FELONY MISCELLANEOUS PENAL LAW AGGRAVATED HARASSMENT 1 ANTI-ASIAN Race/Color NaN M32682532
In [16]:
# Earliest and latest filtered incident dates, formatted YYYY-MM-DD.
print("Incident Date Range:",
      filtered_df.index.min().strftime("%Y-%m-%d"),
      "→",
      filtered_df.index.max().strftime("%Y-%m-%d"))
Incident Date Range: 2019-01-15 → 2025-08-19

Adding Year/Month Columns¶

In [17]:
# Derive calendar features from the DatetimeIndex for seasonality analysis.
filtered_df["year"] = filtered_df.index.year
filtered_df["month"] = filtered_df.index.month
filtered_df["month_name"] = filtered_df.index.month_name()

display(filtered_df[["year","month","month_name"]].head())
year month month_name
Record Create Date
2019-01-15 2019 1 January
2020-01-29 2020 1 January
2020-01-29 2020 1 January
2020-03-10 2020 3 March
2020-03-10 2020 3 March

Top Boroughs & Offense Categories¶

In [18]:
# Frequency tables: which patrol boroughs and offense categories dominate.
print("Top Patrol Boroughs:")
display(filtered_df["Patrol Borough Name"].value_counts())

print("\nTop Offense Categories:")
display(filtered_df["Offense Category"].value_counts())
Top Patrol Boroughs:
Patrol Borough Name
PATROL BORO MAN SOUTH        181
PATROL BORO QUEENS NORTH      64
PATROL BORO MAN NORTH         44
PATROL BORO BKLYN NORTH       40
PATROL BORO BKLYN SOUTH       40
PATROL BORO BRONX             10
PATROL BORO QUEENS SOUTH      10
PATROL BORO STATEN ISLAND      5
Name: count, dtype: int64
Top Offense Categories:
Offense Category
Race/Color    394
Name: count, dtype: int64

Monthly Time Series¶

In [19]:
# Incident counts per month (month-end bins) and per year (year-end bins),
# plotted with the explicit figure/axes API.
monthly_counts = filtered_df.resample("ME").size()

fig, ax = plt.subplots(figsize=(12, 6))
monthly_counts.plot(marker="o", ax=ax)
ax.set_title("Monthly Anti-Asian Hate Crime Incidents (NYPD)")
ax.set_xlabel("Month")
ax.set_ylabel("Incidents")
ax.grid(True)
fig.tight_layout()
plt.show()

yearly_counts = filtered_df.resample("YE").size()

fig, ax = plt.subplots(figsize=(12, 6))
yearly_counts.plot(kind="bar", ax=ax)
ax.set_title("Yearly Anti-Asian Hate Crime Incidents")
ax.set_xlabel("Year")
ax.set_ylabel("Incidents")
fig.tight_layout()
plt.show()
No description has been provided for this image
No description has been provided for this image

Seasonality Heatmap (Year × Month)¶

In [20]:
# Year × month incident counts for the seasonality heatmap. pivot_table already
# returns the index sorted ascending, so the former reindex(sorted(...)) pass
# was a no-op and has been removed. Cells with no incidents become 0.
heatmap_data = filtered_df.pivot_table(
    index="year",
    columns="month",
    values="Full Complaint ID",   # any always-present column works; we only count rows
    aggfunc="count"
).fillna(0)

fig, ax = plt.subplots(figsize=(12, 6))
sns.heatmap(heatmap_data, annot=True, fmt=".0f", cmap="Reds", ax=ax)
ax.set_title("Anti-Asian Crime Seasonality (Year × Month)")
ax.set_xlabel("Month Number")
ax.set_ylabel("Year")
fig.tight_layout()
plt.show()
No description has been provided for this image
In [ ]:
 

Borough-Level Bar Plot¶

In [21]:
# Total incidents per patrol borough, as a labeled bar chart.
borough_counts = filtered_df["Patrol Borough Name"].value_counts()

fig, ax = plt.subplots(figsize=(10, 6))
sns.barplot(x=borough_counts.index, y=borough_counts.values, ax=ax)
ax.set_title("Anti-Asian Hate Crimes by Borough")
ax.set_xlabel("Borough")
ax.set_ylabel("Incidents")
plt.setp(ax.get_xticklabels(), rotation=45, ha="right")
fig.tight_layout()
plt.show()
No description has been provided for this image

Arrest Analysis¶

In [22]:
# Arrest vs no-arrest breakdown. Note: in this export "Arrest Date" is entirely
# null (see df.info() above), so every Anti-Asian incident counts as "no arrest
# recorded" — this may reflect missing data rather than a true absence of arrests.
filtered_df["Arrest Date"] = pd.to_datetime(filtered_df["Arrest Date"], errors="coerce")
filtered_df["arrest_made"] = filtered_df["Arrest Date"].notna()

# Idiomatic boolean aggregation instead of comparing to True/False literals.
arrest_count    = int(filtered_df["arrest_made"].sum())
no_arrest_count = len(filtered_df) - arrest_count

labels = ["No Arrest", "Arrest"]
values = [no_arrest_count, arrest_count]

fig, ax = plt.subplots(figsize=(6, 5))
sns.barplot(x=labels, y=values, ax=ax)
ax.set_title("Arrest vs No-Arrest in Anti-Asian Incidents")
ax.set_ylabel("Number of Incidents")
fig.tight_layout()
plt.show()
No description has been provided for this image

In the provided NYPD Hate Crimes dataset, none of the incidents classified as ANTI-ASIAN have a non-null Arrest Date. In our analysis, we therefore treat all Anti-Asian incidents as ‘no arrest recorded’.¶

This may indicate either a true absence of arrests or incomplete recording of arrest data for these cases.¶

In [ ]:
 

Precinct Hotspots¶

In [23]:
# Incident counts per precinct, most-affected first.
# (In this extract the precinct column has no nulls, so dropna is a safety net.)
filtered_df_precinct = filtered_df.dropna(subset=["Complaint Precinct Code"]).copy()
filtered_df_precinct["Complaint Precinct Code"] = filtered_df_precinct["Complaint Precinct Code"].astype(int)

precinct_counts = (
    filtered_df_precinct
    .groupby("Complaint Precinct Code")
    .size()
    .sort_values(ascending=False)
)

print("Top 15 precincts by Anti-Asian incident count:")
display(precinct_counts.head(15))
Top 15 precincts by Anti-Asian incident count:
Complaint Precinct Code
14     52
13     25
5      21
7      16
84     15
62     15
109    14
9      13
6      13
18     12
108    12
17     12
24     12
115     9
1       9
dtype: int64
In [24]:
# Bar chart of the N most-affected precincts; the head() slice is computed once.
top_n = 15
top_precincts = precinct_counts.head(top_n)

fig, ax = plt.subplots(figsize=(12, 6))
sns.barplot(x=top_precincts.index.astype(str), y=top_precincts.values, ax=ax)
ax.set_title(f"Top {top_n} Precincts by Anti-Asian Hate Crime Incidents")
ax.set_xlabel("Complaint Precinct Code")
ax.set_ylabel("Number of Incidents")
plt.setp(ax.get_xticklabels(), rotation=45, ha="right")
fig.tight_layout()
plt.show()
No description has been provided for this image

Loading GeoJSON¶

In [25]:
# Download precinct polygons and attach the per-precinct incident counts.
# On any failure (network, schema change, merge error) we fall back to gdf = None
# so the mapping cell below can degrade gracefully.
try:
    gdf_precincts = gpd.read_file(PRECINCTS_GEOJSON_URL)
    gdf_precincts.columns = gdf_precincts.columns.str.strip()

    # The precinct-code column name is not guaranteed; find it case-insensitively.
    precinct_candidates = [c for c in gdf_precincts.columns if "precinct" in c.lower()]
    if not precinct_candidates:
        raise ValueError(f"No precinct-like column found. Columns are: {gdf_precincts.columns.tolist()}")

    precinct_col = precinct_candidates[0]
    print("Using GeoJSON column as precinct code:", precinct_col)

    # Rename + cast so the join key matches the counts series exactly.
    gdf_precincts = gdf_precincts.rename(columns={precinct_col: "Complaint Precinct Code"})
    gdf_precincts["Complaint Precinct Code"] = gdf_precincts["Complaint Precinct Code"].astype(int)

    precinct_counts_df = precinct_counts.rename("Total_Incidents").reset_index()

    # Left merge keeps every precinct polygon, even those with zero incidents.
    gdf = gdf_precincts.merge(precinct_counts_df,
                              on="Complaint Precinct Code",
                              how="left")

    gdf["Total_Incidents"] = gdf["Total_Incidents"].fillna(0)

    print("GeoDataFrame shape after merge:", gdf.shape)

except Exception as e:
    print("Error loading or merging GeoJSON:", e)
    gdf = None
Using GeoJSON column as precinct code: precinct
GeoDataFrame shape after merge: (78, 5)

Single Choropleth Map Of Hotspots¶

In [26]:
# Choropleth of total incidents per precinct; guard clause handles the case
# where the GeoJSON could not be loaded.
if gdf is None:
    print("GeoDataFrame `gdf` is None; cannot plot map.")
else:
    fig, ax = plt.subplots(1, 1, figsize=(12, 10))

    gdf.plot(
        column="Total_Incidents",
        cmap="Reds",
        legend=True,
        edgecolor="black",
        linewidth=0.3,
        ax=ax,
    )

    ax.set_title("Anti-Asian Hate Crime Incidents by Precinct (Total)", fontsize=16)
    ax.axis("off")

    fig.tight_layout()
    plt.show()
No description has been provided for this image

Which Months Are Worst Overall?¶

In [27]:
# All-years incident totals by calendar month, displayed in calendar order.
month_order = [
    "January", "February", "March", "April", "May", "June",
    "July", "August", "September", "October", "November", "December",
]

month_counts = (
    filtered_df
    .groupby("month_name")
    .size()
    .reindex(month_order)
    .fillna(0)
)

fig, ax = plt.subplots(figsize=(12, 6))
sns.barplot(x=month_counts.index, y=month_counts.values, ax=ax)
ax.set_title("Anti-Asian Hate Crime Incidents by Month (All Years Combined)")
ax.set_xlabel("Month")
ax.set_ylabel("Number of Incidents")
plt.setp(ax.get_xticklabels(), rotation=45, ha="right")
fig.tight_layout()
plt.show()
No description has been provided for this image

Day-Of-Week Pattern¶

In [28]:
# Day-of-week pattern, derived from the record-creation date index.
filtered_df["day_of_week"] = filtered_df.index.day_name()

# Fixed ordering so the bars read Monday → Sunday rather than by frequency.
dow_order = ["Monday","Tuesday","Wednesday","Thursday","Friday","Saturday","Sunday"]

dow_counts = (
    filtered_df["day_of_week"]
    .value_counts()
    .reindex(dow_order)
    .fillna(0)
)

plt.figure(figsize=(10,5))
sns.barplot(x=dow_counts.index, y=dow_counts.values)
plt.title("Anti-Asian Hate Crime Incidents by Day of Week")
plt.xlabel("Day of Week")
plt.ylabel("Number of Incidents")
plt.tight_layout()
plt.show()
No description has been provided for this image
In [ ]:
 

Time Series Libraries¶

In [29]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

import statsmodels.api as sm
from sklearn.metrics import mean_absolute_error, mean_squared_error

from prophet import Prophet

Monthly Time Series (NYC Total)¶

In [30]:
# Build the NYC-wide monthly series used for modeling. The index is already a
# DatetimeIndex (set during cleaning), and .resample("ME").size() yields a
# complete month-end series with 0 for empty months, so the former
# pd.to_datetime(index) and .asfreq("ME", fill_value=0) passes were redundant
# and have been removed.
monthly_counts = filtered_df.resample("ME").size()

print("Monthly series length:", len(monthly_counts))
print("From:", monthly_counts.index.min(), "to:", monthly_counts.index.max())

fig, ax = plt.subplots(figsize=(12, 6))
monthly_counts.plot(marker="o", ax=ax)
ax.set_title("Monthly Anti-Asian Hate Crime Incidents (NYC total)")
ax.set_xlabel("Month")
ax.set_ylabel("Number of Incidents")
ax.grid(True)
fig.tight_layout()
plt.show()
Monthly series length: 80
From: 2019-01-31 00:00:00 to: 2025-08-31 00:00:00
No description has been provided for this image

Train / Test Split (Last 12 Months As Test)¶

In [31]:
# Hold out the final 12 months as an out-of-sample test set; everything earlier
# is training data for all three models (ARIMA, SARIMA, Prophet).
test_months = 12 

train = monthly_counts.iloc[:-test_months]
test  = monthly_counts.iloc[-test_months:]

print("Train range:", train.index.min(), "→", train.index.max(), "(", len(train), "months )")
print("Test  range:", test.index.min(),  "→", test.index.max(),  "(", len(test),  "months )")

plt.figure(figsize=(12,6))
train.plot(label="Train")
test.plot(label="Test", marker="o")
plt.title("Train/Test Split – Monthly Anti-Asian Incidents")
plt.xlabel("Month")
plt.ylabel("Incidents")
plt.legend()
plt.grid(True)
plt.tight_layout()
plt.show()
Train range: 2019-01-31 00:00:00 → 2024-08-31 00:00:00 ( 68 months )
Test  range: 2024-09-30 00:00:00 → 2025-08-31 00:00:00 ( 12 months )
No description has been provided for this image

ARIMA(1,1,1) On Monthly Data¶

In [32]:
# Baseline non-seasonal ARIMA(p=1, d=1, q=1) fitted on the training months.
arima_order = (1, 1, 1)

arima_model = sm.tsa.ARIMA(train, order=arima_order)
arima_result = arima_model.fit()

print(arima_result.summary())
                               SARIMAX Results                                
==============================================================================
Dep. Variable:                      y   No. Observations:                   68
Model:                 ARIMA(1, 1, 1)   Log Likelihood                -209.681
Date:                Wed, 03 Dec 2025   AIC                            425.362
Time:                        16:29:59   BIC                            431.976
Sample:                    01-31-2019   HQIC                           427.979
                         - 08-31-2024                                         
Covariance Type:                  opg                                         
==============================================================================
                 coef    std err          z      P>|z|      [0.025      0.975]
------------------------------------------------------------------------------
ar.L1          0.6503      0.140      4.631      0.000       0.375       0.926
ma.L1         -0.9992      2.546     -0.392      0.695      -5.990       3.991
sigma2        29.4097     73.606      0.400      0.689    -114.855     173.674
===================================================================================
Ljung-Box (L1) (Q):                   0.09   Jarque-Bera (JB):               802.16
Prob(Q):                              0.76   Prob(JB):                         0.00
Heteroskedasticity (H):               0.75   Skew:                             3.30
Prob(H) (two-sided):                  0.50   Kurtosis:                        18.61
===================================================================================

Warnings:
[1] Covariance matrix calculated using the outer product of gradients (complex-step).

ARIMA Forecast + Metrics¶

In [33]:
# Forecast the 12 held-out months with the baseline ARIMA and score the fit.
arima_forecast = pd.Series(arima_result.forecast(steps=len(test)), index=test.index)

fig, ax = plt.subplots(figsize=(12, 6))
train.plot(label="Train", ax=ax)
test.plot(label="Actual Test", marker="o", ax=ax)
arima_forecast.plot(label="ARIMA Forecast", marker="o", ax=ax)
ax.set_title("ARIMA(1,1,1) – Monthly Anti-Asian Incidents")
ax.set_xlabel("Month")
ax.set_ylabel("Incidents")
ax.legend()
ax.grid(True)
fig.tight_layout()
plt.show()

# Out-of-sample error metrics on the test window.
arima_mae = mean_absolute_error(test, arima_forecast)
arima_mse = mean_squared_error(test, arima_forecast)
arima_rmse = np.sqrt(arima_mse)

print(f"ARIMA(1,1,1) – MAE:  {arima_mae:.3f}")
print(f"ARIMA(1,1,1) – RMSE: {arima_rmse:.3f}")
No description has been provided for this image
ARIMA(1,1,1) – MAE:  2.600
ARIMA(1,1,1) – RMSE: 2.869
In [ ]:
 

SARIMA With Yearly Seasonality (12-Month Period)¶

In [34]:
# SARIMA with a 12-month seasonal component: seasonal differencing (D=1)
# absorbs the annual pattern. Stationarity/invertibility enforcement is
# relaxed so the optimizer can explore parameter-space boundaries.
sarima_order = (1, 0, 1)
sarima_seasonal_order = (1, 1, 0, 12)  

sarima_model = sm.tsa.statespace.SARIMAX(
    train,
    order=sarima_order,
    seasonal_order=sarima_seasonal_order,
    enforce_stationarity=False,
    enforce_invertibility=False
)

sarima_result = sarima_model.fit(disp=False)
print(sarima_result.summary())
                                      SARIMAX Results                                      
===========================================================================================
Dep. Variable:                                   y   No. Observations:                   68
Model:             SARIMAX(1, 0, 1)x(1, 1, [], 12)   Log Likelihood                -142.987
Date:                             Wed, 03 Dec 2025   AIC                            293.975
Time:                                     16:29:59   BIC                            301.020
Sample:                                 01-31-2019   HQIC                           296.573
                                      - 08-31-2024                                         
Covariance Type:                               opg                                         
==============================================================================
                 coef    std err          z      P>|z|      [0.025      0.975]
------------------------------------------------------------------------------
ar.L1          0.5700      0.335      1.702      0.089      -0.086       1.226
ma.L1          0.2532      0.428      0.591      0.554      -0.586       1.093
ar.S.L12      -0.3783      0.141     -2.675      0.007      -0.655      -0.101
sigma2        45.2191      6.303      7.174      0.000      32.865      57.573
===================================================================================
Ljung-Box (L1) (Q):                   0.01   Jarque-Bera (JB):                24.77
Prob(Q):                              0.93   Prob(JB):                         0.00
Heteroskedasticity (H):               0.09   Skew:                            -0.01
Prob(H) (two-sided):                  0.00   Kurtosis:                         6.72
===================================================================================

Warnings:
[1] Covariance matrix calculated using the outer product of gradients (complex-step).

SARIMA Forecast + Metrics¶

In [35]:
# Forecast the held-out months with the seasonal model and plot against actuals.
sarima_forecast = sarima_result.forecast(steps=len(test))
sarima_forecast = pd.Series(sarima_forecast, index=test.index)

plt.figure(figsize=(12,6))
train.plot(label="Train")
test.plot(label="Actual Test", marker="o")
sarima_forecast.plot(label="SARIMA Forecast", marker="o")
plt.title("SARIMA – Monthly Anti-Asian Incidents")
plt.xlabel("Month")
plt.ylabel("Incidents")
plt.legend()
plt.grid(True)
plt.tight_layout()
plt.show()

# Out-of-sample error metrics on the 12 test months.
sarima_mae = mean_absolute_error(test, sarima_forecast)
sarima_mse = mean_squared_error(test, sarima_forecast)
sarima_rmse = np.sqrt(sarima_mse)

print(f"SARIMA – MAE:  {sarima_mae:.3f}")
print(f"SARIMA – RMSE: {sarima_rmse:.3f}")
No description has been provided for this image
SARIMA – MAE:  2.075
SARIMA – RMSE: 2.521

Prophet On Monthly Data¶

In [36]:
# Prophet expects a two-column frame: "ds" (timestamp) and "y" (value).
prophet_df = monthly_counts.reset_index()
prophet_df.columns = ["ds", "y"]

# Same 12-month holdout as the ARIMA/SARIMA experiments.
prophet_train = prophet_df.iloc[:-test_months]
prophet_test  = prophet_df.iloc[-test_months:]
In [37]:
# Monthly data: disable weekly/daily seasonality, keep only the yearly component.
m = Prophet(
    weekly_seasonality=False,
    yearly_seasonality=True,
    daily_seasonality=False
)

m.fit(prophet_train)
16:30:00 - cmdstanpy - INFO - Chain [1] start processing
16:30:00 - cmdstanpy - INFO - Chain [1] done processing
Out[37]:
<prophet.forecaster.Prophet at 0x185189882f0>
In [38]:
# Extend the frame test_months month-ends past training and predict.
future = m.make_future_dataframe(periods=test_months, freq="ME")
forecast = m.predict(future)

# Keep only the forecast horizon and align its index with the test series.
prophet_forecast = forecast.set_index("ds")["yhat"].iloc[-test_months:]
prophet_forecast.index = test.index  # align exactly

plt.figure(figsize=(12,6))
train.plot(label="Train")
test.plot(label="Actual Test", marker="o")
prophet_forecast.plot(label="Prophet Forecast", marker="o")
plt.title("Prophet – Monthly Anti-Asian Incidents")
plt.xlabel("Month")
plt.ylabel("Incidents")
plt.legend()
plt.grid(True)
plt.tight_layout()
plt.show()

# Out-of-sample error metrics on the 12 test months.
prophet_mae = mean_absolute_error(test, prophet_forecast)
prophet_mse = mean_squared_error(test, prophet_forecast)
prophet_rmse = np.sqrt(prophet_mse)

print(f"Prophet – MAE:  {prophet_mae:.3f}")
print(f"Prophet – RMSE: {prophet_rmse:.3f}")
No description has been provided for this image
Prophet – MAE:  4.798
Prophet – RMSE: 5.761

Comparing All Three Models¶

In [39]:
# Side-by-side comparison of the three untuned models on the same holdout.
results = pd.DataFrame({
    "Model": ["ARIMA(1,1,1)", "SARIMA(1,0,1)(1,1,0,12)", "Prophet"],
    "MAE":   [arima_mae, sarima_mae, prophet_mae],
    "RMSE":  [arima_rmse, sarima_rmse, prophet_rmse]
})

display(results)
Model MAE RMSE
0 ARIMA(1,1,1) 2.600102 2.868501
1 SARIMA(1,0,1)(1,1,0,12) 2.074892 2.521463
2 Prophet 4.798075 5.760596

Hyperparameter Tuning¶

In [40]:
import numpy as np
import itertools
from sklearn.metrics import mean_absolute_error, mean_squared_error

from pmdarima import auto_arima

from prophet import Prophet

Auto-ARIMA (Non-Seasonal ARIMA Tuning)¶

In [41]:
# Stepwise AIC search over non-seasonal ARIMA orders (p ≤ 5, d ≤ 2, q ≤ 5).
auto_arima_model = auto_arima(
    train,
    seasonal=False,         
    stepwise=True,
    trace=True,              
    suppress_warnings=True,
    max_p=5,
    max_q=5,
    max_d=2,
    error_action="ignore"
)

print(auto_arima_model.summary())

# (p, d, q) of the AIC-best model, reused when refitting with statsmodels below.
best_arima_order = auto_arima_model.order
print("\nBest ARIMA order found:", best_arima_order)
Performing stepwise search to minimize aic
 ARIMA(2,0,2)(0,0,0)[0]             : AIC=434.308, Time=0.04 sec
 ARIMA(0,0,0)(0,0,0)[0]             : AIC=489.924, Time=0.01 sec
 ARIMA(1,0,0)(0,0,0)[0]             : AIC=431.960, Time=0.01 sec
 ARIMA(0,0,1)(0,0,0)[0]             : AIC=453.197, Time=0.02 sec
 ARIMA(2,0,0)(0,0,0)[0]             : AIC=433.945, Time=0.02 sec
 ARIMA(1,0,1)(0,0,0)[0]             : AIC=433.938, Time=0.02 sec
 ARIMA(2,0,1)(0,0,0)[0]             : AIC=435.775, Time=0.03 sec
 ARIMA(1,0,0)(0,0,0)[0] intercept   : AIC=428.336, Time=0.02 sec
 ARIMA(0,0,0)(0,0,0)[0] intercept   : AIC=460.685, Time=0.01 sec
 ARIMA(2,0,0)(0,0,0)[0] intercept   : AIC=429.692, Time=0.03 sec
 ARIMA(1,0,1)(0,0,0)[0] intercept   : AIC=429.606, Time=0.03 sec
 ARIMA(0,0,1)(0,0,0)[0] intercept   : AIC=435.047, Time=0.02 sec
 ARIMA(2,0,1)(0,0,0)[0] intercept   : AIC=431.577, Time=0.08 sec

Best model:  ARIMA(1,0,0)(0,0,0)[0] intercept
Total fit time: 0.352 seconds
                               SARIMAX Results                                
==============================================================================
Dep. Variable:                      y   No. Observations:                   68
Model:               SARIMAX(1, 0, 0)   Log Likelihood                -211.168
Date:                Wed, 03 Dec 2025   AIC                            428.336
Time:                        16:30:01   BIC                            434.994
Sample:                    01-31-2019   HQIC                           430.974
                         - 08-31-2024                                         
Covariance Type:                  opg                                         
==============================================================================
                 coef    std err          z      P>|z|      [0.025      0.975]
------------------------------------------------------------------------------
intercept      1.9246      1.434      1.342      0.180      -0.886       4.736
ar.L1          0.6265      0.128      4.908      0.000       0.376       0.877
sigma2        28.9507      2.824     10.251      0.000      23.415      34.486
===================================================================================
Ljung-Box (L1) (Q):                   0.33   Jarque-Bera (JB):               783.14
Prob(Q):                              0.57   Prob(JB):                         0.00
Heteroskedasticity (H):               0.63   Skew:                             3.31
Prob(H) (two-sided):                  0.27   Kurtosis:                        18.25
===================================================================================

Warnings:
[1] Covariance matrix calculated using the outer product of gradients (complex-step).

Best ARIMA order found: (1, 0, 0)

Fitting Tuned ARIMA And Evaluating¶

In [42]:
# Refit the auto-arima-selected order with statsmodels and evaluate on the holdout.
# Fix: the final print labels previously said "ARIMA(1,1,1)" even though this is
# the tuned model (best_arima_order, e.g. (1,0,0)); they now report the true order.
tuned_arima_model = sm.tsa.ARIMA(train, order=best_arima_order)
tuned_arima_result = tuned_arima_model.fit()

tuned_arima_forecast = tuned_arima_result.forecast(steps=len(test))
tuned_arima_forecast = pd.Series(tuned_arima_forecast, index=test.index)

plt.figure(figsize=(12,6))
train.plot(label="Train")
test.plot(label="Actual Test", marker="o")
tuned_arima_forecast.plot(label=f"ARIMA{best_arima_order} Forecast", marker="o")
plt.title(f"Tuned ARIMA{best_arima_order} – Monthly Anti-Asian Incidents")
plt.xlabel("Month")
plt.ylabel("Incidents")
plt.legend()
plt.grid(True)
plt.tight_layout()
plt.show()

# Out-of-sample error metrics for the tuned model.
tuned_arima_mae  = mean_absolute_error(test, tuned_arima_forecast)
tuned_arima_mse  = mean_squared_error(test, tuned_arima_forecast)
tuned_arima_rmse = np.sqrt(tuned_arima_mse)

print(f"Tuned ARIMA{best_arima_order} – MAE:  {tuned_arima_mae:.3f}")
print(f"Tuned ARIMA{best_arima_order} – RMSE: {tuned_arima_rmse:.3f}")
No description has been provided for this image
ARIMA(1,1,1) – MAE:  2.636
ARIMA(1,1,1) – RMSE: 2.896

Auto-SARIMA (Seasonal Tuning)¶

In [43]:
# Stepwise AIC search including a 12-month seasonal component (m=12).
auto_sarima_model = auto_arima(
    train,
    seasonal=True,
    m=12,                   
    stepwise=True,
    trace=True,
    suppress_warnings=True,
    max_p=3, max_q=3, max_d=2,
    max_P=2, max_Q=2, max_D=1,
    error_action="ignore"
)

print(auto_sarima_model.summary())

# Non-seasonal (p,d,q) and seasonal (P,D,Q,m) orders of the AIC-best model.
best_sarima_order = auto_sarima_model.order
best_sarima_seasonal = auto_sarima_model.seasonal_order

print("\nBest SARIMA order:", best_sarima_order)
print("Best SARIMA seasonal order:", best_sarima_seasonal)
Performing stepwise search to minimize aic
 ARIMA(2,0,2)(1,0,1)[12] intercept   : AIC=432.103, Time=0.39 sec
 ARIMA(0,0,0)(0,0,0)[12] intercept   : AIC=460.685, Time=0.01 sec
 ARIMA(1,0,0)(1,0,0)[12] intercept   : AIC=425.064, Time=0.06 sec
 ARIMA(0,0,1)(0,0,1)[12] intercept   : AIC=431.125, Time=0.05 sec
 ARIMA(0,0,0)(0,0,0)[12]             : AIC=489.924, Time=0.01 sec
 ARIMA(1,0,0)(0,0,0)[12] intercept   : AIC=428.336, Time=0.02 sec
 ARIMA(1,0,0)(2,0,0)[12] intercept   : AIC=426.987, Time=0.14 sec
 ARIMA(1,0,0)(1,0,1)[12] intercept   : AIC=inf, Time=0.23 sec
 ARIMA(1,0,0)(0,0,1)[12] intercept   : AIC=425.542, Time=0.06 sec
 ARIMA(1,0,0)(2,0,1)[12] intercept   : AIC=inf, Time=0.59 sec
 ARIMA(0,0,0)(1,0,0)[12] intercept   : AIC=457.510, Time=0.04 sec
 ARIMA(2,0,0)(1,0,0)[12] intercept   : AIC=426.422, Time=0.07 sec
 ARIMA(1,0,1)(1,0,0)[12] intercept   : AIC=426.330, Time=0.07 sec
 ARIMA(0,0,1)(1,0,0)[12] intercept   : AIC=431.626, Time=0.06 sec
 ARIMA(2,0,1)(1,0,0)[12] intercept   : AIC=428.290, Time=0.14 sec
 ARIMA(1,0,0)(1,0,0)[12]             : AIC=426.741, Time=0.03 sec

Best model:  ARIMA(1,0,0)(1,0,0)[12] intercept
Total fit time: 1.971 seconds
                                     SARIMAX Results                                      
==========================================================================================
Dep. Variable:                                  y   No. Observations:                   68
Model:             SARIMAX(1, 0, 0)x(1, 0, 0, 12)   Log Likelihood                -208.532
Date:                            Wed, 03 Dec 2025   AIC                            425.064
Time:                                    16:30:03   BIC                            433.942
Sample:                                01-31-2019   HQIC                           428.582
                                     - 08-31-2024                                         
Covariance Type:                              opg                                         
==============================================================================
                 coef    std err          z      P>|z|      [0.025      0.975]
------------------------------------------------------------------------------
intercept      1.3198      1.171      1.127      0.260      -0.975       3.614
ar.L1          0.6262      0.106      5.921      0.000       0.419       0.834
ar.S.L12       0.2605      0.129      2.021      0.043       0.008       0.513
sigma2        26.4612      4.075      6.493      0.000      18.474      34.448
===================================================================================
Ljung-Box (L1) (Q):                   0.32   Jarque-Bera (JB):               462.90
Prob(Q):                              0.57   Prob(JB):                         0.00
Heteroskedasticity (H):               0.74   Skew:                             2.80
Prob(H) (two-sided):                  0.48   Kurtosis:                        14.49
===================================================================================

Warnings:
[1] Covariance matrix calculated using the outer product of gradients (complex-step).

Best SARIMA order: (1, 0, 0)
Best SARIMA seasonal order: (1, 0, 0, 12)

Fitting Tuned SARIMA And Evaluating¶

In [44]:
# Refit the auto-arima-selected seasonal orders with statsmodels and evaluate.
# Fix: the final print labels previously said "SARIMA(1,1,1)" even though this
# is the tuned model; they now report the actual selected orders.
tuned_sarima_model = sm.tsa.statespace.SARIMAX(
    train,
    order=best_sarima_order,
    seasonal_order=best_sarima_seasonal,
    enforce_stationarity=False,
    enforce_invertibility=False
)

tuned_sarima_result = tuned_sarima_model.fit(disp=False)

tuned_sarima_forecast = tuned_sarima_result.forecast(steps=len(test))
tuned_sarima_forecast = pd.Series(tuned_sarima_forecast, index=test.index)

plt.figure(figsize=(12,6))
train.plot(label="Train")
test.plot(label="Actual Test", marker="o")
tuned_sarima_forecast.plot(label=f"SARIMA{best_sarima_order}{best_sarima_seasonal} Forecast", marker="o")
plt.title("Tuned SARIMA – Monthly Anti-Asian Incidents")
plt.xlabel("Month")
plt.ylabel("Incidents")
plt.legend()
plt.grid(True)
plt.tight_layout()
plt.show()

# Out-of-sample error metrics for the tuned seasonal model.
tuned_sarima_mae  = mean_absolute_error(test, tuned_sarima_forecast)
tuned_sarima_mse  = mean_squared_error(test, tuned_sarima_forecast)
tuned_sarima_rmse = np.sqrt(tuned_sarima_mse)

print(f"Tuned SARIMA{best_sarima_order}x{best_sarima_seasonal} – MAE:  {tuned_sarima_mae:.3f}")
print(f"Tuned SARIMA{best_sarima_order}x{best_sarima_seasonal} – RMSE: {tuned_sarima_rmse:.3f}")
No description has been provided for this image
SARIMA(1,1,1) – MAE:  1.464
SARIMA(1,1,1) – RMSE: 1.800

Prophet Tuning Via Small Grid Search¶

In [45]:
# Small grid search over Prophet hyperparameters, scored by RMSE on the holdout.
# Uses itertools.product (imported in the tuning-imports cell but previously
# unused) instead of three nested loops, and names the candidate model `model`
# so it no longer shadows the earlier fitted Prophet `m`.
prophet_df = monthly_counts.reset_index()
prophet_df.columns = ["ds", "y"]

prophet_train = prophet_df.iloc[:-test_months]
prophet_test  = prophet_df.iloc[-test_months:]

param_grid = {
    "changepoint_prior_scale": [0.01, 0.05, 0.1, 0.5],
    "seasonality_prior_scale": [1.0, 5.0, 10.0],
    "seasonality_mode": ["additive", "multiplicative"]
}

best_params = None
best_prophet_rmse = np.inf

for cp, sp, smode in itertools.product(
    param_grid["changepoint_prior_scale"],
    param_grid["seasonality_prior_scale"],
    param_grid["seasonality_mode"],
):
    model = Prophet(
        changepoint_prior_scale=cp,
        seasonality_prior_scale=sp,
        seasonality_mode=smode,
        weekly_seasonality=False,
        yearly_seasonality=True,
        daily_seasonality=False
    )
    model.fit(prophet_train)

    future = model.make_future_dataframe(periods=test_months, freq="ME")
    forecast = model.predict(future)

    # Forecast horizon only, index-aligned with the actual test series.
    pred = forecast.set_index("ds")["yhat"].iloc[-test_months:]
    pred.index = test.index

    rmse = np.sqrt(mean_squared_error(test, pred))

    if rmse < best_prophet_rmse:
        best_prophet_rmse = rmse
        best_params = (cp, sp, smode)

print("Best Prophet parameters (cp, sp, mode):", best_params)
print("Best Prophet RMSE:", best_prophet_rmse)
16:30:04 - cmdstanpy - INFO - Chain [1] start processing
16:30:04 - cmdstanpy - INFO - Chain [1] done processing
16:30:04 - cmdstanpy - INFO - Chain [1] start processing
16:30:04 - cmdstanpy - INFO - Chain [1] done processing
16:30:04 - cmdstanpy - INFO - Chain [1] start processing
16:30:04 - cmdstanpy - INFO - Chain [1] done processing
16:30:04 - cmdstanpy - INFO - Chain [1] start processing
16:30:05 - cmdstanpy - INFO - Chain [1] done processing
16:30:05 - cmdstanpy - INFO - Chain [1] start processing
16:30:05 - cmdstanpy - INFO - Chain [1] done processing
16:30:05 - cmdstanpy - INFO - Chain [1] start processing
16:30:05 - cmdstanpy - INFO - Chain [1] done processing
16:30:05 - cmdstanpy - INFO - Chain [1] start processing
16:30:05 - cmdstanpy - INFO - Chain [1] done processing
16:30:05 - cmdstanpy - INFO - Chain [1] start processing
16:30:06 - cmdstanpy - INFO - Chain [1] done processing
16:30:06 - cmdstanpy - INFO - Chain [1] start processing
16:30:06 - cmdstanpy - INFO - Chain [1] done processing
16:30:06 - cmdstanpy - INFO - Chain [1] start processing
16:30:06 - cmdstanpy - INFO - Chain [1] done processing
16:30:06 - cmdstanpy - INFO - Chain [1] start processing
16:30:07 - cmdstanpy - INFO - Chain [1] done processing
16:30:07 - cmdstanpy - INFO - Chain [1] start processing
16:30:07 - cmdstanpy - INFO - Chain [1] done processing
16:30:07 - cmdstanpy - INFO - Chain [1] start processing
16:30:07 - cmdstanpy - INFO - Chain [1] done processing
16:30:07 - cmdstanpy - INFO - Chain [1] start processing
16:30:08 - cmdstanpy - INFO - Chain [1] done processing
16:30:08 - cmdstanpy - INFO - Chain [1] start processing
16:30:08 - cmdstanpy - INFO - Chain [1] done processing
16:30:08 - cmdstanpy - INFO - Chain [1] start processing
16:30:09 - cmdstanpy - INFO - Chain [1] done processing
16:30:09 - cmdstanpy - INFO - Chain [1] start processing
16:30:09 - cmdstanpy - INFO - Chain [1] done processing
16:30:09 - cmdstanpy - INFO - Chain [1] start processing
16:30:10 - cmdstanpy - INFO - Chain [1] done processing
16:30:10 - cmdstanpy - INFO - Chain [1] start processing
16:30:10 - cmdstanpy - INFO - Chain [1] done processing
16:30:10 - cmdstanpy - INFO - Chain [1] start processing
16:30:11 - cmdstanpy - INFO - Chain [1] done processing
16:30:11 - cmdstanpy - INFO - Chain [1] start processing
16:30:11 - cmdstanpy - INFO - Chain [1] done processing
16:30:11 - cmdstanpy - INFO - Chain [1] start processing
16:30:12 - cmdstanpy - INFO - Chain [1] done processing
16:30:12 - cmdstanpy - INFO - Chain [1] start processing
16:30:12 - cmdstanpy - INFO - Chain [1] done processing
16:30:13 - cmdstanpy - INFO - Chain [1] start processing
16:30:13 - cmdstanpy - INFO - Chain [1] done processing
Best Prophet parameters (cp, sp, mode): (0.5, 1.0, 'multiplicative')
Best Prophet RMSE: 2.7421939919440614

Fitting Prophet With Best Params And Evaluating¶

In [46]:
best_cp, best_sp, best_mode = best_params

# Refit Prophet with the best grid-search configuration.
best_m = Prophet(
    changepoint_prior_scale=best_cp,
    seasonality_prior_scale=best_sp,
    seasonality_mode=best_mode,
    weekly_seasonality=False,
    yearly_seasonality=True,
    daily_seasonality=False
)

best_m.fit(prophet_train)

# FIX: use "ME" (month-end) instead of the deprecated "M" alias; this removes
# the pandas FutureWarning and matches the freq used in the grid-search cell.
future = best_m.make_future_dataframe(periods=test_months, freq="ME")
forecast = best_m.predict(future)

# Keep only the hold-out months and align the index with the test series.
best_prophet_forecast = forecast.set_index("ds")["yhat"].iloc[-test_months:]
best_prophet_forecast.index = test.index

plt.figure(figsize=(12,6))
train.plot(label="Train")
test.plot(label="Actual Test", marker="o")
best_prophet_forecast.plot(label="Tuned Prophet Forecast", marker="o")
plt.title(f"Tuned Prophet – Monthly Anti-Asian Incidents\n(cp={best_cp}, sp={best_sp}, mode={best_mode})")
plt.xlabel("Month")
plt.ylabel("Incidents")
plt.legend()
plt.grid(True)
plt.tight_layout()
plt.show()

tuned_prophet_mae  = mean_absolute_error(test, best_prophet_forecast)
tuned_prophet_mse = mean_squared_error(test, best_prophet_forecast)
tuned_prophet_rmse = np.sqrt(tuned_prophet_mse)

print(f"Tuned Prophet – MAE:  {tuned_prophet_mae:.3f}")
print(f"Tuned Prophet – RMSE: {tuned_prophet_rmse:.3f}")
16:30:13 - cmdstanpy - INFO - Chain [1] start processing
16:30:14 - cmdstanpy - INFO - Chain [1] done processing
C:\Users\vibha\anaconda3\Lib\site-packages\prophet\forecaster.py:1872: FutureWarning: 'M' is deprecated and will be removed in a future version, please use 'ME' instead.
  dates = pd.date_range(
No description has been provided for this image
Tuned Prophet – MAE:  2.315
Tuned Prophet – RMSE: 2.742

Comparing Tuned Models¶

In [47]:
# Summary table of the three tuned models (labels embed the chosen params).
tuned_rows = [
    {"Model": f"ARIMA{best_arima_order}",
     "MAE": tuned_arima_mae, "RMSE": tuned_arima_rmse},
    {"Model": f"SARIMA{best_sarima_order}{best_sarima_seasonal}",
     "MAE": tuned_sarima_mae, "RMSE": tuned_sarima_rmse},
    {"Model": f"Prophet(cp={best_cp}, sp={best_sp}, mode={best_mode})",
     "MAE": tuned_prophet_mae, "RMSE": tuned_prophet_rmse},
]

tuned_results = pd.DataFrame(tuned_rows, columns=["Model", "MAE", "RMSE"])

display(tuned_results)
Model MAE RMSE
0 ARIMA(1, 0, 0) 2.635961 2.895933
1 SARIMA(1, 0, 0)(1, 0, 0, 12) 1.464307 1.800098
2 Prophet(cp=0.5, sp=1.0, mode=multiplicative) 2.315017 2.742194

Combining All Model Forecasts¶

In [48]:
# Side-by-side view of the actual test values and all six forecasts.
forecast_columns = {
    "Actual": test,
    "ARIMA Untuned": arima_forecast,
    "ARIMA Tuned": tuned_arima_forecast,
    "SARIMA Untuned": sarima_forecast,
    "SARIMA Tuned": tuned_sarima_forecast,
    "Prophet Untuned": prophet_forecast,
    "Prophet Tuned": best_prophet_forecast,
}

comparison_df = pd.DataFrame(forecast_columns)

display(comparison_df.head())
Actual ARIMA Untuned ARIMA Tuned SARIMA Untuned SARIMA Tuned Prophet Untuned Prophet Tuned
Record Create Date
2024-09-30 7 3.747455 3.804200 2.874223 3.741317 4.514449 0.734242
2024-10-31 3 4.233510 4.308042 0.615527 1.679759 4.521154 0.475653
2024-11-30 3 4.549583 4.623706 1.450476 1.143071 5.320337 0.535817
2024-12-31 3 4.755118 4.821475 2.092783 1.383703 5.497119 0.435934
2025-01-31 1 4.888774 4.945379 2.076868 0.934048 2.735826 0.219521

Plotting All Models Versus Actual (Tuned + Untuned)¶

In [49]:
# One panel: actual series in bold black, untuned models dashed, tuned with markers.
fig, ax = plt.subplots(figsize=(15, 8))

ax.plot(test.index, test.values, label="Actual", color="black", linewidth=3)

for forecast_series, label in [
    (arima_forecast, "ARIMA Untuned"),
    (sarima_forecast, "SARIMA Untuned"),
    (prophet_forecast, "Prophet Untuned"),
]:
    ax.plot(test.index, forecast_series, label=label, linestyle="--")

for forecast_series, label in [
    (tuned_arima_forecast, "ARIMA Tuned"),
    (tuned_sarima_forecast, "SARIMA Tuned"),
    (best_prophet_forecast, "Prophet Tuned"),
]:
    ax.plot(test.index, forecast_series, label=label, marker="o")

ax.set_title("Comparison of Tuned and Untuned Forecasting Models\nMonthly Anti-Asian Hate Crime Incidents", fontsize=16)
ax.set_xlabel("Month")
ax.set_ylabel("Incident Count")
ax.grid(True)
ax.legend()
fig.tight_layout()
plt.show()
No description has been provided for this image
In [ ]:
 

Performance Comparison Table (Tuned Versus Untuned)¶

In [50]:
# Metrics for every model variant, tuned and untuned, in one frame.
# NOTE: comparison_metrics is reused by later cells — keep the name and columns.
metric_rows = [
    ("ARIMA Untuned",   arima_mae,   arima_rmse),
    ("ARIMA Tuned",     tuned_arima_mae,   tuned_arima_rmse),
    ("SARIMA Untuned",  sarima_mae,  sarima_rmse),
    ("SARIMA Tuned",    tuned_sarima_mae,  tuned_sarima_rmse),
    ("Prophet Untuned", prophet_mae, prophet_rmse),
    ("Prophet Tuned",   tuned_prophet_mae, tuned_prophet_rmse),
]

comparison_metrics = pd.DataFrame(metric_rows, columns=["Model", "MAE", "RMSE"])

display(comparison_metrics)
Model MAE RMSE
0 ARIMA Untuned 2.600102 2.868501
1 ARIMA Tuned 2.635961 2.895933
2 SARIMA Untuned 2.074892 2.521463
3 SARIMA Tuned 1.464307 1.800098
4 Prophet Untuned 4.798075 5.760596
5 Prophet Tuned 2.315017 2.742194

Bar Chart Comparing RMSE (Tuned Versus Untuned)¶

In [51]:
# Bar chart of RMSE per model; paired light/dark shades mark untuned vs tuned.
bar_colors = [
    "#ff9999", "#ff4d4d",    
    "#99ccff", "#1a75ff",    
    "#99ff99", "#33cc33"    
]

fig, ax = plt.subplots(figsize=(12, 6))
ax.bar(comparison_metrics["Model"], comparison_metrics["RMSE"], color=bar_colors)

ax.set_title("RMSE Comparison: Tuned vs Untuned Forecasting Models", fontsize=15)
ax.set_ylabel("RMSE (Lower is better)")
plt.xticks(rotation=45, ha="right")
ax.grid(axis="y", linestyle="--", alpha=0.5)
fig.tight_layout()
plt.show()
No description has been provided for this image

Relative Model Accuracy¶

In [52]:
# Express every model's RMSE relative to the best model (SARIMA Tuned = 100%).
baseline_name = "SARIMA Tuned"

baseline_rmse = comparison_metrics.loc[
    comparison_metrics["Model"] == baseline_name, "RMSE"
].iloc[0]

# Ratio of baseline RMSE to each model's RMSE, as a percentage (<=100 here
# since the baseline has the lowest RMSE).
comparison_metrics["Relative_Accuracy_Value"] = (
    baseline_rmse / comparison_metrics["RMSE"] * 100
)

def format_relative_accuracy(value):
    """Render a relative-accuracy percentage: exactly '100%' for the baseline,
    '≈ N%' otherwise."""
    pct = round(value)
    return "100%" if pct == 100 else f"≈ {pct}%"

comparison_metrics["Relative Accuracy"] = comparison_metrics[
    "Relative_Accuracy_Value"
].apply(format_relative_accuracy)

# Best (lowest-RMSE) models first.
comparison_metrics_sorted = comparison_metrics.sort_values("RMSE").reset_index(drop=True)

display(comparison_metrics_sorted[["Model", "MAE", "RMSE", "Relative Accuracy"]])
Model MAE RMSE Relative Accuracy
0 SARIMA Tuned 1.464307 1.800098 100%
1 SARIMA Untuned 2.074892 2.521463 ≈ 71%
2 Prophet Tuned 2.315017 2.742194 ≈ 66%
3 ARIMA Untuned 2.600102 2.868501 ≈ 63%
4 ARIMA Tuned 2.635961 2.895933 ≈ 62%
5 Prophet Untuned 4.798075 5.760596 ≈ 31%

Bar Chart Of Relative Accuracy¶

In [53]:
plt.figure(figsize=(12, 6))

plt.bar(
    comparison_metrics_sorted["Model"],
    comparison_metrics_sorted["Relative_Accuracy_Value"],
)

plt.title(
    "Relative Accuracy (Higher = Better Performance)\nBaseline: SARIMA Tuned = 100% Accuracy",
    fontsize=14,
)
plt.ylabel("Relative Accuracy (%)")
plt.xticks(rotation=45, ha="right")
plt.grid(axis="y", linestyle="--", alpha=0.5)
plt.tight_layout()
plt.show()
No description has been provided for this image
In [ ]:
 

Based on the evaluation metrics, the tuned SARIMA model achieved the lowest RMSE (1.80), making it the most accurate forecasting model among the three approaches. While Prophet and ARIMA models performed reasonably well, SARIMA demonstrated better stability and predictive accuracy, particularly for low-frequency monthly crime data. Thus, the final short-term forecasts were generated using the tuned SARIMA configuration.¶

Refitting Tuned SARIMA On Full Data And Forecasting 6 Months¶

In [54]:
# Refit the tuned SARIMA on the FULL series before producing future forecasts.
full_sarima_model = sm.tsa.statespace.SARIMAX(
    monthly_counts,
    order=best_sarima_order,
    seasonal_order=best_sarima_seasonal,
    enforce_stationarity=False,
    enforce_invertibility=False
)

full_sarima_result = full_sarima_model.fit(disp=False)

n_steps = 6  # forecast horizon in months

# get_forecast (unlike .forecast) also exposes confidence intervals.
sarima_forecast_obj = full_sarima_result.get_forecast(steps=n_steps)
sarima_future_mean = sarima_forecast_obj.predicted_mean
sarima_future_ci = sarima_forecast_obj.conf_int()

# Build the month-end index for the forecast horizon.
# FIX: use "ME" (month-end) instead of the deprecated "M" alias; this removes
# the pandas FutureWarning without changing the generated dates.
future_index = pd.date_range(
    start=monthly_counts.index[-1] + pd.offsets.MonthEnd(1),
    periods=n_steps,
    freq="ME"
)

sarima_future_mean.index = future_index
sarima_future_ci.index = future_index

print("SARIMA forecast for the next 6 months:")
display(
    pd.DataFrame({
        "Forecast": sarima_future_mean,
        "Lower CI": sarima_future_ci.iloc[:, 0],
        "Upper CI": sarima_future_ci.iloc[:, 1],
    })
)
SARIMA forecast for the next 6 months:
C:\Users\vibha\AppData\Local\Temp\ipykernel_19412\2913118711.py:17: FutureWarning: 'M' is deprecated and will be removed in a future version, please use 'ME' instead.
  future_index = pd.date_range(
Forecast Lower CI Upper CI
2025-09-30 2.488614 -7.948081 12.925308
2025-10-31 1.047929 -11.723726 13.819584
2025-11-30 1.061911 -12.724868 14.848690
2025-12-31 1.071773 -13.193152 15.336698
2026-01-31 0.348475 -14.148456 14.845407
2026-02-28 0.353382 -14.257602 14.964366

Plotting History + 6-Month SARIMA Forecast¶

In [55]:
recent_months = 24  # how much trailing history to show for context

fig, ax = plt.subplots(figsize=(12, 6))

# Recent history plus the 6-month point forecast.
monthly_counts.tail(recent_months).plot(ax=ax, label="History", marker="o")
sarima_future_mean.plot(ax=ax, label="SARIMA Forecast", marker="o")

# Shade the confidence band around the forecast.
ax.fill_between(
    sarima_future_mean.index,
    sarima_future_ci.iloc[:, 0],
    sarima_future_ci.iloc[:, 1],
    alpha=0.2,
    label="95% confidence interval"
)

ax.set_title(f"Tuned SARIMA{best_sarima_order}{best_sarima_seasonal} – 6-Month Forecast\nMonthly Anti-Asian Hate Crime Incidents")
ax.set_xlabel("Month")
ax.set_ylabel("Incident Count")
ax.grid(True)
ax.legend()
fig.tight_layout()
plt.show()
No description has been provided for this image
In [ ]:
 

Computing Historical Precinct Totals And Shares¶

In [56]:
# Drop records without a precinct and normalize the code to int for joining.
filtered_df_precinct = filtered_df.dropna(subset=["Complaint Precinct Code"]).copy()
filtered_df_precinct["Complaint Precinct Code"] = filtered_df_precinct["Complaint Precinct Code"].astype(int)

# Total historical incidents per precinct, largest first.
precinct_counts = (
    filtered_df_precinct
    .groupby("Complaint Precinct Code")
    .size()
    .rename("historical_total")
    .sort_values(ascending=False)
)

print("Top 10 precincts by historical Anti-Asian incidents:")
display(precinct_counts.head(10))

# Each precinct's share of the citywide total (shares sum to 1).
total_incidents_citywide = precinct_counts.sum()
precinct_shares = precinct_counts.div(total_incidents_citywide).rename("share")

print("\nCheck: sum of shares =", precinct_shares.sum())
Top 10 precincts by historical Anti-Asian incidents:
Complaint Precinct Code
14     52
13     25
5      21
7      16
84     15
62     15
109    14
9      13
6      13
18     12
Name: historical_total, dtype: int64
Check: sum of shares = 0.9999999999999998

Using SARIMA Forecast To Get Expected 6-Month Total¶

In [57]:
# Citywide expected incident count over the 6-month forecast horizon.
forecast_6m_total = sarima_future_mean.sum()
print(f"Total forecasted Anti-Asian incidents (next 6 months, citywide): {forecast_6m_total}")
Total forecasted Anti-Asian incidents (next 6 months, citywide): 6.37208442247705

Allocating Forecast To Precincts Using Historical Shares¶

In [58]:
# Allocate the citywide 6-month forecast to precincts via historical shares.
precinct_forecast_6m = precinct_shares.mul(forecast_6m_total).rename("forecast_6m")

# Combine totals, shares, and allocated forecasts into one table.
precinct_stats = pd.concat([precinct_counts, precinct_shares, precinct_forecast_6m], axis=1)

display(precinct_stats.head(10))
historical_total share forecast_6m
Complaint Precinct Code
14 52 0.131980 0.840986
13 25 0.063452 0.404320
5 21 0.053299 0.339629
7 16 0.040609 0.258765
84 15 0.038071 0.242592
62 15 0.038071 0.242592
109 14 0.035533 0.226419
9 13 0.032995 0.210246
6 13 0.032995 0.210246
18 12 0.030457 0.194074

Loading Precinct GeoJSON And Merging Stats¶

In [59]:
# Load NYPD precinct polygons and attach the historical/forecast stats.
gdf_precincts = gpd.read_file(PRECINCTS_GEOJSON_URL)
gdf_precincts.columns = gdf_precincts.columns.str.strip()

# Locate the precinct-number column without hardcoding its exact name.
precinct_candidates = [c for c in gdf_precincts.columns if "precinct" in c.lower()]
print("Precinct columns in GeoJSON:", precinct_candidates)
# FIX: fail with a clear message instead of an opaque IndexError if the
# GeoJSON schema changes and no precinct-like column is present.
if not precinct_candidates:
    raise KeyError("No column containing 'precinct' found in the precinct GeoJSON")
precinct_col = precinct_candidates[0]

# Rename and cast so the merge key matches the stats table exactly.
gdf_precincts = gdf_precincts.rename(columns={precinct_col: "Complaint Precinct Code"})
gdf_precincts["Complaint Precinct Code"] = gdf_precincts["Complaint Precinct Code"].astype(int)

gdf_stats = gdf_precincts.merge(
    precinct_stats.reset_index(),
    on="Complaint Precinct Code",
    how="left"
)

# Precincts with no recorded incidents get 0 rather than NaN for plotting.
for col in ["historical_total", "share", "forecast_6m"]:
    gdf_stats[col] = gdf_stats[col].fillna(0)

print("GeoDataFrame with stats:", gdf_stats.shape)
Precinct columns in GeoJSON: ['precinct']
GeoDataFrame with stats: (78, 7)

Choropleth: Historical Hotspots¶

In [60]:
# Choropleth of total historical incidents per precinct.
fig, ax = plt.subplots(figsize=(10, 10))

choropleth_style = dict(cmap="Reds", legend=True, edgecolor="black", linewidth=0.3)
gdf_stats.plot(column="historical_total", ax=ax, **choropleth_style)

ax.set_title("Historical Anti-Asian Hate Crime Incidents by Precinct (Total)", fontsize=15)
ax.axis("off")

fig.tight_layout()
plt.show()
No description has been provided for this image

Choropleth: Forecasted 6-Month Expected Incidents¶

In [61]:
# Choropleth of the expected incidents allocated to each precinct.
fig, ax = plt.subplots(figsize=(10, 10))

choropleth_style = dict(cmap="Blues", legend=True, edgecolor="black", linewidth=0.3)
gdf_stats.plot(column="forecast_6m", ax=ax, **choropleth_style)

ax.set_title("Expected Anti-Asian Incidents in Next 6 Months (by Precinct)\nAllocated Using Tuned SARIMA Forecast + Historical Shares", fontsize=14)
ax.axis("off")

fig.tight_layout()
plt.show()
No description has been provided for this image
In [ ]:
 
In [ ]: