Imports & Settings¶
In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import geopandas as gpd
from IPython.display import display
In [2]:
sns.set_style("whitegrid")
plt.rcParams['figure.figsize'] = (12, 6)
In [3]:
DATA_PATH = "NYPD_Hate_Crimes_20251202.csv"
PRECINCTS_GEOJSON_URL = "https://data.cityofnewyork.us/resource/y76i-bdw7.geojson"
In [4]:
TARGET_MOTIVE = "ANTI-ASIAN"
Load data & basic information¶
In [5]:
df = pd.read_csv(DATA_PATH)
In [6]:
print("Shape of raw dataset:", df.shape)
print("\nColumn names:")
print(df.columns.tolist())
Shape of raw dataset: (3872, 14)

Column names:
['Full Complaint ID', 'Complaint Year Number', 'Month Number', 'Record Create Date', 'Complaint Precinct Code', 'Patrol Borough Name', 'County', 'Law Code Category Description', 'Offense Description', 'PD Code Description', 'Bias Motive Description', 'Offense Category', 'Arrest Date', 'Arrest Id']
In [7]:
print("First 5 rows:")
display(df.head())
First 5 rows:
| | Full Complaint ID | Complaint Year Number | Month Number | Record Create Date | Complaint Precinct Code | Patrol Borough Name | County | Law Code Category Description | Offense Description | PD Code Description | Bias Motive Description | Offense Category | Arrest Date | Arrest Id |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | 201906012119317 | 2019 | 1 | 01/23/2019 | 60 | PATROL BORO BKLYN SOUTH | KINGS | FELONY | MISCELLANEOUS PENAL LAW | AGGRAVATED HARASSMENT 1 | ANTI-JEWISH | Religion/Religious Practice | NaN | NaN |
| 1 | 201906012175717 | 2019 | 2 | 02/25/2019 | 60 | PATROL BORO BKLYN SOUTH | KINGS | FELONY | MISCELLANEOUS PENAL LAW | AGGRAVATED HARASSMENT 1 | ANTI-JEWISH | Religion/Religious Practice | NaN | NaN |
| 2 | 201906012180117 | 2019 | 2 | 02/27/2019 | 60 | PATROL BORO BKLYN SOUTH | KINGS | FELONY | MISCELLANEOUS PENAL LAW | AGGRAVATED HARASSMENT 1 | ANTI-JEWISH | Religion/Religious Practice | NaN | NaN |
| 3 | 201906012273417 | 2019 | 4 | 04/16/2019 | 60 | PATROL BORO BKLYN SOUTH | KINGS | FELONY | MISCELLANEOUS PENAL LAW | AGGRAVATED HARASSMENT 1 | ANTI-JEWISH | Religion/Religious Practice | NaN | NaN |
| 4 | 201906012413717 | 2019 | 6 | 06/20/2019 | 60 | PATROL BORO BKLYN SOUTH | KINGS | FELONY | MISCELLANEOUS PENAL LAW | AGGRAVATED HARASSMENT 1 | ANTI-JEWISH | Religion/Religious Practice | NaN | NaN |
In [8]:
print("\nDataFrame info:")
df.info()
DataFrame info:
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 3872 entries, 0 to 3871
Data columns (total 14 columns):
 #   Column                         Non-Null Count  Dtype
---  ------                         --------------  -----
 0   Full Complaint ID              3872 non-null   int64
 1   Complaint Year Number          3872 non-null   int64
 2   Month Number                   3872 non-null   int64
 3   Record Create Date             3872 non-null   object
 4   Complaint Precinct Code        3872 non-null   int64
 5   Patrol Borough Name            3872 non-null   object
 6   County                         3872 non-null   object
 7   Law Code Category Description  3872 non-null   object
 8   Offense Description            3872 non-null   object
 9   PD Code Description            3872 non-null   object
 10  Bias Motive Description        3872 non-null   object
 11  Offense Category               3872 non-null   object
 12  Arrest Date                    0 non-null      float64
 13  Arrest Id                      1694 non-null   object
dtypes: float64(1), int64(4), object(9)
memory usage: 423.6+ KB
Date Handling & Cleaning¶
In [9]:
# Convert record creation date to datetime; unparseable values become NaT (dropped below)
df["Record Create Date"] = pd.to_datetime(df["Record Create Date"], errors="coerce")
In [10]:
# Dropping rows with invalid dates
df = df.dropna(subset=["Record Create Date"]).copy()
In [11]:
# Sorting and setting index
df = df.sort_values("Record Create Date")
df.set_index("Record Create Date", inplace=True)
In [12]:
print("Data after cleaning:", df.shape)
print("Date range:", df.index.min(), "→", df.index.max())
Data after cleaning: (3872, 13)
Date range: 2019-01-01 00:00:00 → 2025-09-30 00:00:00
Filter Exact Anti-Asian Incidents¶
In [13]:
df["Bias Motive Description"] = df["Bias Motive Description"].astype(str).str.strip()
In [14]:
filtered_df = df[df["Bias Motive Description"].str.upper() == TARGET_MOTIVE].copy()
In [15]:
print(f"Total ANTI-ASIAN incidents found: {len(filtered_df)}")
display(filtered_df.head())
Total ANTI-ASIAN incidents found: 394
| | Full Complaint ID | Complaint Year Number | Month Number | Complaint Precinct Code | Patrol Borough Name | County | Law Code Category Description | Offense Description | PD Code Description | Bias Motive Description | Offense Category | Arrest Date | Arrest Id |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| Record Create Date | |||||||||||||
| 2019-01-15 | 201906112101017 | 2019 | 1 | 61 | PATROL BORO BKLYN SOUTH | KINGS | FELONY | MURDER & NON-NEGL. MANSLAUGHTE | MURDER,UNCLASSIFIED | ANTI-ASIAN | Race/Color | NaN | K31675023 |
| 2020-01-29 | 202007912147017 | 2020 | 1 | 79 | PATROL BORO BKLYN NORTH | KINGS | FELONY | FELONY ASSAULT | ASSAULT 2,1,UNCLASSIFIED | ANTI-ASIAN | Race/Color | NaN | NaN |
| 2020-01-29 | 202007912147017 | 2020 | 1 | 79 | PATROL BORO BKLYN NORTH | KINGS | FELONY | FELONY ASSAULT | ASSAULT 2,1,UNCLASSIFIED | ANTI-ASIAN | Race/Color | NaN | K32676408 |
| 2020-03-10 | 202002312217317 | 2020 | 3 | 23 | PATROL BORO MAN NORTH | NEW YORK | MISDEMEANOR | ASSAULT 3 & RELATED OFFENSES | ASSAULT 3 | ANTI-ASIAN | Race/Color | NaN | M32682527 |
| 2020-03-10 | 202002312217217 | 2020 | 3 | 23 | PATROL BORO MAN NORTH | NEW YORK | FELONY | MISCELLANEOUS PENAL LAW | AGGRAVATED HARASSMENT 1 | ANTI-ASIAN | Race/Color | NaN | M32682532 |
In [16]:
print("Incident Date Range:",
filtered_df.index.min().strftime("%Y-%m-%d"),
"→",
filtered_df.index.max().strftime("%Y-%m-%d"))
Incident Date Range: 2019-01-15 → 2025-08-19
Adding Year/Month Columns¶
In [17]:
filtered_df["year"] = filtered_df.index.year
filtered_df["month"] = filtered_df.index.month
filtered_df["month_name"] = filtered_df.index.month_name()
display(filtered_df[["year","month","month_name"]].head())
| | year | month | month_name |
|---|---|---|---|
| Record Create Date | |||
| 2019-01-15 | 2019 | 1 | January |
| 2020-01-29 | 2020 | 1 | January |
| 2020-01-29 | 2020 | 1 | January |
| 2020-03-10 | 2020 | 3 | March |
| 2020-03-10 | 2020 | 3 | March |
Top Boroughs & Offense Categories¶
In [18]:
print("Top Patrol Boroughs:")
display(filtered_df["Patrol Borough Name"].value_counts())
print("\nTop Offense Categories:")
display(filtered_df["Offense Category"].value_counts())
Top Patrol Boroughs:
Patrol Borough Name
PATROL BORO MAN SOUTH        181
PATROL BORO QUEENS NORTH      64
PATROL BORO MAN NORTH         44
PATROL BORO BKLYN NORTH       40
PATROL BORO BKLYN SOUTH       40
PATROL BORO BRONX             10
PATROL BORO QUEENS SOUTH      10
PATROL BORO STATEN ISLAND      5
Name: count, dtype: int64
Top Offense Categories:
Offense Category
Race/Color    394
Name: count, dtype: int64
Monthly Time Series¶
In [19]:
monthly_counts = filtered_df.resample("ME").size()
plt.figure(figsize=(12,6))
monthly_counts.plot(marker="o")
plt.title("Monthly Anti-Asian Hate Crime Incidents (NYPD)")
plt.xlabel("Month")
plt.ylabel("Incidents")
plt.grid(True)
plt.tight_layout()
plt.show()
yearly_counts = filtered_df.resample("YE").size()
plt.figure(figsize=(12,6))
yearly_counts.plot(kind="bar")
plt.title("Yearly Anti-Asian Hate Crime Incidents")
plt.xlabel("Year")
plt.ylabel("Incidents")
plt.tight_layout()
plt.show()
Seasonality Heatmap (Year × Month)¶
In [20]:
heatmap_data = filtered_df.pivot_table(
index="year",
columns="month",
values="Full Complaint ID",
aggfunc="count"
).fillna(0)
heatmap_data = heatmap_data.reindex(sorted(heatmap_data.index), axis=0)
plt.figure(figsize=(12,6))
sns.heatmap(heatmap_data, annot=True, fmt=".0f", cmap="Reds")
plt.title("Anti-Asian Crime Seasonality (Year × Month)")
plt.xlabel("Month Number")
plt.ylabel("Year")
plt.tight_layout()
plt.show()
Borough-Level Bar Plot¶
In [21]:
borough_counts = filtered_df["Patrol Borough Name"].value_counts()
plt.figure(figsize=(10,6))
sns.barplot(x=borough_counts.index, y=borough_counts.values)
plt.title("Anti-Asian Hate Crimes by Borough")
plt.xlabel("Borough")
plt.ylabel("Incidents")
plt.xticks(rotation=45, ha="right")
plt.tight_layout()
plt.show()
Arrest Analysis¶
In [22]:
filtered_df["Arrest Date"] = pd.to_datetime(filtered_df["Arrest Date"], errors="coerce")
filtered_df["arrest_made"] = filtered_df["Arrest Date"].notna()
no_arrest_count = (~filtered_df["arrest_made"]).sum()
arrest_count = filtered_df["arrest_made"].sum()
labels = ["No Arrest", "Arrest"]
values = [no_arrest_count, arrest_count]
plt.figure(figsize=(6,5))
sns.barplot(x=labels, y=values)
plt.title("Arrest vs No-Arrest in Anti-Asian Incidents")
plt.ylabel("Number of Incidents")
plt.tight_layout()
plt.show()
In this extract of the NYPD Hate Crimes dataset, the Arrest Date column is entirely null (0 of 3,872 rows), so no incident classified as ANTI-ASIAN has a recorded arrest date, and the analysis therefore treats every Anti-Asian incident as 'no arrest recorded'.¶
This may reflect a true absence of arrests or incomplete recording of arrest dates; notably, Arrest Id is populated for 1,694 rows in the full dataset, suggesting arrests do occur but their dates were not exported.¶
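A minimal sanity check of this claim, reusing the df and filtered_df built above:¶
In [ ]:
# Verify: Arrest Date is never populated, while Arrest Id often is
print("Non-null Arrest Dates (all rows):  ", df["Arrest Date"].notna().sum())
print("Non-null Arrest Dates (Anti-Asian):", filtered_df["Arrest Date"].notna().sum())
print("Non-null Arrest Ids (Anti-Asian):  ", filtered_df["Arrest Id"].notna().sum())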
Precinct Hotspots¶
In [23]:
filtered_df_precinct = filtered_df.dropna(subset=["Complaint Precinct Code"]).copy()
filtered_df_precinct["Complaint Precinct Code"] = filtered_df_precinct["Complaint Precinct Code"].astype(int)
precinct_counts = (
filtered_df_precinct
.groupby("Complaint Precinct Code")
.size()
.sort_values(ascending=False)
)
print("Top 15 precincts by Anti-Asian incident count:")
display(precinct_counts.head(15))
Top 15 precincts by Anti-Asian incident count:
Complaint Precinct Code
14     52
13     25
5      21
7      16
84     15
62     15
109    14
9      13
6      13
18     12
108    12
17     12
24     12
115     9
1       9
dtype: int64
In [24]:
top_n = 15
plt.figure(figsize=(12,6))
sns.barplot(
x=precinct_counts.head(top_n).index.astype(str),
y=precinct_counts.head(top_n).values
)
plt.title(f"Top {top_n} Precincts by Anti-Asian Hate Crime Incidents")
plt.xlabel("Complaint Precinct Code")
plt.ylabel("Number of Incidents")
plt.xticks(rotation=45, ha="right")
plt.tight_layout()
plt.show()
Loading GeoJSON¶
In [25]:
try:
gdf_precincts = gpd.read_file(PRECINCTS_GEOJSON_URL)
gdf_precincts.columns = gdf_precincts.columns.str.strip()
precinct_candidates = [c for c in gdf_precincts.columns if "precinct" in c.lower()]
if not precinct_candidates:
raise ValueError(f"No precinct-like column found. Columns are: {gdf_precincts.columns.tolist()}")
precinct_col = precinct_candidates[0]
print("Using GeoJSON column as precinct code:", precinct_col)
gdf_precincts = gdf_precincts.rename(columns={precinct_col: "Complaint Precinct Code"})
gdf_precincts["Complaint Precinct Code"] = gdf_precincts["Complaint Precinct Code"].astype(int)
precinct_counts_df = precinct_counts.rename("Total_Incidents").reset_index()
gdf = gdf_precincts.merge(precinct_counts_df,
on="Complaint Precinct Code",
how="left")
gdf["Total_Incidents"] = gdf["Total_Incidents"].fillna(0)
print("GeoDataFrame shape after merge:", gdf.shape)
except Exception as e:
print("Error loading or merging GeoJSON:", e)
gdf = None
Using GeoJSON column as precinct code: precinct
GeoDataFrame shape after merge: (78, 5)
Single Choropleth Map Of Hotspots¶
In [26]:
if gdf is not None:
fig, ax = plt.subplots(1, 1, figsize=(12, 10))
gdf.plot(
column="Total_Incidents",
cmap="Reds",
legend=True,
edgecolor="black",
linewidth=0.3,
ax=ax
)
ax.set_title("Anti-Asian Hate Crime Incidents by Precinct (Total)", fontsize=16)
ax.axis("off")
plt.tight_layout()
plt.show()
else:
print("GeoDataFrame `gdf` is None; cannot plot map.")
Which Months Are Worst Overall?¶
In [27]:
month_order = [
"January","February","March","April","May","June",
"July","August","September","October","November","December"
]
month_counts = (
filtered_df["month_name"]
.value_counts()
.reindex(month_order)
.fillna(0)
)
plt.figure(figsize=(12,6))
sns.barplot(x=month_counts.index, y=month_counts.values)
plt.title("Anti-Asian Hate Crime Incidents by Month (All Years Combined)")
plt.xlabel("Month")
plt.ylabel("Number of Incidents")
plt.xticks(rotation=45, ha="right")
plt.tight_layout()
plt.show()
Day-Of-Week Pattern¶
In [28]:
filtered_df["day_of_week"] = filtered_df.index.day_name()
dow_order = ["Monday","Tuesday","Wednesday","Thursday","Friday","Saturday","Sunday"]
dow_counts = (
filtered_df["day_of_week"]
.value_counts()
.reindex(dow_order)
.fillna(0)
)
plt.figure(figsize=(10,5))
sns.barplot(x=dow_counts.index, y=dow_counts.values)
plt.title("Anti-Asian Hate Crime Incidents by Day of Week")
plt.xlabel("Day of Week")
plt.ylabel("Number of Incidents")
plt.tight_layout()
plt.show()
Time Series Libraries¶
In [29]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import statsmodels.api as sm
from sklearn.metrics import mean_absolute_error, mean_squared_error
from prophet import Prophet
Monthly Time Series (NYC Total)¶
In [30]:
filtered_df.index = pd.to_datetime(filtered_df.index)
monthly_counts = (
filtered_df
.resample("ME")
.size()
.asfreq("ME", fill_value=0)
)
print("Monthly series length:", len(monthly_counts))
print("From:", monthly_counts.index.min(), "to:", monthly_counts.index.max())
plt.figure(figsize=(12,6))
monthly_counts.plot(marker="o")
plt.title("Monthly Anti-Asian Hate Crime Incidents (NYC total)")
plt.xlabel("Month")
plt.ylabel("Number of Incidents")
plt.grid(True)
plt.tight_layout()
plt.show()
Monthly series length: 80
From: 2019-01-31 00:00:00 to: 2025-08-31 00:00:00
Train / Test Split (Last 12 Months As Test)¶
In [31]:
test_months = 12
train = monthly_counts.iloc[:-test_months]
test = monthly_counts.iloc[-test_months:]
print("Train range:", train.index.min(), "→", train.index.max(), "(", len(train), "months )")
print("Test range:", test.index.min(), "→", test.index.max(), "(", len(test), "months )")
plt.figure(figsize=(12,6))
train.plot(label="Train")
test.plot(label="Test", marker="o")
plt.title("Train/Test Split – Monthly Anti-Asian Incidents")
plt.xlabel("Month")
plt.ylabel("Incidents")
plt.legend()
plt.grid(True)
plt.tight_layout()
plt.show()
Train range: 2019-01-31 00:00:00 → 2024-08-31 00:00:00 ( 68 months )
Test range: 2024-09-30 00:00:00 → 2025-08-31 00:00:00 ( 12 months )
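Before fitting any model, a seasonal-naive baseline (each test month forecast by its value 12 months earlier) gives a floor for the error metrics below; this baseline is an illustrative sketch, not part of the original pipeline:¶
In [ ]:
# Hypothetical seasonal-naive baseline: repeat the value from 12 months earlier
naive_forecast = monthly_counts.shift(12).iloc[-test_months:]
naive_mae = mean_absolute_error(test, naive_forecast)
naive_rmse = np.sqrt(mean_squared_error(test, naive_forecast))
print(f"Seasonal-naive – MAE: {naive_mae:.3f}, RMSE: {naive_rmse:.3f}")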
ARIMA(1,1,1) On Monthly Data¶
In [32]:
arima_order = (1, 1, 1)
arima_model = sm.tsa.ARIMA(train, order=arima_order)
arima_result = arima_model.fit()
print(arima_result.summary())
SARIMAX Results
==============================================================================
Dep. Variable: y No. Observations: 68
Model: ARIMA(1, 1, 1) Log Likelihood -209.681
Date: Wed, 03 Dec 2025 AIC 425.362
Time: 16:29:59 BIC 431.976
Sample: 01-31-2019 HQIC 427.979
- 08-31-2024
Covariance Type: opg
==============================================================================
coef std err z P>|z| [0.025 0.975]
------------------------------------------------------------------------------
ar.L1 0.6503 0.140 4.631 0.000 0.375 0.926
ma.L1 -0.9992 2.546 -0.392 0.695 -5.990 3.991
sigma2 29.4097 73.606 0.400 0.689 -114.855 173.674
===================================================================================
Ljung-Box (L1) (Q): 0.09 Jarque-Bera (JB): 802.16
Prob(Q): 0.76 Prob(JB): 0.00
Heteroskedasticity (H): 0.75 Skew: 3.30
Prob(H) (two-sided): 0.50 Kurtosis: 18.61
===================================================================================
Warnings:
[1] Covariance matrix calculated using the outer product of gradients (complex-step).
ARIMA Forecast + Metrics¶
In [33]:
arima_forecast = arima_result.forecast(steps=len(test))
arima_forecast = pd.Series(arima_forecast, index=test.index)
plt.figure(figsize=(12,6))
train.plot(label="Train")
test.plot(label="Actual Test", marker="o")
arima_forecast.plot(label="ARIMA Forecast", marker="o")
plt.title("ARIMA(1,1,1) – Monthly Anti-Asian Incidents")
plt.xlabel("Month")
plt.ylabel("Incidents")
plt.legend()
plt.grid(True)
plt.tight_layout()
plt.show()
arima_mae = mean_absolute_error(test, arima_forecast)
arima_mse = mean_squared_error(test, arima_forecast)
arima_rmse = np.sqrt(arima_mse)
print(f"ARIMA(1,1,1) – MAE: {arima_mae:.3f}")
print(f"ARIMA(1,1,1) – RMSE: {arima_rmse:.3f}")
ARIMA(1,1,1) – MAE: 2.600
ARIMA(1,1,1) – RMSE: 2.869
SARIMA With Yearly Seasonality (12-Month Period)¶
In [34]:
sarima_order = (1, 0, 1)
sarima_seasonal_order = (1, 1, 0, 12)
sarima_model = sm.tsa.statespace.SARIMAX(
train,
order=sarima_order,
seasonal_order=sarima_seasonal_order,
enforce_stationarity=False,
enforce_invertibility=False
)
sarima_result = sarima_model.fit(disp=False)
print(sarima_result.summary())
SARIMAX Results
===========================================================================================
Dep. Variable: y No. Observations: 68
Model: SARIMAX(1, 0, 1)x(1, 1, [], 12) Log Likelihood -142.987
Date: Wed, 03 Dec 2025 AIC 293.975
Time: 16:29:59 BIC 301.020
Sample: 01-31-2019 HQIC 296.573
- 08-31-2024
Covariance Type: opg
==============================================================================
coef std err z P>|z| [0.025 0.975]
------------------------------------------------------------------------------
ar.L1 0.5700 0.335 1.702 0.089 -0.086 1.226
ma.L1 0.2532 0.428 0.591 0.554 -0.586 1.093
ar.S.L12 -0.3783 0.141 -2.675 0.007 -0.655 -0.101
sigma2 45.2191 6.303 7.174 0.000 32.865 57.573
===================================================================================
Ljung-Box (L1) (Q): 0.01 Jarque-Bera (JB): 24.77
Prob(Q): 0.93 Prob(JB): 0.00
Heteroskedasticity (H): 0.09 Skew: -0.01
Prob(H) (two-sided): 0.00 Kurtosis: 6.72
===================================================================================
Warnings:
[1] Covariance matrix calculated using the outer product of gradients (complex-step).
SARIMA Forecast + Metrics¶
In [35]:
sarima_forecast = sarima_result.forecast(steps=len(test))
sarima_forecast = pd.Series(sarima_forecast, index=test.index)
plt.figure(figsize=(12,6))
train.plot(label="Train")
test.plot(label="Actual Test", marker="o")
sarima_forecast.plot(label="SARIMA Forecast", marker="o")
plt.title("SARIMA – Monthly Anti-Asian Incidents")
plt.xlabel("Month")
plt.ylabel("Incidents")
plt.legend()
plt.grid(True)
plt.tight_layout()
plt.show()
sarima_mae = mean_absolute_error(test, sarima_forecast)
sarima_mse = mean_squared_error(test, sarima_forecast)
sarima_rmse = np.sqrt(sarima_mse)
print(f"SARIMA – MAE: {sarima_mae:.3f}")
print(f"SARIMA – RMSE: {sarima_rmse:.3f}")
SARIMA – MAE: 2.075
SARIMA – RMSE: 2.521
Prophet On Monthly Data¶
In [36]:
prophet_df = monthly_counts.reset_index()
prophet_df.columns = ["ds", "y"]
prophet_train = prophet_df.iloc[:-test_months]
prophet_test = prophet_df.iloc[-test_months:]
In [37]:
m = Prophet(
weekly_seasonality=False,
yearly_seasonality=True,
daily_seasonality=False
)
m.fit(prophet_train)
16:30:00 - cmdstanpy - INFO - Chain [1] start processing
16:30:00 - cmdstanpy - INFO - Chain [1] done processing
Out[37]:
<prophet.forecaster.Prophet at 0x185189882f0>
In [38]:
future = m.make_future_dataframe(periods=test_months, freq="ME")
forecast = m.predict(future)
prophet_forecast = forecast.set_index("ds")["yhat"].iloc[-test_months:]
prophet_forecast.index = test.index # align exactly
plt.figure(figsize=(12,6))
train.plot(label="Train")
test.plot(label="Actual Test", marker="o")
prophet_forecast.plot(label="Prophet Forecast", marker="o")
plt.title("Prophet – Monthly Anti-Asian Incidents")
plt.xlabel("Month")
plt.ylabel("Incidents")
plt.legend()
plt.grid(True)
plt.tight_layout()
plt.show()
# Metrics
prophet_mae = mean_absolute_error(test, prophet_forecast)
prophet_mse = mean_squared_error(test, prophet_forecast)
prophet_rmse = np.sqrt(prophet_mse)
print(f"Prophet – MAE: {prophet_mae:.3f}")
print(f"Prophet – RMSE: {prophet_rmse:.3f}")
Prophet – MAE: 4.798
Prophet – RMSE: 5.761
Comparing All Three Models¶
In [39]:
results = pd.DataFrame({
"Model": ["ARIMA(1,1,1)", "SARIMA(1,0,1)(1,1,0,12)", "Prophet"],
"MAE": [arima_mae, sarima_mae, prophet_mae],
"RMSE": [arima_rmse, sarima_rmse, prophet_rmse]
})
display(results)
| | Model | MAE | RMSE |
|---|---|---|---|
| 0 | ARIMA(1,1,1) | 2.600102 | 2.868501 |
| 1 | SARIMA(1,0,1)(1,1,0,12) | 2.074892 | 2.521463 |
| 2 | Prophet | 4.798075 | 5.760596 |
Hyperparameter Tuning¶
In [40]:
import numpy as np
import itertools
from sklearn.metrics import mean_absolute_error, mean_squared_error
from pmdarima import auto_arima
from prophet import Prophet
Auto-ARIMA (Non-Seasonal ARIMA Tuning)¶
In [41]:
auto_arima_model = auto_arima(
train,
seasonal=False,
stepwise=True,
trace=True,
suppress_warnings=True,
max_p=5,
max_q=5,
max_d=2,
error_action="ignore"
)
print(auto_arima_model.summary())
best_arima_order = auto_arima_model.order
print("\nBest ARIMA order found:", best_arima_order)
Performing stepwise search to minimize aic
ARIMA(2,0,2)(0,0,0)[0] : AIC=434.308, Time=0.04 sec
ARIMA(0,0,0)(0,0,0)[0] : AIC=489.924, Time=0.01 sec
ARIMA(1,0,0)(0,0,0)[0] : AIC=431.960, Time=0.01 sec
ARIMA(0,0,1)(0,0,0)[0] : AIC=453.197, Time=0.02 sec
ARIMA(2,0,0)(0,0,0)[0] : AIC=433.945, Time=0.02 sec
ARIMA(1,0,1)(0,0,0)[0] : AIC=433.938, Time=0.02 sec
ARIMA(2,0,1)(0,0,0)[0] : AIC=435.775, Time=0.03 sec
ARIMA(1,0,0)(0,0,0)[0] intercept : AIC=428.336, Time=0.02 sec
ARIMA(0,0,0)(0,0,0)[0] intercept : AIC=460.685, Time=0.01 sec
ARIMA(2,0,0)(0,0,0)[0] intercept : AIC=429.692, Time=0.03 sec
ARIMA(1,0,1)(0,0,0)[0] intercept : AIC=429.606, Time=0.03 sec
ARIMA(0,0,1)(0,0,0)[0] intercept : AIC=435.047, Time=0.02 sec
ARIMA(2,0,1)(0,0,0)[0] intercept : AIC=431.577, Time=0.08 sec
Best model: ARIMA(1,0,0)(0,0,0)[0] intercept
Total fit time: 0.352 seconds
SARIMAX Results
==============================================================================
Dep. Variable: y No. Observations: 68
Model: SARIMAX(1, 0, 0) Log Likelihood -211.168
Date: Wed, 03 Dec 2025 AIC 428.336
Time: 16:30:01 BIC 434.994
Sample: 01-31-2019 HQIC 430.974
- 08-31-2024
Covariance Type: opg
==============================================================================
coef std err z P>|z| [0.025 0.975]
------------------------------------------------------------------------------
intercept 1.9246 1.434 1.342 0.180 -0.886 4.736
ar.L1 0.6265 0.128 4.908 0.000 0.376 0.877
sigma2 28.9507 2.824 10.251 0.000 23.415 34.486
===================================================================================
Ljung-Box (L1) (Q): 0.33 Jarque-Bera (JB): 783.14
Prob(Q): 0.57 Prob(JB): 0.00
Heteroskedasticity (H): 0.63 Skew: 3.31
Prob(H) (two-sided): 0.27 Kurtosis: 18.25
===================================================================================
Warnings:
[1] Covariance matrix calculated using the outer product of gradients (complex-step).
Best ARIMA order found: (1, 0, 0)
Fitting Tuned ARIMA And Evaluating¶
In [42]:
tuned_arima_model = sm.tsa.ARIMA(train, order=best_arima_order)
tuned_arima_result = tuned_arima_model.fit()
tuned_arima_forecast = tuned_arima_result.forecast(steps=len(test))
tuned_arima_forecast = pd.Series(tuned_arima_forecast, index=test.index)
plt.figure(figsize=(12,6))
train.plot(label="Train")
test.plot(label="Actual Test", marker="o")
tuned_arima_forecast.plot(label=f"ARIMA{best_arima_order} Forecast", marker="o")
plt.title(f"Tuned ARIMA{best_arima_order} – Monthly Anti-Asian Incidents")
plt.xlabel("Month")
plt.ylabel("Incidents")
plt.legend()
plt.grid(True)
plt.tight_layout()
plt.show()
tuned_arima_mae = mean_absolute_error(test, tuned_arima_forecast)
tuned_arima_mse = mean_squared_error(test, tuned_arima_forecast)
tuned_arima_rmse = np.sqrt(tuned_arima_mse)
print(f"ARIMA(1,1,1) – MAE: {tuned_arima_mae:.3f}")
print(f"ARIMA(1,1,1) – RMSE: {tuned_arima_rmse:.3f}")
ARIMA(1,1,1) – MAE: 2.636 ARIMA(1,1,1) – RMSE: 2.896
Auto-SARIMA (Seasonal Tuning)¶
In [43]:
auto_sarima_model = auto_arima(
train,
seasonal=True,
m=12,
stepwise=True,
trace=True,
suppress_warnings=True,
max_p=3, max_q=3, max_d=2,
max_P=2, max_Q=2, max_D=1,
error_action="ignore"
)
print(auto_sarima_model.summary())
best_sarima_order = auto_sarima_model.order
best_sarima_seasonal = auto_sarima_model.seasonal_order
print("\nBest SARIMA order:", best_sarima_order)
print("Best SARIMA seasonal order:", best_sarima_seasonal)
Performing stepwise search to minimize aic
ARIMA(2,0,2)(1,0,1)[12] intercept : AIC=432.103, Time=0.39 sec
ARIMA(0,0,0)(0,0,0)[12] intercept : AIC=460.685, Time=0.01 sec
ARIMA(1,0,0)(1,0,0)[12] intercept : AIC=425.064, Time=0.06 sec
ARIMA(0,0,1)(0,0,1)[12] intercept : AIC=431.125, Time=0.05 sec
ARIMA(0,0,0)(0,0,0)[12] : AIC=489.924, Time=0.01 sec
ARIMA(1,0,0)(0,0,0)[12] intercept : AIC=428.336, Time=0.02 sec
ARIMA(1,0,0)(2,0,0)[12] intercept : AIC=426.987, Time=0.14 sec
ARIMA(1,0,0)(1,0,1)[12] intercept : AIC=inf, Time=0.23 sec
ARIMA(1,0,0)(0,0,1)[12] intercept : AIC=425.542, Time=0.06 sec
ARIMA(1,0,0)(2,0,1)[12] intercept : AIC=inf, Time=0.59 sec
ARIMA(0,0,0)(1,0,0)[12] intercept : AIC=457.510, Time=0.04 sec
ARIMA(2,0,0)(1,0,0)[12] intercept : AIC=426.422, Time=0.07 sec
ARIMA(1,0,1)(1,0,0)[12] intercept : AIC=426.330, Time=0.07 sec
ARIMA(0,0,1)(1,0,0)[12] intercept : AIC=431.626, Time=0.06 sec
ARIMA(2,0,1)(1,0,0)[12] intercept : AIC=428.290, Time=0.14 sec
ARIMA(1,0,0)(1,0,0)[12] : AIC=426.741, Time=0.03 sec
Best model: ARIMA(1,0,0)(1,0,0)[12] intercept
Total fit time: 1.971 seconds
SARIMAX Results
==========================================================================================
Dep. Variable: y No. Observations: 68
Model: SARIMAX(1, 0, 0)x(1, 0, 0, 12) Log Likelihood -208.532
Date: Wed, 03 Dec 2025 AIC 425.064
Time: 16:30:03 BIC 433.942
Sample: 01-31-2019 HQIC 428.582
- 08-31-2024
Covariance Type: opg
==============================================================================
coef std err z P>|z| [0.025 0.975]
------------------------------------------------------------------------------
intercept 1.3198 1.171 1.127 0.260 -0.975 3.614
ar.L1 0.6262 0.106 5.921 0.000 0.419 0.834
ar.S.L12 0.2605 0.129 2.021 0.043 0.008 0.513
sigma2 26.4612 4.075 6.493 0.000 18.474 34.448
===================================================================================
Ljung-Box (L1) (Q): 0.32 Jarque-Bera (JB): 462.90
Prob(Q): 0.57 Prob(JB): 0.00
Heteroskedasticity (H): 0.74 Skew: 2.80
Prob(H) (two-sided): 0.48 Kurtosis: 14.49
===================================================================================
Warnings:
[1] Covariance matrix calculated using the outer product of gradients (complex-step).
Best SARIMA order: (1, 0, 0)
Best SARIMA seasonal order: (1, 0, 0, 12)
Fitting Tuned SARIMA And Evaluating¶
In [44]:
tuned_sarima_model = sm.tsa.statespace.SARIMAX(
train,
order=best_sarima_order,
seasonal_order=best_sarima_seasonal,
enforce_stationarity=False,
enforce_invertibility=False
)
tuned_sarima_result = tuned_sarima_model.fit(disp=False)
tuned_sarima_forecast = tuned_sarima_result.forecast(steps=len(test))
tuned_sarima_forecast = pd.Series(tuned_sarima_forecast, index=test.index)
plt.figure(figsize=(12,6))
train.plot(label="Train")
test.plot(label="Actual Test", marker="o")
tuned_sarima_forecast.plot(label=f"SARIMA{best_sarima_order}{best_sarima_seasonal} Forecast", marker="o")
plt.title("Tuned SARIMA – Monthly Anti-Asian Incidents")
plt.xlabel("Month")
plt.ylabel("Incidents")
plt.legend()
plt.grid(True)
plt.tight_layout()
plt.show()
tuned_sarima_mae = mean_absolute_error(test, tuned_sarima_forecast)
tuned_sarima_mse = mean_squared_error(test, tuned_sarima_forecast)
tuned_sarima_rmse = np.sqrt(tuned_sarima_mse)
print(f"SARIMA(1,1,1) – MAE: {tuned_sarima_mae:.3f}")
print(f"SARIMA(1,1,1) – RMSE: {tuned_sarima_rmse:.3f}")
SARIMA(1,1,1) – MAE: 1.464 SARIMA(1,1,1) – RMSE: 1.800
Prophet Tuning Via Small Grid Search¶
In [45]:
prophet_df = monthly_counts.reset_index()
prophet_df.columns = ["ds", "y"]
prophet_train = prophet_df.iloc[:-test_months]
prophet_test = prophet_df.iloc[-test_months:]
param_grid = {
"changepoint_prior_scale": [0.01, 0.05, 0.1, 0.5],
"seasonality_prior_scale": [1.0, 5.0, 10.0],
"seasonality_mode": ["additive", "multiplicative"]
}
best_params = None
best_prophet_rmse = np.inf
for cp in param_grid["changepoint_prior_scale"]:
for sp in param_grid["seasonality_prior_scale"]:
for smode in param_grid["seasonality_mode"]:
m = Prophet(
changepoint_prior_scale=cp,
seasonality_prior_scale=sp,
seasonality_mode=smode,
weekly_seasonality=False,
yearly_seasonality=True,
daily_seasonality=False
)
m.fit(prophet_train)
future = m.make_future_dataframe(periods=test_months, freq="ME")
forecast = m.predict(future)
pred = forecast.set_index("ds")["yhat"].iloc[-test_months:]
pred.index = test.index
mse = mean_squared_error(test, pred)
rmse = np.sqrt(mse)
if rmse < best_prophet_rmse:
best_prophet_rmse = rmse
best_params = (cp, sp, smode)
print("Best Prophet parameters (cp, sp, mode):", best_params)
print("Best Prophet RMSE:", best_prophet_rmse)
Best Prophet parameters (cp, sp, mode): (0.5, 1.0, 'multiplicative')
Best Prophet RMSE: 2.7421939919440614
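One caveat: this grid search selects hyperparameters by RMSE on the same 12 test months used for the final comparison, which flatters the tuned models. A minimal leakage-free sketch instead validates on the last 12 months of the training window (the split size is an illustrative assumption) and leaves the test set untouched until the final comparison:¶
In [ ]:
# Hypothetical leakage-free tuning: validate on the tail of the training data only
val_months = 12
inner_train = prophet_train.iloc[:-val_months]
inner_val = prophet_train.iloc[-val_months:]
best_val_rmse, best_val_params = np.inf, None
for cp in param_grid["changepoint_prior_scale"]:
    for sp in param_grid["seasonality_prior_scale"]:
        for smode in param_grid["seasonality_mode"]:
            m_val = Prophet(
                changepoint_prior_scale=cp,
                seasonality_prior_scale=sp,
                seasonality_mode=smode,
                weekly_seasonality=False,
                yearly_seasonality=True,
                daily_seasonality=False
            )
            m_val.fit(inner_train)
            val_future = m_val.make_future_dataframe(periods=val_months, freq="ME")
            val_pred = m_val.predict(val_future)["yhat"].iloc[-val_months:]
            rmse = np.sqrt(mean_squared_error(inner_val["y"], val_pred))
            if rmse < best_val_rmse:
                best_val_rmse, best_val_params = rmse, (cp, sp, smode)
print("Validation-selected params (cp, sp, mode):", best_val_params)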
Fitting Prophet With Best Params And Evaluating¶
In [46]:
best_cp, best_sp, best_mode = best_params
best_m = Prophet(
changepoint_prior_scale=best_cp,
seasonality_prior_scale=best_sp,
seasonality_mode=best_mode,
weekly_seasonality=False,
yearly_seasonality=True,
daily_seasonality=False
)
best_m.fit(prophet_train)
future = best_m.make_future_dataframe(periods=test_months, freq="ME")  # "ME" avoids the deprecated "M" alias
forecast = best_m.predict(future)
best_prophet_forecast = forecast.set_index("ds")["yhat"].iloc[-test_months:]
best_prophet_forecast.index = test.index
plt.figure(figsize=(12,6))
train.plot(label="Train")
test.plot(label="Actual Test", marker="o")
best_prophet_forecast.plot(label="Tuned Prophet Forecast", marker="o")
plt.title(f"Tuned Prophet – Monthly Anti-Asian Incidents\n(cp={best_cp}, sp={best_sp}, mode={best_mode})")
plt.xlabel("Month")
plt.ylabel("Incidents")
plt.legend()
plt.grid(True)
plt.tight_layout()
plt.show()
tuned_prophet_mae = mean_absolute_error(test, best_prophet_forecast)
tuned_prophet_mse = mean_squared_error(test, best_prophet_forecast)
tuned_prophet_rmse = np.sqrt(tuned_prophet_mse)
print(f"Tuned Prophet – MAE: {tuned_prophet_mae:.3f}")
print(f"Tuned Prophet – RMSE: {tuned_prophet_rmse:.3f}")
16:30:13 - cmdstanpy - INFO - Chain [1] start processing
16:30:14 - cmdstanpy - INFO - Chain [1] done processing
Tuned Prophet – MAE: 2.315
Tuned Prophet – RMSE: 2.742
Comparing Tuned Models¶
In [47]:
tuned_results = pd.DataFrame({
"Model": [
f"ARIMA{best_arima_order}",
f"SARIMA{best_sarima_order}{best_sarima_seasonal}",
f"Prophet(cp={best_cp}, sp={best_sp}, mode={best_mode})"
],
"MAE": [tuned_arima_mae, tuned_sarima_mae, tuned_prophet_mae],
"RMSE": [tuned_arima_rmse, tuned_sarima_rmse, tuned_prophet_rmse]
})
display(tuned_results)
| | Model | MAE | RMSE |
|---|---|---|---|
| 0 | ARIMA(1, 0, 0) | 2.635961 | 2.895933 |
| 1 | SARIMA(1, 0, 0)(1, 0, 0, 12) | 1.464307 | 1.800098 |
| 2 | Prophet(cp=0.5, sp=1.0, mode=multiplicative) | 2.315017 | 2.742194 |
Combining All Model Forecasts¶
In [48]:
comparison_df = pd.DataFrame({
"Actual": test,
"ARIMA Untuned": arima_forecast,
"ARIMA Tuned": tuned_arima_forecast,
"SARIMA Untuned": sarima_forecast,
"SARIMA Tuned": tuned_sarima_forecast,
"Prophet Untuned": prophet_forecast,
"Prophet Tuned": best_prophet_forecast
})
display(comparison_df.head())
| | Actual | ARIMA Untuned | ARIMA Tuned | SARIMA Untuned | SARIMA Tuned | Prophet Untuned | Prophet Tuned |
|---|---|---|---|---|---|---|---|
| Record Create Date | |||||||
| 2024-09-30 | 7 | 3.747455 | 3.804200 | 2.874223 | 3.741317 | 4.514449 | 0.734242 |
| 2024-10-31 | 3 | 4.233510 | 4.308042 | 0.615527 | 1.679759 | 4.521154 | 0.475653 |
| 2024-11-30 | 3 | 4.549583 | 4.623706 | 1.450476 | 1.143071 | 5.320337 | 0.535817 |
| 2024-12-31 | 3 | 4.755118 | 4.821475 | 2.092783 | 1.383703 | 5.497119 | 0.435934 |
| 2025-01-31 | 1 | 4.888774 | 4.945379 | 2.076868 | 0.934048 | 2.735826 | 0.219521 |
Plotting All Models Versus Actual (Tuned + Untuned)¶
In [49]:
plt.figure(figsize=(15,8))
plt.plot(test.index, test.values, label="Actual", color="black", linewidth=3)
plt.plot(test.index, arima_forecast, label="ARIMA Untuned", linestyle="--")
plt.plot(test.index, sarima_forecast, label="SARIMA Untuned", linestyle="--")
plt.plot(test.index, prophet_forecast, label="Prophet Untuned", linestyle="--")
plt.plot(test.index, tuned_arima_forecast, label="ARIMA Tuned", marker="o")
plt.plot(test.index, tuned_sarima_forecast, label="SARIMA Tuned", marker="o")
plt.plot(test.index, best_prophet_forecast, label="Prophet Tuned", marker="o")
plt.title("Comparison of Tuned and Untuned Forecasting Models\nMonthly Anti-Asian Hate Crime Incidents", fontsize=16)
plt.xlabel("Month")
plt.ylabel("Incident Count")
plt.grid(True)
plt.legend()
plt.tight_layout()
plt.show()
Performance Comparison Table (Tuned Versus Untuned)¶
In [50]:
comparison_metrics = pd.DataFrame({
"Model": [
"ARIMA Untuned", "ARIMA Tuned",
"SARIMA Untuned", "SARIMA Tuned",
"Prophet Untuned", "Prophet Tuned"
],
"MAE": [
arima_mae, tuned_arima_mae,
sarima_mae, tuned_sarima_mae,
prophet_mae, tuned_prophet_mae
],
"RMSE": [
arima_rmse, tuned_arima_rmse,
sarima_rmse, tuned_sarima_rmse,
prophet_rmse, tuned_prophet_rmse
]
})
display(comparison_metrics)
| | Model | MAE | RMSE |
|---|---|---|---|
| 0 | ARIMA Untuned | 2.600102 | 2.868501 |
| 1 | ARIMA Tuned | 2.635961 | 2.895933 |
| 2 | SARIMA Untuned | 2.074892 | 2.521463 |
| 3 | SARIMA Tuned | 1.464307 | 1.800098 |
| 4 | Prophet Untuned | 4.798075 | 5.760596 |
| 5 | Prophet Tuned | 2.315017 | 2.742194 |
Bar Chart Comparing RMSE (Tuned Versus Untuned)¶
In [51]:
plt.figure(figsize=(12,6))
plt.bar(comparison_metrics["Model"], comparison_metrics["RMSE"], color=[
"#ff9999", "#ff4d4d",
"#99ccff", "#1a75ff",
"#99ff99", "#33cc33"
])
plt.title("RMSE Comparison: Tuned vs Untuned Forecasting Models", fontsize=15)
plt.ylabel("RMSE (Lower is better)")
plt.xticks(rotation=45, ha="right")
plt.grid(axis="y", linestyle="--", alpha=0.5)
plt.tight_layout()
plt.show()
Relative Model Accuracy¶
In [52]:
baseline_name = "SARIMA Tuned"
baseline_rmse = comparison_metrics.loc[
comparison_metrics["Model"] == baseline_name, "RMSE"
].iloc[0]
comparison_metrics["Relative_Accuracy_Value"] = (
baseline_rmse / comparison_metrics["RMSE"] * 100
)
def format_acc(x):
x_round = round(x)
if x_round == 100:
return "100%"
else:
return f"≈ {x_round}%"
comparison_metrics["Relative Accuracy"] = comparison_metrics[
"Relative_Accuracy_Value"
].apply(format_acc)
comparison_metrics_sorted = comparison_metrics.sort_values("RMSE").reset_index(drop=True)
display(comparison_metrics_sorted[["Model", "MAE", "RMSE", "Relative Accuracy"]])
| | Model | MAE | RMSE | Relative Accuracy |
|---|---|---|---|---|
| 0 | SARIMA Tuned | 1.464307 | 1.800098 | 100% |
| 1 | SARIMA Untuned | 2.074892 | 2.521463 | ≈ 71% |
| 2 | Prophet Tuned | 2.315017 | 2.742194 | ≈ 66% |
| 3 | ARIMA Untuned | 2.600102 | 2.868501 | ≈ 63% |
| 4 | ARIMA Tuned | 2.635961 | 2.895933 | ≈ 62% |
| 5 | Prophet Untuned | 4.798075 | 5.760596 | ≈ 31% |
Bar Chart Of Relative Accuracy¶
In [53]:
plt.figure(figsize=(12, 6))
plt.bar(
comparison_metrics_sorted["Model"],
comparison_metrics_sorted["Relative_Accuracy_Value"],
)
plt.title(
"Relative Accuracy (Higher = Better Performance)\nBaseline: SARIMA Tuned = 100% Accuracy",
fontsize=14,
)
plt.ylabel("Relative Accuracy (%)")
plt.xticks(rotation=45, ha="right")
plt.grid(axis="y", linestyle="--", alpha=0.5)
plt.tight_layout()
plt.show()
Based on the evaluation metrics, the tuned SARIMA model achieved the lowest RMSE (1.80), making it the most accurate of the six tuned and untuned variants. While the Prophet and ARIMA models performed reasonably well, SARIMA showed better stability and predictive accuracy on this low-count monthly crime series, so the final short-term forecasts are generated with the tuned SARIMA configuration.¶
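A one-liner makes that selection explicit from the comparison_metrics table above, rather than reading it off by eye:¶
In [ ]:
# Programmatically pick the lowest-RMSE model from the comparison table
best_row = comparison_metrics.loc[comparison_metrics["RMSE"].idxmin()]
print(f"Selected model: {best_row['Model']} (RMSE = {best_row['RMSE']:.3f})")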
Refitting Tuned SARIMA On Full Data And Forecasting 6 Months¶
In [54]:
full_sarima_model = sm.tsa.statespace.SARIMAX(
monthly_counts,
order=best_sarima_order,
seasonal_order=best_sarima_seasonal,
enforce_stationarity=False,
enforce_invertibility=False
)
full_sarima_result = full_sarima_model.fit(disp=False)
n_steps = 6
sarima_forecast_obj = full_sarima_result.get_forecast(steps=n_steps)
sarima_future_mean = sarima_forecast_obj.predicted_mean
sarima_future_ci = sarima_forecast_obj.conf_int()
future_index = pd.date_range(
start=monthly_counts.index[-1] + pd.offsets.MonthEnd(1),
periods=n_steps,
freq="M"
)
sarima_future_mean.index = future_index
sarima_future_ci.index = future_index
print("SARIMA forecast for the next 6 months:")
display(
pd.DataFrame({
"Forecast": sarima_future_mean,
"Lower CI": sarima_future_ci.iloc[:, 0],
"Upper CI": sarima_future_ci.iloc[:, 1],
})
)
SARIMA forecast for the next 6 months:
| | Forecast | Lower CI | Upper CI |
|---|---|---|---|
| 2025-09-30 | 2.488614 | -7.948081 | 12.925308 |
| 2025-10-31 | 1.047929 | -11.723726 | 13.819584 |
| 2025-11-30 | 1.061911 | -12.724868 | 14.848690 |
| 2025-12-31 | 1.071773 | -13.193152 | 15.336698 |
| 2026-01-31 | 0.348475 | -14.148456 | 14.845407 |
| 2026-02-28 | 0.353382 | -14.257602 | 14.964366 |
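The normal-approximation intervals extend below zero, which is impossible for incident counts. A minimal cosmetic fix is to floor the lower bound at zero for display; this changes the presentation only, not the fitted model:¶
In [ ]:
# Counts cannot be negative: floor the lower confidence bound at 0 for display
sarima_future_ci_clipped = sarima_future_ci.copy()
sarima_future_ci_clipped.iloc[:, 0] = sarima_future_ci_clipped.iloc[:, 0].clip(lower=0)
display(sarima_future_ci_clipped)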
Plotting History + 6-Month SARIMA Forecast¶
In [55]:
history_window = 24
plt.figure(figsize=(12,6))
monthly_counts.tail(history_window).plot(label="History", marker="o")
sarima_future_mean.plot(label="SARIMA Forecast", marker="o")
plt.fill_between(
sarima_future_mean.index,
sarima_future_ci.iloc[:, 0],
sarima_future_ci.iloc[:, 1],
alpha=0.2,
label="95% confidence interval"
)
plt.title(f"Tuned SARIMA{best_sarima_order}{best_sarima_seasonal} – 6-Month Forecast\nMonthly Anti-Asian Hate Crime Incidents")
plt.xlabel("Month")
plt.ylabel("Incident Count")
plt.grid(True)
plt.legend()
plt.tight_layout()
plt.show()
Computing Historical Precinct Totals And Shares¶
In [56]:
filtered_df_precinct = filtered_df.dropna(subset=["Complaint Precinct Code"]).copy()
filtered_df_precinct["Complaint Precinct Code"] = filtered_df_precinct["Complaint Precinct Code"].astype(int)
precinct_counts = (
filtered_df_precinct
.groupby("Complaint Precinct Code")
.size()
.rename("historical_total")
.sort_values(ascending=False)
)
print("Top 10 precincts by historical Anti-Asian incidents:")
display(precinct_counts.head(10))
total_incidents_citywide = precinct_counts.sum()
precinct_shares = (precinct_counts / total_incidents_citywide).rename("share")
print("\nCheck: sum of shares =", precinct_shares.sum())
Top 10 precincts by historical Anti-Asian incidents:
Complaint Precinct Code
14     52
13     25
5      21
7      16
84     15
62     15
109    14
9      13
6      13
18     12
Name: historical_total, dtype: int64
Check: sum of shares = 0.9999999999999998
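The shares sum to 1 up to floating-point rounding; a quick assertion can guard the allocation step below:¶
In [ ]:
# Guard: the proportional allocation below assumes the shares form a valid distribution
assert np.isclose(precinct_shares.sum(), 1.0)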
Using SARIMA Forecast To Get Expected 6-Month Total¶
In [57]:
forecast_6m_total = sarima_future_mean.sum()
print("Total forecasted Anti-Asian incidents (next 6 months, citywide):", forecast_6m_total)
Total forecasted Anti-Asian incidents (next 6 months, citywide): 6.37208442247705
Allocating Forecast To Precincts Using Historical Shares¶
In [58]:
precinct_forecast_6m = (precinct_shares * forecast_6m_total).rename("forecast_6m")
precinct_stats = pd.concat([precinct_counts, precinct_shares, precinct_forecast_6m], axis=1)
display(precinct_stats.head(10))
| | historical_total | share | forecast_6m |
|---|---|---|---|
| Complaint Precinct Code | |||
| 14 | 52 | 0.131980 | 0.840986 |
| 13 | 25 | 0.063452 | 0.404320 |
| 5 | 21 | 0.053299 | 0.339629 |
| 7 | 16 | 0.040609 | 0.258765 |
| 84 | 15 | 0.038071 | 0.242592 |
| 62 | 15 | 0.038071 | 0.242592 |
| 109 | 14 | 0.035533 | 0.226419 |
| 9 | 13 | 0.032995 | 0.210246 |
| 6 | 13 | 0.032995 | 0.210246 |
| 18 | 12 | 0.030457 | 0.194074 |
Loading Precinct GeoJSON And Merging Stats¶
In [59]:
gdf_precincts = gpd.read_file(PRECINCTS_GEOJSON_URL)
gdf_precincts.columns = gdf_precincts.columns.str.strip()
precinct_candidates = [c for c in gdf_precincts.columns if "precinct" in c.lower()]
print("Precinct columns in GeoJSON:", precinct_candidates)
precinct_col = precinct_candidates[0]
gdf_precincts = gdf_precincts.rename(columns={precinct_col: "Complaint Precinct Code"})
gdf_precincts["Complaint Precinct Code"] = gdf_precincts["Complaint Precinct Code"].astype(int)
gdf_stats = gdf_precincts.merge(
precinct_stats.reset_index(),
on="Complaint Precinct Code",
how="left"
)
for col in ["historical_total", "share", "forecast_6m"]:
gdf_stats[col] = gdf_stats[col].fillna(0)
print("GeoDataFrame with stats:", gdf_stats.shape)
Precinct columns in GeoJSON: ['precinct'] GeoDataFrame with stats: (78, 7)
Choropleth: Historical Hotspots¶
In [60]:
fig, ax = plt.subplots(1, 1, figsize=(10, 10))
gdf_stats.plot(
column="historical_total",
cmap="Reds",
legend=True,
edgecolor="black",
linewidth=0.3,
ax=ax
)
ax.set_title("Historical Anti-Asian Hate Crime Incidents by Precinct (Total)", fontsize=15)
ax.axis("off")
plt.tight_layout()
plt.show()
Choropleth: Forecasted 6-Month Expected Incidents¶
In [61]:
fig, ax = plt.subplots(1, 1, figsize=(10, 10))
gdf_stats.plot(
column="forecast_6m",
cmap="Blues",
legend=True,
edgecolor="black",
linewidth=0.3,
ax=ax
)
ax.set_title("Expected Anti-Asian Incidents in Next 6 Months (by Precinct)\nAllocated Using Tuned SARIMA Forecast + Historical Shares", fontsize=14)
ax.axis("off")
plt.tight_layout()
plt.show()