Importing Libraries¶

In [ ]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.preprocessing import LabelEncoder
import warnings
warnings.filterwarnings('ignore')
%matplotlib inline
In [ ]:
from scipy.stats import zscore
In [ ]:
from google.colab import drive
drive.mount('/content/drive', force_remount=False)

file_path = '/content/drive/MyDrive/EV_Population_WA_Data.csv'

df = pd.read_csv(file_path)

df.head()
Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).
Out[ ]:
VIN (1-10) County City State Postal Code Model Year Make Model Electric Vehicle Type Clean Alternative Fuel Vehicle (CAFV) Eligibility Electric Range Base MSRP Legislative District DOL Vehicle ID Vehicle Location Electric Utility 2020 Census Tract
0 5YJ3E1EB0J Thurston Olympia WA 98512.0 2018 TESLA MODEL 3 Battery Electric Vehicle (BEV) Clean Alternative Fuel Vehicle Eligible 215 0 35.0 104823078 POINT (-122.957046 46.991391) PUGET SOUND ENERGY INC 5.306701e+10
1 WA1AAAGE9M Kitsap Port Orchard WA 98367.0 2021 AUDI E-TRON Battery Electric Vehicle (BEV) Clean Alternative Fuel Vehicle Eligible 222 0 35.0 156660507 POINT (-122.6530052 47.4739066) PUGET SOUND ENERGY INC 5.303509e+10
2 5YJ3E1EA2J Yakima Yakima WA 98902.0 2018 TESLA MODEL 3 Battery Electric Vehicle (BEV) Clean Alternative Fuel Vehicle Eligible 215 0 14.0 269374108 POINT (-120.530331 46.59534) PACIFICORP 5.307700e+10
3 5YJ3E1EA4N Yakima Yakima WA 98902.0 2022 TESLA MODEL 3 Battery Electric Vehicle (BEV) Eligibility unknown as battery range has not b... 0 0 15.0 213383894 POINT (-120.530331 46.59534) PACIFICORP 5.307700e+10
4 7SAYGAEE2P Snohomish Bothell WA 98012.0 2023 TESLA MODEL Y Battery Electric Vehicle (BEV) Eligibility unknown as battery range has not b... 0 0 1.0 229496046 POINT (-122.206146 47.839957) PUGET SOUND ENERGY INC 5.306105e+10

Data Inspection¶

In [ ]:
df.tail()
Out[ ]:
VIN (1-10) County City State Postal Code Model Year Make Model Electric Vehicle Type Clean Alternative Fuel Vehicle (CAFV) Eligibility Electric Range Base MSRP Legislative District DOL Vehicle ID Vehicle Location Electric Utility 2020 Census Tract
200043 JTMAB3FVXR Snohomish Snohomish WA 98290.0 2024 TOYOTA RAV4 PRIME Plug-in Hybrid Electric Vehicle (PHEV) Clean Alternative Fuel Vehicle Eligible 42 0 44.0 262809249 POINT (-122.0483457 47.9435765) PUGET SOUND ENERGY INC 5.306105e+10
200044 7FCTGAAA7P Pierce Orting WA 98360.0 2023 RIVIAN R1T Battery Electric Vehicle (BEV) Eligibility unknown as battery range has not b... 0 0 2.0 252195450 POINT (-122.197791 47.0948565) PUGET SOUND ENERGY INC||CITY OF TACOMA - (WA) 5.305307e+10
200045 1V2GNPE87P Spokane Spokane WA 99201.0 2023 VOLKSWAGEN ID.4 Battery Electric Vehicle (BEV) Eligibility unknown as battery range has not b... 0 0 3.0 227314790 POINT (-117.428902 47.658268) MODERN ELECTRIC WATER COMPANY 5.306300e+10
200046 1G1RD6E42E Snohomish Mountlake Terrace WA 98043.0 2014 CHEVROLET VOLT Plug-in Hybrid Electric Vehicle (PHEV) Clean Alternative Fuel Vehicle Eligible 38 0 32.0 170747377 POINT (-122.306706 47.792043) PUGET SOUND ENERGY INC 5.306105e+10
200047 5YJ3E1EAXP Kitsap Bremerton WA 98311.0 2023 TESLA MODEL 3 Battery Electric Vehicle (BEV) Eligibility unknown as battery range has not b... 0 0 23.0 224663134 POINT (-122.636245 47.62806) PUGET SOUND ENERGY INC 5.303509e+10
In [ ]:
df.info()
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 200048 entries, 0 to 200047
Data columns (total 17 columns):
 #   Column                                             Non-Null Count   Dtype  
---  ------                                             --------------   -----  
 0   VIN (1-10)                                         200048 non-null  object 
 1   County                                             200044 non-null  object 
 2   City                                               200044 non-null  object 
 3   State                                              200048 non-null  object 
 4   Postal Code                                        200044 non-null  float64
 5   Model Year                                         200048 non-null  int64  
 6   Make                                               200048 non-null  object 
 7   Model                                              200048 non-null  object 
 8   Electric Vehicle Type                              200048 non-null  object 
 9   Clean Alternative Fuel Vehicle (CAFV) Eligibility  200048 non-null  object 
 10  Electric Range                                     200048 non-null  int64  
 11  Base MSRP                                          200048 non-null  int64  
 12  Legislative District                               199606 non-null  float64
 13  DOL Vehicle ID                                     200048 non-null  int64  
 14  Vehicle Location                                   200040 non-null  object 
 15  Electric Utility                                   200044 non-null  object 
 16  2020 Census Tract                                  200044 non-null  float64
dtypes: float64(3), int64(4), object(10)
memory usage: 25.9+ MB
In [ ]:
df.describe()
Out[ ]:
Postal Code Model Year Electric Range Base MSRP Legislative District DOL Vehicle ID 2020 Census Tract
count 200044.000000 200048.000000 200048.000000 200048.000000 199606.000000 2.000480e+05 2.000440e+05
mean 98176.178121 2020.871261 53.485279 947.551913 28.986063 2.262988e+08 5.297545e+10
std 2424.207811 2.994933 88.786301 7860.591091 14.908108 7.282433e+07 1.605628e+09
min 1731.000000 1997.000000 0.000000 0.000000 1.000000 4.385000e+03 1.001020e+09
25% 98052.000000 2019.000000 0.000000 0.000000 17.000000 1.904573e+08 5.303301e+10
50% 98125.000000 2022.000000 0.000000 0.000000 33.000000 2.363396e+08 5.303303e+10
75% 98372.000000 2023.000000 53.000000 0.000000 42.000000 2.609659e+08 5.305307e+10
max 99577.000000 2025.000000 337.000000 845000.000000 49.000000 4.792548e+08 5.602100e+10
In [ ]:
df.shape
Out[ ]:
(200048, 17)
In [ ]:
df.isnull().sum()
Out[ ]:
0
VIN (1-10) 0
County 4
City 4
State 0
Postal Code 4
Model Year 0
Make 0
Model 0
Electric Vehicle Type 0
Clean Alternative Fuel Vehicle (CAFV) Eligibility 0
Electric Range 0
Base MSRP 0
Legislative District 442
DOL Vehicle ID 0
Vehicle Location 8
Electric Utility 4
2020 Census Tract 4

In [ ]:
# # Drop rows with missing categorical values
# df = df.dropna(subset=['County', 'City', 'Model', 'Electric Utility'])

# # Check if categorical missing values are removed
# df.isnull().sum()

Dealing with Missing Values¶

In [ ]:
# Fill missing numerical values with the median
# df.loc[:, 'Electric Range'] = df['Electric Range'].fillna(df['Electric Range'].median())
df.loc[:, 'Base MSRP'] = df['Base MSRP'].fillna(df['Base MSRP'].mean())
df.loc[:, 'Legislative District'] = df['Legislative District'].fillna(df['Legislative District'].median())

# Verify that all missing values are handled
df.isnull().sum()
Out[ ]:
0
VIN (1-10) 0
County 4
City 4
State 0
Postal Code 4
Model Year 0
Make 0
Model 0
Electric Vehicle Type 0
Clean Alternative Fuel Vehicle (CAFV) Eligibility 0
Electric Range 0
Base MSRP 0
Legislative District 0
DOL Vehicle ID 0
Vehicle Location 8
Electric Utility 4
2020 Census Tract 4

In [ ]:
df['County'].fillna(df['County'].mode()[0], inplace=True)
df['City'].fillna(df['City'].mode()[0], inplace=True)
df['Postal Code'].fillna(df['Postal Code'].mode()[0], inplace=True)
df['Electric Utility'].fillna(df['Electric Utility'].mode()[0], inplace=True)

df.dropna(subset=['2020 Census Tract'], inplace=True)
df.dropna(subset=['Vehicle Location'], inplace=True)
df.dropna(subset=['Legislative District'], inplace=True)
In [ ]:
print("Missing values after handling:")
print(df.isnull().sum())
Missing values after handling:
VIN (1-10)                                           0
County                                               0
City                                                 0
State                                                0
Postal Code                                          0
Model Year                                           0
Make                                                 0
Model                                                0
Electric Vehicle Type                                0
Clean Alternative Fuel Vehicle (CAFV) Eligibility    0
Electric Range                                       0
Base MSRP                                            0
Legislative District                                 0
DOL Vehicle ID                                       0
Vehicle Location                                     0
Electric Utility                                     0
2020 Census Tract                                    0
dtype: int64

Univariate Analysis¶

In [ ]:
# Summary statistics for key numerical variables
summary_stats = df[['Electric Range', 'Base MSRP', 'Model Year']].describe().T
summary_stats['variance'] = df[['Electric Range', 'Base MSRP', 'Model Year']].var()

# Display the statistics
summary_stats[['mean', '50%', 'std', 'variance']]  # 50% represents the median
Out[ ]:
mean 50% std variance
Electric Range 53.484268 0.0 88.786478 7.883039e+03
Base MSRP 947.161643 0.0 7859.560065 6.177268e+07
Model Year 2020.871336 2022.0 2.994938 8.969653e+00
In [ ]:
plt.figure(figsize=(15, 6))

# Histogram for Electric Range
plt.subplot(1, 2, 1)
sns.histplot(df['Electric Range'], bins=30, kde=True)
plt.title('Distribution of Electric Range')
plt.xlabel('Electric Range (miles)')
plt.ylabel('Frequency')

# Box Plot for Electric Range
plt.subplot(1, 2, 2)
sns.boxplot(df['Electric Range'])
plt.title('Box Plot of Electric Range')
plt.xlabel('Electric Range (miles)')

plt.tight_layout()
plt.show()
In [ ]:
plt.figure(figsize=(15, 6))

# Histogram for Base MSRP
plt.subplot(1, 2, 1)
sns.histplot(df['Base MSRP'], bins=30, kde=True)
plt.title('Distribution of Base MSRP')
plt.xlabel('Base MSRP (in dollars)')
plt.ylabel('Frequency')

# Box Plot for Base MSRP
plt.subplot(1, 2, 2)
sns.boxplot(df['Base MSRP'])
plt.title('Box Plot of Base MSRP')
plt.xlabel('Base MSRP (in dollars)')

plt.tight_layout()
plt.show()
In [ ]:
plt.figure(figsize=(15, 6))

# Histogram for Model Year
plt.subplot(1, 2, 1)
sns.histplot(df['Model Year'], bins=30, kde=True)
plt.title('Distribution of Model Year')
plt.xlabel('Model Year')
plt.ylabel('Frequency')

# Box Plot for Model Year
plt.subplot(1, 2, 2)
sns.boxplot(df['Model Year'])
plt.title('Box Plot of Model Year')
plt.xlabel('Model Year')

plt.tight_layout()
plt.show()
In [ ]:
plt.figure(figsize=(10, 6))
sns.countplot(y='Make', data=df, order=df['Make'].value_counts().index[:10])
plt.title('Top 10 Car Makers')
plt.xlabel('Count')
plt.ylabel('Car Maker')
plt.show()
In [ ]:
unique_models = df[['Make', 'Model']].drop_duplicates()
model_counts = unique_models['Make'].value_counts().reset_index()
model_counts.columns = ['Make', 'Unique Models']

# Plotting
plt.figure(figsize=(10, 6))
sns.barplot(x='Unique Models', y='Make', data=model_counts, palette='viridis')
plt.title('Unique Models per Maker')
plt.xlabel('Number of Unique Models')
plt.ylabel('Maker')
plt.show()
print(model_counts)
                    Make  Unique Models
0                    BMW             15
1                   AUDI             12
2          MERCEDES-BENZ             11
3                HYUNDAI              9
4                    KIA              8
5                   FORD              8
6              CHEVROLET              8
7                  VOLVO              7
8                  TESLA              6
9                 TOYOTA              5
10               PORSCHE              4
11                 HONDA              4
12                 LEXUS              4
13                 SMART              3
14                 MAZDA              3
15              CADILLAC              3
16               GENESIS              3
17                RIVIAN              3
18            LAND ROVER              2
19                FISKER              2
20               BENTLEY              2
21                  MINI              2
22               LINCOLN              2
23                SUBARU              2
24            MITSUBISHI              2
25                  FIAT              2
26                NISSAN              2
27            VOLKSWAGEN              2
28                  JEEP              2
29              CHRYSLER              1
30                JAGUAR              1
31                 LUCID              1
32            ALFA ROMEO              1
33                 DODGE              1
34                   GMC              1
35                 ACURA              1
36              POLESTAR              1
37        AZURE DYNAMICS              1
38                 TH!NK              1
39  WHEEGO ELECTRIC CARS              1
40           ROLLS-ROYCE              1
41                   RAM              1
In [ ]:
unique_models = df[['Make', 'Model']].drop_duplicates()

# Group by Maker and aggregate model names using 'list'
model_counts = unique_models.groupby('Make')['Model'].agg(list).reset_index()
model_counts['Count'] = model_counts['Model'].str.len()

# Get top 5 makers with the most unique models
top_makers = model_counts.sort_values(by='Count', ascending=False).head(5)

# Display the result
print(top_makers)
print(model_counts.sort_values(by='Count', ascending=False))
             Make                                              Model  Count
5             BMW  [I3, X3, X5, I4, 330E, IX, 530E, I5, I7, I8, 7...     15
2            AUDI  [E-TRON, A3, Q5, Q8, Q5 E, Q4, E-TRON GT, E-TR...     12
25  MERCEDES-BENZ  [EQE-CLASS SUV, EQS-CLASS SUV, EQS-CLASS SEDAN...     11
16        HYUNDAI  [IONIQ, IONIQ 5, KONA ELECTRIC, SANTA FE, IONI...      9
12           FORD  [ESCAPE, C-MAX, FOCUS, FUSION, MUSTANG MACH-E,...      8
                    Make                                              Model  \
5                    BMW  [I3, X3, X5, I4, 330E, IX, 530E, I5, I7, I8, 7...   
2                   AUDI  [E-TRON, A3, Q5, Q8, Q5 E, Q4, E-TRON GT, E-TR...   
25         MERCEDES-BENZ  [EQE-CLASS SUV, EQS-CLASS SUV, EQS-CLASS SEDAN...   
16               HYUNDAI  [IONIQ, IONIQ 5, KONA ELECTRIC, SANTA FE, IONI...   
12                  FORD  [ESCAPE, C-MAX, FOCUS, FUSION, MUSTANG MACH-E,...   
19                   KIA  [SOUL, SPORTAGE, NIRO, SORENTO, EV6, EV9, OPTI...   
7              CHEVROLET  [VOLT, BOLT EV, BOLT EUV, SPARK, BLAZER EV, SI...   
40                 VOLVO             [XC60, XC90, S60, XC40, C40, V60, S90]   
36                 TESLA  [MODEL 3, MODEL Y, MODEL S, MODEL X, CYBERTRUC...   
38                TOYOTA  [PRIUS PLUG-IN, PRIUS PRIME, RAV4 PRIME, BZ4X,...   
21                 LEXUS                                   [RX, NX, RZ, TX]   
15                 HONDA                  [CLARITY, PROLOGUE, ACCORD, CR-V]   
30               PORSCHE                   [TAYCAN, PANAMERA, CAYENNE, 918]   
34                 SMART         [FORTWO ELECTRIC DRIVE, EQ FORTWO, FORTWO]   
32                RIVIAN                                    [R1T, R1S, EDV]   
6               CADILLAC                                  [LYRIQ, ELR, CT6]   
24                 MAZDA                              [CX-90, CX-70, MX-30]   
13               GENESIS                                  [GV60, GV70, G80]   
28                NISSAN                                      [LEAF, ARIYA]   
27            MITSUBISHI                                [OUTLANDER, I-MIEV]   
26                  MINI                              [HARDTOP, COUNTRYMAN]   
4                BENTLEY                            [FLYING SPUR, BENTAYGA]   
11                FISKER                                     [OCEAN, KARMA]   
22               LINCOLN                                 [CORSAIR, AVIATOR]   
20            LAND ROVER                   [RANGE ROVER SPORT, RANGE ROVER]   
18                  JEEP                         [GRAND CHEROKEE, WRANGLER]   
39            VOLKSWAGEN                                     [E-GOLF, ID.4]   
10                  FIAT                                        [500, 500E]   
35                SUBARU                              [CROSSTREK, SOLTERRA]   
37                 TH!NK                                             [CITY]   
33           ROLLS-ROYCE                                          [SPECTRE]   
0                  ACURA                                              [ZDX]   
31                   RAM                                   [PROMASTER 3500]   
29              POLESTAR                                              [PS2]   
23                 LUCID                                              [AIR]   
1             ALFA ROMEO                                           [TONALE]   
17                JAGUAR                                           [I-PACE]   
14                   GMC                                 [HUMMER EV PICKUP]   
9                  DODGE                                           [HORNET]   
8               CHRYSLER                                         [PACIFICA]   
3         AZURE DYNAMICS                         [TRANSIT CONNECT ELECTRIC]   
41  WHEEGO ELECTRIC CARS                                           [WHEEGO]   

    Count  
5      15  
2      12  
25     11  
16      9  
12      8  
19      8  
7       8  
40      7  
36      6  
38      5  
21      4  
15      4  
30      4  
34      3  
32      3  
6       3  
24      3  
13      3  
28      2  
27      2  
26      2  
4       2  
11      2  
22      2  
20      2  
18      2  
39      2  
10      2  
35      2  
37      1  
33      1  
0       1  
31      1  
29      1  
23      1  
1       1  
17      1  
14      1  
9       1  
8       1  
3       1  
41      1  
In [ ]:
plt.figure(figsize=(10,6))
sns.boxplot(x=df['Postal Code'])
plt.show()

z = np.abs(zscore(df['Postal Code']))
df_outliers = df[z > 3]
print("Number of outliers before handling:", (z > 3).sum())
Number of outliers before handling: 322
In [ ]:
upper_limit = df['Postal Code'].quantile(0.95)
lower_limit = df['Postal Code'].quantile(0.05)
df['Postal Code'] = np.clip(df['Postal Code'], lower_limit, upper_limit)
In [ ]:
z = np.abs(zscore(df['Postal Code']))
print("Number of outliers after handling:", (z > 3).sum())
Number of outliers after handling: 0
In [ ]:
plt.figure(figsize=(10,6))
sns.boxplot(x=df['Model Year'])
plt.show()

z = np.abs(zscore(df['Model Year']))
df_outliers = df[z > 3]
print("Number of outliers before handling:", (z > 3).sum())
Number of outliers before handling: 781
In [ ]:
upper_limit = df['Model Year'].quantile(0.95)
lower_limit = df['Model Year'].quantile(0.05)
df['Model Year'] = np.clip(df['Model Year'], lower_limit, upper_limit)

z = np.abs(zscore(df['Model Year']))
print("Number of outliers after handling:", (z > 3).sum())
Number of outliers after handling: 0
In [ ]:
# plt.figure(figsize=(10,6))
# sns.boxplot(x=df['Base MSRP'])
# plt.show()

# z = np.abs(zscore(df['Base MSRP']))
# df_outliers = df[z > 3]
# print("Number of outliers before handling:", (z > 3).sum())
In [ ]:
# upper_limit = df['Base MSRP'].quantile(0.95)
# lower_limit = df['Base MSRP'].quantile(0.05)
# df['Base MSRP'] = np.clip(df['Base MSRP'], lower_limit, upper_limit)

# z = np.abs(zscore(df['Base MSRP']))
# print("Number of outliers after handling:", (z > 3).sum())

# plt.figure(figsize=(10,6))
# sns.boxplot(x=df['Legislative District'])
# plt.show()

# z = np.abs(zscore(df['Legislative District']))
# df_outliers = df[z > 3]
# print("Number of outliers before handling:", (z > 3).sum())
In [ ]:
plt.figure(figsize=(10,6))
sns.boxplot(x=df['DOL Vehicle ID'])
plt.show()

z = np.abs(zscore(df['DOL Vehicle ID']))
df_outliers = df[z > 3]
print("Number of outliers before handling:", (z > 3).sum())
Number of outliers before handling: 9120
In [ ]:
upper_limit = df['DOL Vehicle ID'].quantile(0.95)
lower_limit = df['DOL Vehicle ID'].quantile(0.05)
df['DOL Vehicle ID'] = np.clip(df['DOL Vehicle ID'], lower_limit, upper_limit)

z = np.abs(zscore(df['DOL Vehicle ID']))
print("Number of outliers after handling:", (z > 3).sum())

plt.figure(figsize=(10,6))
sns.boxplot(x=df['2020 Census Tract'])
plt.show()

z = np.abs(zscore(df['2020 Census Tract']))
df_outliers = df[z > 3]
print("Number of outliers before handling:", (z > 3).sum())
Number of outliers after handling: 0
Number of outliers before handling: 360
In [ ]:
print("Data types before correction:")
print(df.dtypes)
Data types before correction:
VIN (1-10)                                            object
County                                                object
City                                                  object
State                                                 object
Postal Code                                          float64
Model Year                                             int64
Make                                                  object
Model                                                 object
Electric Vehicle Type                                 object
Clean Alternative Fuel Vehicle (CAFV) Eligibility     object
Electric Range                                         int64
Base MSRP                                              int64
Legislative District                                 float64
DOL Vehicle ID                                       float64
Vehicle Location                                      object
Electric Utility                                      object
2020 Census Tract                                    float64
dtype: object
In [ ]:
df['Postal Code'] = df['Postal Code'].astype('category')
df['Legislative District'] = df['Legislative District'].astype('category')
df['2020 Census Tract'] = df['2020 Census Tract'].astype('category')
df['County'] = df['County'].astype('category')
df['City'] = df['City'].astype('category')
df['State'] = df['State'].astype('category')
df['Make'] = df['Make'].astype('category')
df['Model'] = df['Model'].astype('category')
df['Electric Vehicle Type'] = df['Electric Vehicle Type'].astype('category')

print("Data types after correction:")
print(df.dtypes)
Data types after correction:
VIN (1-10)                                             object
County                                               category
City                                                 category
State                                                category
Postal Code                                          category
Model Year                                              int64
Make                                                 category
Model                                                category
Electric Vehicle Type                                category
Clean Alternative Fuel Vehicle (CAFV) Eligibility      object
Electric Range                                          int64
Base MSRP                                               int64
Legislative District                                 category
DOL Vehicle ID                                        float64
Vehicle Location                                       object
Electric Utility                                       object
2020 Census Tract                                    category
dtype: object
In [ ]:
duplicate_rows = df[df.duplicated()]
print("Number of duplicate rows before removal:", duplicate_rows.shape[0])

df.drop_duplicates(inplace=True)
duplicate_rows = df[df.duplicated()]
print("Number of duplicate rows after removal:", duplicate_rows.shape[0])
Number of duplicate rows before removal: 411
Number of duplicate rows after removal: 0
In [ ]:
df.rename(columns = {
    'VIN (1-10)': 'VIN',
    'Postal Code': 'ZIP',
    'Model Year': 'Year',
    'Electric Vehicle Type': 'Type',
    'Clean Alternative Fuel Vehicle (CAFV) Eligibility': 'CAFV',
    'Base MSRP': 'MSRP',
    'Electric Range': 'Range',
    'DOL Vehicle ID': 'DOL'
}, inplace=True)
df.head(5)
Out[ ]:
VIN County City State ZIP Year Make Model Type CAFV Range MSRP Legislative District DOL Vehicle Location Electric Utility 2020 Census Tract
0 5YJ3E1EB0J Thurston Olympia WA 98512.0 2018 TESLA MODEL 3 Battery Electric Vehicle (BEV) Clean Alternative Fuel Vehicle Eligible 215 0 35.0 1.127669e+08 POINT (-122.957046 46.991391) PUGET SOUND ENERGY INC 5.306701e+10
1 WA1AAAGE9M Kitsap Port Orchard WA 98367.0 2021 AUDI E-TRON Battery Electric Vehicle (BEV) Clean Alternative Fuel Vehicle Eligible 222 0 35.0 1.566605e+08 POINT (-122.6530052 47.4739066) PUGET SOUND ENERGY INC 5.303509e+10
2 5YJ3E1EA2J Yakima Yakima WA 98902.0 2018 TESLA MODEL 3 Battery Electric Vehicle (BEV) Clean Alternative Fuel Vehicle Eligible 215 0 14.0 2.693741e+08 POINT (-120.530331 46.59534) PACIFICORP 5.307700e+10
3 5YJ3E1EA4N Yakima Yakima WA 98902.0 2022 TESLA MODEL 3 Battery Electric Vehicle (BEV) Eligibility unknown as battery range has not b... 0 0 15.0 2.133839e+08 POINT (-120.530331 46.59534) PACIFICORP 5.307700e+10
4 7SAYGAEE2P Snohomish Bothell WA 98012.0 2023 TESLA MODEL Y Battery Electric Vehicle (BEV) Eligibility unknown as battery range has not b... 0 0 1.0 2.294960e+08 POINT (-122.206146 47.839957) PUGET SOUND ENERGY INC 5.306105e+10
In [ ]:
df.loc[df['Model'] == 'S-CLASS', 'Range'] = df.loc[df['Model'] == 'S-CLASS', 'Range'].fillna(46.0)
In [ ]:
df.loc[df['Model'] == 'S-CLASS', 'MSRP'] = df.loc[df['Model'] == 'S-CLASS', 'MSRP'].fillna(126855.0)

Bivariate and Multivariate Analysis¶

In [ ]:
df.groupby('County')['Model'].nunique().idxmax()
county_with_most_unique_models = df.groupby('County')['Model'].nunique().idxmax()
# Get the most frequent model in the county with the most unique models
most_frequent_model = df[df['County'] == county_with_most_unique_models]['Model'].mode()[0]
print(most_frequent_model)
adoption_counts = df.groupby(['Year', 'Make'])['VIN'].nunique().reset_index()
pivot_data = adoption_counts.pivot(index='Year', columns='Make', values='VIN').fillna(0)
pivot_data.head(5)

plt.figure(figsize=(8, 6))
sns.lineplot(data=pivot_data, marker='o')

plt.title('EV Adoption Rate Among Car Makers over the years')
plt.xlabel('Year')
plt.ylabel('Number of Unique VINs')
plt.legend(title='Car Maker', bbox_to_anchor=(1.05, 1), loc='upper left')
plt.xticks(pivot_data.index, rotation=45)
plt.show()
MODEL Y
In [ ]:
import plotly.express as px
fig = px.line(adoption_counts, x='Year', y='VIN', color='Make', title = 'EV Adoption Rate Among Car Makers over the years', labels= {'VIN': 'Number of Unique VINs'}, markers=True)

# fig.update_layout(xaxis_title='Year', yaxis_title='Number of Unique VINs')
fig.show()
In [ ]:
city_counts = df.groupby('City')['VIN'].nunique().reset_index().sort_values(by='VIN', ascending=False).head(5)

plt.figure(figsize=(8, 6))
plt.bar(city_counts['City'], city_counts['VIN'], color='skyblue')
plt.xlabel('City')
plt.ylabel('Number of Unique VINs')
plt.title('Number of Unique VINs per City')
plt.xticks(rotation=45)
plt.show()
In [ ]:
top_makers = df.groupby('Make')['VIN'].nunique().sort_values(ascending=False).head(5)

plt.figure(figsize=(10, 6))
plt.bar(top_makers.index, top_makers.values, color='skyblue')
plt.xlabel('Car Maker')
plt.ylabel('Number of Unique VINs')
plt.title('Top 5 Car Makers Adopting EVs in Washington')
plt.xticks(rotation=45)
plt.show()
In [ ]:
crosstab_make_city = pd.crosstab(df['Make'], df['City'], margins=True)

top_crosstab_make_city = crosstab_make_city.sum(axis=1).nlargest(10).index
top_make_city_data = crosstab_make_city.loc[top_crosstab_make_city]

top_make_city_data
Out[ ]:
City Aberdeen Aberdeen Proving Ground Acme Addy Adna Airway Heights Alameda Albion Alderdale Alderwood Manor ... Woodland Woodway Worley Yacolt Yakima Yarrow Point Yelm Yorktown Zillah All
Make
All 184 1 10 2 1 35 2 1 1 1 ... 212 95 1 65 788 159 329 1 39 199629
TESLA 47 1 2 2 0 11 0 0 0 0 ... 96 42 0 28 328 94 132 1 17 87781
CHEVROLET 25 0 1 0 0 6 0 0 0 0 ... 18 0 0 4 79 4 30 0 1 14773
NISSAN 13 0 0 0 1 3 0 0 0 1 ... 12 5 0 6 30 1 21 0 2 14387
FORD 24 0 2 0 0 5 1 0 0 0 ... 11 2 0 5 36 3 34 0 2 10543
KIA 18 0 2 0 0 3 0 0 0 0 ... 6 5 0 2 42 3 21 0 5 8760
BMW 1 0 0 0 0 1 0 0 0 0 ... 7 7 0 4 26 20 6 0 2 8294
TOYOTA 11 0 0 0 0 1 0 0 0 0 ... 18 1 0 3 64 2 15 0 0 7668
VOLKSWAGEN 3 0 1 0 0 1 0 0 0 0 ... 9 1 0 1 17 1 6 0 2 5559
HYUNDAI 7 0 1 0 0 0 0 0 0 0 ... 5 2 0 1 12 2 9 0 0 5475

10 rows × 764 columns

In [ ]:
crosstab_make_model = pd.crosstab(df['Make'], df['Model'])
crosstab_make_model
Out[ ]:
Model 330E 500 500E 530E 740E 745E 745LE 750E 918 A3 ... VOLT WHEEGO WRANGLER X3 X5 XC40 XC60 XC90 XM ZDX
Make
ACURA 0 0 0 0 0 0 0 0 0 0 ... 0 0 0 0 0 0 0 0 0 15
ALFA ROMEO 0 0 0 0 0 0 0 0 0 0 ... 0 0 0 0 0 0 0 0 0 0
AUDI 0 0 0 0 0 0 0 0 0 562 ... 0 0 0 0 0 0 0 0 0 0
AZURE DYNAMICS 0 0 0 0 0 0 0 0 0 0 ... 0 0 0 0 0 0 0 0 0 0
BENTLEY 0 0 0 0 0 0 0 0 0 0 ... 0 0 0 0 0 0 0 0 0 0
BMW 491 0 0 421 29 11 2 2 0 0 ... 0 0 0 317 2541 0 0 0 13 0
CADILLAC 0 0 0 0 0 0 0 0 0 0 ... 0 0 0 0 0 0 0 0 0 0
CHEVROLET 0 0 0 0 0 0 0 0 0 0 ... 4828 0 0 0 0 0 0 0 0 0
CHRYSLER 0 0 0 0 0 0 0 0 0 0 ... 0 0 0 0 0 0 0 0 0 0
DODGE 0 0 0 0 0 0 0 0 0 0 ... 0 0 0 0 0 0 0 0 0 0
FIAT 0 786 9 0 0 0 0 0 0 0 ... 0 0 0 0 0 0 0 0 0 0
FISKER 0 0 0 0 0 0 0 0 0 0 ... 0 0 0 0 0 0 0 0 0 0
FORD 0 0 0 0 0 0 0 0 0 0 ... 0 0 0 0 0 0 0 0 0 0
GENESIS 0 0 0 0 0 0 0 0 0 0 ... 0 0 0 0 0 0 0 0 0 0
GMC 0 0 0 0 0 0 0 0 0 0 ... 0 0 0 0 0 0 0 0 0 0
HONDA 0 0 0 0 0 0 0 0 0 0 ... 0 0 0 0 0 0 0 0 0 0
HYUNDAI 0 0 0 0 0 0 0 0 0 0 ... 0 0 0 0 0 0 0 0 0 0
JAGUAR 0 0 0 0 0 0 0 0 0 0 ... 0 0 0 0 0 0 0 0 0 0
JEEP 0 0 0 0 0 0 0 0 0 0 ... 0 0 3946 0 0 0 0 0 0 0
KIA 0 0 0 0 0 0 0 0 0 0 ... 0 0 0 0 0 0 0 0 0 0
LAND ROVER 0 0 0 0 0 0 0 0 0 0 ... 0 0 0 0 0 0 0 0 0 0
LEXUS 0 0 0 0 0 0 0 0 0 0 ... 0 0 0 0 0 0 0 0 0 0
LINCOLN 0 0 0 0 0 0 0 0 0 0 ... 0 0 0 0 0 0 0 0 0 0
LUCID 0 0 0 0 0 0 0 0 0 0 ... 0 0 0 0 0 0 0 0 0 0
MAZDA 0 0 0 0 0 0 0 0 0 0 ... 0 0 0 0 0 0 0 0 0 0
MERCEDES-BENZ 0 0 0 0 0 0 0 0 0 0 ... 0 0 0 0 0 0 0 0 0 0
MINI 0 0 0 0 0 0 0 0 0 0 ... 0 0 0 0 0 0 0 0 0 0
MITSUBISHI 0 0 0 0 0 0 0 0 0 0 ... 0 0 0 0 0 0 0 0 0 0
NISSAN 0 0 0 0 0 0 0 0 0 0 ... 0 0 0 0 0 0 0 0 0 0
POLESTAR 0 0 0 0 0 0 0 0 0 0 ... 0 0 0 0 0 0 0 0 0 0
PORSCHE 0 0 0 0 0 0 0 0 1 0 ... 0 0 0 0 0 0 0 0 0 0
RAM 0 0 0 0 0 0 0 0 0 0 ... 0 0 0 0 0 0 0 0 0 0
RIVIAN 0 0 0 0 0 0 0 0 0 0 ... 0 0 0 0 0 0 0 0 0 0
ROLLS-ROYCE 0 0 0 0 0 0 0 0 0 0 ... 0 0 0 0 0 0 0 0 0 0
SMART 0 0 0 0 0 0 0 0 0 0 ... 0 0 0 0 0 0 0 0 0 0
SUBARU 0 0 0 0 0 0 0 0 0 0 ... 0 0 0 0 0 0 0 0 0 0
TESLA 0 0 0 0 0 0 0 0 0 0 ... 0 0 0 0 0 0 0 0 0 0
TH!NK 0 0 0 0 0 0 0 0 0 0 ... 0 0 0 0 0 0 0 0 0 0
TOYOTA 0 0 0 0 0 0 0 0 0 0 ... 0 0 0 0 0 0 0 0 0 0
VOLKSWAGEN 0 0 0 0 0 0 0 0 0 0 ... 0 0 0 0 0 0 0 0 0 0
VOLVO 0 0 0 0 0 0 0 0 0 0 ... 0 0 0 0 0 1080 1269 1644 0 0
WHEEGO ELECTRIC CARS 0 0 0 0 0 0 0 0 0 0 ... 0 3 0 0 0 0 0 0 0 0

42 rows × 151 columns

In [ ]:
top_crosstab_make_model = crosstab_make_model.sum(axis=1).nlargest(10).index
top_make_model_data = crosstab_make_model.loc[top_crosstab_make_model]

top_make_model_data
Out[ ]:
Model 330E 500 500E 530E 740E 745E 745LE 750E 918 A3 ... VOLT WHEEGO WRANGLER X3 X5 XC40 XC60 XC90 XM ZDX
Make
TESLA 0 0 0 0 0 0 0 0 0 0 ... 0 0 0 0 0 0 0 0 0 0
CHEVROLET 0 0 0 0 0 0 0 0 0 0 ... 4828 0 0 0 0 0 0 0 0 0
NISSAN 0 0 0 0 0 0 0 0 0 0 ... 0 0 0 0 0 0 0 0 0 0
FORD 0 0 0 0 0 0 0 0 0 0 ... 0 0 0 0 0 0 0 0 0 0
KIA 0 0 0 0 0 0 0 0 0 0 ... 0 0 0 0 0 0 0 0 0 0
BMW 491 0 0 421 29 11 2 2 0 0 ... 0 0 0 317 2541 0 0 0 13 0
TOYOTA 0 0 0 0 0 0 0 0 0 0 ... 0 0 0 0 0 0 0 0 0 0
VOLKSWAGEN 0 0 0 0 0 0 0 0 0 0 ... 0 0 0 0 0 0 0 0 0 0
HYUNDAI 0 0 0 0 0 0 0 0 0 0 ... 0 0 0 0 0 0 0 0 0 0
JEEP 0 0 0 0 0 0 0 0 0 0 ... 0 0 3946 0 0 0 0 0 0 0

10 rows × 151 columns

In [ ]:
top_makes = crosstab_make_city.sum(axis=1).nlargest(10).index
top_makes_data = crosstab_make_city.loc[top_makes]

plot_data = []

for make in top_makes:
  top_cities = top_makes_data.loc[make].nlargest(5)
  for city, count in top_cities.items():
    plot_data.append({'Make': make, 'City': city, 'Count': count})

plot_df = pd.DataFrame(plot_data)

fig = px.bar(plot_df, x='Make', y='Count', color='City', title = 'Top 10 Vehicle Makes and Their Top 5 Cities', labels = {'Count': 'Number of Vehicles'}, barmode='group')
fig.show()
In [ ]:
fig, ax = plt.subplots(figsize=(12, 6))

colors = plt.cm.tab10(np.arange(len(plot_df['City'].unique())))
for i, city in enumerate(plot_df['City'].unique()):
    city_data = plot_df[plot_df['City'] == city]
    ax.bar(city_data['Make'], city_data['Count'], label=city, color=colors[i])

ax.set_xlabel('Make')
ax.set_ylabel('Count')
ax.set_title('Top 10 Vehicle Makes and Their Top 5 Cities')
ax.legend(title='City', bbox_to_anchor=(1.05, 1), loc='upper left')
plt.xticks(rotation=45)
plt.show()
In [ ]:
df['MSRP'].isnull().sum()
Out[ ]:
0
In [ ]:
df['MSRP'].describe()
Out[ ]:
MSRP
count 199629.000000
mean 949.111677
std 7867.529029
min 0.000000
25% 0.000000
50% 0.000000
75% 0.000000
max 845000.000000

In [ ]:
mean_msrp = df['MSRP'].replace(0.0, pd.NA).mean()
df['MSRP'] = df['MSRP'].replace(0.0, mean_msrp)

df['MSRP'].describe()
Out[ ]:
MSRP
count 199629.000000
mean 57086.536607
std 2935.483026
min 31950.000000
25% 57086.536607
50% 57086.536607
75% 57086.536607
max 845000.000000

In [ ]:
numerical_df = df.select_dtypes(include=['int64', 'float64'])

numerical_correlation_matrix = numerical_df.corr()

plt.figure(figsize=(8, 4))

sns.heatmap(numerical_correlation_matrix, annot=True, cmap='coolwarm', fmt=".2f", square=True)
plt.title('Numerical Data Correlation Heatmap')
plt.show()
In [ ]:
pearson_correlation_matrix = numerical_df.corr(method='pearson')
pearson_correlation_matrix

plt.figure(figsize=(8, 4))
sns.heatmap(pearson_correlation_matrix, annot=True, fmt=".2f", cmap='coolwarm', square=True)
plt.title('Pearson Correlation Matrix')
plt.show()
In [ ]:
spearman_correlation_matrix = numerical_df.corr(method='spearman')
spearman_correlation_matrix

plt.figure(figsize=(8, 4))
sns.heatmap(spearman_correlation_matrix, annot=True, fmt=".2f", cmap='coolwarm', square=True)
plt.title('Spearman Correlation Matrix')
plt.show()
In [ ]:
crosstab_full = pd.crosstab(df['County'], df['Make'])

# Filter top 3 counties and top 3 makes
top_counties = crosstab_full.sum(axis=1).nlargest(10).index
top_makes = crosstab_full.sum(axis=0).nlargest(10).index

crosstab_filtered = crosstab_full.loc[top_counties, top_makes]
print(crosstab_filtered)
Make       TESLA  CHEVROLET  NISSAN  FORD   KIA   BMW  TOYOTA  VOLKSWAGEN  \
County                                                                      
King       48471       6012    6806  3987  3973  4932    2926        2731   
Snohomish  12248       1423    1727  1259  1002   818     709         576   
Pierce      6565       1252    1116  1137   826   617     592         391   
Clark       4863        929     858   682   527   450     871         319   
Thurston    2350       1025     593   723   467   168     375         329   
Kitsap      2149        641     660   419   405   280     279         216   
Spokane     1942        458     309   395   277   223     232         145   
Whatcom     1455        499     657   280   288   120     278         234   
Benton      1016        258     134   205   111   112     165          45   
Skagit       767        264     169   168   120    55     116          82   

Make       HYUNDAI  JEEP  
County                    
King          2777  2449  
Snohomish      660   498  
Pierce         389   380  
Clark          366   399  
Thurston       179   164  
Kitsap         245   200  
Spokane        131   308  
Whatcom        149   118  
Benton          66    60  
Skagit          96    67  
In [ ]:
top_10_cars = df[['Make', 'Model', 'Range']].nlargest(10, 'Range')

df['Make'] = df['Make'].astype(str)
df['Model'] = df['Model'].astype(str)

unique_models = df.groupby(['Make', 'Model'])['Range'].max().reset_index()

# Get the top 10 unique models with the highest range
top_10_models = unique_models.nlargest(10, 'Range')

# Create a combined column for better labeling in the plot
top_10_models['Car'] = top_10_models['Make'] + ' ' + top_10_models['Model']

# Plotting
plt.figure(figsize=(6, 4))
sns.barplot(data=top_10_models, x='Range', y='Car', palette='viridis')
plt.title('Top 10 Unique Car Models with Highest Range')
plt.xlabel('Range')
plt.ylabel('Car (Make and Model)')
plt.grid(True)
plt.show()
In [ ]:
fig, ax = plt.subplots(1, 2, figsize=(14, 6))

sns.boxplot(data=df, x='Make', y='Range', ax=ax[0])
ax[0].set_title('Range Distribution by Make')
ax[0].set_xlabel('Make')
ax[0].set_ylabel('Range (in Miles)')
ax[0].set_xticklabels(ax[0].get_xticklabels(), rotation=45, ha='right')


sns.violinplot(data=df, x='Make', y='Range', ax=ax[1], bw='silverman')
ax[1].set_title('Range Distribution by Make (Violin Plot)')
ax[1].set_xlabel('Make')
ax[1].set_ylabel('Range (in Miles)')
ax[1].set_xticklabels(ax[1].get_xticklabels(), rotation=45, ha='right')

plt.tight_layout()
plt.show()
In [ ]:
label_encoder = LabelEncoder()
df['City_Numeric'] = label_encoder.fit_transform(df['City'])

# Calculate the correlation
correlation = df['Range'].corr(df['City_Numeric'])

# Display the correlation value
print(f"Correlation between Electric Range and City: {correlation}")

# Optional: Visualize the relationship
plt.figure(figsize=(6, 4))
sns.scatterplot(x='City_Numeric', y='Range', data=df)
plt.title('Range vs. City')
plt.xlabel('City (Encoded)')
plt.ylabel('Range')
plt.xticks(ticks=range(len(label_encoder.classes_)), labels=label_encoder.classes_, rotation=45)
plt.tight_layout()
plt.show()
Correlation between Electric Range and City: -0.00035958723385506424
In [ ]:
avg_range = df.groupby('Make')['Range'].mean()

plt.figure(figsize=(10, 6))
avg_range.plot(kind='bar', color='skyblue')
plt.title('Average Electrical Range by Car Make')
plt.xlabel('Make')
plt.ylabel('Average Electrical Range (miles)')
plt.xticks(rotation=45)
plt.tight_layout()
plt.show()
In [ ]:
plt.figure(figsize=(10, 6))
sns.boxplot(data=df, x='Type', y='Range', palette='Set2')
plt.title('Electrical Range Distribution by Electric Vehicle Type')
plt.xlabel('Electric Vehicle Type')
plt.ylabel('Electrical Range (miles)')
plt.tight_layout()
plt.show()
In [ ]:
# Count plot
plt.figure(figsize=(10, 6))
sns.countplot(data=df, x='County', hue='Type', palette='pastel')
plt.title('Count of Electric Vehicles by Country and Type')
plt.xlabel('County')
plt.xticks(rotation=45, ha='right', fontsize='3')
plt.ylabel('Count')
plt.legend(title='Electric Vehicle Type')
plt.tight_layout()
plt.show()
In [ ]: