import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.preprocessing import LabelEncoder
import warnings
warnings.filterwarnings('ignore')
%matplotlib inline


from scipy.stats import zscore


from google.colab import drive
drive.mount('/content/drive', force_remount=False)

file_path = '/content/drive/MyDrive/EV_Population_WA_Data.csv'

df = pd.read_csv(file_path)

df.head()

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


df.tail()


df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 200048 entries, 0 to 200047
Data columns (total 17 columns):
 #   Column                                             Non-Null Count   Dtype  
---  ------                                             --------------   -----  
 0   VIN (1-10)                                         200048 non-null  object 
 1   County                                             200044 non-null  object 
 2   City                                               200044 non-null  object 
 3   State                                              200048 non-null  object 
 4   Postal Code                                        200044 non-null  float64
 5   Model Year                                         200048 non-null  int64  
 6   Make                                               200048 non-null  object 
 7   Model                                              200048 non-null  object 
 8   Electric Vehicle Type                              200048 non-null  object 
 9   Clean Alternative Fuel Vehicle (CAFV) Eligibility  200048 non-null  object 
 10  Electric Range                                     200048 non-null  int64  
 11  Base MSRP                                          200048 non-null  int64  
 12  Legislative District                               199606 non-null  float64
 13  DOL Vehicle ID                                     200048 non-null  int64  
 14  Vehicle Location                                   200040 non-null  object 
 15  Electric Utility                                   200044 non-null  object 
 16  2020 Census Tract                                  200044 non-null  float64
dtypes: float64(3), int64(4), object(10)
memory usage: 25.9+ MB


df.describe()


df.shape

(200048, 17)


df.isnull().sum()


# # Drop rows with missing categorical values
# df = df.dropna(subset=['County', 'City', 'Model', 'Electric Utility'])

# # Check if categorical missing values are removed
# df.isnull().sum()


# Fill missing numerical values with the median
# df.loc[:, 'Electric Range'] = df['Electric Range'].fillna(df['Electric Range'].median())
df.loc[:, 'Base MSRP'] = df['Base MSRP'].fillna(df['Base MSRP'].mean())
df.loc[:, 'Legislative District'] = df['Legislative District'].fillna(df['Legislative District'].median())

# Verify that all missing values are handled
df.isnull().sum()


df['County'].fillna(df['County'].mode()[0], inplace=True)
df['City'].fillna(df['City'].mode()[0], inplace=True)
df['Postal Code'].fillna(df['Postal Code'].mode()[0], inplace=True)
df['Electric Utility'].fillna(df['Electric Utility'].mode()[0], inplace=True)

df.dropna(subset=['2020 Census Tract'], inplace=True)
df.dropna(subset=['Vehicle Location'], inplace=True)
df.dropna(subset=['Legislative District'], inplace=True)


print("Missing values after handling:")
print(df.isnull().sum())

Missing values after handling:
VIN (1-10)                                           0
County                                               0
City                                                 0
State                                                0
Postal Code                                          0
Model Year                                           0
Make                                                 0
Model                                                0
Electric Vehicle Type                                0
Clean Alternative Fuel Vehicle (CAFV) Eligibility    0
Electric Range                                       0
Base MSRP                                            0
Legislative District                                 0
DOL Vehicle ID                                       0
Vehicle Location                                     0
Electric Utility                                     0
2020 Census Tract                                    0
dtype: int64


# Summary statistics for key numerical variables
summary_stats = df[['Electric Range', 'Base MSRP', 'Model Year']].describe().T
summary_stats['variance'] = df[['Electric Range', 'Base MSRP', 'Model Year']].var()

# Display the statistics
summary_stats[['mean', '50%', 'std', 'variance']]  # 50% represents the median


plt.figure(figsize=(15, 6))

# Histogram for Electric Range
plt.subplot(1, 2, 1)
sns.histplot(df['Electric Range'], bins=30, kde=True)
plt.title('Distribution of Electric Range')
plt.xlabel('Electric Range (miles)')
plt.ylabel('Frequency')

# Box Plot for Electric Range
plt.subplot(1, 2, 2)
sns.boxplot(df['Electric Range'])
plt.title('Box Plot of Electric Range')
plt.xlabel('Electric Range (miles)')

plt.tight_layout()
plt.show()


plt.figure(figsize=(15, 6))

# Histogram for Base MSRP
plt.subplot(1, 2, 1)
sns.histplot(df['Base MSRP'], bins=30, kde=True)
plt.title('Distribution of Base MSRP')
plt.xlabel('Base MSRP (in dollars)')
plt.ylabel('Frequency')

# Box Plot for Base MSRP
plt.subplot(1, 2, 2)
sns.boxplot(df['Base MSRP'])
plt.title('Box Plot of Base MSRP')
plt.xlabel('Base MSRP (in dollars)')

plt.tight_layout()
plt.show()


plt.figure(figsize=(15, 6))

# Histogram for Model Year
plt.subplot(1, 2, 1)
sns.histplot(df['Model Year'], bins=30, kde=True)
plt.title('Distribution of Model Year')
plt.xlabel('Model Year')
plt.ylabel('Frequency')

# Box Plot for Model Year
plt.subplot(1, 2, 2)
sns.boxplot(df['Model Year'])
plt.title('Box Plot of Model Year')
plt.xlabel('Model Year')

plt.tight_layout()
plt.show()


plt.figure(figsize=(10, 6))
sns.countplot(y='Make', data=df, order=df['Make'].value_counts().index[:10])
plt.title('Top 10 Car Makers')
plt.xlabel('Count')
plt.ylabel('Car Maker')
plt.show()


unique_models = df[['Make', 'Model']].drop_duplicates()
model_counts = unique_models['Make'].value_counts().reset_index()
model_counts.columns = ['Make', 'Unique Models']

# Plotting
plt.figure(figsize=(10, 6))
sns.barplot(x='Unique Models', y='Make', data=model_counts, palette='viridis')
plt.title('Unique Models per Maker')
plt.xlabel('Number of Unique Models')
plt.ylabel('Maker')
plt.show()
print(model_counts)

                    Make  Unique Models
0                    BMW             15
1                   AUDI             12
2          MERCEDES-BENZ             11
3                HYUNDAI              9
4                    KIA              8
5                   FORD              8
6              CHEVROLET              8
7                  VOLVO              7
8                  TESLA              6
9                 TOYOTA              5
10               PORSCHE              4
11                 HONDA              4
12                 LEXUS              4
13                 SMART              3
14                 MAZDA              3
15              CADILLAC              3
16               GENESIS              3
17                RIVIAN              3
18            LAND ROVER              2
19                FISKER              2
20               BENTLEY              2
21                  MINI              2
22               LINCOLN              2
23                SUBARU              2
24            MITSUBISHI              2
25                  FIAT              2
26                NISSAN              2
27            VOLKSWAGEN              2
28                  JEEP              2
29              CHRYSLER              1
30                JAGUAR              1
31                 LUCID              1
32            ALFA ROMEO              1
33                 DODGE              1
34                   GMC              1
35                 ACURA              1
36              POLESTAR              1
37        AZURE DYNAMICS              1
38                 TH!NK              1
39  WHEEGO ELECTRIC CARS              1
40           ROLLS-ROYCE              1
41                   RAM              1


unique_models = df[['Make', 'Model']].drop_duplicates()

# Group by Maker and aggregate model names using 'list'
model_counts = unique_models.groupby('Make')['Model'].agg(list).reset_index()
model_counts['Count'] = model_counts['Model'].str.len()

# Get top 5 makers with the most unique models
top_makers = model_counts.sort_values(by='Count', ascending=False).head(5)

# Display the result
print(top_makers)
print(model_counts.sort_values(by='Count', ascending=False))

             Make                                              Model  Count
5             BMW  [I3, X3, X5, I4, 330E, IX, 530E, I5, I7, I8, 7...     15
2            AUDI  [E-TRON, A3, Q5, Q8, Q5 E, Q4, E-TRON GT, E-TR...     12
25  MERCEDES-BENZ  [EQE-CLASS SUV, EQS-CLASS SUV, EQS-CLASS SEDAN...     11
16        HYUNDAI  [IONIQ, IONIQ 5, KONA ELECTRIC, SANTA FE, IONI...      9
12           FORD  [ESCAPE, C-MAX, FOCUS, FUSION, MUSTANG MACH-E,...      8
                    Make                                              Model  \
5                    BMW  [I3, X3, X5, I4, 330E, IX, 530E, I5, I7, I8, 7...   
2                   AUDI  [E-TRON, A3, Q5, Q8, Q5 E, Q4, E-TRON GT, E-TR...   
25         MERCEDES-BENZ  [EQE-CLASS SUV, EQS-CLASS SUV, EQS-CLASS SEDAN...   
16               HYUNDAI  [IONIQ, IONIQ 5, KONA ELECTRIC, SANTA FE, IONI...   
12                  FORD  [ESCAPE, C-MAX, FOCUS, FUSION, MUSTANG MACH-E,...   
19                   KIA  [SOUL, SPORTAGE, NIRO, SORENTO, EV6, EV9, OPTI...   
7              CHEVROLET  [VOLT, BOLT EV, BOLT EUV, SPARK, BLAZER EV, SI...   
40                 VOLVO             [XC60, XC90, S60, XC40, C40, V60, S90]   
36                 TESLA  [MODEL 3, MODEL Y, MODEL S, MODEL X, CYBERTRUC...   
38                TOYOTA  [PRIUS PLUG-IN, PRIUS PRIME, RAV4 PRIME, BZ4X,...   
21                 LEXUS                                   [RX, NX, RZ, TX]   
15                 HONDA                  [CLARITY, PROLOGUE, ACCORD, CR-V]   
30               PORSCHE                   [TAYCAN, PANAMERA, CAYENNE, 918]   
34                 SMART         [FORTWO ELECTRIC DRIVE, EQ FORTWO, FORTWO]   
32                RIVIAN                                    [R1T, R1S, EDV]   
6               CADILLAC                                  [LYRIQ, ELR, CT6]   
24                 MAZDA                              [CX-90, CX-70, MX-30]   
13               GENESIS                                  [GV60, GV70, G80]   
28                NISSAN                                      [LEAF, ARIYA]   
27            MITSUBISHI                                [OUTLANDER, I-MIEV]   
26                  MINI                              [HARDTOP, COUNTRYMAN]   
4                BENTLEY                            [FLYING SPUR, BENTAYGA]   
11                FISKER                                     [OCEAN, KARMA]   
22               LINCOLN                                 [CORSAIR, AVIATOR]   
20            LAND ROVER                   [RANGE ROVER SPORT, RANGE ROVER]   
18                  JEEP                         [GRAND CHEROKEE, WRANGLER]   
39            VOLKSWAGEN                                     [E-GOLF, ID.4]   
10                  FIAT                                        [500, 500E]   
35                SUBARU                              [CROSSTREK, SOLTERRA]   
37                 TH!NK                                             [CITY]   
33           ROLLS-ROYCE                                          [SPECTRE]   
0                  ACURA                                              [ZDX]   
31                   RAM                                   [PROMASTER 3500]   
29              POLESTAR                                              [PS2]   
23                 LUCID                                              [AIR]   
1             ALFA ROMEO                                           [TONALE]   
17                JAGUAR                                           [I-PACE]   
14                   GMC                                 [HUMMER EV PICKUP]   
9                  DODGE                                           [HORNET]   
8               CHRYSLER                                         [PACIFICA]   
3         AZURE DYNAMICS                         [TRANSIT CONNECT ELECTRIC]   
41  WHEEGO ELECTRIC CARS                                           [WHEEGO]   

    Count  
5      15  
2      12  
25     11  
16      9  
12      8  
19      8  
7       8  
40      7  
36      6  
38      5  
21      4  
15      4  
30      4  
34      3  
32      3  
6       3  
24      3  
13      3  
28      2  
27      2  
26      2  
4       2  
11      2  
22      2  
20      2  
18      2  
39      2  
10      2  
35      2  
37      1  
33      1  
0       1  
31      1  
29      1  
23      1  
1       1  
17      1  
14      1  
9       1  
8       1  
3       1  
41      1


plt.figure(figsize=(10,6))
sns.boxplot(x=df['Postal Code'])
plt.show()

z = np.abs(zscore(df['Postal Code']))
df_outliers = df[z > 3]
print("Number of outliers before handling:", (z > 3).sum())

Number of outliers before handling: 322


upper_limit = df['Postal Code'].quantile(0.95)
lower_limit = df['Postal Code'].quantile(0.05)
df['Postal Code'] = np.clip(df['Postal Code'], lower_limit, upper_limit)


z = np.abs(zscore(df['Postal Code']))
print("Number of outliers after handling:", (z > 3).sum())

Number of outliers after handling: 0


plt.figure(figsize=(10,6))
sns.boxplot(x=df['Model Year'])
plt.show()

z = np.abs(zscore(df['Model Year']))
df_outliers = df[z > 3]
print("Number of outliers before handling:", (z > 3).sum())

Number of outliers before handling: 781


upper_limit = df['Model Year'].quantile(0.95)
lower_limit = df['Model Year'].quantile(0.05)
df['Model Year'] = np.clip(df['Model Year'], lower_limit, upper_limit)

z = np.abs(zscore(df['Model Year']))
print("Number of outliers after handling:", (z > 3).sum())

Number of outliers after handling: 0


# plt.figure(figsize=(10,6))
# sns.boxplot(x=df['Base MSRP'])
# plt.show()

# z = np.abs(zscore(df['Base MSRP']))
# df_outliers = df[z > 3]
# print("Number of outliers before handling:", (z > 3).sum())


# upper_limit = df['Base MSRP'].quantile(0.95)
# lower_limit = df['Base MSRP'].quantile(0.05)
# df['Base MSRP'] = np.clip(df['Base MSRP'], lower_limit, upper_limit)

# z = np.abs(zscore(df['Base MSRP']))
# print("Number of outliers after handling:", (z > 3).sum())

# plt.figure(figsize=(10,6))
# sns.boxplot(x=df['Legislative District'])
# plt.show()

# z = np.abs(zscore(df['Legislative District']))
# df_outliers = df[z > 3]
# print("Number of outliers before handling:", (z > 3).sum())


plt.figure(figsize=(10,6))
sns.boxplot(x=df['DOL Vehicle ID'])
plt.show()

z = np.abs(zscore(df['DOL Vehicle ID']))
df_outliers = df[z > 3]
print("Number of outliers before handling:", (z > 3).sum())

Number of outliers before handling: 9120


upper_limit = df['DOL Vehicle ID'].quantile(0.95)
lower_limit = df['DOL Vehicle ID'].quantile(0.05)
df['DOL Vehicle ID'] = np.clip(df['DOL Vehicle ID'], lower_limit, upper_limit)

z = np.abs(zscore(df['DOL Vehicle ID']))
print("Number of outliers after handling:", (z > 3).sum())

plt.figure(figsize=(10,6))
sns.boxplot(x=df['2020 Census Tract'])
plt.show()

z = np.abs(zscore(df['2020 Census Tract']))
df_outliers = df[z > 3]
print("Number of outliers before handling:", (z > 3).sum())

Number of outliers after handling: 0

Number of outliers before handling: 360


print("Data types before correction:")
print(df.dtypes)

Data types before correction:
VIN (1-10)                                            object
County                                                object
City                                                  object
State                                                 object
Postal Code                                          float64
Model Year                                             int64
Make                                                  object
Model                                                 object
Electric Vehicle Type                                 object
Clean Alternative Fuel Vehicle (CAFV) Eligibility     object
Electric Range                                         int64
Base MSRP                                              int64
Legislative District                                 float64
DOL Vehicle ID                                       float64
Vehicle Location                                      object
Electric Utility                                      object
2020 Census Tract                                    float64
dtype: object


df['Postal Code'] = df['Postal Code'].astype('category')
df['Legislative District'] = df['Legislative District'].astype('category')
df['2020 Census Tract'] = df['2020 Census Tract'].astype('category')
df['County'] = df['County'].astype('category')
df['City'] = df['City'].astype('category')
df['State'] = df['State'].astype('category')
df['Make'] = df['Make'].astype('category')
df['Model'] = df['Model'].astype('category')
df['Electric Vehicle Type'] = df['Electric Vehicle Type'].astype('category')

print("Data types after correction:")
print(df.dtypes)

Data types after correction:
VIN (1-10)                                             object
County                                               category
City                                                 category
State                                                category
Postal Code                                          category
Model Year                                              int64
Make                                                 category
Model                                                category
Electric Vehicle Type                                category
Clean Alternative Fuel Vehicle (CAFV) Eligibility      object
Electric Range                                          int64
Base MSRP                                               int64
Legislative District                                 category
DOL Vehicle ID                                        float64
Vehicle Location                                       object
Electric Utility                                       object
2020 Census Tract                                    category
dtype: object


duplicate_rows = df[df.duplicated()]
print("Number of duplicate rows before removal:", duplicate_rows.shape[0])

df.drop_duplicates(inplace=True)
duplicate_rows = df[df.duplicated()]
print("Number of duplicate rows after removal:", duplicate_rows.shape[0])

Number of duplicate rows before removal: 411
Number of duplicate rows after removal: 0


df.rename(columns = {
    'VIN (1-10)': 'VIN',
    'Postal Code': 'ZIP',
    'Model Year': 'Year',
    'Electric Vehicle Type': 'Type',
    'Clean Alternative Fuel Vehicle (CAFV) Eligibility': 'CAFV',
    'Base MSRP': 'MSRP',
    'Electric Range': 'Range',
    'DOL Vehicle ID': 'DOL'
}, inplace=True)
df.head(5)


df.loc[df['Model'] == 'S-CLASS', 'Range'] = df.loc[df['Model'] == 'S-CLASS', 'Range'].fillna(46.0)


df.loc[df['Model'] == 'S-CLASS', 'MSRP'] = df.loc[df['Model'] == 'S-CLASS', 'MSRP'].fillna(126855.0)


df.groupby('County')['Model'].nunique().idxmax()
county_with_most_unique_models = df.groupby('County')['Model'].nunique().idxmax()
# Get the most frequent model in the county with the most unique models
most_frequent_model = df[df['County'] == county_with_most_unique_models]['Model'].mode()[0]
print(most_frequent_model)
adoption_counts = df.groupby(['Year', 'Make'])['VIN'].nunique().reset_index()
pivot_data = adoption_counts.pivot(index='Year', columns='Make', values='VIN').fillna(0)
pivot_data.head(5)

plt.figure(figsize=(8, 6))
sns.lineplot(data=pivot_data, marker='o')

plt.title('EV Adoption Rate Among Car Makers over the years')
plt.xlabel('Year')
plt.ylabel('Number of Unique VINs')
plt.legend(title='Car Maker', bbox_to_anchor=(1.05, 1), loc='upper left')
plt.xticks(pivot_data.index, rotation=45)
plt.show()

MODEL Y


import plotly.express as px
fig = px.line(adoption_counts, x='Year', y='VIN', color='Make', title = 'EV Adoption Rate Among Car Makers over the years', labels= {'VIN': 'Number of Unique VINs'}, markers=True)

# fig.update_layout(xaxis_title='Year', yaxis_title='Number of Unique VINs')
fig.show()


city_counts = df.groupby('City')['VIN'].nunique().reset_index().sort_values(by='VIN', ascending=False).head(5)

plt.figure(figsize=(8, 6))
plt.bar(city_counts['City'], city_counts['VIN'], color='skyblue')
plt.xlabel('City')
plt.ylabel('Number of Unique VINs')
plt.title('Number of Unique VINs per City')
plt.xticks(rotation=45)
plt.show()


top_makers = df.groupby('Make')['VIN'].nunique().sort_values(ascending=False).head(5)

plt.figure(figsize=(10, 6))
plt.bar(top_makers.index, top_makers.values, color='skyblue')
plt.xlabel('Car Maker')
plt.ylabel('Number of Unique VINs')
plt.title('Top 5 Car Makers Adopting EVs in Washington')
plt.xticks(rotation=45)
plt.show()


crosstab_make_city = pd.crosstab(df['Make'], df['City'], margins=True)

top_crosstab_make_city = crosstab_make_city.sum(axis=1).nlargest(10).index
top_make_city_data = crosstab_make_city.loc[top_crosstab_make_city]

top_make_city_data


crosstab_make_model = pd.crosstab(df['Make'], df['Model'])
crosstab_make_model


top_crosstab_make_model = crosstab_make_model.sum(axis=1).nlargest(10).index
top_make_model_data = crosstab_make_model.loc[top_crosstab_make_model]

top_make_model_data


top_makes = crosstab_make_city.sum(axis=1).nlargest(10).index
top_makes_data = crosstab_make_city.loc[top_makes]

plot_data = []

for make in top_makes:
  top_cities = top_makes_data.loc[make].nlargest(5)
  for city, count in top_cities.items():
    plot_data.append({'Make': make, 'City': city, 'Count': count})

plot_df = pd.DataFrame(plot_data)

fig = px.bar(plot_df, x='Make', y='Count', color='City', title = 'Top 10 Vehicle Makes and Their Top 5 Cities', labels = {'Count': 'Number of Vehicles'}, barmode='group')
fig.show()


fig, ax = plt.subplots(figsize=(12, 6))

colors = plt.cm.tab10(np.arange(len(plot_df['City'].unique())))
for i, city in enumerate(plot_df['City'].unique()):
    city_data = plot_df[plot_df['City'] == city]
    ax.bar(city_data['Make'], city_data['Count'], label=city, color=colors[i])

ax.set_xlabel('Make')
ax.set_ylabel('Count')
ax.set_title('Top 10 Vehicle Makes and Their Top 5 Cities')
ax.legend(title='City', bbox_to_anchor=(1.05, 1), loc='upper left')
plt.xticks(rotation=45)
plt.show()


df['MSRP'].isnull().sum()

0


df['MSRP'].describe()


mean_msrp = df['MSRP'].replace(0.0, pd.NA).mean()
df['MSRP'] = df['MSRP'].replace(0.0, mean_msrp)

df['MSRP'].describe()


numerical_df = df.select_dtypes(include=['int64', 'float64'])

numerical_correlation_matrix = numerical_df.corr()

plt.figure(figsize=(8, 4))

sns.heatmap(numerical_correlation_matrix, annot=True, cmap='coolwarm', fmt=".2f", square=True)
plt.title('Numerical Data Correlation Heatmap')
plt.show()


pearson_correlation_matrix = numerical_df.corr(method='pearson')
pearson_correlation_matrix

plt.figure(figsize=(8, 4))
sns.heatmap(pearson_correlation_matrix, annot=True, fmt=".2f", cmap='coolwarm', square=True)
plt.title('Pearson Correlation Matrix')
plt.show()


spearman_correlation_matrix = numerical_df.corr(method='spearman')
spearman_correlation_matrix

plt.figure(figsize=(8, 4))
sns.heatmap(spearman_correlation_matrix, annot=True, fmt=".2f", cmap='coolwarm', square=True)
plt.title('Spearman Correlation Matrix')
plt.show()


crosstab_full = pd.crosstab(df['County'], df['Make'])

# Filter top 3 counties and top 3 makes
top_counties = crosstab_full.sum(axis=1).nlargest(10).index
top_makes = crosstab_full.sum(axis=0).nlargest(10).index

crosstab_filtered = crosstab_full.loc[top_counties, top_makes]
print(crosstab_filtered)

Make       TESLA  CHEVROLET  NISSAN  FORD   KIA   BMW  TOYOTA  VOLKSWAGEN  \
County                                                                      
King       48471       6012    6806  3987  3973  4932    2926        2731   
Snohomish  12248       1423    1727  1259  1002   818     709         576   
Pierce      6565       1252    1116  1137   826   617     592         391   
Clark       4863        929     858   682   527   450     871         319   
Thurston    2350       1025     593   723   467   168     375         329   
Kitsap      2149        641     660   419   405   280     279         216   
Spokane     1942        458     309   395   277   223     232         145   
Whatcom     1455        499     657   280   288   120     278         234   
Benton      1016        258     134   205   111   112     165          45   
Skagit       767        264     169   168   120    55     116          82   

Make       HYUNDAI  JEEP  
County                    
King          2777  2449  
Snohomish      660   498  
Pierce         389   380  
Clark          366   399  
Thurston       179   164  
Kitsap         245   200  
Spokane        131   308  
Whatcom        149   118  
Benton          66    60  
Skagit          96    67


top_10_cars = df[['Make', 'Model', 'Range']].nlargest(10, 'Range')

df['Make'] = df['Make'].astype(str)
df['Model'] = df['Model'].astype(str)

unique_models = df.groupby(['Make', 'Model'])['Range'].max().reset_index()

# Get the top 10 unique models with the highest range
top_10_models = unique_models.nlargest(10, 'Range')

# Create a combined column for better labeling in the plot
top_10_models['Car'] = top_10_models['Make'] + ' ' + top_10_models['Model']

# Plotting
plt.figure(figsize=(6, 4))
sns.barplot(data=top_10_models, x='Range', y='Car', palette='viridis')
plt.title('Top 10 Unique Car Models with Highest Range')
plt.xlabel('Range')
plt.ylabel('Car (Make and Model)')
plt.grid(True)
plt.show()


fig, ax = plt.subplots(1, 2, figsize=(14, 6))

sns.boxplot(data=df, x='Make', y='Range', ax=ax[0])
ax[0].set_title('Range Distribution by Make')
ax[0].set_xlabel('Make')
ax[0].set_ylabel('Range (in Miles)')
ax[0].set_xticklabels(ax[0].get_xticklabels(), rotation=45, ha='right')


sns.violinplot(data=df, x='Make', y='Range', ax=ax[1], bw='silverman')
ax[1].set_title('Range Distribution by Make (Violin Plot)')
ax[1].set_xlabel('Make')
ax[1].set_ylabel('Range (in Miles)')
ax[1].set_xticklabels(ax[1].get_xticklabels(), rotation=45, ha='right')

plt.tight_layout()
plt.show()


label_encoder = LabelEncoder()
df['City_Numeric'] = label_encoder.fit_transform(df['City'])

# Calculate the correlation
correlation = df['Range'].corr(df['City_Numeric'])

# Display the correlation value
print(f"Correlation between Electric Range and City: {correlation}")

# Optional: Visualize the relationship
plt.figure(figsize=(6, 4))
sns.scatterplot(x='City_Numeric', y='Range', data=df)
plt.title('Range vs. City')
plt.xlabel('City (Encoded)')
plt.ylabel('Range')
plt.xticks(ticks=range(len(label_encoder.classes_)), labels=label_encoder.classes_, rotation=45)
plt.tight_layout()
plt.show()

Correlation between Electric Range and City: -0.00035958723385506424


avg_range = df.groupby('Make')['Range'].mean()

plt.figure(figsize=(10, 6))
avg_range.plot(kind='bar', color='skyblue')
plt.title('Average Electrical Range by Car Make')
plt.xlabel('Make')
plt.ylabel('Average Electrical Range (miles)')
plt.xticks(rotation=45)
plt.tight_layout()
plt.show()


plt.figure(figsize=(10, 6))
sns.boxplot(data=df, x='Type', y='Range', palette='Set2')
plt.title('Electrical Range Distribution by Electric Vehicle Type')
plt.xlabel('Electric Vehicle Type')
plt.ylabel('Electrical Range (miles)')
plt.tight_layout()
plt.show()


# Count plot
plt.figure(figsize=(10, 6))
sns.countplot(data=df, x='County', hue='Type', palette='pastel')
plt.title('Count of Electric Vehicles by Country and Type')
plt.xlabel('County')
plt.xticks(rotation=45, ha='right', fontsize='3')
plt.ylabel('Count')
plt.legend(title='Electric Vehicle Type')
plt.tight_layout()
plt.show()

	VIN (1-10)	County	City	State	Postal Code	Model Year	Make	Model	Electric Vehicle Type	Clean Alternative Fuel Vehicle (CAFV) Eligibility	Electric Range	Legislative District	DOL Vehicle ID	Vehicle Location	Electric Utility	2020 Census Tract
0	5YJ3E1EB0J	Thurston	Olympia	WA	98512.0	2018	TESLA	MODEL 3	Battery Electric Vehicle (BEV)	Clean Alternative Fuel Vehicle Eligible	215	35.0	104823078	POINT (-122.957046 46.991391)	PUGET SOUND ENERGY INC	5.306701e+10
1	WA1AAAGE9M	Kitsap	Port Orchard	WA	98367.0	2021	AUDI	E-TRON	Battery Electric Vehicle (BEV)	Clean Alternative Fuel Vehicle Eligible	222	35.0	156660507	POINT (-122.6530052 47.4739066)	PUGET SOUND ENERGY INC	5.303509e+10
2	5YJ3E1EA2J	Yakima	Yakima	WA	98902.0	2018	TESLA	MODEL 3	Battery Electric Vehicle (BEV)	Clean Alternative Fuel Vehicle Eligible	215	14.0	269374108	POINT (-120.530331 46.59534)	PACIFICORP	5.307700e+10
3	5YJ3E1EA4N	Yakima	Yakima	WA	98902.0	2022	TESLA	MODEL 3	Battery Electric Vehicle (BEV)	Eligibility unknown as battery range has not b...	0	15.0	213383894	POINT (-120.530331 46.59534)	PACIFICORP	5.307700e+10
4	7SAYGAEE2P	Snohomish	Bothell	WA	98012.0	2023	TESLA	MODEL Y	Battery Electric Vehicle (BEV)	Eligibility unknown as battery range has not b...	0	1.0	229496046	POINT (-122.206146 47.839957)	PUGET SOUND ENERGY INC	5.306105e+10

	VIN (1-10)	County	City	State	Postal Code	Model Year	Make	Model	Electric Vehicle Type	Clean Alternative Fuel Vehicle (CAFV) Eligibility	Electric Range	Legislative District	DOL Vehicle ID	Vehicle Location	Electric Utility	2020 Census Tract
200043	JTMAB3FVXR	Snohomish	Snohomish	WA	98290.0	2024	TOYOTA	RAV4 PRIME	Plug-in Hybrid Electric Vehicle (PHEV)	Clean Alternative Fuel Vehicle Eligible	42	44.0	262809249	POINT (-122.0483457 47.9435765)	PUGET SOUND ENERGY INC	5.306105e+10
200044	7FCTGAAA7P	Pierce	Orting	WA	98360.0	2023	RIVIAN	R1T	Battery Electric Vehicle (BEV)	Eligibility unknown as battery range has not b...	0	2.0	252195450	POINT (-122.197791 47.0948565)	PUGET SOUND ENERGY INC\|\|CITY OF TACOMA - (WA)	5.305307e+10
200045	1V2GNPE87P	Spokane	Spokane	WA	99201.0	2023	VOLKSWAGEN	ID.4	Battery Electric Vehicle (BEV)	Eligibility unknown as battery range has not b...	0	3.0	227314790	POINT (-117.428902 47.658268)	MODERN ELECTRIC WATER COMPANY	5.306300e+10
200046	1G1RD6E42E	Snohomish	Mountlake Terrace	WA	98043.0	2014	CHEVROLET	VOLT	Plug-in Hybrid Electric Vehicle (PHEV)	Clean Alternative Fuel Vehicle Eligible	38	32.0	170747377	POINT (-122.306706 47.792043)	PUGET SOUND ENERGY INC	5.306105e+10
200047	5YJ3E1EAXP	Kitsap	Bremerton	WA	98311.0	2023	TESLA	MODEL 3	Battery Electric Vehicle (BEV)	Eligibility unknown as battery range has not b...	0	23.0	224663134	POINT (-122.636245 47.62806)	PUGET SOUND ENERGY INC	5.303509e+10

	Postal Code	Model Year	Electric Range	Base MSRP	Legislative District	DOL Vehicle ID	2020 Census Tract
count	200044.000000	200048.000000	200048.000000	200048.000000	199606.000000	2.000480e+05	2.000440e+05
mean	98176.178121	2020.871261	53.485279	947.551913	28.986063	2.262988e+08	5.297545e+10
std	2424.207811	2.994933	88.786301	7860.591091	14.908108	7.282433e+07	1.605628e+09
min	1731.000000	1997.000000	0.000000	0.000000	1.000000	4.385000e+03	1.001020e+09
25%	98052.000000	2019.000000	0.000000	0.000000	17.000000	1.904573e+08	5.303301e+10
50%	98125.000000	2022.000000	0.000000	0.000000	33.000000	2.363396e+08	5.303303e+10
75%	98372.000000	2023.000000	53.000000	0.000000	42.000000	2.609659e+08	5.305307e+10
max	99577.000000	2025.000000	337.000000	845000.000000	49.000000	4.792548e+08	5.602100e+10

	mean	50%	std	variance
Electric Range	53.484268	0.0	88.786478	7.883039e+03
Base MSRP	947.161643	0.0	7859.560065	6.177268e+07
Model Year	2020.871336	2022.0	2.994938	8.969653e+00

	VIN	County	City	State	ZIP	Year	Make	Model	Type	CAFV	Range	Legislative District	DOL	Vehicle Location	Electric Utility	2020 Census Tract
0	5YJ3E1EB0J	Thurston	Olympia	WA	98512.0	2018	TESLA	MODEL 3	Battery Electric Vehicle (BEV)	Clean Alternative Fuel Vehicle Eligible	215	35.0	1.127669e+08	POINT (-122.957046 46.991391)	PUGET SOUND ENERGY INC	5.306701e+10
1	WA1AAAGE9M	Kitsap	Port Orchard	WA	98367.0	2021	AUDI	E-TRON	Battery Electric Vehicle (BEV)	Clean Alternative Fuel Vehicle Eligible	222	35.0	1.566605e+08	POINT (-122.6530052 47.4739066)	PUGET SOUND ENERGY INC	5.303509e+10
2	5YJ3E1EA2J	Yakima	Yakima	WA	98902.0	2018	TESLA	MODEL 3	Battery Electric Vehicle (BEV)	Clean Alternative Fuel Vehicle Eligible	215	14.0	2.693741e+08	POINT (-120.530331 46.59534)	PACIFICORP	5.307700e+10
3	5YJ3E1EA4N	Yakima	Yakima	WA	98902.0	2022	TESLA	MODEL 3	Battery Electric Vehicle (BEV)	Eligibility unknown as battery range has not b...	0	15.0	2.133839e+08	POINT (-120.530331 46.59534)	PACIFICORP	5.307700e+10
4	7SAYGAEE2P	Snohomish	Bothell	WA	98012.0	2023	TESLA	MODEL Y	Battery Electric Vehicle (BEV)	Eligibility unknown as battery range has not b...	0	1.0	2.294960e+08	POINT (-122.206146 47.839957)	PUGET SOUND ENERGY INC	5.306105e+10

Importing Libraries¶

Data Inspection¶

Dealing with Missing Values¶

Univariate Analysis¶

Bivariate and Multivariate Analysis¶

City	Aberdeen	Aberdeen Proving Ground	Acme	Addy	Adna	Airway Heights	Alameda	Albion	Alderdale	Alderwood Manor	...	Woodland	Woodway	Worley	Yacolt	Yakima	Yarrow Point	Yelm	Yorktown	Zillah	All
Make
All	184	1	10	2	1	35	2	1	1	1	...	212	95	1	65	788	159	329	1	39	199629
TESLA	47	1	2	2	0	11	0	0	0	0	...	96	42	0	28	328	94	132	1	17	87781
CHEVROLET	25	0	1	0	0	6	0	0	0	0	...	18	0	0	4	79	4	30	0	1	14773
NISSAN	13	0	0	0	1	3	0	0	0	1	...	12	5	0	6	30	1	21	0	2	14387
FORD	24	0	2	0	0	5	1	0	0	0	...	11	2	0	5	36	3	34	0	2	10543
KIA	18	0	2	0	0	3	0	0	0	0	...	6	5	0	2	42	3	21	0	5	8760
BMW	1	0	0	0	0	1	0	0	0	0	...	7	7	0	4	26	20	6	0	2	8294
TOYOTA	11	0	0	0	0	1	0	0	0	0	...	18	1	0	3	64	2	15	0	0	7668
VOLKSWAGEN	3	0	1	0	0	1	0	0	0	0	...	9	1	0	1	17	1	6	0	2	5559
HYUNDAI	7	0	1	0	0	0	0	0	0	0	...	5	2	0	1	12	2	9	0	0	5475

Model	330E	500	500E	530E	740E	745E	745LE	750E	918	A3	...	VOLT	WHEEGO	WRANGLER	X3	X5	XC40	XC60	XC90	XM	ZDX
Make
ACURA	0	0	0	0	0	0	0	0	0	0	...	0	0	0	0	0	0	0	0	0	15
ALFA ROMEO	0	0	0	0	0	0	0	0	0	0	...	0	0	0	0	0	0	0	0	0	0
AUDI	0	0	0	0	0	0	0	0	0	562	...	0	0	0	0	0	0	0	0	0	0
AZURE DYNAMICS	0	0	0	0	0	0	0	0	0	0	...	0	0	0	0	0	0	0	0	0	0
BENTLEY	0	0	0	0	0	0	0	0	0	0	...	0	0	0	0	0	0	0	0	0	0
BMW	491	0	0	421	29	11	2	2	0	0	...	0	0	0	317	2541	0	0	0	13	0
CADILLAC	0	0	0	0	0	0	0	0	0	0	...	0	0	0	0	0	0	0	0	0	0
CHEVROLET	0	0	0	0	0	0	0	0	0	0	...	4828	0	0	0	0	0	0	0	0	0
CHRYSLER	0	0	0	0	0	0	0	0	0	0	...	0	0	0	0	0	0	0	0	0	0
DODGE	0	0	0	0	0	0	0	0	0	0	...	0	0	0	0	0	0	0	0	0	0
FIAT	0	786	9	0	0	0	0	0	0	0	...	0	0	0	0	0	0	0	0	0	0
FISKER	0	0	0	0	0	0	0	0	0	0	...	0	0	0	0	0	0	0	0	0	0
FORD	0	0	0	0	0	0	0	0	0	0	...	0	0	0	0	0	0	0	0	0	0
GENESIS	0	0	0	0	0	0	0	0	0	0	...	0	0	0	0	0	0	0	0	0	0
GMC	0	0	0	0	0	0	0	0	0	0	...	0	0	0	0	0	0	0	0	0	0
HONDA	0	0	0	0	0	0	0	0	0	0	...	0	0	0	0	0	0	0	0	0	0
HYUNDAI	0	0	0	0	0	0	0	0	0	0	...	0	0	0	0	0	0	0	0	0	0
JAGUAR	0	0	0	0	0	0	0	0	0	0	...	0	0	0	0	0	0	0	0	0	0
JEEP	0	0	0	0	0	0	0	0	0	0	...	0	0	3946	0	0	0	0	0	0	0
KIA	0	0	0	0	0	0	0	0	0	0	...	0	0	0	0	0	0	0	0	0	0
LAND ROVER	0	0	0	0	0	0	0	0	0	0	...	0	0	0	0	0	0	0	0	0	0
LEXUS	0	0	0	0	0	0	0	0	0	0	...	0	0	0	0	0	0	0	0	0	0
LINCOLN	0	0	0	0	0	0	0	0	0	0	...	0	0	0	0	0	0	0	0	0	0
LUCID	0	0	0	0	0	0	0	0	0	0	...	0	0	0	0	0	0	0	0	0	0
MAZDA	0	0	0	0	0	0	0	0	0	0	...	0	0	0	0	0	0	0	0	0	0
MERCEDES-BENZ	0	0	0	0	0	0	0	0	0	0	...	0	0	0	0	0	0	0	0	0	0
MINI	0	0	0	0	0	0	0	0	0	0	...	0	0	0	0	0	0	0	0	0	0
MITSUBISHI	0	0	0	0	0	0	0	0	0	0	...	0	0	0	0	0	0	0	0	0	0
NISSAN	0	0	0	0	0	0	0	0	0	0	...	0	0	0	0	0	0	0	0	0	0
POLESTAR	0	0	0	0	0	0	0	0	0	0	...	0	0	0	0	0	0	0	0	0	0
PORSCHE	0	0	0	0	0	0	0	0	1	0	...	0	0	0	0	0	0	0	0	0	0
RAM	0	0	0	0	0	0	0	0	0	0	...	0	0	0	0	0	0	0	0	0	0
RIVIAN	0	0	0	0	0	0	0	0	0	0	...	0	0	0	0	0	0	0	0	0	0
ROLLS-ROYCE	0	0	0	0	0	0	0	0	0	0	...	0	0	0	0	0	0	0	0	0	0
SMART	0	0	0	0	0	0	0	0	0	0	...	0	0	0	0	0	0	0	0	0	0
SUBARU	0	0	0	0	0	0	0	0	0	0	...	0	0	0	0	0	0	0	0	0	0
TESLA	0	0	0	0	0	0	0	0	0	0	...	0	0	0	0	0	0	0	0	0	0
TH!NK	0	0	0	0	0	0	0	0	0	0	...	0	0	0	0	0	0	0	0	0	0
TOYOTA	0	0	0	0	0	0	0	0	0	0	...	0	0	0	0	0	0	0	0	0	0
VOLKSWAGEN	0	0	0	0	0	0	0	0	0	0	...	0	0	0	0	0	0	0	0	0	0
VOLVO	0	0	0	0	0	0	0	0	0	0	...	0	0	0	0	0	1080	1269	1644	0	0
WHEEGO ELECTRIC CARS	0	0	0	0	0	0	0	0	0	0	...	0	3	0	0	0	0	0	0	0	0

	MSRP
count	199629.000000
mean	949.111677
std	7867.529029
min	0.000000
25%	0.000000
50%	0.000000
75%	0.000000
max	845000.000000