import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.datasets import load_digits, fetch_california_housing
# Digits dataset: one 'pixel_<i>' column per feature column of digits.data,
# followed by the class label in a 'target' column.
digits = load_digits()
digits_df = pd.DataFrame(
    digits.data,
    columns=[f'pixel_{idx}' for idx in range(digits.data.shape[1])],
).assign(target=digits.target)
# California housing dataset: the feature columns keep the names shipped with
# the dataset, and the regression target is appended as 'MedHouseVal'.
housing = fetch_california_housing()
housing_df = pd.DataFrame(housing.data, columns=housing.feature_names).assign(
    MedHouseVal=housing.target
)
# EDA for California housing dataset
# DataFrame.info() writes its report to stdout itself and returns None, so it
# is called directly; wrapping it in print() appends a stray "None" line.
housing_df.info()
print("\n")
# First rows, to eyeball the raw feature values.
print(housing_df.head())
print("\n")
# Per-column summary statistics (count, mean, std, min, quartiles, max).
print(housing_df.describe())
print("\n")
# Histogram of the 'MedHouseVal' target column using 30 bins.
sns.histplot(housing_df['MedHouseVal'], bins=30)
plt.title('Distribution of Median House Values')
plt.show()
# EDA for digits dataset
# DataFrame.info() writes its report to stdout itself and returns None, so it
# is called directly; wrapping it in print() appends a stray "None" line.
digits_df.info()
print("\n")
# Per-column summary statistics for every pixel column and the target.
print(digits_df.describe())
print("\n")
# Bar chart of how many samples carry each value of the 'target' column.
sns.countplot(x='target', data=digits_df)
plt.title('Distribution of Digits')
plt.show()
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 20640 entries, 0 to 20639
Data columns (total 9 columns):
# Column Non-Null Count Dtype
--- ------ -------------- -----
0 MedInc 20640 non-null float64
1 HouseAge 20640 non-null float64
2 AveRooms 20640 non-null float64
3 AveBedrms 20640 non-null float64
4 Population 20640 non-null float64
5 AveOccup 20640 non-null float64
6 Latitude 20640 non-null float64
7 Longitude 20640 non-null float64
8 MedHouseVal 20640 non-null float64
dtypes: float64(9)
memory usage: 1.4 MB
None
MedInc HouseAge AveRooms AveBedrms Population AveOccup Latitude \
0 8.3252 41.0 6.984127 1.023810 322.0 2.555556 37.88
1 8.3014 21.0 6.238137 0.971880 2401.0 2.109842 37.86
2 7.2574 52.0 8.288136 1.073446 496.0 2.802260 37.85
3 5.6431 52.0 5.817352 1.073059 558.0 2.547945 37.85
4 3.8462 52.0 6.281853 1.081081 565.0 2.181467 37.85
Longitude MedHouseVal
0 -122.23 4.526
1 -122.22 3.585
2 -122.24 3.521
3 -122.25 3.413
4 -122.25 3.422
MedInc HouseAge AveRooms AveBedrms Population \
count 20640.000000 20640.000000 20640.000000 20640.000000 20640.000000
mean 3.870671 28.639486 5.429000 1.096675 1425.476744
std 1.899822 12.585558 2.474173 0.473911 1132.462122
min 0.499900 1.000000 0.846154 0.333333 3.000000
25% 2.563400 18.000000 4.440716 1.006079 787.000000
50% 3.534800 29.000000 5.229129 1.048780 1166.000000
75% 4.743250 37.000000 6.052381 1.099526 1725.000000
max 15.000100 52.000000 141.909091 34.066667 35682.000000
AveOccup Latitude Longitude MedHouseVal
count 20640.000000 20640.000000 20640.000000 20640.000000
mean 3.070655 35.631861 -119.569704 2.068558
std 10.386050 2.135952 2.003532 1.153956
min 0.692308 32.540000 -124.350000 0.149990
25% 2.429741 33.930000 -121.800000 1.196000
50% 2.818116 34.260000 -118.490000 1.797000
75% 3.282261 37.710000 -118.010000 2.647250
max 1243.333333 41.950000 -114.310000 5.000010
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1797 entries, 0 to 1796
Data columns (total 65 columns):
# Column Non-Null Count Dtype
--- ------ -------------- -----
0 pixel_0 1797 non-null float64
1 pixel_1 1797 non-null float64
2 pixel_2 1797 non-null float64
3 pixel_3 1797 non-null float64
4 pixel_4 1797 non-null float64
5 pixel_5 1797 non-null float64
6 pixel_6 1797 non-null float64
7 pixel_7 1797 non-null float64
8 pixel_8 1797 non-null float64
9 pixel_9 1797 non-null float64
10 pixel_10 1797 non-null float64
11 pixel_11 1797 non-null float64
12 pixel_12 1797 non-null float64
13 pixel_13 1797 non-null float64
14 pixel_14 1797 non-null float64
15 pixel_15 1797 non-null float64
16 pixel_16 1797 non-null float64
17 pixel_17 1797 non-null float64
18 pixel_18 1797 non-null float64
19 pixel_19 1797 non-null float64
20 pixel_20 1797 non-null float64
21 pixel_21 1797 non-null float64
22 pixel_22 1797 non-null float64
23 pixel_23 1797 non-null float64
24 pixel_24 1797 non-null float64
25 pixel_25 1797 non-null float64
26 pixel_26 1797 non-null float64
27 pixel_27 1797 non-null float64
28 pixel_28 1797 non-null float64
29 pixel_29 1797 non-null float64
30 pixel_30 1797 non-null float64
31 pixel_31 1797 non-null float64
32 pixel_32 1797 non-null float64
33 pixel_33 1797 non-null float64
34 pixel_34 1797 non-null float64
35 pixel_35 1797 non-null float64
36 pixel_36 1797 non-null float64
37 pixel_37 1797 non-null float64
38 pixel_38 1797 non-null float64
39 pixel_39 1797 non-null float64
40 pixel_40 1797 non-null float64
41 pixel_41 1797 non-null float64
42 pixel_42 1797 non-null float64
43 pixel_43 1797 non-null float64
44 pixel_44 1797 non-null float64
45 pixel_45 1797 non-null float64
46 pixel_46 1797 non-null float64
47 pixel_47 1797 non-null float64
48 pixel_48 1797 non-null float64
49 pixel_49 1797 non-null float64
50 pixel_50 1797 non-null float64
51 pixel_51 1797 non-null float64
52 pixel_52 1797 non-null float64
53 pixel_53 1797 non-null float64
54 pixel_54 1797 non-null float64
55 pixel_55 1797 non-null float64
56 pixel_56 1797 non-null float64
57 pixel_57 1797 non-null float64
58 pixel_58 1797 non-null float64
59 pixel_59 1797 non-null float64
60 pixel_60 1797 non-null float64
61 pixel_61 1797 non-null float64
62 pixel_62 1797 non-null float64
63 pixel_63 1797 non-null float64
64 target 1797 non-null int64
dtypes: float64(64), int64(1)
memory usage: 912.7 KB
None
pixel_0 pixel_1 pixel_2 pixel_3 pixel_4 \
count 1797.0 1797.000000 1797.000000 1797.000000 1797.000000
mean 0.0 0.303840 5.204786 11.835838 11.848080
std 0.0 0.907192 4.754826 4.248842 4.287388
min 0.0 0.000000 0.000000 0.000000 0.000000
25% 0.0 0.000000 1.000000 10.000000 10.000000
50% 0.0 0.000000 4.000000 13.000000 13.000000
75% 0.0 0.000000 9.000000 15.000000 15.000000
max 0.0 8.000000 16.000000 16.000000 16.000000
pixel_5 pixel_6 pixel_7 pixel_8 pixel_9 ... \
count 1797.000000 1797.000000 1797.000000 1797.000000 1797.000000 ...
mean 5.781859 1.362270 0.129661 0.005565 1.993879 ...
std 5.666418 3.325775 1.037383 0.094222 3.196160 ...
min 0.000000 0.000000 0.000000 0.000000 0.000000 ...
25% 0.000000 0.000000 0.000000 0.000000 0.000000 ...
50% 4.000000 0.000000 0.000000 0.000000 0.000000 ...
75% 11.000000 0.000000 0.000000 0.000000 3.000000 ...
max 16.000000 16.000000 15.000000 2.000000 16.000000 ...
pixel_55 pixel_56 pixel_57 pixel_58 pixel_59 \
count 1797.000000 1797.000000 1797.000000 1797.000000 1797.000000
mean 0.206455 0.000556 0.279354 5.557596 12.089037
std 0.984401 0.023590 0.934302 5.103019 4.374694
min 0.000000 0.000000 0.000000 0.000000 0.000000
25% 0.000000 0.000000 0.000000 1.000000 11.000000
50% 0.000000 0.000000 0.000000 4.000000 13.000000
75% 0.000000 0.000000 0.000000 10.000000 16.000000
max 13.000000 1.000000 9.000000 16.000000 16.000000
pixel_60 pixel_61 pixel_62 pixel_63 target
count 1797.000000 1797.000000 1797.000000 1797.000000 1797.000000
mean 11.809126 6.764051 2.067891 0.364496 4.490818
std 4.933947 5.900623 4.090548 1.860122 2.865304
min 0.000000 0.000000 0.000000 0.000000 0.000000
25% 10.000000 0.000000 0.000000 0.000000 2.000000
50% 14.000000 6.000000 0.000000 0.000000 4.000000
75% 16.000000 12.000000 2.000000 0.000000 7.000000
max 16.000000 16.000000 16.000000 16.000000 9.000000
[8 rows x 65 columns]
from sklearn import datasets
from sklearn.model_selection import train_test_split
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import confusion_matrix, accuracy_score, ConfusionMatrixDisplay
import matplotlib.pyplot as plt
# Load the digits dataset
digits = datasets.load_digits()
X = digits.data
y = digits.target
# Split the dataset into training (80%) and testing (20%) sets with a fixed seed
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)
# Train Decision Tree Classifier
# random_state is pinned so the fitted tree (and the accuracy printed below)
# is the same on every run; an unseeded tree breaks ties between equally good
# splits randomly.
dt_classifier = DecisionTreeClassifier(random_state=42)
dt_classifier.fit(X_train, y_train)
y_pred_dt = dt_classifier.predict(X_test)
# Train Random Forest Classifier
# random_state is pinned for the same reason: bootstrap sampling and feature
# sub-sampling are otherwise different on every run.
rf_classifier = RandomForestClassifier(random_state=42)
rf_classifier.fit(X_train, y_train)
y_pred_rf = rf_classifier.predict(X_test)
# Calculate accuracy on the held-out test set
dt_accuracy = accuracy_score(y_test, y_pred_dt)
rf_accuracy = accuracy_score(y_test, y_pred_rf)
# Print accuracy scores as percentages
print(f"Decision Tree Accuracy: {dt_accuracy * 100:.2f}%")
print(f"Random Forest Accuracy: {rf_accuracy * 100:.2f}%")
print("\n")
# Plot both confusion matrices side by side
fig, ax = plt.subplots(1, 2, figsize=(12, 5))
# Decision Tree Confusion Matrix
dt_cm = confusion_matrix(y_test, y_pred_dt)
ConfusionMatrixDisplay(dt_cm).plot(ax=ax[0], cmap=plt.cm.Blues)
ax[0].set_title('Decision Tree Confusion Matrix')
# Random Forest Confusion Matrix
rf_cm = confusion_matrix(y_test, y_pred_rf)
ConfusionMatrixDisplay(rf_cm).plot(ax=ax[1], cmap=plt.cm.Blues)
ax[1].set_title('Random Forest Confusion Matrix')
plt.show()
Decision Tree Accuracy: 85.00% Random Forest Accuracy: 97.50%
from sklearn import datasets
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import confusion_matrix, accuracy_score, ConfusionMatrixDisplay
import matplotlib.pyplot as plt
# Load the digits dataset
digits = datasets.load_digits()
X = digits.data
y = digits.target
# Split the dataset into training (80%) and testing (20%) sets with a fixed seed
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)
# Tuning Decision Tree: exhaustive search over depth and split-size limits,
# scored with 5-fold cross-validation on the training set.
dt_param_grid = {
    'max_depth': [None, 5, 10, 20],
    'min_samples_split': [2, 5, 10],
}
# The estimator is seeded so the selected hyper-parameters and the reported
# scores are reproducible from run to run.
dt_grid_search = GridSearchCV(DecisionTreeClassifier(random_state=42), dt_param_grid, cv=5)
dt_grid_search.fit(X_train, y_train)
y_pred_dt_tuned = dt_grid_search.predict(X_test)
# Tuning Random Forest: exhaustive search over forest size and tree depth.
rf_param_grid = {
    'n_estimators': [50, 100, 200],
    'max_depth': [None, 5, 10, 20],
}
rf_grid_search = GridSearchCV(RandomForestClassifier(random_state=42), rf_param_grid, cv=5)
rf_grid_search.fit(X_train, y_train)
y_pred_rf_tuned = rf_grid_search.predict(X_test)
# Calculate accuracy for tuned models
dt_tuned_accuracy = accuracy_score(y_test, y_pred_dt_tuned)
rf_tuned_accuracy = accuracy_score(y_test, y_pred_rf_tuned)
# Compute each confusion matrix once; it is both printed and plotted below.
dt_cm = confusion_matrix(y_test, y_pred_dt_tuned)
rf_cm = confusion_matrix(y_test, y_pred_rf_tuned)
# Print accuracy scores
print(f"Tuned Decision Tree Accuracy: {dt_tuned_accuracy * 100:.2f}%")
print(f"Tuned Random Forest Accuracy: {rf_tuned_accuracy * 100:.2f}%")
print("\n")
# Print Confusion Matrices
print("Tuned Decision Tree Confusion Matrix:")
print(dt_cm)
print("\n")
print("Tuned Random Forest Confusion Matrix:")
print(rf_cm)
# Plot confusion matrices side by side
fig, ax = plt.subplots(1, 2, figsize=(12, 5))
# Decision Tree Confusion Matrix
ConfusionMatrixDisplay(dt_cm).plot(ax=ax[0], cmap=plt.cm.Blues)
ax[0].set_title('Tuned Decision Tree Confusion Matrix')
# Random Forest Confusion Matrix
ConfusionMatrixDisplay(rf_cm).plot(ax=ax[1], cmap=plt.cm.Blues)
ax[1].set_title('Tuned Random Forest Confusion Matrix')
plt.show()
Tuned Decision Tree Accuracy: 86.39% Tuned Random Forest Accuracy: 97.50% Tuned Decision Tree Confusion Matrix: [[29 0 0 0 2 1 0 0 0 1] [ 0 21 1 0 3 0 0 1 1 1] [ 0 1 26 2 2 0 0 1 1 0] [ 0 0 0 30 0 0 1 0 1 2] [ 0 2 0 0 41 0 0 3 0 0] [ 0 0 0 1 1 45 0 0 0 0] [ 0 0 0 0 1 0 34 0 0 0] [ 0 0 0 2 2 0 0 30 0 0] [ 1 3 0 1 2 1 0 0 20 2] [ 0 0 0 3 0 0 0 2 0 35]] Tuned Random Forest Confusion Matrix: [[32 0 0 0 1 0 0 0 0 0] [ 0 28 0 0 0 0 0 0 0 0] [ 0 0 33 0 0 0 0 0 0 0] [ 0 0 0 33 0 1 0 0 0 0] [ 0 0 0 0 46 0 0 0 0 0] [ 0 0 0 0 0 45 1 0 0 1] [ 0 0 0 0 0 1 34 0 0 0] [ 0 0 0 0 0 0 0 33 0 1] [ 0 1 0 0 0 0 0 0 29 0] [ 0 0 0 0 0 1 0 1 0 38]]
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_squared_error, r2_score
# Split the housing data into training (80%) and testing (20%) sets with a fixed seed
X_housing = housing.data
y_housing = housing.target
X_train_housing, X_test_housing, y_train_housing, y_test_housing = train_test_split(X_housing, y_housing, test_size=0.2, random_state=42)
# Decision Tree Regressor
# random_state is pinned so the fitted tree and the metrics printed below are
# reproducible from run to run.
dt_regressor = DecisionTreeRegressor(random_state=42)
dt_regressor.fit(X_train_housing, y_train_housing)
y_pred_dt_reg = dt_regressor.predict(X_test_housing)
# Random Forest Regressor
# Seeded for the same reason: bootstrap sampling is otherwise different on
# every run.
rf_regressor = RandomForestRegressor(random_state=42)
rf_regressor.fit(X_train_housing, y_train_housing)
y_pred_rf_reg = rf_regressor.predict(X_test_housing)
# Metrics on the held-out test set: mean squared error and R^2 for each model
print("Decision Tree Regression MSE:", mean_squared_error(y_test_housing, y_pred_dt_reg))
print("Decision Tree R^2:", r2_score(y_test_housing, y_pred_dt_reg))
print("Random Forest Regression MSE:", mean_squared_error(y_test_housing, y_pred_rf_reg))
print("Random Forest R^2:", r2_score(y_test_housing, y_pred_rf_reg))
Decision Tree Regression MSE: 0.4950806416996609 Decision Tree R^2: 0.6221937960435375 Random Forest Regression MSE: 0.2540368314402719 Random Forest R^2: 0.8061392773870548
from sklearn import datasets
from sklearn.model_selection import train_test_split, RandomizedSearchCV
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import confusion_matrix, accuracy_score, ConfusionMatrixDisplay
import matplotlib.pyplot as plt
import numpy as np
# Load the digits dataset
digits = datasets.load_digits()
X = digits.data
y = digits.target
# Split the dataset into training (80%) and testing (20%) sets with a fixed seed
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)
# Tuning Decision Tree using RandomizedSearchCV: 3 sampled candidates,
# each scored with 3-fold cross-validation on the training set.
dt_param_dist = {
    'max_depth': [None, 5, 10],
    'min_samples_split': [2, 5],
}
# The search's random_state only fixes which candidates are sampled; the
# estimator is seeded as well so each candidate's fit is reproducible too.
dt_random_search = RandomizedSearchCV(DecisionTreeClassifier(random_state=42), dt_param_dist, n_iter=3, cv=3, random_state=42)
dt_random_search.fit(X_train, y_train)
y_pred_dt_tuned = dt_random_search.predict(X_test)
# Tuning Random Forest using RandomizedSearchCV
rf_param_dist = {
    'n_estimators': [50, 100],
    'max_depth': [None, 5, 10],
}
rf_random_search = RandomizedSearchCV(RandomForestClassifier(random_state=42), rf_param_dist, n_iter=3, cv=3, random_state=42)
rf_random_search.fit(X_train, y_train)
y_pred_rf_tuned = rf_random_search.predict(X_test)
# Calculate accuracy for tuned models
dt_tuned_accuracy = accuracy_score(y_test, y_pred_dt_tuned)
rf_tuned_accuracy = accuracy_score(y_test, y_pred_rf_tuned)
# Compute each confusion matrix once; it is both printed and plotted below.
dt_cm = confusion_matrix(y_test, y_pred_dt_tuned)
rf_cm = confusion_matrix(y_test, y_pred_rf_tuned)
# Print accuracy scores
print(f"Tuned Decision Tree Accuracy: {dt_tuned_accuracy * 100:.2f}%")
print(f"Tuned Random Forest Accuracy: {rf_tuned_accuracy * 100:.2f}%")
# Print Confusion Matrices
print("Tuned Decision Tree Confusion Matrix:")
print(dt_cm)
print("Tuned Random Forest Confusion Matrix:")
print(rf_cm)
# Plot confusion matrices side by side
fig, ax = plt.subplots(1, 2, figsize=(12, 5))
# Decision Tree Confusion Matrix
ConfusionMatrixDisplay(dt_cm).plot(ax=ax[0], cmap=plt.cm.Blues)
ax[0].set_title('Tuned Decision Tree Confusion Matrix')
# Random Forest Confusion Matrix
ConfusionMatrixDisplay(rf_cm).plot(ax=ax[1], cmap=plt.cm.Blues)
ax[1].set_title('Tuned Random Forest Confusion Matrix')
plt.show()
Tuned Decision Tree Accuracy: 83.89% Tuned Random Forest Accuracy: 98.33% Tuned Decision Tree Confusion Matrix: [[29 0 1 0 1 1 0 0 0 1] [ 0 22 1 0 1 0 0 1 2 1] [ 1 0 24 3 2 0 1 1 1 0] [ 0 0 1 30 1 0 0 0 1 1] [ 0 0 1 0 40 1 2 1 1 0] [ 0 0 4 0 1 40 1 0 1 0] [ 0 0 0 0 3 0 32 0 0 0] [ 0 0 0 1 2 0 0 30 0 1] [ 0 2 1 2 0 1 0 0 21 3] [ 0 0 0 2 1 0 0 2 1 34]] Tuned Random Forest Confusion Matrix: [[32 0 0 0 1 0 0 0 0 0] [ 0 28 0 0 0 0 0 0 0 0] [ 0 0 33 0 0 0 0 0 0 0] [ 0 0 0 33 0 1 0 0 0 0] [ 0 0 0 0 46 0 0 0 0 0] [ 0 0 0 0 0 47 0 0 0 0] [ 0 0 0 0 0 1 34 0 0 0] [ 0 0 0 0 0 0 0 33 0 1] [ 0 1 0 0 0 0 0 0 29 0] [ 0 0 0 0 0 0 0 1 0 39]]
# Classification models: print each heading followed by the fitted model's
# feature_importances_ array on the next line.
print("Decision Tree Feature Importance:", dt_classifier.feature_importances_, sep="\n")
print("\n")
print("Random Forest Feature Importance:", rf_classifier.feature_importances_, sep="\n")
print("\n")
# Regression models: same heading-then-array layout.
print("Decision Tree Regressor Feature Importance:", dt_regressor.feature_importances_, sep="\n")
print("\n")
print("Random Forest Regressor Feature Importance:", rf_regressor.feature_importances_, sep="\n")
Decision Tree Feature Importance: [0. 0.01036013 0.00451836 0.00569966 0.00243391 0.05963066 0. 0. 0. 0.00103113 0.03424813 0.00077334 0.0116301 0.00564541 0.00292152 0. 0.00153196 0. 0.01859643 0.01277246 0.05068993 0.10083092 0.00123735 0. 0.00151764 0.00077334 0.07175765 0.06311472 0.00736352 0.00434546 0.0091637 0. 0. 0.05878518 0.00221174 0.00625672 0.07837266 0.01672474 0.00365746 0. 0. 0.00269306 0.13063307 0.04837164 0.00077334 0. 0.01559676 0. 0. 0.0021267 0.00709405 0.00858535 0.0007347 0.00123735 0.01724102 0.00152988 0. 0. 0.00588908 0.00077334 0.06531677 0.03235316 0.00144358 0.00901124] Random Forest Feature Importance: [0.00000000e+00 2.70631473e-03 2.14913233e-02 1.05092028e-02 1.00978014e-02 1.64898689e-02 8.04677766e-03 8.26333795e-04 6.37017762e-05 1.02515402e-02 2.73231438e-02 5.72023318e-03 1.54534256e-02 2.61916936e-02 4.56128411e-03 5.37956710e-04 7.06270618e-05 7.64725651e-03 2.13004239e-02 2.58066462e-02 3.12641697e-02 4.74594251e-02 8.83202224e-03 6.92691968e-04 1.45774767e-05 1.55011221e-02 4.10820139e-02 2.76058306e-02 3.49759665e-02 2.16279796e-02 3.65400138e-02 4.24236622e-05 0.00000000e+00 3.38336410e-02 2.45128606e-02 2.00785391e-02 3.60605647e-02 1.89641527e-02 2.30882573e-02 0.00000000e+00 2.26227020e-05 1.08058930e-02 3.24222740e-02 4.60223811e-02 2.00973620e-02 2.07301330e-02 1.48531087e-02 8.16271063e-05 2.87159687e-05 3.05501233e-03 1.62098922e-02 2.02596358e-02 1.30036464e-02 2.28898713e-02 2.81774465e-02 1.99790301e-03 0.00000000e+00 2.28124933e-03 2.16805731e-02 9.25950631e-03 3.22491832e-02 2.59498824e-02 1.63248297e-02 4.32544363e-03] Decision Tree Regressor Feature Importance: [0.52747092 0.05184666 0.05279608 0.02815275 0.03082368 0.13166008 0.09266352 0.08458631] Random Forest Regressor Feature Importance: [0.52525632 0.05456008 0.04325463 0.02928013 0.03073374 0.13886681 0.08872977 0.08931852]